xref: /aosp_15_r20/external/toolchain-utils/lock_machine.py (revision 760c253c1ed00ce9abd48f8546f08516e57485fe)
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3#
4# Copyright 2019 The ChromiumOS Authors
5# Use of this source code is governed by a BSD-style license that can be
6# found in the LICENSE file.
7
8"""This module controls locking and unlocking of test machines."""
9
10
11import argparse
12import enum
13import getpass
14import os
15import sys
16
17from cros_utils import command_executer
18from cros_utils import logger
19from cros_utils import machines
20import file_lock_machine
21
22
class LockException(Exception):
    """Common ancestor of every exception defined in this module.

    Catching this type is enough to handle any locking-related failure
    raised here.
    """
25
26
class MachineNotPingable(LockException):
    """Signals that a machine did not answer a ping request."""
29
30
class LockingError(LockException):
    """Signals that the server could not lock/unlock a machine as asked."""
33
34
class DontOwnLock(LockException):
    """Signals an attempt to unlock a machine that someone else locked.

    This should not be raised if the user specified '--force'.
    """
39
40
class MachineType(enum.Enum):
    """Kinds of machines the lock manager distinguishes between.

    The member value is the lowercase label used when a machine type is
    written into log messages.
    """

    # Machine kept in the lock manager's list of local machines.
    LOCAL = "local"
    # Machine kept in the lock manager's list of crosfleet machines.
    CROSFLEET = "crosfleet"
46
47
class LockManager(object):
    """Class for locking/unlocking machines via two different mechanisms.

    Machines registered with AddMachineToCrosfleet are leased and released
    through the `crosfleet` command line tool; machines registered with
    AddMachineToLocal are handled with file locks (file_lock_machine).  The
    class also contains methods for reporting the locked status of machines.
    """

    CROSFLEET_PATH = "crosfleet"

    # TODO(zhizhouy): lease time may needs to be dynamically adjusted. For now we
    # set it long enough to cover the period to finish nightly rotation tests.
    LEASE_MINS = 1439

    CROSFLEET_CREDENTIAL = (
        "/usr/local/google/home/mobiletc-prebuild"
        "/sheriff_utils/credentials/skylab"
        "/chromeos-swarming-credential.json"
    )
    SWARMING = "~/cipd_binaries/swarming"
    SUCCESS = 0

    def __init__(
        self, remotes, force_option, chromeos_root, locks_dir="", log=None
    ):
        """Initializes an LockManager object.

        Args:
          remotes: A list of machine names or ip addresses to be managed.  Names
            and ip addresses should be represented as strings.  If the list is
            empty, the lock manager will get all known machines.
          force_option: A Boolean indicating whether or not to force an unlock of
            a machine that was locked by someone else.
          chromeos_root: The ChromeOS chroot to use for the autotest scripts.
          locks_dir: A directory used for file locking local devices.
          log: If not None, this is the logger object to be used for writing out
            informational output messages.  It is expected to be an instance of
            Logger class from cros_utils/logger.py.
        """
        self.chromeos_root = chromeos_root
        self.user = getpass.getuser()
        self.logger = log or logger.GetLogger()
        self.ce = command_executer.GetCommandExecuter(self.logger)

        # NOTE(review): presumably makes modules under the ChromeOS checkout
        # importable; nothing in this class imports from it directly.
        sys.path.append(chromeos_root)

        self.locks_dir = locks_dir

        # dict.fromkeys() removes duplicates like set() would, but keeps the
        # order in which the caller listed the machines.
        self.machines = list(dict.fromkeys(remotes))
        self.toolchain_lab_machines = self.GetAllToolchainLabMachines()

        if not self.machines:
            self.machines = self.toolchain_lab_machines
        self.force = force_option

        # Populated by AddMachineToLocal / AddMachineToCrosfleet; a machine in
        # neither list has no known type (see GetMachineType).
        self.local_machines = []
        self.crosfleet_machines = []

    @staticmethod
    def _StripCrosSuffix(machine):
        """Returns machine without a trailing ".cros" suffix, if it has one.

        str.rstrip(".cros") must not be used for this: it strips any trailing
        run of the characters '.', 'c', 'r', 'o', 's', which corrupts names
        such as "lab-cros".

        Args:
          machine: String containing the name or ip address of a machine.

        Returns:
          The machine name with one trailing ".cros" removed, else unchanged.
        """
        suffix = ".cros"
        if machine.endswith(suffix):
            return machine[: -len(suffix)]
        return machine

    def _CredentialArgs(self, flag):
        """Builds the service-account argument for a command line.

        Args:
          flag: The option name to use, e.g. "-service-account-json".

        Returns:
          "<flag> <credential path>" if the credential file exists on this
          host, else an empty string.
        """
        if os.path.exists(self.CROSFLEET_CREDENTIAL):
            return "%s %s" % (flag, self.CROSFLEET_CREDENTIAL)
        return ""

    def CheckMachine(self, machine, error_msg):
        """Verifies that machine is responding to ping.

        The plain name is tried first and then the name with ".cros" appended.

        Args:
          machine: String containing the name or ip address of machine to check.
          error_msg: Message to print if ping fails.

        Raises:
          MachineNotPingable:  If machine is not responding to 'ping'
        """
        if machines.MachineIsPingable(machine, logging_level="none"):
            return
        cros_machine = machine + ".cros"
        if not machines.MachineIsPingable(cros_machine, logging_level="none"):
            raise MachineNotPingable(error_msg)

    def GetAllToolchainLabMachines(self):
        """Gets a list of all the toolchain machines in the ChromeOS HW lab.

        The machines are read from crosperf/default_remotes next to this file.
        Every line is expected to look like "<key>: <machine> <machine> ...";
        lines without a ':' (e.g. blank lines) contribute no machines.

        Returns:
          A list of names of the toolchain machines in the ChromeOS HW lab.
        """
        machines_file = os.path.join(
            os.path.dirname(__file__), "crosperf", "default_remotes"
        )
        machine_list = []
        with open(machines_file, "r") as input_file:
            for line in input_file:
                # Only the text after the first ':' holds machine names.
                _, _, remotes = line.partition(":")
                machine_list.extend(remotes.split())
        return machine_list

    def GetMachineType(self, m):
        """Get where the machine is located.

        Args:
          m: String containing the name or ip address of machine.

        Returns:
          Value of the type in MachineType Enum, or None if the machine was
          never registered as a local or crosfleet machine.
        """
        if m in self.local_machines:
            return MachineType.LOCAL
        if m in self.crosfleet_machines:
            return MachineType.CROSFLEET
        return None

    def PrintStatusHeader(self):
        """Prints the status header lines for machines."""
        print("\nMachine (Board)\t\t\t\t\tStatus")
        print("---------------\t\t\t\t\t------")

    def PrintStatus(self, m, state, machine_type):
        """Prints status for a single machine.

        Args:
          m: String containing the name or ip address of machine.
          state: A dictionary of the current state of the machine.
          machine_type: MachineType to determine where the machine is located.
        """
        # Local machines get two extra tabs before the status column.
        padding = "\t\t" if machine_type == MachineType.LOCAL else ""
        if state["locked"]:
            print(
                "%s (%s)\t\t%slocked by %s since %s"
                % (
                    m,
                    state["board"],
                    padding,
                    state["locked_by"],
                    state["lock_time"],
                )
            )
        else:
            print("%s (%s)\t\t%sunlocked" % (m, state["board"], padding))

    def AddMachineToLocal(self, machine):
        """Adds a machine to local machine list.

        Args:
          machine: The machine to be added.
        """
        if machine not in self.local_machines:
            self.local_machines.append(machine)

    def AddMachineToCrosfleet(self, machine):
        """Adds a machine to crosfleet machine list.

        Args:
          machine: The machine to be added.
        """
        if machine not in self.crosfleet_machines:
            self.crosfleet_machines.append(machine)

    def ListMachineStates(self, machine_states):
        """Prints the current status for a dictionary of machine states.

        Args:
          machine_states: A dictionary of the current state of every machine in
            the current LockManager's list of machines.  Normally obtained by
            calling LockManager::GetMachineStates.
        """
        self.PrintStatusHeader()
        for m, state in machine_states.items():
            self.PrintStatus(m, state, self.GetMachineType(m))

    def UpdateLockInCrosfleet(self, should_lock_machine, machine):
        """Ask crosfleet to lease/release a machine.

        Args:
          should_lock_machine: Boolean indicating whether to lock the machine (True)
            or unlock the machine (False).
          machine: The machine to update.

        Returns:
          True if requested action succeeded, else False.
        """
        try:
            if should_lock_machine:
                return self.LeaseCrosfleetMachine(machine)
            return self.ReleaseCrosfleetMachine(machine)
        except Exception as e:  # pylint: disable=broad-except
            # Best effort: a failure is reported as False, but the reason is
            # logged instead of being dropped.
            self.logger.LogWarning(
                "Updating crosfleet lease for %s raised: %r" % (machine, e)
            )
            return False

    def UpdateFileLock(self, should_lock_machine, machine):
        """Use file lock for local machines,

        Args:
          should_lock_machine: Boolean indicating whether to lock the machine (True)
            or unlock the machine (False).
          machine: The machine to update.

        Returns:
          True if requested action succeeded, else False.
        """
        try:
            lock = file_lock_machine.Machine(machine, self.locks_dir)
            # NOTE(review): the meaning of the positional True passed to
            # Lock/Unlock is defined in file_lock_machine -- confirm there.
            if should_lock_machine:
                return lock.Lock(True, sys.argv[0])
            return lock.Unlock(True)
        except Exception as e:  # pylint: disable=broad-except
            # Best effort: a failure is reported as False, but the reason is
            # logged instead of being dropped.
            self.logger.LogWarning(
                "Updating file lock for %s raised: %r" % (machine, e)
            )
            return False

    def UpdateMachines(self, lock_machines):
        """Sets the locked state of the machines to the requested value.

        The machines updated are the ones in self.machines (specified when the
        class object was intialized).  Machines that were never registered as
        local or crosfleet machines cannot be updated and are reported as
        failures.  Afterwards self.machines only holds the machines that were
        updated successfully.

        Args:
          lock_machines: Boolean indicating whether to lock the machines (True) or
            unlock the machines (False).

        Returns:
          A list of the machines whose state was successfully updated.
        """
        updated_machines = []
        action = "Locking" if lock_machines else "Unlocking"
        for m in self.machines:
            # TODO(zhizhouy): Handling exceptions with more details when locking
            # doesn't succeed.
            machine_type = self.GetMachineType(m)
            if machine_type is None:
                # Without a type there is no locking mechanism to call.
                self.logger.LogWarning(
                    "%s machine failed: %s (machine is neither a local nor a "
                    "crosfleet machine)." % (action, m)
                )
                continue

            if machine_type == MachineType.CROSFLEET:
                ret = self.UpdateLockInCrosfleet(lock_machines, m)
            else:
                ret = self.UpdateFileLock(lock_machines, m)

            if ret:
                self.logger.LogOutput(
                    "%s %s machine succeeded: %s."
                    % (action, machine_type.value, m)
                )
                updated_machines.append(m)
            else:
                self.logger.LogOutput(
                    "%s %s machine failed: %s."
                    % (action, machine_type.value, m)
                )

        self.machines = updated_machines
        return updated_machines

    def _InternalRemoveMachine(self, machine):
        """Remove machine from internal list of machines.

        Both the given name and, for lab machines, the name with ".cros"
        appended are removed.

        Args:
          machine: Name of machine to be removed from internal list.
        """
        # Check to see if machine is lab machine and if so, make sure it has
        # ".cros" on the end.
        cros_machine = machine
        if machine.find("rack") > 0 and machine.find("row") > 0:
            if ".cros" not in machine:
                cros_machine = cros_machine + ".cros"

        self.machines = [
            m for m in self.machines if m not in (cros_machine, machine)
        ]

    def CheckMachineLocks(self, machine_states, cmd):
        """Check that every machine in requested list is in the proper state.

        If the cmd is 'unlock' verify that every machine is locked by requestor.
        If the cmd is 'lock' verify that every machine is currently unlocked.
        Machines that are already in the requested state are dropped from
        self.machines with a warning.

        Args:
          machine_states: A dictionary of the current state of every machine in
            the current LockManager's list of machines.  Normally obtained by
            calling LockManager::GetMachineStates.
          cmd: The user-requested action for the machines: 'lock' or 'unlock'.

        Raises:
          DontOwnLock: The lock on a requested machine is owned by someone else.
        """
        for k, state in machine_states.items():
            if cmd == "unlock":
                if not state["locked"]:
                    self.logger.LogWarning(
                        "Attempt to unlock already unlocked machine "
                        "(%s)." % k
                    )
                    self._InternalRemoveMachine(k)

                # TODO(zhizhouy): Crosfleet doesn't support host info such as locked_by.
                # Need to update this when crosfleet supports it.
                if (
                    state["locked"]
                    and state["locked_by"]
                    and state["locked_by"] != self.user
                ):
                    raise DontOwnLock(
                        "Attempt to unlock machine (%s) locked by someone "
                        "else (%s)." % (k, state["locked_by"])
                    )
            elif cmd == "lock":
                if state["locked"]:
                    self.logger.LogWarning(
                        "Attempt to lock already locked machine (%s)" % k
                    )
                    self._InternalRemoveMachine(k)

    def GetMachineStates(self, cmd=""):
        """Gets the current state of all the requested machines.

        Builds a dictionary keyed by machine name and prints it with
        ListMachineStates.  No server is queried: every machine is reported as
        unlocked when cmd is 'lock' and as locked otherwise, with placeholder
        board, owner and lock time.

        Args:
          cmd: The command for which we are getting the machine states.

        Returns:
          A dictionary of machine states for all the machines in the LockManager
          object.
        """
        machine_list = {}
        for m in self.machines:
            # For local or crosfleet machines, we simply set {'locked': status} for
            # them
            # TODO(zhizhouy): This is a quick fix since crosfleet cannot return host
            # info as afe does. We need to get more info such as locked_by when
            # crosfleet supports that.
            machine_list[m] = {
                "locked": 0 if cmd == "lock" else 1,
                "board": "??",
                "locked_by": "",
                "lock_time": "",
            }

        self.ListMachineStates(machine_list)

        return machine_list

    def CheckMachineInCrosfleet(self, machine):
        """Run command to check if machine is in Crosfleet or not.

        Args:
          machine: The machine to look up; a trailing ".cros" is ignored.

        Returns:
          True if machine in crosfleet, else False

        Raises:
          ValueError: If the swarming command exits with a non-zero code.
        """
        credential = self._CredentialArgs("--service-account-json")
        server = "--server https://chromeos-swarming.appspot.com"
        dimensions = "--dimension dut_name=%s" % self._StripCrosSuffix(machine)

        cmd = f"{self.SWARMING} bots {server} {credential} {dimensions}"
        exit_code, stdout, stderr = self.ce.RunCommandWOutput(cmd)
        if exit_code:
            raise ValueError(
                "Querying bots failed (2); stdout: %r; stderr: %r"
                % (stdout, stderr)
            )

        # An output of "[]" (ignoring surrounding whitespace such as a
        # trailing newline) is taken to mean that no bot matched; any other
        # output counts as a match.
        # NOTE(review): an earlier version of this comment described a JSON
        # object output without an 'items' key -- confirm the format printed
        # by the swarming client in use.
        return stdout.strip() != "[]"

    def LeaseCrosfleetMachine(self, machine):
        """Run command to lease dut from crosfleet.

        Args:
          machine: The machine to lease; a trailing ".cros" is ignored.

        Returns:
          True if succeeded, False if failed.
        """
        cmd = "%s dut lease -minutes %s %s -host %s" % (
            self.CROSFLEET_PATH,
            self.LEASE_MINS,
            self._CredentialArgs("-service-account-json"),
            self._StripCrosSuffix(machine),
        )
        # Wait 8 minutes for server to start the lease task, if not started,
        # we will treat it as unavailable.
        check_interval_time = 480
        retval = self.ce.RunCommand(cmd, command_timeout=check_interval_time)
        return retval == self.SUCCESS

    def ReleaseCrosfleetMachine(self, machine):
        """Run command to release dut from crosfleet.

        Args:
          machine: The machine to release; a trailing ".cros" is ignored.

        Returns:
          True if succeeded, False if failed.
        """
        cmd = "%s dut abandon %s %s" % (
            self.CROSFLEET_PATH,
            self._CredentialArgs("-service-account-json"),
            self._StripCrosSuffix(machine),
        )
        retval = self.ce.RunCommand(cmd)
        return retval == self.SUCCESS
470
471
def Main(argv):
    """Parse the options, initialize lock manager and dispatch proper method.

    Args:
      argv: The options with which this script was invoked.

    Returns:
      0 unless an exception is raised.  Invalid options end the process
      through argparse's error handling instead of returning.
    """
    parser = argparse.ArgumentParser()

    # --list, --lock, --unlock and --status all store their operation in
    # options.cmd; --list and --status both map to "status".
    parser.add_argument(
        "--list",
        dest="cmd",
        action="store_const",
        const="status",
        help="List current status of all known machines.",
    )
    parser.add_argument(
        "--lock",
        dest="cmd",
        action="store_const",
        const="lock",
        help="Lock given machine(s).",
    )
    parser.add_argument(
        "--unlock",
        dest="cmd",
        action="store_const",
        const="unlock",
        help="Unlock given machine(s).",
    )
    parser.add_argument(
        "--status",
        dest="cmd",
        action="store_const",
        const="status",
        help="List current status of given machine(s).",
    )
    parser.add_argument(
        "--remote", dest="remote", help="machines on which to operate"
    )
    parser.add_argument(
        "--chromeos_root",
        dest="chromeos_root",
        required=True,
        help="ChromeOS root to use for autotest scripts.",
    )
    parser.add_argument(
        "--force",
        dest="force",
        action="store_true",
        default=False,
        help="Force lock/unlock of machines, even if not"
        " current lock owner.",
    )

    options = parser.parse_args(argv)

    # Checked first so that a missing operation is not misreported as
    # missing machines.
    if not options.cmd:
        parser.error(
            "No operation selected (--list, --status, --lock, --unlock)."
        )

    if not options.remote and options.cmd != "status":
        parser.error("No machines specified for operation.")

    if not os.path.isdir(options.chromeos_root):
        parser.error("Cannot find chromeos_root: %s." % options.chromeos_root)

    # --remote holds whitespace-separated machine names.
    machine_list = options.remote.split() if options.remote else []

    lock_manager = LockManager(
        machine_list, options.force, options.chromeos_root
    )

    # GetMachineStates prints the status table itself, so the "status"
    # operation needs no further action here.
    machine_states = lock_manager.GetMachineStates(cmd=options.cmd)
    cmd = options.cmd

    if cmd in ("lock", "unlock"):
        # --force only skips the state/ownership check; the machines are
        # still updated.
        if not lock_manager.force:
            lock_manager.CheckMachineLocks(machine_states, cmd)
        lock_manager.UpdateMachines(cmd == "lock")

    return 0
568
569
if __name__ == "__main__":
    # Run Main with the command line arguments (program name excluded) and
    # use its return value as the process exit status.
    sys.exit(Main(sys.argv[1:]))
572