xref: /aosp_15_r20/external/toolchain-utils/crosperf/machine_manager.py (revision 760c253c1ed00ce9abd48f8546f08516e57485fe)
1*760c253cSXin Li# -*- coding: utf-8 -*-
2*760c253cSXin Li# Copyright 2013 The ChromiumOS Authors
3*760c253cSXin Li# Use of this source code is governed by a BSD-style license that can be
4*760c253cSXin Li# found in the LICENSE file.
5*760c253cSXin Li
6*760c253cSXin Li"""Machine Manager module."""
7*760c253cSXin Li
8*760c253cSXin Li
9*760c253cSXin Liimport collections
10*760c253cSXin Liimport hashlib
11*760c253cSXin Liimport math
12*760c253cSXin Liimport os.path
13*760c253cSXin Liimport re
14*760c253cSXin Liimport sys
15*760c253cSXin Liimport threading
16*760c253cSXin Liimport time
17*760c253cSXin Li
18*760c253cSXin Lifrom cros_utils import command_executer
19*760c253cSXin Lifrom cros_utils import logger
20*760c253cSXin Liimport file_lock_machine
21*760c253cSXin Liimport image_chromeos
22*760c253cSXin Liimport test_flag
23*760c253cSXin Li
24*760c253cSXin Li
# Path on the DUT whose contents are read (via `cat`) when a machine is
# locked by MachineManager; the stripped output is stored as the machine's
# current image checksum.
CHECKSUM_FILE = "/usr/local/osimage_checksum_file"
26*760c253cSXin Li
27*760c253cSXin Li
class BadChecksum(Exception):
    """Signals that the machines of one label disagree on their checksum.

    MachineManager.ComputeCommonCheckSum raises this when two machines that
    serve the same label report different machine checksums.
    """
30*760c253cSXin Li
31*760c253cSXin Li
class BadChecksumString(Exception):
    """Signals that the machines of one label disagree on checksum string."""
34*760c253cSXin Li
35*760c253cSXin Li
class MissingLocksDirectory(Exception):
    """Signals that the machine locks directory is missing or inaccessible.

    MachineManager raises this at construction time when a locks directory
    was configured but is not an existing directory.
    """
38*760c253cSXin Li
39*760c253cSXin Li
class CrosCommandError(Exception):
    """Signals that a command executed on the DUT failed."""
42*760c253cSXin Li
43*760c253cSXin Li
class CrosMachine(object):
    """A single ChromeOS device under test (DUT) and its identifying data.

    On construction the DUT is probed over the command executer.  When it is
    reachable, its /proc/meminfo and /proc/cpuinfo are fetched and reduced to
    a "checksum string" whose MD5 (`machine_checksum`) is used to decide
    whether two machines are the same kind of hardware.  A per-device
    identifier (`machine_id`) and its MD5 (`machine_id_checksum`) are
    collected as well.

    When the DUT is not reachable, `machine_checksum` is None.
    """

    def __init__(self, name, chromeos_root, log_level, cmd_exec=None):
        """Record the machine's settings and probe it once.

        Args:
            name: Host name / address of the DUT.
            chromeos_root: ChromeOS checkout passed to every remote command.
            log_level: Log level handed to the default command executer.
            cmd_exec: Optional command executer; one is created from
                `command_executer` when omitted.
        """
        self.name = name
        self.image = None
        # We relate a dut with a label if we reimage the dut using label or we
        # detect at the very beginning that the dut is running this label.
        self.label = None
        self.checksum = None
        self.locked = False
        self.released_time = time.time()
        self.test_run = None
        self.chromeos_root = chromeos_root
        self.log_level = log_level
        self.cpuinfo = None
        self.machine_id = None
        self.checksum_string = None
        self.meminfo = None
        self.phys_kbytes = None
        self.cooldown_wait_time = 0
        # Defined up front so both attributes exist even if the DUT turns out
        # to be unreachable during the probe below.
        self.machine_checksum = None
        self.machine_id_checksum = None
        self.ce = cmd_exec or command_executer.GetCommandExecuter(
            log_level=self.log_level
        )
        self.SetUpChecksumInfo()

    def SetUpChecksumInfo(self):
        """(Re)compute the hardware and machine-id checksums from the DUT.

        Sets `machine_checksum` to None and returns early when the DUT cannot
        be reached.
        """
        if not self.IsReachable():
            self.machine_checksum = None
            return
        self._GetMemoryInfo()
        self._GetCPUInfo()
        self._ComputeMachineChecksumString()
        self._GetMachineID()
        self.machine_checksum = self._GetMD5Checksum(self.checksum_string)
        self.machine_id_checksum = self._GetMD5Checksum(self.machine_id)

    def IsReachable(self):
        """Return True when a trivial `ls` on the DUT exits with status 0."""
        ret = self.ce.CrosRunCommand(
            "ls", machine=self.name, chromeos_root=self.chromeos_root
        )
        return not ret

    def AddCooldownWaitTime(self, wait_time):
        """Add `wait_time` to the accumulated cooldown wait time."""
        self.cooldown_wait_time += wait_time

    def GetCooldownWaitTime(self):
        """Return the accumulated cooldown wait time."""
        return self.cooldown_wait_time

    def _ParseMemoryInfo(self):
        """Derive `phys_kbytes` from the first line of `self.meminfo`.

        The second whitespace-separated field of the first line is taken as
        the usable memory in kB and rounded up to a multiple of a power of
        two, to undo the kernel's overhead deduction.
        """
        line = self.meminfo.splitlines()[0]
        usable_kbytes = int(line.split()[1])
        # This code is from src/third_party/test/files/client/bin/base_utils.py
        # usable_kbytes is system's usable DRAM in kbytes,
        #   as reported by memtotal() from device /proc/meminfo memtotal
        #   after Linux deducts 1.5% to 9.5% for system table overhead
        # Undo the unknown actual deduction by rounding up
        #   to next small multiple of a big power-of-two
        #   eg  12GB - 5.1% gets rounded back up to 12GB
        # NOTE(review): the text above says 1.5% but the constant below is
        # 0.5%; left untouched because changing it would change checksums.
        mindeduct = 0.005  # 0.5 percent
        maxdeduct = 0.095  # 9.5 percent
        # deduction range 1.5% .. 9.5% supports physical mem sizes
        #    6GB .. 12GB in steps of .5GB
        #   12GB .. 24GB in steps of 1 GB
        #   24GB .. 48GB in steps of 2 GB ...
        # Finer granularity in physical mem sizes would require
        #   tighter spread between min and max possible deductions

        # increase mem size by at least min deduction, without rounding
        min_kbytes = int(usable_kbytes / (1.0 - mindeduct))
        # increase mem size further by 2**n rounding, by 0..roundKb or more
        round_kbytes = int(usable_kbytes / (1.0 - maxdeduct)) - min_kbytes
        # find least binary roundup 2**n that covers worst-case roundKb.
        # Integer bit_length() is exact, whereas ceil(log(x, 2)) can overshoot
        # by one on exact powers of two and raises on 0.
        mod2n = 1 << (max(round_kbytes, 1) - 1).bit_length()
        # have round_kbytes <= mod2n < round_kbytes*2
        # round min_kbytes up to next multiple of mod2n
        phys_kbytes = min_kbytes + mod2n - 1
        phys_kbytes -= phys_kbytes % mod2n  # clear low bits
        self.phys_kbytes = phys_kbytes

    def _GetMemoryInfo(self):
        """Fetch /proc/meminfo from the DUT and parse it."""
        # TODO yunlian: when the machine in rebooting, it will not return
        # meminfo, the assert does not catch it either
        command = "cat /proc/meminfo"
        ret, self.meminfo, _ = self.ce.CrosRunCommandWOutput(
            command, machine=self.name, chromeos_root=self.chromeos_root
        )
        assert ret == 0, "Could not get meminfo from machine: %s" % self.name
        # Still guards the parse when assertions are disabled (python -O).
        if ret == 0:
            self._ParseMemoryInfo()

    def _GetCPUInfo(self):
        """Fetch /proc/cpuinfo from the DUT into `self.cpuinfo`."""
        command = "cat /proc/cpuinfo"
        ret, self.cpuinfo, _ = self.ce.CrosRunCommandWOutput(
            command, machine=self.name, chromeos_root=self.chromeos_root
        )
        assert ret == 0, "Could not get cpuinfo from machine: %s" % self.name

    def _ComputeMachineChecksumString(self):
        """Build `checksum_string` from stable cpuinfo lines and phys_kbytes.

        The kept cpuinfo lines are concatenated without any separator,
        followed by a space and the physical memory size.
        """
        # Some lines from cpuinfo have to be excluded because they are not
        # persistent across DUTs.
        # MHz, BogoMIPS are dynamically changing values.
        # core id, apicid are identifiers assigned on startup
        # and may differ on the same type of machine.
        exclude_lines_list = [
            "MHz",
            "BogoMIPS",
            "bogomips",
            "core id",
            "apicid",
        ]
        stable_lines = [
            line
            for line in self.cpuinfo.splitlines()
            if not any(e in line for e in exclude_lines_list)
        ]
        self.checksum_string = (
            "".join(stable_lines) + " " + str(self.phys_kbytes)
        )

    def _GetMD5Checksum(self, ss):
        """Return the hex MD5 of `ss` (UTF-8), or "" for an empty/None `ss`."""
        if ss:
            return hashlib.md5(ss.encode("utf-8")).hexdigest()
        return ""

    def _GetMachineID(self):
        """Set `machine_id` from VPD data, falling back to ifconfig output.

        Tries, in order: the first `dump_vpd_log` line containing "Product",
        all `ifconfig` lines containing "HWaddr", all `ifconfig` lines
        containing "ether".

        Raises:
            AssertionError: if none of the sources yields an identifier.
        """
        command = "dump_vpd_log --full --stdout"
        _, if_out, _ = self.ce.CrosRunCommandWOutput(
            command, machine=self.name, chromeos_root=self.chromeos_root
        )
        product_lines = [l for l in if_out.splitlines() if "Product" in l]
        if product_lines:
            self.machine_id = product_lines[0]
            return

        command = "ifconfig"
        _, if_out, _ = self.ce.CrosRunCommandWOutput(
            command, machine=self.name, chromeos_root=self.chromeos_root
        )
        ifconfig_lines = if_out.splitlines()
        for marker in ("HWaddr", "ether"):
            matches = [l for l in ifconfig_lines if marker in l]
            if matches:
                self.machine_id = "_".join(matches)
                return
        # Raised explicitly (rather than `assert 0`) so the failure is still
        # reported when assertions are disabled.
        raise AssertionError(
            "Could not get machine_id from machine: %s" % self.name
        )

    def __str__(self):
        """Return name, image, checksum, lock state and release time."""
        return ", ".join(
            [
                self.name,
                str(self.image),
                str(self.checksum),
                str(self.locked),
                str(self.released_time),
            ]
        )
202*760c253cSXin Li
203*760c253cSXin Li
204*760c253cSXin Liclass MachineManager(object):
205*760c253cSXin Li    """Lock, image and unlock machines locally for benchmark runs.
206*760c253cSXin Li
207*760c253cSXin Li    This class contains methods and calls to lock, unlock and image
208*760c253cSXin Li    machines and distribute machines to each benchmark run.  The assumption is
209*760c253cSXin Li    that all of the machines for the experiment have been globally locked
210*760c253cSXin Li    in the ExperimentRunner, but the machines still need to be locally
211*760c253cSXin Li    locked/unlocked (allocated to benchmark runs) to prevent multiple benchmark
212*760c253cSXin Li    runs within the same experiment from trying to use the same machine at the
213*760c253cSXin Li    same time.
214*760c253cSXin Li    """
215*760c253cSXin Li
216*760c253cSXin Li    def __init__(
217*760c253cSXin Li        self,
218*760c253cSXin Li        chromeos_root,
219*760c253cSXin Li        acquire_timeout,
220*760c253cSXin Li        log_level,
221*760c253cSXin Li        locks_dir,
222*760c253cSXin Li        cmd_exec=None,
223*760c253cSXin Li        lgr=None,
224*760c253cSXin Li        keep_stateful: bool = False,
225*760c253cSXin Li    ):
226*760c253cSXin Li        self._lock = threading.RLock()
227*760c253cSXin Li        self._all_machines = []
228*760c253cSXin Li        self._machines = []
229*760c253cSXin Li        self.image_lock = threading.Lock()
230*760c253cSXin Li        self.num_reimages = 0
231*760c253cSXin Li        self.chromeos_root = None
232*760c253cSXin Li        self.machine_checksum = {}
233*760c253cSXin Li        self.machine_checksum_string = {}
234*760c253cSXin Li        self.acquire_timeout = acquire_timeout
235*760c253cSXin Li        self.log_level = log_level
236*760c253cSXin Li        self.locks_dir = locks_dir
237*760c253cSXin Li        self.keep_stateful = keep_stateful
238*760c253cSXin Li        self.ce = cmd_exec or command_executer.GetCommandExecuter(
239*760c253cSXin Li            log_level=self.log_level
240*760c253cSXin Li        )
241*760c253cSXin Li        self.logger = lgr or logger.GetLogger()
242*760c253cSXin Li
243*760c253cSXin Li        if self.locks_dir and not os.path.isdir(self.locks_dir):
244*760c253cSXin Li            raise MissingLocksDirectory(
245*760c253cSXin Li                "Cannot access locks directory: %s" % self.locks_dir
246*760c253cSXin Li            )
247*760c253cSXin Li
248*760c253cSXin Li        self._initialized_machines = []
249*760c253cSXin Li        self.chromeos_root = chromeos_root
250*760c253cSXin Li
251*760c253cSXin Li    def RemoveNonLockedMachines(self, locked_machines):
252*760c253cSXin Li        for m in self._all_machines:
253*760c253cSXin Li            if m.name not in locked_machines:
254*760c253cSXin Li                self._all_machines.remove(m)
255*760c253cSXin Li
256*760c253cSXin Li        for m in self._machines:
257*760c253cSXin Li            if m.name not in locked_machines:
258*760c253cSXin Li                self._machines.remove(m)
259*760c253cSXin Li
260*760c253cSXin Li    def GetChromeVersion(self, machine):
261*760c253cSXin Li        """Get the version of Chrome running on the DUT."""
262*760c253cSXin Li
263*760c253cSXin Li        cmd = "/opt/google/chrome/chrome --version"
264*760c253cSXin Li        ret, version, _ = self.ce.CrosRunCommandWOutput(
265*760c253cSXin Li            cmd, machine=machine.name, chromeos_root=self.chromeos_root
266*760c253cSXin Li        )
267*760c253cSXin Li        if ret != 0:
268*760c253cSXin Li            raise CrosCommandError(
269*760c253cSXin Li                "Couldn't get Chrome version from %s." % machine.name
270*760c253cSXin Li            )
271*760c253cSXin Li
272*760c253cSXin Li        if ret != 0:
273*760c253cSXin Li            version = ""
274*760c253cSXin Li        return version.rstrip()
275*760c253cSXin Li
276*760c253cSXin Li    def ImageMachine(self, machine, label):
277*760c253cSXin Li        checksum = label.checksum
278*760c253cSXin Li
279*760c253cSXin Li        if checksum and (machine.checksum == checksum):
280*760c253cSXin Li            return
281*760c253cSXin Li        chromeos_root = label.chromeos_root
282*760c253cSXin Li        if not chromeos_root:
283*760c253cSXin Li            chromeos_root = self.chromeos_root
284*760c253cSXin Li        image_chromeos_args = [
285*760c253cSXin Li            image_chromeos.__file__,
286*760c253cSXin Li            "--no_lock",
287*760c253cSXin Li            f"--chromeos_root={chromeos_root}",
288*760c253cSXin Li            f"--image={label.chromeos_image}",
289*760c253cSXin Li            f"--image_args={label.image_args}",
290*760c253cSXin Li            f"--remote={machine.name}",
291*760c253cSXin Li            f"--logging_level={self.log_level}",
292*760c253cSXin Li        ]
293*760c253cSXin Li        if label.board:
294*760c253cSXin Li            image_chromeos_args.append(f"--board={label.board}")
295*760c253cSXin Li        if self.keep_stateful:
296*760c253cSXin Li            image_chromeos_args.append("--keep_stateful")
297*760c253cSXin Li
298*760c253cSXin Li        # Currently can't image two machines at once.
299*760c253cSXin Li        # So have to serialized on this lock.
300*760c253cSXin Li        save_ce_log_level = self.ce.log_level
301*760c253cSXin Li        if self.log_level != "verbose":
302*760c253cSXin Li            self.ce.log_level = "average"
303*760c253cSXin Li
304*760c253cSXin Li        with self.image_lock:
305*760c253cSXin Li            if self.log_level != "verbose":
306*760c253cSXin Li                self.logger.LogOutput("Pushing image onto machine.")
307*760c253cSXin Li                self.logger.LogOutput(
308*760c253cSXin Li                    "Running image_chromeos.DoImage with %s"
309*760c253cSXin Li                    % " ".join(image_chromeos_args)
310*760c253cSXin Li                )
311*760c253cSXin Li            retval = 0
312*760c253cSXin Li            if not test_flag.GetTestMode():
313*760c253cSXin Li                retval = image_chromeos.DoImage(image_chromeos_args)
314*760c253cSXin Li            if retval:
315*760c253cSXin Li                cmd = "reboot && exit"
316*760c253cSXin Li                if self.log_level != "verbose":
317*760c253cSXin Li                    self.logger.LogOutput("reboot & exit.")
318*760c253cSXin Li                self.ce.CrosRunCommand(
319*760c253cSXin Li                    cmd, machine=machine.name, chromeos_root=self.chromeos_root
320*760c253cSXin Li                )
321*760c253cSXin Li                time.sleep(60)
322*760c253cSXin Li                if self.log_level != "verbose":
323*760c253cSXin Li                    self.logger.LogOutput("Pushing image onto machine.")
324*760c253cSXin Li                    self.logger.LogOutput(
325*760c253cSXin Li                        "Running image_chromeos.DoImage with %s"
326*760c253cSXin Li                        % " ".join(image_chromeos_args)
327*760c253cSXin Li                    )
328*760c253cSXin Li                retval = image_chromeos.DoImage(image_chromeos_args)
329*760c253cSXin Li            if retval:
330*760c253cSXin Li                raise RuntimeError(
331*760c253cSXin Li                    "Could not image machine: '%s'." % machine.name
332*760c253cSXin Li                )
333*760c253cSXin Li
334*760c253cSXin Li            self.num_reimages += 1
335*760c253cSXin Li            machine.checksum = checksum
336*760c253cSXin Li            machine.image = label.chromeos_image
337*760c253cSXin Li            machine.label = label
338*760c253cSXin Li
339*760c253cSXin Li        if not label.chrome_version:
340*760c253cSXin Li            label.chrome_version = self.GetChromeVersion(machine)
341*760c253cSXin Li
342*760c253cSXin Li        self.ce.log_level = save_ce_log_level
343*760c253cSXin Li        return retval
344*760c253cSXin Li
345*760c253cSXin Li    def ComputeCommonCheckSum(self, label):
346*760c253cSXin Li        # Since this is used for cache lookups before the machines have been
347*760c253cSXin Li        # compared/verified, check here to make sure they all have the same
348*760c253cSXin Li        # checksum (otherwise the cache lookup may not be valid).
349*760c253cSXin Li        base = None
350*760c253cSXin Li        for machine in self.GetMachines(label):
351*760c253cSXin Li            # Make sure the machine's checksums are calculated.
352*760c253cSXin Li            if not machine.machine_checksum:
353*760c253cSXin Li                machine.SetUpChecksumInfo()
354*760c253cSXin Li            # Use the first machine as the basis for comparison.
355*760c253cSXin Li            if not base:
356*760c253cSXin Li                base = machine
357*760c253cSXin Li            # Make sure this machine's checksum matches our 'common' checksum.
358*760c253cSXin Li            if base.machine_checksum != machine.machine_checksum:
359*760c253cSXin Li                # Found a difference. Fatal error.
360*760c253cSXin Li                # Extract non-matching part and report it.
361*760c253cSXin Li                for mismatch_index in range(len(base.checksum_string)):
362*760c253cSXin Li                    if (
363*760c253cSXin Li                        mismatch_index >= len(machine.checksum_string)
364*760c253cSXin Li                        or base.checksum_string[mismatch_index]
365*760c253cSXin Li                        != machine.checksum_string[mismatch_index]
366*760c253cSXin Li                    ):
367*760c253cSXin Li                        break
368*760c253cSXin Li                # We want to show some context after the mismatch.
369*760c253cSXin Li                end_ind = mismatch_index + 8
370*760c253cSXin Li                # Print a mismatching string.
371*760c253cSXin Li                raise BadChecksum(
372*760c253cSXin Li                    "Machine checksums do not match!\n"
373*760c253cSXin Li                    "Diff:\n"
374*760c253cSXin Li                    f"{base.name}: {base.checksum_string[:end_ind]}\n"
375*760c253cSXin Li                    f"{machine.name}: {machine.checksum_string[:end_ind]}\n"
376*760c253cSXin Li                    "\nCheck for matching /proc/cpuinfo and /proc/meminfo on DUTs.\n"
377*760c253cSXin Li                )
378*760c253cSXin Li        self.machine_checksum[label.name] = base.machine_checksum
379*760c253cSXin Li
380*760c253cSXin Li    def ComputeCommonCheckSumString(self, label):
381*760c253cSXin Li        # The assumption is that this function is only called AFTER
382*760c253cSXin Li        # ComputeCommonCheckSum, so there is no need to verify the machines
383*760c253cSXin Li        # are the same here.  If this is ever changed, this function should be
384*760c253cSXin Li        # modified to verify that all the machines for a given label are the
385*760c253cSXin Li        # same.
386*760c253cSXin Li        for machine in self.GetMachines(label):
387*760c253cSXin Li            if machine.checksum_string:
388*760c253cSXin Li                self.machine_checksum_string[
389*760c253cSXin Li                    label.name
390*760c253cSXin Li                ] = machine.checksum_string
391*760c253cSXin Li                break
392*760c253cSXin Li
393*760c253cSXin Li    def _TryToLockMachine(self, cros_machine):
394*760c253cSXin Li        with self._lock:
395*760c253cSXin Li            assert cros_machine, "Machine can't be None"
396*760c253cSXin Li            for m in self._machines:
397*760c253cSXin Li                if m.name == cros_machine.name:
398*760c253cSXin Li                    return
399*760c253cSXin Li            locked = True
400*760c253cSXin Li            if self.locks_dir:
401*760c253cSXin Li                locked = file_lock_machine.Machine(
402*760c253cSXin Li                    cros_machine.name, self.locks_dir
403*760c253cSXin Li                ).Lock(True, sys.argv[0])
404*760c253cSXin Li            if locked:
405*760c253cSXin Li                self._machines.append(cros_machine)
406*760c253cSXin Li                command = "cat %s" % CHECKSUM_FILE
407*760c253cSXin Li                ret, out, _ = self.ce.CrosRunCommandWOutput(
408*760c253cSXin Li                    command,
409*760c253cSXin Li                    chromeos_root=self.chromeos_root,
410*760c253cSXin Li                    machine=cros_machine.name,
411*760c253cSXin Li                )
412*760c253cSXin Li                if ret == 0:
413*760c253cSXin Li                    cros_machine.checksum = out.strip()
414*760c253cSXin Li            elif self.locks_dir:
415*760c253cSXin Li                self.logger.LogOutput("Couldn't lock: %s" % cros_machine.name)
416*760c253cSXin Li
417*760c253cSXin Li    # This is called from single threaded mode.
418*760c253cSXin Li    def AddMachine(self, machine_name):
419*760c253cSXin Li        with self._lock:
420*760c253cSXin Li            for m in self._all_machines:
421*760c253cSXin Li                assert m.name != machine_name, (
422*760c253cSXin Li                    "Tried to double-add %s" % machine_name
423*760c253cSXin Li                )
424*760c253cSXin Li
425*760c253cSXin Li            if self.log_level != "verbose":
426*760c253cSXin Li                self.logger.LogOutput(
427*760c253cSXin Li                    "Setting up remote access to %s" % machine_name
428*760c253cSXin Li                )
429*760c253cSXin Li                self.logger.LogOutput(
430*760c253cSXin Li                    "Checking machine characteristics for %s" % machine_name
431*760c253cSXin Li                )
432*760c253cSXin Li            cm = CrosMachine(machine_name, self.chromeos_root, self.log_level)
433*760c253cSXin Li            if cm.machine_checksum:
434*760c253cSXin Li                self._all_machines.append(cm)
435*760c253cSXin Li
436*760c253cSXin Li    def RemoveMachine(self, machine_name):
437*760c253cSXin Li        with self._lock:
438*760c253cSXin Li            self._machines = [
439*760c253cSXin Li                m for m in self._machines if m.name != machine_name
440*760c253cSXin Li            ]
441*760c253cSXin Li            if self.locks_dir:
442*760c253cSXin Li                res = file_lock_machine.Machine(
443*760c253cSXin Li                    machine_name, self.locks_dir
444*760c253cSXin Li                ).Unlock(True)
445*760c253cSXin Li                if not res:
446*760c253cSXin Li                    self.logger.LogError(
447*760c253cSXin Li                        "Could not unlock machine: '%s'." % machine_name
448*760c253cSXin Li                    )
449*760c253cSXin Li
450*760c253cSXin Li    def ForceSameImageToAllMachines(self, label):
451*760c253cSXin Li        machines = self.GetMachines(label)
452*760c253cSXin Li        for m in machines:
453*760c253cSXin Li            self.ImageMachine(m, label)
454*760c253cSXin Li            m.SetUpChecksumInfo()
455*760c253cSXin Li
456*760c253cSXin Li    def AcquireMachine(self, label):
457*760c253cSXin Li        image_checksum = label.checksum
458*760c253cSXin Li        machines = self.GetMachines(label)
459*760c253cSXin Li        check_interval_time = 120
460*760c253cSXin Li        with self._lock:
461*760c253cSXin Li            # Lazily external lock machines
462*760c253cSXin Li            while self.acquire_timeout >= 0:
463*760c253cSXin Li                for m in machines:
464*760c253cSXin Li                    new_machine = m not in self._all_machines
465*760c253cSXin Li                    self._TryToLockMachine(m)
466*760c253cSXin Li                    if new_machine:
467*760c253cSXin Li                        m.released_time = time.time()
468*760c253cSXin Li                if self.GetAvailableMachines(label):
469*760c253cSXin Li                    break
470*760c253cSXin Li                sleep_time = max(
471*760c253cSXin Li                    1, min(self.acquire_timeout, check_interval_time)
472*760c253cSXin Li                )
473*760c253cSXin Li                time.sleep(sleep_time)
474*760c253cSXin Li                self.acquire_timeout -= sleep_time
475*760c253cSXin Li
476*760c253cSXin Li            if self.acquire_timeout < 0:
477*760c253cSXin Li                self.logger.LogFatal(
478*760c253cSXin Li                    "Could not acquire any of the "
479*760c253cSXin Li                    "following machines: '%s'"
480*760c253cSXin Li                    % ", ".join(machine.name for machine in machines)
481*760c253cSXin Li                )
482*760c253cSXin Li
483*760c253cSXin Li            ###      for m in self._machines:
484*760c253cSXin Li            ###        if (m.locked and time.time() - m.released_time < 10 and
485*760c253cSXin Li            ###            m.checksum == image_checksum):
486*760c253cSXin Li            ###          return None
487*760c253cSXin Li            unlocked_machines = [
488*760c253cSXin Li                machine
489*760c253cSXin Li                for machine in self.GetAvailableMachines(label)
490*760c253cSXin Li                if not machine.locked
491*760c253cSXin Li            ]
492*760c253cSXin Li            for m in unlocked_machines:
493*760c253cSXin Li                if image_checksum and m.checksum == image_checksum:
494*760c253cSXin Li                    m.locked = True
495*760c253cSXin Li                    m.test_run = threading.current_thread()
496*760c253cSXin Li                    return m
497*760c253cSXin Li            for m in unlocked_machines:
498*760c253cSXin Li                if not m.checksum:
499*760c253cSXin Li                    m.locked = True
500*760c253cSXin Li                    m.test_run = threading.current_thread()
501*760c253cSXin Li                    return m
502*760c253cSXin Li            # This logic ensures that threads waiting on a machine will get a machine
503*760c253cSXin Li            # with a checksum equal to their image over other threads. This saves time
504*760c253cSXin Li            # when crosperf initially assigns the machines to threads by minimizing
505*760c253cSXin Li            # the number of re-images.
506*760c253cSXin Li            # TODO(asharif): If we centralize the thread-scheduler, we wont need this
507*760c253cSXin Li            # code and can implement minimal reimaging code more cleanly.
508*760c253cSXin Li            for m in unlocked_machines:
509*760c253cSXin Li                if time.time() - m.released_time > 15:
510*760c253cSXin Li                    # The release time gap is too large, so it is probably in the start
511*760c253cSXin Li                    # stage, we need to reset the released_time.
512*760c253cSXin Li                    m.released_time = time.time()
513*760c253cSXin Li                elif time.time() - m.released_time > 8:
514*760c253cSXin Li                    m.locked = True
515*760c253cSXin Li                    m.test_run = threading.current_thread()
516*760c253cSXin Li                    return m
517*760c253cSXin Li        return None
518*760c253cSXin Li
519*760c253cSXin Li    def GetAvailableMachines(self, label=None):
520*760c253cSXin Li        if not label:
521*760c253cSXin Li            return self._machines
522*760c253cSXin Li        return [m for m in self._machines if m.name in label.remote]
523*760c253cSXin Li
524*760c253cSXin Li    def GetMachines(self, label=None):
525*760c253cSXin Li        if not label:
526*760c253cSXin Li            return self._all_machines
527*760c253cSXin Li        return [m for m in self._all_machines if m.name in label.remote]
528*760c253cSXin Li
529*760c253cSXin Li    def ReleaseMachine(self, machine):
530*760c253cSXin Li        with self._lock:
531*760c253cSXin Li            for m in self._machines:
532*760c253cSXin Li                if machine.name == m.name:
533*760c253cSXin Li                    assert m.locked, "Tried to double-release %s" % m.name
534*760c253cSXin Li                    m.released_time = time.time()
535*760c253cSXin Li                    m.locked = False
536*760c253cSXin Li                    m.status = "Available"
537*760c253cSXin Li                    break
538*760c253cSXin Li
539*760c253cSXin Li    def Cleanup(self):
540*760c253cSXin Li        with self._lock:
541*760c253cSXin Li            # Unlock all machines (via file lock)
542*760c253cSXin Li            for m in self._machines:
543*760c253cSXin Li                res = file_lock_machine.Machine(m.name, self.locks_dir).Unlock(
544*760c253cSXin Li                    True
545*760c253cSXin Li                )
546*760c253cSXin Li
547*760c253cSXin Li                if not res:
548*760c253cSXin Li                    self.logger.LogError(
549*760c253cSXin Li                        "Could not unlock machine: '%s'." % m.name
550*760c253cSXin Li                    )
551*760c253cSXin Li
552*760c253cSXin Li    def __str__(self):
553*760c253cSXin Li        with self._lock:
554*760c253cSXin Li            l = ["MachineManager Status:"] + [str(m) for m in self._machines]
555*760c253cSXin Li            return "\n".join(l)
556*760c253cSXin Li
557*760c253cSXin Li    def AsString(self):
558*760c253cSXin Li        with self._lock:
559*760c253cSXin Li            stringify_fmt = "%-30s %-10s %-4s %-25s %-32s"
560*760c253cSXin Li            header = stringify_fmt % (
561*760c253cSXin Li                "Machine",
562*760c253cSXin Li                "Thread",
563*760c253cSXin Li                "Lock",
564*760c253cSXin Li                "Status",
565*760c253cSXin Li                "Checksum",
566*760c253cSXin Li            )
567*760c253cSXin Li            table = [header]
568*760c253cSXin Li            for m in self._machines:
569*760c253cSXin Li                if m.test_run:
570*760c253cSXin Li                    test_name = m.test_run.name
571*760c253cSXin Li                    test_status = m.test_run.timeline.GetLastEvent()
572*760c253cSXin Li                else:
573*760c253cSXin Li                    test_name = ""
574*760c253cSXin Li                    test_status = ""
575*760c253cSXin Li
576*760c253cSXin Li                try:
577*760c253cSXin Li                    machine_string = stringify_fmt % (
578*760c253cSXin Li                        m.name,
579*760c253cSXin Li                        test_name,
580*760c253cSXin Li                        m.locked,
581*760c253cSXin Li                        test_status,
582*760c253cSXin Li                        m.checksum,
583*760c253cSXin Li                    )
584*760c253cSXin Li                except ValueError:
585*760c253cSXin Li                    machine_string = ""
586*760c253cSXin Li                table.append(machine_string)
587*760c253cSXin Li            return "Machine Status:\n%s" % "\n".join(table)
588*760c253cSXin Li
589*760c253cSXin Li    def GetAllCPUInfo(self, labels):
590*760c253cSXin Li        """Get cpuinfo for labels, merge them if their cpuinfo are the same."""
591*760c253cSXin Li        dic = collections.defaultdict(list)
592*760c253cSXin Li        for label in labels:
593*760c253cSXin Li            for machine in self._all_machines:
594*760c253cSXin Li                if machine.name in label.remote:
595*760c253cSXin Li                    dic[machine.cpuinfo].append(label.name)
596*760c253cSXin Li                    break
597*760c253cSXin Li        output_segs = []
598*760c253cSXin Li        for key, v in dic.items():
599*760c253cSXin Li            output = " ".join(v)
600*760c253cSXin Li            output += "\n-------------------\n"
601*760c253cSXin Li            output += key
602*760c253cSXin Li            output += "\n\n\n"
603*760c253cSXin Li            output_segs.append(output)
604*760c253cSXin Li        return "".join(output_segs)
605*760c253cSXin Li
606*760c253cSXin Li    def GetAllMachines(self):
607*760c253cSXin Li        return self._all_machines
608*760c253cSXin Li
609*760c253cSXin Li
class MockCrosMachine(CrosMachine):
    """Mock cros machine class.

    The constructor deliberately does not call CrosMachine.__init__; every
    attribute is filled in here from the arguments or from the canned
    MEMINFO_STRING / CPUINFO_STRING samples below.
    """

    # pylint: disable=super-init-not-called

    # Canned meminfo text installed by _GetMemoryInfo().
    MEMINFO_STRING = """MemTotal:        3990332 kB
MemFree:         2608396 kB
Buffers:          147168 kB
Cached:           811560 kB
SwapCached:            0 kB
Active:           503480 kB
Inactive:         628572 kB
Active(anon):     174532 kB
Inactive(anon):    88576 kB
Active(file):     328948 kB
Inactive(file):   539996 kB
Unevictable:           0 kB
Mlocked:               0 kB
SwapTotal:       5845212 kB
SwapFree:        5845212 kB
Dirty:              9384 kB
Writeback:             0 kB
AnonPages:        173408 kB
Mapped:           146268 kB
Shmem:             89676 kB
Slab:             188260 kB
SReclaimable:     169208 kB
SUnreclaim:        19052 kB
KernelStack:        2032 kB
PageTables:         7120 kB
NFS_Unstable:          0 kB
Bounce:                0 kB
WritebackTmp:          0 kB
CommitLimit:     7840376 kB
Committed_AS:    1082032 kB
VmallocTotal:   34359738367 kB
VmallocUsed:      364980 kB
VmallocChunk:   34359369407 kB
DirectMap4k:       45824 kB
DirectMap2M:     4096000 kB
"""

    # Canned cpuinfo text (two processors) installed by _GetCPUInfo().
    CPUINFO_STRING = """processor: 0
vendor_id: GenuineIntel
cpu family: 6
model: 42
model name: Intel(R) Celeron(R) CPU 867 @ 1.30GHz
stepping: 7
microcode: 0x25
cpu MHz: 1300.000
cache size: 2048 KB
physical id: 0
siblings: 2
core id: 0
cpu cores: 2
apicid: 0
initial apicid: 0
fpu: yes
fpu_exception: yes
cpuid level: 13
wp: yes
flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer xsave lahf_lm arat epb xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips: 2594.17
clflush size: 64
cache_alignment: 64
address sizes: 36 bits physical, 48 bits virtual
power management:

processor: 1
vendor_id: GenuineIntel
cpu family: 6
model: 42
model name: Intel(R) Celeron(R) CPU 867 @ 1.30GHz
stepping: 7
microcode: 0x25
cpu MHz: 1300.000
cache size: 2048 KB
physical id: 0
siblings: 2
core id: 1
cpu cores: 2
apicid: 2
initial apicid: 2
fpu: yes
fpu_exception: yes
cpuid level: 13
wp: yes
flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer xsave lahf_lm arat epb xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips: 2594.17
clflush size: 64
cache_alignment: 64
address sizes: 36 bits physical, 48 bits virtual
power management:
"""

    def __init__(self, name, chromeos_root, log_level):
        """Populate the machine's attributes without running the base init.

        Args:
            name: Machine name; with its digits removed it also serves as the
                checksum string.
            chromeos_root: Stored as-is on the instance.
            log_level: Stored and passed to the command executer factory.
        """
        # Values supplied by the caller.
        self.name = name
        self.chromeos_root = chromeos_root
        self.log_level = log_level

        # Initial state: no image, not locked, no test run, no label.
        self.image = None
        self.checksum = None
        self.locked = False
        self.test_run = None
        self.label = None
        self.cooldown_wait_time = 0
        self.released_time = time.time()

        # In test, we assume "lumpy1", "lumpy2" are the same machine, so the
        # digits are dropped before the name is checksummed.
        name_without_digits = re.sub(r"\d", "", name)
        self.checksum_string = name_without_digits
        self.machine_checksum = self._GetMD5Checksum(name_without_digits)

        self.ce = command_executer.GetCommandExecuter(log_level=self.log_level)
        self._GetCPUInfo()

    def IsReachable(self):
        """Report the mock machine as reachable, unconditionally."""
        return True

    def _GetMemoryInfo(self):
        """Install the canned meminfo text and run the inherited parser."""
        self.meminfo = self.MEMINFO_STRING
        self._ParseMemoryInfo()

    def _GetCPUInfo(self):
        """Install the canned cpuinfo text."""
        self.cpuinfo = self.CPUINFO_STRING
731*760c253cSXin Li
732*760c253cSXin Li
class MockMachineManager(MachineManager):
    """Mock machine manager class.

    Overrides the MachineManager operations with in-memory stand-ins that use
    MockCrosMachine objects and return fixed results, so callers can exercise
    the manager interface with canned data.
    """

    def __init__(
        self,
        chromeos_root,
        acquire_timeout,
        log_level,
        locks_dir,
        keep_stateful: bool = False,
    ):
        """Forward every argument unchanged to MachineManager.__init__."""
        super().__init__(
            chromeos_root,
            acquire_timeout,
            log_level,
            locks_dir,
            keep_stateful=keep_stateful,
        )

    def _TryToLockMachine(self, cros_machine):
        """Record `cros_machine` as held and give it an empty checksum."""
        self._machines.append(cros_machine)
        cros_machine.checksum = ""

    def AddMachine(self, machine_name):
        """Create a MockCrosMachine for `machine_name` and register it.

        Args:
            machine_name: Name of the machine to add.

        Raises:
            AssertionError: If a machine with that name was already added, or
                if the new mock machine has no machine checksum.
        """
        with self._lock:
            for m in self._all_machines:
                assert m.name != machine_name, (
                    "Tried to double-add %s" % machine_name
                )
            cm = MockCrosMachine(
                machine_name, self.chromeos_root, self.log_level
            )
            assert cm.machine_checksum, (
                "Could not find checksum for machine %s" % machine_name
            )
            # In the original MachineManager, the test is
            # 'if cm.machine_checksum:' - if a machine is unreachable, then
            # its machine_checksum is None. Here we cannot do this, because
            # machine_checksum is always faked, so we directly test
            # cm.IsReachable, which is properly mocked.
            if cm.IsReachable():
                self._all_machines.append(cm)

    def GetChromeVersion(self, machine):
        """Return a fixed version string; `machine` is ignored."""
        return "Mock Chrome Version R50"

    def AcquireMachine(self, label):
        """Lock and return the first unlocked machine; `label` is ignored.

        Returns:
            The machine that was just marked locked, or None when every
            machine in `self._all_machines` is already locked.
        """
        for machine in self._all_machines:
            if not machine.locked:
                machine.locked = True
                return machine
        return None

    def ImageMachine(self, machine, label):
        """Return 0 when `machine` or `label` is truthy, otherwise 1."""
        if machine or label:
            return 0
        return 1

    def ReleaseMachine(self, machine):
        """Clear the `locked` flag directly on `machine`."""
        machine.locked = False

    def GetMachines(self, label=None):
        """Return all known machines; `label` is ignored."""
        return self._all_machines

    def GetAvailableMachines(self, label=None):
        """Return all known machines; `label` is ignored."""
        return self._all_machines

    def ForceSameImageToAllMachines(self, label=None):
        """Do nothing and return 0."""
        return 0

    def ComputeCommonCheckSum(self, label=None):
        """Assign the same fake checksum to every machine.

        Args:
            label: Optional label. When given, the checksum is also stored in
                `self.machine_checksum` under `label.name`.
        """
        common_checksum = 12345
        for machine in self.GetMachines(label):
            machine.machine_checksum = common_checksum
        # The signature allows label=None, and None has no `.name`, so only
        # record a per-label entry when a label was actually supplied.
        if label is not None:
            self.machine_checksum[label.name] = common_checksum

    def GetAllMachines(self):
        """Return `self._all_machines` itself (the live list, not a copy)."""
        return self._all_machines
810