# -*- coding: utf-8 -*-
# Copyright 2013 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Machine Manager module."""


import collections
import hashlib
import math
import os.path
import re
import sys
import threading
import time

from cros_utils import command_executer
from cros_utils import logger
import file_lock_machine
import image_chromeos
import test_flag


CHECKSUM_FILE = "/usr/local/osimage_checksum_file"


class BadChecksum(Exception):
    """Raised if all machines for a label don't have the same checksum."""


class BadChecksumString(Exception):
    """Raised if all machines for a label don't have the same checksum string."""


class MissingLocksDirectory(Exception):
    """Raised when cannot find/access the machine locks directory."""


class CrosCommandError(Exception):
    """Raised when an error occurs running command on DUT."""


class CrosMachine(object):
    """The machine class.

    Holds the per-DUT bookkeeping used by MachineManager (image, checksum,
    local lock flag, release time) together with a hardware fingerprint
    built from the DUT's /proc/cpuinfo and /proc/meminfo.
    """

    def __init__(self, name, chromeos_root, log_level, cmd_exec=None):
        """Initializes the machine and probes it for checksum information.

        Args:
            name: Name of the DUT; passed as `machine=` to the executer.
            chromeos_root: ChromeOS root passed to the executer.
            log_level: Log level used when creating a default executer.
            cmd_exec: Optional command executer; when falsy one is created
                with command_executer.GetCommandExecuter().
        """
        self.name = name
        self.image = None
        # We relate a dut with a label if we reimage the dut using label or we
        # detect at the very beginning that the dut is running this label.
        self.label = None
        self.checksum = None
        self.locked = False
        self.released_time = time.time()
        self.test_run = None
        self.chromeos_root = chromeos_root
        self.log_level = log_level
        self.cpuinfo = None
        self.machine_id = None
        self.checksum_string = None
        self.meminfo = None
        self.phys_kbytes = None
        self.cooldown_wait_time = 0
        self.ce = cmd_exec or command_executer.GetCommandExecuter(
            log_level=self.log_level
        )
        self.SetUpChecksumInfo()

    def SetUpChecksumInfo(self):
        """Collects memory/cpu/id info from the DUT and derives checksums.

        When the DUT is not reachable only `machine_checksum` is set (to
        None) and nothing else is probed.
        """
        if not self.IsReachable():
            self.machine_checksum = None
            return
        self._GetMemoryInfo()
        self._GetCPUInfo()
        self._ComputeMachineChecksumString()
        self._GetMachineID()
        self.machine_checksum = self._GetMD5Checksum(self.checksum_string)
        self.machine_id_checksum = self._GetMD5Checksum(self.machine_id)

    def IsReachable(self):
        """Returns True when running `ls` on the DUT reports no error."""
        command = "ls"
        ret = self.ce.CrosRunCommand(
            command, machine=self.name, chromeos_root=self.chromeos_root
        )
        # A truthy return value from the executer means the command failed.
        return not ret

    def AddCooldownWaitTime(self, wait_time):
        """Adds `wait_time` to the accumulated cooldown wait time."""
        self.cooldown_wait_time += wait_time

    def GetCooldownWaitTime(self):
        """Returns the accumulated cooldown wait time."""
        return self.cooldown_wait_time

    def _ParseMemoryInfo(self):
        """Estimates physical memory (in kbytes) from `self.meminfo`.

        Reads the second whitespace-separated field of the first meminfo
        line as the usable kbytes and stores the rounded-up estimate in
        `self.phys_kbytes`.
        """
        line = self.meminfo.splitlines()[0]
        usable_kbytes = int(line.split()[1])
        # This code is from src/third_party/test/files/client/bin/base_utils.py
        # usable_kbytes is system's usable DRAM in kbytes,
        #   as reported by memtotal() from device /proc/meminfo memtotal
        #   after Linux deducts 1.5% to 9.5% for system table overhead
        # Undo the unknown actual deduction by rounding up
        #   to next small multiple of a big power-of-two
        #   eg  12GB - 5.1% gets rounded back up to 12GB
        # NOTE(review): the constant below is 0.5% while the commentary
        # inherited from base_utils.py talks about 1.5% - confirm which one
        # is intended before touching either.
        mindeduct = 0.005  # 0.5 percent
        maxdeduct = 0.095  # 9.5 percent
        # deduction range 1.5% .. 9.5% supports physical mem sizes
        #    6GB .. 12GB in steps of .5GB
        #   12GB .. 24GB in steps of 1 GB
        #   24GB .. 48GB in steps of 2 GB ...
        # Finer granularity in physical mem sizes would require
        #   tighter spread between min and max possible deductions

        # increase mem size by at least min deduction, without rounding
        min_kbytes = int(usable_kbytes / (1.0 - mindeduct))
        # increase mem size further by 2**n rounding, by 0..roundKb or more
        round_kbytes = int(usable_kbytes / (1.0 - maxdeduct)) - min_kbytes
        # find least binary roundup 2**n that covers worst-case roundKb
        mod2n = 1 << int(math.ceil(math.log(round_kbytes, 2)))
        # have round_kbytes <= mod2n < round_kbytes*2
        # round min_kbytes up to next multiple of mod2n
        phys_kbytes = min_kbytes + mod2n - 1
        phys_kbytes -= phys_kbytes % mod2n  # clear low bits
        self.phys_kbytes = phys_kbytes

    def _GetMemoryInfo(self):
        """Reads /proc/meminfo from the DUT and parses it."""
        # TODO yunlian: when the machine in rebooting, it will not return
        # meminfo, the assert does not catch it either
        command = "cat /proc/meminfo"
        ret, self.meminfo, _ = self.ce.CrosRunCommandWOutput(
            command, machine=self.name, chromeos_root=self.chromeos_root
        )
        assert ret == 0, "Could not get meminfo from machine: %s" % self.name
        # Still guarded explicitly: asserts are stripped under `python -O`.
        if ret == 0:
            self._ParseMemoryInfo()

    def _GetCPUInfo(self):
        """Reads /proc/cpuinfo from the DUT into `self.cpuinfo`."""
        command = "cat /proc/cpuinfo"
        ret, self.cpuinfo, _ = self.ce.CrosRunCommandWOutput(
            command, machine=self.name, chromeos_root=self.chromeos_root
        )
        assert ret == 0, "Could not get cpuinfo from machine: %s" % self.name

    def _ComputeMachineChecksumString(self):
        """Builds `self.checksum_string` from cpuinfo and physical memory.

        The result is every kept cpuinfo line concatenated (without
        separators), followed by a space and `self.phys_kbytes`.
        """
        # Some lines from cpuinfo have to be excluded because they are not
        # persistent across DUTs.
        # MHz, BogoMIPS are dynamically changing values.
        # core id, apicid are identifiers assigned on startup
        # and may differ on the same type of machine.
        exclude_lines_list = [
            "MHz",
            "BogoMIPS",
            "bogomips",
            "core id",
            "apicid",
        ]
        kept_lines = [
            line
            for line in self.cpuinfo.splitlines()
            if not any(e in line for e in exclude_lines_list)
        ]
        self.checksum_string = "".join(kept_lines) + " " + str(self.phys_kbytes)

    def _GetMD5Checksum(self, ss):
        """Returns the hex MD5 of `ss` (UTF-8), or "" when `ss` is falsy."""
        if ss:
            return hashlib.md5(ss.encode("utf-8")).hexdigest()
        return ""

    def _GetMachineID(self):
        """Sets `self.machine_id` from VPD output or, failing that, ifconfig.

        Tries, in order: the first `dump_vpd_log` line containing "Product",
        all `ifconfig` lines containing "HWaddr" joined by "_", then all
        `ifconfig` lines containing "ether" joined by "_". Asserts when none
        of them yields anything.
        """
        command = "dump_vpd_log --full --stdout"
        _, if_out, _ = self.ce.CrosRunCommandWOutput(
            command, machine=self.name, chromeos_root=self.chromeos_root
        )
        b = if_out.splitlines()
        a = [l for l in b if "Product" in l]
        if a:
            self.machine_id = a[0]
            return
        command = "ifconfig"
        _, if_out, _ = self.ce.CrosRunCommandWOutput(
            command, machine=self.name, chromeos_root=self.chromeos_root
        )
        b = if_out.splitlines()
        a = [l for l in b if "HWaddr" in l]
        if a:
            self.machine_id = "_".join(a)
            return
        a = [l for l in b if "ether" in l]
        if a:
            self.machine_id = "_".join(a)
            return
        assert 0, "Could not get machine_id from machine: %s" % self.name

    def __str__(self):
        """Returns name, image, checksum, locked and released_time joined."""
        l = [
            self.name,
            str(self.image),
            str(self.checksum),
            str(self.locked),
            str(self.released_time),
        ]
        return ", ".join(l)


class MachineManager(object):
    """Lock, image and unlock machines locally for benchmark runs.

    This class contains methods and calls to lock, unlock and image
    machines and distribute machines to each benchmark run.  The assumption is
    that all of the machines for the experiment have been globally locked
    in the ExperimentRunner, but the machines still need to be locally
    locked/unlocked (allocated to benchmark runs) to prevent multiple benchmark
    runs within the same experiment from trying to use the same machine at the
    same time.
    """

    def __init__(
        self,
        chromeos_root,
        acquire_timeout,
        log_level,
        locks_dir,
        cmd_exec=None,
        lgr=None,
        keep_stateful: bool = False,
    ):
        """Initializes the manager.

        Args:
            chromeos_root: Default ChromeOS root for commands and imaging.
            acquire_timeout: Seconds AcquireMachine may keep waiting; it is
                decremented in place by AcquireMachine.
            log_level: Log level; "verbose" changes what gets logged.
            locks_dir: Directory for file locks; falsy disables file locking.
            cmd_exec: Optional command executer (default one is created).
            lgr: Optional logger (default is logger.GetLogger()).
            keep_stateful: When True, "--keep_stateful" is passed to imaging.

        Raises:
            MissingLocksDirectory: `locks_dir` is set but is not a directory.
        """
        self._lock = threading.RLock()
        self._all_machines = []
        self._machines = []
        self.image_lock = threading.Lock()
        self.num_reimages = 0
        self.chromeos_root = None
        self.machine_checksum = {}
        self.machine_checksum_string = {}
        self.acquire_timeout = acquire_timeout
        self.log_level = log_level
        self.locks_dir = locks_dir
        self.keep_stateful = keep_stateful
        self.ce = cmd_exec or command_executer.GetCommandExecuter(
            log_level=self.log_level
        )
        self.logger = lgr or logger.GetLogger()

        if self.locks_dir and not os.path.isdir(self.locks_dir):
            raise MissingLocksDirectory(
                "Cannot access locks directory: %s" % self.locks_dir
            )

        self._initialized_machines = []
        self.chromeos_root = chromeos_root

    def RemoveNonLockedMachines(self, locked_machines):
        """Drops every machine whose name is not in `locked_machines`.

        Both internal lists are filtered in place, so references handed out
        earlier by GetMachines()/GetAvailableMachines() stay in sync.
        """
        # Filter into a new list and assign through a slice: calling
        # list.remove() while iterating the same list skips the element
        # that follows each removed one.
        self._all_machines[:] = [
            m for m in self._all_machines if m.name in locked_machines
        ]
        self._machines[:] = [
            m for m in self._machines if m.name in locked_machines
        ]

    def GetChromeVersion(self, machine):
        """Get the version of Chrome running on the DUT.

        Raises:
            CrosCommandError: the version command returned non-zero.
        """

        cmd = "/opt/google/chrome/chrome --version"
        ret, version, _ = self.ce.CrosRunCommandWOutput(
            cmd, machine=machine.name, chromeos_root=self.chromeos_root
        )
        if ret != 0:
            raise CrosCommandError(
                "Couldn't get Chrome version from %s." % machine.name
            )

        return version.rstrip()

    def ImageMachine(self, machine, label):
        """Pushes the label's image onto `machine` unless already there.

        Args:
            machine: The CrosMachine to image.
            label: Label providing checksum, chromeos_root, chromeos_image,
                image_args, board and chrome_version.

        Returns:
            None when the machine's checksum already equals the label's
            (truthy) checksum; otherwise the final imaging return value.

        Raises:
            RuntimeError: imaging still fails after one reboot-and-retry.
            CrosCommandError: propagated from GetChromeVersion().
        """
        checksum = label.checksum

        if checksum and (machine.checksum == checksum):
            return
        chromeos_root = label.chromeos_root
        if not chromeos_root:
            chromeos_root = self.chromeos_root
        image_chromeos_args = [
            image_chromeos.__file__,
            "--no_lock",
            f"--chromeos_root={chromeos_root}",
            f"--image={label.chromeos_image}",
            f"--image_args={label.image_args}",
            f"--remote={machine.name}",
            f"--logging_level={self.log_level}",
        ]
        if label.board:
            image_chromeos_args.append(f"--board={label.board}")
        if self.keep_stateful:
            image_chromeos_args.append("--keep_stateful")

        # Currently can't image two machines at once.
        # So have to serialized on this lock.
        save_ce_log_level = self.ce.log_level
        if self.log_level != "verbose":
            self.ce.log_level = "average"

        # The executer is shared, so its log level is restored in `finally`
        # even when imaging or the version query raises.
        try:
            with self.image_lock:
                if self.log_level != "verbose":
                    self.logger.LogOutput("Pushing image onto machine.")
                    self.logger.LogOutput(
                        "Running image_chromeos.DoImage with %s"
                        % " ".join(image_chromeos_args)
                    )
                retval = 0
                if not test_flag.GetTestMode():
                    retval = image_chromeos.DoImage(image_chromeos_args)
                if retval:
                    # First attempt failed: reboot the DUT and retry once.
                    cmd = "reboot && exit"
                    if self.log_level != "verbose":
                        self.logger.LogOutput("reboot & exit.")
                    self.ce.CrosRunCommand(
                        cmd,
                        machine=machine.name,
                        chromeos_root=self.chromeos_root,
                    )
                    time.sleep(60)
                    if self.log_level != "verbose":
                        self.logger.LogOutput("Pushing image onto machine.")
                        self.logger.LogOutput(
                            "Running image_chromeos.DoImage with %s"
                            % " ".join(image_chromeos_args)
                        )
                    retval = image_chromeos.DoImage(image_chromeos_args)
                if retval:
                    raise RuntimeError(
                        "Could not image machine: '%s'." % machine.name
                    )

                self.num_reimages += 1
                machine.checksum = checksum
                machine.image = label.chromeos_image
                machine.label = label

            if not label.chrome_version:
                label.chrome_version = self.GetChromeVersion(machine)
        finally:
            self.ce.log_level = save_ce_log_level
        return retval

    def ComputeCommonCheckSum(self, label):
        """Records the machine checksum shared by all machines of `label`.

        Raises:
            BadChecksum: a machine's checksum differs from the first one's.
        """
        # Since this is used for cache lookups before the machines have been
        # compared/verified, check here to make sure they all have the same
        # checksum (otherwise the cache lookup may not be valid).
        base = None
        for machine in self.GetMachines(label):
            # Make sure the machine's checksums are calculated.
            if not machine.machine_checksum:
                machine.SetUpChecksumInfo()
            # Use the first machine as the basis for comparison.
            if not base:
                base = machine
            # Make sure this machine's checksum matches our 'common' checksum.
            if base.machine_checksum != machine.machine_checksum:
                # Found a difference. Fatal error.
                # Extract non-matching part and report it.
                for mismatch_index in range(len(base.checksum_string)):
                    if (
                        mismatch_index >= len(machine.checksum_string)
                        or base.checksum_string[mismatch_index]
                        != machine.checksum_string[mismatch_index]
                    ):
                        break
                # We want to show some context after the mismatch.
                end_ind = mismatch_index + 8
                # Print a mismatching string.
                raise BadChecksum(
                    "Machine checksums do not match!\n"
                    "Diff:\n"
                    f"{base.name}: {base.checksum_string[:end_ind]}\n"
                    f"{machine.name}: {machine.checksum_string[:end_ind]}\n"
                    "\nCheck for matching /proc/cpuinfo and /proc/meminfo on DUTs.\n"
                )
        self.machine_checksum[label.name] = base.machine_checksum

    def ComputeCommonCheckSumString(self, label):
        """Records the first non-empty checksum string among label machines."""
        # The assumption is that this function is only called AFTER
        # ComputeCommonCheckSum, so there is no need to verify the machines
        # are the same here.  If this is ever changed, this function should be
        # modified to verify that all the machines for a given label are the
        # same.
        for machine in self.GetMachines(label):
            if machine.checksum_string:
                self.machine_checksum_string[
                    label.name
                ] = machine.checksum_string
                break

    def _TryToLockMachine(self, cros_machine):
        """File-locks `cros_machine` (if needed) and makes it available.

        Does nothing when a machine with the same name is already in
        `self._machines`. On success the machine is appended and its image
        checksum is read from CHECKSUM_FILE on the DUT.
        """
        with self._lock:
            assert cros_machine, "Machine can't be None"
            for m in self._machines:
                if m.name == cros_machine.name:
                    return
            # Without a locks directory the machine counts as locked.
            locked = True
            if self.locks_dir:
                locked = file_lock_machine.Machine(
                    cros_machine.name, self.locks_dir
                ).Lock(True, sys.argv[0])
            if locked:
                self._machines.append(cros_machine)
                command = "cat %s" % CHECKSUM_FILE
                ret, out, _ = self.ce.CrosRunCommandWOutput(
                    command,
                    chromeos_root=self.chromeos_root,
                    machine=cros_machine.name,
                )
                if ret == 0:
                    cros_machine.checksum = out.strip()
            elif self.locks_dir:
                self.logger.LogOutput("Couldn't lock: %s" % cros_machine.name)

    # This is called from single threaded mode.
    def AddMachine(self, machine_name):
        """Creates a CrosMachine for `machine_name` and registers it.

        The machine is only kept when its machine checksum is truthy.
        Asserts that the name has not been added before.
        """
        with self._lock:
            for m in self._all_machines:
                assert m.name != machine_name, (
                    "Tried to double-add %s" % machine_name
                )

            if self.log_level != "verbose":
                self.logger.LogOutput(
                    "Setting up remote access to %s" % machine_name
                )
                self.logger.LogOutput(
                    "Checking machine characteristics for %s" % machine_name
                )
            cm = CrosMachine(machine_name, self.chromeos_root, self.log_level)
            if cm.machine_checksum:
                self._all_machines.append(cm)

    def RemoveMachine(self, machine_name):
        """Removes the machine from the available list and file-unlocks it."""
        with self._lock:
            self._machines = [
                m for m in self._machines if m.name != machine_name
            ]
            if self.locks_dir:
                res = file_lock_machine.Machine(
                    machine_name, self.locks_dir
                ).Unlock(True)
                if not res:
                    self.logger.LogError(
                        "Could not unlock machine: '%s'." % machine_name
                    )

    def ForceSameImageToAllMachines(self, label):
        """Images every machine of `label` and refreshes its checksum info."""
        machines = self.GetMachines(label)
        for m in machines:
            self.ImageMachine(m, label)
            m.SetUpChecksumInfo()

    def AcquireMachine(self, label):
        """Picks an unlocked machine for `label` and marks it locked.

        Waits (sleeping while holding the manager lock and decrementing
        `self.acquire_timeout`) until at least one of the label's machines
        is available, and calls LogFatal once the timeout goes negative.

        Returns:
            The acquired machine, with `locked` set and `test_run` set to
            the current thread, or None when no machine qualifies right now.
        """
        image_checksum = label.checksum
        machines = self.GetMachines(label)
        check_interval_time = 120
        with self._lock:
            # Lazily external lock machines
            while self.acquire_timeout >= 0:
                for m in machines:
                    new_machine = m not in self._all_machines
                    self._TryToLockMachine(m)
                    if new_machine:
                        m.released_time = time.time()
                if self.GetAvailableMachines(label):
                    break
                sleep_time = max(
                    1, min(self.acquire_timeout, check_interval_time)
                )
                time.sleep(sleep_time)
                self.acquire_timeout -= sleep_time

            if self.acquire_timeout < 0:
                self.logger.LogFatal(
                    "Could not acquire any of the "
                    "following machines: '%s'"
                    % ", ".join(machine.name for machine in machines)
                )

            ###      for m in self._machines:
            ###        if (m.locked and time.time() - m.released_time < 10 and
            ###            m.checksum == image_checksum):
            ###          return None
            unlocked_machines = [
                machine
                for machine in self.GetAvailableMachines(label)
                if not machine.locked
            ]
            # First choice: a machine already carrying the wanted image.
            for m in unlocked_machines:
                if image_checksum and m.checksum == image_checksum:
                    m.locked = True
                    m.test_run = threading.current_thread()
                    return m
            # Second choice: a machine with no known image checksum.
            for m in unlocked_machines:
                if not m.checksum:
                    m.locked = True
                    m.test_run = threading.current_thread()
                    return m
            # This logic ensures that threads waiting on a machine will get a machine
            # with a checksum equal to their image over other threads. This saves time
            # when crosperf initially assigns the machines to threads by minimizing
            # the number of re-images.
            # TODO(asharif): If we centralize the thread-scheduler, we wont need this
            # code and can implement minimal reimaging code more cleanly.
            for m in unlocked_machines:
                if time.time() - m.released_time > 15:
                    # The release time gap is too large, so it is probably in the start
                    # stage, we need to reset the released_time.
                    m.released_time = time.time()
                elif time.time() - m.released_time > 8:
                    m.locked = True
                    m.test_run = threading.current_thread()
                    return m
        return None

    def GetAvailableMachines(self, label=None):
        """Returns file-locked machines, optionally only those of `label`.

        Without a label the internal list itself is returned (not a copy).
        """
        if not label:
            return self._machines
        return [m for m in self._machines if m.name in label.remote]

    def GetMachines(self, label=None):
        """Returns all added machines, optionally only those of `label`.

        Without a label the internal list itself is returned (not a copy).
        """
        if not label:
            return self._all_machines
        return [m for m in self._all_machines if m.name in label.remote]

    def ReleaseMachine(self, machine):
        """Marks the available machine named like `machine` as unlocked.

        Asserts that the machine was locked (guards against double release).
        """
        with self._lock:
            for m in self._machines:
                if machine.name == m.name:
                    assert m.locked, "Tried to double-release %s" % m.name
                    m.released_time = time.time()
                    m.locked = False
                    m.status = "Available"
                    break

    def Cleanup(self):
        """File-unlocks every available machine, logging failures."""
        with self._lock:
            # Unlock all machines (via file lock)
            for m in self._machines:
                res = file_lock_machine.Machine(m.name, self.locks_dir).Unlock(
                    True
                )

                if not res:
                    self.logger.LogError(
                        "Could not unlock machine: '%s'." % m.name
                    )

    def __str__(self):
        """Returns a header line followed by one line per available machine."""
        with self._lock:
            l = ["MachineManager Status:"] + [str(m) for m in self._machines]
            return "\n".join(l)

    def AsString(self):
        """Returns a fixed-width status table of the available machines."""
        with self._lock:
            stringify_fmt = "%-30s %-10s %-4s %-25s %-32s"
            header = stringify_fmt % (
                "Machine",
                "Thread",
                "Lock",
                "Status",
                "Checksum",
            )
            table = [header]
            for m in self._machines:
                if m.test_run:
                    test_name = m.test_run.name
                    test_status = m.test_run.timeline.GetLastEvent()
                else:
                    test_name = ""
                    test_status = ""

                try:
                    machine_string = stringify_fmt % (
                        m.name,
                        test_name,
                        m.locked,
                        test_status,
                        m.checksum,
                    )
                except ValueError:
                    machine_string = ""
                table.append(machine_string)
            return "Machine Status:\n%s" % "\n".join(table)

    def GetAllCPUInfo(self, labels):
        """Get cpuinfo for labels, merge them if their cpuinfo are the same."""
        dic = collections.defaultdict(list)
        for label in labels:
            # Only the first machine found for a label represents it.
            for machine in self._all_machines:
                if machine.name in label.remote:
                    dic[machine.cpuinfo].append(label.name)
                    break
        output_segs = []
        for key, v in dic.items():
            output = " ".join(v)
            output += "\n-------------------\n"
            output += key
            output += "\n\n\n"
            output_segs.append(output)
        return "".join(output_segs)

    def GetAllMachines(self):
        """Returns the internal list of all added machines."""
        return self._all_machines


class MockCrosMachine(CrosMachine):
    """Mock cros machine class."""

    # pylint: disable=super-init-not-called

    MEMINFO_STRING = """MemTotal: 3990332 kB
MemFree: 2608396 kB
Buffers: 147168 kB
Cached: 811560 kB
SwapCached: 0 kB
Active: 503480 kB
Inactive: 628572 kB
Active(anon): 174532 kB
Inactive(anon): 88576 kB
Active(file): 328948 kB
Inactive(file): 539996 kB
Unevictable: 0 kB
Mlocked: 0 kB
SwapTotal: 5845212 kB
SwapFree: 5845212 kB
Dirty: 9384 kB
Writeback: 0 kB
AnonPages: 173408 kB
Mapped: 146268 kB
Shmem: 89676 kB
Slab: 188260 kB
SReclaimable: 169208 kB
SUnreclaim: 19052 kB
KernelStack: 2032 kB
PageTables: 7120 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
CommitLimit: 7840376 kB
Committed_AS: 1082032 kB
VmallocTotal: 34359738367 kB
VmallocUsed: 364980 kB
VmallocChunk: 34359369407 kB
DirectMap4k: 45824 kB
DirectMap2M: 4096000 kB
"""

    CPUINFO_STRING = """processor: 0
vendor_id: GenuineIntel
cpu family: 6
model: 42
model name: Intel(R) Celeron(R) CPU 867 @ 1.30GHz
stepping: 7
microcode: 0x25
cpu MHz: 1300.000
cache size: 2048 KB
physical id: 0
siblings: 2
core id: 0
cpu cores: 2
apicid: 0
initial apicid: 0
fpu: yes
fpu_exception: yes
cpuid level: 13
wp: yes
flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer xsave lahf_lm arat epb xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips: 2594.17
clflush size: 64
cache_alignment: 64
address sizes: 36 bits physical, 48 bits virtual
power management:

processor: 1
vendor_id: GenuineIntel
cpu family: 6
model: 42
model name: Intel(R) Celeron(R) CPU 867 @ 1.30GHz
stepping: 7
microcode: 0x25
cpu MHz: 1300.000
cache size: 2048 KB
physical id: 0
siblings: 2
core id: 1
cpu cores: 2
apicid: 2
initial apicid: 2
fpu: yes
fpu_exception: yes
cpuid level: 13
wp: yes
flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer xsave lahf_lm arat epb xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips: 2594.17
clflush size: 64
cache_alignment: 64
address sizes: 36 bits physical, 48 bits virtual
power management:
"""

    def __init__(self, name, chromeos_root, log_level):
        self.name = name
        self.image = None
        self.checksum = None
        self.locked = False
        self.released_time = time.time()
        self.test_run = None
        self.chromeos_root = chromeos_root
        self.checksum_string = re.sub(r"\d", "", name)
        # In test, we assume "lumpy1", "lumpy2" are the same machine.
715*760c253cSXin Li self.machine_checksum = self._GetMD5Checksum(self.checksum_string) 716*760c253cSXin Li self.log_level = log_level 717*760c253cSXin Li self.label = None 718*760c253cSXin Li self.cooldown_wait_time = 0 719*760c253cSXin Li self.ce = command_executer.GetCommandExecuter(log_level=self.log_level) 720*760c253cSXin Li self._GetCPUInfo() 721*760c253cSXin Li 722*760c253cSXin Li def IsReachable(self): 723*760c253cSXin Li return True 724*760c253cSXin Li 725*760c253cSXin Li def _GetMemoryInfo(self): 726*760c253cSXin Li self.meminfo = self.MEMINFO_STRING 727*760c253cSXin Li self._ParseMemoryInfo() 728*760c253cSXin Li 729*760c253cSXin Li def _GetCPUInfo(self): 730*760c253cSXin Li self.cpuinfo = self.CPUINFO_STRING 731*760c253cSXin Li 732*760c253cSXin Li 733*760c253cSXin Liclass MockMachineManager(MachineManager): 734*760c253cSXin Li """Mock machine manager class.""" 735*760c253cSXin Li 736*760c253cSXin Li def __init__( 737*760c253cSXin Li self, 738*760c253cSXin Li chromeos_root, 739*760c253cSXin Li acquire_timeout, 740*760c253cSXin Li log_level, 741*760c253cSXin Li locks_dir, 742*760c253cSXin Li keep_stateful: bool = False, 743*760c253cSXin Li ): 744*760c253cSXin Li super(MockMachineManager, self).__init__( 745*760c253cSXin Li chromeos_root, 746*760c253cSXin Li acquire_timeout, 747*760c253cSXin Li log_level, 748*760c253cSXin Li locks_dir, 749*760c253cSXin Li keep_stateful=keep_stateful, 750*760c253cSXin Li ) 751*760c253cSXin Li 752*760c253cSXin Li def _TryToLockMachine(self, cros_machine): 753*760c253cSXin Li self._machines.append(cros_machine) 754*760c253cSXin Li cros_machine.checksum = "" 755*760c253cSXin Li 756*760c253cSXin Li def AddMachine(self, machine_name): 757*760c253cSXin Li with self._lock: 758*760c253cSXin Li for m in self._all_machines: 759*760c253cSXin Li assert m.name != machine_name, ( 760*760c253cSXin Li "Tried to double-add %s" % machine_name 761*760c253cSXin Li ) 762*760c253cSXin Li cm = MockCrosMachine( 763*760c253cSXin Li machine_name, 
self.chromeos_root, self.log_level 764*760c253cSXin Li ) 765*760c253cSXin Li assert cm.machine_checksum, ( 766*760c253cSXin Li "Could not find checksum for machine %s" % machine_name 767*760c253cSXin Li ) 768*760c253cSXin Li # In Original MachineManager, the test is 'if cm.machine_checksum:' - if a 769*760c253cSXin Li # machine is unreachable, then its machine_checksum is None. Here we 770*760c253cSXin Li # cannot do this, because machine_checksum is always faked, so we directly 771*760c253cSXin Li # test cm.IsReachable, which is properly mocked. 772*760c253cSXin Li if cm.IsReachable(): 773*760c253cSXin Li self._all_machines.append(cm) 774*760c253cSXin Li 775*760c253cSXin Li def GetChromeVersion(self, machine): 776*760c253cSXin Li return "Mock Chrome Version R50" 777*760c253cSXin Li 778*760c253cSXin Li def AcquireMachine(self, label): 779*760c253cSXin Li for machine in self._all_machines: 780*760c253cSXin Li if not machine.locked: 781*760c253cSXin Li machine.locked = True 782*760c253cSXin Li return machine 783*760c253cSXin Li return None 784*760c253cSXin Li 785*760c253cSXin Li def ImageMachine(self, machine, label): 786*760c253cSXin Li if machine or label: 787*760c253cSXin Li return 0 788*760c253cSXin Li return 1 789*760c253cSXin Li 790*760c253cSXin Li def ReleaseMachine(self, machine): 791*760c253cSXin Li machine.locked = False 792*760c253cSXin Li 793*760c253cSXin Li def GetMachines(self, label=None): 794*760c253cSXin Li return self._all_machines 795*760c253cSXin Li 796*760c253cSXin Li def GetAvailableMachines(self, label=None): 797*760c253cSXin Li return self._all_machines 798*760c253cSXin Li 799*760c253cSXin Li def ForceSameImageToAllMachines(self, label=None): 800*760c253cSXin Li return 0 801*760c253cSXin Li 802*760c253cSXin Li def ComputeCommonCheckSum(self, label=None): 803*760c253cSXin Li common_checksum = 12345 804*760c253cSXin Li for machine in self.GetMachines(label): 805*760c253cSXin Li machine.machine_checksum = common_checksum 806*760c253cSXin Li 
self.machine_checksum[label.name] = common_checksum 807*760c253cSXin Li 808*760c253cSXin Li def GetAllMachines(self): 809*760c253cSXin Li return self._all_machines 810