# mypy: allow-untyped-defs
"""
This is a script for launching PyTorch inference on Intel(R) Xeon(R) Scalable Processors with optimal configurations.

Both single-instance and multi-instance inference are enabled.

Note: the term "instance" here doesn't refer to a cloud instance. This script is executed as a single process. It invokes
multiple "instances" which are formed from multiple threads each. An "instance" is a kind of group of threads in this
context.

Illustrated as below:

::

    +-----------------------------+----------------------+-------+
    | process                     | thread               | core  |
    +=============================+======================+=======+
    | torch.backends.xeon.run_cpu | instance 0: thread 0 | 0     |
    |                             |             thread 1 | 1     |
    |                             +----------------------+-------+
    |                             | instance 1: thread 0 | 2     |
    |                             |             thread 1 | 3     |
    |                             +----------------------+-------+
    |                             | ...                  | ...   |
    |                             +----------------------+-------+
    |                             | instance N: thread 0 | M     |
    |                             |             thread 1 | M+1   |
    +-----------------------------+----------------------+-------+

To get the peak performance on Intel(R) Xeon(R) Scalable Processors, the script optimizes the configuration of thread and memory
management. For thread management, the script configures thread affinity and the preload of the Intel OMP library.
For memory management, it configures NUMA binding and preloads an optimized memory allocation library (e.g. tcmalloc, jemalloc).

Environment variables that will be set by this script:

+------------------+-------------------------------------------------------------------------------------------------+
| Environ Variable | Value                                                                                           |
+==================+=================================================================================================+
| LD_PRELOAD       | Depending on knobs you set, <lib>/libiomp5.so, <lib>/libjemalloc.so, <lib>/libtcmalloc.so might |
|                  | be appended to LD_PRELOAD.                                                                      |
+------------------+-------------------------------------------------------------------------------------------------+
| KMP_AFFINITY     | If libiomp5.so is preloaded, KMP_AFFINITY could be set to "granularity=fine,compact,1,0".       |
+------------------+-------------------------------------------------------------------------------------------------+
| KMP_BLOCKTIME    | If libiomp5.so is preloaded, KMP_BLOCKTIME is set to "1".                                       |
+------------------+-------------------------------------------------------------------------------------------------+
| OMP_NUM_THREADS  | value of ncores_per_instance                                                                    |
+------------------+-------------------------------------------------------------------------------------------------+
| MALLOC_CONF      | If libjemalloc.so is preloaded, MALLOC_CONF will be set to                                      |
|                  | "oversize_threshold:1,background_thread:true,metadata_thp:auto".                                |
+------------------+-------------------------------------------------------------------------------------------------+

*Note*: This script respects environment variables that are set in advance. I.e., if you set the environment variables
mentioned above before running the script, the script will not overwrite their values.
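
For example, a value exported before launching takes precedence over what the script would
otherwise set (the value 50 below is purely illustrative):

::

    export KMP_BLOCKTIME=50
    python -m torch.backends.xeon.run_cpu --throughput-mode script.py args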

How to use this module:
~~~~~~~~~~~~~~~~~~~~~~~

Single instance inference
-------------------------

1. Run single-instance inference on a single node with all CPU cores.

::

    python -m torch.backends.xeon.run_cpu --throughput-mode script.py args

2. Run single-instance inference on a single CPU node.

::

    python -m torch.backends.xeon.run_cpu --node-id 1 script.py args

Multi-instance inference
------------------------

1. Multi-instance
   By default this tool runs one process per node. If you want to set the number of instances and cores per instance,
   --ninstances and --ncores-per-instance should be set.

::

    python -m torch.backends.xeon.run_cpu -- python_script args

   e.g. on an Intel(R) Xeon(R) Scalable Processor with 14 instances, 4 cores per instance

::

    python -m torch.backends.xeon.run_cpu --ninstances 14 --ncores-per-instance 4 python_script args

2. Run a single instance among multiple instances.
   By default, all ninstances are run. If you want to independently run a single instance among ninstances, specify rank.

   e.g. run the 0th instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances (i.e., numactl -C 0-27)

::

    python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 0 python_script args

   e.g. run the 1st instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances (i.e., numactl -C 28-55)

::

    python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 1 python_script args

   e.g. run the 0th instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances, 2 cores per instance,
   using the first four cores (i.e., numactl -C 0-1)

::

    python -m torch.backends.xeon.run_cpu --core-list "0, 1, 2, 3" --ninstances 2 --ncores-per-instance 2
    --rank 0 python_script args

3. To look up what optional arguments this module offers:

::

    python -m torch.backends.xeon.run_cpu --help

Memory allocator
----------------

"--enable-tcmalloc" and "--enable-jemalloc" can be used to enable different memory allocators.
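
For example, to preload JeMalloc (assuming libjemalloc.so can be found in one of the searched
library paths, e.g. after "conda install -c conda-forge jemalloc"):

::

    python -m torch.backends.xeon.run_cpu --enable-jemalloc script.py args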

"""

import glob
import logging
import os
import platform
import re
import subprocess
import sys
from argparse import ArgumentParser, RawTextHelpFormatter, REMAINDER
from os.path import expanduser
from typing import Dict, List

from torch.distributed.elastic.multiprocessing import (
    DefaultLogsSpecs as _DefaultLogsSpecs,
    start_processes,
    Std,
)


format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=format_str)
logger = logging.getLogger(__name__)


class _CPUinfo:
    """Get CPU information, such as cores list and NUMA information."""

    def __init__(self, test_input=""):
        self.cpuinfo = []
        if platform.system() in ["Windows", "Darwin"]:
            raise RuntimeError(f"{platform.system()} is not supported!!!")
        elif platform.system() == "Linux":
            # Sample output of: `lscpu --parse=CPU,Core,Socket,Node`
            #
            # # The following is the parsable format, which can be fed to other
            # # programs. Each different item in every column has an unique ID
            # # starting from zero.
            # # CPU,Core,Socket,Node
            # 0,0,0,0
            # 1,1,0,0
            # ...
            if test_input == "":
                lscpu_cmd = ["lscpu", "--parse=CPU,Core,Socket,Node"]
                lscpu_info = subprocess.check_output(
                    lscpu_cmd, universal_newlines=True
                ).split("\n")
            else:
                lscpu_info = test_input.split("\n")

            # Get information about cpu, core, socket and node
            for line in lscpu_info:
                pattern = r"^([\d]+,[\d]+,[\d]+,[\d]?)"
                regex_out = re.search(pattern, line)
                if regex_out:
                    self.cpuinfo.append(regex_out.group(1).strip().split(","))

            # physical cores := Core column in lscpu output
            # logical cores  := CPU column in lscpu output
            self.node_nums = int(max(line[3] for line in self.cpuinfo)) + 1
            self.node_physical_cores: List[List[int]] = []  # node_id is index
            self.node_logical_cores: List[List[int]] = []  # node_id is index
            self.physical_core_node_map = {}  # physical core to numa node id
            self.logical_core_node_map = {}  # logical core to numa node id

            for node_id in range(self.node_nums):
                cur_node_physical_core = []
                cur_node_logical_core = []
                for cpuinfo in self.cpuinfo:
                    nid = cpuinfo[3] if cpuinfo[3] != "" else "0"
                    if node_id == int(nid):
                        if int(cpuinfo[1]) not in cur_node_physical_core:
                            cur_node_physical_core.append(int(cpuinfo[1]))
                            self.physical_core_node_map[int(cpuinfo[1])] = int(node_id)
                        cur_node_logical_core.append(int(cpuinfo[0]))
                        self.logical_core_node_map[int(cpuinfo[0])] = int(node_id)
                self.node_physical_cores.append(cur_node_physical_core)
                self.node_logical_cores.append(cur_node_logical_core)

    def _physical_core_nums(self):
        return len(self.node_physical_cores) * len(self.node_physical_cores[0])

    def _logical_core_nums(self):
        return len(self.node_logical_cores) * len(self.node_logical_cores[0])

    def get_node_physical_cores(self, node_id):
        if node_id < 0 or node_id > self.node_nums - 1:
            raise ValueError(
                f"Invalid node id: {node_id}. Valid node ids: {list(range(len(self.node_physical_cores)))}"
            )
        return self.node_physical_cores[node_id]

    def get_node_logical_cores(self, node_id):
        if node_id < 0 or node_id > self.node_nums - 1:
            raise ValueError(
                f"Invalid node id: {node_id}. Valid node ids: {list(range(len(self.node_physical_cores)))}"
            )
        return self.node_logical_cores[node_id]

    def get_all_physical_cores(self):
        all_cores = []
        for cores in self.node_physical_cores:
            all_cores.extend(cores)
        return all_cores

    def get_all_logical_cores(self):
        all_cores = []
        for cores in self.node_logical_cores:
            all_cores.extend(cores)
        return all_cores

    def numa_aware_check(self, core_list):
        """
        Check whether all cores in core_list are in the same NUMA node.

        Crossing NUMA nodes will reduce performance.
        We strongly advise against using cores on different nodes.
        """
        cores_numa_map = self.logical_core_node_map
        numa_ids = []
        for core in core_list:
            numa_id = cores_numa_map[core]
            if numa_id not in numa_ids:
                numa_ids.append(numa_id)
        if len(numa_ids) > 1:
            logger.warning(
                "Numa Aware: cores:%s on different NUMA nodes:%s. To avoid \
this behavior, please use --ncores-per-instance knob to make sure number of cores is divisible by --ncores-per-\
instance. Alternatively, please use --skip-cross-node-cores knob.",
                str(core_list),
                str(numa_ids),
            )
        if len(numa_ids) == 0:
            raise RuntimeError(
                "invalid number of NUMA nodes; please make sure numa_ids >= 1"
            )
        return numa_ids
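

# A minimal sketch (Linux only, illustrative values) of how _CPUinfo digests
# `lscpu --parse=CPU,Core,Socket,Node` output; on hyper-threaded machines two
# CPU rows share one Core id, which is how logical and physical cores diverge:
#
#     info = _CPUinfo(test_input="0,0,0,0\n1,1,0,0\n2,2,1,1\n3,3,1,1\n")
#     info.get_node_physical_cores(0)  # -> [0, 1]
#     info.get_all_logical_cores()     # -> [0, 1, 2, 3]
#     info.numa_aware_check([1, 2])    # -> [0, 1], logging a cross-NUMA warning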


class _Launcher:
    r"""Class for launcher."""

    msg_lib_notfound = f"Unable to find the {{0}} library file lib{{1}}.so in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib \
or {expanduser('~')}/.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64, \
so the LD_PRELOAD environment variable will not be set."

    def __init__(self) -> None:
        self.cpuinfo = _CPUinfo()

    def add_lib_preload(self, lib_type):
        """Enable TCMalloc/JeMalloc/Intel OpenMP."""
        library_paths = []
        if "CONDA_PREFIX" in os.environ:
            library_paths.append(f"{os.environ['CONDA_PREFIX']}/lib")
        if "VIRTUAL_ENV" in os.environ:
            library_paths.append(f"{os.environ['VIRTUAL_ENV']}/lib")

        library_paths += [
            f"{expanduser('~')}/.local/lib",
            "/usr/local/lib",
            "/usr/local/lib64",
            "/usr/lib",
            "/usr/lib64",
        ]

        lib_find = False
        lib_set = False
        for item in os.getenv("LD_PRELOAD", "").split(":"):
            if item.endswith(f"lib{lib_type}.so"):
                lib_set = True
                break
        if not lib_set:
            for lib_path in library_paths:
                library_file = os.path.join(lib_path, f"lib{lib_type}.so")
                matches = glob.glob(library_file)
                if len(matches) > 0:
                    ld_preloads = [f"{matches[0]}", os.getenv("LD_PRELOAD", "")]
                    os.environ["LD_PRELOAD"] = os.pathsep.join(
                        [p.strip(os.pathsep) for p in ld_preloads if p]
                    )
                    lib_find = True
                    break
        return lib_set or lib_find
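
    # E.g., if libiomp5.so is found under $CONDA_PREFIX/lib, the result is
    # LD_PRELOAD="/opt/conda/lib/libiomp5.so:<previous LD_PRELOAD>" (the path
    # shown is illustrative); an already-preloaded library is left untouched.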

    def is_numactl_available(self):
        numactl_available = False
        try:
            cmd = ["numactl", "-C", "0", "-m", "0", "hostname"]
            r = subprocess.run(
                cmd,
                env=os.environ,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
            if r.returncode == 0:
                numactl_available = True
        except Exception:
            pass
        return numactl_available

    def set_memory_allocator(
        self, enable_tcmalloc=True, enable_jemalloc=False, use_default_allocator=False
    ):
        """
        Enable TCMalloc/JeMalloc with LD_PRELOAD and set configuration for JeMalloc.

        By default, PTMalloc will be used for PyTorch, but TCMalloc and JeMalloc can get better
        memory reuse and reduce page faults to improve performance.
        """
        if enable_tcmalloc and enable_jemalloc:
            raise RuntimeError(
                "Unable to enable TCMalloc and JeMalloc at the same time."
            )

        if enable_tcmalloc:
            find_tc = self.add_lib_preload(lib_type="tcmalloc")
            if not find_tc:
                msg = f'{self.msg_lib_notfound} you can use "conda install -c conda-forge gperftools" to install {{0}}'
                logger.warning(msg.format("TCMalloc", "tcmalloc"))  # noqa: G001
            else:
                logger.info("Use TCMalloc memory allocator")

        elif enable_jemalloc:
            find_je = self.add_lib_preload(lib_type="jemalloc")
            if not find_je:
                msg = f'{self.msg_lib_notfound} you can use "conda install -c conda-forge jemalloc" to install {{0}}'
                logger.warning(msg.format("JeMalloc", "jemalloc"))  # noqa: G001
            else:
                logger.info("Use JeMalloc memory allocator")
                self.set_env(
                    "MALLOC_CONF",
                    "oversize_threshold:1,background_thread:true,metadata_thp:auto",
                )

        elif use_default_allocator:
            pass

        else:
            find_tc = self.add_lib_preload(lib_type="tcmalloc")
            if find_tc:
                logger.info("Use TCMalloc memory allocator")
                return
            find_je = self.add_lib_preload(lib_type="jemalloc")
            if find_je:
                logger.info("Use JeMalloc memory allocator")
                return
            logger.warning(
                """Neither TCMalloc nor JeMalloc is found in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib
            or %s/.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64,
            so the LD_PRELOAD environment variable will not be set. This may degrade performance.""",
                expanduser("~"),
            )

    def log_env_var(self, env_var_name=""):
        if env_var_name in os.environ:
            logger.info("%s=%s", env_var_name, os.environ[env_var_name])

    def set_env(self, env_name, env_value):
        if not env_value:
            logger.warning("%s is None", env_name)
        if env_name not in os.environ:
            os.environ[env_name] = env_value
        elif os.environ[env_name] != env_value:
            logger.warning(
                "Overriding value with the one set in environment variable: %s. \
Value applied: %s. Value ignored: %s",
                env_name,
                os.environ[env_name],
                env_value,
            )
        self.log_env_var(env_name)
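
    # E.g., if the user exported OMP_NUM_THREADS=8 beforehand, a later call
    # set_env("OMP_NUM_THREADS", "4") keeps 8 and logs that 4 was ignored
    # (values purely illustrative); this is how pre-set variables are respected.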
415 """ 416 self.set_memory_allocator( 417 enable_tcmalloc, enable_jemalloc, use_default_allocator 418 ) 419 self.set_env("OMP_NUM_THREADS", str(ncores_per_instance)) 420 if not disable_iomp: 421 find_iomp = self.add_lib_preload(lib_type="iomp5") 422 if not find_iomp: 423 msg = f'{self.msg_lib_notfound} you can use "conda install mkl" to install {{0}}' 424 logger.warning(msg.format("iomp", "iomp5")) # noqa: G001 425 else: 426 logger.info("Using Intel OpenMP") 427 if set_kmp_affinity: 428 self.set_env("KMP_AFFINITY", "granularity=fine,compact,1,0") 429 self.set_env("KMP_BLOCKTIME", "1") 430 self.log_env_var("LD_PRELOAD") 431 432 r""" 433 Launcher for single instance and multi-instance 434 """ 435 436 def launch(self, args): 437 cores = [] 438 set_kmp_affinity = True 439 enable_taskset = False 440 if args.core_list: # user specify what cores will be used by params 441 cores = [int(x) for x in args.core_list.split(",")] 442 if args.ncores_per_instance == -1: 443 raise RuntimeError( 444 'please specify the "--ncores-per-instance" if you have pass the --core-list params' 445 ) 446 elif ( 447 args.ninstances > 1 448 and args.ncores_per_instance * args.ninstances < len(cores) 449 ): 450 logger.warning( 451 "only first %s cores will be used, \ 452but you specify %s cores in core_list", 453 args.ncores_per_instance * args.ninstances, 454 len(cores), 455 ) 456 else: 457 args.ninstances = len(cores) // args.ncores_per_instance 458 459 else: 460 if args.use_logical_core: 461 if args.node_id != -1: 462 cores = self.cpuinfo.get_node_logical_cores(args.node_id) 463 else: 464 cores = self.cpuinfo.get_all_logical_cores() 465 # When using all cores on all nodes, including logical cores, 466 # setting KMP_AFFINITY disables logical cores. Thus, KMP_AFFINITY should not be set. 467 set_kmp_affinity = False 468 else: 469 if args.node_id != -1: 470 cores = self.cpuinfo.get_node_physical_cores(args.node_id) 471 else: 472 cores = self.cpuinfo.get_all_physical_cores() 473 if ( 474 not args.multi_instance 475 and args.ninstances == -1 476 and args.ncores_per_instance == -1 477 ): 478 args.ninstances = 1 479 args.ncores_per_instance = len(cores) 480 elif ( 481 args.multi_instance 482 and args.ninstances == -1 483 and args.ncores_per_instance == -1 484 ): 485 args.throughput_mode = True 486 elif args.ncores_per_instance == -1 and args.ninstances != -1: 487 if args.ninstances > len(cores): 488 raise RuntimeError( 489 f"there are {len(cores)} total cores but you specify {args.ninstances} ninstances; \ 490please make sure ninstances <= total_cores)" 491 ) 492 else: 493 args.ncores_per_instance = len(cores) // args.ninstances 494 elif args.ncores_per_instance != -1 and args.ninstances == -1: 495 if not args.skip_cross_node_cores: 496 args.ninstances = len(cores) // args.ncores_per_instance 497 else: 498 ncore_per_node = len(self.cpuinfo.node_physical_cores[0]) 499 num_leftover_cores = ncore_per_node % args.ncores_per_instance 500 if args.ncores_per_instance > ncore_per_node: 501 # too many ncores_per_instance to skip cross-node cores 502 logger.warning( 503 "there are %s core(s) per socket, but you specify %s ncores_per_instance and \ 504skip_cross_node_cores. Please make sure --ncores-per-instance < core(s) per \ 505socket", 506 ncore_per_node, 507 args.ncores_per_instance, 508 ) 509 sys.exit(-1) 510 elif num_leftover_cores == 0: 511 # aren't any cross-node cores 512 logger.info( 513 "--skip-cross-node-cores is set, but there are no cross-node cores." 
        for i in range(args.ninstances):
            cmd = []
            cur_process_cores = ""
            if not args.disable_numactl or enable_taskset:
                if not args.disable_numactl:
                    cmd = ["numactl"]
                elif enable_taskset:
                    cmd = ["taskset"]
                cores = sorted(cores)
                if (
                    args.rank == -1
                ):  # sequentially assign ncores_per_instance to ninstances
                    core_list = cores[
                        i
                        * args.ncores_per_instance : (i + 1)
                        * args.ncores_per_instance
                    ]
                else:  # assign ncores_per_instance from rank
                    core_list = cores[
                        args.rank
                        * args.ncores_per_instance : (args.rank + 1)
                        * args.ncores_per_instance
                    ]

                core_ranges: List[Dict] = []
                if local_size > 1:
                    total_num_cores = len(core_list)
                    cores_per_rank = total_num_cores // local_size
                    assert (
                        cores_per_rank >= 1
                    ), "At least one core needs to be assigned to each rank"
                    core_list = core_list[
                        cores_per_rank * local_rank : cores_per_rank * (local_rank + 1)
                    ]
                for core in core_list:
                    if len(core_ranges) == 0:
                        range_elem = {"start": core, "end": core}
                        core_ranges.append(range_elem)
                    else:
                        if core - core_ranges[-1]["end"] == 1:
                            core_ranges[-1]["end"] = core
                        else:
                            range_elem = {"start": core, "end": core}
                            core_ranges.append(range_elem)
                for r in core_ranges:
                    cur_process_cores = f"{cur_process_cores}{r['start']}-{r['end']},"
                cur_process_cores = cur_process_cores[:-1]
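                # E.g., a core_list of [0, 1, 2, 5] collapses to "0-2,5-5", the
                # range syntax that `numactl -C` / `taskset -c` accept below.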
                if not args.disable_numactl:
                    numa_params = f"-C {cur_process_cores} "
                    numa_ids = ",".join(
                        [
                            str(numa_id)
                            for numa_id in self.cpuinfo.numa_aware_check(core_list)
                        ]
                    )
                    numa_params += f"-m {numa_ids}"
                    cmd.extend(numa_params.split())
                elif enable_taskset:
                    taskset_params = f"-c {cur_process_cores} "
                    cmd.extend(taskset_params.split())
            with_python = not args.no_python
            if with_python:
                cmd.append(sys.executable)
                cmd.append("-u")
                if args.module:
                    cmd.append("-m")
            cmd.append(args.program)
            cmd.extend(args.program_args)
            cmd_s = " ".join(cmd)
            logger.info(cmd_s)
            if entrypoint == "":
                entrypoint = cmd[0]
            del cmd[0]
            launch_args[i] = tuple(cmd)
            launch_envs[i] = {}
            launch_tee[i] = Std.ALL

            if args.rank != -1:  # launch a single instance (the given rank) only
                break

        ctx = start_processes(
            name=args.log_file_prefix,
            entrypoint=entrypoint,
            args=launch_args,
            envs=launch_envs,
            logs_specs=_DefaultLogsSpecs(log_dir=args.log_path, tee=launch_tee),
        )
        ctx.wait()


def _add_memory_allocator_params(parser):
    group = parser.add_argument_group("Memory Allocator Parameters")
    # allocator control
    group.add_argument(
        "--enable-tcmalloc",
        "--enable_tcmalloc",
        action="store_true",
        default=False,
        help="Enable tcmalloc allocator",
    )
    group.add_argument(
        "--enable-jemalloc",
        "--enable_jemalloc",
        action="store_true",
        default=False,
help="Enable jemalloc allocator", 708 ) 709 group.add_argument( 710 "--use-default-allocator", 711 "--use_default_allocator", 712 action="store_true", 713 default=False, 714 help="Use default memory allocator", 715 ) 716 717 718def _add_multi_instance_params(parser): 719 group = parser.add_argument_group("Multi-instance Parameters") 720 # multi-instance control 721 group.add_argument( 722 "--ncores-per-instance", 723 "--ncores_per_instance", 724 metavar="\b", 725 default=-1, 726 type=int, 727 help="Cores per instance", 728 ) 729 group.add_argument( 730 "--ninstances", 731 metavar="\b", 732 default=-1, 733 type=int, 734 help="For multi-instance, you should give the cores number you used for per instance.", 735 ) 736 group.add_argument( 737 "--skip-cross-node-cores", 738 "--skip_cross_node_cores", 739 action="store_true", 740 default=False, 741 help="If specified --ncores-per-instance, skips cross-node cores.", 742 ) 743 group.add_argument( 744 "--rank", 745 metavar="\b", 746 default="-1", 747 type=int, 748 help="Specify instance index to assign ncores_per_instance for rank; \ 749otherwise ncores_per_instance will be assigned sequentially to ninstances. Please refer to \ 750https://github.com/intel/intel-extension-for-pytorch/blob/master/docs/tutorials/performance_tuning/launch_script.md", 751 ) 752 group.add_argument( 753 "--latency-mode", 754 "--latency_mode", 755 action="store_true", 756 default=False, 757 help="By default 4 core per instance and use all physical cores", 758 ) 759 group.add_argument( 760 "--throughput-mode", 761 "--throughput_mode", 762 action="store_true", 763 default=False, 764 help="By default one instance per node and use all physical cores", 765 ) 766 group.add_argument( 767 "--node-id", 768 "--node_id", 769 metavar="\b", 770 default=-1, 771 type=int, 772 help="node id for multi-instance, by default all nodes will be used", 773 ) 774 group.add_argument( 775 "--use-logical-core", 776 "--use_logical_core", 777 action="store_true", 778 default=False, 779 help="Whether only use physical cores", 780 ) 781 group.add_argument( 782 "--disable-numactl", 783 "--disable_numactl", 784 action="store_true", 785 default=False, 786 help="Disable numactl", 787 ) 788 group.add_argument( 789 "--disable-taskset", 790 "--disable_taskset", 791 action="store_true", 792 default=False, 793 help="Disable taskset", 794 ) 795 group.add_argument( 796 "--core-list", 797 "--core_list", 798 metavar="\b", 799 default=None, 800 type=str, 801 help='Specify the core list as "core_id, core_id, ....", otherwise, all the cores will be used.', 802 ) 803 group.add_argument( 804 "--log-path", 805 "--log_path", 806 metavar="\b", 807 default="", 808 type=str, 809 help="The log file directory. Default path is " 810 ", which means disable logging to files.", 811 ) 812 group.add_argument( 813 "--log-file-prefix", 814 "--log_file_prefix", 815 metavar="\b", 816 default="run", 817 type=str, 818 help="log file prefix", 819 ) 820 821 822def _add_kmp_iomp_params(parser): 823 group = parser.add_argument_group("IOMP Parameters") 824 group.add_argument( 825 "--disable-iomp", 826 "--disable_iomp", 827 action="store_true", 828 default=False, 829 help="By default, we use Intel OpenMP and libiomp5.so will be add to LD_PRELOAD", 830 ) 831 832 833def create_args(parser=None): 834 """ 835 Parse the command line options. 


def main(args):
    env_before = set(os.environ.keys())
    if platform.system() in ["Windows", "Darwin"]:
        raise RuntimeError(f"{platform.system()} is not supported!!!")

    if args.log_path:
        os.makedirs(args.log_path, exist_ok=True)
    else:
        args.log_path = os.devnull

    if args.latency_mode and args.throughput_mode:
        raise RuntimeError(
            "Only one of --latency-mode and --throughput-mode can be set"
        )

    if not args.no_python and not args.program.endswith(".py"):
        raise RuntimeError(
            'For a non-Python script, you should use the "--no-python" parameter.'
        )

    # Verify LD_PRELOAD
    if "LD_PRELOAD" in os.environ:
        lst_valid = []
        tmp_ldpreload = os.environ["LD_PRELOAD"]
        for item in tmp_ldpreload.split(":"):
            matches = glob.glob(item)
            if len(matches) > 0:
                lst_valid.append(item)
            else:
                logger.warning("%s doesn't exist. Removing it from LD_PRELOAD.", item)
        if len(lst_valid) > 0:
            os.environ["LD_PRELOAD"] = ":".join(lst_valid)
        else:
            os.environ["LD_PRELOAD"] = ""

    launcher = _Launcher()
    launcher.launch(args)
    for x in sorted(set(os.environ.keys()) - env_before):
        logger.debug("%s=%s", x, os.environ[x])
multi-instance \n" 936 "\n >>> python -m torch.backends.xeon.run_cpu --ninstances xxx " 937 "--ncores-per-instance xx python_script args\n" 938 "\n############################################################################# \n", 939 formatter_class=RawTextHelpFormatter, 940 ) 941 create_args(parser) 942 args = parser.parse_args() 943 main(args) 944