import argparse
import logging
import os

import torch
import torch.distributed as c10d


logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Simple script to simulate NCCL errors. The script is "
        "supposed to be run on multiple different nodes simultaneously with "
        "appropriate rank and world_size. The script runs an allreduce() on "
        "the rank 0 node and aborts all the other nodes to simulate an error "
        "in NCCL."
    )
    parser.add_argument("addr", help="address of the master node to connect to.")
    parser.add_argument("port", help="port of the master node to connect to.")
    parser.add_argument("rank", help="rank of this node.")
    parser.add_argument("world_size", help="number of nodes in the process group.")
    args = parser.parse_args()
    rank = int(args.rank)
    world_size = int(args.world_size)
    port = int(args.port)

    # Rank 0 hosts the TCPStore that all other ranks connect to for rendezvous.
    store = c10d.TCPStore(args.addr, port, world_size, rank == 0)
    process_group = c10d.ProcessGroupNCCL(store, rank, world_size)

    # All ranks participate in the first allreduce, which should succeed.
    logging.info("Running first allreduce")
    process_group.allreduce(torch.rand(10).cuda(rank)).wait()

    if rank == 0:
        # Only rank 0 issues a second allreduce; since every other rank has
        # aborted, this collective cannot complete normally.
        logging.info("Running second allreduce only on rank 0")
        work = process_group.allreduce(torch.rand(10).cuda(rank))
        logging.info("Waiting for allreduce to complete...")
        work.wait()
        logging.info("Second allreduce successful: %s", work.is_success())
    else:
        # Kill this process abruptly so rank 0 observes a NCCL failure.
        logging.info("Aborting all other ranks.")
        os.abort()
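# Example invocation with a world_size of 2 (illustrative only: the host name,
# port, and the assumption that this file is saved as simulate_nccl_errors.py
# are not prescribed by the script itself):
#
#   on the master node:  python simulate_nccl_errors.py master-host 29500 0 2
#   on the worker node:  python simulate_nccl_errors.py master-host 29500 1 2
#
# After the first allreduce completes on both nodes, rank 1 calls os.abort(),
# so rank 0's second allreduce exercises NCCL's error/timeout handling instead
# of completing normally.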