# external/pytorch/test/simulate_nccl_errors.py
import argparse
import logging
import os

import torch
import torch.distributed as c10d


logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
)

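# A minimal launch sketch (hypothetical master address/port and a two-node
# setup; substitute your own values for addr, port, rank, and world_size):
#
#   node 0: python simulate_nccl_errors.py 10.0.0.1 29500 0 2
#   node 1: python simulate_nccl_errors.py 10.0.0.1 29500 1 2
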
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Simple script to simulate NCCL errors. The script is "
        "supposed to be run on multiple different nodes simultaneously with "
        "appropriate rank and world_size. The script runs an allreduce() on "
        "the rank 0 node and aborts all the other nodes to simulate an error "
        "in NCCL."
    )
    parser.add_argument("addr", help="address of the master node to connect to.")
    parser.add_argument("port", help="port of the master node to connect to.")
    parser.add_argument("rank", help="rank of this node")
    parser.add_argument("world_size", help="number of nodes in process group")
    args = parser.parse_args()
    rank = int(args.rank)
    world_size = int(args.world_size)
    port = int(args.port)

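    # Rank 0 (is_master=True) hosts the TCPStore that all ranks use to rendezvous.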
    store = c10d.TCPStore(args.addr, port, world_size, rank == 0)
    process_group = c10d.ProcessGroupNCCL(store, rank, world_size)
    logging.info("Running first allreduce")
    process_group.allreduce(torch.rand(10).cuda(rank)).wait()
    if rank == 0:
        logging.info("Running second allreduce only on rank 0")
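        # The other ranks abort below, so this collective is expected to hang or
        # surface a NCCL error instead of completing normally.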
        work = process_group.allreduce(torch.rand(10).cuda(rank))
        logging.info("Waiting for allreduce to complete...")
        work.wait()
        logging.info("Second allreduce successful: %s", work.is_success())
    else:
        logging.info("Aborting all other ranks.")
        os.abort()