import argparse
import sys
import timeit

import torch
from torch.utils.benchmark import Timer


PARALLEL_TASKS_NUM = 4
# Total number of torch.mm iterations per run; set from --internal-iter at startup.
INTERNAL_ITER = None


def loop_workload(x):
    # Sequential workload: chain INTERNAL_ITER square matrix multiplications.
    for i in range(INTERNAL_ITER):
        x = torch.mm(x, x)
    return x


def parallel_workload(x):
    # Parallel workload: split the iterations evenly across PARALLEL_TASKS_NUM
    # interop tasks forked via torch.jit._fork.
    def parallel_task(x):
        for i in range(INTERNAL_ITER // PARALLEL_TASKS_NUM):
            x = torch.mm(x, x)
        return x

    futs = []
    for i in range(PARALLEL_TASKS_NUM):
        futs.append(torch.jit._fork(parallel_task, x))
    for i in range(PARALLEL_TASKS_NUM):
        torch.jit._wait(futs[i])
    # The forked results are only waited on, not collected; the input tensor
    # is returned unchanged.
    return x


if __name__ == "__main__":
    # Disable graph executor optimizations so traced workloads behave
    # consistently across runs.
    torch._C._set_graph_executor_optimize(False)
    parser = argparse.ArgumentParser(description="Profiler benchmark")

    parser.add_argument("--with-cuda", "--with_cuda", action="store_true")
    parser.add_argument("--with-stack", "--with_stack", action="store_true")
    parser.add_argument("--use-script", "--use_script", action="store_true")
    parser.add_argument("--use-kineto", "--use_kineto", action="store_true")
    parser.add_argument(
        "--profiling-tensor-size", "--profiling_tensor_size", default=1, type=int
    )
    parser.add_argument("--workload", default="loop", type=str)
    parser.add_argument("--internal-iter", "--internal_iter", default=256, type=int)
    parser.add_argument(
        "--timer-min-run-time", "--timer_min_run_time", default=10, type=int
    )
    parser.add_argument("--cuda-only", "--cuda_only", action="store_true")

    args = parser.parse_args()

    if args.with_cuda and not torch.cuda.is_available():
        print("No CUDA available")
        sys.exit()

    print(
        f"Payload: {args.workload}, {args.internal_iter} iterations; "
        f"timer min. runtime = {args.timer_min_run_time}\n"
    )
    INTERNAL_ITER = args.internal_iter

    # Time each configuration twice: once without the profiler (baseline) and
    # once with it, so the difference shows the profiler's overhead.
    for profiling_enabled in [False, True]:
        print(
            "Profiling {}, tensor size {}x{}, use cuda: {}, use kineto: {}, "
            "with stacks: {}, use script: {}".format(
                "enabled" if profiling_enabled else "disabled",
                args.profiling_tensor_size,
                args.profiling_tensor_size,
                args.with_cuda,
                args.use_kineto,
                args.with_stack,
                args.use_script,
            )
        )

        input_x = torch.rand(args.profiling_tensor_size, args.profiling_tensor_size)

        if args.with_cuda:
            input_x = input_x.cuda()

        assert args.workload in ["loop", "parallel"]
        if args.workload == "loop":
            workload = loop_workload
        else:
            workload = parallel_workload

        if args.use_script:
            workload = torch.jit.trace(workload, (input_x,))

        if profiling_enabled:

            def payload():
                # The profiling results are discarded; only the cost of
                # running the workload under the profiler is measured.
                with torch.autograd.profiler.profile(
                    use_cuda=args.with_cuda,
                    with_stack=args.with_stack,
                    use_kineto=args.use_kineto,
                    use_cpu=not args.cuda_only,
                ) as prof:
                    x = workload(input_x)
                return x

        else:

            def payload():
                return workload(input_x)

        t = Timer(
            "payload()",
            globals={"payload": payload},
            timer=timeit.default_timer,
        ).blocked_autorange(min_run_time=args.timer_min_run_time)
        print(t)
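
# ---------------------------------------------------------------------------
# Example invocation (a sketch; the file name "profiler_bench.py" is an
# assumption, not given in the source -- all flags come from the parser above):
#
#   python profiler_bench.py --workload parallel --internal-iter 512 \
#       --use-kineto --with-stack
#
# This prints two Timer reports per configuration: a baseline run with the
# profiler disabled, then the same workload under torch.autograd.profiler,
# so comparing the two gives the profiler's overhead.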