# Sparse benchmarks

# These benchmarks measure sparse matmul performance. They compare the
# sparse matrix routines `sparse @ vector`, `sparse @ sparse` and
# `sparse @ dense` across backends (CPU/CUDA) and against other
# frameworks such as scipy.

import argparse
import os
import sys

from scipy.sparse import isspmatrix

import torch
import torch.utils.benchmark as benchmark_utils

from .utils import load_dlmc_dataset


def scipy_matmul(mat1, mat2):
    # scipy returns a sparse matrix when both inputs are sparse; convert the
    # result to COO so it matches the layout used by the torch sparse tasks.
    if isspmatrix(mat1) and isspmatrix(mat2):
        return mat1.dot(mat2).tocoo()
    return mat1.dot(mat2)


def matmul_backward(a_dense, b_dense, grad_output):
    # Dense forward + backward, used as the baseline in the backward test.
    r1 = a_dense.matmul(b_dense)
    r1.backward(grad_output)


def sparse_matmul_backward(a, b, grad_output):
    # Sparse forward + backward via torch.sparse.mm.
    c = torch.sparse.mm(a, b)
    c.backward(grad_output)


# Maps each benchmarked operation to the torch entry point used in its
# forward statement.
OPS_MAP = {
    "sparse@sparse": "torch.sparse.mm",
    "sparse@dense": "torch.matmul",
    "sparse@vector": "torch.matmul",
}


# Read the benchmark configuration from the command line using `argparse`.
def parse_args():
    parser = argparse.ArgumentParser(description="matmul benchmark")
    parser.add_argument("--path", type=str, help="DLMC dataset path")
    parser.add_argument("--dataset", type=str, default="magnitude_pruning")
    parser.add_argument("--hidden-size", "--hidden_size", default=2048, type=int)
    parser.add_argument("--backward-test", "--backward_test", action="store_true")
    parser.add_argument(
        "--operation",
        type=str,
        help="|".join(OPS_MAP.keys()),
        default=next(iter(OPS_MAP)),
    )
    parser.add_argument("--with-cuda", "--with_cuda", action="store_true")
    parser.add_argument(
        "--timer-min-run-time", "--timer_min_run_time", default=1, type=float
    )
    return parser


def get_tasks(op, backward_test, device):
    # Build (label, env, sub_label, stmt) tuples for the requested operation.
    # Every sparse statement is paired with its dense counterpart, and a scipy
    # baseline is added for forward tests on CPU.
    def filter_ops(operation):
        if backward_test:
            test_name = device + ":matmul-backward"
            return [
                (
                    test_name,
                    device,
                    "torch:" + operation.replace("sparse", "dense"),
                    "matmul_backward(dx, dy, grad_output)",
                ),
                (
                    test_name,
                    device,
                    "torch:" + operation,
                    "sparse_matmul_backward(x, y, sparse_grad_output)",
                ),
            ]
        else:
            test_name = device + ":matmul-forward"
            return list(
                filter(
                    None,
                    [
                        (
                            test_name,
                            device,
                            "torch:" + operation.replace("sparse", "dense"),
                            f"{OPS_MAP[operation]}(dx, dy)",
                        ),
                        (
                            test_name,
                            device,
                            "torch:" + operation,
                            f"{OPS_MAP[operation]}(x, y)",
                        ),
                        (
                            test_name,
                            device,
                            "scipy:" + operation,
                            "scipy_matmul(sx, sy)",
                        )
                        if device == "cpu"
                        else None,
                    ],
                )
            )

    all_operations = {
        "sparse@sparse": filter_ops("sparse@sparse"),
        "sparse@dense": filter_ops("sparse@dense"),
        "sparse@vector": filter_ops("sparse@vector"),
    }
    return all_operations[op]


if __name__ == "__main__":
    parser = parse_args()
    args = parser.parse_args()

    if args.with_cuda and not torch.cuda.is_available():
        raise RuntimeError("No CUDA available")

    dataset_path = args.path
    dataset_name = args.dataset
    dataset_path = os.path.join(dataset_path, dataset_name)
    device = "cuda" if args.with_cuda else "cpu"

    tasks = get_tasks(args.operation, args.backward_test, device)
    repeats = 3
    # `load_dlmc_dataset` yields one dict of operands per matrix in the
    # dataset; the dict supplies the `x`, `y`, `dx`, `dy`, `sx`, `sy` and
    # grad-output variables referenced by the task statements.
    timers = [
        benchmark_utils.Timer(
            stmt=stmt,
            globals={
                "scipy_matmul": scipy_matmul,
                "matmul_backward": matmul_backward,
                "sparse_matmul_backward": sparse_matmul_backward,
                **variables,
            },
            label=label,
            sub_label=sub_label,
            description=f"{sparsity}",
            env=device,
        )
        for sparsity in [0.5, 0.7, 0.8, 0.9, 0.95, 0.98]
        for label, device, sub_label, stmt in tasks
        for variables in load_dlmc_dataset(
            dataset_path,
            args.operation,
            args.hidden_size,
            sparsity,
            device,
            args.backward_test,
        )
    ]
    measurements = []
    for i, timer in enumerate(timers * repeats):
        m = timer.blocked_autorange(min_run_time=args.timer_min_run_time)
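        # Record in the measurement metadata which device the timer ran on.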
"cuda" if m.task_spec.env.find("cuda") >= 0 else "cpu"} measurements.append(m) print(f"\r{i + 1} / {len(timers) * repeats}", end="") sys.stdout.flush() print() comparison = benchmark_utils.Compare(measurements) print("== Results " + "=" * 80 + "\n" + "/" * 95 + "\n") comparison.print()