xref: /aosp_15_r20/external/pytorch/tools/stats/monitor.py (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1*da0073e9SAndroid Build Coastguard Worker#!/usr/bin/env python3
2*da0073e9SAndroid Build Coastguard Worker
3*da0073e9SAndroid Build Coastguard Workerfrom __future__ import annotations
4*da0073e9SAndroid Build Coastguard Worker
5*da0073e9SAndroid Build Coastguard Workerimport datetime
6*da0073e9SAndroid Build Coastguard Workerimport json
7*da0073e9SAndroid Build Coastguard Workerimport signal
8*da0073e9SAndroid Build Coastguard Workerimport time
9*da0073e9SAndroid Build Coastguard Workerfrom typing import Any
10*da0073e9SAndroid Build Coastguard Worker
11*da0073e9SAndroid Build Coastguard Workerimport psutil  # type: ignore[import]
12*da0073e9SAndroid Build Coastguard Worker
13*da0073e9SAndroid Build Coastguard Worker
def get_processes_running_python_tests() -> list[Any]:
    """Collect the running processes that look like Python invocations.

    A process is kept when its name contains ``"python"`` and it reports a
    non-empty command line.

    Returns:
        The matching ``psutil`` process objects, in iteration order.
    """
    matches = []
    for candidate in psutil.process_iter():
        try:
            # The command line is only queried once the name has matched.
            if "python" not in candidate.name():
                continue
            if candidate.cmdline():
                matches.append(candidate)
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process exited, or we may not inspect it: skip it.
            continue
    return matches
24*da0073e9SAndroid Build Coastguard Worker
25*da0073e9SAndroid Build Coastguard Worker
def get_per_process_cpu_info() -> list[dict[str, Any]]:
    """Sample CPU and memory usage of every running Python process.

    Returns:
        One dict per process with the keys ``pid``, ``cmd``, ``cpu_percent``
        and ``rss_memory``. ``uss_memory`` is added when the detailed memory
        query succeeds, and ``pss_memory`` when the result exposes a ``pss``
        field. Processes that exit or become inaccessible while the basic
        figures are being read are left out, so one short-lived process does
        not invalidate the whole sample.
    """
    per_process_info = []
    for p in get_processes_running_python_tests():
        try:
            info = {
                "pid": p.pid,
                "cmd": " ".join(p.cmdline()),
                "cpu_percent": p.cpu_percent(),
                "rss_memory": p.memory_info().rss,
            }
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process went away (or became inaccessible) after it was
            # enumerated; there is nothing meaningful to report for it.
            continue

        # https://psutil.readthedocs.io/en/latest/index.html?highlight=memory_full_info
        # requires higher user privileges and could throw AccessDenied error, i.e. mac
        try:
            memory_full_info = p.memory_full_info()
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            # It's ok to skip the detailed figures and keep the basic ones.
            pass
        else:
            info["uss_memory"] = memory_full_info.uss
            # memory_full_info is a namedtuple, so `"pss" in memory_full_info`
            # would compare against its *values*; look the field up by name.
            pss = getattr(memory_full_info, "pss", None)
            if pss is not None:
                # only available on linux
                info["pss_memory"] = pss

        per_process_info.append(info)
    return per_process_info
53*da0073e9SAndroid Build Coastguard Worker
54*da0073e9SAndroid Build Coastguard Worker
def get_per_process_gpu_info(handle: Any) -> list[dict[str, Any]]:
    """List GPU memory usage of the compute processes on an NVML device.

    Args:
        handle: Device handle passed to
            ``pynvml.nvmlDeviceGetComputeRunningProcesses``.

    Returns:
        One ``{"pid": ..., "gpu_memory": ...}`` dict per reported process.
    """
    return [
        {"pid": proc.pid, "gpu_memory": proc.usedGpuMemory}
        for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    ]
62*da0073e9SAndroid Build Coastguard Worker
63*da0073e9SAndroid Build Coastguard Worker
def rocm_get_per_process_gpu_info(handle: Any) -> list[dict[str, Any]]:
    """List GPU memory usage of the processes amdsmi reports for a device.

    Args:
        handle: Device handle passed to ``amdsmi.amdsmi_get_gpu_process_list``.

    Returns:
        One ``{"pid": ..., "gpu_memory": ...}`` dict per reported process,
        where ``gpu_memory`` is the ``vram_mem`` entry of ``memory_usage``.
    """
    results = []
    for entry in amdsmi.amdsmi_get_gpu_process_list(handle):
        try:
            details = amdsmi.amdsmi_get_gpu_process_info(handle, entry)
        except AttributeError:
            # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
            # BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi;
            # in that case the list entry itself is used as the process info.
            details = entry
        pid = details["pid"]
        vram = details["memory_usage"]["vram_mem"]
        results.append({"pid": pid, "gpu_memory": vram})
    return results
80*da0073e9SAndroid Build Coastguard Worker
81*da0073e9SAndroid Build Coastguard Worker
if __name__ == "__main__":
    # Script entry point: once per second, print one JSON line with CPU (and,
    # when available, GPU) usage until SIGTERM is received.
    #
    # NOTE: this deliberately stays at module scope instead of a main()
    # function: the GPU helper functions above look up `pynvml` / `amdsmi` as
    # module globals, and those names are bound by the imports below.
    handle = None
    try:
        import pynvml  # type: ignore[import]

        try:
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        except pynvml.NVMLError:
            pass
    except ModuleNotFoundError:
        # no pynvml available, probably because not cuda
        pass

    # Must be initialised up front: when amdsmi is missing or fails to
    # initialise, the check in the loop below would otherwise raise NameError
    # and every sample would degrade into an error record.
    amdsmi_handle = None
    try:
        import amdsmi  # type: ignore[import]

        try:
            amdsmi.amdsmi_init()
            amdsmi_handle = amdsmi.amdsmi_get_processor_handles()[0]
        except (amdsmi.AmdSmiException, IndexError):
            # init failed, or no processor handle was reported
            pass
    except ModuleNotFoundError:
        # no amdsmi is available
        pass

    kill_now = False

    def exit_gracefully(*args: Any) -> None:
        """Signal handler: ask the sampling loop to stop after this round."""
        global kill_now
        kill_now = True

    def utc_timestamp() -> str:
        """Return the current UTC time as a naive ISO-8601 string plus "Z"."""
        # Same text as the deprecated datetime.utcnow().isoformat("T") + "Z":
        # take an aware UTC time and drop the tzinfo so no "+00:00" is emitted.
        now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
        return now.isoformat("T") + "Z"

    signal.signal(signal.SIGTERM, exit_gracefully)

    while not kill_now:
        try:
            stats = {
                "time": utc_timestamp(),
                "total_cpu_percent": psutil.cpu_percent(),
                "per_process_cpu_info": get_per_process_cpu_info(),
            }
            if handle is not None:
                stats["per_process_gpu_info"] = get_per_process_gpu_info(handle)
                # https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html
                gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                stats["total_gpu_utilization"] = gpu_utilization.gpu
                stats["total_gpu_mem_utilization"] = gpu_utilization.memory
            if amdsmi_handle is not None:
                stats["per_process_gpu_info"] = rocm_get_per_process_gpu_info(
                    amdsmi_handle
                )
                # Query the activity once so both figures come from the same reading.
                gpu_activity = amdsmi.amdsmi_get_gpu_activity(amdsmi_handle)
                stats["total_gpu_utilization"] = gpu_activity["gfx_activity"]
                stats["total_gpu_mem_utilization"] = gpu_activity["umc_activity"]
        except Exception as e:
            # Best effort: a failed sample is reported as an error record
            # rather than stopping the monitor.
            stats = {
                "time": utc_timestamp(),
                "error": str(e),
            }
        # Emitted after (not in a `finally` of) the try block, so that a
        # KeyboardInterrupt/SystemExit propagates immediately instead of
        # printing a stale or unbound `stats` and sleeping first.
        print(json.dumps(stats))
        time.sleep(1)
146