#!/usr/bin/env python3
"""Print one JSON usage record per second until SIGTERM is received.

Each record contains a UTC timestamp, the total CPU utilization and
per-process CPU/memory figures for every process whose name contains
"python".  When ``pynvml`` (NVIDIA) or ``amdsmi`` (ROCm) can be imported and
initialised, GPU utilization and per-process GPU memory are added as well.
If collecting a sample fails, a record holding the timestamp and the error
message is printed instead.
"""

from __future__ import annotations

import datetime
import json
import signal
import time
from typing import Any

import psutil  # type: ignore[import]


def _utc_timestamp() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS[.ffffff]Z``."""
    # datetime.utcnow() is deprecated since Python 3.12.  Taking an aware UTC
    # time and dropping tzinfo yields exactly the same naive ISO string.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    return now.isoformat("T") + "Z"


def get_processes_running_python_tests() -> list[Any]:
    """Return the psutil processes whose name contains "python".

    Only processes with a non-empty command line are returned.  Processes
    that disappear or cannot be inspected while being enumerated are skipped.
    """
    python_processes = []
    for process in psutil.process_iter():
        try:
            if "python" in process.name() and process.cmdline():
                python_processes.append(process)
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # access denied or the process died
            pass
    return python_processes


def get_per_process_cpu_info() -> list[dict[str, Any]]:
    """Return CPU and memory usage for every running python process.

    Each entry has ``pid``, ``cmd``, ``cpu_percent`` and ``rss_memory``.
    ``uss_memory`` (and ``pss_memory`` where psutil reports it) are added
    when the full memory info is accessible.
    """
    per_process_info = []
    for p in get_processes_running_python_tests():
        try:
            info = {
                "pid": p.pid,
                "cmd": " ".join(p.cmdline()),
                "cpu_percent": p.cpu_percent(),
                "rss_memory": p.memory_info().rss,
            }
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process exited (or became inaccessible) after it was
            # enumerated; skip it rather than failing the whole sample.
            continue

        # https://psutil.readthedocs.io/en/latest/index.html?highlight=memory_full_info
        # requires higher user privileges and could throw AccessDenied error, i.e. mac
        try:
            memory_full_info = p.memory_full_info()
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            # It's ok to skip this; the basic figures are still reported.
            pass
        else:
            info["uss_memory"] = memory_full_info.uss
            # memory_full_info is a namedtuple, so `"pss" in memory_full_info`
            # would compare against its *values*; check the field instead.
            if hasattr(memory_full_info, "pss"):
                # only available on linux
                info["pss_memory"] = memory_full_info.pss

        per_process_info.append(info)
    return per_process_info


def get_per_process_gpu_info(handle: Any) -> list[dict[str, Any]]:
    """Return ``pid`` and ``gpu_memory`` for each NVML compute process.

    ``handle`` is the NVML device handle.  Relies on the module-level
    ``pynvml`` name, which is only imported in the ``__main__`` section.
    """
    processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    return [{"pid": p.pid, "gpu_memory": p.usedGpuMemory} for p in processes]


def rocm_get_per_process_gpu_info(handle: Any) -> list[dict[str, Any]]:
    """Return ``pid`` and ``gpu_memory`` for each process amdsmi reports.

    ``handle`` is the amdsmi processor handle.  Relies on the module-level
    ``amdsmi`` name, which is only imported in the ``__main__`` section.
    """
    processes = amdsmi.amdsmi_get_gpu_process_list(handle)
    per_process_info = []
    for p in processes:
        try:
            proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)
        except AttributeError:
            # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
            # BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
            proc_info = p
        per_process_info.append(
            {
                "pid": proc_info["pid"],
                "gpu_memory": proc_info["memory_usage"]["vram_mem"],
            }
        )
    return per_process_info


if __name__ == "__main__":
    handle = None
    try:
        import pynvml  # type: ignore[import]

        try:
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        except pynvml.NVMLError:
            pass
    except ModuleNotFoundError:
        # no pynvml available, probably because not cuda
        pass

    # Must be initialised up front: without it the loop below raises
    # NameError on every iteration whenever amdsmi is missing or fails to
    # initialise, and every record degrades to an error record.
    amdsmi_handle = None
    try:
        import amdsmi  # type: ignore[import]

        try:
            amdsmi.amdsmi_init()
            amdsmi_handle = amdsmi.amdsmi_get_processor_handles()[0]
        except (amdsmi.AmdSmiException, IndexError):
            # IndexError: an empty handle list means there is no device to use
            pass
    except ModuleNotFoundError:
        # no amdsmi is available
        pass

    kill_now = False

    def exit_gracefully(*args: Any) -> None:
        """SIGTERM handler: ask the sampling loop to stop after this pass."""
        global kill_now
        kill_now = True

    signal.signal(signal.SIGTERM, exit_gracefully)

    while not kill_now:
        try:
            stats = {
                "time": _utc_timestamp(),
                "total_cpu_percent": psutil.cpu_percent(),
                "per_process_cpu_info": get_per_process_cpu_info(),
            }
            if handle is not None:
                stats["per_process_gpu_info"] = get_per_process_gpu_info(handle)
                # https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html
                gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                stats["total_gpu_utilization"] = gpu_utilization.gpu
                stats["total_gpu_mem_utilization"] = gpu_utilization.memory
            if amdsmi_handle is not None:
                stats["per_process_gpu_info"] = rocm_get_per_process_gpu_info(
                    amdsmi_handle
                )
                # Query once so both figures come from the same snapshot.
                gpu_activity = amdsmi.amdsmi_get_gpu_activity(amdsmi_handle)
                stats["total_gpu_utilization"] = gpu_activity["gfx_activity"]
                stats["total_gpu_mem_utilization"] = gpu_activity["umc_activity"]
        except Exception as e:
            # Best effort: report the failure and keep sampling.
            stats = {
                "time": _utc_timestamp(),
                "error": str(e),
            }
        # Deliberately not in a `finally:` block, so that KeyboardInterrupt or
        # SystemExit propagates immediately instead of printing a stale or
        # unbound `stats` first.
        print(json.dumps(stats))
        time.sleep(1)