1#!/usr/bin/env python 2# @lint-avoid-python-3-compatibility-imports 3# 4# drsnoop Trace direct reclaim and print details including issuing PID. 5# For Linux, uses BCC, eBPF. 6# 7# This uses in-kernel eBPF maps to cache process details (PID and comm) by 8# direct reclaim begin, as well as a starting timestamp for calculating 9# latency. 10# 11# Copyright (c) 2019 Wenbo Zhang 12# Licensed under the Apache License, Version 2.0 (the "License") 13# 14# 20-Feb-2019 Wenbo Zhang Created this. 15# 09-Mar-2019 Wenbo Zhang Updated for show sys mem info. 16 17from __future__ import print_function 18from bcc import ArgString, BPF 19import argparse 20from datetime import datetime, timedelta 21import os 22import math 23import sys 24 25# symbols 26kallsyms = "/proc/kallsyms" 27 28# arguments 29examples = """examples: 30 ./drsnoop # trace all direct reclaim 31 ./drsnoop -T # include timestamps 32 ./drsnoop -U # include UID 33 ./drsnoop -P 181 # only trace PID 181 34 ./drsnoop -t 123 # only trace TID 123 35 ./drsnoop -u 1000 # only trace UID 1000 36 ./drsnoop -d 10 # trace for 10 seconds only 37 ./drsnoop -n main # only print process names containing "main" 38""" 39parser = argparse.ArgumentParser( 40 description="Trace direct reclaim", 41 formatter_class=argparse.RawDescriptionHelpFormatter, 42 epilog=examples) 43parser.add_argument("-T", "--timestamp", action="store_true", 44 help="include timestamp on output") 45parser.add_argument("-U", "--print-uid", action="store_true", 46 help="print UID column") 47parser.add_argument("-p", "--pid", 48 help="trace this PID only") 49parser.add_argument("-t", "--tid", 50 help="trace this TID only") 51parser.add_argument("-u", "--uid", 52 help="trace this UID only") 53parser.add_argument("-d", "--duration", 54 help="total duration of trace in seconds") 55parser.add_argument("-n", "--name", 56 type=ArgString, 57 help="only print process names containing this name") 58parser.add_argument("-v", "--verbose", action="store_true", 59 help="show system memory state") 60parser.add_argument("--ebpf", action="store_true", 61 help=argparse.SUPPRESS) 62args = parser.parse_args() 63debug = 0 64if args.duration: 65 args.duration = timedelta(seconds=int(args.duration)) 66 67 68# vm_stat 69vm_stat_addr = '' 70with open(kallsyms) as syms: 71 for line in syms: 72 (addr, size, name) = line.rstrip().split(" ", 2) 73 name = name.split("\t")[0] 74 if name == "vm_stat": 75 vm_stat_addr = "0x" + addr 76 break 77 if name == "vm_zone_stat": 78 vm_stat_addr = "0x" + addr 79 break 80 if vm_stat_addr == '': 81 print("ERROR: no vm_stat or vm_zone_stat in /proc/kallsyms. Exiting.") 82 print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.") 83 exit() 84 85NR_FREE_PAGES = 0 86 87PAGE_SIZE = os.sysconf("SC_PAGE_SIZE") 88PAGE_SHIFT = int(math.log(PAGE_SIZE) / math.log(2)) 89 90def K(x): 91 return x << (PAGE_SHIFT - 10) 92 93# load BPF program 94bpf_text = """ 95#include <uapi/linux/ptrace.h> 96#include <linux/sched.h> 97#include <linux/mmzone.h> 98 99struct val_t { 100 u64 id; 101 u64 ts; // start time 102 char name[TASK_COMM_LEN]; 103 u64 vm_stat[NR_VM_ZONE_STAT_ITEMS]; 104}; 105 106struct data_t { 107 u64 id; 108 u32 uid; 109 u64 nr_reclaimed; 110 u64 delta; 111 u64 ts; // end time 112 char name[TASK_COMM_LEN]; 113 u64 vm_stat[NR_VM_ZONE_STAT_ITEMS]; 114}; 115 116BPF_HASH(start, u64, struct val_t); 117BPF_PERF_OUTPUT(events); 118 119TRACEPOINT_PROBE(vmscan, mm_vmscan_direct_reclaim_begin) { 120 struct val_t val = {}; 121 u64 id = bpf_get_current_pid_tgid(); 122 u32 pid = id >> 32; // PID is higher part 123 u32 tid = id; // Cast and get the lower part 124 u32 uid = bpf_get_current_uid_gid(); 125 u64 ts; 126 127 PID_TID_FILTER 128 UID_FILTER 129 if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) { 130 val.id = id; 131 val.ts = bpf_ktime_get_ns(); 132 bpf_probe_read_kernel(&val.vm_stat, sizeof(val.vm_stat), (const void *)%s); 133 start.update(&id, &val); 134 } 135 return 0; 136} 137 138TRACEPOINT_PROBE(vmscan, mm_vmscan_direct_reclaim_end) { 139 u64 id = bpf_get_current_pid_tgid(); 140 struct val_t *valp; 141 struct data_t data = {}; 142 u64 ts = bpf_ktime_get_ns(); 143 144 valp = start.lookup(&id); 145 if (valp == NULL) { 146 // missed entry 147 return 0; 148 } 149 150 data.delta = ts - valp->ts; 151 data.ts = ts / 1000; 152 data.id = valp->id; 153 data.uid = bpf_get_current_uid_gid(); 154 bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name); 155 bpf_probe_read_kernel(&data.vm_stat, sizeof(data.vm_stat), valp->vm_stat); 156 data.nr_reclaimed = args->nr_reclaimed; 157 158 events.perf_submit(args, &data, sizeof(data)); 159 start.delete(&id); 160 161 return 0; 162} 163""" % vm_stat_addr 164 165if args.tid: # TID trumps PID 166 bpf_text = bpf_text.replace('PID_TID_FILTER', 167 'if (tid != %s) { return 0; }' % args.tid) 168elif args.pid: 169 bpf_text = bpf_text.replace('PID_TID_FILTER', 170 'if (pid != %s) { return 0; }' % args.pid) 171else: 172 bpf_text = bpf_text.replace('PID_TID_FILTER', '') 173if args.uid: 174 bpf_text = bpf_text.replace('UID_FILTER', 175 'if (uid != %s) { return 0; }' % args.uid) 176else: 177 bpf_text = bpf_text.replace('UID_FILTER', '') 178if debug or args.ebpf: 179 print(bpf_text) 180 if args.ebpf: 181 exit() 182 183# initialize BPF 184b = BPF(text=bpf_text) 185 186initial_ts = 0 187 188# header 189if args.timestamp: 190 print("%-14s" % ("TIME(s)"), end="") 191if args.print_uid: 192 print("%-6s" % ("UID"), end="") 193print("%-14s %-6s %8s %5s" % 194 ("COMM", "TID" if args.tid else "PID", "LAT(ms)", "PAGES"), end="") 195if args.verbose: 196 print("%10s" % ("FREE(KB)")) 197else: 198 print("") 199 200# process event 201def print_event(cpu, data, size): 202 event = b["events"].event(data) 203 204 global initial_ts 205 206 if not initial_ts: 207 initial_ts = event.ts 208 209 if args.name and bytes(args.name) not in event.name: 210 return 211 212 if args.timestamp: 213 delta = event.ts - initial_ts 214 print("%-14.9f" % (float(delta) / 1000000), end="") 215 216 if args.print_uid: 217 print("%-6d" % event.uid, end="") 218 219 print("%-14.14s %-6s %8.2f %5d" % 220 (event.name.decode('utf-8', 'replace'), 221 event.id & 0xffffffff if args.tid else event.id >> 32, 222 float(event.delta) / 1000000, event.nr_reclaimed), end="") 223 if args.verbose: 224 print("%10d" % K(event.vm_stat[NR_FREE_PAGES])) 225 else: 226 print("") 227 228 sys.stdout.flush() 229 230 231# loop with callback to print_event 232b["events"].open_perf_buffer(print_event, page_cnt=64) 233start_time = datetime.now() 234while not args.duration or datetime.now() - start_time < args.duration: 235 try: 236 b.perf_buffer_poll() 237 except KeyboardInterrupt: 238 exit() 239