/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
/* Copyright (c) 2021, Oracle and/or its affiliates. */

#include "vmlinux.h"

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#include "ksnoop.h"

/* For kretprobes, the instruction pointer in the struct pt_regs context
 * is the kretprobe_trampoline.  We derive the instruction pointer
 * by pushing it onto a function stack on entry and popping it on return.
 *
 * We could use bpf_get_func_ip(), but "stack mode" - where we
 * specify functions "a", "b" and "c" and only want to see a trace if "a"
 * calls "b" and "b" calls "c" - utilizes this stack to determine if trace
 * data should be collected.
 */
#define FUNC_MAX_STACK_DEPTH	16
/* used to convince verifier we do not stray outside of array bounds */
#define FUNC_STACK_DEPTH_MASK	(FUNC_MAX_STACK_DEPTH - 1)

#ifndef ENOSPC
#define ENOSPC			28
#endif

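/* per-task call stack: addresses of the traced functions we have entered
 * but not yet returned from, along with the current stack depth.
 */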
struct func_stack {
	__u64 task;
	__u64 ips[FUNC_MAX_STACK_DEPTH];
	__u8 stack_depth;
};

#define MAX_TASKS		2048

/* function call stack hashed on a per-task key */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	/* function call stack for functions we are tracing */
	__uint(max_entries, MAX_TASKS);
	__type(key, __u64);
	__type(value, struct func_stack);
} ksnoop_func_stack SEC(".maps");

/* per-cpu trace info hashed on function address */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
	__uint(max_entries, MAX_FUNC_TRACES);
	__type(key, __u64);
	__type(value, struct trace);
} ksnoop_func_map SEC(".maps");

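/* perf event map used to send completed trace records to userspace */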
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(value_size, sizeof(int));
	__uint(key_size, sizeof(int));
} ksnoop_perf_map SEC(".maps");

static void clear_trace(struct trace *trace)
{
	__builtin_memset(&trace->trace_data, 0, sizeof(trace->trace_data));
	trace->data_flags = 0;
	trace->buf_len = 0;
}

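/* Look up (or create) the per-task function stack, push the current
 * instruction pointer on entry or pop it on return, and return the
 * matching trace for the traced function, or NULL if no trace data
 * should be collected.
 */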
static struct trace *get_trace(struct pt_regs *ctx, bool entry)
{
	__u8 stack_depth, last_stack_depth;
	struct func_stack *func_stack;
	__u64 ip, last_ip = 0, task;
	struct trace *trace;

	task = bpf_get_current_task();

	func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
	if (!func_stack) {
		struct func_stack new_stack = { .task = task };

		bpf_map_update_elem(&ksnoop_func_stack, &task, &new_stack,
				    BPF_NOEXIST);
		func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
		if (!func_stack)
			return NULL;
	}

	stack_depth = func_stack->stack_depth;
	if (stack_depth > FUNC_MAX_STACK_DEPTH)
		return NULL;

	if (entry) {
		if (bpf_core_enum_value_exists(enum bpf_func_id,
					       BPF_FUNC_get_func_ip))
			ip = bpf_get_func_ip(ctx);
		else
			ip = KSNOOP_IP_FIX(PT_REGS_IP_CORE(ctx));
		if (stack_depth >= FUNC_MAX_STACK_DEPTH - 1)
			return NULL;
		/* verifier doesn't like using "stack_depth - 1" as array index
		 * directly.
		 */
		last_stack_depth = stack_depth - 1;
		/* get address of last function we called */
		if (last_stack_depth >= 0 &&
		    last_stack_depth < FUNC_MAX_STACK_DEPTH)
			last_ip = func_stack->ips[last_stack_depth];
		/* push ip onto stack. return will pop it. */
		func_stack->ips[stack_depth] = ip;
		/* mask used in case bounds checks are optimized out */
		stack_depth = (stack_depth + 1) & FUNC_STACK_DEPTH_MASK;
		func_stack->stack_depth = stack_depth;
		/* rather than zero stack entries on popping, we zero the
		 * (stack_depth + 1)'th entry when pushing the current
		 * entry.  We take this approach because, when tracking the
		 * set of functions we returned from, we want that history
		 * to be preserved.
		 */
		if (stack_depth < FUNC_MAX_STACK_DEPTH)
			func_stack->ips[stack_depth] = 0;
	} else {
		if (stack_depth == 0 || stack_depth >= FUNC_MAX_STACK_DEPTH)
			return NULL;
		last_stack_depth = stack_depth;
		/* get address of last function we returned from */
		if (last_stack_depth >= 0 &&
		    last_stack_depth < FUNC_MAX_STACK_DEPTH)
			last_ip = func_stack->ips[last_stack_depth];
		if (stack_depth > 0) {
			/* ANDing with the mask convinces the verifier that
			 * we don't end up with a < 0 value, translating to
			 * 0xff and an out-of-bounds map element access.
			 */
			stack_depth = (stack_depth - 1) & FUNC_STACK_DEPTH_MASK;
		}
		/* retrieve ip from stack as IP in pt_regs is
		 * bpf kretprobe trampoline address.
		 */
		if (stack_depth >= 0 && stack_depth < FUNC_MAX_STACK_DEPTH)
			ip = func_stack->ips[stack_depth];
		if (stack_depth >= 0 && stack_depth < FUNC_MAX_STACK_DEPTH)
			func_stack->stack_depth = stack_depth;
	}

	trace = bpf_map_lookup_elem(&ksnoop_func_map, &ip);
	if (!trace)
		return NULL;

	/* we may stash data on entry since predicates are a mix
	 * of entry/return; in such cases, trace->flags specifies
	 * KSNOOP_F_STASH, and we will output stashed data on return.
	 * If returning, make sure we don't clear our stashed data.
	 */
	if (!entry && (trace->flags & KSNOOP_F_STASH)) {
		if (!(trace->data_flags & KSNOOP_F_STASHED)) {
			/* predicate must have failed */
			return NULL;
		}
		/* skip clearing trace data */
	} else {
		/* clear trace data before starting. */
		clear_trace(trace);
	}

	if (entry) {
		/* if in stack mode, check if previous fn matches */
		if (trace->prev_ip && trace->prev_ip != last_ip)
			return NULL;
		/* if tracing intermediate fn in stack of fns, stash data. */
		if (trace->next_ip)
			trace->data_flags |= KSNOOP_F_STASH;
		/* we may stash data on entry since predicates are a mix
		 * of entry/return; in such cases, trace->flags specifies
		 * KSNOOP_F_STASH, and we will output stashed data on return.
		 */
		if (trace->flags & KSNOOP_F_STASH)
			trace->data_flags |= KSNOOP_F_STASH;
		/* otherwise the data is output (because we've reached
		 * the last fn in the set of fns specified).
		 */
	} else {
		/* In stack mode, check if next fn matches the last fn
		 * we returned from; i.e. "a" called "b", and now
		 * we're at "a", was the last fn we returned from "b"?
		 * If so, stash data for later display (when we reach the
		 * first fn in the set of stack fns).
		 */
		if (trace->next_ip && trace->next_ip != last_ip)
			return NULL;
		if (trace->prev_ip)
			trace->data_flags |= KSNOOP_F_STASH;
		/* If there is no "prev" function, i.e. we are at the
		 * first function in a set of stack functions, the trace
		 * info is shown (along with any stashed info associated
		 * with callers).
		 */
	}
	trace->task = task;
	return trace;
}

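/* Emit collected trace data to the perf buffer, unless it is marked to be
 * stashed for later output.
 */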
static void output_trace(struct pt_regs *ctx, struct trace *trace)
{
	__u16 trace_len;

	if (trace->buf_len == 0)
		goto skip;

	/* we may be simply stashing values, and will report later */
	if (trace->data_flags & KSNOOP_F_STASH) {
		trace->data_flags &= ~KSNOOP_F_STASH;
		trace->data_flags |= KSNOOP_F_STASHED;
		return;
	}
	/* we may be outputting earlier stashed data */
	if (trace->data_flags & KSNOOP_F_STASHED)
		trace->data_flags &= ~KSNOOP_F_STASHED;

	/* trim perf event size to only contain data we've recorded. */
	trace_len = sizeof(*trace) + trace->buf_len - MAX_TRACE_BUF;

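	/* buf_len never exceeds MAX_TRACE_BUF, so trace_len should be at
	 * most sizeof(*trace); the check below guards that invariant.
	 */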
	if (trace_len <= sizeof(*trace))
		bpf_perf_event_output(ctx, &ksnoop_perf_map,
				      BPF_F_CURRENT_CPU,
				      trace, trace_len);
skip:
	clear_trace(trace);
}

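/* In stack mode, walk the per-task function stack and emit any previously
 * stashed trace data before emitting the current trace.
 */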
static void output_stashed_traces(struct pt_regs *ctx,
				  struct trace *currtrace,
				  bool entry)
{
	struct func_stack *func_stack;
	struct trace *trace = NULL;
	__u8 i;
	__u64 task = 0;

	task = bpf_get_current_task();
	func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
	if (!func_stack)
		return;

	if (entry) {
		/* iterate from bottom to top of stack, outputting stashed
		 * data we find.  This corresponds to the set of functions
		 * we called before the current function.
		 */
		for (i = 0;
		     i < func_stack->stack_depth - 1 && i < FUNC_MAX_STACK_DEPTH;
		     i++) {
			trace = bpf_map_lookup_elem(&ksnoop_func_map,
						    &func_stack->ips[i]);
			if (!trace || !(trace->data_flags & KSNOOP_F_STASHED))
				break;
			if (trace->task != task)
				return;
			output_trace(ctx, trace);
		}
	} else {
		/* iterate from top to bottom of stack, outputting stashed
		 * data we find.  This corresponds to the set of functions
		 * that returned prior to the current returning function.
		 */
		for (i = FUNC_MAX_STACK_DEPTH - 1; i > 0; i--) {
			__u64 ip;

			ip = func_stack->ips[i];
			if (!ip)
				continue;
			trace = bpf_map_lookup_elem(&ksnoop_func_map, &ip);
			if (!trace || !(trace->data_flags & KSNOOP_F_STASHED))
				break;
			if (trace->task != task)
				return;
			output_trace(ctx, trace);
		}
	}
	/* finally output the current trace info */
	output_trace(ctx, currtrace);
}

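/* fetch the requested argument register or return value from pt_regs
 * using CO-RE-relocatable accessors.
 */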
static __u64 get_arg(struct pt_regs *ctx, enum arg argnum)
{
	switch (argnum) {
	case KSNOOP_ARG1:
		return PT_REGS_PARM1_CORE(ctx);
	case KSNOOP_ARG2:
		return PT_REGS_PARM2_CORE(ctx);
	case KSNOOP_ARG3:
		return PT_REGS_PARM3_CORE(ctx);
	case KSNOOP_ARG4:
		return PT_REGS_PARM4_CORE(ctx);
	case KSNOOP_ARG5:
		return PT_REGS_PARM5_CORE(ctx);
	case KSNOOP_RETURN:
		return PT_REGS_RC_CORE(ctx);
	default:
		return 0;
	}
}

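/* Common handler for both kprobe entry and kretprobe return: collect the
 * requested argument/member/return values into the trace buffer, evaluate
 * any predicates, and output (or stash) the resulting trace record.
 */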
static int ksnoop(struct pt_regs *ctx, bool entry)
{
	void *data_ptr = NULL;
	struct trace *trace;
	__u64 data;
	__u32 currpid;
	int ret;
	__u8 i;

	trace = get_trace(ctx, entry);
	if (!trace)
		return 0;

	/* make sure we want events from this pid */
	currpid = bpf_get_current_pid_tgid();
	if (trace->filter_pid && trace->filter_pid != currpid)
		return 0;
	trace->pid = currpid;

	trace->cpu = bpf_get_smp_processor_id();
	trace->time = bpf_ktime_get_ns();

	trace->data_flags &= ~(KSNOOP_F_ENTRY | KSNOOP_F_RETURN);
	if (entry)
		trace->data_flags |= KSNOOP_F_ENTRY;
	else
		trace->data_flags |= KSNOOP_F_RETURN;

	for (i = 0; i < MAX_TRACES; i++) {
		struct trace_data *currdata;
		struct value *currtrace;
		char *buf_offset = NULL;
		__u32 tracesize;

		currdata = &trace->trace_data[i];
		currtrace = &trace->traces[i];

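		/* entry arguments are only collected at function entry;
		 * the return value (and return-based members) only at return.
		 */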
		if ((entry && !base_arg_is_entry(currtrace->base_arg)) ||
		    (!entry && base_arg_is_entry(currtrace->base_arg)))
			continue;

		/* skip void (unused) trace arguments, ensuring not to
		 * skip "void *".
		 */
		if (currtrace->type_id == 0 &&
		    !(currtrace->flags & KSNOOP_F_PTR))
			continue;

		data = get_arg(ctx, currtrace->base_arg);

		/* look up member value and read into data field. */
		if (currtrace->flags & KSNOOP_F_MEMBER) {
			if (currtrace->offset)
				data += currtrace->offset;

			/* member is a pointer; read it in */
			if (currtrace->flags & KSNOOP_F_PTR) {
				void *dataptr = (void *)data;

				ret = bpf_probe_read_kernel(&data, sizeof(data), dataptr);
				if (ret) {
					currdata->err_type_id = currtrace->type_id;
					currdata->err = ret;
					continue;
				}
				currdata->raw_value = data;
			} else if (currtrace->size <=
				   sizeof(currdata->raw_value)) {
				/* read member value for predicate comparison */
				bpf_probe_read_kernel(&currdata->raw_value,
						      currtrace->size,
						      (void *)data);
			}
		} else {
			currdata->raw_value = data;
		}

		/* simple predicate evaluation: if any predicate fails,
		 * skip all tracing for this function.
		 */
		if (currtrace->flags & KSNOOP_F_PREDICATE_MASK) {
			bool ok = false;

			if (currtrace->flags & KSNOOP_F_PREDICATE_EQ &&
			    currdata->raw_value == currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_NOTEQ &&
			    currdata->raw_value != currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_GT &&
			    currdata->raw_value > currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_LT &&
			    currdata->raw_value < currtrace->predicate_value)
				ok = true;

			if (!ok) {
				clear_trace(trace);
				return 0;
			}
		}

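		/* pointer and member values are read from the kernel address
		 * held in data; plain scalar arguments are copied from the
		 * value itself.
		 */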
		if (currtrace->flags & (KSNOOP_F_PTR | KSNOOP_F_MEMBER))
			data_ptr = (void *)data;
		else
			data_ptr = &data;

		if (trace->buf_len + MAX_TRACE_DATA >= MAX_TRACE_BUF)
			break;

		buf_offset = &trace->buf[trace->buf_len];
		if (buf_offset > &trace->buf[MAX_TRACE_BUF]) {
			currdata->err_type_id = currtrace->type_id;
			currdata->err = -ENOSPC;
			continue;
		}
		currdata->buf_offset = trace->buf_len;

		tracesize = currtrace->size;
		if (tracesize > MAX_TRACE_DATA)
			tracesize = MAX_TRACE_DATA;
		ret = bpf_probe_read_kernel(buf_offset, tracesize, data_ptr);
		if (ret < 0) {
			currdata->err_type_id = currtrace->type_id;
			currdata->err = ret;
			continue;
		} else {
			currdata->buf_len = tracesize;
			trace->buf_len += tracesize;
		}
	}

	/* show accumulated stashed traces (if any) */
	if ((entry && trace->prev_ip && !trace->next_ip) ||
	    (!entry && trace->next_ip && !trace->prev_ip))
		output_stashed_traces(ctx, trace, entry);
	else
		output_trace(ctx, trace);

	return 0;
}

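/* "foo" is a placeholder target; the ksnoop userspace loader attaches these
 * programs to each requested kernel function at attach time.
 */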
SEC("kprobe/foo")
int BPF_KPROBE(kprobe_entry)
{
	return ksnoop(ctx, true);
}

SEC("kretprobe/foo")
int BPF_KRETPROBE(kprobe_return)
{
	return ksnoop(ctx, false);
}

char _license[] SEC("license") = "Dual BSD/GPL";