#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# tcpretrans    Trace or count TCP retransmits and TLPs.
#               For Linux, uses BCC, eBPF. Embedded C.
#
# USAGE: tcpretrans [-c] [-h] [-l] [-s] [-4 | -6]
#
# This uses dynamic tracing of kernel functions, and will need to be updated
# to match kernel changes.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 14-Feb-2016   Brendan Gregg   Created this.
# 03-Nov-2017   Matthias Tafelmeier Extended this.

from __future__ import print_function
from bcc import BPF
import argparse
from time import strftime
from socket import inet_ntop, AF_INET, AF_INET6
from struct import pack
from time import sleep

# arguments
examples = """examples:
    ./tcpretrans           # trace TCP retransmits
    ./tcpretrans -l        # include TLP attempts
    ./tcpretrans -s        # display TCP sequence numbers
    ./tcpretrans -c        # count retransmits per flow
    ./tcpretrans -4        # trace IPv4 family only
    ./tcpretrans -6        # trace IPv6 family only
"""
parser = argparse.ArgumentParser(
    description="Trace TCP retransmits",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-s", "--sequence", action="store_true",
    help="display TCP sequence numbers")
parser.add_argument("-l", "--lossprobe", action="store_true",
    help="include tail loss probe attempts")
parser.add_argument("-c", "--count", action="store_true",
    help="count occurred retransmits per flow")
group = parser.add_mutually_exclusive_group()
group.add_argument("-4", "--ipv4", action="store_true",
    help="trace IPv4 family only")
group.add_argument("-6", "--ipv6", action="store_true",
    help="trace IPv6 family only")
# hidden option: print the generated BPF program and exit
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0

# define BPF program
#
# The program is assembled from the templates below. Upper-case words such as
# FILTER_FAMILY, IPV4_INIT/IPV4_CORE and IPV4_CODE are placeholders that are
# substituted with C snippets once the command line options are known.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <bcc/proto.h>

#define RETRANSMIT  1
#define TLP         2

// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
    u32 pid;
    u64 ip;
    u32 seq;
    u32 saddr;
    u32 daddr;
    u16 lport;
    u16 dport;
    u64 state;
    u64 type;
};
BPF_PERF_OUTPUT(ipv4_events);

struct ipv6_data_t {
    u32 pid;
    u32 seq;
    u64 ip;
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u16 lport;
    u16 dport;
    u64 state;
    u64 type;
};
BPF_PERF_OUTPUT(ipv6_events);

// separate flow keys per address family
struct ipv4_flow_key_t {
    u32 saddr;
    u32 daddr;
    u16 lport;
    u16 dport;
};
BPF_HASH(ipv4_count, struct ipv4_flow_key_t);

struct ipv6_flow_key_t {
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u16 lport;
    u16 dport;
};
BPF_HASH(ipv6_count, struct ipv6_flow_key_t);
"""

# shared kprobe body: reads the flow details from the socket itself
bpf_text_kprobe = """
static int trace_event(struct pt_regs *ctx, struct sock *skp, struct sk_buff *skb, int type)
{
    struct tcp_skb_cb *tcb;
    u32 seq;

    if (skp == NULL)
        return 0;
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    // pull in details
    u16 family = skp->__sk_common.skc_family;
    u16 lport = skp->__sk_common.skc_num;
    u16 dport = skp->__sk_common.skc_dport;
    char state = skp->__sk_common.skc_state;

    seq = 0;
    if (skb) {
        /* macro TCP_SKB_CB from net/tcp.h */
        tcb = ((struct tcp_skb_cb *)&((skb)->cb[0]));
        seq = tcb->seq;
    }

    FILTER_FAMILY

    if (family == AF_INET) {
        IPV4_INIT
        IPV4_CORE
    } else if (family == AF_INET6) {
        IPV6_INIT
        IPV6_CORE
    }
    // else drop

    return 0;
}
"""

bpf_text_kprobe_retransmit = """
int trace_retransmit(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb)
{
    trace_event(ctx, sk, skb, RETRANSMIT);
    return 0;
}
"""

bpf_text_kprobe_tlp = """
int trace_tlp(struct pt_regs *ctx, struct sock *sk)
{
    trace_event(ctx, sk, NULL, TLP);
    return 0;
}
"""

# tracepoint body: ports and addresses come from the tracepoint arguments
bpf_text_tracepoint = """
TRACEPOINT_PROBE(tcp, tcp_retransmit_skb)
{
    struct tcp_skb_cb *tcb;
    u32 seq;

    u32 pid = bpf_get_current_pid_tgid() >> 32;
    const struct sock *skp = (const struct sock *)args->skaddr;
    const struct sk_buff *skb = (const struct sk_buff *)args->skbaddr;
    u16 lport = args->sport;
    u16 dport = args->dport;
    char state = skp->__sk_common.skc_state;
    u16 family = skp->__sk_common.skc_family;

    seq = 0;
    if (skb) {
        /* macro TCP_SKB_CB from net/tcp.h */
        tcb = ((struct tcp_skb_cb *)&((skb)->cb[0]));
        seq = tcb->seq;
    }

    FILTER_FAMILY

    if (family == AF_INET) {
        IPV4_CODE
    } else if (family == AF_INET6) {
        IPV6_CODE
    }
    return 0;
}
"""

# snippets substituted for IPV4_INIT / IPV6_INIT in the kprobe body,
# selected by address family and by mode ('count' with -c, else 'trace')
struct_init = { 'ipv4':
        { 'count' :
            """
               struct ipv4_flow_key_t flow_key = {};
               flow_key.saddr = skp->__sk_common.skc_rcv_saddr;
               flow_key.daddr = skp->__sk_common.skc_daddr;
               // lport is host order
               flow_key.lport = lport;
               flow_key.dport = ntohs(dport);""",
          'trace' :
            """
               struct ipv4_data_t data4 = {};
               data4.pid = pid;
               data4.ip = 4;
               data4.seq = seq;
               data4.type = type;
               data4.saddr = skp->__sk_common.skc_rcv_saddr;
               data4.daddr = skp->__sk_common.skc_daddr;
               // lport is host order
               data4.lport = lport;
               data4.dport = ntohs(dport);
               data4.state = state; """
        },
        'ipv6':
        { 'count' :
            """
                    struct ipv6_flow_key_t flow_key = {};
                    bpf_probe_read_kernel(&flow_key.saddr, sizeof(flow_key.saddr),
                        skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
                    bpf_probe_read_kernel(&flow_key.daddr, sizeof(flow_key.daddr),
                        skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
                    // lport is host order
                    flow_key.lport = lport;
                    flow_key.dport = ntohs(dport);""",
          'trace' : """
                    struct ipv6_data_t data6 = {};
                    data6.pid = pid;
                    data6.ip = 6;
                    data6.seq = seq;
                    data6.type = type;
                    bpf_probe_read_kernel(&data6.saddr, sizeof(data6.saddr),
                        skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
                    bpf_probe_read_kernel(&data6.daddr, sizeof(data6.daddr),
                        skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
                    // lport is host order
                    data6.lport = lport;
                    data6.dport = ntohs(dport);
                    data6.state = state;"""
        }
        }

# snippets substituted for IPV4_CODE / IPV6_CODE in the tracepoint body;
# unlike the kprobe snippets these also submit/increment, and the event type
# is always RETRANSMIT
struct_init_tracepoint = { 'ipv4':
        { 'count' : """
               struct ipv4_flow_key_t flow_key = {};
               __builtin_memcpy(&flow_key.saddr, args->saddr, sizeof(flow_key.saddr));
               __builtin_memcpy(&flow_key.daddr, args->daddr, sizeof(flow_key.daddr));
               flow_key.lport = lport;
               flow_key.dport = dport;
               ipv4_count.increment(flow_key);
               """,
          'trace' : """
               struct ipv4_data_t data4 = {};
               data4.pid = pid;
               data4.lport = lport;
               data4.dport = dport;
               data4.type = RETRANSMIT;
               data4.ip = 4;
               data4.seq = seq;
               data4.state = state;
               __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
               __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
               ipv4_events.perf_submit(args, &data4, sizeof(data4));
               """
        },
        'ipv6':
        { 'count' : """
               struct ipv6_flow_key_t flow_key = {};
               __builtin_memcpy(&flow_key.saddr, args->saddr_v6, sizeof(flow_key.saddr));
               __builtin_memcpy(&flow_key.daddr, args->daddr_v6, sizeof(flow_key.daddr));
               flow_key.lport = lport;
               flow_key.dport = dport;
               ipv6_count.increment(flow_key);
               """,
          'trace' : """
               struct ipv6_data_t data6 = {};
               data6.pid = pid;
               data6.lport = lport;
               data6.dport = dport;
               data6.type = RETRANSMIT;
               data6.ip = 6;
               data6.seq = seq;
               data6.state = state;
               __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
               __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
               ipv6_events.perf_submit(args, &data6, sizeof(data6));
               """
        }
        }

count_core_base = """
        COUNT_STRUCT.increment(flow_key);
"""

# Probe for the tracepoint once; the answer selects which templates are used.
have_tracepoint = BPF.tracepoint_exists("tcp", "tcp_retransmit_skb")
mode = 'count' if args.count else 'trace'

# retransmits: prefer the tracepoint when the kernel provides it
if have_tracepoint:
    bpf_text_tracepoint = bpf_text_tracepoint.replace(
        "IPV4_CODE", struct_init_tracepoint['ipv4'][mode])
    bpf_text_tracepoint = bpf_text_tracepoint.replace(
        "IPV6_CODE", struct_init_tracepoint['ipv6'][mode])
    bpf_text += bpf_text_tracepoint

# kprobes: needed for TLP tracing (-l) and as the retransmit fallback
if args.lossprobe or not have_tracepoint:
    bpf_text += bpf_text_kprobe
    bpf_text = bpf_text.replace("IPV4_INIT", struct_init['ipv4'][mode])
    bpf_text = bpf_text.replace("IPV6_INIT", struct_init['ipv6'][mode])
    if args.count:
        bpf_text = bpf_text.replace("IPV4_CORE",
            count_core_base.replace("COUNT_STRUCT", 'ipv4_count'))
        bpf_text = bpf_text.replace("IPV6_CORE",
            count_core_base.replace("COUNT_STRUCT", 'ipv6_count'))
    else:
        bpf_text = bpf_text.replace("IPV4_CORE",
            "ipv4_events.perf_submit(ctx, &data4, sizeof(data4));")
        bpf_text = bpf_text.replace("IPV6_CORE",
            "ipv6_events.perf_submit(ctx, &data6, sizeof(data6));")
    if args.lossprobe:
        bpf_text += bpf_text_kprobe_tlp
    if not have_tracepoint:
        bpf_text += bpf_text_kprobe_retransmit

# address family filter; applied last so it reaches every appended template
if args.ipv4:
    bpf_text = bpf_text.replace('FILTER_FAMILY',
        'if (family != AF_INET) { return 0; }')
elif args.ipv6:
    bpf_text = bpf_text.replace('FILTER_FAMILY',
        'if (family != AF_INET6) { return 0; }')
else:
    bpf_text = bpf_text.replace('FILTER_FAMILY', '')
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# from bpf_text:
# maps the RETRANSMIT / TLP event type codes to the one-letter output column.
# NOTE: shadows the builtin `type`; the name is kept because the event
# printers further down look it up by this name.
type = {}
type[1] = 'R'
type[2] = 'L'

# from include/net/tcp_states.h:
tcpstate = {}
tcpstate.update({
    1: 'ESTABLISHED',
    2: 'SYN_SENT',
    3: 'SYN_RECV',
    4: 'FIN_WAIT1',
    5: 'FIN_WAIT2',
    6: 'TIME_WAIT',
    7: 'CLOSE',
    8: 'CLOSE_WAIT',
    9: 'LAST_ACK',
    10: 'LISTEN',
    11: 'CLOSING',
    12: 'NEW_SYN_RECV',
})


def _addr_str(family, addr):
    """Return the printable form of an address taken from a BPF struct.

    IPv4 addresses arrive as a u32 and are re-serialized with pack('I', ...)
    before being handed to inet_ntop(); IPv6 addresses are passed through
    unchanged.
    """
    if family == AF_INET:
        return inet_ntop(AF_INET, pack('I', addr))
    return inet_ntop(AF_INET6, addr)


def _print_event(event, family):
    """Print one trace line for a retransmit/TLP event of the given family.

    Unknown event types and TCP state numbers are printed as '?' and as the
    raw number respectively instead of raising KeyError inside the perf
    buffer callback.
    """
    print("%-8s %-7d %-2d %-20s %1s> %-20s" % (
        strftime("%H:%M:%S"), event.pid, event.ip,
        "%s:%d" % (_addr_str(family, event.saddr), event.lport),
        type.get(event.type, '?'),
        "%s:%d" % (_addr_str(family, event.daddr), event.dport)),
        end='')
    state = tcpstate.get(event.state, str(event.state))
    if args.sequence:
        print(" %-12s %s" % (state, event.seq))
    else:
        print(" %s" % state)


# process event
def print_ipv4_event(cpu, data, size):
    """Perf buffer callback for the ipv4_events output."""
    _print_event(b["ipv4_events"].event(data), AF_INET)


def print_ipv6_event(cpu, data, size):
    """Perf buffer callback for the ipv6_events output."""
    _print_event(b["ipv6_events"].event(data), AF_INET6)


def depict_cnt(counts_tab, l3prot='ipv4'):
    """Print every flow of a count table, lowest retransmit count first.

    counts_tab: BPF hash table keyed by flow (saddr, daddr, lport, dport).
    l3prot: 'ipv4' to format the keys as IPv4 addresses; any other value
        formats them as IPv6.
    """
    family = AF_INET if l3prot == 'ipv4' else AF_INET6
    ep_fmt = "[%s]#%d"
    for k, v in sorted(counts_tab.items(), key=lambda kv: kv[1].value):
        local_ep = ep_fmt % (_addr_str(family, k.saddr), k.lport)
        remote_ep = ep_fmt % (_addr_str(family, k.daddr), k.dport)
        depict_key = "%-20s <-> %-20s" % (local_ep, remote_ep)
        print("%s %10d" % (depict_key, v.value))


# initialize BPF
b = BPF(text=bpf_text)
# without the tracepoint, retransmits are traced through the kprobe instead
if not BPF.tracepoint_exists("tcp", "tcp_retransmit_skb"):
    b.attach_kprobe(event="tcp_retransmit_skb", fn_name="trace_retransmit")
if args.lossprobe:
    b.attach_kprobe(event="tcp_send_loss_probe", fn_name="trace_tlp")

print("Tracing retransmits ... Hit Ctrl-C to end")
if args.count:
    # counting happens in the kernel; just wait for Ctrl-C, then dump tables
    try:
        while 1:
            sleep(99999999)
    except KeyboardInterrupt:
        pass

    # header
    print("\n%-25s %-25s %-10s" % (
        "LADDR:LPORT", "RADDR:RPORT", "RETRANSMITS"))
    depict_cnt(b.get_table("ipv4_count"))
    depict_cnt(b.get_table("ipv6_count"), l3prot='ipv6')
# read events
else:
    # header
    print("%-8s %-7s %-2s %-20s %1s> %-20s" % ("TIME", "PID", "IP",
        "LADDR:LPORT", "T", "RADDR:RPORT"), end='')
    if args.sequence:
        print(" %-12s %-10s" % ("STATE", "SEQ"))
    else:
        print(" %-4s" % ("STATE"))
    b["ipv4_events"].open_perf_buffer(print_ipv4_event)
    b["ipv6_events"].open_perf_buffer(print_ipv6_event)
    while 1:
        try:
            b.perf_buffer_poll()
        except KeyboardInterrupt:
            exit()