#!/usr/bin/python
#
# Bertrone Matteo - Polytechnic of Turin
# November 2015
#
# eBPF application that parses HTTP packets
# and extracts (and prints on screen) the URL
# contained in the GET/POST request.
#
# eBPF program http_filter is used as SOCKET_FILTER attached to eth0 interface.
# Only packets of type ip and tcp containing HTTP GET/POST are
# returned to userspace, others dropped
#
# Python script uses bcc BPF Compiler Collection by
# iovisor (https://github.com/iovisor/bcc) and prints on stdout the first
# line of the HTTP GET/POST request containing the url

from __future__ import print_function
from bcc import BPF
from sys import argv

import socket
import os
import binascii
import time

CLEANUP_N_PACKETS = 50  # cleanup every CLEANUP_N_PACKETS packets received
MAX_URL_STRING_LEN = 8192  # max url string len (usually 8K)
MAX_AGE_SECONDS = 30  # max age entry in bpf_sessions map


def printUntilCRLF(s):
    """Print the part of the bytes string `s` that precedes the first CR+LF.

    If `s` contains no CR+LF the whole string is printed.
    """
    first_line = s.split(b'\r\n')[0]
    # The bytes come straight from the network: decode leniently so that a
    # single non UTF-8 byte in a request line cannot raise UnicodeDecodeError
    # and stop the capture loop.
    print(first_line.decode('utf-8', 'replace'))


def cleanup():
    """Stamp new entries of the bpf_sessions map and delete the old ones.

    For every entry of bpf_sessions:
      timestamp == 0          --> update with current timestamp
      age > MAX_AGE_SECONDS   --> delete item
    A failure on one entry is reported and the scan continues.
    """
    # get current time in seconds
    current_time = int(time.time())
    for key, _leaf in bpf_sessions.items():
        try:
            # re-read the entry at the moment it is processed
            current_leaf = bpf_sessions[key]
            if current_leaf.timestamp == 0:
                # set timestamp if timestamp == 0
                bpf_sessions[key] = bpf_sessions.Leaf(current_time)
            elif current_time - current_leaf.timestamp > MAX_AGE_SECONDS:
                # delete older entries
                del bpf_sessions[key]
        except Exception:
            # 'except Exception' (not a bare except) so that Ctrl-C and
            # SystemExit still terminate the program.
            print("cleanup exception.")


# args
def usage():
    """Print the short usage message and exit."""
    print("USAGE: %s [-i <if_name>]" % argv[0])
    print("")
    print("Try '%s -h' for more options." % argv[0])
    exit()


# help
def help():
    """Print the full help message and exit."""
    print("USAGE: %s [-i <if_name>]" % argv[0])
    print("")
    print("optional arguments:")
    print(" -h print this help")
    print(" -i if_name select interface if_name. Default is eth0")
    print("")
    print("examples:")
    print(" http-parse # bind socket to eth0")
    print(" http-parse -i wlan0 # bind socket to wlan0")
    exit()


# arguments
interface = "eth0"

if len(argv) == 2:
    if str(argv[1]) == '-h':
        help()
    else:
        usage()

if len(argv) == 3:
    if str(argv[1]) == '-i':
        interface = argv[2]
    else:
        usage()

if len(argv) > 3:
    usage()

print("binding socket to '%s'" % interface)

# initialize BPF - load source code from http-parse-complete.c
bpf = BPF(src_file="http-parse-complete.c", debug=0)

# load eBPF program http_filter of type SOCKET_FILTER into the kernel eBPF vm
# more info about eBPF program types
# http://man7.org/linux/man-pages/man2/bpf.2.html
function_http_filter = bpf.load_func("http_filter", BPF.SOCKET_FILTER)

# create raw socket, bind it to interface
# attach bpf program to socket created
BPF.attach_raw_socket(function_http_filter, interface)

# get file descriptor of the socket previously
# created inside BPF.attach_raw_socket
socket_fd = function_http_filter.sock

# create python socket object, from the file descriptor
sock = socket.fromfd(socket_fd, socket.PF_PACKET,
                     socket.SOCK_RAW, socket.IPPROTO_IP)
# set it as blocking socket
sock.setblocking(True)

# get pointer to bpf map of type hash
bpf_sessions = bpf.get_table("sessions")

# packets counter
packet_count = 0

# dictionary containing association
# <key(ipsrc,ipdst,portsrc,portdst),payload_string>.
# if url is not entirely contained in only one packet,
# save the first part of it in this local dict
# when I find \r\n in a next pkt, append and print the whole url
local_dictionary = {}

# ethernet header length
ETH_HLEN = 14

# CR + LF (substring to find)
CRLF = b'\r\n'

# payload prefixes that mark the first packet of an HTTP request/response
HTTP_PREFIXES = (b'GET', b'POST', b'HTTP', b'PUT', b'DELETE', b'HEAD')


def parse_packet(packet_str):
    """Split a raw ethernet frame into its IPv4/TCP addressing and payload.

    IP HEADER (https://tools.ietf.org/html/rfc791)
      byte 0, low nibble : IHL, header length in 32 bit words
                           e.g. IHL = 5 ; IP Header Length = 5 * 4 byte = 20 byte
      bytes 12..15       : ip source
      bytes 16..19       : ip dest

    TCP HEADER (https://www.rfc-editor.org/rfc/rfc793.txt)
      bytes 0..1          : port source
      bytes 2..3          : port dest
      byte 12, high nibble: Data Offset, header length in 32 bit words
                           e.g. DataOffset = 5 ; TCP Header Length = 20 byte

    packet_str -- bytes of one frame, starting at the ethernet header.

    Returns the tuple (ip_src, ip_dst, port_src, port_dst, payload_string)
    with the addresses and ports as integers and the payload as bytes, or
    None when the frame is too short to contain the fields listed above.
    """
    packet_bytearray = bytearray(packet_str)

    # the fixed 20 bytes of the IP header must be there to read IHL/addresses
    if len(packet_bytearray) < ETH_HLEN + 20:
        return None

    # calculate ip header length: mask bits 0..3, then * 4
    ip_header_length = (packet_bytearray[ETH_HLEN] & 0x0F) << 2
    tcp_offset = ETH_HLEN + ip_header_length

    # the TCP header up to the data offset byte (index 12) must be there
    if len(packet_bytearray) < tcp_offset + 13:
        return None

    # retrieve ip source/dest
    ip_src = int(binascii.hexlify(packet_str[ETH_HLEN + 12:ETH_HLEN + 16]), 16)
    ip_dst = int(binascii.hexlify(packet_str[ETH_HLEN + 16:ETH_HLEN + 20]), 16)

    # calculate tcp header length: mask bits 4..7 ; SHR 4 ; SHL 2 -> SHR 2
    tcp_header_length = (packet_bytearray[tcp_offset + 12] & 0xF0) >> 2

    # retrieve port source/dest
    port_src = int(binascii.hexlify(packet_str[tcp_offset:tcp_offset + 2]), 16)
    port_dst = int(binascii.hexlify(packet_str[tcp_offset + 2:tcp_offset + 4]), 16)

    # payload_string contains only packet payload
    payload_offset = tcp_offset + tcp_header_length
    payload_string = packet_str[payload_offset:]

    return ip_src, ip_dst, port_src, port_dst, payload_string


def is_http_start(payload_string):
    """Return True if the payload begins with one of HTTP_PREFIXES."""
    return payload_string.startswith(HTTP_PREFIXES)


def append_fragment(pending, dict_key, payload_string, max_len=None):
    """Append a continuation payload to the text buffered under dict_key.

    pending        -- dict <dict_key, bytes buffered so far>; dict_key must
                      already be present
    payload_string -- payload of the current packet
    max_len        -- largest buffer kept; defaults to MAX_URL_STRING_LEN

    Returns the tuple (buffered, still_tracked):
      (bytes, False) -- CR+LF found in payload_string: `bytes` is everything
                        buffered plus this payload, entry removed
      (None, True)   -- still incomplete, entry updated
      (None, False)  -- buffer grew past max_len, entry removed
    """
    if max_len is None:
        max_len = MAX_URL_STRING_LEN

    buffered = pending[dict_key] + payload_string

    if CRLF in payload_string:
        # last packet, containing last part of the url split in N packets
        del pending[dict_key]
        return buffered, False

    if len(buffered) > max_len:
        # Drop the oversized buffer for good. It must not be written back
        # afterwards, otherwise it would keep growing without bound.
        del pending[dict_key]
        return None, False

    # NOT last packet: keep accumulating
    pending[dict_key] = buffered
    return None, True


def _drop_session(key, error_message):
    """Delete key from bpf_sessions, printing error_message on failure."""
    try:
        del bpf_sessions[key]
    except Exception:
        # not a bare except: Ctrl-C must still be able to stop the program
        print(error_message)


def handle_packet(packet_str):
    """Process one captured frame, printing the request line once complete.

    A frame whose payload starts an HTTP message is printed immediately if it
    contains CR+LF, otherwise its payload is buffered in local_dictionary.
    Any other frame is treated as a continuation: it is appended to the
    buffered text of its session, which is printed once CR+LF shows up.
    Frames too short to be parsed are ignored.
    """
    parsed = parse_packet(packet_str)
    if parsed is None:
        return
    ip_src, ip_dst, port_src, port_dst, payload_string = parsed

    # current_Key contains ip source/dest and port source/dest
    # useful for direct bpf_sessions map access
    current_Key = bpf_sessions.Key(ip_src, ip_dst, port_src, port_dst)

    if is_http_start(payload_string):
        if CRLF in payload_string:
            # url entirely contained in first packet -> print it all
            printUntilCRLF(payload_string)
            # url already printed, current session not useful anymore
            _drop_session(current_Key, "error during delete from bpf map ")
        else:
            # url NOT entirely contained in first packet: save this part in
            # dictionary <key(ips,ipd,ports,portd),payload_string>
            local_dictionary[binascii.hexlify(current_Key)] = payload_string
        return

    # NO match: check if the packet belongs to a session saved in bpf_sessions
    if current_Key not in bpf_sessions:
        return

    dict_key = binascii.hexlify(current_Key)
    if dict_key not in local_dictionary:
        # first part of the url is NOT present in local dictionary
        # bpf_sessions contains invalid entry -> delete it
        _drop_session(current_Key, "error del bpf_session")
        return

    buffered, still_tracked = append_fragment(local_dictionary, dict_key,
                                              payload_string)
    if buffered is not None:
        # whole url available: print it and clean bpf_sessions
        printUntilCRLF(buffered)
        _drop_session(current_Key, "error deleting from map or dictionary")
    elif not still_tracked:
        # size exceeded (usually HTTP GET/POST url < 8K)
        print("url too long")
        _drop_session(current_Key, "error deleting from map or dict")


if __name__ == "__main__":
    while 1:
        # retrieve raw packet from socket
        # set packet length to max packet length on the interface
        packet_str = os.read(socket_fd, 4096)
        packet_count += 1

        # DEBUG - print raw packet in hex format
        # packet_hex = binascii.hexlify(packet_str)
        # print ("%s" % packet_hex)

        handle_packet(packet_str)

        # check if dirty entry are present in bpf_sessions
        if packet_count % CLEANUP_N_PACKETS == 0:
            cleanup()