xref: /aosp_15_r20/external/bcc/examples/networking/http_filter/http-parse-complete.py (revision 387f9dfdfa2baef462e92476d413c7bc2470293e)
#!/usr/bin/python
#
# Bertrone Matteo - Polytechnic of Turin
# November 2015
#
# eBPF application that parses HTTP packets
# and extracts (and prints on screen) the URL
# contained in the GET/POST request.
#
# eBPF program http_filter is used as SOCKET_FILTER attached to eth0 interface.
# Only packets of type ip and tcp containing HTTP GET/POST are
# returned to userspace; all other packets are dropped.
#
# Python script uses bcc BPF Compiler Collection by
# iovisor (https://github.com/iovisor/bcc) and prints on stdout the first
# line of the HTTP GET/POST request containing the url
17
18from __future__ import print_function
19from bcc import BPF
20from sys import argv
21
22import socket
23import os
24import binascii
25import time
26
# cleanup() is run once every CLEANUP_N_PACKETS packets received
CLEANUP_N_PACKETS = 50
# maximum length of a reassembled url string (usually 8K)
MAX_URL_STRING_LEN = 8192
# maximum age, in seconds, of an entry in the bpf_sessions map
MAX_AGE_SECONDS = 30
30
31
32# print str until CR+LF
def printUntilCRLF(s):
    """Print the part of ``s`` that precedes the first CR+LF.

    ``s`` is a bytes payload. If it contains no CR+LF, the whole payload
    is printed.

    The payload comes straight off the wire, so it may contain bytes that
    are not valid UTF-8. Those are printed as the Unicode replacement
    character instead of raising UnicodeDecodeError.
    """
    first_line = s.split(b'\r\n', 1)[0]
    print(first_line.decode('utf-8', 'replace'))
35
36
37# cleanup function
def cleanup():
    """Sweep the ``bpf_sessions`` map and age out stale entries.

    For every entry currently in the map:
      * timestamp == 0         --> store the current time as its timestamp
      * age > MAX_AGE_SECONDS  --> delete the entry

    A failure on one entry is reported with a message and the sweep moves
    on to the next entry. KeyboardInterrupt and SystemExit are deliberately
    not caught, so the script can still be stopped during a sweep.
    """
    # current time in seconds
    current_time = int(time.time())

    # iterate over a snapshot so entries can be deleted while walking
    for key, _ in list(bpf_sessions.items()):
        try:
            # re-read the entry: it is the value stored in the map right now
            current_leaf = bpf_sessions[key]
            if current_leaf.timestamp == 0:
                # no timestamp yet: stamp it with the current time
                bpf_sessions[key] = bpf_sessions.Leaf(current_time)
            elif current_time - current_leaf.timestamp > MAX_AGE_SECONDS:
                # entry is too old: drop it
                del bpf_sessions[key]
        except Exception:
            print("cleanup exception.")
57
58
59# args
def usage():
    """Print the short usage text and terminate the script.

    This is called for invalid command lines, so the script exits with
    status 1. SystemExit is raised directly rather than calling the
    ``exit()`` helper, which is only present when the ``site`` module has
    been loaded.

    Raises:
        SystemExit: always, with exit code 1.
    """
    print("USAGE: %s [-i <if_name>]" % argv[0])
    print("")
    print("Try '%s -h' for more options." % argv[0])
    raise SystemExit(1)
65
66
67# help
def help():
    """Print the full help text and terminate the script successfully.

    SystemExit is raised directly rather than calling the ``exit()``
    helper, which is only present when the ``site`` module has been
    loaded. The exit status stays 0.

    Raises:
        SystemExit: always, with exit code 0.
    """
    print("USAGE: %s [-i <if_name>]" % argv[0])
    print("")
    print("optional arguments:")
    print("   -h                       print this help")
    print("   -i if_name               select interface if_name. Default is eth0")
    print("")
    print("examples:")
    print("    http-parse              # bind socket to eth0")
    print("    http-parse -i wlan0     # bind socket to wlan0")
    raise SystemExit(0)
79
80
# command line handling
#   (no argument)   -> keep the default interface
#   -h              -> print the help text and exit
#   -i <if_name>    -> bind to <if_name>
#   anything else   -> print the usage text and exit
interface = "eth0"

if len(argv) > 3:
    usage()
elif len(argv) == 3:
    if str(argv[1]) == '-i':
        interface = argv[2]
    else:
        usage()
elif len(argv) == 2:
    if str(argv[1]) == '-h':
        help()
    else:
        usage()

print("binding socket to '%s'" % interface)
100
# build the BPF object from the C source in http-parse-complete.c
bpf = BPF(src_file="http-parse-complete.c", debug=0)

# load the eBPF program http_filter, of type SOCKET_FILTER, into the
# kernel eBPF vm. More info about eBPF program types:
# http://man7.org/linux/man-pages/man2/bpf.2.html
function_http_filter = bpf.load_func("http_filter", BPF.SOCKET_FILTER)

# attach_raw_socket creates a raw socket, binds it to the interface and
# attaches the bpf program to it
BPF.attach_raw_socket(function_http_filter, interface)

# file descriptor of the socket created inside BPF.attach_raw_socket
socket_fd = function_http_filter.sock

# python socket object built from that file descriptor,
# switched to blocking mode
sock = socket.fromfd(
    socket_fd, socket.PF_PACKET, socket.SOCK_RAW, socket.IPPROTO_IP)
sock.setblocking(True)

# bpf map of type hash holding the sessions
bpf_sessions = bpf.get_table("sessions")

# number of packets read so far
packet_count = 0

# association <key(ipsrc,ipdst,portsrc,portdst), payload_string>.
# When a url is not entirely contained in one packet, its first part is
# kept here; once \r\n shows up in a following packet the pieces are
# joined and the whole url is printed.
local_dictionary = {}
135
# ethernet header length
ETH_HLEN = 14
# smallest valid IPv4 header (IHL = 5) and TCP header (Data Offset = 5)
MIN_IP_HLEN = 20
MIN_TCP_HLEN = 20
# CR + LF (substring to find)
crlf = b'\r\n'
# leading bytes that mark the first packet of an HTTP message
HTTP_PREFIXES = (b'GET', b'POST', b'HTTP', b'PUT', b'DELETE', b'HEAD')


def _delete_session(current_Key, message):
    """Delete current_Key from bpf_sessions; print message if that fails."""
    try:
        del bpf_sessions[current_Key]
    except Exception:
        print(message)


def _handle_packet(packet_str):
    """Parse one raw ethernet frame and print the HTTP first line, if any.

    packet_str is the frame as read from the raw socket. A first line that
    is split over several packets is accumulated in local_dictionary and
    printed once CR+LF has been seen. Frames too short to hold the IPv4
    and TCP headers they announce are ignored.
    """
    # DEBUG - print raw packet in hex format
    # print("%s" % binascii.hexlify(packet_str))

    packet_bytearray = bytearray(packet_str)

    # a frame that cannot hold a minimal IPv4 header is ignored
    if len(packet_bytearray) < ETH_HLEN + MIN_IP_HLEN:
        return

    # IP HEADER
    # https://tools.ietf.org/html/rfc791
    # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
    # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    # |Version|  IHL  |Type of Service|          Total Length         |
    # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    #
    # IHL : Internet Header Length is the length of the internet header
    # value to multiply * 4 byte
    # e.g. IHL = 5 ; IP Header Length = 5 * 4 byte = 20 byte

    # low nibble of the first byte is IHL; * 4 gives the length in bytes
    ip_header_length = (packet_bytearray[ETH_HLEN] & 0x0F) << 2
    tcp_offset = ETH_HLEN + ip_header_length
    if (ip_header_length < MIN_IP_HLEN
            or len(packet_bytearray) < tcp_offset + MIN_TCP_HLEN):
        return

    # ip source is at offset 12..15 of the IP header, ip dest at 16..19
    ip_src = int(binascii.hexlify(packet_str[ETH_HLEN + 12:ETH_HLEN + 16]), 16)
    ip_dst = int(binascii.hexlify(packet_str[ETH_HLEN + 16:ETH_HLEN + 20]), 16)

    # TCP HEADER
    # https://www.rfc-editor.org/rfc/rfc793.txt
    #  12              13              14              15
    #  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
    # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    # |  Data |           |U|A|P|R|S|F|                               |
    # | Offset| Reserved  |R|C|S|S|Y|I|            Window             |
    # |       |           |G|K|H|T|N|N|                               |
    # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    #
    # Data Offset: This indicates where the data begins.
    # The TCP header is an integral number of 32 bits long.
    # value to multiply * 4 byte
    # e.g. DataOffset = 5 ; TCP Header Length = 5 * 4 byte = 20 byte

    # high nibble of byte 12 is Data Offset; SHR 4 then SHL 2 -> SHR 2
    tcp_header_length = (packet_bytearray[tcp_offset + 12] & 0xF0) >> 2
    if tcp_header_length < MIN_TCP_HLEN:
        return

    # source port is in the first two bytes of the TCP header,
    # destination port in the next two
    port_src = int(binascii.hexlify(packet_str[tcp_offset:tcp_offset + 2]), 16)
    port_dst = int(
        binascii.hexlify(packet_str[tcp_offset + 2:tcp_offset + 4]), 16)

    # payload_string contains only the packet payload: everything that
    # follows the TCP header, up to the end of the bytes that were read
    payload_string = packet_str[tcp_offset + tcp_header_length:]

    # current_Key contains ip source/dest and port source/dest,
    # useful for direct bpf_sessions map access
    current_Key = bpf_sessions.Key(ip_src, ip_dst, port_src, port_dst)
    # the same key in hashable form, used to index local_dictionary
    dict_key = binascii.hexlify(current_Key)

    if payload_string.startswith(HTTP_PREFIXES):
        # match: first packet of an HTTP message
        if crlf in payload_string:
            # url entirely contained in first packet -> print it all
            printUntilCRLF(payload_string)
            # url already printed, current session not useful anymore
            _delete_session(current_Key, "error during delete from bpf map ")
        else:
            # url NOT entirely contained in first packet:
            # keep this part until the rest arrives
            local_dictionary[dict_key] = payload_string
        return

    # NO match: the packet matters only if it continues a known session
    if current_Key not in bpf_sessions:
        return

    if dict_key not in local_dictionary:
        # first part of the url is NOT present in local_dictionary:
        # bpf_sessions contains an invalid entry -> delete it
        _delete_session(current_Key, "error del bpf_session")
        return

    # first part of the url is already present: append current payload
    prev_payload_string = local_dictionary[dict_key] + payload_string

    # Search the joined data rather than only the new payload, so that a
    # CR+LF split across two packets is found as well. The stored part
    # never contains CR+LF by itself.
    if crlf in prev_payload_string:
        # last packet of a url split in N packets -> print the whole url
        printUntilCRLF(prev_payload_string)
        # clean local_dictionary first, so that a failing map delete
        # cannot leave the entry behind
        local_dictionary.pop(dict_key, None)
        _delete_session(current_Key, "error deleting from map or dictionary")
    elif len(prev_payload_string) > MAX_URL_STRING_LEN:
        # size exceeded (usually HTTP GET/POST url < 8K): give up on this
        # session and do not store the data again
        print("url too long")
        local_dictionary.pop(dict_key, None)
        _delete_session(current_Key, "error deleting from map or dict")
    else:
        # NOT last packet: store what has been collected so far
        local_dictionary[dict_key] = prev_payload_string


while 1:
    # retrieve raw packet from socket; at most 4096 bytes are read
    packet_str = os.read(socket_fd, 4096)
    packet_count += 1

    _handle_packet(packet_str)

    # check if dirty entries are present in bpf_sessions
    if packet_count % CLEANUP_N_PACKETS == 0:
        cleanup()
295