1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // The resulting .o needs to load on Android T+
18 #define BPFLOADER_MIN_VER BPFLOADER_MAINLINE_T_VERSION
19 
20 #include "bpf_net_helpers.h"
21 #include "dscpPolicy.h"
22 
// The two least significant bits of the IPv4 TOS byte; UPDATE_TOS() carries
// them over from the old TOS value unchanged.
#define ECN_MASK 3

// Builds a new TOS value: |dscp| shifted into the bits above ECN_MASK, combined
// with the ECN_MASK bits of the existing |tos|.
// The whole expansion is parenthesized so the macro can be embedded safely in a
// larger expression (e.g. 'UPDATE_TOS(d, t) & 0xFF' or '2 * UPDATE_TOS(d, t)').
#define UPDATE_TOS(dscp, tos) (((dscp) << 2) | ((tos) & ECN_MASK))
25 
26 // The cache is never read nor written by userspace and is indexed by socket cookie % CACHE_MAP_SIZE
27 #define CACHE_MAP_SIZE 32  // should be a power of two so we can % cheaply
DEFINE_BPF_MAP_KERNEL_INTERNAL(socket_policy_cache_map,PERCPU_ARRAY,uint32_t,RuleEntry,CACHE_MAP_SIZE)28 DEFINE_BPF_MAP_KERNEL_INTERNAL(socket_policy_cache_map, PERCPU_ARRAY, uint32_t, RuleEntry,
29                                CACHE_MAP_SIZE)
30 
31 DEFINE_BPF_MAP_GRW(ipv4_dscp_policies_map, ARRAY, uint32_t, DscpPolicy, MAX_POLICIES, AID_SYSTEM)
32 DEFINE_BPF_MAP_GRW(ipv6_dscp_policies_map, ARRAY, uint32_t, DscpPolicy, MAX_POLICIES, AID_SYSTEM)
33 
// Returns |v| unchanged, after routing it through COMPILER_FORCE_CALCULATION.
// Callers wrap u64 subtractions in this before shifting out the top bit.
// NOTE(review): COMPILER_FORCE_CALCULATION is defined outside this file;
// presumably it stops the compiler from folding the arithmetic into a
// different form - confirm against its definition.
static inline __always_inline uint64_t calculate_u64(uint64_t v) {
    uint64_t forced = v;
    COMPILER_FORCE_CALCULATION(forced);
    return forced;
}
38 
match_policy(struct __sk_buff * skb,const bool ipv4)39 static inline __always_inline void match_policy(struct __sk_buff* skb, const bool ipv4) {
40     void* data = (void*)(long)skb->data;
41     const void* data_end = (void*)(long)skb->data_end;
42 
43     const int l2_header_size = sizeof(struct ethhdr);
44     struct ethhdr* eth = data;
45 
46     if (data + l2_header_size > data_end) return;
47 
48     int hdr_size = 0;
49 
50     // used for map lookup
51     uint64_t cookie = bpf_get_socket_cookie(skb);
52     if (!cookie) return;
53 
54     uint32_t cacheid = cookie % CACHE_MAP_SIZE;
55 
56     __be16 sport = 0;
57     uint16_t dport = 0;
58     uint8_t protocol = 0;  // TODO: Use are reserved value? Or int (-1) and cast to uint below?
59     struct in6_addr src_ip = {};
60     struct in6_addr dst_ip = {};
61     uint8_t tos = 0;            // Only used for IPv4
62     __be32 old_first_be32 = 0;  // Only used for IPv6
63     if (ipv4) {
64         const struct iphdr* const iph = (void*)(eth + 1);
65         hdr_size = l2_header_size + sizeof(struct iphdr);
66         // Must have ipv4 header
67         if (data + hdr_size > data_end) return;
68 
69         // IP version must be 4
70         if (iph->version != 4) return;
71 
72         // We cannot handle IP options, just standard 20 byte == 5 dword minimal IPv4 header
73         if (iph->ihl != 5) return;
74 
75         // V4 mapped address in in6_addr sets 10/11 position to 0xff.
76         src_ip.s6_addr32[2] = htonl(0x0000ffff);
77         dst_ip.s6_addr32[2] = htonl(0x0000ffff);
78 
79         // Copy IPv4 address into in6_addr for easy comparison below.
80         src_ip.s6_addr32[3] = iph->saddr;
81         dst_ip.s6_addr32[3] = iph->daddr;
82         protocol = iph->protocol;
83         tos = iph->tos;
84     } else {
85         struct ipv6hdr* ip6h = (void*)(eth + 1);
86         hdr_size = l2_header_size + sizeof(struct ipv6hdr);
87         // Must have ipv6 header
88         if (data + hdr_size > data_end) return;
89 
90         if (ip6h->version != 6) return;
91 
92         src_ip = ip6h->saddr;
93         dst_ip = ip6h->daddr;
94         protocol = ip6h->nexthdr;
95         old_first_be32 = *(__be32*)ip6h;
96     }
97 
98     switch (protocol) {
99         case IPPROTO_UDP:
100         case IPPROTO_UDPLITE: {
101             struct udphdr* udp;
102             udp = data + hdr_size;
103             if ((void*)(udp + 1) > data_end) return;
104             sport = udp->source;
105             dport = ntohs(udp->dest);
106         } break;
107         case IPPROTO_TCP: {
108             struct tcphdr* tcp;
109             tcp = data + hdr_size;
110             if ((void*)(tcp + 1) > data_end) return;
111             sport = tcp->source;
112             dport = ntohs(tcp->dest);
113         } break;
114         default:
115             return;
116     }
117 
118     // this array lookup cannot actually fail
119     RuleEntry* existing_rule = bpf_socket_policy_cache_map_lookup_elem(&cacheid);
120 
121     if (!existing_rule) return; // impossible
122 
123     uint64_t nomatch = 0;
124     nomatch |= v6_not_equal(src_ip, existing_rule->src_ip);
125     nomatch |= v6_not_equal(dst_ip, existing_rule->dst_ip);
126     nomatch |= (skb->ifindex ^ existing_rule->ifindex);
127     nomatch |= (sport ^ existing_rule->src_port);
128     nomatch |= (dport ^ existing_rule->dst_port);
129     nomatch |= (protocol ^ existing_rule->proto);
130     COMPILER_FORCE_CALCULATION(nomatch);
131 
132     /*
133      * After the above funky bitwise arithmetic we have 'nomatch == 0' iff
134      *   src_ip == existing_rule->src_ip &&
135      *   dst_ip == existing_rule->dst_ip &&
136      *   skb->ifindex == existing_rule->ifindex &&
137      *   sport == existing_rule->src_port &&
138      *   dport == existing_rule->dst_port &&
139      *   protocol == existing_rule->proto
140      */
141 
142     if (!nomatch) {
143         if (existing_rule->dscp_val < 0) return;  // cached no-op
144 
145         if (ipv4) {
146             uint8_t newTos = UPDATE_TOS(existing_rule->dscp_val, tos);
147             bpf_l3_csum_replace(skb, l2_header_size + IP4_OFFSET(check), htons(tos), htons(newTos),
148                                 sizeof(uint16_t));
149             bpf_skb_store_bytes(skb, l2_header_size + IP4_OFFSET(tos), &newTos, sizeof(newTos), 0);
150         } else {
151             __be32 new_first_be32 =
152                 htonl(ntohl(old_first_be32) & 0xF03FFFFF | (existing_rule->dscp_val << 22));
153             bpf_skb_store_bytes(skb, l2_header_size, &new_first_be32, sizeof(__be32),
154                 BPF_F_RECOMPUTE_CSUM);
155         }
156         return;  // cached DSCP mutation
157     }
158 
159     // Linear scan ipv?_dscp_policies_map since stored params didn't match skb.
160     uint64_t best_score = 0;
161     int8_t new_dscp = -1;  // meaning no mutation
162 
163     for (register uint64_t i = 0; i < MAX_POLICIES; i++) {
164         // Using a uint64 in for loop prevents infinite loop during BPF load,
165         // but the key is uint32, so convert back.
166         uint32_t key = i;
167 
168         DscpPolicy* policy;
169         if (ipv4) {
170             policy = bpf_ipv4_dscp_policies_map_lookup_elem(&key);
171         } else {
172             policy = bpf_ipv6_dscp_policies_map_lookup_elem(&key);
173         }
174 
175         // Lookup failure cannot happen on an array with MAX_POLICIES entries.
176         // While 'continue' would make logical sense here, 'return' should be
177         // easier for the verifier to analyze.
178         if (!policy) return;
179 
180         // Think of 'nomatch' as a 64-bit boolean: false iff zero, true iff non-zero.
181         // Start off with nomatch being false, ie. we assume things *are* matching.
182         uint64_t nomatch = 0;
183 
184         // Due to 'a ^ b' being 0 iff a == b:
185         //   nomatch |= a ^ b
186         // should/can be read as:
187         //   nomatch ||= (a != b)
188         // which you can also think of as:
189         //   match &&= (a == b)
190 
191         // If policy iface index does not match skb, then skip to next policy.
192         nomatch |= (policy->ifindex ^ skb->ifindex);
193 
194         // policy->match_* are normal booleans, and should thus always be 0 or 1,
195         // thus you can think of these as:
196         //   if (policy->match_foo) match &&= (foo == policy->foo);
197         nomatch |= policy->match_proto * (protocol ^ policy->proto);
198         nomatch |= policy->match_src_ip * v6_not_equal(src_ip, policy->src_ip);
199         nomatch |= policy->match_dst_ip * v6_not_equal(dst_ip, policy->dst_ip);
200         nomatch |= policy->match_src_port * (sport ^ policy->src_port);
201 
202         // Since these values are u16s (<=63 bits), we can rely on u64 subtraction
203         // underflow setting the topmost bit.  Basically, you can think of:
204         //   nomatch |= (a - b) >> 63
205         // as:
206         //   match &&= (a >= b)
207         uint64_t dport64 = dport;  // Note: dst_port_{start_end} range is inclusive of both ends.
208         nomatch |= calculate_u64(dport64 - policy->dst_port_start) >> 63;
209         nomatch |= calculate_u64(policy->dst_port_end - dport64) >> 63;
210 
211         // score is 0x10000 for each matched field (proto, src_ip, dst_ip, src_port)
212         // plus 1..0x10000 for the dst_port range match (smaller for bigger ranges)
213         uint64_t score = 0;
214         score += policy->match_proto;  // reminder: match_* are boolean, thus 0 or 1
215         score += policy->match_src_ip;
216         score += policy->match_dst_ip;
217         score += policy->match_src_port;
218         score += 1;  // for a 1 element dst_port_{start,end} range
219         score <<= 16;  // scale up: ie. *= 0x10000
220         // now reduce score if the dst_port range is more than a single element
221         // we want to prioritize (ie. better score) matches of smaller ranges
222         score -= (policy->dst_port_end - policy->dst_port_start);  // -= 0..0xFFFF
223 
224         // Here we need:
225         //   match &&= (score > best_score)
226         // which is the same as
227         //   match &&= (score >= best_score + 1)
228         // > not >= because we want equal score matches to prefer choosing earlier policies
229         nomatch |= calculate_u64(score - best_score - 1) >> 63;
230 
231         COMPILER_FORCE_CALCULATION(nomatch);
232         if (nomatch) continue;
233 
234         // only reachable if we matched the policy and (score > best_score)
235         best_score = score;
236         new_dscp = policy->dscp_val;
237     }
238 
239     // Update cache with found policy.
240     *existing_rule = (RuleEntry){
241         .src_ip = src_ip,
242         .dst_ip = dst_ip,
243         .ifindex = skb->ifindex,
244         .src_port = sport,
245         .dst_port = dport,
246         .proto = protocol,
247         .dscp_val = new_dscp,
248     };
249 
250     if (new_dscp < 0) return;
251 
252     // Need to store bytes after updating map or program will not load.
253     if (ipv4) {
254         uint8_t new_tos = UPDATE_TOS(new_dscp, tos);
255         bpf_l3_csum_replace(skb, l2_header_size + IP4_OFFSET(check), htons(tos), htons(new_tos), 2);
256         bpf_skb_store_bytes(skb, l2_header_size + IP4_OFFSET(tos), &new_tos, sizeof(new_tos), 0);
257     } else {
258         __be32 new_first_be32 = htonl(ntohl(old_first_be32) & 0xF03FFFFF | (new_dscp << 22));
259         bpf_skb_store_bytes(skb, l2_header_size, &new_first_be32, sizeof(__be32),
260             BPF_F_RECOMPUTE_CSUM);
261     }
262     return;
263 }
264 
265 DEFINE_BPF_PROG_KVER("schedcls/set_dscp_ether", AID_ROOT, AID_SYSTEM, schedcls_set_dscp_ether,
266                      KVER_5_15)
267 (struct __sk_buff* skb) {
268     if (skb->pkt_type != PACKET_HOST) return TC_ACT_PIPE;
269 
270     if (skb->protocol == htons(ETH_P_IP)) {
271         match_policy(skb, true);
272     } else if (skb->protocol == htons(ETH_P_IPV6)) {
273         match_policy(skb, false);
274     }
275 
276     // Always return TC_ACT_PIPE
277     return TC_ACT_PIPE;
278 }
279 
// Object-level metadata for this BPF program.
// NOTE(review): LICENSE and CRITICAL are macros defined outside this file;
// presumably they record the license string and mark the object as critical
// for the named component - confirm against their definitions.
280 LICENSE("Apache 2.0");
281 CRITICAL("Connectivity");
282