]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blame - net/core/filter.c
bpf: add BPF_SIZEOF and BPF_FIELD_SIZEOF macros
[mirror_ubuntu-zesty-kernel.git] / net / core / filter.c
CommitLineData
1da177e4
LT
1/*
2 * Linux Socket Filter - Kernel level socket filtering
3 *
bd4cf0ed
AS
4 * Based on the design of the Berkeley Packet Filter. The new
5 * internal format has been designed by PLUMgrid:
1da177e4 6 *
bd4cf0ed
AS
7 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
8 *
9 * Authors:
10 *
11 * Jay Schulist <jschlst@samba.org>
12 * Alexei Starovoitov <ast@plumgrid.com>
13 * Daniel Borkmann <dborkman@redhat.com>
1da177e4
LT
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version
18 * 2 of the License, or (at your option) any later version.
19 *
20 * Andi Kleen - Fix a few bad bugs and races.
4df95ff4 21 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
1da177e4
LT
22 */
23
24#include <linux/module.h>
25#include <linux/types.h>
1da177e4
LT
26#include <linux/mm.h>
27#include <linux/fcntl.h>
28#include <linux/socket.h>
29#include <linux/in.h>
30#include <linux/inet.h>
31#include <linux/netdevice.h>
32#include <linux/if_packet.h>
5a0e3ad6 33#include <linux/gfp.h>
1da177e4
LT
34#include <net/ip.h>
35#include <net/protocol.h>
4738c1db 36#include <net/netlink.h>
1da177e4
LT
37#include <linux/skbuff.h>
38#include <net/sock.h>
10b89ee4 39#include <net/flow_dissector.h>
1da177e4
LT
40#include <linux/errno.h>
41#include <linux/timer.h>
1da177e4 42#include <asm/uaccess.h>
40daafc8 43#include <asm/unaligned.h>
1da177e4 44#include <linux/filter.h>
86e4ca66 45#include <linux/ratelimit.h>
46b325c7 46#include <linux/seccomp.h>
f3335031 47#include <linux/if_vlan.h>
89aa0758 48#include <linux/bpf.h>
d691f9e8 49#include <net/sch_generic.h>
8d20aabe 50#include <net/cls_cgroup.h>
d3aa45ce 51#include <net/dst_metadata.h>
c46646d0 52#include <net/dst.h>
538950a1 53#include <net/sock_reuseport.h>
1da177e4 54
43db6d65 55/**
f4979fce 56 * sk_filter_trim_cap - run a packet through a socket filter
43db6d65
SH
57 * @sk: sock associated with &sk_buff
58 * @skb: buffer to filter
f4979fce 59 * @cap: limit on how short the eBPF program may trim the packet
43db6d65 60 *
ff936a04
AS
61 * Run the eBPF program and then cut skb->data to correct size returned by
62 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
43db6d65 63 * than pkt_len we keep whole skb->data. This is the socket level
ff936a04 64 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
43db6d65
SH
65 * be accepted or -EPERM if the packet should be tossed.
66 *
67 */
f4979fce 68int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
43db6d65
SH
69{
70 int err;
71 struct sk_filter *filter;
72
c93bdd0e
MG
73 /*
74 * If the skb was allocated from pfmemalloc reserves, only
75 * allow SOCK_MEMALLOC sockets to use it as this socket is
76 * helping free memory
77 */
78 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
79 return -ENOMEM;
80
43db6d65
SH
81 err = security_sock_rcv_skb(sk, skb);
82 if (err)
83 return err;
84
80f8f102
ED
85 rcu_read_lock();
86 filter = rcu_dereference(sk->sk_filter);
43db6d65 87 if (filter) {
ff936a04 88 unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
f4979fce 89 err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
43db6d65 90 }
80f8f102 91 rcu_read_unlock();
43db6d65
SH
92
93 return err;
94}
f4979fce 95EXPORT_SYMBOL(sk_filter_trim_cap);
43db6d65 96
30743837 97static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
bd4cf0ed 98{
56193d1b 99 return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
bd4cf0ed
AS
100}
101
30743837 102static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
bd4cf0ed 103{
eb9672f4 104 struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
bd4cf0ed
AS
105 struct nlattr *nla;
106
107 if (skb_is_nonlinear(skb))
108 return 0;
109
05ab8f26
MK
110 if (skb->len < sizeof(struct nlattr))
111 return 0;
112
30743837 113 if (a > skb->len - sizeof(struct nlattr))
bd4cf0ed
AS
114 return 0;
115
30743837 116 nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
bd4cf0ed
AS
117 if (nla)
118 return (void *) nla - (void *) skb->data;
119
120 return 0;
121}
122
30743837 123static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
bd4cf0ed 124{
eb9672f4 125 struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
bd4cf0ed
AS
126 struct nlattr *nla;
127
128 if (skb_is_nonlinear(skb))
129 return 0;
130
05ab8f26
MK
131 if (skb->len < sizeof(struct nlattr))
132 return 0;
133
30743837 134 if (a > skb->len - sizeof(struct nlattr))
bd4cf0ed
AS
135 return 0;
136
30743837
DB
137 nla = (struct nlattr *) &skb->data[a];
138 if (nla->nla_len > skb->len - a)
bd4cf0ed
AS
139 return 0;
140
30743837 141 nla = nla_find_nested(nla, x);
bd4cf0ed
AS
142 if (nla)
143 return (void *) nla - (void *) skb->data;
144
145 return 0;
146}
147
30743837 148static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
bd4cf0ed
AS
149{
150 return raw_smp_processor_id();
151}
152
80b48c44
DB
153static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
154 .func = __get_raw_cpu_id,
155 .gpl_only = false,
156 .ret_type = RET_INTEGER,
157};
158
9bac3d6d
AS
159static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
160 struct bpf_insn *insn_buf)
161{
162 struct bpf_insn *insn = insn_buf;
163
164 switch (skb_field) {
165 case SKF_AD_MARK:
166 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
167
168 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
169 offsetof(struct sk_buff, mark));
170 break;
171
172 case SKF_AD_PKTTYPE:
173 *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
174 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
175#ifdef __BIG_ENDIAN_BITFIELD
176 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
177#endif
178 break;
179
180 case SKF_AD_QUEUE:
181 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
182
183 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
184 offsetof(struct sk_buff, queue_mapping));
185 break;
c2497395 186
c2497395
AS
187 case SKF_AD_VLAN_TAG:
188 case SKF_AD_VLAN_TAG_PRESENT:
189 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
190 BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
191
192 /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
193 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
194 offsetof(struct sk_buff, vlan_tci));
195 if (skb_field == SKF_AD_VLAN_TAG) {
196 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
197 ~VLAN_TAG_PRESENT);
198 } else {
199 /* dst_reg >>= 12 */
200 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
201 /* dst_reg &= 1 */
202 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
203 }
204 break;
9bac3d6d
AS
205 }
206
207 return insn - insn_buf;
208}
209
bd4cf0ed 210static bool convert_bpf_extensions(struct sock_filter *fp,
2695fb55 211 struct bpf_insn **insnp)
bd4cf0ed 212{
2695fb55 213 struct bpf_insn *insn = *insnp;
9bac3d6d 214 u32 cnt;
bd4cf0ed
AS
215
216 switch (fp->k) {
217 case SKF_AD_OFF + SKF_AD_PROTOCOL:
0b8c707d
DB
218 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
219
220 /* A = *(u16 *) (CTX + offsetof(protocol)) */
221 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
222 offsetof(struct sk_buff, protocol));
223 /* A = ntohs(A) [emitting a nop or swap16] */
224 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
bd4cf0ed
AS
225 break;
226
227 case SKF_AD_OFF + SKF_AD_PKTTYPE:
9bac3d6d
AS
228 cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
229 insn += cnt - 1;
bd4cf0ed
AS
230 break;
231
232 case SKF_AD_OFF + SKF_AD_IFINDEX:
233 case SKF_AD_OFF + SKF_AD_HATYPE:
bd4cf0ed
AS
234 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
235 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
f8f6d679 236
f035a515 237 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
f8f6d679
DB
238 BPF_REG_TMP, BPF_REG_CTX,
239 offsetof(struct sk_buff, dev));
240 /* if (tmp != 0) goto pc + 1 */
241 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
242 *insn++ = BPF_EXIT_INSN();
243 if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
244 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
245 offsetof(struct net_device, ifindex));
246 else
247 *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
248 offsetof(struct net_device, type));
bd4cf0ed
AS
249 break;
250
251 case SKF_AD_OFF + SKF_AD_MARK:
9bac3d6d
AS
252 cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
253 insn += cnt - 1;
bd4cf0ed
AS
254 break;
255
256 case SKF_AD_OFF + SKF_AD_RXHASH:
257 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
258
9739eef1
AS
259 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
260 offsetof(struct sk_buff, hash));
bd4cf0ed
AS
261 break;
262
263 case SKF_AD_OFF + SKF_AD_QUEUE:
9bac3d6d
AS
264 cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
265 insn += cnt - 1;
bd4cf0ed
AS
266 break;
267
268 case SKF_AD_OFF + SKF_AD_VLAN_TAG:
c2497395
AS
269 cnt = convert_skb_access(SKF_AD_VLAN_TAG,
270 BPF_REG_A, BPF_REG_CTX, insn);
271 insn += cnt - 1;
272 break;
bd4cf0ed 273
c2497395
AS
274 case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
275 cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
276 BPF_REG_A, BPF_REG_CTX, insn);
277 insn += cnt - 1;
bd4cf0ed
AS
278 break;
279
27cd5452
MS
280 case SKF_AD_OFF + SKF_AD_VLAN_TPID:
281 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
282
283 /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
284 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
285 offsetof(struct sk_buff, vlan_proto));
286 /* A = ntohs(A) [emitting a nop or swap16] */
287 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
288 break;
289
bd4cf0ed
AS
290 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
291 case SKF_AD_OFF + SKF_AD_NLATTR:
292 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
293 case SKF_AD_OFF + SKF_AD_CPU:
4cd3675e 294 case SKF_AD_OFF + SKF_AD_RANDOM:
e430f34e 295 /* arg1 = CTX */
f8f6d679 296 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
bd4cf0ed 297 /* arg2 = A */
f8f6d679 298 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
bd4cf0ed 299 /* arg3 = X */
f8f6d679 300 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
e430f34e 301 /* Emit call(arg1=CTX, arg2=A, arg3=X) */
bd4cf0ed
AS
302 switch (fp->k) {
303 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
f8f6d679 304 *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
bd4cf0ed
AS
305 break;
306 case SKF_AD_OFF + SKF_AD_NLATTR:
f8f6d679 307 *insn = BPF_EMIT_CALL(__skb_get_nlattr);
bd4cf0ed
AS
308 break;
309 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
f8f6d679 310 *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
bd4cf0ed
AS
311 break;
312 case SKF_AD_OFF + SKF_AD_CPU:
f8f6d679 313 *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
bd4cf0ed 314 break;
4cd3675e 315 case SKF_AD_OFF + SKF_AD_RANDOM:
3ad00405
DB
316 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
317 bpf_user_rnd_init_once();
4cd3675e 318 break;
bd4cf0ed
AS
319 }
320 break;
321
322 case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
9739eef1
AS
323 /* A ^= X */
324 *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
bd4cf0ed
AS
325 break;
326
327 default:
328 /* This is just a dummy call to avoid letting the compiler
329 * evict __bpf_call_base() as an optimization. Placed here
330 * where no-one bothers.
331 */
332 BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
333 return false;
334 }
335
336 *insnp = insn;
337 return true;
338}
339
340/**
8fb575ca 341 * bpf_convert_filter - convert filter program
bd4cf0ed
AS
342 * @prog: the user passed filter program
343 * @len: the length of the user passed filter program
344 * @new_prog: buffer where converted program will be stored
345 * @new_len: pointer to store length of converted program
346 *
347 * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style.
348 * Conversion workflow:
349 *
350 * 1) First pass for calculating the new program length:
8fb575ca 351 * bpf_convert_filter(old_prog, old_len, NULL, &new_len)
bd4cf0ed
AS
352 *
353 * 2) 2nd pass to remap in two passes: 1st pass finds new
354 * jump offsets, 2nd pass remapping:
2695fb55 355 * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
8fb575ca 356 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
bd4cf0ed 357 */
d9e12f42
NS
358static int bpf_convert_filter(struct sock_filter *prog, int len,
359 struct bpf_insn *new_prog, int *new_len)
bd4cf0ed
AS
360{
361 int new_flen = 0, pass = 0, target, i;
2695fb55 362 struct bpf_insn *new_insn;
bd4cf0ed
AS
363 struct sock_filter *fp;
364 int *addrs = NULL;
365 u8 bpf_src;
366
367 BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
30743837 368 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
bd4cf0ed 369
6f9a093b 370 if (len <= 0 || len > BPF_MAXINSNS)
bd4cf0ed
AS
371 return -EINVAL;
372
373 if (new_prog) {
658da937
DB
374 addrs = kcalloc(len, sizeof(*addrs),
375 GFP_KERNEL | __GFP_NOWARN);
bd4cf0ed
AS
376 if (!addrs)
377 return -ENOMEM;
378 }
379
380do_pass:
381 new_insn = new_prog;
382 fp = prog;
383
8b614aeb
DB
384 /* Classic BPF related prologue emission. */
385 if (new_insn) {
386 /* Classic BPF expects A and X to be reset first. These need
387 * to be guaranteed to be the first two instructions.
388 */
389 *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
390 *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
391
392 /* All programs must keep CTX in callee saved BPF_REG_CTX.
393 * In eBPF case it's done by the compiler, here we need to
394 * do this ourself. Initial CTX is present in BPF_REG_ARG1.
395 */
396 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
397 } else {
398 new_insn += 3;
399 }
bd4cf0ed
AS
400
401 for (i = 0; i < len; fp++, i++) {
2695fb55
AS
402 struct bpf_insn tmp_insns[6] = { };
403 struct bpf_insn *insn = tmp_insns;
bd4cf0ed
AS
404
405 if (addrs)
406 addrs[i] = new_insn - new_prog;
407
408 switch (fp->code) {
409 /* All arithmetic insns and skb loads map as-is. */
410 case BPF_ALU | BPF_ADD | BPF_X:
411 case BPF_ALU | BPF_ADD | BPF_K:
412 case BPF_ALU | BPF_SUB | BPF_X:
413 case BPF_ALU | BPF_SUB | BPF_K:
414 case BPF_ALU | BPF_AND | BPF_X:
415 case BPF_ALU | BPF_AND | BPF_K:
416 case BPF_ALU | BPF_OR | BPF_X:
417 case BPF_ALU | BPF_OR | BPF_K:
418 case BPF_ALU | BPF_LSH | BPF_X:
419 case BPF_ALU | BPF_LSH | BPF_K:
420 case BPF_ALU | BPF_RSH | BPF_X:
421 case BPF_ALU | BPF_RSH | BPF_K:
422 case BPF_ALU | BPF_XOR | BPF_X:
423 case BPF_ALU | BPF_XOR | BPF_K:
424 case BPF_ALU | BPF_MUL | BPF_X:
425 case BPF_ALU | BPF_MUL | BPF_K:
426 case BPF_ALU | BPF_DIV | BPF_X:
427 case BPF_ALU | BPF_DIV | BPF_K:
428 case BPF_ALU | BPF_MOD | BPF_X:
429 case BPF_ALU | BPF_MOD | BPF_K:
430 case BPF_ALU | BPF_NEG:
431 case BPF_LD | BPF_ABS | BPF_W:
432 case BPF_LD | BPF_ABS | BPF_H:
433 case BPF_LD | BPF_ABS | BPF_B:
434 case BPF_LD | BPF_IND | BPF_W:
435 case BPF_LD | BPF_IND | BPF_H:
436 case BPF_LD | BPF_IND | BPF_B:
437 /* Check for overloaded BPF extension and
438 * directly convert it if found, otherwise
439 * just move on with mapping.
440 */
441 if (BPF_CLASS(fp->code) == BPF_LD &&
442 BPF_MODE(fp->code) == BPF_ABS &&
443 convert_bpf_extensions(fp, &insn))
444 break;
445
f8f6d679 446 *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
bd4cf0ed
AS
447 break;
448
f8f6d679
DB
449 /* Jump transformation cannot use BPF block macros
450 * everywhere as offset calculation and target updates
451 * require a bit more work than the rest, i.e. jump
452 * opcodes map as-is, but offsets need adjustment.
453 */
454
455#define BPF_EMIT_JMP \
bd4cf0ed
AS
456 do { \
457 if (target >= len || target < 0) \
458 goto err; \
459 insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
460 /* Adjust pc relative offset for 2nd or 3rd insn. */ \
461 insn->off -= insn - tmp_insns; \
462 } while (0)
463
f8f6d679
DB
464 case BPF_JMP | BPF_JA:
465 target = i + fp->k + 1;
466 insn->code = fp->code;
467 BPF_EMIT_JMP;
bd4cf0ed
AS
468 break;
469
470 case BPF_JMP | BPF_JEQ | BPF_K:
471 case BPF_JMP | BPF_JEQ | BPF_X:
472 case BPF_JMP | BPF_JSET | BPF_K:
473 case BPF_JMP | BPF_JSET | BPF_X:
474 case BPF_JMP | BPF_JGT | BPF_K:
475 case BPF_JMP | BPF_JGT | BPF_X:
476 case BPF_JMP | BPF_JGE | BPF_K:
477 case BPF_JMP | BPF_JGE | BPF_X:
478 if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
479 /* BPF immediates are signed, zero extend
480 * immediate into tmp register and use it
481 * in compare insn.
482 */
f8f6d679 483 *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
bd4cf0ed 484
e430f34e
AS
485 insn->dst_reg = BPF_REG_A;
486 insn->src_reg = BPF_REG_TMP;
bd4cf0ed
AS
487 bpf_src = BPF_X;
488 } else {
e430f34e 489 insn->dst_reg = BPF_REG_A;
bd4cf0ed
AS
490 insn->imm = fp->k;
491 bpf_src = BPF_SRC(fp->code);
19539ce7 492 insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
1da177e4 493 }
bd4cf0ed
AS
494
495 /* Common case where 'jump_false' is next insn. */
496 if (fp->jf == 0) {
497 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
498 target = i + fp->jt + 1;
f8f6d679 499 BPF_EMIT_JMP;
bd4cf0ed 500 break;
1da177e4 501 }
bd4cf0ed
AS
502
503 /* Convert JEQ into JNE when 'jump_true' is next insn. */
504 if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
505 insn->code = BPF_JMP | BPF_JNE | bpf_src;
506 target = i + fp->jf + 1;
f8f6d679 507 BPF_EMIT_JMP;
bd4cf0ed 508 break;
0b05b2a4 509 }
bd4cf0ed
AS
510
511 /* Other jumps are mapped into two insns: Jxx and JA. */
512 target = i + fp->jt + 1;
513 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
f8f6d679 514 BPF_EMIT_JMP;
bd4cf0ed
AS
515 insn++;
516
517 insn->code = BPF_JMP | BPF_JA;
518 target = i + fp->jf + 1;
f8f6d679 519 BPF_EMIT_JMP;
bd4cf0ed
AS
520 break;
521
522 /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
523 case BPF_LDX | BPF_MSH | BPF_B:
9739eef1 524 /* tmp = A */
f8f6d679 525 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
1268e253 526 /* A = BPF_R0 = *(u8 *) (skb->data + K) */
f8f6d679 527 *insn++ = BPF_LD_ABS(BPF_B, fp->k);
9739eef1 528 /* A &= 0xf */
f8f6d679 529 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
9739eef1 530 /* A <<= 2 */
f8f6d679 531 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
9739eef1 532 /* X = A */
f8f6d679 533 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
9739eef1 534 /* A = tmp */
f8f6d679 535 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
bd4cf0ed
AS
536 break;
537
6205b9cf
DB
538 /* RET_K is remaped into 2 insns. RET_A case doesn't need an
539 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
540 */
bd4cf0ed
AS
541 case BPF_RET | BPF_A:
542 case BPF_RET | BPF_K:
6205b9cf
DB
543 if (BPF_RVAL(fp->code) == BPF_K)
544 *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
545 0, fp->k);
9739eef1 546 *insn = BPF_EXIT_INSN();
bd4cf0ed
AS
547 break;
548
549 /* Store to stack. */
550 case BPF_ST:
551 case BPF_STX:
f8f6d679
DB
552 *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
553 BPF_ST ? BPF_REG_A : BPF_REG_X,
554 -(BPF_MEMWORDS - fp->k) * 4);
bd4cf0ed
AS
555 break;
556
557 /* Load from stack. */
558 case BPF_LD | BPF_MEM:
559 case BPF_LDX | BPF_MEM:
f8f6d679
DB
560 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
561 BPF_REG_A : BPF_REG_X, BPF_REG_FP,
562 -(BPF_MEMWORDS - fp->k) * 4);
bd4cf0ed
AS
563 break;
564
565 /* A = K or X = K */
566 case BPF_LD | BPF_IMM:
567 case BPF_LDX | BPF_IMM:
f8f6d679
DB
568 *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
569 BPF_REG_A : BPF_REG_X, fp->k);
bd4cf0ed
AS
570 break;
571
572 /* X = A */
573 case BPF_MISC | BPF_TAX:
f8f6d679 574 *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
bd4cf0ed
AS
575 break;
576
577 /* A = X */
578 case BPF_MISC | BPF_TXA:
f8f6d679 579 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
bd4cf0ed
AS
580 break;
581
582 /* A = skb->len or X = skb->len */
583 case BPF_LD | BPF_W | BPF_LEN:
584 case BPF_LDX | BPF_W | BPF_LEN:
f8f6d679
DB
585 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
586 BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
587 offsetof(struct sk_buff, len));
bd4cf0ed
AS
588 break;
589
f8f6d679 590 /* Access seccomp_data fields. */
bd4cf0ed 591 case BPF_LDX | BPF_ABS | BPF_W:
9739eef1
AS
592 /* A = *(u32 *) (ctx + K) */
593 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
bd4cf0ed
AS
594 break;
595
ca9f1fd2 596 /* Unknown instruction. */
1da177e4 597 default:
bd4cf0ed 598 goto err;
1da177e4 599 }
bd4cf0ed
AS
600
601 insn++;
602 if (new_prog)
603 memcpy(new_insn, tmp_insns,
604 sizeof(*insn) * (insn - tmp_insns));
bd4cf0ed 605 new_insn += insn - tmp_insns;
1da177e4
LT
606 }
607
bd4cf0ed
AS
608 if (!new_prog) {
609 /* Only calculating new length. */
610 *new_len = new_insn - new_prog;
611 return 0;
612 }
613
614 pass++;
615 if (new_flen != new_insn - new_prog) {
616 new_flen = new_insn - new_prog;
617 if (pass > 2)
618 goto err;
bd4cf0ed
AS
619 goto do_pass;
620 }
621
622 kfree(addrs);
623 BUG_ON(*new_len != new_flen);
1da177e4 624 return 0;
bd4cf0ed
AS
625err:
626 kfree(addrs);
627 return -EINVAL;
1da177e4
LT
628}
629
bd4cf0ed 630/* Security:
bd4cf0ed 631 *
2d5311e4 632 * As we dont want to clear mem[] array for each packet going through
8ea6e345 633 * __bpf_prog_run(), we check that filter loaded by user never try to read
2d5311e4 634 * a cell if not previously written, and we check all branches to be sure
25985edc 635 * a malicious user doesn't try to abuse us.
2d5311e4 636 */
ec31a05c 637static int check_load_and_stores(const struct sock_filter *filter, int flen)
2d5311e4 638{
34805931 639 u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
2d5311e4
ED
640 int pc, ret = 0;
641
642 BUILD_BUG_ON(BPF_MEMWORDS > 16);
34805931 643
99e72a0f 644 masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
2d5311e4
ED
645 if (!masks)
646 return -ENOMEM;
34805931 647
2d5311e4
ED
648 memset(masks, 0xff, flen * sizeof(*masks));
649
650 for (pc = 0; pc < flen; pc++) {
651 memvalid &= masks[pc];
652
653 switch (filter[pc].code) {
34805931
DB
654 case BPF_ST:
655 case BPF_STX:
2d5311e4
ED
656 memvalid |= (1 << filter[pc].k);
657 break;
34805931
DB
658 case BPF_LD | BPF_MEM:
659 case BPF_LDX | BPF_MEM:
2d5311e4
ED
660 if (!(memvalid & (1 << filter[pc].k))) {
661 ret = -EINVAL;
662 goto error;
663 }
664 break;
34805931
DB
665 case BPF_JMP | BPF_JA:
666 /* A jump must set masks on target */
2d5311e4
ED
667 masks[pc + 1 + filter[pc].k] &= memvalid;
668 memvalid = ~0;
669 break;
34805931
DB
670 case BPF_JMP | BPF_JEQ | BPF_K:
671 case BPF_JMP | BPF_JEQ | BPF_X:
672 case BPF_JMP | BPF_JGE | BPF_K:
673 case BPF_JMP | BPF_JGE | BPF_X:
674 case BPF_JMP | BPF_JGT | BPF_K:
675 case BPF_JMP | BPF_JGT | BPF_X:
676 case BPF_JMP | BPF_JSET | BPF_K:
677 case BPF_JMP | BPF_JSET | BPF_X:
678 /* A jump must set masks on targets */
2d5311e4
ED
679 masks[pc + 1 + filter[pc].jt] &= memvalid;
680 masks[pc + 1 + filter[pc].jf] &= memvalid;
681 memvalid = ~0;
682 break;
683 }
684 }
685error:
686 kfree(masks);
687 return ret;
688}
689
34805931
DB
690static bool chk_code_allowed(u16 code_to_probe)
691{
692 static const bool codes[] = {
693 /* 32 bit ALU operations */
694 [BPF_ALU | BPF_ADD | BPF_K] = true,
695 [BPF_ALU | BPF_ADD | BPF_X] = true,
696 [BPF_ALU | BPF_SUB | BPF_K] = true,
697 [BPF_ALU | BPF_SUB | BPF_X] = true,
698 [BPF_ALU | BPF_MUL | BPF_K] = true,
699 [BPF_ALU | BPF_MUL | BPF_X] = true,
700 [BPF_ALU | BPF_DIV | BPF_K] = true,
701 [BPF_ALU | BPF_DIV | BPF_X] = true,
702 [BPF_ALU | BPF_MOD | BPF_K] = true,
703 [BPF_ALU | BPF_MOD | BPF_X] = true,
704 [BPF_ALU | BPF_AND | BPF_K] = true,
705 [BPF_ALU | BPF_AND | BPF_X] = true,
706 [BPF_ALU | BPF_OR | BPF_K] = true,
707 [BPF_ALU | BPF_OR | BPF_X] = true,
708 [BPF_ALU | BPF_XOR | BPF_K] = true,
709 [BPF_ALU | BPF_XOR | BPF_X] = true,
710 [BPF_ALU | BPF_LSH | BPF_K] = true,
711 [BPF_ALU | BPF_LSH | BPF_X] = true,
712 [BPF_ALU | BPF_RSH | BPF_K] = true,
713 [BPF_ALU | BPF_RSH | BPF_X] = true,
714 [BPF_ALU | BPF_NEG] = true,
715 /* Load instructions */
716 [BPF_LD | BPF_W | BPF_ABS] = true,
717 [BPF_LD | BPF_H | BPF_ABS] = true,
718 [BPF_LD | BPF_B | BPF_ABS] = true,
719 [BPF_LD | BPF_W | BPF_LEN] = true,
720 [BPF_LD | BPF_W | BPF_IND] = true,
721 [BPF_LD | BPF_H | BPF_IND] = true,
722 [BPF_LD | BPF_B | BPF_IND] = true,
723 [BPF_LD | BPF_IMM] = true,
724 [BPF_LD | BPF_MEM] = true,
725 [BPF_LDX | BPF_W | BPF_LEN] = true,
726 [BPF_LDX | BPF_B | BPF_MSH] = true,
727 [BPF_LDX | BPF_IMM] = true,
728 [BPF_LDX | BPF_MEM] = true,
729 /* Store instructions */
730 [BPF_ST] = true,
731 [BPF_STX] = true,
732 /* Misc instructions */
733 [BPF_MISC | BPF_TAX] = true,
734 [BPF_MISC | BPF_TXA] = true,
735 /* Return instructions */
736 [BPF_RET | BPF_K] = true,
737 [BPF_RET | BPF_A] = true,
738 /* Jump instructions */
739 [BPF_JMP | BPF_JA] = true,
740 [BPF_JMP | BPF_JEQ | BPF_K] = true,
741 [BPF_JMP | BPF_JEQ | BPF_X] = true,
742 [BPF_JMP | BPF_JGE | BPF_K] = true,
743 [BPF_JMP | BPF_JGE | BPF_X] = true,
744 [BPF_JMP | BPF_JGT | BPF_K] = true,
745 [BPF_JMP | BPF_JGT | BPF_X] = true,
746 [BPF_JMP | BPF_JSET | BPF_K] = true,
747 [BPF_JMP | BPF_JSET | BPF_X] = true,
748 };
749
750 if (code_to_probe >= ARRAY_SIZE(codes))
751 return false;
752
753 return codes[code_to_probe];
754}
755
f7bd9e36
DB
756static bool bpf_check_basics_ok(const struct sock_filter *filter,
757 unsigned int flen)
758{
759 if (filter == NULL)
760 return false;
761 if (flen == 0 || flen > BPF_MAXINSNS)
762 return false;
763
764 return true;
765}
766
1da177e4 767/**
4df95ff4 768 * bpf_check_classic - verify socket filter code
1da177e4
LT
769 * @filter: filter to verify
770 * @flen: length of filter
771 *
772 * Check the user's filter code. If we let some ugly
773 * filter code slip through kaboom! The filter must contain
93699863
KK
774 * no references or jumps that are out of range, no illegal
775 * instructions, and must end with a RET instruction.
1da177e4 776 *
7b11f69f
KK
777 * All jumps are forward as they are not signed.
778 *
779 * Returns 0 if the rule set is legal or -EINVAL if not.
1da177e4 780 */
d9e12f42
NS
781static int bpf_check_classic(const struct sock_filter *filter,
782 unsigned int flen)
1da177e4 783{
aa1113d9 784 bool anc_found;
34805931 785 int pc;
1da177e4 786
34805931 787 /* Check the filter code now */
1da177e4 788 for (pc = 0; pc < flen; pc++) {
ec31a05c 789 const struct sock_filter *ftest = &filter[pc];
93699863 790
34805931
DB
791 /* May we actually operate on this code? */
792 if (!chk_code_allowed(ftest->code))
cba328fc 793 return -EINVAL;
34805931 794
93699863 795 /* Some instructions need special checks */
34805931
DB
796 switch (ftest->code) {
797 case BPF_ALU | BPF_DIV | BPF_K:
798 case BPF_ALU | BPF_MOD | BPF_K:
799 /* Check for division by zero */
b6069a95
ED
800 if (ftest->k == 0)
801 return -EINVAL;
802 break;
229394e8
RV
803 case BPF_ALU | BPF_LSH | BPF_K:
804 case BPF_ALU | BPF_RSH | BPF_K:
805 if (ftest->k >= 32)
806 return -EINVAL;
807 break;
34805931
DB
808 case BPF_LD | BPF_MEM:
809 case BPF_LDX | BPF_MEM:
810 case BPF_ST:
811 case BPF_STX:
812 /* Check for invalid memory addresses */
93699863
KK
813 if (ftest->k >= BPF_MEMWORDS)
814 return -EINVAL;
815 break;
34805931
DB
816 case BPF_JMP | BPF_JA:
817 /* Note, the large ftest->k might cause loops.
93699863
KK
818 * Compare this with conditional jumps below,
819 * where offsets are limited. --ANK (981016)
820 */
34805931 821 if (ftest->k >= (unsigned int)(flen - pc - 1))
93699863 822 return -EINVAL;
01f2f3f6 823 break;
34805931
DB
824 case BPF_JMP | BPF_JEQ | BPF_K:
825 case BPF_JMP | BPF_JEQ | BPF_X:
826 case BPF_JMP | BPF_JGE | BPF_K:
827 case BPF_JMP | BPF_JGE | BPF_X:
828 case BPF_JMP | BPF_JGT | BPF_K:
829 case BPF_JMP | BPF_JGT | BPF_X:
830 case BPF_JMP | BPF_JSET | BPF_K:
831 case BPF_JMP | BPF_JSET | BPF_X:
832 /* Both conditionals must be safe */
e35bedf3 833 if (pc + ftest->jt + 1 >= flen ||
93699863
KK
834 pc + ftest->jf + 1 >= flen)
835 return -EINVAL;
cba328fc 836 break;
34805931
DB
837 case BPF_LD | BPF_W | BPF_ABS:
838 case BPF_LD | BPF_H | BPF_ABS:
839 case BPF_LD | BPF_B | BPF_ABS:
aa1113d9 840 anc_found = false;
34805931
DB
841 if (bpf_anc_helper(ftest) & BPF_ANC)
842 anc_found = true;
843 /* Ancillary operation unknown or unsupported */
aa1113d9
DB
844 if (anc_found == false && ftest->k >= SKF_AD_OFF)
845 return -EINVAL;
01f2f3f6
HPP
846 }
847 }
93699863 848
34805931 849 /* Last instruction must be a RET code */
01f2f3f6 850 switch (filter[flen - 1].code) {
34805931
DB
851 case BPF_RET | BPF_K:
852 case BPF_RET | BPF_A:
2d5311e4 853 return check_load_and_stores(filter, flen);
cba328fc 854 }
34805931 855
cba328fc 856 return -EINVAL;
1da177e4
LT
857}
858
7ae457c1
AS
859static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
860 const struct sock_fprog *fprog)
a3ea269b 861{
009937e7 862 unsigned int fsize = bpf_classic_proglen(fprog);
a3ea269b
DB
863 struct sock_fprog_kern *fkprog;
864
865 fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
866 if (!fp->orig_prog)
867 return -ENOMEM;
868
869 fkprog = fp->orig_prog;
870 fkprog->len = fprog->len;
658da937
DB
871
872 fkprog->filter = kmemdup(fp->insns, fsize,
873 GFP_KERNEL | __GFP_NOWARN);
a3ea269b
DB
874 if (!fkprog->filter) {
875 kfree(fp->orig_prog);
876 return -ENOMEM;
877 }
878
879 return 0;
880}
881
7ae457c1 882static void bpf_release_orig_filter(struct bpf_prog *fp)
a3ea269b
DB
883{
884 struct sock_fprog_kern *fprog = fp->orig_prog;
885
886 if (fprog) {
887 kfree(fprog->filter);
888 kfree(fprog);
889 }
890}
891
7ae457c1
AS
892static void __bpf_prog_release(struct bpf_prog *prog)
893{
24701ece 894 if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
89aa0758
AS
895 bpf_prog_put(prog);
896 } else {
897 bpf_release_orig_filter(prog);
898 bpf_prog_free(prog);
899 }
7ae457c1
AS
900}
901
34c5bd66
PN
902static void __sk_filter_release(struct sk_filter *fp)
903{
7ae457c1
AS
904 __bpf_prog_release(fp->prog);
905 kfree(fp);
34c5bd66
PN
906}
907
47e958ea 908/**
46bcf14f 909 * sk_filter_release_rcu - Release a socket filter by rcu_head
47e958ea
PE
910 * @rcu: rcu_head that contains the sk_filter to free
911 */
fbc907f0 912static void sk_filter_release_rcu(struct rcu_head *rcu)
47e958ea
PE
913{
914 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
915
34c5bd66 916 __sk_filter_release(fp);
47e958ea 917}
fbc907f0
DB
918
919/**
920 * sk_filter_release - release a socket filter
921 * @fp: filter to remove
922 *
923 * Remove a filter from a socket and release its resources.
924 */
925static void sk_filter_release(struct sk_filter *fp)
926{
927 if (atomic_dec_and_test(&fp->refcnt))
928 call_rcu(&fp->rcu, sk_filter_release_rcu);
929}
930
931void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
932{
7ae457c1 933 u32 filter_size = bpf_prog_size(fp->prog->len);
fbc907f0 934
278571ba
AS
935 atomic_sub(filter_size, &sk->sk_omem_alloc);
936 sk_filter_release(fp);
fbc907f0 937}
47e958ea 938
278571ba
AS
939/* try to charge the socket memory if there is space available
940 * return true on success
941 */
942bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
bd4cf0ed 943{
7ae457c1 944 u32 filter_size = bpf_prog_size(fp->prog->len);
278571ba
AS
945
946 /* same check as in sock_kmalloc() */
947 if (filter_size <= sysctl_optmem_max &&
948 atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
949 atomic_inc(&fp->refcnt);
950 atomic_add(filter_size, &sk->sk_omem_alloc);
951 return true;
bd4cf0ed 952 }
278571ba 953 return false;
bd4cf0ed
AS
954}
955
7ae457c1 956static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
bd4cf0ed
AS
957{
958 struct sock_filter *old_prog;
7ae457c1 959 struct bpf_prog *old_fp;
34805931 960 int err, new_len, old_len = fp->len;
bd4cf0ed
AS
961
962 /* We are free to overwrite insns et al right here as it
963 * won't be used at this point in time anymore internally
964 * after the migration to the internal BPF instruction
965 * representation.
966 */
967 BUILD_BUG_ON(sizeof(struct sock_filter) !=
2695fb55 968 sizeof(struct bpf_insn));
bd4cf0ed 969
bd4cf0ed
AS
970 /* Conversion cannot happen on overlapping memory areas,
971 * so we need to keep the user BPF around until the 2nd
972 * pass. At this time, the user BPF is stored in fp->insns.
973 */
974 old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
658da937 975 GFP_KERNEL | __GFP_NOWARN);
bd4cf0ed
AS
976 if (!old_prog) {
977 err = -ENOMEM;
978 goto out_err;
979 }
980
981 /* 1st pass: calculate the new program length. */
8fb575ca 982 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
bd4cf0ed
AS
983 if (err)
984 goto out_err_free;
985
986 /* Expand fp for appending the new filter representation. */
987 old_fp = fp;
60a3b225 988 fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
bd4cf0ed
AS
989 if (!fp) {
990 /* The old_fp is still around in case we couldn't
991 * allocate new memory, so uncharge on that one.
992 */
993 fp = old_fp;
994 err = -ENOMEM;
995 goto out_err_free;
996 }
997
bd4cf0ed
AS
998 fp->len = new_len;
999
2695fb55 1000 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
8fb575ca 1001 err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
bd4cf0ed 1002 if (err)
8fb575ca 1003 /* 2nd bpf_convert_filter() can fail only if it fails
bd4cf0ed
AS
1004 * to allocate memory, remapping must succeed. Note,
1005 * that at this time old_fp has already been released
278571ba 1006 * by krealloc().
bd4cf0ed
AS
1007 */
1008 goto out_err_free;
1009
d1c55ab5
DB
1010 /* We are guaranteed to never error here with cBPF to eBPF
1011 * transitions, since there's no issue with type compatibility
1012 * checks on program arrays.
1013 */
1014 fp = bpf_prog_select_runtime(fp, &err);
5fe821a9 1015
bd4cf0ed
AS
1016 kfree(old_prog);
1017 return fp;
1018
1019out_err_free:
1020 kfree(old_prog);
1021out_err:
7ae457c1 1022 __bpf_prog_release(fp);
bd4cf0ed
AS
1023 return ERR_PTR(err);
1024}
1025
ac67eb2c
DB
1026static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
1027 bpf_aux_classic_check_t trans)
302d6637
JP
1028{
1029 int err;
1030
bd4cf0ed 1031 fp->bpf_func = NULL;
a91263d5 1032 fp->jited = 0;
302d6637 1033
4df95ff4 1034 err = bpf_check_classic(fp->insns, fp->len);
418c96ac 1035 if (err) {
7ae457c1 1036 __bpf_prog_release(fp);
bd4cf0ed 1037 return ERR_PTR(err);
418c96ac 1038 }
302d6637 1039
4ae92bc7
NS
1040 /* There might be additional checks and transformations
1041 * needed on classic filters, f.e. in case of seccomp.
1042 */
1043 if (trans) {
1044 err = trans(fp->insns, fp->len);
1045 if (err) {
1046 __bpf_prog_release(fp);
1047 return ERR_PTR(err);
1048 }
1049 }
1050
bd4cf0ed
AS
1051 /* Probe if we can JIT compile the filter and if so, do
1052 * the compilation of the filter.
1053 */
302d6637 1054 bpf_jit_compile(fp);
bd4cf0ed
AS
1055
1056 /* JIT compiler couldn't process this filter, so do the
1057 * internal BPF translation for the optimized interpreter.
1058 */
5fe821a9 1059 if (!fp->jited)
7ae457c1 1060 fp = bpf_migrate_filter(fp);
bd4cf0ed
AS
1061
1062 return fp;
302d6637
JP
1063}
1064
1065/**
7ae457c1 1066 * bpf_prog_create - create an unattached filter
c6c4b97c 1067 * @pfp: the unattached filter that is created
677a9fd3 1068 * @fprog: the filter program
302d6637 1069 *
c6c4b97c 1070 * Create a filter independent of any socket. We first run some
302d6637
JP
1071 * sanity checks on it to make sure it does not explode on us later.
1072 * If an error occurs or there is insufficient memory for the filter
1073 * a negative errno code is returned. On success the return is zero.
1074 */
7ae457c1 1075int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
302d6637 1076{
009937e7 1077 unsigned int fsize = bpf_classic_proglen(fprog);
7ae457c1 1078 struct bpf_prog *fp;
302d6637
JP
1079
1080 /* Make sure new filter is there and in the right amounts. */
f7bd9e36 1081 if (!bpf_check_basics_ok(fprog->filter, fprog->len))
302d6637
JP
1082 return -EINVAL;
1083
60a3b225 1084 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
302d6637
JP
1085 if (!fp)
1086 return -ENOMEM;
a3ea269b 1087
302d6637
JP
1088 memcpy(fp->insns, fprog->filter, fsize);
1089
302d6637 1090 fp->len = fprog->len;
a3ea269b
DB
1091 /* Since unattached filters are not copied back to user
1092 * space through sk_get_filter(), we do not need to hold
1093 * a copy here, and can spare us the work.
1094 */
1095 fp->orig_prog = NULL;
302d6637 1096
7ae457c1 1097 /* bpf_prepare_filter() already takes care of freeing
bd4cf0ed
AS
1098 * memory in case something goes wrong.
1099 */
4ae92bc7 1100 fp = bpf_prepare_filter(fp, NULL);
bd4cf0ed
AS
1101 if (IS_ERR(fp))
1102 return PTR_ERR(fp);
302d6637
JP
1103
1104 *pfp = fp;
1105 return 0;
302d6637 1106}
7ae457c1 1107EXPORT_SYMBOL_GPL(bpf_prog_create);
302d6637 1108
ac67eb2c
DB
1109/**
1110 * bpf_prog_create_from_user - create an unattached filter from user buffer
1111 * @pfp: the unattached filter that is created
1112 * @fprog: the filter program
1113 * @trans: post-classic verifier transformation handler
bab18991 1114 * @save_orig: save classic BPF program
ac67eb2c
DB
1115 *
1116 * This function effectively does the same as bpf_prog_create(), only
1117 * that it builds up its insns buffer from user space provided buffer.
1118 * It also allows for passing a bpf_aux_classic_check_t handler.
1119 */
1120int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
bab18991 1121 bpf_aux_classic_check_t trans, bool save_orig)
ac67eb2c
DB
1122{
1123 unsigned int fsize = bpf_classic_proglen(fprog);
1124 struct bpf_prog *fp;
bab18991 1125 int err;
ac67eb2c
DB
1126
1127 /* Make sure new filter is there and in the right amounts. */
f7bd9e36 1128 if (!bpf_check_basics_ok(fprog->filter, fprog->len))
ac67eb2c
DB
1129 return -EINVAL;
1130
1131 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1132 if (!fp)
1133 return -ENOMEM;
1134
1135 if (copy_from_user(fp->insns, fprog->filter, fsize)) {
1136 __bpf_prog_free(fp);
1137 return -EFAULT;
1138 }
1139
1140 fp->len = fprog->len;
ac67eb2c
DB
1141 fp->orig_prog = NULL;
1142
bab18991
DB
1143 if (save_orig) {
1144 err = bpf_prog_store_orig_filter(fp, fprog);
1145 if (err) {
1146 __bpf_prog_free(fp);
1147 return -ENOMEM;
1148 }
1149 }
1150
ac67eb2c
DB
1151 /* bpf_prepare_filter() already takes care of freeing
1152 * memory in case something goes wrong.
1153 */
1154 fp = bpf_prepare_filter(fp, trans);
1155 if (IS_ERR(fp))
1156 return PTR_ERR(fp);
1157
1158 *pfp = fp;
1159 return 0;
1160}
2ea273d7 1161EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
ac67eb2c 1162
7ae457c1 1163void bpf_prog_destroy(struct bpf_prog *fp)
302d6637 1164{
7ae457c1 1165 __bpf_prog_release(fp);
302d6637 1166}
7ae457c1 1167EXPORT_SYMBOL_GPL(bpf_prog_destroy);
302d6637 1168
8ced425e 1169static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
49b31e57
DB
1170{
1171 struct sk_filter *fp, *old_fp;
1172
1173 fp = kmalloc(sizeof(*fp), GFP_KERNEL);
1174 if (!fp)
1175 return -ENOMEM;
1176
1177 fp->prog = prog;
1178 atomic_set(&fp->refcnt, 0);
1179
1180 if (!sk_filter_charge(sk, fp)) {
1181 kfree(fp);
1182 return -ENOMEM;
1183 }
1184
8ced425e
HFS
1185 old_fp = rcu_dereference_protected(sk->sk_filter,
1186 lockdep_sock_is_held(sk));
49b31e57 1187 rcu_assign_pointer(sk->sk_filter, fp);
8ced425e 1188
49b31e57
DB
1189 if (old_fp)
1190 sk_filter_uncharge(sk, old_fp);
1191
1192 return 0;
1193}
1194
538950a1
CG
1195static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
1196{
1197 struct bpf_prog *old_prog;
1198 int err;
1199
1200 if (bpf_prog_size(prog->len) > sysctl_optmem_max)
1201 return -ENOMEM;
1202
fa463497 1203 if (sk_unhashed(sk) && sk->sk_reuseport) {
538950a1
CG
1204 err = reuseport_alloc(sk);
1205 if (err)
1206 return err;
1207 } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
1208 /* The socket wasn't bound with SO_REUSEPORT */
1209 return -EINVAL;
1210 }
1211
1212 old_prog = reuseport_attach_prog(sk, prog);
1213 if (old_prog)
1214 bpf_prog_destroy(old_prog);
1215
1216 return 0;
1217}
1218
1219static
1220struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
1da177e4 1221{
009937e7 1222 unsigned int fsize = bpf_classic_proglen(fprog);
7ae457c1 1223 struct bpf_prog *prog;
1da177e4
LT
1224 int err;
1225
d59577b6 1226 if (sock_flag(sk, SOCK_FILTER_LOCKED))
538950a1 1227 return ERR_PTR(-EPERM);
d59577b6 1228
1da177e4 1229 /* Make sure new filter is there and in the right amounts. */
f7bd9e36 1230 if (!bpf_check_basics_ok(fprog->filter, fprog->len))
538950a1 1231 return ERR_PTR(-EINVAL);
1da177e4 1232
f7bd9e36 1233 prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
7ae457c1 1234 if (!prog)
538950a1 1235 return ERR_PTR(-ENOMEM);
a3ea269b 1236
7ae457c1 1237 if (copy_from_user(prog->insns, fprog->filter, fsize)) {
c0d1379a 1238 __bpf_prog_free(prog);
538950a1 1239 return ERR_PTR(-EFAULT);
1da177e4
LT
1240 }
1241
7ae457c1 1242 prog->len = fprog->len;
1da177e4 1243
7ae457c1 1244 err = bpf_prog_store_orig_filter(prog, fprog);
a3ea269b 1245 if (err) {
c0d1379a 1246 __bpf_prog_free(prog);
538950a1 1247 return ERR_PTR(-ENOMEM);
a3ea269b
DB
1248 }
1249
7ae457c1 1250 /* bpf_prepare_filter() already takes care of freeing
bd4cf0ed
AS
1251 * memory in case something goes wrong.
1252 */
538950a1
CG
1253 return bpf_prepare_filter(prog, NULL);
1254}
1255
1256/**
1257 * sk_attach_filter - attach a socket filter
1258 * @fprog: the filter program
1259 * @sk: the socket to use
1260 *
1261 * Attach the user's filter code. We first run some sanity checks on
1262 * it to make sure it does not explode on us later. If an error
1263 * occurs or there is insufficient memory for the filter a negative
1264 * errno code is returned. On success the return is zero.
1265 */
8ced425e 1266int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
538950a1
CG
1267{
1268 struct bpf_prog *prog = __get_filter(fprog, sk);
1269 int err;
1270
7ae457c1
AS
1271 if (IS_ERR(prog))
1272 return PTR_ERR(prog);
1273
8ced425e 1274 err = __sk_attach_prog(prog, sk);
49b31e57 1275 if (err < 0) {
7ae457c1 1276 __bpf_prog_release(prog);
49b31e57 1277 return err;
278571ba
AS
1278 }
1279
d3904b73 1280 return 0;
1da177e4 1281}
8ced425e 1282EXPORT_SYMBOL_GPL(sk_attach_filter);
1da177e4 1283
538950a1 1284int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
89aa0758 1285{
538950a1 1286 struct bpf_prog *prog = __get_filter(fprog, sk);
49b31e57 1287 int err;
89aa0758 1288
538950a1
CG
1289 if (IS_ERR(prog))
1290 return PTR_ERR(prog);
1291
1292 err = __reuseport_attach_prog(prog, sk);
1293 if (err < 0) {
1294 __bpf_prog_release(prog);
1295 return err;
1296 }
1297
1298 return 0;
1299}
1300
1301static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
1302{
89aa0758 1303 if (sock_flag(sk, SOCK_FILTER_LOCKED))
538950a1 1304 return ERR_PTR(-EPERM);
89aa0758 1305
113214be 1306 return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
538950a1
CG
1307}
1308
1309int sk_attach_bpf(u32 ufd, struct sock *sk)
1310{
1311 struct bpf_prog *prog = __get_bpf(ufd, sk);
1312 int err;
1313
1314 if (IS_ERR(prog))
1315 return PTR_ERR(prog);
1316
8ced425e 1317 err = __sk_attach_prog(prog, sk);
49b31e57 1318 if (err < 0) {
89aa0758 1319 bpf_prog_put(prog);
49b31e57 1320 return err;
89aa0758
AS
1321 }
1322
89aa0758
AS
1323 return 0;
1324}
1325
538950a1
CG
1326int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
1327{
1328 struct bpf_prog *prog = __get_bpf(ufd, sk);
1329 int err;
1330
1331 if (IS_ERR(prog))
1332 return PTR_ERR(prog);
1333
1334 err = __reuseport_attach_prog(prog, sk);
1335 if (err < 0) {
1336 bpf_prog_put(prog);
1337 return err;
1338 }
1339
1340 return 0;
1341}
1342
21cafc1d
DB
1343struct bpf_scratchpad {
1344 union {
1345 __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
1346 u8 buff[MAX_BPF_STACK];
1347 };
1348};
1349
1350static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
91bc4822 1351
5293efe6
DB
1352static inline int __bpf_try_make_writable(struct sk_buff *skb,
1353 unsigned int write_len)
1354{
1355 return skb_ensure_writable(skb, write_len);
1356}
1357
db58ba45
AS
1358static inline int bpf_try_make_writable(struct sk_buff *skb,
1359 unsigned int write_len)
1360{
5293efe6 1361 int err = __bpf_try_make_writable(skb, write_len);
db58ba45 1362
0ed661d5 1363 bpf_compute_data_end(skb);
db58ba45
AS
1364 return err;
1365}
1366
a2bfe6bf
DB
1367static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
1368{
1369 if (skb_at_tc_ingress(skb))
1370 skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1371}
1372
8065694e
DB
1373static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
1374{
1375 if (skb_at_tc_ingress(skb))
1376 skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1377}
1378
91bc4822 1379static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
608cd71a
AS
1380{
1381 struct sk_buff *skb = (struct sk_buff *) (long) r1;
0ed661d5 1382 unsigned int offset = (unsigned int) r2;
608cd71a
AS
1383 void *from = (void *) (long) r3;
1384 unsigned int len = (unsigned int) r4;
608cd71a
AS
1385 void *ptr;
1386
8afd54c8 1387 if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
781c53bc 1388 return -EINVAL;
0ed661d5 1389 if (unlikely(offset > 0xffff))
608cd71a 1390 return -EFAULT;
db58ba45 1391 if (unlikely(bpf_try_make_writable(skb, offset + len)))
608cd71a
AS
1392 return -EFAULT;
1393
0ed661d5 1394 ptr = skb->data + offset;
781c53bc 1395 if (flags & BPF_F_RECOMPUTE_CSUM)
479ffccc 1396 __skb_postpull_rcsum(skb, ptr, len, offset);
608cd71a
AS
1397
1398 memcpy(ptr, from, len);
1399
781c53bc 1400 if (flags & BPF_F_RECOMPUTE_CSUM)
479ffccc 1401 __skb_postpush_rcsum(skb, ptr, len, offset);
8afd54c8
DB
1402 if (flags & BPF_F_INVALIDATE_HASH)
1403 skb_clear_hash(skb);
f8ffad69 1404
608cd71a
AS
1405 return 0;
1406}
1407
577c50aa 1408static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
608cd71a
AS
1409 .func = bpf_skb_store_bytes,
1410 .gpl_only = false,
1411 .ret_type = RET_INTEGER,
1412 .arg1_type = ARG_PTR_TO_CTX,
1413 .arg2_type = ARG_ANYTHING,
1414 .arg3_type = ARG_PTR_TO_STACK,
1415 .arg4_type = ARG_CONST_STACK_SIZE,
91bc4822
AS
1416 .arg5_type = ARG_ANYTHING,
1417};
1418
05c74e5e
DB
1419static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1420{
1421 const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
0ed661d5 1422 unsigned int offset = (unsigned int) r2;
05c74e5e
DB
1423 void *to = (void *)(unsigned long) r3;
1424 unsigned int len = (unsigned int) r4;
1425 void *ptr;
1426
0ed661d5 1427 if (unlikely(offset > 0xffff))
074f528e 1428 goto err_clear;
05c74e5e
DB
1429
1430 ptr = skb_header_pointer(skb, offset, len, to);
1431 if (unlikely(!ptr))
074f528e 1432 goto err_clear;
05c74e5e
DB
1433 if (ptr != to)
1434 memcpy(to, ptr, len);
1435
1436 return 0;
074f528e
DB
1437err_clear:
1438 memset(to, 0, len);
1439 return -EFAULT;
05c74e5e
DB
1440}
1441
577c50aa 1442static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
05c74e5e
DB
1443 .func = bpf_skb_load_bytes,
1444 .gpl_only = false,
1445 .ret_type = RET_INTEGER,
1446 .arg1_type = ARG_PTR_TO_CTX,
1447 .arg2_type = ARG_ANYTHING,
074f528e 1448 .arg3_type = ARG_PTR_TO_RAW_STACK,
05c74e5e
DB
1449 .arg4_type = ARG_CONST_STACK_SIZE,
1450};
1451
a166151c 1452static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
91bc4822
AS
1453{
1454 struct sk_buff *skb = (struct sk_buff *) (long) r1;
0ed661d5
DB
1455 unsigned int offset = (unsigned int) r2;
1456 __sum16 *ptr;
91bc4822 1457
781c53bc
DB
1458 if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
1459 return -EINVAL;
0ed661d5 1460 if (unlikely(offset > 0xffff || offset & 1))
91bc4822 1461 return -EFAULT;
0ed661d5 1462 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
91bc4822
AS
1463 return -EFAULT;
1464
0ed661d5 1465 ptr = (__sum16 *)(skb->data + offset);
781c53bc 1466 switch (flags & BPF_F_HDR_FIELD_MASK) {
8050c0f0
DB
1467 case 0:
1468 if (unlikely(from != 0))
1469 return -EINVAL;
1470
1471 csum_replace_by_diff(ptr, to);
1472 break;
91bc4822
AS
1473 case 2:
1474 csum_replace2(ptr, from, to);
1475 break;
1476 case 4:
1477 csum_replace4(ptr, from, to);
1478 break;
1479 default:
1480 return -EINVAL;
1481 }
1482
91bc4822
AS
1483 return 0;
1484}
1485
577c50aa 1486static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
91bc4822
AS
1487 .func = bpf_l3_csum_replace,
1488 .gpl_only = false,
1489 .ret_type = RET_INTEGER,
1490 .arg1_type = ARG_PTR_TO_CTX,
1491 .arg2_type = ARG_ANYTHING,
1492 .arg3_type = ARG_ANYTHING,
1493 .arg4_type = ARG_ANYTHING,
1494 .arg5_type = ARG_ANYTHING,
1495};
1496
a166151c 1497static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
91bc4822
AS
1498{
1499 struct sk_buff *skb = (struct sk_buff *) (long) r1;
781c53bc 1500 bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
2f72959a 1501 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
0ed661d5
DB
1502 unsigned int offset = (unsigned int) r2;
1503 __sum16 *ptr;
91bc4822 1504
2f72959a
DB
1505 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
1506 BPF_F_HDR_FIELD_MASK)))
781c53bc 1507 return -EINVAL;
0ed661d5 1508 if (unlikely(offset > 0xffff || offset & 1))
91bc4822 1509 return -EFAULT;
0ed661d5 1510 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
91bc4822
AS
1511 return -EFAULT;
1512
0ed661d5 1513 ptr = (__sum16 *)(skb->data + offset);
2f72959a
DB
1514 if (is_mmzero && !*ptr)
1515 return 0;
91bc4822 1516
781c53bc 1517 switch (flags & BPF_F_HDR_FIELD_MASK) {
7d672345
DB
1518 case 0:
1519 if (unlikely(from != 0))
1520 return -EINVAL;
1521
1522 inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
1523 break;
91bc4822
AS
1524 case 2:
1525 inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
1526 break;
1527 case 4:
1528 inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
1529 break;
1530 default:
1531 return -EINVAL;
1532 }
1533
2f72959a
DB
1534 if (is_mmzero && !*ptr)
1535 *ptr = CSUM_MANGLED_0;
91bc4822
AS
1536 return 0;
1537}
1538
577c50aa 1539static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
91bc4822
AS
1540 .func = bpf_l4_csum_replace,
1541 .gpl_only = false,
1542 .ret_type = RET_INTEGER,
1543 .arg1_type = ARG_PTR_TO_CTX,
1544 .arg2_type = ARG_ANYTHING,
1545 .arg3_type = ARG_ANYTHING,
1546 .arg4_type = ARG_ANYTHING,
1547 .arg5_type = ARG_ANYTHING,
608cd71a
AS
1548};
1549
7d672345
DB
1550static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
1551{
21cafc1d 1552 struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
7d672345
DB
1553 u64 diff_size = from_size + to_size;
1554 __be32 *from = (__be32 *) (long) r1;
1555 __be32 *to = (__be32 *) (long) r3;
1556 int i, j = 0;
1557
1558 /* This is quite flexible, some examples:
1559 *
1560 * from_size == 0, to_size > 0, seed := csum --> pushing data
1561 * from_size > 0, to_size == 0, seed := csum --> pulling data
1562 * from_size > 0, to_size > 0, seed := 0 --> diffing data
1563 *
1564 * Even for diffing, from_size and to_size don't need to be equal.
1565 */
1566 if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
1567 diff_size > sizeof(sp->diff)))
1568 return -EINVAL;
1569
1570 for (i = 0; i < from_size / sizeof(__be32); i++, j++)
1571 sp->diff[j] = ~from[i];
1572 for (i = 0; i < to_size / sizeof(__be32); i++, j++)
1573 sp->diff[j] = to[i];
1574
1575 return csum_partial(sp->diff, diff_size, seed);
1576}
1577
577c50aa 1578static const struct bpf_func_proto bpf_csum_diff_proto = {
7d672345
DB
1579 .func = bpf_csum_diff,
1580 .gpl_only = false,
1581 .ret_type = RET_INTEGER,
1582 .arg1_type = ARG_PTR_TO_STACK,
1583 .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO,
1584 .arg3_type = ARG_PTR_TO_STACK,
1585 .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO,
1586 .arg5_type = ARG_ANYTHING,
1587};
1588
a70b506e
DB
1589static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
1590{
a70b506e
DB
1591 return dev_forward_skb(dev, skb);
1592}
1593
1594static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
1595{
1596 int ret;
1597
1598 if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
1599 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
1600 kfree_skb(skb);
1601 return -ENETDOWN;
1602 }
1603
1604 skb->dev = dev;
1605
1606 __this_cpu_inc(xmit_recursion);
1607 ret = dev_queue_xmit(skb);
1608 __this_cpu_dec(xmit_recursion);
1609
1610 return ret;
1611}
1612
3896d655
AS
1613static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
1614{
a70b506e 1615 struct sk_buff *skb = (struct sk_buff *) (long) r1;
3896d655
AS
1616 struct net_device *dev;
1617
781c53bc
DB
1618 if (unlikely(flags & ~(BPF_F_INGRESS)))
1619 return -EINVAL;
1620
3896d655
AS
1621 dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
1622 if (unlikely(!dev))
1623 return -EINVAL;
1624
a70b506e
DB
1625 skb = skb_clone(skb, GFP_ATOMIC);
1626 if (unlikely(!skb))
3896d655
AS
1627 return -ENOMEM;
1628
a2bfe6bf
DB
1629 bpf_push_mac_rcsum(skb);
1630
a70b506e
DB
1631 return flags & BPF_F_INGRESS ?
1632 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
3896d655
AS
1633}
1634
577c50aa 1635static const struct bpf_func_proto bpf_clone_redirect_proto = {
3896d655
AS
1636 .func = bpf_clone_redirect,
1637 .gpl_only = false,
1638 .ret_type = RET_INTEGER,
1639 .arg1_type = ARG_PTR_TO_CTX,
1640 .arg2_type = ARG_ANYTHING,
1641 .arg3_type = ARG_ANYTHING,
1642};
1643
27b29f63
AS
1644struct redirect_info {
1645 u32 ifindex;
1646 u32 flags;
1647};
1648
1649static DEFINE_PER_CPU(struct redirect_info, redirect_info);
781c53bc 1650
27b29f63
AS
1651static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
1652{
1653 struct redirect_info *ri = this_cpu_ptr(&redirect_info);
1654
781c53bc
DB
1655 if (unlikely(flags & ~(BPF_F_INGRESS)))
1656 return TC_ACT_SHOT;
1657
27b29f63
AS
1658 ri->ifindex = ifindex;
1659 ri->flags = flags;
781c53bc 1660
27b29f63
AS
1661 return TC_ACT_REDIRECT;
1662}
1663
1664int skb_do_redirect(struct sk_buff *skb)
1665{
1666 struct redirect_info *ri = this_cpu_ptr(&redirect_info);
1667 struct net_device *dev;
1668
1669 dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
1670 ri->ifindex = 0;
1671 if (unlikely(!dev)) {
1672 kfree_skb(skb);
1673 return -EINVAL;
1674 }
1675
a2bfe6bf
DB
1676 bpf_push_mac_rcsum(skb);
1677
a70b506e
DB
1678 return ri->flags & BPF_F_INGRESS ?
1679 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
27b29f63
AS
1680}
1681
577c50aa 1682static const struct bpf_func_proto bpf_redirect_proto = {
27b29f63
AS
1683 .func = bpf_redirect,
1684 .gpl_only = false,
1685 .ret_type = RET_INTEGER,
1686 .arg1_type = ARG_ANYTHING,
1687 .arg2_type = ARG_ANYTHING,
1688};
1689
8d20aabe
DB
1690static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1691{
1692 return task_get_classid((struct sk_buff *) (unsigned long) r1);
1693}
1694
1695static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
1696 .func = bpf_get_cgroup_classid,
1697 .gpl_only = false,
1698 .ret_type = RET_INTEGER,
1699 .arg1_type = ARG_PTR_TO_CTX,
1700};
1701
c46646d0
DB
1702static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1703{
808c1b69 1704 return dst_tclassid((struct sk_buff *) (unsigned long) r1);
c46646d0
DB
1705}
1706
1707static const struct bpf_func_proto bpf_get_route_realm_proto = {
1708 .func = bpf_get_route_realm,
1709 .gpl_only = false,
1710 .ret_type = RET_INTEGER,
1711 .arg1_type = ARG_PTR_TO_CTX,
1712};
1713
13c5c240
DB
1714static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1715{
1716 /* If skb_clear_hash() was called due to mangling, we can
1717 * trigger SW recalculation here. Later access to hash
1718 * can then use the inline skb->hash via context directly
1719 * instead of calling this helper again.
1720 */
1721 return skb_get_hash((struct sk_buff *) (unsigned long) r1);
1722}
1723
1724static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
1725 .func = bpf_get_hash_recalc,
1726 .gpl_only = false,
1727 .ret_type = RET_INTEGER,
1728 .arg1_type = ARG_PTR_TO_CTX,
1729};
1730
4e10df9a
AS
1731static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
1732{
1733 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1734 __be16 vlan_proto = (__force __be16) r2;
db58ba45 1735 int ret;
4e10df9a
AS
1736
1737 if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
1738 vlan_proto != htons(ETH_P_8021AD)))
1739 vlan_proto = htons(ETH_P_8021Q);
1740
8065694e 1741 bpf_push_mac_rcsum(skb);
db58ba45 1742 ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
8065694e
DB
1743 bpf_pull_mac_rcsum(skb);
1744
db58ba45
AS
1745 bpf_compute_data_end(skb);
1746 return ret;
4e10df9a
AS
1747}
1748
1749const struct bpf_func_proto bpf_skb_vlan_push_proto = {
1750 .func = bpf_skb_vlan_push,
1751 .gpl_only = false,
1752 .ret_type = RET_INTEGER,
1753 .arg1_type = ARG_PTR_TO_CTX,
1754 .arg2_type = ARG_ANYTHING,
1755 .arg3_type = ARG_ANYTHING,
1756};
4d9c5c53 1757EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
4e10df9a
AS
1758
1759static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1760{
1761 struct sk_buff *skb = (struct sk_buff *) (long) r1;
db58ba45 1762 int ret;
4e10df9a 1763
8065694e 1764 bpf_push_mac_rcsum(skb);
db58ba45 1765 ret = skb_vlan_pop(skb);
8065694e
DB
1766 bpf_pull_mac_rcsum(skb);
1767
db58ba45
AS
1768 bpf_compute_data_end(skb);
1769 return ret;
4e10df9a
AS
1770}
1771
1772const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
1773 .func = bpf_skb_vlan_pop,
1774 .gpl_only = false,
1775 .ret_type = RET_INTEGER,
1776 .arg1_type = ARG_PTR_TO_CTX,
1777};
4d9c5c53 1778EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
4e10df9a 1779
6578171a
DB
1780static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
1781{
1782 /* Caller already did skb_cow() with len as headroom,
1783 * so no need to do it here.
1784 */
1785 skb_push(skb, len);
1786 memmove(skb->data, skb->data + len, off);
1787 memset(skb->data + off, 0, len);
1788
1789 /* No skb_postpush_rcsum(skb, skb->data + off, len)
1790 * needed here as it does not change the skb->csum
1791 * result for checksum complete when summing over
1792 * zeroed blocks.
1793 */
1794 return 0;
1795}
1796
1797static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
1798{
1799 /* skb_ensure_writable() is not needed here, as we're
1800 * already working on an uncloned skb.
1801 */
1802 if (unlikely(!pskb_may_pull(skb, off + len)))
1803 return -ENOMEM;
1804
1805 skb_postpull_rcsum(skb, skb->data + off, len);
1806 memmove(skb->data + len, skb->data, off);
1807 __skb_pull(skb, len);
1808
1809 return 0;
1810}
1811
1812static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
1813{
1814 bool trans_same = skb->transport_header == skb->network_header;
1815 int ret;
1816
1817 /* There's no need for __skb_push()/__skb_pull() pair to
1818 * get to the start of the mac header as we're guaranteed
1819 * to always start from here under eBPF.
1820 */
1821 ret = bpf_skb_generic_push(skb, off, len);
1822 if (likely(!ret)) {
1823 skb->mac_header -= len;
1824 skb->network_header -= len;
1825 if (trans_same)
1826 skb->transport_header = skb->network_header;
1827 }
1828
1829 return ret;
1830}
1831
1832static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
1833{
1834 bool trans_same = skb->transport_header == skb->network_header;
1835 int ret;
1836
1837 /* Same here, __skb_push()/__skb_pull() pair not needed. */
1838 ret = bpf_skb_generic_pop(skb, off, len);
1839 if (likely(!ret)) {
1840 skb->mac_header += len;
1841 skb->network_header += len;
1842 if (trans_same)
1843 skb->transport_header = skb->network_header;
1844 }
1845
1846 return ret;
1847}
1848
1849static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
1850{
1851 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
1852 u32 off = skb->network_header - skb->mac_header;
1853 int ret;
1854
1855 ret = skb_cow(skb, len_diff);
1856 if (unlikely(ret < 0))
1857 return ret;
1858
1859 ret = bpf_skb_net_hdr_push(skb, off, len_diff);
1860 if (unlikely(ret < 0))
1861 return ret;
1862
1863 if (skb_is_gso(skb)) {
1864 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to
1865 * be changed into SKB_GSO_TCPV6.
1866 */
1867 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
1868 skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
1869 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
1870 }
1871
1872 /* Due to IPv6 header, MSS needs to be downgraded. */
1873 skb_shinfo(skb)->gso_size -= len_diff;
1874 /* Header must be checked, and gso_segs recomputed. */
1875 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1876 skb_shinfo(skb)->gso_segs = 0;
1877 }
1878
1879 skb->protocol = htons(ETH_P_IPV6);
1880 skb_clear_hash(skb);
1881
1882 return 0;
1883}
1884
1885static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
1886{
1887 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
1888 u32 off = skb->network_header - skb->mac_header;
1889 int ret;
1890
1891 ret = skb_unclone(skb, GFP_ATOMIC);
1892 if (unlikely(ret < 0))
1893 return ret;
1894
1895 ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
1896 if (unlikely(ret < 0))
1897 return ret;
1898
1899 if (skb_is_gso(skb)) {
1900 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to
1901 * be changed into SKB_GSO_TCPV4.
1902 */
1903 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
1904 skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
1905 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
1906 }
1907
1908 /* Due to IPv4 header, MSS can be upgraded. */
1909 skb_shinfo(skb)->gso_size += len_diff;
1910 /* Header must be checked, and gso_segs recomputed. */
1911 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1912 skb_shinfo(skb)->gso_segs = 0;
1913 }
1914
1915 skb->protocol = htons(ETH_P_IP);
1916 skb_clear_hash(skb);
1917
1918 return 0;
1919}
1920
1921static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
1922{
1923 __be16 from_proto = skb->protocol;
1924
1925 if (from_proto == htons(ETH_P_IP) &&
1926 to_proto == htons(ETH_P_IPV6))
1927 return bpf_skb_proto_4_to_6(skb);
1928
1929 if (from_proto == htons(ETH_P_IPV6) &&
1930 to_proto == htons(ETH_P_IP))
1931 return bpf_skb_proto_6_to_4(skb);
1932
1933 return -ENOTSUPP;
1934}
1935
1936static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
1937{
1938 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1939 __be16 proto = (__force __be16) r2;
1940 int ret;
1941
1942 if (unlikely(flags))
1943 return -EINVAL;
1944
1945 /* General idea is that this helper does the basic groundwork
1946 * needed for changing the protocol, and eBPF program fills the
1947 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
1948 * and other helpers, rather than passing a raw buffer here.
1949 *
1950 * The rationale is to keep this minimal and without a need to
1951 * deal with raw packet data. F.e. even if we would pass buffers
1952 * here, the program still needs to call the bpf_lX_csum_replace()
1953 * helpers anyway. Plus, this way we keep also separation of
1954 * concerns, since f.e. bpf_skb_store_bytes() should only take
1955 * care of stores.
1956 *
1957 * Currently, additional options and extension header space are
1958 * not supported, but flags register is reserved so we can adapt
1959 * that. For offloads, we mark packet as dodgy, so that headers
1960 * need to be verified first.
1961 */
1962 ret = bpf_skb_proto_xlat(skb, proto);
1963 bpf_compute_data_end(skb);
1964 return ret;
1965}
1966
1967static const struct bpf_func_proto bpf_skb_change_proto_proto = {
1968 .func = bpf_skb_change_proto,
1969 .gpl_only = false,
1970 .ret_type = RET_INTEGER,
1971 .arg1_type = ARG_PTR_TO_CTX,
1972 .arg2_type = ARG_ANYTHING,
1973 .arg3_type = ARG_ANYTHING,
1974};
1975
d2485c42
DB
1976static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1977{
1978 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1979 u32 pkt_type = r2;
1980
1981 /* We only allow a restricted subset to be changed for now. */
45c7fffa
DB
1982 if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
1983 !skb_pkt_type_ok(pkt_type)))
d2485c42
DB
1984 return -EINVAL;
1985
1986 skb->pkt_type = pkt_type;
1987 return 0;
1988}
1989
1990static const struct bpf_func_proto bpf_skb_change_type_proto = {
1991 .func = bpf_skb_change_type,
1992 .gpl_only = false,
1993 .ret_type = RET_INTEGER,
1994 .arg1_type = ARG_PTR_TO_CTX,
1995 .arg2_type = ARG_ANYTHING,
1996};
1997
5293efe6
DB
1998static u32 __bpf_skb_min_len(const struct sk_buff *skb)
1999{
2000 u32 min_len = skb_network_offset(skb);
2001
2002 if (skb_transport_header_was_set(skb))
2003 min_len = skb_transport_offset(skb);
2004 if (skb->ip_summed == CHECKSUM_PARTIAL)
2005 min_len = skb_checksum_start_offset(skb) +
2006 skb->csum_offset + sizeof(__sum16);
2007 return min_len;
2008}
2009
2010static u32 __bpf_skb_max_len(const struct sk_buff *skb)
2011{
6088b582 2012 return skb->dev->mtu + skb->dev->hard_header_len;
5293efe6
DB
2013}
2014
2015static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
2016{
2017 unsigned int old_len = skb->len;
2018 int ret;
2019
2020 ret = __skb_grow_rcsum(skb, new_len);
2021 if (!ret)
2022 memset(skb->data + old_len, 0, new_len - old_len);
2023 return ret;
2024}
2025
2026static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
2027{
2028 return __skb_trim_rcsum(skb, new_len);
2029}
2030
2031static u64 bpf_skb_change_tail(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
2032{
2033 struct sk_buff *skb = (struct sk_buff *)(long) r1;
2034 u32 max_len = __bpf_skb_max_len(skb);
2035 u32 min_len = __bpf_skb_min_len(skb);
2036 u32 new_len = (u32) r2;
2037 int ret;
2038
2039 if (unlikely(flags || new_len > max_len || new_len < min_len))
2040 return -EINVAL;
2041 if (skb->encapsulation)
2042 return -ENOTSUPP;
2043
2044 /* The basic idea of this helper is that it's performing the
2045 * needed work to either grow or trim an skb, and eBPF program
2046 * rewrites the rest via helpers like bpf_skb_store_bytes(),
2047 * bpf_lX_csum_replace() and others rather than passing a raw
2048 * buffer here. This one is a slow path helper and intended
2049 * for replies with control messages.
2050 *
2051 * Like in bpf_skb_change_proto(), we want to keep this rather
2052 * minimal and without protocol specifics so that we are able
2053 * to separate concerns as in bpf_skb_store_bytes() should only
2054 * be the one responsible for writing buffers.
2055 *
2056 * It's really expected to be a slow path operation here for
2057 * control message replies, so we're implicitly linearizing,
2058 * uncloning and drop offloads from the skb by this.
2059 */
2060 ret = __bpf_try_make_writable(skb, skb->len);
2061 if (!ret) {
2062 if (new_len > skb->len)
2063 ret = bpf_skb_grow_rcsum(skb, new_len);
2064 else if (new_len < skb->len)
2065 ret = bpf_skb_trim_rcsum(skb, new_len);
2066 if (!ret && skb_is_gso(skb))
2067 skb_gso_reset(skb);
2068 }
2069
2070 bpf_compute_data_end(skb);
2071 return ret;
2072}
2073
2074static const struct bpf_func_proto bpf_skb_change_tail_proto = {
2075 .func = bpf_skb_change_tail,
2076 .gpl_only = false,
2077 .ret_type = RET_INTEGER,
2078 .arg1_type = ARG_PTR_TO_CTX,
2079 .arg2_type = ARG_ANYTHING,
2080 .arg3_type = ARG_ANYTHING,
2081};
2082
4e10df9a
AS
2083bool bpf_helper_changes_skb_data(void *func)
2084{
2085 if (func == bpf_skb_vlan_push)
2086 return true;
2087 if (func == bpf_skb_vlan_pop)
2088 return true;
3697649f
DB
2089 if (func == bpf_skb_store_bytes)
2090 return true;
6578171a
DB
2091 if (func == bpf_skb_change_proto)
2092 return true;
5293efe6
DB
2093 if (func == bpf_skb_change_tail)
2094 return true;
3697649f
DB
2095 if (func == bpf_l3_csum_replace)
2096 return true;
2097 if (func == bpf_l4_csum_replace)
2098 return true;
2099
4e10df9a
AS
2100 return false;
2101}
2102
555c8a86 2103static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
aa7145c1 2104 unsigned long off, unsigned long len)
555c8a86 2105{
aa7145c1 2106 void *ptr = skb_header_pointer(skb, off, len, dst_buff);
555c8a86
DB
2107
2108 if (unlikely(!ptr))
2109 return len;
2110 if (ptr != dst_buff)
2111 memcpy(dst_buff, ptr, len);
2112
2113 return 0;
2114}
2115
2116static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4,
2117 u64 meta_size)
2118{
2119 struct sk_buff *skb = (struct sk_buff *)(long) r1;
2120 struct bpf_map *map = (struct bpf_map *)(long) r2;
2121 u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
2122 void *meta = (void *)(long) r4;
2123
2124 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
2125 return -EINVAL;
2126 if (unlikely(skb_size > skb->len))
2127 return -EFAULT;
2128
2129 return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
2130 bpf_skb_copy);
2131}
2132
2133static const struct bpf_func_proto bpf_skb_event_output_proto = {
2134 .func = bpf_skb_event_output,
2135 .gpl_only = true,
2136 .ret_type = RET_INTEGER,
2137 .arg1_type = ARG_PTR_TO_CTX,
2138 .arg2_type = ARG_CONST_MAP_PTR,
2139 .arg3_type = ARG_ANYTHING,
2140 .arg4_type = ARG_PTR_TO_STACK,
2141 .arg5_type = ARG_CONST_STACK_SIZE,
2142};
2143
c6c33454
DB
2144static unsigned short bpf_tunnel_key_af(u64 flags)
2145{
2146 return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
2147}
2148
d3aa45ce
AS
2149static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
2150{
2151 struct sk_buff *skb = (struct sk_buff *) (long) r1;
2152 struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
c6c33454
DB
2153 const struct ip_tunnel_info *info = skb_tunnel_info(skb);
2154 u8 compat[sizeof(struct bpf_tunnel_key)];
074f528e
DB
2155 void *to_orig = to;
2156 int err;
d3aa45ce 2157
074f528e
DB
2158 if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
2159 err = -EINVAL;
2160 goto err_clear;
2161 }
2162 if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
2163 err = -EPROTO;
2164 goto err_clear;
2165 }
c6c33454 2166 if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
074f528e 2167 err = -EINVAL;
c6c33454 2168 switch (size) {
4018ab18 2169 case offsetof(struct bpf_tunnel_key, tunnel_label):
c0e760c9 2170 case offsetof(struct bpf_tunnel_key, tunnel_ext):
4018ab18 2171 goto set_compat;
c6c33454
DB
2172 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
2173 /* Fixup deprecated structure layouts here, so we have
2174 * a common path later on.
2175 */
2176 if (ip_tunnel_info_af(info) != AF_INET)
074f528e 2177 goto err_clear;
4018ab18 2178set_compat:
c6c33454
DB
2179 to = (struct bpf_tunnel_key *)compat;
2180 break;
2181 default:
074f528e 2182 goto err_clear;
c6c33454
DB
2183 }
2184 }
d3aa45ce
AS
2185
2186 to->tunnel_id = be64_to_cpu(info->key.tun_id);
c6c33454
DB
2187 to->tunnel_tos = info->key.tos;
2188 to->tunnel_ttl = info->key.ttl;
2189
4018ab18 2190 if (flags & BPF_F_TUNINFO_IPV6) {
c6c33454
DB
2191 memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
2192 sizeof(to->remote_ipv6));
4018ab18
DB
2193 to->tunnel_label = be32_to_cpu(info->key.label);
2194 } else {
c6c33454 2195 to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
4018ab18 2196 }
c6c33454
DB
2197
2198 if (unlikely(size != sizeof(struct bpf_tunnel_key)))
074f528e 2199 memcpy(to_orig, to, size);
d3aa45ce
AS
2200
2201 return 0;
074f528e
DB
2202err_clear:
2203 memset(to_orig, 0, size);
2204 return err;
d3aa45ce
AS
2205}
2206
577c50aa 2207static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
d3aa45ce
AS
2208 .func = bpf_skb_get_tunnel_key,
2209 .gpl_only = false,
2210 .ret_type = RET_INTEGER,
2211 .arg1_type = ARG_PTR_TO_CTX,
074f528e 2212 .arg2_type = ARG_PTR_TO_RAW_STACK,
d3aa45ce
AS
2213 .arg3_type = ARG_CONST_STACK_SIZE,
2214 .arg4_type = ARG_ANYTHING,
2215};
2216
14ca0751
DB
2217static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
2218{
2219 struct sk_buff *skb = (struct sk_buff *) (long) r1;
2220 u8 *to = (u8 *) (long) r2;
2221 const struct ip_tunnel_info *info = skb_tunnel_info(skb);
074f528e 2222 int err;
14ca0751
DB
2223
2224 if (unlikely(!info ||
074f528e
DB
2225 !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
2226 err = -ENOENT;
2227 goto err_clear;
2228 }
2229 if (unlikely(size < info->options_len)) {
2230 err = -ENOMEM;
2231 goto err_clear;
2232 }
14ca0751
DB
2233
2234 ip_tunnel_info_opts_get(to, info);
074f528e
DB
2235 if (size > info->options_len)
2236 memset(to + info->options_len, 0, size - info->options_len);
14ca0751
DB
2237
2238 return info->options_len;
074f528e
DB
2239err_clear:
2240 memset(to, 0, size);
2241 return err;
14ca0751
DB
2242}
2243
2244static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
2245 .func = bpf_skb_get_tunnel_opt,
2246 .gpl_only = false,
2247 .ret_type = RET_INTEGER,
2248 .arg1_type = ARG_PTR_TO_CTX,
074f528e 2249 .arg2_type = ARG_PTR_TO_RAW_STACK,
14ca0751
DB
2250 .arg3_type = ARG_CONST_STACK_SIZE,
2251};
2252
d3aa45ce
AS
2253static struct metadata_dst __percpu *md_dst;
2254
2255static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
2256{
2257 struct sk_buff *skb = (struct sk_buff *) (long) r1;
2258 struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
2259 struct metadata_dst *md = this_cpu_ptr(md_dst);
c6c33454 2260 u8 compat[sizeof(struct bpf_tunnel_key)];
d3aa45ce
AS
2261 struct ip_tunnel_info *info;
2262
22080870
DB
2263 if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
2264 BPF_F_DONT_FRAGMENT)))
d3aa45ce 2265 return -EINVAL;
c6c33454
DB
2266 if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
2267 switch (size) {
4018ab18 2268 case offsetof(struct bpf_tunnel_key, tunnel_label):
c0e760c9 2269 case offsetof(struct bpf_tunnel_key, tunnel_ext):
c6c33454
DB
2270 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
2271 /* Fixup deprecated structure layouts here, so we have
2272 * a common path later on.
2273 */
2274 memcpy(compat, from, size);
2275 memset(compat + size, 0, sizeof(compat) - size);
2276 from = (struct bpf_tunnel_key *)compat;
2277 break;
2278 default:
2279 return -EINVAL;
2280 }
2281 }
c0e760c9
DB
2282 if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
2283 from->tunnel_ext))
4018ab18 2284 return -EINVAL;
d3aa45ce
AS
2285
2286 skb_dst_drop(skb);
2287 dst_hold((struct dst_entry *) md);
2288 skb_dst_set(skb, (struct dst_entry *) md);
2289
2290 info = &md->u.tun_info;
2291 info->mode = IP_TUNNEL_INFO_TX;
c6c33454 2292
db3c6139 2293 info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
22080870
DB
2294 if (flags & BPF_F_DONT_FRAGMENT)
2295 info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
2296
d3aa45ce 2297 info->key.tun_id = cpu_to_be64(from->tunnel_id);
c6c33454
DB
2298 info->key.tos = from->tunnel_tos;
2299 info->key.ttl = from->tunnel_ttl;
2300
2301 if (flags & BPF_F_TUNINFO_IPV6) {
2302 info->mode |= IP_TUNNEL_INFO_IPV6;
2303 memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
2304 sizeof(from->remote_ipv6));
4018ab18
DB
2305 info->key.label = cpu_to_be32(from->tunnel_label) &
2306 IPV6_FLOWLABEL_MASK;
c6c33454
DB
2307 } else {
2308 info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
2da897e5
DB
2309 if (flags & BPF_F_ZERO_CSUM_TX)
2310 info->key.tun_flags &= ~TUNNEL_CSUM;
c6c33454 2311 }
d3aa45ce
AS
2312
2313 return 0;
2314}
2315
577c50aa 2316static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
d3aa45ce
AS
2317 .func = bpf_skb_set_tunnel_key,
2318 .gpl_only = false,
2319 .ret_type = RET_INTEGER,
2320 .arg1_type = ARG_PTR_TO_CTX,
2321 .arg2_type = ARG_PTR_TO_STACK,
2322 .arg3_type = ARG_CONST_STACK_SIZE,
2323 .arg4_type = ARG_ANYTHING,
2324};
2325
14ca0751
DB
2326static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
2327{
2328 struct sk_buff *skb = (struct sk_buff *) (long) r1;
2329 u8 *from = (u8 *) (long) r2;
2330 struct ip_tunnel_info *info = skb_tunnel_info(skb);
2331 const struct metadata_dst *md = this_cpu_ptr(md_dst);
2332
2333 if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
2334 return -EINVAL;
fca5fdf6 2335 if (unlikely(size > IP_TUNNEL_OPTS_MAX))
14ca0751
DB
2336 return -ENOMEM;
2337
2338 ip_tunnel_info_opts_set(info, from, size);
2339
2340 return 0;
2341}
2342
2343static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
2344 .func = bpf_skb_set_tunnel_opt,
2345 .gpl_only = false,
2346 .ret_type = RET_INTEGER,
2347 .arg1_type = ARG_PTR_TO_CTX,
2348 .arg2_type = ARG_PTR_TO_STACK,
2349 .arg3_type = ARG_CONST_STACK_SIZE,
2350};
2351
2352static const struct bpf_func_proto *
2353bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
d3aa45ce
AS
2354{
2355 if (!md_dst) {
14ca0751
DB
2356 /* Race is not possible, since it's called from verifier
2357 * that is holding verifier mutex.
d3aa45ce 2358 */
fca5fdf6 2359 md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
14ca0751 2360 GFP_KERNEL);
d3aa45ce
AS
2361 if (!md_dst)
2362 return NULL;
2363 }
14ca0751
DB
2364
2365 switch (which) {
2366 case BPF_FUNC_skb_set_tunnel_key:
2367 return &bpf_skb_set_tunnel_key_proto;
2368 case BPF_FUNC_skb_set_tunnel_opt:
2369 return &bpf_skb_set_tunnel_opt_proto;
2370 default:
2371 return NULL;
2372 }
d3aa45ce
AS
2373}
2374
747ea55e 2375static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
4a482f34
MKL
2376{
2377 struct sk_buff *skb = (struct sk_buff *)(long)r1;
2378 struct bpf_map *map = (struct bpf_map *)(long)r2;
2379 struct bpf_array *array = container_of(map, struct bpf_array, map);
2380 struct cgroup *cgrp;
2381 struct sock *sk;
2382 u32 i = (u32)r3;
2383
2384 sk = skb->sk;
2385 if (!sk || !sk_fullsock(sk))
2386 return -ENOENT;
2387
2388 if (unlikely(i >= array->map.max_entries))
2389 return -E2BIG;
2390
2391 cgrp = READ_ONCE(array->ptrs[i]);
2392 if (unlikely(!cgrp))
2393 return -EAGAIN;
2394
54fd9c2d 2395 return sk_under_cgroup_hierarchy(sk, cgrp);
4a482f34
MKL
2396}
2397
747ea55e
DB
2398static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
2399 .func = bpf_skb_under_cgroup,
4a482f34
MKL
2400 .gpl_only = false,
2401 .ret_type = RET_INTEGER,
2402 .arg1_type = ARG_PTR_TO_CTX,
2403 .arg2_type = ARG_CONST_MAP_PTR,
2404 .arg3_type = ARG_ANYTHING,
2405};
4a482f34 2406
4de16969
DB
2407static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
2408 unsigned long off, unsigned long len)
2409{
2410 memcpy(dst_buff, src_buff + off, len);
2411 return 0;
2412}
2413
2414static u64 bpf_xdp_event_output(u64 r1, u64 r2, u64 flags, u64 r4,
2415 u64 meta_size)
2416{
2417 struct xdp_buff *xdp = (struct xdp_buff *)(long) r1;
2418 struct bpf_map *map = (struct bpf_map *)(long) r2;
2419 u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
2420 void *meta = (void *)(long) r4;
2421
2422 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
2423 return -EINVAL;
2424 if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
2425 return -EFAULT;
2426
2427 return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size,
2428 bpf_xdp_copy);
2429}
2430
2431static const struct bpf_func_proto bpf_xdp_event_output_proto = {
2432 .func = bpf_xdp_event_output,
2433 .gpl_only = true,
2434 .ret_type = RET_INTEGER,
2435 .arg1_type = ARG_PTR_TO_CTX,
2436 .arg2_type = ARG_CONST_MAP_PTR,
2437 .arg3_type = ARG_ANYTHING,
2438 .arg4_type = ARG_PTR_TO_STACK,
2439 .arg5_type = ARG_CONST_STACK_SIZE,
2440};
2441
d4052c4a
DB
2442static const struct bpf_func_proto *
2443sk_filter_func_proto(enum bpf_func_id func_id)
89aa0758
AS
2444{
2445 switch (func_id) {
2446 case BPF_FUNC_map_lookup_elem:
2447 return &bpf_map_lookup_elem_proto;
2448 case BPF_FUNC_map_update_elem:
2449 return &bpf_map_update_elem_proto;
2450 case BPF_FUNC_map_delete_elem:
2451 return &bpf_map_delete_elem_proto;
03e69b50
DB
2452 case BPF_FUNC_get_prandom_u32:
2453 return &bpf_get_prandom_u32_proto;
c04167ce 2454 case BPF_FUNC_get_smp_processor_id:
80b48c44 2455 return &bpf_get_raw_smp_processor_id_proto;
04fd61ab
AS
2456 case BPF_FUNC_tail_call:
2457 return &bpf_tail_call_proto;
17ca8cbf
DB
2458 case BPF_FUNC_ktime_get_ns:
2459 return &bpf_ktime_get_ns_proto;
0756ea3e 2460 case BPF_FUNC_trace_printk:
1be7f75d
AS
2461 if (capable(CAP_SYS_ADMIN))
2462 return bpf_get_trace_printk_proto();
89aa0758
AS
2463 default:
2464 return NULL;
2465 }
2466}
2467
608cd71a
AS
2468static const struct bpf_func_proto *
2469tc_cls_act_func_proto(enum bpf_func_id func_id)
2470{
2471 switch (func_id) {
2472 case BPF_FUNC_skb_store_bytes:
2473 return &bpf_skb_store_bytes_proto;
05c74e5e
DB
2474 case BPF_FUNC_skb_load_bytes:
2475 return &bpf_skb_load_bytes_proto;
7d672345
DB
2476 case BPF_FUNC_csum_diff:
2477 return &bpf_csum_diff_proto;
91bc4822
AS
2478 case BPF_FUNC_l3_csum_replace:
2479 return &bpf_l3_csum_replace_proto;
2480 case BPF_FUNC_l4_csum_replace:
2481 return &bpf_l4_csum_replace_proto;
3896d655
AS
2482 case BPF_FUNC_clone_redirect:
2483 return &bpf_clone_redirect_proto;
8d20aabe
DB
2484 case BPF_FUNC_get_cgroup_classid:
2485 return &bpf_get_cgroup_classid_proto;
4e10df9a
AS
2486 case BPF_FUNC_skb_vlan_push:
2487 return &bpf_skb_vlan_push_proto;
2488 case BPF_FUNC_skb_vlan_pop:
2489 return &bpf_skb_vlan_pop_proto;
6578171a
DB
2490 case BPF_FUNC_skb_change_proto:
2491 return &bpf_skb_change_proto_proto;
d2485c42
DB
2492 case BPF_FUNC_skb_change_type:
2493 return &bpf_skb_change_type_proto;
5293efe6
DB
2494 case BPF_FUNC_skb_change_tail:
2495 return &bpf_skb_change_tail_proto;
d3aa45ce
AS
2496 case BPF_FUNC_skb_get_tunnel_key:
2497 return &bpf_skb_get_tunnel_key_proto;
2498 case BPF_FUNC_skb_set_tunnel_key:
14ca0751
DB
2499 return bpf_get_skb_set_tunnel_proto(func_id);
2500 case BPF_FUNC_skb_get_tunnel_opt:
2501 return &bpf_skb_get_tunnel_opt_proto;
2502 case BPF_FUNC_skb_set_tunnel_opt:
2503 return bpf_get_skb_set_tunnel_proto(func_id);
27b29f63
AS
2504 case BPF_FUNC_redirect:
2505 return &bpf_redirect_proto;
c46646d0
DB
2506 case BPF_FUNC_get_route_realm:
2507 return &bpf_get_route_realm_proto;
13c5c240
DB
2508 case BPF_FUNC_get_hash_recalc:
2509 return &bpf_get_hash_recalc_proto;
bd570ff9 2510 case BPF_FUNC_perf_event_output:
555c8a86 2511 return &bpf_skb_event_output_proto;
80b48c44
DB
2512 case BPF_FUNC_get_smp_processor_id:
2513 return &bpf_get_smp_processor_id_proto;
747ea55e
DB
2514 case BPF_FUNC_skb_under_cgroup:
2515 return &bpf_skb_under_cgroup_proto;
608cd71a
AS
2516 default:
2517 return sk_filter_func_proto(func_id);
2518 }
2519}
2520
6a773a15
BB
2521static const struct bpf_func_proto *
2522xdp_func_proto(enum bpf_func_id func_id)
2523{
4de16969
DB
2524 switch (func_id) {
2525 case BPF_FUNC_perf_event_output:
2526 return &bpf_xdp_event_output_proto;
2527 default:
2528 return sk_filter_func_proto(func_id);
2529 }
6a773a15
BB
2530}
2531
d691f9e8 2532static bool __is_valid_access(int off, int size, enum bpf_access_type type)
89aa0758 2533{
9bac3d6d
AS
2534 if (off < 0 || off >= sizeof(struct __sk_buff))
2535 return false;
4936e352 2536 /* The verifier guarantees that size > 0. */
9bac3d6d
AS
2537 if (off % size != 0)
2538 return false;
4936e352 2539 if (size != sizeof(__u32))
9bac3d6d
AS
2540 return false;
2541
2542 return true;
2543}
2544
d691f9e8 2545static bool sk_filter_is_valid_access(int off, int size,
19de99f7
AS
2546 enum bpf_access_type type,
2547 enum bpf_reg_type *reg_type)
d691f9e8 2548{
db58ba45
AS
2549 switch (off) {
2550 case offsetof(struct __sk_buff, tc_classid):
2551 case offsetof(struct __sk_buff, data):
2552 case offsetof(struct __sk_buff, data_end):
045efa82 2553 return false;
db58ba45 2554 }
045efa82 2555
d691f9e8
AS
2556 if (type == BPF_WRITE) {
2557 switch (off) {
2558 case offsetof(struct __sk_buff, cb[0]) ...
4936e352 2559 offsetof(struct __sk_buff, cb[4]):
d691f9e8
AS
2560 break;
2561 default:
2562 return false;
2563 }
2564 }
2565
2566 return __is_valid_access(off, size, type);
2567}
2568
2569static bool tc_cls_act_is_valid_access(int off, int size,
19de99f7
AS
2570 enum bpf_access_type type,
2571 enum bpf_reg_type *reg_type)
d691f9e8
AS
2572{
2573 if (type == BPF_WRITE) {
2574 switch (off) {
2575 case offsetof(struct __sk_buff, mark):
2576 case offsetof(struct __sk_buff, tc_index):
754f1e6a 2577 case offsetof(struct __sk_buff, priority):
d691f9e8 2578 case offsetof(struct __sk_buff, cb[0]) ...
09c37a2c
DB
2579 offsetof(struct __sk_buff, cb[4]):
2580 case offsetof(struct __sk_buff, tc_classid):
d691f9e8
AS
2581 break;
2582 default:
2583 return false;
2584 }
2585 }
19de99f7
AS
2586
2587 switch (off) {
2588 case offsetof(struct __sk_buff, data):
2589 *reg_type = PTR_TO_PACKET;
2590 break;
2591 case offsetof(struct __sk_buff, data_end):
2592 *reg_type = PTR_TO_PACKET_END;
2593 break;
2594 }
2595
d691f9e8
AS
2596 return __is_valid_access(off, size, type);
2597}
2598
6a773a15
BB
2599static bool __is_valid_xdp_access(int off, int size,
2600 enum bpf_access_type type)
2601{
2602 if (off < 0 || off >= sizeof(struct xdp_md))
2603 return false;
2604 if (off % size != 0)
2605 return false;
6088b582 2606 if (size != sizeof(__u32))
6a773a15
BB
2607 return false;
2608
2609 return true;
2610}
2611
2612static bool xdp_is_valid_access(int off, int size,
2613 enum bpf_access_type type,
2614 enum bpf_reg_type *reg_type)
2615{
2616 if (type == BPF_WRITE)
2617 return false;
2618
2619 switch (off) {
2620 case offsetof(struct xdp_md, data):
2621 *reg_type = PTR_TO_PACKET;
2622 break;
2623 case offsetof(struct xdp_md, data_end):
2624 *reg_type = PTR_TO_PACKET_END;
2625 break;
2626 }
2627
2628 return __is_valid_xdp_access(off, size, type);
2629}
2630
2631void bpf_warn_invalid_xdp_action(u32 act)
2632{
2633 WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act);
2634}
2635EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
2636
d691f9e8
AS
2637static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2638 int src_reg, int ctx_off,
ff936a04
AS
2639 struct bpf_insn *insn_buf,
2640 struct bpf_prog *prog)
9bac3d6d
AS
2641{
2642 struct bpf_insn *insn = insn_buf;
2643
2644 switch (ctx_off) {
2645 case offsetof(struct __sk_buff, len):
2646 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
2647
2648 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2649 offsetof(struct sk_buff, len));
2650 break;
2651
0b8c707d
DB
2652 case offsetof(struct __sk_buff, protocol):
2653 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
2654
2655 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
2656 offsetof(struct sk_buff, protocol));
2657 break;
2658
27cd5452
MS
2659 case offsetof(struct __sk_buff, vlan_proto):
2660 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
2661
2662 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
2663 offsetof(struct sk_buff, vlan_proto));
2664 break;
2665
bcad5718
DB
2666 case offsetof(struct __sk_buff, priority):
2667 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
2668
754f1e6a
DB
2669 if (type == BPF_WRITE)
2670 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
2671 offsetof(struct sk_buff, priority));
2672 else
2673 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2674 offsetof(struct sk_buff, priority));
bcad5718
DB
2675 break;
2676
37e82c2f
AS
2677 case offsetof(struct __sk_buff, ingress_ifindex):
2678 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
2679
2680 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2681 offsetof(struct sk_buff, skb_iif));
2682 break;
2683
2684 case offsetof(struct __sk_buff, ifindex):
2685 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
2686
f035a515 2687 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
37e82c2f
AS
2688 dst_reg, src_reg,
2689 offsetof(struct sk_buff, dev));
2690 *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
2691 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
2692 offsetof(struct net_device, ifindex));
2693 break;
2694
ba7591d8
DB
2695 case offsetof(struct __sk_buff, hash):
2696 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
2697
2698 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2699 offsetof(struct sk_buff, hash));
2700 break;
2701
9bac3d6d 2702 case offsetof(struct __sk_buff, mark):
d691f9e8
AS
2703 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
2704
2705 if (type == BPF_WRITE)
2706 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
2707 offsetof(struct sk_buff, mark));
2708 else
2709 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2710 offsetof(struct sk_buff, mark));
2711 break;
9bac3d6d
AS
2712
2713 case offsetof(struct __sk_buff, pkt_type):
2714 return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
2715
2716 case offsetof(struct __sk_buff, queue_mapping):
2717 return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
c2497395 2718
c2497395
AS
2719 case offsetof(struct __sk_buff, vlan_present):
2720 return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
2721 dst_reg, src_reg, insn);
2722
2723 case offsetof(struct __sk_buff, vlan_tci):
2724 return convert_skb_access(SKF_AD_VLAN_TAG,
2725 dst_reg, src_reg, insn);
d691f9e8
AS
2726
2727 case offsetof(struct __sk_buff, cb[0]) ...
6088b582 2728 offsetof(struct __sk_buff, cb[4]):
d691f9e8
AS
2729 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
2730
ff936a04 2731 prog->cb_access = 1;
d691f9e8
AS
2732 ctx_off -= offsetof(struct __sk_buff, cb[0]);
2733 ctx_off += offsetof(struct sk_buff, cb);
2734 ctx_off += offsetof(struct qdisc_skb_cb, data);
2735 if (type == BPF_WRITE)
2736 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
2737 else
2738 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
2739 break;
2740
045efa82
DB
2741 case offsetof(struct __sk_buff, tc_classid):
2742 ctx_off -= offsetof(struct __sk_buff, tc_classid);
2743 ctx_off += offsetof(struct sk_buff, cb);
2744 ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
09c37a2c
DB
2745 if (type == BPF_WRITE)
2746 *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
2747 else
2748 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
045efa82
DB
2749 break;
2750
db58ba45 2751 case offsetof(struct __sk_buff, data):
f035a515 2752 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
db58ba45
AS
2753 dst_reg, src_reg,
2754 offsetof(struct sk_buff, data));
2755 break;
2756
2757 case offsetof(struct __sk_buff, data_end):
2758 ctx_off -= offsetof(struct __sk_buff, data_end);
2759 ctx_off += offsetof(struct sk_buff, cb);
2760 ctx_off += offsetof(struct bpf_skb_data_end, data_end);
f035a515
DB
2761 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
2762 ctx_off);
db58ba45
AS
2763 break;
2764
d691f9e8
AS
2765 case offsetof(struct __sk_buff, tc_index):
2766#ifdef CONFIG_NET_SCHED
2767 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
2768
2769 if (type == BPF_WRITE)
2770 *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
2771 offsetof(struct sk_buff, tc_index));
2772 else
2773 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
2774 offsetof(struct sk_buff, tc_index));
2775 break;
2776#else
2777 if (type == BPF_WRITE)
2778 *insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
2779 else
2780 *insn++ = BPF_MOV64_IMM(dst_reg, 0);
2781 break;
2782#endif
9bac3d6d
AS
2783 }
2784
2785 return insn - insn_buf;
89aa0758
AS
2786}
2787
6a773a15
BB
2788static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2789 int src_reg, int ctx_off,
2790 struct bpf_insn *insn_buf,
2791 struct bpf_prog *prog)
2792{
2793 struct bpf_insn *insn = insn_buf;
2794
2795 switch (ctx_off) {
2796 case offsetof(struct xdp_md, data):
f035a515 2797 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
6a773a15
BB
2798 dst_reg, src_reg,
2799 offsetof(struct xdp_buff, data));
2800 break;
2801 case offsetof(struct xdp_md, data_end):
f035a515 2802 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
6a773a15
BB
2803 dst_reg, src_reg,
2804 offsetof(struct xdp_buff, data_end));
2805 break;
2806 }
2807
2808 return insn - insn_buf;
2809}
2810
d4052c4a 2811static const struct bpf_verifier_ops sk_filter_ops = {
4936e352
DB
2812 .get_func_proto = sk_filter_func_proto,
2813 .is_valid_access = sk_filter_is_valid_access,
2814 .convert_ctx_access = bpf_net_convert_ctx_access,
89aa0758
AS
2815};
2816
608cd71a 2817static const struct bpf_verifier_ops tc_cls_act_ops = {
4936e352
DB
2818 .get_func_proto = tc_cls_act_func_proto,
2819 .is_valid_access = tc_cls_act_is_valid_access,
2820 .convert_ctx_access = bpf_net_convert_ctx_access,
608cd71a
AS
2821};
2822
6a773a15
BB
2823static const struct bpf_verifier_ops xdp_ops = {
2824 .get_func_proto = xdp_func_proto,
2825 .is_valid_access = xdp_is_valid_access,
2826 .convert_ctx_access = xdp_convert_ctx_access,
2827};
2828
d4052c4a 2829static struct bpf_prog_type_list sk_filter_type __read_mostly = {
4936e352
DB
2830 .ops = &sk_filter_ops,
2831 .type = BPF_PROG_TYPE_SOCKET_FILTER,
89aa0758
AS
2832};
2833
96be4325 2834static struct bpf_prog_type_list sched_cls_type __read_mostly = {
4936e352
DB
2835 .ops = &tc_cls_act_ops,
2836 .type = BPF_PROG_TYPE_SCHED_CLS,
96be4325
DB
2837};
2838
94caee8c 2839static struct bpf_prog_type_list sched_act_type __read_mostly = {
4936e352
DB
2840 .ops = &tc_cls_act_ops,
2841 .type = BPF_PROG_TYPE_SCHED_ACT,
94caee8c
DB
2842};
2843
6a773a15
BB
2844static struct bpf_prog_type_list xdp_type __read_mostly = {
2845 .ops = &xdp_ops,
2846 .type = BPF_PROG_TYPE_XDP,
2847};
2848
d4052c4a 2849static int __init register_sk_filter_ops(void)
89aa0758 2850{
d4052c4a 2851 bpf_register_prog_type(&sk_filter_type);
96be4325 2852 bpf_register_prog_type(&sched_cls_type);
94caee8c 2853 bpf_register_prog_type(&sched_act_type);
6a773a15 2854 bpf_register_prog_type(&xdp_type);
96be4325 2855
89aa0758
AS
2856 return 0;
2857}
d4052c4a
DB
2858late_initcall(register_sk_filter_ops);
2859
8ced425e 2860int sk_detach_filter(struct sock *sk)
55b33325
PE
2861{
2862 int ret = -ENOENT;
2863 struct sk_filter *filter;
2864
d59577b6
VB
2865 if (sock_flag(sk, SOCK_FILTER_LOCKED))
2866 return -EPERM;
2867
8ced425e
HFS
2868 filter = rcu_dereference_protected(sk->sk_filter,
2869 lockdep_sock_is_held(sk));
55b33325 2870 if (filter) {
a9b3cd7f 2871 RCU_INIT_POINTER(sk->sk_filter, NULL);
46bcf14f 2872 sk_filter_uncharge(sk, filter);
55b33325
PE
2873 ret = 0;
2874 }
a3ea269b 2875
55b33325
PE
2876 return ret;
2877}
8ced425e 2878EXPORT_SYMBOL_GPL(sk_detach_filter);
a8fc9277 2879
a3ea269b
DB
2880int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
2881 unsigned int len)
a8fc9277 2882{
a3ea269b 2883 struct sock_fprog_kern *fprog;
a8fc9277 2884 struct sk_filter *filter;
a3ea269b 2885 int ret = 0;
a8fc9277
PE
2886
2887 lock_sock(sk);
2888 filter = rcu_dereference_protected(sk->sk_filter,
8ced425e 2889 lockdep_sock_is_held(sk));
a8fc9277
PE
2890 if (!filter)
2891 goto out;
a3ea269b
DB
2892
2893 /* We're copying the filter that has been originally attached,
93d08b69
DB
2894 * so no conversion/decode needed anymore. eBPF programs that
2895 * have no original program cannot be dumped through this.
a3ea269b 2896 */
93d08b69 2897 ret = -EACCES;
7ae457c1 2898 fprog = filter->prog->orig_prog;
93d08b69
DB
2899 if (!fprog)
2900 goto out;
a3ea269b
DB
2901
2902 ret = fprog->len;
a8fc9277 2903 if (!len)
a3ea269b 2904 /* User space only enquires number of filter blocks. */
a8fc9277 2905 goto out;
a3ea269b 2906
a8fc9277 2907 ret = -EINVAL;
a3ea269b 2908 if (len < fprog->len)
a8fc9277
PE
2909 goto out;
2910
2911 ret = -EFAULT;
009937e7 2912 if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
a3ea269b 2913 goto out;
a8fc9277 2914
a3ea269b
DB
2915 /* Instead of bytes, the API requests to return the number
2916 * of filter blocks.
2917 */
2918 ret = fprog->len;
a8fc9277
PE
2919out:
2920 release_sock(sk);
2921 return ret;
2922}