/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *	Jay Schulist <jschlst@samba.org>
 *	Alexei Starovoitov <ast@plumgrid.com>
 *	Daniel Borkmann <dborkman@redhat.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/sock_diag.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <asm/cmpxchg.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <linux/bpf_trace.h>
/**
 *	sk_filter_trim_cap - run a packet through a socket filter
 *	@sk: sock associated with &sk_buff
 *	@skb: buffer to filter
 *	@cap: limit on how short the eBPF program may trim the packet
 *
 * Run the eBPF program and then cut skb->data to correct size returned by
 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
 * than pkt_len we keep whole skb->data. This is the socket level
 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
 * be accepted or -EPERM if the packet should be tossed.
 */
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
{
	int err;
	struct sk_filter *filter;

	/*
	 * If the skb was allocated from pfmemalloc reserves, only
	 * allow SOCK_MEMALLOC sockets to use it as this socket is
	 * helping free memory.
	 */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
		return -ENOMEM;
	}
	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
	if (err)
		return err;

	err = security_sock_rcv_skb(sk, skb);
	if (err)
		return err;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		struct sock *save_sk = skb->sk;
		unsigned int pkt_len;

		skb->sk = sk;
		pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
		skb->sk = save_sk;
		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
	}
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(sk_filter_trim_cap);
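/* For reference, the common sk_filter() entry point is a thin wrapper
 * around sk_filter_trim_cap() used when no extra trim floor is needed;
 * a minimal sketch (the actual definition lives in include/linux/filter.h):
 *
 *	static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 *	{
 *		return sk_filter_trim_cap(sk, skb, 1);
 *	}
 */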
BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
{
	return skb_get_poff(skb);
}

BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = (struct nlattr *) &skb->data[a];
	if (nla->nla_len > skb->len - a)
		return 0;

	nla = nla_find_nested(nla, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

BPF_CALL_0(__get_raw_cpu_id)
{
	return raw_smp_processor_id();
}

static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
	.func		= __get_raw_cpu_id,
	.ret_type	= RET_INTEGER,
};
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
			      struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	switch (skb_field) {
	case SKF_AD_MARK:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, mark));
		break;

	case SKF_AD_PKTTYPE:
		*insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
#endif
		break;

	case SKF_AD_QUEUE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, queue_mapping));
		break;

	case SKF_AD_VLAN_TAG:
	case SKF_AD_VLAN_TAG_PRESENT:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);

		/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_tci));
		if (skb_field == SKF_AD_VLAN_TAG) {
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
						~VLAN_TAG_PRESENT);
		} else {
			/* dst_reg >>= 12 */
			*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
			/* dst_reg &= 1 */
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
		}
		break;
	}

	return insn - insn_buf;
}
static bool convert_bpf_extensions(struct sock_filter *fp,
				   struct bpf_insn **insnp)
{
	struct bpf_insn *insn = *insnp;
	u32 cnt;

	switch (fp->k) {
	case SKF_AD_OFF + SKF_AD_PROTOCOL:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		/* A = *(u16 *) (CTX + offsetof(protocol)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, protocol));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PKTTYPE:
		cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_IFINDEX:
	case SKF_AD_OFF + SKF_AD_HATYPE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      BPF_REG_TMP, BPF_REG_CTX,
				      offsetof(struct sk_buff, dev));
		/* if (tmp != 0) goto pc + 1 */
		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
		*insn++ = BPF_EXIT_INSN();
		if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, ifindex));
		else
			*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, type));
		break;

	case SKF_AD_OFF + SKF_AD_MARK:
		cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_RXHASH:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);

		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
				    offsetof(struct sk_buff, hash));
		break;

	case SKF_AD_OFF + SKF_AD_QUEUE:
		cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TPID:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		/* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, vlan_proto));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
	case SKF_AD_OFF + SKF_AD_NLATTR:
	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
	case SKF_AD_OFF + SKF_AD_CPU:
	case SKF_AD_OFF + SKF_AD_RANDOM:
		/* arg1 = CTX */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
		/* arg2 = A */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
		/* arg3 = X */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
		/* Emit call(arg1=CTX, arg2=A, arg3=X) */
		switch (fp->k) {
		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
			*insn = BPF_EMIT_CALL(__skb_get_pay_offset);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
			break;
		case SKF_AD_OFF + SKF_AD_CPU:
			*insn = BPF_EMIT_CALL(__get_raw_cpu_id);
			break;
		case SKF_AD_OFF + SKF_AD_RANDOM:
			*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
			bpf_user_rnd_init_once();
			break;
		}
		break;

	case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
		/* A ^= X */
		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
		break;

	default:
		/* This is just a dummy call to avoid letting the compiler
		 * evict __bpf_call_base() as an optimization. Placed here
		 * where no-one bothers.
		 */
		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
		return false;
	}

	*insnp = insn;
	return true;
}
/**
 *	bpf_convert_filter - convert filter program
 *	@prog: the user passed filter program
 *	@len: the length of the user passed filter program
 *	@new_prog: allocated 'struct bpf_prog' or NULL
 *	@new_len: pointer to store length of converted program
 *
 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
 * style extended BPF (eBPF).
 * Conversion workflow:
 *
 * 1) First pass for calculating the new program length:
 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len)
 *
 * 2) 2nd pass to remap in two passes: 1st pass finds new
 *    jump offsets, 2nd pass remapping:
 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
			      struct bpf_prog *new_prog, int *new_len)
{
	int new_flen = 0, pass = 0, target, i, stack_off;
	struct bpf_insn *new_insn, *first_insn = NULL;
	struct sock_filter *fp;
	int *addrs = NULL;
	u8 bpf_src;

	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

	if (len <= 0 || len > BPF_MAXINSNS)
		return -EINVAL;

	if (new_prog) {
		first_insn = new_prog->insnsi;
		addrs = kcalloc(len, sizeof(*addrs),
				GFP_KERNEL | __GFP_NOWARN);
		if (!addrs)
			return -ENOMEM;
	}

do_pass:
	new_insn = first_insn;
	fp = prog;

	/* Classic BPF related prologue emission. */
	if (new_prog) {
		/* Classic BPF expects A and X to be reset first. These need
		 * to be guaranteed to be the first two instructions.
		 */
		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);

		/* All programs must keep CTX in callee saved BPF_REG_CTX.
		 * In eBPF case it's done by the compiler, here we need to
		 * do this ourself. Initial CTX is present in BPF_REG_ARG1.
		 */
		*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
	} else {
		new_insn += 3;
	}

	for (i = 0; i < len; fp++, i++) {
		struct bpf_insn tmp_insns[6] = { };
		struct bpf_insn *insn = tmp_insns;

		if (addrs)
			addrs[i] = new_insn - first_insn;

		switch (fp->code) {
		/* All arithmetic insns and skb loads map as-is. */
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_X:
		case BPF_ALU | BPF_MOD | BPF_K:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_ABS | BPF_W:
		case BPF_LD | BPF_ABS | BPF_H:
		case BPF_LD | BPF_ABS | BPF_B:
		case BPF_LD | BPF_IND | BPF_W:
		case BPF_LD | BPF_IND | BPF_H:
		case BPF_LD | BPF_IND | BPF_B:
			/* Check for overloaded BPF extension and
			 * directly convert it if found, otherwise
			 * just move on with mapping.
			 */
			if (BPF_CLASS(fp->code) == BPF_LD &&
			    BPF_MODE(fp->code) == BPF_ABS &&
			    convert_bpf_extensions(fp, &insn))
				break;

			if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
			    fp->code == (BPF_ALU | BPF_MOD | BPF_X))
				*insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);

			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
			break;

		/* Jump transformation cannot use BPF block macros
		 * everywhere as offset calculation and target updates
		 * require a bit more work than the rest, i.e. jump
		 * opcodes map as-is, but offsets need adjustment.
		 */

#define BPF_EMIT_JMP							\
	do {								\
		if (target >= len || target < 0)			\
			goto err;					\
		insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;	\
		/* Adjust pc relative offset for 2nd or 3rd insn. */	\
		insn->off -= insn - tmp_insns;				\
	} while (0)

		case BPF_JMP | BPF_JA:
			target = i + fp->k + 1;
			insn->code = fp->code;
			BPF_EMIT_JMP;
			break;

		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
			if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
				/* BPF immediates are signed, zero extend
				 * immediate into tmp register and use it
				 * in compare insn.
				 */
				*insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);

				insn->dst_reg = BPF_REG_A;
				insn->src_reg = BPF_REG_TMP;
				bpf_src = BPF_X;
			} else {
				insn->dst_reg = BPF_REG_A;
				insn->imm = fp->k;
				bpf_src = BPF_SRC(fp->code);
				insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
			}

			/* Common case where 'jump_false' is next insn. */
			if (fp->jf == 0) {
				insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
				target = i + fp->jt + 1;
				BPF_EMIT_JMP;
				break;
			}

			/* Convert some jumps when 'jump_true' is next insn. */
			if (fp->jt == 0) {
				switch (BPF_OP(fp->code)) {
				case BPF_JEQ:
					insn->code = BPF_JMP | BPF_JNE | bpf_src;
					break;
				case BPF_JGT:
					insn->code = BPF_JMP | BPF_JLE | bpf_src;
					break;
				case BPF_JGE:
					insn->code = BPF_JMP | BPF_JLT | bpf_src;
					break;
				default:
					goto jmp_rest;
				}

				target = i + fp->jf + 1;
				BPF_EMIT_JMP;
				break;
			}
jmp_rest:
			/* Other jumps are mapped into two insns: Jxx and JA. */
			target = i + fp->jt + 1;
			insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
			BPF_EMIT_JMP;
			insn++;

			insn->code = BPF_JMP | BPF_JA;
			target = i + fp->jf + 1;
			BPF_EMIT_JMP;
			break;

		/* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
		case BPF_LDX | BPF_MSH | BPF_B:
			/* tmp = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
			/* A = BPF_R0 = *(u8 *) (skb->data + K) */
			*insn++ = BPF_LD_ABS(BPF_B, fp->k);
			/* A &= 0xf */
			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
			/* A <<= 2 */
			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
			/* X = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			/* A = tmp */
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
			break;

		/* RET_K is remaped into 2 insns. RET_A case doesn't need an
		 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
		 */
		case BPF_RET | BPF_A:
		case BPF_RET | BPF_K:
			if (BPF_RVAL(fp->code) == BPF_K)
				*insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
							0, fp->k);
			*insn = BPF_EXIT_INSN();
			break;

		/* Store to stack. */
		case BPF_ST:
		case BPF_STX:
			stack_off = fp->k * 4 + 4;
			*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
					    BPF_ST ? BPF_REG_A : BPF_REG_X,
					    -stack_off);
			/* check_load_and_stores() verifies that classic BPF can
			 * load from stack only after write, so tracking
			 * stack_depth for ST|STX insns is enough
			 */
			if (new_prog && new_prog->aux->stack_depth < stack_off)
				new_prog->aux->stack_depth = stack_off;
			break;

		/* Load from stack. */
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			stack_off = fp->k * 4 + 4;
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_FP,
					    -stack_off);
			break;

		/* A = K or X = K */
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
			*insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
					      BPF_REG_A : BPF_REG_X, fp->k);
			break;

		/* X = A */
		case BPF_MISC | BPF_TAX:
			*insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			break;

		/* A = X */
		case BPF_MISC | BPF_TXA:
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
			break;

		/* A = skb->len or X = skb->len */
		case BPF_LD | BPF_W | BPF_LEN:
		case BPF_LDX | BPF_W | BPF_LEN:
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
					    offsetof(struct sk_buff, len));
			break;

		/* Access seccomp_data fields. */
		case BPF_LDX | BPF_ABS | BPF_W:
			/* A = *(u32 *) (ctx + K) */
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
			break;

		/* Unknown instruction. */
		default:
			goto err;
		}

		insn++;
		if (new_prog)
			memcpy(new_insn, tmp_insns,
			       sizeof(*insn) * (insn - tmp_insns));
		new_insn += insn - tmp_insns;
	}

	if (!new_prog) {
		/* Only calculating new length. */
		*new_len = new_insn - first_insn;
		return 0;
	}

	pass++;
	if (new_flen != new_insn - first_insn) {
		new_flen = new_insn - first_insn;
		if (pass > 2)
			goto err;
		goto do_pass;
	}

	kfree(addrs);
	BUG_ON(*new_len != new_flen);
	return 0;
err:
	kfree(addrs);
	return -EINVAL;
}
/* As we dont want to clear mem[] array for each packet going through
 * __bpf_prog_run(), we check that filter loaded by user never try to read
 * a cell if not previously written, and we check all branches to be sure
 * a malicious user doesn't try to abuse us.
 */
static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
	u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
	int pc, ret = 0;

	BUILD_BUG_ON(BPF_MEMWORDS > 16);

	masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return -ENOMEM;

	memset(masks, 0xff, flen * sizeof(*masks));

	for (pc = 0; pc < flen; pc++) {
		memvalid &= masks[pc];

		switch (filter[pc].code) {
		case BPF_ST:
		case BPF_STX:
			memvalid |= (1 << filter[pc].k);
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			if (!(memvalid & (1 << filter[pc].k))) {
				ret = -EINVAL;
				goto error;
			}
			break;
		case BPF_JMP | BPF_JA:
			/* A jump must set masks on target */
			masks[pc + 1 + filter[pc].k] &= memvalid;
			memvalid = ~0;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* A jump must set masks on targets */
			masks[pc + 1 + filter[pc].jt] &= memvalid;
			masks[pc + 1 + filter[pc].jf] &= memvalid;
			memvalid = ~0;
			break;
		}
	}
error:
	kfree(masks);
	return ret;
}
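/* Example of a classic BPF snippet that check_load_and_stores() rejects,
 * because M[0] is read before any store to it (sketch, using the uapi
 * BPF_STMT() macro from <linux/filter.h>):
 *
 *	struct sock_filter bad[] = {
 *		BPF_STMT(BPF_LD | BPF_MEM, 0),	// A = M[0], never written
 *		BPF_STMT(BPF_RET | BPF_A, 0),
 *	};
 */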
static bool chk_code_allowed(u16 code_to_probe)
{
	static const bool codes[] = {
		/* 32 bit ALU operations */
		[BPF_ALU | BPF_ADD | BPF_K] = true,
		[BPF_ALU | BPF_ADD | BPF_X] = true,
		[BPF_ALU | BPF_SUB | BPF_K] = true,
		[BPF_ALU | BPF_SUB | BPF_X] = true,
		[BPF_ALU | BPF_MUL | BPF_K] = true,
		[BPF_ALU | BPF_MUL | BPF_X] = true,
		[BPF_ALU | BPF_DIV | BPF_K] = true,
		[BPF_ALU | BPF_DIV | BPF_X] = true,
		[BPF_ALU | BPF_MOD | BPF_K] = true,
		[BPF_ALU | BPF_MOD | BPF_X] = true,
		[BPF_ALU | BPF_AND | BPF_K] = true,
		[BPF_ALU | BPF_AND | BPF_X] = true,
		[BPF_ALU | BPF_OR | BPF_K] = true,
		[BPF_ALU | BPF_OR | BPF_X] = true,
		[BPF_ALU | BPF_XOR | BPF_K] = true,
		[BPF_ALU | BPF_XOR | BPF_X] = true,
		[BPF_ALU | BPF_LSH | BPF_K] = true,
		[BPF_ALU | BPF_LSH | BPF_X] = true,
		[BPF_ALU | BPF_RSH | BPF_K] = true,
		[BPF_ALU | BPF_RSH | BPF_X] = true,
		[BPF_ALU | BPF_NEG] = true,
		/* Load instructions */
		[BPF_LD | BPF_W | BPF_ABS] = true,
		[BPF_LD | BPF_H | BPF_ABS] = true,
		[BPF_LD | BPF_B | BPF_ABS] = true,
		[BPF_LD | BPF_W | BPF_LEN] = true,
		[BPF_LD | BPF_W | BPF_IND] = true,
		[BPF_LD | BPF_H | BPF_IND] = true,
		[BPF_LD | BPF_B | BPF_IND] = true,
		[BPF_LD | BPF_IMM] = true,
		[BPF_LD | BPF_MEM] = true,
		[BPF_LDX | BPF_W | BPF_LEN] = true,
		[BPF_LDX | BPF_B | BPF_MSH] = true,
		[BPF_LDX | BPF_IMM] = true,
		[BPF_LDX | BPF_MEM] = true,
		/* Store instructions */
		[BPF_ST] = true,
		[BPF_STX] = true,
		/* Misc instructions */
		[BPF_MISC | BPF_TAX] = true,
		[BPF_MISC | BPF_TXA] = true,
		/* Return instructions */
		[BPF_RET | BPF_K] = true,
		[BPF_RET | BPF_A] = true,
		/* Jump instructions */
		[BPF_JMP | BPF_JA] = true,
		[BPF_JMP | BPF_JEQ | BPF_K] = true,
		[BPF_JMP | BPF_JEQ | BPF_X] = true,
		[BPF_JMP | BPF_JGE | BPF_K] = true,
		[BPF_JMP | BPF_JGE | BPF_X] = true,
		[BPF_JMP | BPF_JGT | BPF_K] = true,
		[BPF_JMP | BPF_JGT | BPF_X] = true,
		[BPF_JMP | BPF_JSET | BPF_K] = true,
		[BPF_JMP | BPF_JSET | BPF_X] = true,
	};

	if (code_to_probe >= ARRAY_SIZE(codes))
		return false;

	return codes[code_to_probe];
}
static bool bpf_check_basics_ok(const struct sock_filter *filter,
				unsigned int flen)
{
	if (filter == NULL)
		return false;
	if (flen == 0 || flen > BPF_MAXINSNS)
		return false;

	return true;
}
/**
 *	bpf_check_classic - verify socket filter code
 *	@filter: filter to verify
 *	@flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through kaboom! The filter must contain
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
 *
 * All jumps are forward as they are not signed.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int bpf_check_classic(const struct sock_filter *filter,
			     unsigned int flen)
{
	bool anc_found;
	int pc;

	/* Check the filter code now */
	for (pc = 0; pc < flen; pc++) {
		const struct sock_filter *ftest = &filter[pc];

		/* May we actually operate on this code? */
		if (!chk_code_allowed(ftest->code))
			return -EINVAL;

		/* Some instructions need special checks */
		switch (ftest->code) {
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_K:
			/* Check for division by zero */
			if (ftest->k == 0)
				return -EINVAL;
			break;
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_K:
			if (ftest->k >= 32)
				return -EINVAL;
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
			/* Check for invalid memory addresses */
			if (ftest->k >= BPF_MEMWORDS)
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JA:
			/* Note, the large ftest->k might cause loops.
			 * Compare this with conditional jumps below,
			 * where offsets are limited. --ANK (981016)
			 */
			if (ftest->k >= (unsigned int)(flen - pc - 1))
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* Both conditionals must be safe */
			if (pc + ftest->jt + 1 >= flen ||
			    pc + ftest->jf + 1 >= flen)
				return -EINVAL;
			break;
		case BPF_LD | BPF_W | BPF_ABS:
		case BPF_LD | BPF_H | BPF_ABS:
		case BPF_LD | BPF_B | BPF_ABS:
			anc_found = false;
			if (bpf_anc_helper(ftest) & BPF_ANC)
				anc_found = true;
			/* Ancillary operation unknown or unsupported */
			if (anc_found == false && ftest->k >= SKF_AD_OFF)
				return -EINVAL;
		}
	}

	/* Last instruction must be a RET code */
	switch (filter[flen - 1].code) {
	case BPF_RET | BPF_K:
	case BPF_RET | BPF_A:
		return check_load_and_stores(filter, flen);
	}

	return -EINVAL;
}
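/* A minimal filter that passes bpf_check_classic(): every program must
 * end in a RET instruction, e.g. (sketch):
 *
 *	struct sock_filter accept_all[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),	// accept whole packet
 *	};
 */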
static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
				      const struct sock_fprog *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct sock_fprog_kern *fkprog;

	fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
	if (!fp->orig_prog)
		return -ENOMEM;

	fkprog = fp->orig_prog;
	fkprog->len = fprog->len;

	fkprog->filter = kmemdup(fp->insns, fsize,
				 GFP_KERNEL | __GFP_NOWARN);
	if (!fkprog->filter) {
		kfree(fp->orig_prog);
		return -ENOMEM;
	}

	return 0;
}

static void bpf_release_orig_filter(struct bpf_prog *fp)
{
	struct sock_fprog_kern *fprog = fp->orig_prog;

	if (fprog) {
		kfree(fprog->filter);
		kfree(fprog);
	}
}

static void __bpf_prog_release(struct bpf_prog *prog)
{
	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(prog);
	} else {
		bpf_release_orig_filter(prog);
		bpf_prog_free(prog);
	}
}

static void __sk_filter_release(struct sk_filter *fp)
{
	__bpf_prog_release(fp->prog);
	kfree(fp);
}

/**
 *	sk_filter_release_rcu - Release a socket filter by rcu_head
 *	@rcu: rcu_head that contains the sk_filter to free
 */
static void sk_filter_release_rcu(struct rcu_head *rcu)
{
	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);

	__sk_filter_release(fp);
}

/**
 *	sk_filter_release - release a socket filter
 *	@fp: filter to remove
 *
 *	Remove a filter from a socket and release its resources.
 */
static void sk_filter_release(struct sk_filter *fp)
{
	if (refcount_dec_and_test(&fp->refcnt))
		call_rcu(&fp->rcu, sk_filter_release_rcu);
}

void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	atomic_sub(filter_size, &sk->sk_omem_alloc);
	sk_filter_release(fp);
}

/* try to charge the socket memory if there is space available
 * return true on success
 */
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	/* same check as in sock_kmalloc() */
	if (filter_size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
		atomic_add(filter_size, &sk->sk_omem_alloc);
		return true;
	}
	return false;
}

bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
	if (!refcount_inc_not_zero(&fp->refcnt))
		return false;

	if (!__sk_filter_charge(sk, fp)) {
		sk_filter_release(fp);
		return false;
	}
	return true;
}
static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
{
	struct sock_filter *old_prog;
	struct bpf_prog *old_fp;
	int err, new_len, old_len = fp->len;

	/* We are free to overwrite insns et al right here as it
	 * won't be used at this point in time anymore internally
	 * after the migration to the internal BPF instruction
	 * representation.
	 */
	BUILD_BUG_ON(sizeof(struct sock_filter) !=
		     sizeof(struct bpf_insn));

	/* Conversion cannot happen on overlapping memory areas,
	 * so we need to keep the user BPF around until the 2nd
	 * pass. At this time, the user BPF is stored in fp->insns.
	 */
	old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
			   GFP_KERNEL | __GFP_NOWARN);
	if (!old_prog) {
		err = -ENOMEM;
		goto out_err;
	}

	/* 1st pass: calculate the new program length. */
	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
	if (err)
		goto out_err_free;

	/* Expand fp for appending the new filter representation. */
	old_fp = fp;
	fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
	if (!fp) {
		/* The old_fp is still around in case we couldn't
		 * allocate new memory, so uncharge on that one.
		 */
		fp = old_fp;
		err = -ENOMEM;
		goto out_err_free;
	}

	fp->len = new_len;

	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
	err = bpf_convert_filter(old_prog, old_len, fp, &new_len);
	if (err)
		/* 2nd bpf_convert_filter() can fail only if it fails
		 * to allocate memory, remapping must succeed. Note,
		 * that at this time old_fp has already been released
		 * by krealloc().
		 */
		goto out_err_free;

	fp = bpf_prog_select_runtime(fp, &err);
	if (err)
		goto out_err_free;

	kfree(old_prog);
	return fp;

out_err_free:
	kfree(old_prog);
out_err:
	__bpf_prog_release(fp);
	return ERR_PTR(err);
}
static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
					   bpf_aux_classic_check_t trans)
{
	int err;

	fp->bpf_func = NULL;
	fp->jited = 0;

	err = bpf_check_classic(fp->insns, fp->len);
	if (err) {
		__bpf_prog_release(fp);
		return ERR_PTR(err);
	}

	/* There might be additional checks and transformations
	 * needed on classic filters, f.e. in case of seccomp.
	 */
	if (trans) {
		err = trans(fp->insns, fp->len);
		if (err) {
			__bpf_prog_release(fp);
			return ERR_PTR(err);
		}
	}

	/* Probe if we can JIT compile the filter and if so, do
	 * the compilation of the filter.
	 */
	bpf_jit_compile(fp);

	/* JIT compiler couldn't process this filter, so do the
	 * internal BPF translation for the optimized interpreter.
	 */
	if (!fp->jited)
		fp = bpf_migrate_filter(fp);

	return fp;
}
/**
 *	bpf_prog_create - create an unattached filter
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *
 * Create a filter independent of any socket. We first run some
 * sanity checks on it to make sure it does not explode on us later.
 * If an error occurs or there is insufficient memory for the filter
 * a negative errno code is returned. On success the return is zero.
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	memcpy(fp->insns, fprog->filter, fsize);

	fp->len = fprog->len;
	/* Since unattached filters are not copied back to user
	 * space through sk_get_filter(), we do not need to hold
	 * a copy here, and can spare us the work.
	 */
	fp->orig_prog = NULL;

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, NULL);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create);
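/* Sketch of in-kernel usage of bpf_prog_create() with a kernel-resident
 * classic program (hypothetical caller, error handling omitted):
 *
 *	static struct sock_filter insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffff),
 *	};
 *	struct sock_fprog_kern fprog = {
 *		.len	= ARRAY_SIZE(insns),
 *		.filter	= insns,
 *	};
 *	struct bpf_prog *prog;
 *
 *	err = bpf_prog_create(&prog, &fprog);
 *	...
 *	bpf_prog_destroy(prog);
 */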
/**
 *	bpf_prog_create_from_user - create an unattached filter from user buffer
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *	@trans: post-classic verifier transformation handler
 *	@save_orig: save classic BPF program
 *
 * This function effectively does the same as bpf_prog_create(), only
 * that it builds up its insns buffer from user space provided buffer.
 * It also allows for passing a bpf_aux_classic_check_t handler.
 */
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
			      bpf_aux_classic_check_t trans, bool save_orig)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;
	int err;

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
		__bpf_prog_free(fp);
		return -EFAULT;
	}

	fp->len = fprog->len;
	fp->orig_prog = NULL;

	if (save_orig) {
		err = bpf_prog_store_orig_filter(fp, fprog);
		if (err) {
			__bpf_prog_free(fp);
			return -ENOMEM;
		}
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, trans);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
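/* kernel/seccomp.c is the main user of the @trans hook: it passes its
 * seccomp_check_filter() callback so classic instructions can be rewritten
 * to seccomp_data accesses after bpf_check_classic() has run, roughly:
 *
 *	bpf_prog_create_from_user(&sfilter->prog, &fprog,
 *				  seccomp_check_filter, save_orig);
 */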
void bpf_prog_destroy(struct bpf_prog *fp)
{
	__bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct sk_filter *fp, *old_fp;

	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->prog = prog;

	if (!__sk_filter_charge(sk, fp)) {
		kfree(fp);
		return -ENOMEM;
	}
	refcount_set(&fp->refcnt, 1);

	old_fp = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	rcu_assign_pointer(sk->sk_filter, fp);

	if (old_fp)
		sk_filter_uncharge(sk, old_fp);

	return 0;
}

static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct bpf_prog *old_prog;
	int err;

	if (bpf_prog_size(prog->len) > sysctl_optmem_max)
		return -ENOMEM;

	if (sk_unhashed(sk) && sk->sk_reuseport) {
		err = reuseport_alloc(sk);
		if (err)
			return err;
	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
		/* The socket wasn't bound with SO_REUSEPORT */
		return -EINVAL;
	}

	old_prog = reuseport_attach_prog(sk, prog);
	if (old_prog)
		bpf_prog_destroy(old_prog);

	return 0;
}

static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *prog;
	int err;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return ERR_PTR(-EINVAL);

	prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!prog)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
		__bpf_prog_free(prog);
		return ERR_PTR(-EFAULT);
	}

	prog->len = fprog->len;

	err = bpf_prog_store_orig_filter(prog, fprog);
	if (err) {
		__bpf_prog_free(prog);
		return ERR_PTR(-ENOMEM);
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	return bpf_prepare_filter(prog, NULL);
}
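/* The attach entry points below are reached from setsockopt(); a user
 * space sketch for the classic case handled by sk_attach_filter():
 *
 *	struct sock_filter insns[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },
 *	};
 *	struct sock_fprog fprog = { .len = 1, .filter = insns };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 *
 * SO_ATTACH_BPF takes an eBPF program fd and lands in sk_attach_bpf();
 * the SO_ATTACH_REUSEPORT_* variants map to the reuseport helpers.
 */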
/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);

int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __reuseport_attach_prog(prog, sk);
	if (err < 0) {
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}

static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
}

int sk_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}

int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __reuseport_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}
struct bpf_scratchpad {
	union {
		__be32 diff[MAX_BPF_STACK / sizeof(__be32)];
		u8     buff[MAX_BPF_STACK];
	};
};

static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);

static inline int __bpf_try_make_writable(struct sk_buff *skb,
					  unsigned int write_len)
{
	return skb_ensure_writable(skb, write_len);
}

static inline int bpf_try_make_writable(struct sk_buff *skb,
					unsigned int write_len)
{
	int err = __bpf_try_make_writable(skb, write_len);

	bpf_compute_data_pointers(skb);
	return err;
}

static int bpf_try_make_head_writable(struct sk_buff *skb)
{
	return bpf_try_make_writable(skb, skb_headlen(skb));
}

static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
	if (skb_at_tc_ingress(skb))
		skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
{
	if (skb_at_tc_ingress(skb))
		skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}
BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
	   const void *, from, u32, len, u64, flags)
{
	void *ptr;

	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
		return -EINVAL;
	if (unlikely(offset > 0xffff))
		return -EFAULT;
	if (unlikely(bpf_try_make_writable(skb, offset + len)))
		return -EFAULT;

	ptr = skb->data + offset;
	if (flags & BPF_F_RECOMPUTE_CSUM)
		__skb_postpull_rcsum(skb, ptr, len, offset);

	memcpy(ptr, from, len);

	if (flags & BPF_F_RECOMPUTE_CSUM)
		__skb_postpush_rcsum(skb, ptr, len, offset);
	if (flags & BPF_F_INVALIDATE_HASH)
		skb_clear_hash(skb);

	return 0;
}

static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
	.func		= bpf_skb_store_bytes,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_MEM,
	.arg4_type	= ARG_CONST_SIZE,
	.arg5_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
	   void *, to, u32, len)
{
	void *ptr;

	if (unlikely(offset > 0xffff))
		goto err_clear;

	ptr = skb_header_pointer(skb, offset, len, to);
	if (unlikely(!ptr))
		goto err_clear;
	if (ptr != to)
		memcpy(to, ptr, len);

	return 0;
err_clear:
	memset(to, 0, len);
	return -EFAULT;
}

static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
	.func		= bpf_skb_load_bytes,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg4_type	= ARG_CONST_SIZE,
};

BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
{
	/* Idea is the following: should the needed direct read/write
	 * test fail during runtime, we can pull in more data and redo
	 * again, since implicitly, we invalidate previous checks here.
	 *
	 * Or, since we know how much we need to make read/writeable,
	 * this can be done once at the program beginning for direct
	 * access case. By this we overcome limitations of only current
	 * headroom being accessible.
	 */
	return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
}

static const struct bpf_func_proto bpf_skb_pull_data_proto = {
	.func		= bpf_skb_pull_data,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};
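/* Sketch of how a tc/BPF program typically pairs these helpers to patch
 * a byte range in the packet (names and offsets are illustrative):
 *
 *	__u8 buf[4];
 *
 *	if (bpf_skb_load_bytes(skb, off, buf, sizeof(buf)) < 0)
 *		return TC_ACT_SHOT;
 *	...
 *	bpf_skb_store_bytes(skb, off, buf, sizeof(buf), BPF_F_RECOMPUTE_CSUM);
 */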
BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
	   u64, from, u64, to, u64, flags)
{
	__sum16 *ptr;

	if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
		return -EINVAL;
	if (unlikely(offset > 0xffff || offset & 1))
		return -EFAULT;
	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
		return -EFAULT;

	ptr = (__sum16 *)(skb->data + offset);
	switch (flags & BPF_F_HDR_FIELD_MASK) {
	case 0:
		if (unlikely(from != 0))
			return -EINVAL;

		csum_replace_by_diff(ptr, to);
		break;
	case 2:
		csum_replace2(ptr, from, to);
		break;
	case 4:
		csum_replace4(ptr, from, to);
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
	.func		= bpf_l3_csum_replace,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
	   u64, from, u64, to, u64, flags)
{
	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
	bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
	bool do_mforce = flags & BPF_F_MARK_ENFORCE;
	__sum16 *ptr;

	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
			       BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
		return -EINVAL;
	if (unlikely(offset > 0xffff || offset & 1))
		return -EFAULT;
	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
		return -EFAULT;

	ptr = (__sum16 *)(skb->data + offset);
	if (is_mmzero && !do_mforce && !*ptr)
		return 0;

	switch (flags & BPF_F_HDR_FIELD_MASK) {
	case 0:
		if (unlikely(from != 0))
			return -EINVAL;

		inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
		break;
	case 2:
		inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
		break;
	case 4:
		inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
		break;
	default:
		return -EINVAL;
	}

	if (is_mmzero && !*ptr)
		*ptr = CSUM_MANGLED_0;
	return 0;
}

static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
	.func		= bpf_l4_csum_replace,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};
BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
	   __be32 *, to, u32, to_size, __wsum, seed)
{
	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
	u32 diff_size = from_size + to_size;
	int i, j = 0;

	/* This is quite flexible, some examples:
	 *
	 * from_size == 0, to_size > 0,  seed := csum --> pushing data
	 * from_size > 0,  to_size == 0, seed := csum --> pulling data
	 * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
	 *
	 * Even for diffing, from_size and to_size don't need to be equal.
	 */
	if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
		     diff_size > sizeof(sp->diff)))
		return -EINVAL;

	for (i = 0; i < from_size / sizeof(__be32); i++, j++)
		sp->diff[j] = ~from[i];
	for (i = 0; i < to_size / sizeof(__be32); i++, j++)
		sp->diff[j] = to[i];

	return csum_partial(sp->diff, diff_size, seed);
}

static const struct bpf_func_proto bpf_csum_diff_proto = {
	.func		= bpf_csum_diff,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_PTR_TO_MEM_OR_NULL,
	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg5_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
{
	/* The interface is to be used in combination with bpf_csum_diff()
	 * for direct packet writes. csum rotation for alignment as well
	 * as emulating csum_sub() can be done from the eBPF program.
	 */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		return (skb->csum = csum_add(skb->csum, csum));

	return -ENOTSUPP;
}

static const struct bpf_func_proto bpf_csum_update_proto = {
	.func		= bpf_csum_update,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};
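/* The usual eBPF-side pattern combining the checksum helpers when a
 * packet field is rewritten (illustrative sketch, offsets hypothetical):
 *
 *	__be32 old_ip = ..., new_ip = ...;
 *	__s64 diff = bpf_csum_diff(&old_ip, 4, &new_ip, 4, 0);
 *
 *	bpf_l4_csum_replace(skb, tcp_csum_off, 0, diff, BPF_F_PSEUDO_HDR);
 *	bpf_l3_csum_replace(skb, ip_csum_off, 0, diff, 0);
 *	bpf_skb_store_bytes(skb, ip_dst_off, &new_ip, 4, 0);
 */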
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
	return dev_forward_skb(dev, skb);
}

static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
				      struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb);

	if (likely(!ret)) {
		skb->dev = dev;
		ret = netif_rx(skb);
	}

	return ret;
}

static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret;

	if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
		kfree_skb(skb);
		return -ENETDOWN;
	}

	skb->dev = dev;

	__this_cpu_inc(xmit_recursion);
	ret = dev_queue_xmit(skb);
	__this_cpu_dec(xmit_recursion);

	return ret;
}

static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
				 u32 flags)
{
	unsigned int mlen = skb_network_offset(skb);

	if (mlen) {
		__skb_pull(skb, mlen);

		/* At ingress, the mac header has already been pulled once.
		 * At egress, skb_pospull_rcsum has to be done in case that
		 * the skb is originated from ingress (i.e. a forwarded skb)
		 * to ensure that rcsum starts at net header.
		 */
		if (!skb_at_tc_ingress(skb))
			skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
	}
	skb_pop_mac_header(skb);
	skb_reset_mac_len(skb);
	return flags & BPF_F_INGRESS ?
	       __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
}

static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
				 u32 flags)
{
	/* Verify that a link layer header is carried */
	if (unlikely(skb->mac_header >= skb->network_header)) {
		kfree_skb(skb);
		return -ERANGE;
	}

	bpf_push_mac_rcsum(skb);
	return flags & BPF_F_INGRESS ?
	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
}

static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
			  u32 flags)
{
	if (dev_is_mac_header_xmit(dev))
		return __bpf_redirect_common(skb, dev, flags);
	else
		return __bpf_redirect_no_mac(skb, dev, flags);
}
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
	struct net_device *dev;
	struct sk_buff *clone;
	int ret;

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return -EINVAL;

	dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
	if (unlikely(!dev))
		return -EINVAL;

	clone = skb_clone(skb, GFP_ATOMIC);
	if (unlikely(!clone))
		return -ENOMEM;

	/* For direct write, we need to keep the invariant that the skbs
	 * we're dealing with need to be uncloned. Should uncloning fail
	 * here, we need to free the just generated clone to unclone once
	 * again.
	 */
	ret = bpf_try_make_head_writable(skb);
	if (unlikely(ret)) {
		kfree_skb(clone);
		return -ENOMEM;
	}

	return __bpf_redirect(clone, dev, flags);
}

static const struct bpf_func_proto bpf_clone_redirect_proto = {
	.func		= bpf_clone_redirect,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

struct redirect_info {
	u32 ifindex;
	u32 flags;
	struct bpf_map *map;
	struct bpf_map *map_to_flush;
	unsigned long map_owner;
};

static DEFINE_PER_CPU(struct redirect_info, redirect_info);

BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return TC_ACT_SHOT;

	ri->ifindex = ifindex;
	ri->flags = flags;

	return TC_ACT_REDIRECT;
}

int skb_do_redirect(struct sk_buff *skb)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
	struct net_device *dev;

	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
	ri->ifindex = 0;
	if (unlikely(!dev)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return __bpf_redirect(skb, dev, ri->flags);
}

static const struct bpf_func_proto bpf_redirect_proto = {
	.func		= bpf_redirect,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
	   struct bpf_map *, map, u32, key, u64, flags)
{
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	/* If user passes invalid input drop the packet. */
	if (unlikely(flags))
		return SK_DROP;

	tcb->bpf.key = key;
	tcb->bpf.flags = flags;
	tcb->bpf.map = map;

	return SK_PASS;
}

struct sock *do_sk_redirect_map(struct sk_buff *skb)
{
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
	struct sock *sk = NULL;

	if (tcb->bpf.map) {
		sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);

		tcb->bpf.key = 0;
		tcb->bpf.map = NULL;
	}

	return sk;
}

static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
	.func		= bpf_sk_redirect_map,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
};
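/* From the program side, the redirect helpers are usually the final
 * action of a tc classifier (illustrative sketch):
 *
 *	// queue the packet to ifindex 2, at egress
 *	return bpf_redirect(2, 0);
 *
 *	// or: keep processing the original, send a copy to ingress
 *	bpf_clone_redirect(skb, 2, BPF_F_INGRESS);
 *
 * bpf_redirect() only records ifindex/flags in the per-CPU redirect_info;
 * the actual transmit happens later when the caller acts on
 * TC_ACT_REDIRECT via skb_do_redirect().
 */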
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
	return task_get_classid(skb);
}

static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
	.func		= bpf_get_cgroup_classid,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
{
	return dst_tclassid(skb);
}

static const struct bpf_func_proto bpf_get_route_realm_proto = {
	.func		= bpf_get_route_realm,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
{
	/* If skb_clear_hash() was called due to mangling, we can
	 * trigger SW recalculation here. Later access to hash
	 * can then use the inline skb->hash via context directly
	 * instead of calling this helper again.
	 */
	return skb_get_hash(skb);
}

static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
	.func		= bpf_get_hash_recalc,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
{
	/* After all direct packet write, this can be used once for
	 * triggering a lazy recalc on next skb_get_hash() invocation.
	 */
	skb_clear_hash(skb);
	return 0;
}

static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
	.func		= bpf_set_hash_invalid,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
{
	/* Set user specified hash as L4(+), so that it gets returned
	 * on skb_get_hash() call unless BPF prog later on triggers a
	 * skb_clear_hash().
	 */
	__skb_set_sw_hash(skb, hash, true);
	return 0;
}

static const struct bpf_func_proto bpf_set_hash_proto = {
	.func		= bpf_set_hash,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
	   u16, vlan_tci)
{
	int ret;

	if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
		     vlan_proto != htons(ETH_P_8021AD)))
		vlan_proto = htons(ETH_P_8021Q);

	bpf_push_mac_rcsum(skb);
	ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
	bpf_pull_mac_rcsum(skb);

	bpf_compute_data_pointers(skb);
	return ret;
}

const struct bpf_func_proto bpf_skb_vlan_push_proto = {
	.func		= bpf_skb_vlan_push,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};
EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);

BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
	int ret;

	bpf_push_mac_rcsum(skb);
	ret = skb_vlan_pop(skb);
	bpf_pull_mac_rcsum(skb);

	bpf_compute_data_pointers(skb);
	return ret;
}

const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
	.func		= bpf_skb_vlan_pop,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};
EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
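/* Program-side sketch for the VLAN helpers (illustrative):
 *
 *	bpf_skb_vlan_push(skb, bpf_htons(ETH_P_8021Q), 100);	// tag VID 100
 *	...
 *	bpf_skb_vlan_pop(skb);					// strip it again
 *
 * Both helpers may move skb data around, which is why the kernel side
 * calls bpf_compute_data_pointers() before returning.
 */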
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
	/* Caller already did skb_cow() with len as headroom,
	 * so no need to do it here.
	 */
	skb_push(skb, len);
	memmove(skb->data, skb->data + len, off);
	memset(skb->data + off, 0, len);

	/* No skb_postpush_rcsum(skb, skb->data + off, len)
	 * needed here as it does not change the skb->csum
	 * result for checksum complete when summing over
	 * zeroed blocks.
	 */
	return 0;
}

static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
{
	/* skb_ensure_writable() is not needed here, as we're
	 * already working on an uncloned skb.
	 */
	if (unlikely(!pskb_may_pull(skb, off + len)))
		return -ENOMEM;

	skb_postpull_rcsum(skb, skb->data + off, len);
	memmove(skb->data + len, skb->data, off);
	__skb_pull(skb, len);

	return 0;
}

static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
{
	bool trans_same = skb->transport_header == skb->network_header;
	int ret;

	/* There's no need for __skb_push()/__skb_pull() pair to
	 * get to the start of the mac header as we're guaranteed
	 * to always start from here under eBPF.
	 */
	ret = bpf_skb_generic_push(skb, off, len);
	if (likely(!ret)) {
		skb->mac_header -= len;
		skb->network_header -= len;
		if (trans_same)
			skb->transport_header = skb->network_header;
	}

	return ret;
}

static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
{
	bool trans_same = skb->transport_header == skb->network_header;
	int ret;

	/* Same here, __skb_push()/__skb_pull() pair not needed. */
	ret = bpf_skb_generic_pop(skb, off, len);
	if (likely(!ret)) {
		skb->mac_header += len;
		skb->network_header += len;
		if (trans_same)
			skb->transport_header = skb->network_header;
	}

	return ret;
}
static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
	u32 off = skb_mac_header_len(skb);
	int ret;

	ret = skb_cow(skb, len_diff);
	if (unlikely(ret < 0))
		return ret;

	ret = bpf_skb_net_hdr_push(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (skb_is_gso(skb)) {
		/* SKB_GSO_TCPV4 needs to be changed into
		 * SKB_GSO_TCPV6.
		 */
		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV6;
		}

		/* Due to IPv6 header, MSS needs to be downgraded. */
		skb_shinfo(skb)->gso_size -= len_diff;
		/* Header must be checked, and gso_segs recomputed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}

	skb->protocol = htons(ETH_P_IPV6);
	skb_clear_hash(skb);

	return 0;
}

static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
{
	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
	u32 off = skb_mac_header_len(skb);
	int ret;

	ret = skb_unclone(skb, GFP_ATOMIC);
	if (unlikely(ret < 0))
		return ret;

	ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (skb_is_gso(skb)) {
		/* SKB_GSO_TCPV6 needs to be changed into
		 * SKB_GSO_TCPV4.
		 */
		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV4;
		}

		/* Due to IPv4 header, MSS can be upgraded. */
		skb_shinfo(skb)->gso_size += len_diff;
		/* Header must be checked, and gso_segs recomputed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}

	skb->protocol = htons(ETH_P_IP);
	skb_clear_hash(skb);

	return 0;
}

static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
{
	__be16 from_proto = skb->protocol;

	if (from_proto == htons(ETH_P_IP) &&
	      to_proto == htons(ETH_P_IPV6))
		return bpf_skb_proto_4_to_6(skb);

	if (from_proto == htons(ETH_P_IPV6) &&
	      to_proto == htons(ETH_P_IP))
		return bpf_skb_proto_6_to_4(skb);

	return -ENOTSUPP;
}

BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
	   u64, flags)
{
	int ret;

	if (unlikely(flags))
		return -EINVAL;

	/* General idea is that this helper does the basic groundwork
	 * needed for changing the protocol, and eBPF program fills the
	 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
	 * and other helpers, rather than passing a raw buffer here.
	 *
	 * The rationale is to keep this minimal and without a need to
	 * deal with raw packet data. F.e. even if we would pass buffers
	 * here, the program still needs to call the bpf_lX_csum_replace()
	 * helpers anyway. Plus, this way we keep also separation of
	 * concerns, since f.e. bpf_skb_store_bytes() should only take
	 * care of writing.
	 *
	 * Currently, additional options and extension header space are
	 * not supported, but flags register is reserved so we can adapt
	 * that. For offloads, we mark packet as dodgy, so that headers
	 * need to be verified first.
	 */
	ret = bpf_skb_proto_xlat(skb, proto);
	bpf_compute_data_pointers(skb);

	return ret;
}

static const struct bpf_func_proto bpf_skb_change_proto_proto = {
	.func		= bpf_skb_change_proto,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
{
	/* We only allow a restricted subset to be changed for now. */
	if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
		     !skb_pkt_type_ok(pkt_type)))
		return -EINVAL;

	skb->pkt_type = pkt_type;
	return 0;
}

static const struct bpf_func_proto bpf_skb_change_type_proto = {
	.func		= bpf_skb_change_type,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};
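/* Sketch of a NAT64-style use of bpf_skb_change_proto() from a tc
 * program: convert the header shell first, then fill in the new
 * addresses and checksums with the byte/csum helpers (illustrative,
 * l3_off and ip6h are hypothetical program variables):
 *
 *	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0) < 0)
 *		return TC_ACT_SHOT;
 *	bpf_skb_store_bytes(skb, l3_off, &ip6h, sizeof(ip6h), 0);
 */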
static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
{
	switch (skb->protocol) {
	case htons(ETH_P_IP):
		return sizeof(struct iphdr);
	case htons(ETH_P_IPV6):
		return sizeof(struct ipv6hdr);
	default:
		return ~0U;
	}
}

static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
{
	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
	int ret;

	ret = skb_cow(skb, len_diff);
	if (unlikely(ret < 0))
		return ret;

	ret = bpf_skb_net_hdr_push(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (skb_is_gso(skb)) {
		/* Due to header grow, MSS needs to be downgraded. */
		skb_shinfo(skb)->gso_size -= len_diff;
		/* Header must be checked, and gso_segs recomputed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}

	return 0;
}

static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
{
	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
	int ret;

	ret = skb_unclone(skb, GFP_ATOMIC);
	if (unlikely(ret < 0))
		return ret;

	ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (skb_is_gso(skb)) {
		/* Due to header shrink, MSS can be upgraded. */
		skb_shinfo(skb)->gso_size += len_diff;
		/* Header must be checked, and gso_segs recomputed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}

	return 0;
}

static u32 __bpf_skb_max_len(const struct sk_buff *skb)
{
	return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
			  SKB_MAX_ALLOC;
}

static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
{
	bool trans_same = skb->transport_header == skb->network_header;
	u32 len_cur, len_diff_abs = abs(len_diff);
	u32 len_min = bpf_skb_net_base_len(skb);
	u32 len_max = __bpf_skb_max_len(skb);
	__be16 proto = skb->protocol;
	bool shrink = len_diff < 0;
	int ret;

	if (unlikely(len_diff_abs > 0xfffU))
		return -EFAULT;
	if (unlikely(proto != htons(ETH_P_IP) &&
		     proto != htons(ETH_P_IPV6)))
		return -ENOTSUPP;

	len_cur = skb->len - skb_network_offset(skb);
	if (skb_transport_header_was_set(skb) && !trans_same)
		len_cur = skb_network_header_len(skb);
	if ((shrink && (len_diff_abs >= len_cur ||
			len_cur - len_diff_abs < len_min)) ||
	    (!shrink && (skb->len + len_diff_abs > len_max &&
			 !skb_is_gso(skb))))
		return -ENOTSUPP;

	ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
		       bpf_skb_net_grow(skb, len_diff_abs);

	bpf_compute_data_pointers(skb);
	return ret;
}

BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
	   u32, mode, u64, flags)
{
	if (unlikely(flags))
		return -EINVAL;
	if (likely(mode == BPF_ADJ_ROOM_NET))
		return bpf_skb_adjust_net(skb, len_diff);

	return -ENOTSUPP;
}

static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
	.func		= bpf_skb_adjust_room,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
};
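/* bpf_skb_adjust_room() with BPF_ADJ_ROOM_NET grows or shrinks the
 * space between the network and transport header, e.g. to make room
 * for an encapsulation header (illustrative sketch):
 *
 *	// open up 8 bytes after the basic IP header
 *	bpf_skb_adjust_room(skb, 8, BPF_ADJ_ROOM_NET, 0);
 */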
2343 static u32
__bpf_skb_min_len(const struct sk_buff
*skb
)
2345 u32 min_len
= skb_network_offset(skb
);
2347 if (skb_transport_header_was_set(skb
))
2348 min_len
= skb_transport_offset(skb
);
2349 if (skb
->ip_summed
== CHECKSUM_PARTIAL
)
2350 min_len
= skb_checksum_start_offset(skb
) +
2351 skb
->csum_offset
+ sizeof(__sum16
);
2355 static int bpf_skb_grow_rcsum(struct sk_buff
*skb
, unsigned int new_len
)
2357 unsigned int old_len
= skb
->len
;
2360 ret
= __skb_grow_rcsum(skb
, new_len
);
2362 memset(skb
->data
+ old_len
, 0, new_len
- old_len
);
2366 static int bpf_skb_trim_rcsum(struct sk_buff
*skb
, unsigned int new_len
)
2368 return __skb_trim_rcsum(skb
, new_len
);
BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
	   u64, flags)
{
	u32 max_len = __bpf_skb_max_len(skb);
	u32 min_len = __bpf_skb_min_len(skb);
	int ret;

	if (unlikely(flags || new_len > max_len || new_len < min_len))
		return -EINVAL;
	if (skb->encapsulation)
		return -ENOTSUPP;

	/* The basic idea of this helper is that it's performing the
	 * needed work to either grow or trim an skb, and eBPF program
	 * rewrites the rest via helpers like bpf_skb_store_bytes(),
	 * bpf_lX_csum_replace() and others rather than passing a raw
	 * buffer here. This one is a slow path helper and intended
	 * for replies with control messages.
	 *
	 * Like in bpf_skb_change_proto(), we want to keep this rather
	 * minimal and without protocol specifics so that we are able
	 * to separate concerns as in bpf_skb_store_bytes() should only
	 * be the one responsible for writing buffers.
	 *
	 * It's really expected to be a slow path operation here for
	 * control message replies, so we're implicitly linearizing,
	 * uncloning and drop offloads from the skb by this.
	 */
	ret = __bpf_try_make_writable(skb, skb->len);
	if (!ret) {
		if (new_len > skb->len)
			ret = bpf_skb_grow_rcsum(skb, new_len);
		else if (new_len < skb->len)
			ret = bpf_skb_trim_rcsum(skb, new_len);
		if (!ret && skb_is_gso(skb))
			skb_gso_reset(skb);
	}

	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_change_tail_proto = {
	.func		= bpf_skb_change_tail,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};
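/* Minimal usage sketch from the BPF program side; the target length is an
 * assumption for the example. As described above, the program trims (or
 * grows) the buffer and then rewrites contents with other helpers:
 *
 *	__u32 new_len = sizeof(struct ethhdr) + sizeof(struct iphdr) +
 *			sizeof(struct icmphdr);
 *
 *	if (bpf_skb_change_tail(skb, new_len, 0))
 *		return TC_ACT_SHOT;
 *	// skb->data/data_end were recomputed; reload them before use.
 */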
BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
	   u64, flags)
{
	u32 max_len = __bpf_skb_max_len(skb);
	u32 new_len = skb->len + head_room;
	int ret;

	if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
		     new_len < skb->len))
		return -EINVAL;

	ret = skb_cow(skb, head_room);
	if (likely(!ret)) {
		/* Idea for this helper is that we currently only
		 * allow to expand on mac header. This means that
		 * skb->protocol network header, etc, stay as is.
		 * Compared to bpf_skb_change_tail(), we're more
		 * flexible due to not needing to linearize or
		 * reset GSO. Intention for this helper is to be
		 * used by an L3 skb that needs to push mac header
		 * for redirection into L2 device.
		 */
		__skb_push(skb, head_room);
		memset(skb->data, 0, head_room);
		skb_reset_mac_header(skb);
	}

	bpf_compute_data_pointers(skb);
	return 0;
}

static const struct bpf_func_proto bpf_skb_change_head_proto = {
	.func		= bpf_skb_change_head,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};
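/* Minimal usage sketch from the BPF program side (e.g. lwt xmit); the
 * Ethernet header contents and ifindex are assumptions for the example.
 * It follows the intention in the comment above: push room for a MAC
 * header on an L3 skb, fill it, then redirect into an L2 device:
 *
 *	struct ethhdr eth = { .h_proto = bpf_htons(ETH_P_IP) };
 *
 *	if (bpf_skb_change_head(skb, sizeof(eth), 0))
 *		return BPF_DROP;
 *	if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0))
 *		return BPF_DROP;
 *	bpf_redirect(ifindex, 0);
 *	return BPF_REDIRECT;
 */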
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
	return xdp_data_meta_unsupported(xdp) ? 0 :
	       xdp->data - xdp->data_meta;
}

BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
	unsigned long metalen = xdp_get_metalen(xdp);
	void *data_start = xdp->data_hard_start + metalen;
	void *data = xdp->data + offset;

	if (unlikely(data < data_start ||
		     data > xdp->data_end - ETH_HLEN))
		return -EINVAL;

	if (metalen)
		memmove(xdp->data_meta + offset,
			xdp->data_meta, metalen);
	xdp->data_meta += offset;
	xdp->data = data;

	return 0;
}

static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
	.func		= bpf_xdp_adjust_head,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
	void *meta = xdp->data_meta + offset;
	unsigned long metalen = xdp->data - meta;

	if (xdp_data_meta_unsupported(xdp))
		return -ENOTSUPP;
	if (unlikely(meta < xdp->data_hard_start ||
		     meta > xdp->data))
		return -EINVAL;
	if (unlikely((metalen & (sizeof(__u32) - 1)) ||
		     (metalen > 32)))
		return -EACCES;

	xdp->data_meta = meta;

	return 0;
}

static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
	.func		= bpf_xdp_adjust_meta,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};
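/* Minimal usage sketch from the XDP program side; the one-word metadata
 * layout and the mark value are assumptions for the example. A negative
 * offset reserves space in front of xdp->data, which a later tc program
 * can read through skb->data_meta:
 *
 *	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(__u32)))
 *		return XDP_PASS;
 *	__u32 *meta = (void *)(long)ctx->data_meta;
 *	void *data = (void *)(long)ctx->data;
 *	if ((void *)(meta + 1) > data)
 *		return XDP_PASS;
 *	*meta = mark;
 */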
static int __bpf_tx_xdp(struct net_device *dev,
			struct bpf_map *map,
			struct xdp_buff *xdp,
			u32 index)
{
	int err;

	if (!dev->netdev_ops->ndo_xdp_xmit) {
		return -EOPNOTSUPP;
	}

	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
	if (err)
		return err;
	dev->netdev_ops->ndo_xdp_flush(dev);
	return 0;
}

static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
			    struct bpf_map *map,
			    struct xdp_buff *xdp,
			    u32 index)
{
	int err;

	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
		struct net_device *dev = fwd;

		if (!dev->netdev_ops->ndo_xdp_xmit)
			return -EOPNOTSUPP;

		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
		if (err)
			return err;
		__dev_map_insert_ctx(map, index);
	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
		struct bpf_cpu_map_entry *rcpu = fwd;

		err = cpu_map_enqueue(rcpu, xdp, dev_rx);
		if (err)
			return err;
		__cpu_map_insert_ctx(map, index);
	}
	return 0;
}
void xdp_do_flush_map(void)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
	struct bpf_map *map = ri->map_to_flush;

	ri->map_to_flush = NULL;
	if (map) {
		switch (map->map_type) {
		case BPF_MAP_TYPE_DEVMAP:
			__dev_map_flush(map);
			break;
		case BPF_MAP_TYPE_CPUMAP:
			__cpu_map_flush(map);
			break;
		default:
			break;
		}
	}
}
EXPORT_SYMBOL_GPL(xdp_do_flush_map);
static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
{
	switch (map->map_type) {
	case BPF_MAP_TYPE_DEVMAP:
		return __dev_map_lookup_elem(map, index);
	case BPF_MAP_TYPE_CPUMAP:
		return __cpu_map_lookup_elem(map, index);
	default:
		return NULL;
	}
}

static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
				   unsigned long aux)
{
	return (unsigned long)xdp_prog->aux != aux;
}
static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
			       struct bpf_prog *xdp_prog)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
	unsigned long map_owner = ri->map_owner;
	struct bpf_map *map = ri->map;
	u32 index = ri->ifindex;
	void *fwd = NULL;
	int err;

	ri->ifindex = 0;
	ri->map = NULL;
	ri->map_owner = 0;

	if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
		err = -EFAULT;
		map = NULL;
		goto err;
	}

	fwd = __xdp_map_lookup_elem(map, index);
	if (!fwd) {
		err = -EINVAL;
		goto err;
	}
	if (ri->map_to_flush && ri->map_to_flush != map)
		xdp_do_flush_map();

	err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
	if (unlikely(err))
		goto err;

	ri->map_to_flush = map;
	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
	return 0;
err:
	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
	return err;
}

int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
		    struct bpf_prog *xdp_prog)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
	struct net_device *fwd;
	u32 index = ri->ifindex;
	int err;

	if (ri->map)
		return xdp_do_redirect_map(dev, xdp, xdp_prog);

	fwd = dev_get_by_index_rcu(dev_net(dev), index);
	ri->ifindex = 0;
	if (unlikely(!fwd)) {
		err = -EINVAL;
		goto err;
	}

	err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
	if (unlikely(err))
		goto err;

	_trace_xdp_redirect(dev, xdp_prog, index);
	return 0;
err:
	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
	return err;
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
{
	unsigned int len;

	if (unlikely(!(fwd->flags & IFF_UP)))
		return -ENETDOWN;

	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
	if (skb->len > len)
		return -EMSGSIZE;

	return 0;
}

int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb,
				struct bpf_prog *xdp_prog)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
	unsigned long map_owner = ri->map_owner;
	struct bpf_map *map = ri->map;
	struct net_device *fwd = NULL;
	u32 index = ri->ifindex;
	int err = 0;

	ri->ifindex = 0;
	ri->map = NULL;
	ri->map_owner = 0;

	if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
		err = -EFAULT;
		map = NULL;
		goto err;
	}
	fwd = __xdp_map_lookup_elem(map, index);
	if (unlikely(!fwd)) {
		err = -EINVAL;
		goto err;
	}

	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
		if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
			goto err;
		skb->dev = fwd;
	} else {
		/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
		err = -EBADRQC;
		goto err;
	}

	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
	return 0;
err:
	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
	return err;
}

int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
			    struct bpf_prog *xdp_prog)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
	u32 index = ri->ifindex;
	struct net_device *fwd;
	int err = 0;

	if (ri->map)
		return xdp_do_generic_redirect_map(dev, skb, xdp_prog);

	ri->ifindex = 0;
	fwd = dev_get_by_index_rcu(dev_net(dev), index);
	if (unlikely(!fwd)) {
		err = -EINVAL;
		goto err;
	}

	if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
		goto err;

	skb->dev = fwd;
	_trace_xdp_redirect(dev, xdp_prog, index);
	return 0;
err:
	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
	return err;
}
EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);

	if (unlikely(flags))
		return XDP_ABORTED;

	ri->ifindex = ifindex;
	ri->flags = flags;
	ri->map = NULL;
	ri->map_owner = 0;

	return XDP_REDIRECT;
}

static const struct bpf_func_proto bpf_xdp_redirect_proto = {
	.func		= bpf_xdp_redirect,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags,
	   unsigned long, map_owner)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);

	if (unlikely(flags))
		return XDP_ABORTED;

	ri->ifindex = ifindex;
	ri->flags = flags;
	ri->map = map;
	ri->map_owner = map_owner;

	return XDP_REDIRECT;
}

/* Note, arg4 is hidden from users and populated by the verifier
 * with the right pointer.
 */
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
	.func		= bpf_xdp_redirect_map,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};
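/* Minimal usage sketch from the XDP program side; the devmap name and key
 * are assumptions for the example. The program only passes (map, key,
 * flags), while map_owner above is filled in by the verifier as noted:
 *
 *	return bpf_redirect_map(&tx_port, port_idx, 0);
 *
 * On XDP_REDIRECT the driver invokes xdp_do_redirect(), which resolves the
 * map entry and batches transmits until xdp_do_flush_map() runs.
 */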
bool bpf_helper_changes_pkt_data(void *func)
{
	if (func == bpf_skb_vlan_push ||
	    func == bpf_skb_vlan_pop ||
	    func == bpf_skb_store_bytes ||
	    func == bpf_skb_change_proto ||
	    func == bpf_skb_change_head ||
	    func == bpf_skb_change_tail ||
	    func == bpf_skb_adjust_room ||
	    func == bpf_skb_pull_data ||
	    func == bpf_clone_redirect ||
	    func == bpf_l3_csum_replace ||
	    func == bpf_l4_csum_replace ||
	    func == bpf_xdp_adjust_head ||
	    func == bpf_xdp_adjust_meta)
		return true;

	return false;
}
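/* Program-side note: because the helpers listed above may move or
 * reallocate packet memory, the verifier invalidates packet pointers
 * across such calls and programs must re-derive them. A minimal sketch:
 *
 *	bpf_skb_pull_data(skb, 0);
 *	data     = (void *)(long)skb->data;	// reload after the call
 *	data_end = (void *)(long)skb->data_end;
 *	if (data + sizeof(struct ethhdr) > data_end)
 *		return TC_ACT_OK;
 */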
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
				  unsigned long off, unsigned long len)
{
	void *ptr = skb_header_pointer(skb, off, len, dst_buff);

	if (unlikely(!ptr))
		return len;
	if (ptr != dst_buff)
		memcpy(dst_buff, ptr, len);

	return 0;
}

BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
	   u64, flags, void *, meta, u64, meta_size)
{
	u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;

	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
		return -EINVAL;
	if (unlikely(skb_size > skb->len))
		return -EFAULT;

	return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
				bpf_skb_copy);
}

static const struct bpf_func_proto bpf_skb_event_output_proto = {
	.func		= bpf_skb_event_output,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_MEM,
	.arg5_type	= ARG_CONST_SIZE,
};
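/* Minimal usage sketch from the BPF program side; the map and struct names
 * are assumptions for the example. The upper 32 bits of flags request that
 * sample_len packet bytes be appended after the program's own meta data:
 *
 *	__u64 flags = BPF_F_CURRENT_CPU | ((__u64)sample_len << 32);
 *
 *	bpf_perf_event_output(skb, &events, flags, &meta, sizeof(meta));
 */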
static unsigned short bpf_tunnel_key_af(u64 flags)
{
	return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
}
2878 BPF_CALL_4(bpf_skb_get_tunnel_key
, struct sk_buff
*, skb
, struct bpf_tunnel_key
*, to
,
2879 u32
, size
, u64
, flags
)
2881 const struct ip_tunnel_info
*info
= skb_tunnel_info(skb
);
2882 u8 compat
[sizeof(struct bpf_tunnel_key
)];
2886 if (unlikely(!info
|| (flags
& ~(BPF_F_TUNINFO_IPV6
)))) {
2890 if (ip_tunnel_info_af(info
) != bpf_tunnel_key_af(flags
)) {
2894 if (unlikely(size
!= sizeof(struct bpf_tunnel_key
))) {
2897 case offsetof(struct bpf_tunnel_key
, tunnel_label
):
2898 case offsetof(struct bpf_tunnel_key
, tunnel_ext
):
2900 case offsetof(struct bpf_tunnel_key
, remote_ipv6
[1]):
2901 /* Fixup deprecated structure layouts here, so we have
2902 * a common path later on.
2904 if (ip_tunnel_info_af(info
) != AF_INET
)
2907 to
= (struct bpf_tunnel_key
*)compat
;
2914 to
->tunnel_id
= be64_to_cpu(info
->key
.tun_id
);
2915 to
->tunnel_tos
= info
->key
.tos
;
2916 to
->tunnel_ttl
= info
->key
.ttl
;
2918 if (flags
& BPF_F_TUNINFO_IPV6
) {
2919 memcpy(to
->remote_ipv6
, &info
->key
.u
.ipv6
.src
,
2920 sizeof(to
->remote_ipv6
));
2921 to
->tunnel_label
= be32_to_cpu(info
->key
.label
);
2923 to
->remote_ipv4
= be32_to_cpu(info
->key
.u
.ipv4
.src
);
2926 if (unlikely(size
!= sizeof(struct bpf_tunnel_key
)))
2927 memcpy(to_orig
, to
, size
);
2931 memset(to_orig
, 0, size
);
2935 static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto
= {
2936 .func
= bpf_skb_get_tunnel_key
,
2938 .ret_type
= RET_INTEGER
,
2939 .arg1_type
= ARG_PTR_TO_CTX
,
2940 .arg2_type
= ARG_PTR_TO_UNINIT_MEM
,
2941 .arg3_type
= ARG_CONST_SIZE
,
2942 .arg4_type
= ARG_ANYTHING
,
2945 BPF_CALL_3(bpf_skb_get_tunnel_opt
, struct sk_buff
*, skb
, u8
*, to
, u32
, size
)
2947 const struct ip_tunnel_info
*info
= skb_tunnel_info(skb
);
2950 if (unlikely(!info
||
2951 !(info
->key
.tun_flags
& TUNNEL_OPTIONS_PRESENT
))) {
2955 if (unlikely(size
< info
->options_len
)) {
2960 ip_tunnel_info_opts_get(to
, info
);
2961 if (size
> info
->options_len
)
2962 memset(to
+ info
->options_len
, 0, size
- info
->options_len
);
2964 return info
->options_len
;
2966 memset(to
, 0, size
);
2970 static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto
= {
2971 .func
= bpf_skb_get_tunnel_opt
,
2973 .ret_type
= RET_INTEGER
,
2974 .arg1_type
= ARG_PTR_TO_CTX
,
2975 .arg2_type
= ARG_PTR_TO_UNINIT_MEM
,
2976 .arg3_type
= ARG_CONST_SIZE
,
2979 static struct metadata_dst __percpu
*md_dst
;
2981 BPF_CALL_4(bpf_skb_set_tunnel_key
, struct sk_buff
*, skb
,
2982 const struct bpf_tunnel_key
*, from
, u32
, size
, u64
, flags
)
2984 struct metadata_dst
*md
= this_cpu_ptr(md_dst
);
2985 u8 compat
[sizeof(struct bpf_tunnel_key
)];
2986 struct ip_tunnel_info
*info
;
2988 if (unlikely(flags
& ~(BPF_F_TUNINFO_IPV6
| BPF_F_ZERO_CSUM_TX
|
2989 BPF_F_DONT_FRAGMENT
)))
2991 if (unlikely(size
!= sizeof(struct bpf_tunnel_key
))) {
2993 case offsetof(struct bpf_tunnel_key
, tunnel_label
):
2994 case offsetof(struct bpf_tunnel_key
, tunnel_ext
):
2995 case offsetof(struct bpf_tunnel_key
, remote_ipv6
[1]):
2996 /* Fixup deprecated structure layouts here, so we have
2997 * a common path later on.
2999 memcpy(compat
, from
, size
);
3000 memset(compat
+ size
, 0, sizeof(compat
) - size
);
3001 from
= (const struct bpf_tunnel_key
*) compat
;
3007 if (unlikely((!(flags
& BPF_F_TUNINFO_IPV6
) && from
->tunnel_label
) ||
3012 dst_hold((struct dst_entry
*) md
);
3013 skb_dst_set(skb
, (struct dst_entry
*) md
);
3015 info
= &md
->u
.tun_info
;
3016 info
->mode
= IP_TUNNEL_INFO_TX
;
3018 info
->key
.tun_flags
= TUNNEL_KEY
| TUNNEL_CSUM
| TUNNEL_NOCACHE
;
3019 if (flags
& BPF_F_DONT_FRAGMENT
)
3020 info
->key
.tun_flags
|= TUNNEL_DONT_FRAGMENT
;
3022 info
->key
.tun_id
= cpu_to_be64(from
->tunnel_id
);
3023 info
->key
.tos
= from
->tunnel_tos
;
3024 info
->key
.ttl
= from
->tunnel_ttl
;
3026 if (flags
& BPF_F_TUNINFO_IPV6
) {
3027 info
->mode
|= IP_TUNNEL_INFO_IPV6
;
3028 memcpy(&info
->key
.u
.ipv6
.dst
, from
->remote_ipv6
,
3029 sizeof(from
->remote_ipv6
));
3030 info
->key
.label
= cpu_to_be32(from
->tunnel_label
) &
3031 IPV6_FLOWLABEL_MASK
;
3033 info
->key
.u
.ipv4
.dst
= cpu_to_be32(from
->remote_ipv4
);
3034 if (flags
& BPF_F_ZERO_CSUM_TX
)
3035 info
->key
.tun_flags
&= ~TUNNEL_CSUM
;
3041 static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto
= {
3042 .func
= bpf_skb_set_tunnel_key
,
3044 .ret_type
= RET_INTEGER
,
3045 .arg1_type
= ARG_PTR_TO_CTX
,
3046 .arg2_type
= ARG_PTR_TO_MEM
,
3047 .arg3_type
= ARG_CONST_SIZE
,
3048 .arg4_type
= ARG_ANYTHING
,
3051 BPF_CALL_3(bpf_skb_set_tunnel_opt
, struct sk_buff
*, skb
,
3052 const u8
*, from
, u32
, size
)
3054 struct ip_tunnel_info
*info
= skb_tunnel_info(skb
);
3055 const struct metadata_dst
*md
= this_cpu_ptr(md_dst
);
3057 if (unlikely(info
!= &md
->u
.tun_info
|| (size
& (sizeof(u32
) - 1))))
3059 if (unlikely(size
> IP_TUNNEL_OPTS_MAX
))
3062 ip_tunnel_info_opts_set(info
, from
, size
);
3067 static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto
= {
3068 .func
= bpf_skb_set_tunnel_opt
,
3070 .ret_type
= RET_INTEGER
,
3071 .arg1_type
= ARG_PTR_TO_CTX
,
3072 .arg2_type
= ARG_PTR_TO_MEM
,
3073 .arg3_type
= ARG_CONST_SIZE
,
3076 static const struct bpf_func_proto
*
3077 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which
)
3080 struct metadata_dst __percpu
*tmp
;
3082 tmp
= metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX
,
3087 if (cmpxchg(&md_dst
, NULL
, tmp
))
3088 metadata_dst_free_percpu(tmp
);
3092 case BPF_FUNC_skb_set_tunnel_key
:
3093 return &bpf_skb_set_tunnel_key_proto
;
3094 case BPF_FUNC_skb_set_tunnel_opt
:
3095 return &bpf_skb_set_tunnel_opt_proto
;
3101 BPF_CALL_3(bpf_skb_under_cgroup
, struct sk_buff
*, skb
, struct bpf_map
*, map
,
3104 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
3105 struct cgroup
*cgrp
;
3108 sk
= skb_to_full_sk(skb
);
3109 if (!sk
|| !sk_fullsock(sk
))
3111 if (unlikely(idx
>= array
->map
.max_entries
))
3114 cgrp
= READ_ONCE(array
->ptrs
[idx
]);
3115 if (unlikely(!cgrp
))
3118 return sk_under_cgroup_hierarchy(sk
, cgrp
);
3121 static const struct bpf_func_proto bpf_skb_under_cgroup_proto
= {
3122 .func
= bpf_skb_under_cgroup
,
3124 .ret_type
= RET_INTEGER
,
3125 .arg1_type
= ARG_PTR_TO_CTX
,
3126 .arg2_type
= ARG_CONST_MAP_PTR
,
3127 .arg3_type
= ARG_ANYTHING
,
3130 static unsigned long bpf_xdp_copy(void *dst_buff
, const void *src_buff
,
3131 unsigned long off
, unsigned long len
)
3133 memcpy(dst_buff
, src_buff
+ off
, len
);
3137 BPF_CALL_5(bpf_xdp_event_output
, struct xdp_buff
*, xdp
, struct bpf_map
*, map
,
3138 u64
, flags
, void *, meta
, u64
, meta_size
)
3140 u64 xdp_size
= (flags
& BPF_F_CTXLEN_MASK
) >> 32;
3142 if (unlikely(flags
& ~(BPF_F_CTXLEN_MASK
| BPF_F_INDEX_MASK
)))
3144 if (unlikely(xdp_size
> (unsigned long)(xdp
->data_end
- xdp
->data
)))
3147 return bpf_event_output(map
, flags
, meta
, meta_size
, xdp
->data
,
3148 xdp_size
, bpf_xdp_copy
);
3151 static const struct bpf_func_proto bpf_xdp_event_output_proto
= {
3152 .func
= bpf_xdp_event_output
,
3154 .ret_type
= RET_INTEGER
,
3155 .arg1_type
= ARG_PTR_TO_CTX
,
3156 .arg2_type
= ARG_CONST_MAP_PTR
,
3157 .arg3_type
= ARG_ANYTHING
,
3158 .arg4_type
= ARG_PTR_TO_MEM
,
3159 .arg5_type
= ARG_CONST_SIZE
,
3162 BPF_CALL_1(bpf_get_socket_cookie
, struct sk_buff
*, skb
)
3164 return skb
->sk
? sock_gen_cookie(skb
->sk
) : 0;
3167 static const struct bpf_func_proto bpf_get_socket_cookie_proto
= {
3168 .func
= bpf_get_socket_cookie
,
3170 .ret_type
= RET_INTEGER
,
3171 .arg1_type
= ARG_PTR_TO_CTX
,
3174 BPF_CALL_1(bpf_get_socket_uid
, struct sk_buff
*, skb
)
3176 struct sock
*sk
= sk_to_full_sk(skb
->sk
);
3179 if (!sk
|| !sk_fullsock(sk
))
3181 kuid
= sock_net_uid(sock_net(sk
), sk
);
3182 return from_kuid_munged(sock_net(sk
)->user_ns
, kuid
);
3185 static const struct bpf_func_proto bpf_get_socket_uid_proto
= {
3186 .func
= bpf_get_socket_uid
,
3188 .ret_type
= RET_INTEGER
,
3189 .arg1_type
= ARG_PTR_TO_CTX
,
3192 BPF_CALL_5(bpf_setsockopt
, struct bpf_sock_ops_kern
*, bpf_sock
,
3193 int, level
, int, optname
, char *, optval
, int, optlen
)
3195 struct sock
*sk
= bpf_sock
->sk
;
3199 if (!sk_fullsock(sk
))
3202 if (level
== SOL_SOCKET
) {
3203 if (optlen
!= sizeof(int))
3205 val
= *((int *)optval
);
3207 /* Only some socketops are supported */
3210 val
= min_t(u32
, val
, sysctl_rmem_max
);
3211 sk
->sk_userlocks
|= SOCK_RCVBUF_LOCK
;
3212 sk
->sk_rcvbuf
= max_t(int, val
* 2, SOCK_MIN_RCVBUF
);
3215 val
= min_t(u32
, val
, sysctl_wmem_max
);
3216 sk
->sk_userlocks
|= SOCK_SNDBUF_LOCK
;
3217 sk
->sk_sndbuf
= max_t(int, val
* 2, SOCK_MIN_SNDBUF
);
3219 case SO_MAX_PACING_RATE
:
3220 sk
->sk_max_pacing_rate
= val
;
3221 sk
->sk_pacing_rate
= min(sk
->sk_pacing_rate
,
3222 sk
->sk_max_pacing_rate
);
3225 sk
->sk_priority
= val
;
3230 sk
->sk_rcvlowat
= val
? : 1;
3233 if (sk
->sk_mark
!= val
) {
3242 } else if (level
== SOL_TCP
&&
3243 sk
->sk_prot
->setsockopt
== tcp_setsockopt
) {
3244 if (optname
== TCP_CONGESTION
) {
3245 char name
[TCP_CA_NAME_MAX
];
3246 bool reinit
= bpf_sock
->op
> BPF_SOCK_OPS_NEEDS_ECN
;
3248 strncpy(name
, optval
, min_t(long, optlen
,
3249 TCP_CA_NAME_MAX
-1));
3250 name
[TCP_CA_NAME_MAX
-1] = 0;
3251 ret
= tcp_set_congestion_control(sk
, name
, false,
3254 struct tcp_sock
*tp
= tcp_sk(sk
);
3256 if (optlen
!= sizeof(int))
3259 val
= *((int *)optval
);
3260 /* Only some options are supported */
3263 if (val
<= 0 || tp
->data_segs_out
> tp
->syn_data
)
3268 case TCP_BPF_SNDCWND_CLAMP
:
3272 tp
->snd_cwnd_clamp
= val
;
3273 tp
->snd_ssthresh
= val
;
3287 static const struct bpf_func_proto bpf_setsockopt_proto
= {
3288 .func
= bpf_setsockopt
,
3290 .ret_type
= RET_INTEGER
,
3291 .arg1_type
= ARG_PTR_TO_CTX
,
3292 .arg2_type
= ARG_ANYTHING
,
3293 .arg3_type
= ARG_ANYTHING
,
3294 .arg4_type
= ARG_PTR_TO_MEM
,
3295 .arg5_type
= ARG_CONST_SIZE
,
3298 BPF_CALL_5(bpf_getsockopt
, struct bpf_sock_ops_kern
*, bpf_sock
,
3299 int, level
, int, optname
, char *, optval
, int, optlen
)
3301 struct sock
*sk
= bpf_sock
->sk
;
3303 if (!sk_fullsock(sk
))
3307 if (level
== SOL_TCP
&& sk
->sk_prot
->getsockopt
== tcp_getsockopt
) {
3308 if (optname
== TCP_CONGESTION
) {
3309 struct inet_connection_sock
*icsk
= inet_csk(sk
);
3311 if (!icsk
->icsk_ca_ops
|| optlen
<= 1)
3313 strncpy(optval
, icsk
->icsk_ca_ops
->name
, optlen
);
3314 optval
[optlen
- 1] = 0;
3324 memset(optval
, 0, optlen
);
3328 static const struct bpf_func_proto bpf_getsockopt_proto
= {
3329 .func
= bpf_getsockopt
,
3331 .ret_type
= RET_INTEGER
,
3332 .arg1_type
= ARG_PTR_TO_CTX
,
3333 .arg2_type
= ARG_ANYTHING
,
3334 .arg3_type
= ARG_ANYTHING
,
3335 .arg4_type
= ARG_PTR_TO_UNINIT_MEM
,
3336 .arg5_type
= ARG_CONST_SIZE
,
3339 static const struct bpf_func_proto
*
3340 bpf_base_func_proto(enum bpf_func_id func_id
)
3343 case BPF_FUNC_map_lookup_elem
:
3344 return &bpf_map_lookup_elem_proto
;
3345 case BPF_FUNC_map_update_elem
:
3346 return &bpf_map_update_elem_proto
;
3347 case BPF_FUNC_map_delete_elem
:
3348 return &bpf_map_delete_elem_proto
;
3349 case BPF_FUNC_get_prandom_u32
:
3350 return &bpf_get_prandom_u32_proto
;
3351 case BPF_FUNC_get_smp_processor_id
:
3352 return &bpf_get_raw_smp_processor_id_proto
;
3353 case BPF_FUNC_get_numa_node_id
:
3354 return &bpf_get_numa_node_id_proto
;
3355 case BPF_FUNC_tail_call
:
3356 return &bpf_tail_call_proto
;
3357 case BPF_FUNC_ktime_get_ns
:
3358 return &bpf_ktime_get_ns_proto
;
3359 case BPF_FUNC_trace_printk
:
3360 if (capable(CAP_SYS_ADMIN
))
3361 return bpf_get_trace_printk_proto();
3367 static const struct bpf_func_proto
*
3368 sock_filter_func_proto(enum bpf_func_id func_id
)
3371 /* inet and inet6 sockets are created in a process
3372 * context so there is always a valid uid/gid
3374 case BPF_FUNC_get_current_uid_gid
:
3375 return &bpf_get_current_uid_gid_proto
;
3377 return bpf_base_func_proto(func_id
);
3381 static const struct bpf_func_proto
*
3382 sk_filter_func_proto(enum bpf_func_id func_id
)
3385 case BPF_FUNC_skb_load_bytes
:
3386 return &bpf_skb_load_bytes_proto
;
3387 case BPF_FUNC_get_socket_cookie
:
3388 return &bpf_get_socket_cookie_proto
;
3389 case BPF_FUNC_get_socket_uid
:
3390 return &bpf_get_socket_uid_proto
;
3392 return bpf_base_func_proto(func_id
);
3396 static const struct bpf_func_proto
*
3397 tc_cls_act_func_proto(enum bpf_func_id func_id
)
3400 case BPF_FUNC_skb_store_bytes
:
3401 return &bpf_skb_store_bytes_proto
;
3402 case BPF_FUNC_skb_load_bytes
:
3403 return &bpf_skb_load_bytes_proto
;
3404 case BPF_FUNC_skb_pull_data
:
3405 return &bpf_skb_pull_data_proto
;
3406 case BPF_FUNC_csum_diff
:
3407 return &bpf_csum_diff_proto
;
3408 case BPF_FUNC_csum_update
:
3409 return &bpf_csum_update_proto
;
3410 case BPF_FUNC_l3_csum_replace
:
3411 return &bpf_l3_csum_replace_proto
;
3412 case BPF_FUNC_l4_csum_replace
:
3413 return &bpf_l4_csum_replace_proto
;
3414 case BPF_FUNC_clone_redirect
:
3415 return &bpf_clone_redirect_proto
;
3416 case BPF_FUNC_get_cgroup_classid
:
3417 return &bpf_get_cgroup_classid_proto
;
3418 case BPF_FUNC_skb_vlan_push
:
3419 return &bpf_skb_vlan_push_proto
;
3420 case BPF_FUNC_skb_vlan_pop
:
3421 return &bpf_skb_vlan_pop_proto
;
3422 case BPF_FUNC_skb_change_proto
:
3423 return &bpf_skb_change_proto_proto
;
3424 case BPF_FUNC_skb_change_type
:
3425 return &bpf_skb_change_type_proto
;
3426 case BPF_FUNC_skb_adjust_room
:
3427 return &bpf_skb_adjust_room_proto
;
3428 case BPF_FUNC_skb_change_tail
:
3429 return &bpf_skb_change_tail_proto
;
3430 case BPF_FUNC_skb_get_tunnel_key
:
3431 return &bpf_skb_get_tunnel_key_proto
;
3432 case BPF_FUNC_skb_set_tunnel_key
:
3433 return bpf_get_skb_set_tunnel_proto(func_id
);
3434 case BPF_FUNC_skb_get_tunnel_opt
:
3435 return &bpf_skb_get_tunnel_opt_proto
;
3436 case BPF_FUNC_skb_set_tunnel_opt
:
3437 return bpf_get_skb_set_tunnel_proto(func_id
);
3438 case BPF_FUNC_redirect
:
3439 return &bpf_redirect_proto
;
3440 case BPF_FUNC_get_route_realm
:
3441 return &bpf_get_route_realm_proto
;
3442 case BPF_FUNC_get_hash_recalc
:
3443 return &bpf_get_hash_recalc_proto
;
3444 case BPF_FUNC_set_hash_invalid
:
3445 return &bpf_set_hash_invalid_proto
;
3446 case BPF_FUNC_set_hash
:
3447 return &bpf_set_hash_proto
;
3448 case BPF_FUNC_perf_event_output
:
3449 return &bpf_skb_event_output_proto
;
3450 case BPF_FUNC_get_smp_processor_id
:
3451 return &bpf_get_smp_processor_id_proto
;
3452 case BPF_FUNC_skb_under_cgroup
:
3453 return &bpf_skb_under_cgroup_proto
;
3454 case BPF_FUNC_get_socket_cookie
:
3455 return &bpf_get_socket_cookie_proto
;
3456 case BPF_FUNC_get_socket_uid
:
3457 return &bpf_get_socket_uid_proto
;
3459 return bpf_base_func_proto(func_id
);
3463 static const struct bpf_func_proto
*
3464 xdp_func_proto(enum bpf_func_id func_id
)
3467 case BPF_FUNC_perf_event_output
:
3468 return &bpf_xdp_event_output_proto
;
3469 case BPF_FUNC_get_smp_processor_id
:
3470 return &bpf_get_smp_processor_id_proto
;
3471 case BPF_FUNC_xdp_adjust_head
:
3472 return &bpf_xdp_adjust_head_proto
;
3473 case BPF_FUNC_xdp_adjust_meta
:
3474 return &bpf_xdp_adjust_meta_proto
;
3475 case BPF_FUNC_redirect
:
3476 return &bpf_xdp_redirect_proto
;
3477 case BPF_FUNC_redirect_map
:
3478 return &bpf_xdp_redirect_map_proto
;
3480 return bpf_base_func_proto(func_id
);
3484 static const struct bpf_func_proto
*
3485 lwt_inout_func_proto(enum bpf_func_id func_id
)
3488 case BPF_FUNC_skb_load_bytes
:
3489 return &bpf_skb_load_bytes_proto
;
3490 case BPF_FUNC_skb_pull_data
:
3491 return &bpf_skb_pull_data_proto
;
3492 case BPF_FUNC_csum_diff
:
3493 return &bpf_csum_diff_proto
;
3494 case BPF_FUNC_get_cgroup_classid
:
3495 return &bpf_get_cgroup_classid_proto
;
3496 case BPF_FUNC_get_route_realm
:
3497 return &bpf_get_route_realm_proto
;
3498 case BPF_FUNC_get_hash_recalc
:
3499 return &bpf_get_hash_recalc_proto
;
3500 case BPF_FUNC_perf_event_output
:
3501 return &bpf_skb_event_output_proto
;
3502 case BPF_FUNC_get_smp_processor_id
:
3503 return &bpf_get_smp_processor_id_proto
;
3504 case BPF_FUNC_skb_under_cgroup
:
3505 return &bpf_skb_under_cgroup_proto
;
3507 return bpf_base_func_proto(func_id
);
3511 static const struct bpf_func_proto
*
3512 sock_ops_func_proto(enum bpf_func_id func_id
)
3515 case BPF_FUNC_setsockopt
:
3516 return &bpf_setsockopt_proto
;
3517 case BPF_FUNC_getsockopt
:
3518 return &bpf_getsockopt_proto
;
3519 case BPF_FUNC_sock_map_update
:
3520 return &bpf_sock_map_update_proto
;
3522 return bpf_base_func_proto(func_id
);
3526 static const struct bpf_func_proto
*sk_skb_func_proto(enum bpf_func_id func_id
)
3529 case BPF_FUNC_skb_store_bytes
:
3530 return &bpf_skb_store_bytes_proto
;
3531 case BPF_FUNC_skb_load_bytes
:
3532 return &bpf_skb_load_bytes_proto
;
3533 case BPF_FUNC_skb_pull_data
:
3534 return &bpf_skb_pull_data_proto
;
3535 case BPF_FUNC_skb_change_tail
:
3536 return &bpf_skb_change_tail_proto
;
3537 case BPF_FUNC_skb_change_head
:
3538 return &bpf_skb_change_head_proto
;
3539 case BPF_FUNC_get_socket_cookie
:
3540 return &bpf_get_socket_cookie_proto
;
3541 case BPF_FUNC_get_socket_uid
:
3542 return &bpf_get_socket_uid_proto
;
3543 case BPF_FUNC_sk_redirect_map
:
3544 return &bpf_sk_redirect_map_proto
;
3546 return bpf_base_func_proto(func_id
);
3550 static const struct bpf_func_proto
*
3551 lwt_xmit_func_proto(enum bpf_func_id func_id
)
3554 case BPF_FUNC_skb_get_tunnel_key
:
3555 return &bpf_skb_get_tunnel_key_proto
;
3556 case BPF_FUNC_skb_set_tunnel_key
:
3557 return bpf_get_skb_set_tunnel_proto(func_id
);
3558 case BPF_FUNC_skb_get_tunnel_opt
:
3559 return &bpf_skb_get_tunnel_opt_proto
;
3560 case BPF_FUNC_skb_set_tunnel_opt
:
3561 return bpf_get_skb_set_tunnel_proto(func_id
);
3562 case BPF_FUNC_redirect
:
3563 return &bpf_redirect_proto
;
3564 case BPF_FUNC_clone_redirect
:
3565 return &bpf_clone_redirect_proto
;
3566 case BPF_FUNC_skb_change_tail
:
3567 return &bpf_skb_change_tail_proto
;
3568 case BPF_FUNC_skb_change_head
:
3569 return &bpf_skb_change_head_proto
;
3570 case BPF_FUNC_skb_store_bytes
:
3571 return &bpf_skb_store_bytes_proto
;
3572 case BPF_FUNC_csum_update
:
3573 return &bpf_csum_update_proto
;
3574 case BPF_FUNC_l3_csum_replace
:
3575 return &bpf_l3_csum_replace_proto
;
3576 case BPF_FUNC_l4_csum_replace
:
3577 return &bpf_l4_csum_replace_proto
;
3578 case BPF_FUNC_set_hash_invalid
:
3579 return &bpf_set_hash_invalid_proto
;
3581 return lwt_inout_func_proto(func_id
);
3585 static bool bpf_skb_is_valid_access(int off
, int size
, enum bpf_access_type type
,
3586 struct bpf_insn_access_aux
*info
)
3588 const int size_default
= sizeof(__u32
);
3590 if (off
< 0 || off
>= sizeof(struct __sk_buff
))
3593 /* The verifier guarantees that size > 0. */
3594 if (off
% size
!= 0)
3598 case bpf_ctx_range_till(struct __sk_buff
, cb
[0], cb
[4]):
3599 if (off
+ size
> offsetofend(struct __sk_buff
, cb
[4]))
3602 case bpf_ctx_range_till(struct __sk_buff
, remote_ip6
[0], remote_ip6
[3]):
3603 case bpf_ctx_range_till(struct __sk_buff
, local_ip6
[0], local_ip6
[3]):
3604 case bpf_ctx_range_till(struct __sk_buff
, remote_ip4
, remote_ip4
):
3605 case bpf_ctx_range_till(struct __sk_buff
, local_ip4
, local_ip4
):
3606 case bpf_ctx_range(struct __sk_buff
, data
):
3607 case bpf_ctx_range(struct __sk_buff
, data_meta
):
3608 case bpf_ctx_range(struct __sk_buff
, data_end
):
3609 if (size
!= size_default
)
3613 /* Only narrow read access allowed for now. */
3614 if (type
== BPF_WRITE
) {
3615 if (size
!= size_default
)
3618 bpf_ctx_record_field_size(info
, size_default
);
3619 if (!bpf_ctx_narrow_access_ok(off
, size
, size_default
))
3627 static bool sk_filter_is_valid_access(int off
, int size
,
3628 enum bpf_access_type type
,
3629 struct bpf_insn_access_aux
*info
)
3632 case bpf_ctx_range(struct __sk_buff
, tc_classid
):
3633 case bpf_ctx_range(struct __sk_buff
, data
):
3634 case bpf_ctx_range(struct __sk_buff
, data_meta
):
3635 case bpf_ctx_range(struct __sk_buff
, data_end
):
3636 case bpf_ctx_range_till(struct __sk_buff
, family
, local_port
):
3640 if (type
== BPF_WRITE
) {
3642 case bpf_ctx_range_till(struct __sk_buff
, cb
[0], cb
[4]):
3649 return bpf_skb_is_valid_access(off
, size
, type
, info
);
3652 static bool lwt_is_valid_access(int off
, int size
,
3653 enum bpf_access_type type
,
3654 struct bpf_insn_access_aux
*info
)
3657 case bpf_ctx_range(struct __sk_buff
, tc_classid
):
3658 case bpf_ctx_range_till(struct __sk_buff
, family
, local_port
):
3659 case bpf_ctx_range(struct __sk_buff
, data_meta
):
3663 if (type
== BPF_WRITE
) {
3665 case bpf_ctx_range(struct __sk_buff
, mark
):
3666 case bpf_ctx_range(struct __sk_buff
, priority
):
3667 case bpf_ctx_range_till(struct __sk_buff
, cb
[0], cb
[4]):
3675 case bpf_ctx_range(struct __sk_buff
, data
):
3676 info
->reg_type
= PTR_TO_PACKET
;
3678 case bpf_ctx_range(struct __sk_buff
, data_end
):
3679 info
->reg_type
= PTR_TO_PACKET_END
;
3683 return bpf_skb_is_valid_access(off
, size
, type
, info
);
3686 static bool sock_filter_is_valid_access(int off
, int size
,
3687 enum bpf_access_type type
,
3688 struct bpf_insn_access_aux
*info
)
3690 if (type
== BPF_WRITE
) {
3692 case offsetof(struct bpf_sock
, bound_dev_if
):
3693 case offsetof(struct bpf_sock
, mark
):
3694 case offsetof(struct bpf_sock
, priority
):
3701 if (off
< 0 || off
+ size
> sizeof(struct bpf_sock
))
3703 /* The verifier guarantees that size > 0. */
3704 if (off
% size
!= 0)
3706 if (size
!= sizeof(__u32
))
3712 static int bpf_unclone_prologue(struct bpf_insn
*insn_buf
, bool direct_write
,
3713 const struct bpf_prog
*prog
, int drop_verdict
)
3715 struct bpf_insn
*insn
= insn_buf
;
3720 /* if (!skb->cloned)
3723 * (Fast-path, otherwise approximation that we might be
3724 * a clone, do the rest in helper.)
3726 *insn
++ = BPF_LDX_MEM(BPF_B
, BPF_REG_6
, BPF_REG_1
, CLONED_OFFSET());
3727 *insn
++ = BPF_ALU32_IMM(BPF_AND
, BPF_REG_6
, CLONED_MASK
);
3728 *insn
++ = BPF_JMP_IMM(BPF_JEQ
, BPF_REG_6
, 0, 7);
3730 /* ret = bpf_skb_pull_data(skb, 0); */
3731 *insn
++ = BPF_MOV64_REG(BPF_REG_6
, BPF_REG_1
);
3732 *insn
++ = BPF_ALU64_REG(BPF_XOR
, BPF_REG_2
, BPF_REG_2
);
3733 *insn
++ = BPF_RAW_INSN(BPF_JMP
| BPF_CALL
, 0, 0, 0,
3734 BPF_FUNC_skb_pull_data
);
3737 * return TC_ACT_SHOT;
3739 *insn
++ = BPF_JMP_IMM(BPF_JEQ
, BPF_REG_0
, 0, 2);
3740 *insn
++ = BPF_ALU32_IMM(BPF_MOV
, BPF_REG_0
, drop_verdict
);
3741 *insn
++ = BPF_EXIT_INSN();
3744 *insn
++ = BPF_MOV64_REG(BPF_REG_1
, BPF_REG_6
);
3746 *insn
++ = prog
->insnsi
[0];
3748 return insn
- insn_buf
;
3751 static int tc_cls_act_prologue(struct bpf_insn
*insn_buf
, bool direct_write
,
3752 const struct bpf_prog
*prog
)
3754 return bpf_unclone_prologue(insn_buf
, direct_write
, prog
, TC_ACT_SHOT
);
3757 static bool tc_cls_act_is_valid_access(int off
, int size
,
3758 enum bpf_access_type type
,
3759 struct bpf_insn_access_aux
*info
)
3761 if (type
== BPF_WRITE
) {
3763 case bpf_ctx_range(struct __sk_buff
, mark
):
3764 case bpf_ctx_range(struct __sk_buff
, tc_index
):
3765 case bpf_ctx_range(struct __sk_buff
, priority
):
3766 case bpf_ctx_range(struct __sk_buff
, tc_classid
):
3767 case bpf_ctx_range_till(struct __sk_buff
, cb
[0], cb
[4]):
3775 case bpf_ctx_range(struct __sk_buff
, data
):
3776 info
->reg_type
= PTR_TO_PACKET
;
3778 case bpf_ctx_range(struct __sk_buff
, data_meta
):
3779 info
->reg_type
= PTR_TO_PACKET_META
;
3781 case bpf_ctx_range(struct __sk_buff
, data_end
):
3782 info
->reg_type
= PTR_TO_PACKET_END
;
3784 case bpf_ctx_range_till(struct __sk_buff
, family
, local_port
):
3788 return bpf_skb_is_valid_access(off
, size
, type
, info
);
3791 static bool __is_valid_xdp_access(int off
, int size
)
3793 if (off
< 0 || off
>= sizeof(struct xdp_md
))
3795 if (off
% size
!= 0)
3797 if (size
!= sizeof(__u32
))
3803 static bool xdp_is_valid_access(int off
, int size
,
3804 enum bpf_access_type type
,
3805 struct bpf_insn_access_aux
*info
)
3807 if (type
== BPF_WRITE
)
3811 case offsetof(struct xdp_md
, data
):
3812 info
->reg_type
= PTR_TO_PACKET
;
3814 case offsetof(struct xdp_md
, data_meta
):
3815 info
->reg_type
= PTR_TO_PACKET_META
;
3817 case offsetof(struct xdp_md
, data_end
):
3818 info
->reg_type
= PTR_TO_PACKET_END
;
3822 return __is_valid_xdp_access(off
, size
);
3825 void bpf_warn_invalid_xdp_action(u32 act
)
3827 const u32 act_max
= XDP_REDIRECT
;
3829 WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n",
3830 act
> act_max
? "Illegal" : "Driver unsupported",
3833 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action
);
3835 static bool __is_valid_sock_ops_access(int off
, int size
)
3837 if (off
< 0 || off
>= sizeof(struct bpf_sock_ops
))
3839 /* The verifier guarantees that size > 0. */
3840 if (off
% size
!= 0)
3842 if (size
!= sizeof(__u32
))
3848 static bool sock_ops_is_valid_access(int off
, int size
,
3849 enum bpf_access_type type
,
3850 struct bpf_insn_access_aux
*info
)
3852 if (type
== BPF_WRITE
) {
3854 case offsetof(struct bpf_sock_ops
, op
) ...
3855 offsetof(struct bpf_sock_ops
, replylong
[3]):
3862 return __is_valid_sock_ops_access(off
, size
);
3865 static int sk_skb_prologue(struct bpf_insn
*insn_buf
, bool direct_write
,
3866 const struct bpf_prog
*prog
)
3868 return bpf_unclone_prologue(insn_buf
, direct_write
, prog
, SK_DROP
);
3871 static bool sk_skb_is_valid_access(int off
, int size
,
3872 enum bpf_access_type type
,
3873 struct bpf_insn_access_aux
*info
)
3876 case bpf_ctx_range(struct __sk_buff
, tc_classid
):
3877 case bpf_ctx_range(struct __sk_buff
, data_meta
):
3881 if (type
== BPF_WRITE
) {
3883 case bpf_ctx_range(struct __sk_buff
, tc_index
):
3884 case bpf_ctx_range(struct __sk_buff
, priority
):
3892 case bpf_ctx_range(struct __sk_buff
, mark
):
3894 case bpf_ctx_range(struct __sk_buff
, data
):
3895 info
->reg_type
= PTR_TO_PACKET
;
3897 case bpf_ctx_range(struct __sk_buff
, data_end
):
3898 info
->reg_type
= PTR_TO_PACKET_END
;
3902 return bpf_skb_is_valid_access(off
, size
, type
, info
);
3905 static u32
bpf_convert_ctx_access(enum bpf_access_type type
,
3906 const struct bpf_insn
*si
,
3907 struct bpf_insn
*insn_buf
,
3908 struct bpf_prog
*prog
, u32
*target_size
)
3910 struct bpf_insn
*insn
= insn_buf
;
3914 case offsetof(struct __sk_buff
, len
):
3915 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
3916 bpf_target_off(struct sk_buff
, len
, 4,
3920 case offsetof(struct __sk_buff
, protocol
):
3921 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->src_reg
,
3922 bpf_target_off(struct sk_buff
, protocol
, 2,
3926 case offsetof(struct __sk_buff
, vlan_proto
):
3927 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->src_reg
,
3928 bpf_target_off(struct sk_buff
, vlan_proto
, 2,
3932 case offsetof(struct __sk_buff
, priority
):
3933 if (type
== BPF_WRITE
)
3934 *insn
++ = BPF_STX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
3935 bpf_target_off(struct sk_buff
, priority
, 4,
3938 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
3939 bpf_target_off(struct sk_buff
, priority
, 4,
3943 case offsetof(struct __sk_buff
, ingress_ifindex
):
3944 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
3945 bpf_target_off(struct sk_buff
, skb_iif
, 4,
3949 case offsetof(struct __sk_buff
, ifindex
):
3950 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff
, dev
),
3951 si
->dst_reg
, si
->src_reg
,
3952 offsetof(struct sk_buff
, dev
));
3953 *insn
++ = BPF_JMP_IMM(BPF_JEQ
, si
->dst_reg
, 0, 1);
3954 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->dst_reg
,
3955 bpf_target_off(struct net_device
, ifindex
, 4,
3959 case offsetof(struct __sk_buff
, hash
):
3960 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
3961 bpf_target_off(struct sk_buff
, hash
, 4,
3965 case offsetof(struct __sk_buff
, mark
):
3966 if (type
== BPF_WRITE
)
3967 *insn
++ = BPF_STX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
3968 bpf_target_off(struct sk_buff
, mark
, 4,
3971 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
3972 bpf_target_off(struct sk_buff
, mark
, 4,
3976 case offsetof(struct __sk_buff
, pkt_type
):
3978 *insn
++ = BPF_LDX_MEM(BPF_B
, si
->dst_reg
, si
->src_reg
,
3980 *insn
++ = BPF_ALU32_IMM(BPF_AND
, si
->dst_reg
, PKT_TYPE_MAX
);
3981 #ifdef __BIG_ENDIAN_BITFIELD
3982 *insn
++ = BPF_ALU32_IMM(BPF_RSH
, si
->dst_reg
, 5);
3986 case offsetof(struct __sk_buff
, queue_mapping
):
3987 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->src_reg
,
3988 bpf_target_off(struct sk_buff
, queue_mapping
, 2,
3992 case offsetof(struct __sk_buff
, vlan_present
):
3993 case offsetof(struct __sk_buff
, vlan_tci
):
3994 BUILD_BUG_ON(VLAN_TAG_PRESENT
!= 0x1000);
3996 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->src_reg
,
3997 bpf_target_off(struct sk_buff
, vlan_tci
, 2,
3999 if (si
->off
== offsetof(struct __sk_buff
, vlan_tci
)) {
4000 *insn
++ = BPF_ALU32_IMM(BPF_AND
, si
->dst_reg
,
4003 *insn
++ = BPF_ALU32_IMM(BPF_RSH
, si
->dst_reg
, 12);
4004 *insn
++ = BPF_ALU32_IMM(BPF_AND
, si
->dst_reg
, 1);
4008 case offsetof(struct __sk_buff
, cb
[0]) ...
4009 offsetofend(struct __sk_buff
, cb
[4]) - 1:
4010 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb
, data
) < 20);
4011 BUILD_BUG_ON((offsetof(struct sk_buff
, cb
) +
4012 offsetof(struct qdisc_skb_cb
, data
)) %
4015 prog
->cb_access
= 1;
4017 off
-= offsetof(struct __sk_buff
, cb
[0]);
4018 off
+= offsetof(struct sk_buff
, cb
);
4019 off
+= offsetof(struct qdisc_skb_cb
, data
);
4020 if (type
== BPF_WRITE
)
4021 *insn
++ = BPF_STX_MEM(BPF_SIZE(si
->code
), si
->dst_reg
,
4024 *insn
++ = BPF_LDX_MEM(BPF_SIZE(si
->code
), si
->dst_reg
,
4028 case offsetof(struct __sk_buff
, tc_classid
):
4029 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb
, tc_classid
) != 2);
4032 off
-= offsetof(struct __sk_buff
, tc_classid
);
4033 off
+= offsetof(struct sk_buff
, cb
);
4034 off
+= offsetof(struct qdisc_skb_cb
, tc_classid
);
4036 if (type
== BPF_WRITE
)
4037 *insn
++ = BPF_STX_MEM(BPF_H
, si
->dst_reg
,
4040 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
,
4044 case offsetof(struct __sk_buff
, data
):
4045 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff
, data
),
4046 si
->dst_reg
, si
->src_reg
,
4047 offsetof(struct sk_buff
, data
));
4050 case offsetof(struct __sk_buff
, data_meta
):
4052 off
-= offsetof(struct __sk_buff
, data_meta
);
4053 off
+= offsetof(struct sk_buff
, cb
);
4054 off
+= offsetof(struct bpf_skb_data_end
, data_meta
);
4055 *insn
++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si
->dst_reg
,
4059 case offsetof(struct __sk_buff
, data_end
):
4061 off
-= offsetof(struct __sk_buff
, data_end
);
4062 off
+= offsetof(struct sk_buff
, cb
);
4063 off
+= offsetof(struct bpf_skb_data_end
, data_end
);
4064 *insn
++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si
->dst_reg
,
4068 case offsetof(struct __sk_buff
, tc_index
):
4069 #ifdef CONFIG_NET_SCHED
4070 if (type
== BPF_WRITE
)
4071 *insn
++ = BPF_STX_MEM(BPF_H
, si
->dst_reg
, si
->src_reg
,
4072 bpf_target_off(struct sk_buff
, tc_index
, 2,
4075 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->src_reg
,
4076 bpf_target_off(struct sk_buff
, tc_index
, 2,
4080 if (type
== BPF_WRITE
)
4081 *insn
++ = BPF_MOV64_REG(si
->dst_reg
, si
->dst_reg
);
4083 *insn
++ = BPF_MOV64_IMM(si
->dst_reg
, 0);
4087 case offsetof(struct __sk_buff
, napi_id
):
4088 #if defined(CONFIG_NET_RX_BUSY_POLL)
4089 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
4090 bpf_target_off(struct sk_buff
, napi_id
, 4,
4092 *insn
++ = BPF_JMP_IMM(BPF_JGE
, si
->dst_reg
, MIN_NAPI_ID
, 1);
4093 *insn
++ = BPF_MOV64_IMM(si
->dst_reg
, 0);
4096 *insn
++ = BPF_MOV64_IMM(si
->dst_reg
, 0);
4099 case offsetof(struct __sk_buff
, family
):
4100 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
, skc_family
) != 2);
4102 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff
, sk
),
4103 si
->dst_reg
, si
->src_reg
,
4104 offsetof(struct sk_buff
, sk
));
4105 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->dst_reg
,
4106 bpf_target_off(struct sock_common
,
4110 case offsetof(struct __sk_buff
, remote_ip4
):
4111 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
, skc_daddr
) != 4);
4113 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff
, sk
),
4114 si
->dst_reg
, si
->src_reg
,
4115 offsetof(struct sk_buff
, sk
));
4116 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->dst_reg
,
4117 bpf_target_off(struct sock_common
,
4121 case offsetof(struct __sk_buff
, local_ip4
):
4122 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
,
4123 skc_rcv_saddr
) != 4);
4125 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff
, sk
),
4126 si
->dst_reg
, si
->src_reg
,
4127 offsetof(struct sk_buff
, sk
));
4128 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->dst_reg
,
4129 bpf_target_off(struct sock_common
,
4133 case offsetof(struct __sk_buff
, remote_ip6
[0]) ...
4134 offsetof(struct __sk_buff
, remote_ip6
[3]):
4135 #if IS_ENABLED(CONFIG_IPV6)
4136 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
,
4137 skc_v6_daddr
.s6_addr32
[0]) != 4);
4140 off
-= offsetof(struct __sk_buff
, remote_ip6
[0]);
4142 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff
, sk
),
4143 si
->dst_reg
, si
->src_reg
,
4144 offsetof(struct sk_buff
, sk
));
4145 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->dst_reg
,
4146 offsetof(struct sock_common
,
4147 skc_v6_daddr
.s6_addr32
[0]) +
4150 *insn
++ = BPF_MOV32_IMM(si
->dst_reg
, 0);
4153 case offsetof(struct __sk_buff
, local_ip6
[0]) ...
4154 offsetof(struct __sk_buff
, local_ip6
[3]):
4155 #if IS_ENABLED(CONFIG_IPV6)
4156 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
,
4157 skc_v6_rcv_saddr
.s6_addr32
[0]) != 4);
4160 off
-= offsetof(struct __sk_buff
, local_ip6
[0]);
4162 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff
, sk
),
4163 si
->dst_reg
, si
->src_reg
,
4164 offsetof(struct sk_buff
, sk
));
4165 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->dst_reg
,
4166 offsetof(struct sock_common
,
4167 skc_v6_rcv_saddr
.s6_addr32
[0]) +
4170 *insn
++ = BPF_MOV32_IMM(si
->dst_reg
, 0);
4174 case offsetof(struct __sk_buff
, remote_port
):
4175 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
, skc_dport
) != 2);
4177 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff
, sk
),
4178 si
->dst_reg
, si
->src_reg
,
4179 offsetof(struct sk_buff
, sk
));
4180 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->dst_reg
,
4181 bpf_target_off(struct sock_common
,
4184 #ifndef __BIG_ENDIAN_BITFIELD
4185 *insn
++ = BPF_ALU32_IMM(BPF_LSH
, si
->dst_reg
, 16);
4189 case offsetof(struct __sk_buff
, local_port
):
4190 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
, skc_num
) != 2);
4192 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff
, sk
),
4193 si
->dst_reg
, si
->src_reg
,
4194 offsetof(struct sk_buff
, sk
));
4195 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->dst_reg
,
4196 bpf_target_off(struct sock_common
,
4197 skc_num
, 2, target_size
));
4201 return insn
- insn_buf
;
4204 static u32
sock_filter_convert_ctx_access(enum bpf_access_type type
,
4205 const struct bpf_insn
*si
,
4206 struct bpf_insn
*insn_buf
,
4207 struct bpf_prog
*prog
, u32
*target_size
)
4209 struct bpf_insn
*insn
= insn_buf
;
4212 case offsetof(struct bpf_sock
, bound_dev_if
):
4213 BUILD_BUG_ON(FIELD_SIZEOF(struct sock
, sk_bound_dev_if
) != 4);
4215 if (type
== BPF_WRITE
)
4216 *insn
++ = BPF_STX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
4217 offsetof(struct sock
, sk_bound_dev_if
));
4219 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
4220 offsetof(struct sock
, sk_bound_dev_if
));
4223 case offsetof(struct bpf_sock
, mark
):
4224 BUILD_BUG_ON(FIELD_SIZEOF(struct sock
, sk_mark
) != 4);
4226 if (type
== BPF_WRITE
)
4227 *insn
++ = BPF_STX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
4228 offsetof(struct sock
, sk_mark
));
4230 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
4231 offsetof(struct sock
, sk_mark
));
4234 case offsetof(struct bpf_sock
, priority
):
4235 BUILD_BUG_ON(FIELD_SIZEOF(struct sock
, sk_priority
) != 4);
4237 if (type
== BPF_WRITE
)
4238 *insn
++ = BPF_STX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
4239 offsetof(struct sock
, sk_priority
));
4241 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
4242 offsetof(struct sock
, sk_priority
));
4245 case offsetof(struct bpf_sock
, family
):
4246 BUILD_BUG_ON(FIELD_SIZEOF(struct sock
, sk_family
) != 2);
4248 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->src_reg
,
4249 offsetof(struct sock
, sk_family
));
4252 case offsetof(struct bpf_sock
, type
):
4253 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
4254 offsetof(struct sock
, __sk_flags_offset
));
4255 *insn
++ = BPF_ALU32_IMM(BPF_AND
, si
->dst_reg
, SK_FL_TYPE_MASK
);
4256 *insn
++ = BPF_ALU32_IMM(BPF_RSH
, si
->dst_reg
, SK_FL_TYPE_SHIFT
);
4259 case offsetof(struct bpf_sock
, protocol
):
4260 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
4261 offsetof(struct sock
, __sk_flags_offset
));
4262 *insn
++ = BPF_ALU32_IMM(BPF_AND
, si
->dst_reg
, SK_FL_PROTO_MASK
);
4263 *insn
++ = BPF_ALU32_IMM(BPF_RSH
, si
->dst_reg
, SK_FL_PROTO_SHIFT
);
4267 return insn
- insn_buf
;
4270 static u32
tc_cls_act_convert_ctx_access(enum bpf_access_type type
,
4271 const struct bpf_insn
*si
,
4272 struct bpf_insn
*insn_buf
,
4273 struct bpf_prog
*prog
, u32
*target_size
)
4275 struct bpf_insn
*insn
= insn_buf
;
4278 case offsetof(struct __sk_buff
, ifindex
):
4279 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff
, dev
),
4280 si
->dst_reg
, si
->src_reg
,
4281 offsetof(struct sk_buff
, dev
));
4282 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->dst_reg
,
4283 bpf_target_off(struct net_device
, ifindex
, 4,
4287 return bpf_convert_ctx_access(type
, si
, insn_buf
, prog
,
4291 return insn
- insn_buf
;
4294 static u32
xdp_convert_ctx_access(enum bpf_access_type type
,
4295 const struct bpf_insn
*si
,
4296 struct bpf_insn
*insn_buf
,
4297 struct bpf_prog
*prog
, u32
*target_size
)
4299 struct bpf_insn
*insn
= insn_buf
;
4302 case offsetof(struct xdp_md
, data
):
4303 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff
, data
),
4304 si
->dst_reg
, si
->src_reg
,
4305 offsetof(struct xdp_buff
, data
));
4307 case offsetof(struct xdp_md
, data_meta
):
4308 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff
, data_meta
),
4309 si
->dst_reg
, si
->src_reg
,
4310 offsetof(struct xdp_buff
, data_meta
));
4312 case offsetof(struct xdp_md
, data_end
):
4313 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff
, data_end
),
4314 si
->dst_reg
, si
->src_reg
,
4315 offsetof(struct xdp_buff
, data_end
));
4319 return insn
- insn_buf
;
4322 static u32
sock_ops_convert_ctx_access(enum bpf_access_type type
,
4323 const struct bpf_insn
*si
,
4324 struct bpf_insn
*insn_buf
,
4325 struct bpf_prog
*prog
,
4328 struct bpf_insn
*insn
= insn_buf
;
4332 case offsetof(struct bpf_sock_ops
, op
) ...
4333 offsetof(struct bpf_sock_ops
, replylong
[3]):
4334 BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops
, op
) !=
4335 FIELD_SIZEOF(struct bpf_sock_ops_kern
, op
));
4336 BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops
, reply
) !=
4337 FIELD_SIZEOF(struct bpf_sock_ops_kern
, reply
));
4338 BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops
, replylong
) !=
4339 FIELD_SIZEOF(struct bpf_sock_ops_kern
, replylong
));
4341 off
-= offsetof(struct bpf_sock_ops
, op
);
4342 off
+= offsetof(struct bpf_sock_ops_kern
, op
);
4343 if (type
== BPF_WRITE
)
4344 *insn
++ = BPF_STX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
4347 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->src_reg
,
4351 case offsetof(struct bpf_sock_ops
, family
):
4352 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
, skc_family
) != 2);
4354 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
4355 struct bpf_sock_ops_kern
, sk
),
4356 si
->dst_reg
, si
->src_reg
,
4357 offsetof(struct bpf_sock_ops_kern
, sk
));
4358 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->dst_reg
,
4359 offsetof(struct sock_common
, skc_family
));
4362 case offsetof(struct bpf_sock_ops
, remote_ip4
):
4363 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
, skc_daddr
) != 4);
4365 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
4366 struct bpf_sock_ops_kern
, sk
),
4367 si
->dst_reg
, si
->src_reg
,
4368 offsetof(struct bpf_sock_ops_kern
, sk
));
4369 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->dst_reg
,
4370 offsetof(struct sock_common
, skc_daddr
));
4373 case offsetof(struct bpf_sock_ops
, local_ip4
):
4374 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
, skc_rcv_saddr
) != 4);
4376 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
4377 struct bpf_sock_ops_kern
, sk
),
4378 si
->dst_reg
, si
->src_reg
,
4379 offsetof(struct bpf_sock_ops_kern
, sk
));
4380 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->dst_reg
,
4381 offsetof(struct sock_common
,
4385 case offsetof(struct bpf_sock_ops
, remote_ip6
[0]) ...
4386 offsetof(struct bpf_sock_ops
, remote_ip6
[3]):
4387 #if IS_ENABLED(CONFIG_IPV6)
4388 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
,
4389 skc_v6_daddr
.s6_addr32
[0]) != 4);
4392 off
-= offsetof(struct bpf_sock_ops
, remote_ip6
[0]);
4393 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
4394 struct bpf_sock_ops_kern
, sk
),
4395 si
->dst_reg
, si
->src_reg
,
4396 offsetof(struct bpf_sock_ops_kern
, sk
));
4397 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->dst_reg
,
4398 offsetof(struct sock_common
,
4399 skc_v6_daddr
.s6_addr32
[0]) +
4402 *insn
++ = BPF_MOV32_IMM(si
->dst_reg
, 0);
4406 case offsetof(struct bpf_sock_ops
, local_ip6
[0]) ...
4407 offsetof(struct bpf_sock_ops
, local_ip6
[3]):
4408 #if IS_ENABLED(CONFIG_IPV6)
4409 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
,
4410 skc_v6_rcv_saddr
.s6_addr32
[0]) != 4);
4413 off
-= offsetof(struct bpf_sock_ops
, local_ip6
[0]);
4414 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
4415 struct bpf_sock_ops_kern
, sk
),
4416 si
->dst_reg
, si
->src_reg
,
4417 offsetof(struct bpf_sock_ops_kern
, sk
));
4418 *insn
++ = BPF_LDX_MEM(BPF_W
, si
->dst_reg
, si
->dst_reg
,
4419 offsetof(struct sock_common
,
4420 skc_v6_rcv_saddr
.s6_addr32
[0]) +
4423 *insn
++ = BPF_MOV32_IMM(si
->dst_reg
, 0);
4427 case offsetof(struct bpf_sock_ops
, remote_port
):
4428 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
, skc_dport
) != 2);
4430 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
4431 struct bpf_sock_ops_kern
, sk
),
4432 si
->dst_reg
, si
->src_reg
,
4433 offsetof(struct bpf_sock_ops_kern
, sk
));
4434 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->dst_reg
,
4435 offsetof(struct sock_common
, skc_dport
));
4436 #ifndef __BIG_ENDIAN_BITFIELD
4437 *insn
++ = BPF_ALU32_IMM(BPF_LSH
, si
->dst_reg
, 16);
4441 case offsetof(struct bpf_sock_ops
, local_port
):
4442 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common
, skc_num
) != 2);
4444 *insn
++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
4445 struct bpf_sock_ops_kern
, sk
),
4446 si
->dst_reg
, si
->src_reg
,
4447 offsetof(struct bpf_sock_ops_kern
, sk
));
4448 *insn
++ = BPF_LDX_MEM(BPF_H
, si
->dst_reg
, si
->dst_reg
,
4449 offsetof(struct sock_common
, skc_num
));
4452 return insn
- insn_buf
;
4455 static u32
sk_skb_convert_ctx_access(enum bpf_access_type type
,
4456 const struct bpf_insn
*si
,
4457 struct bpf_insn
*insn_buf
,
4458 struct bpf_prog
*prog
, u32
*target_size
)
4460 struct bpf_insn
*insn
= insn_buf
;
4464 case offsetof(struct __sk_buff
, data_end
):
4466 off
-= offsetof(struct __sk_buff
, data_end
);
4467 off
+= offsetof(struct sk_buff
, cb
);
4468 off
+= offsetof(struct tcp_skb_cb
, bpf
.data_end
);
4469 *insn
++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si
->dst_reg
,
4473 return bpf_convert_ctx_access(type
, si
, insn_buf
, prog
,
4477 return insn
- insn_buf
;
4480 const struct bpf_verifier_ops sk_filter_verifier_ops
= {
4481 .get_func_proto
= sk_filter_func_proto
,
4482 .is_valid_access
= sk_filter_is_valid_access
,
4483 .convert_ctx_access
= bpf_convert_ctx_access
,
4486 const struct bpf_prog_ops sk_filter_prog_ops
= {
4489 const struct bpf_verifier_ops tc_cls_act_verifier_ops
= {
4490 .get_func_proto
= tc_cls_act_func_proto
,
4491 .is_valid_access
= tc_cls_act_is_valid_access
,
4492 .convert_ctx_access
= tc_cls_act_convert_ctx_access
,
4493 .gen_prologue
= tc_cls_act_prologue
,
4496 const struct bpf_prog_ops tc_cls_act_prog_ops
= {
4497 .test_run
= bpf_prog_test_run_skb
,
4500 const struct bpf_verifier_ops xdp_verifier_ops
= {
4501 .get_func_proto
= xdp_func_proto
,
4502 .is_valid_access
= xdp_is_valid_access
,
4503 .convert_ctx_access
= xdp_convert_ctx_access
,
4506 const struct bpf_prog_ops xdp_prog_ops
= {
4507 .test_run
= bpf_prog_test_run_xdp
,
4510 const struct bpf_verifier_ops cg_skb_verifier_ops
= {
4511 .get_func_proto
= sk_filter_func_proto
,
4512 .is_valid_access
= sk_filter_is_valid_access
,
4513 .convert_ctx_access
= bpf_convert_ctx_access
,
4516 const struct bpf_prog_ops cg_skb_prog_ops
= {
4517 .test_run
= bpf_prog_test_run_skb
,
4520 const struct bpf_verifier_ops lwt_inout_verifier_ops
= {
4521 .get_func_proto
= lwt_inout_func_proto
,
4522 .is_valid_access
= lwt_is_valid_access
,
4523 .convert_ctx_access
= bpf_convert_ctx_access
,
4526 const struct bpf_prog_ops lwt_inout_prog_ops
= {
4527 .test_run
= bpf_prog_test_run_skb
,
4530 const struct bpf_verifier_ops lwt_xmit_verifier_ops
= {
4531 .get_func_proto
= lwt_xmit_func_proto
,
4532 .is_valid_access
= lwt_is_valid_access
,
4533 .convert_ctx_access
= bpf_convert_ctx_access
,
4534 .gen_prologue
= tc_cls_act_prologue
,
4537 const struct bpf_prog_ops lwt_xmit_prog_ops
= {
4538 .test_run
= bpf_prog_test_run_skb
,
4541 const struct bpf_verifier_ops cg_sock_verifier_ops
= {
4542 .get_func_proto
= sock_filter_func_proto
,
4543 .is_valid_access
= sock_filter_is_valid_access
,
4544 .convert_ctx_access
= sock_filter_convert_ctx_access
,
4547 const struct bpf_prog_ops cg_sock_prog_ops
= {
4550 const struct bpf_verifier_ops sock_ops_verifier_ops
= {
4551 .get_func_proto
= sock_ops_func_proto
,
4552 .is_valid_access
= sock_ops_is_valid_access
,
4553 .convert_ctx_access
= sock_ops_convert_ctx_access
,
4556 const struct bpf_prog_ops sock_ops_prog_ops
= {
4559 const struct bpf_verifier_ops sk_skb_verifier_ops
= {
4560 .get_func_proto
= sk_skb_func_proto
,
4561 .is_valid_access
= sk_skb_is_valid_access
,
4562 .convert_ctx_access
= sk_skb_convert_ctx_access
,
4563 .gen_prologue
= sk_skb_prologue
,
4566 const struct bpf_prog_ops sk_skb_prog_ops
= {
int sk_detach_filter(struct sock *sk)
{
	int ret = -ENOENT;
	struct sk_filter *filter;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	filter = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	if (filter) {
		RCU_INIT_POINTER(sk->sk_filter, NULL);
		sk_filter_uncharge(sk, filter);
		ret = 0;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);

int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
		  unsigned int len)
{
	struct sock_fprog_kern *fprog;
	struct sk_filter *filter;
	int ret = 0;

	lock_sock(sk);
	filter = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	if (!filter)
		goto out;

	/* We're copying the filter that has been originally attached,
	 * so no conversion/decode needed anymore. eBPF programs that
	 * have no original program cannot be dumped through this.
	 */
	ret = -EACCES;
	fprog = filter->prog->orig_prog;
	if (!fprog)
		goto out;

	ret = fprog->len;
	if (!len)
		/* User space only enquires number of filter blocks. */
		goto out;

	ret = -EINVAL;
	if (len < fprog->len)
		goto out;

	ret = -EFAULT;
	if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
		goto out;

	/* Instead of bytes, the API requests to return the number