/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 *	Jay Schulist <jschlst@samba.org>
 *	Alexei Starovoitov <ast@plumgrid.com>
 *	Daniel Borkmann <dborkman@redhat.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/sock_reuseport.h>
/**
 *	sk_filter_trim_cap - run a packet through a socket filter
 *	@sk: sock associated with &sk_buff
 *	@skb: buffer to filter
 *	@cap: limit on how short the eBPF program may trim the packet
 *
 * Run the eBPF program and then cut skb->data to correct size returned by
 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
 * than pkt_len we keep whole skb->data. This is the socket level
 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
 * be accepted or -EPERM if the packet should be tossed.
 */
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
{
	int err;
	struct sk_filter *filter;

	/*
	 * If the skb was allocated from pfmemalloc reserves, only
	 * allow SOCK_MEMALLOC sockets to use it as this socket is
	 * helping free memory
	 */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
		return -ENOMEM;

	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
	if (err)
		return err;

	err = security_sock_rcv_skb(sk, skb);
	if (err)
		return err;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);

		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
	}
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(sk_filter_trim_cap);
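/* Usage sketch (illustrative, not part of this file): most RX-path callers do
 * not call sk_filter_trim_cap() directly but go through the sk_filter()
 * wrapper from <linux/filter.h>, which simply caps the trim at one byte so an
 * accepted packet always keeps at least one byte of data:
 *
 *	static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 *	{
 *		return sk_filter_trim_cap(sk, skb, 1);
 *	}
 */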
BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
{
	return skb_get_poff(skb);
}
BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}
BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = (struct nlattr *) &skb->data[a];
	if (nla->nla_len > skb->len - a)
		return 0;

	nla = nla_find_nested(nla, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}
BPF_CALL_0(__get_raw_cpu_id)
{
	return raw_smp_processor_id();
}

static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
	.func		= __get_raw_cpu_id,
	.ret_type	= RET_INTEGER,
};
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
			      struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	switch (skb_field) {
	case SKF_AD_MARK:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, mark));
		break;

	case SKF_AD_PKTTYPE:
		*insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
#endif
		break;

	case SKF_AD_QUEUE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, queue_mapping));
		break;

	case SKF_AD_VLAN_TAG:
	case SKF_AD_VLAN_TAG_PRESENT:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);

		/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_tci));
		if (skb_field == SKF_AD_VLAN_TAG) {
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
						~VLAN_TAG_PRESENT);
		} else {
			/* dst_reg >>= 12 */
			*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
			/* dst_reg &= 1 */
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
		}
		break;
	}

	return insn - insn_buf;
}
static bool convert_bpf_extensions(struct sock_filter *fp,
				   struct bpf_insn **insnp)
{
	struct bpf_insn *insn = *insnp;
	u32 cnt;

	switch (fp->k) {
	case SKF_AD_OFF + SKF_AD_PROTOCOL:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		/* A = *(u16 *) (CTX + offsetof(protocol)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, protocol));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PKTTYPE:
		cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_IFINDEX:
	case SKF_AD_OFF + SKF_AD_HATYPE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      BPF_REG_TMP, BPF_REG_CTX,
				      offsetof(struct sk_buff, dev));
		/* if (tmp != 0) goto pc + 1 */
		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
		*insn++ = BPF_EXIT_INSN();
		if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, ifindex));
		else
			*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, type));
		break;

	case SKF_AD_OFF + SKF_AD_MARK:
		cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_RXHASH:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);

		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
				    offsetof(struct sk_buff, hash));
		break;

	case SKF_AD_OFF + SKF_AD_QUEUE:
		cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TPID:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		/* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, vlan_proto));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
	case SKF_AD_OFF + SKF_AD_NLATTR:
	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
	case SKF_AD_OFF + SKF_AD_CPU:
	case SKF_AD_OFF + SKF_AD_RANDOM:
		/* arg1 = CTX */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
		/* arg2 = A */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
		/* arg3 = X */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
		/* Emit call(arg1=CTX, arg2=A, arg3=X) */
		switch (fp->k) {
		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
			*insn = BPF_EMIT_CALL(__skb_get_pay_offset);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
			break;
		case SKF_AD_OFF + SKF_AD_CPU:
			*insn = BPF_EMIT_CALL(__get_raw_cpu_id);
			break;
		case SKF_AD_OFF + SKF_AD_RANDOM:
			*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
			bpf_user_rnd_init_once();
			break;
		}
		break;

	case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
		/* A ^= X */
		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
		break;

	default:
		/* This is just a dummy call to avoid letting the compiler
		 * evict __bpf_call_base() as an optimization. Placed here
		 * where no-one bothers.
		 */
		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
		return false;
	}

	*insnp = insn;
	return true;
}
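/* Worked example (illustrative only): a classic "A = skb->protocol" load,
 * i.e. BPF_LD | BPF_ABS with k = SKF_AD_OFF + SKF_AD_PROTOCOL, is rewritten
 * by the SKF_AD_PROTOCOL arm above into roughly these two eBPF instructions:
 *
 *	BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
 *		    offsetof(struct sk_buff, protocol)),
 *	BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16),
 */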
/**
 *	bpf_convert_filter - convert filter program
 *	@prog: the user passed filter program
 *	@len: the length of the user passed filter program
 *	@new_prog: buffer where converted program will be stored
 *	@new_len: pointer to store length of converted program
 *
 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to
 * 'bpf_insn' style extended BPF (eBPF).
 * Conversion workflow:
 *
 * 1) First pass for calculating the new program length:
 *	bpf_convert_filter(old_prog, old_len, NULL, &new_len)
 *
 * 2) Second pass to do the remapping; internally it again runs in two
 *    passes: the first finds the new jump offsets, the second remaps:
 *	new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
 *	bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
			      struct bpf_insn *new_prog, int *new_len)
{
	int new_flen = 0, pass = 0, target, i;
	struct bpf_insn *new_insn;
	struct sock_filter *fp;
	int *addrs = NULL;
	u8 bpf_src;

	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

	if (len <= 0 || len > BPF_MAXINSNS)
		return -EINVAL;

	if (new_prog) {
		addrs = kcalloc(len, sizeof(*addrs),
				GFP_KERNEL | __GFP_NOWARN);
		if (!addrs)
			return -ENOMEM;
	}

do_pass:
	new_insn = new_prog;
	fp = prog;

	/* Classic BPF related prologue emission. */
	if (new_insn) {
		/* Classic BPF expects A and X to be reset first. These need
		 * to be guaranteed to be the first two instructions.
		 */
		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);

		/* All programs must keep CTX in callee saved BPF_REG_CTX.
		 * In eBPF case it's done by the compiler, here we need to
		 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
		 */
		*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
	} else {
		new_insn += 3;
	}

	for (i = 0; i < len; fp++, i++) {
		struct bpf_insn tmp_insns[6] = { };
		struct bpf_insn *insn = tmp_insns;

		if (addrs)
			addrs[i] = new_insn - new_prog;

		switch (fp->code) {
		/* All arithmetic insns and skb loads map as-is. */
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_X:
		case BPF_ALU | BPF_MOD | BPF_K:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_ABS | BPF_W:
		case BPF_LD | BPF_ABS | BPF_H:
		case BPF_LD | BPF_ABS | BPF_B:
		case BPF_LD | BPF_IND | BPF_W:
		case BPF_LD | BPF_IND | BPF_H:
		case BPF_LD | BPF_IND | BPF_B:
			/* Check for overloaded BPF extension and
			 * directly convert it if found, otherwise
			 * just move on with mapping.
			 */
			if (BPF_CLASS(fp->code) == BPF_LD &&
			    BPF_MODE(fp->code) == BPF_ABS &&
			    convert_bpf_extensions(fp, &insn))
				break;

			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
			break;

		/* Jump transformation cannot use BPF block macros
		 * everywhere as offset calculation and target updates
		 * require a bit more work than the rest, i.e. jump
		 * opcodes map as-is, but offsets need adjustment.
		 */

#define BPF_EMIT_JMP							\
	do {								\
		if (target >= len || target < 0)			\
			goto err;					\
		insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;	\
		/* Adjust pc relative offset for 2nd or 3rd insn. */	\
		insn->off -= insn - tmp_insns;				\
	} while (0)

		case BPF_JMP | BPF_JA:
			target = i + fp->k + 1;
			insn->code = fp->code;
			BPF_EMIT_JMP;
			break;

		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
			if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
				/* BPF immediates are signed, zero extend
				 * immediate into tmp register and use it
				 * in compare insn.
				 */
				*insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);

				insn->dst_reg = BPF_REG_A;
				insn->src_reg = BPF_REG_TMP;
				bpf_src = BPF_X;
			} else {
				insn->dst_reg = BPF_REG_A;
				insn->imm = fp->k;
				bpf_src = BPF_SRC(fp->code);
				insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
			}

			/* Common case where 'jump_false' is next insn. */
			if (fp->jf == 0) {
				insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
				target = i + fp->jt + 1;
				BPF_EMIT_JMP;
				break;
			}

			/* Convert JEQ into JNE when 'jump_true' is next insn. */
			if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
				insn->code = BPF_JMP | BPF_JNE | bpf_src;
				target = i + fp->jf + 1;
				BPF_EMIT_JMP;
				break;
			}

			/* Other jumps are mapped into two insns: Jxx and JA. */
			target = i + fp->jt + 1;
			insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
			BPF_EMIT_JMP;
			insn++;

			insn->code = BPF_JMP | BPF_JA;
			target = i + fp->jf + 1;
			BPF_EMIT_JMP;
			break;

		/* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
		case BPF_LDX | BPF_MSH | BPF_B:
			/* tmp = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
			/* A = BPF_R0 = *(u8 *) (skb->data + K) */
			*insn++ = BPF_LD_ABS(BPF_B, fp->k);
			/* A &= 0xf */
			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
			/* A <<= 2 */
			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
			/* X = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			/* A = tmp */
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
			break;

		/* RET_K is remapped into 2 insns. RET_A case doesn't need an
		 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
		 */
		case BPF_RET | BPF_A:
		case BPF_RET | BPF_K:
			if (BPF_RVAL(fp->code) == BPF_K)
				*insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
							0, fp->k);
			*insn = BPF_EXIT_INSN();
			break;

		/* Store to stack. */
		case BPF_ST:
		case BPF_STX:
			*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
					    BPF_ST ? BPF_REG_A : BPF_REG_X,
					    -(BPF_MEMWORDS - fp->k) * 4);
			break;

		/* Load from stack. */
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_FP,
					    -(BPF_MEMWORDS - fp->k) * 4);
			break;

		/* A = K or X = K */
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
			*insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
					      BPF_REG_A : BPF_REG_X, fp->k);
			break;

		/* X = A */
		case BPF_MISC | BPF_TAX:
			*insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			break;

		/* A = X */
		case BPF_MISC | BPF_TXA:
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
			break;

		/* A = skb->len or X = skb->len */
		case BPF_LD | BPF_W | BPF_LEN:
		case BPF_LDX | BPF_W | BPF_LEN:
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
					    offsetof(struct sk_buff, len));
			break;

		/* Access seccomp_data fields. */
		case BPF_LDX | BPF_ABS | BPF_W:
			/* A = *(u32 *) (ctx + K) */
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
			break;

		/* Unknown instruction. */
		default:
			goto err;
		}

		insn++;
		if (new_prog)
			memcpy(new_insn, tmp_insns,
			       sizeof(*insn) * (insn - tmp_insns));
		new_insn += insn - tmp_insns;
	}

	if (!new_prog) {
		/* Only calculating new length. */
		*new_len = new_insn - new_prog;
		return 0;
	}

	pass++;
	if (new_flen != new_insn - new_prog) {
		new_flen = new_insn - new_prog;
		if (pass > 2)
			goto err;
		goto do_pass;
	}

	kfree(addrs);
	BUG_ON(*new_len != new_flen);
	return 0;
err:
	kfree(addrs);
	*new_len = 0;
	return -EINVAL;
}
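/* Worked example (illustrative only): a classic conditional jump such as
 *
 *	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 3)
 *
 * has jt == 0 and jf != 0, so it lands in the "Convert JEQ into JNE" branch
 * above and is emitted as a single eBPF JNE. Its off field is recomputed from
 * addrs[] on the second pass, once the new instruction positions are known.
 */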
/* As we don't want to clear mem[] array for each packet going through
 * __bpf_prog_run(), we check that a filter loaded by the user never tries
 * to read a cell that was not previously written, and we check all branches
 * to be sure a malicious user doesn't try to abuse us.
 */
static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
	u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
	int pc, ret = 0;

	BUILD_BUG_ON(BPF_MEMWORDS > 16);

	masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return -ENOMEM;

	memset(masks, 0xff, flen * sizeof(*masks));

	for (pc = 0; pc < flen; pc++) {
		memvalid &= masks[pc];

		switch (filter[pc].code) {
		case BPF_ST:
		case BPF_STX:
			memvalid |= (1 << filter[pc].k);
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			if (!(memvalid & (1 << filter[pc].k))) {
				ret = -EINVAL;
				goto error;
			}
			break;
		case BPF_JMP | BPF_JA:
			/* A jump must set masks on target */
			masks[pc + 1 + filter[pc].k] &= memvalid;
			memvalid = ~0;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* A jump must set masks on targets */
			masks[pc + 1 + filter[pc].jt] &= memvalid;
			masks[pc + 1 + filter[pc].jf] &= memvalid;
			memvalid = ~0;
			break;
		}
	}
error:
	kfree(masks);
	return ret;
}
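/* Example of what this check rejects (illustrative only): a filter that loads
 * a scratch cell it never stored to, e.g.
 *
 *	BPF_STMT(BPF_LD | BPF_MEM, 3),
 *	BPF_STMT(BPF_RET | BPF_K, 0),
 *
 * is refused because bit 3 of memvalid is still clear when the load is seen.
 */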
static bool chk_code_allowed(u16 code_to_probe)
{
	static const bool codes[] = {
		/* 32 bit ALU operations */
		[BPF_ALU | BPF_ADD | BPF_K] = true,
		[BPF_ALU | BPF_ADD | BPF_X] = true,
		[BPF_ALU | BPF_SUB | BPF_K] = true,
		[BPF_ALU | BPF_SUB | BPF_X] = true,
		[BPF_ALU | BPF_MUL | BPF_K] = true,
		[BPF_ALU | BPF_MUL | BPF_X] = true,
		[BPF_ALU | BPF_DIV | BPF_K] = true,
		[BPF_ALU | BPF_DIV | BPF_X] = true,
		[BPF_ALU | BPF_MOD | BPF_K] = true,
		[BPF_ALU | BPF_MOD | BPF_X] = true,
		[BPF_ALU | BPF_AND | BPF_K] = true,
		[BPF_ALU | BPF_AND | BPF_X] = true,
		[BPF_ALU | BPF_OR | BPF_K] = true,
		[BPF_ALU | BPF_OR | BPF_X] = true,
		[BPF_ALU | BPF_XOR | BPF_K] = true,
		[BPF_ALU | BPF_XOR | BPF_X] = true,
		[BPF_ALU | BPF_LSH | BPF_K] = true,
		[BPF_ALU | BPF_LSH | BPF_X] = true,
		[BPF_ALU | BPF_RSH | BPF_K] = true,
		[BPF_ALU | BPF_RSH | BPF_X] = true,
		[BPF_ALU | BPF_NEG] = true,
		/* Load instructions */
		[BPF_LD | BPF_W | BPF_ABS] = true,
		[BPF_LD | BPF_H | BPF_ABS] = true,
		[BPF_LD | BPF_B | BPF_ABS] = true,
		[BPF_LD | BPF_W | BPF_LEN] = true,
		[BPF_LD | BPF_W | BPF_IND] = true,
		[BPF_LD | BPF_H | BPF_IND] = true,
		[BPF_LD | BPF_B | BPF_IND] = true,
		[BPF_LD | BPF_IMM] = true,
		[BPF_LD | BPF_MEM] = true,
		[BPF_LDX | BPF_W | BPF_LEN] = true,
		[BPF_LDX | BPF_B | BPF_MSH] = true,
		[BPF_LDX | BPF_IMM] = true,
		[BPF_LDX | BPF_MEM] = true,
		/* Store instructions */
		[BPF_ST] = true,
		[BPF_STX] = true,
		/* Misc instructions */
		[BPF_MISC | BPF_TAX] = true,
		[BPF_MISC | BPF_TXA] = true,
		/* Return instructions */
		[BPF_RET | BPF_K] = true,
		[BPF_RET | BPF_A] = true,
		/* Jump instructions */
		[BPF_JMP | BPF_JA] = true,
		[BPF_JMP | BPF_JEQ | BPF_K] = true,
		[BPF_JMP | BPF_JEQ | BPF_X] = true,
		[BPF_JMP | BPF_JGE | BPF_K] = true,
		[BPF_JMP | BPF_JGE | BPF_X] = true,
		[BPF_JMP | BPF_JGT | BPF_K] = true,
		[BPF_JMP | BPF_JGT | BPF_X] = true,
		[BPF_JMP | BPF_JSET | BPF_K] = true,
		[BPF_JMP | BPF_JSET | BPF_X] = true,
	};

	if (code_to_probe >= ARRAY_SIZE(codes))
		return false;

	return codes[code_to_probe];
}
static bool bpf_check_basics_ok(const struct sock_filter *filter,
				unsigned int flen)
{
	if (filter == NULL)
		return false;
	if (flen == 0 || flen > BPF_MAXINSNS)
		return false;

	return true;
}
/**
 *	bpf_check_classic - verify socket filter code
 *	@filter: filter to verify
 *	@flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through kaboom! The filter must contain
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
 *
 * All jumps are forward as they are not signed.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int bpf_check_classic(const struct sock_filter *filter,
			     unsigned int flen)
{
	bool anc_found;
	int pc;

	/* Check the filter code now */
	for (pc = 0; pc < flen; pc++) {
		const struct sock_filter *ftest = &filter[pc];

		/* May we actually operate on this code? */
		if (!chk_code_allowed(ftest->code))
			return -EINVAL;

		/* Some instructions need special checks */
		switch (ftest->code) {
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_K:
			/* Check for division by zero */
			if (ftest->k == 0)
				return -EINVAL;
			break;
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_K:
			if (ftest->k >= 32)
				return -EINVAL;
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
			/* Check for invalid memory addresses */
			if (ftest->k >= BPF_MEMWORDS)
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JA:
			/* Note, the large ftest->k might cause loops.
			 * Compare this with conditional jumps below,
			 * where offsets are limited. --ANK (981016)
			 */
			if (ftest->k >= (unsigned int)(flen - pc - 1))
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* Both conditionals must be safe */
			if (pc + ftest->jt + 1 >= flen ||
			    pc + ftest->jf + 1 >= flen)
				return -EINVAL;
			break;
		case BPF_LD | BPF_W | BPF_ABS:
		case BPF_LD | BPF_H | BPF_ABS:
		case BPF_LD | BPF_B | BPF_ABS:
			anc_found = false;
			if (bpf_anc_helper(ftest) & BPF_ANC)
				anc_found = true;
			/* Ancillary operation unknown or unsupported */
			if (anc_found == false && ftest->k >= SKF_AD_OFF)
				return -EINVAL;
		}
	}

	/* Last instruction must be a RET code */
	switch (filter[flen - 1].code) {
	case BPF_RET | BPF_K:
	case BPF_RET | BPF_A:
		return check_load_and_stores(filter, flen);
	}

	return -EINVAL;
}
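/* Example (illustrative only): the smallest program that passes this check is
 * a single unconditional return, e.g. one that accepts the whole packet:
 *
 *	struct sock_filter prog[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 */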
static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
				      const struct sock_fprog *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct sock_fprog_kern *fkprog;

	fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
	if (!fp->orig_prog)
		return -ENOMEM;

	fkprog = fp->orig_prog;
	fkprog->len = fprog->len;

	fkprog->filter = kmemdup(fp->insns, fsize,
				 GFP_KERNEL | __GFP_NOWARN);
	if (!fkprog->filter) {
		kfree(fp->orig_prog);
		return -ENOMEM;
	}

	return 0;
}
static void bpf_release_orig_filter(struct bpf_prog *fp)
{
	struct sock_fprog_kern *fprog = fp->orig_prog;

	if (fprog) {
		kfree(fprog->filter);
		kfree(fprog);
	}
}
static void __bpf_prog_release(struct bpf_prog *prog)
{
	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(prog);
	} else {
		bpf_release_orig_filter(prog);
		bpf_prog_free(prog);
	}
}
static void __sk_filter_release(struct sk_filter *fp)
{
	__bpf_prog_release(fp->prog);
	kfree(fp);
}

/**
 *	sk_filter_release_rcu - Release a socket filter by rcu_head
 *	@rcu: rcu_head that contains the sk_filter to free
 */
static void sk_filter_release_rcu(struct rcu_head *rcu)
{
	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);

	__sk_filter_release(fp);
}

/**
 *	sk_filter_release - release a socket filter
 *	@fp: filter to remove
 *
 *	Remove a filter from a socket and release its resources.
 */
static void sk_filter_release(struct sk_filter *fp)
{
	if (atomic_dec_and_test(&fp->refcnt))
		call_rcu(&fp->rcu, sk_filter_release_rcu);
}

void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	atomic_sub(filter_size, &sk->sk_omem_alloc);
	sk_filter_release(fp);
}

/* try to charge the socket memory if there is space available
 * return true on success
 */
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	/* same check as in sock_kmalloc() */
	if (filter_size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
		atomic_inc(&fp->refcnt);
		atomic_add(filter_size, &sk->sk_omem_alloc);
		return true;
	}
	return false;
}
static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
{
	struct sock_filter *old_prog;
	struct bpf_prog *old_fp;
	int err, new_len, old_len = fp->len;

	/* We are free to overwrite insns et al right here as it
	 * won't be used at this point in time anymore internally
	 * after the migration to the internal BPF instruction
	 * representation.
	 */
	BUILD_BUG_ON(sizeof(struct sock_filter) !=
		     sizeof(struct bpf_insn));

	/* Conversion cannot happen on overlapping memory areas,
	 * so we need to keep the user BPF around until the 2nd
	 * pass. At this time, the user BPF is stored in fp->insns.
	 */
	old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
			   GFP_KERNEL | __GFP_NOWARN);
	if (!old_prog) {
		err = -ENOMEM;
		goto out_err;
	}

	/* 1st pass: calculate the new program length. */
	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
	if (err)
		goto out_err_free;

	/* Expand fp for appending the new filter representation. */
	old_fp = fp;
	fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
	if (!fp) {
		/* The old_fp is still around in case we couldn't
		 * allocate new memory, so uncharge on that one.
		 */
		fp = old_fp;
		err = -ENOMEM;
		goto out_err_free;
	}

	fp->len = new_len;

	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
	err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
	if (err)
		/* 2nd bpf_convert_filter() can fail only if it fails
		 * to allocate memory, remapping must succeed. Note,
		 * that at this time old_fp has already been released
		 * by krealloc().
		 */
		goto out_err_free;

	/* We are guaranteed to never error here with cBPF to eBPF
	 * transitions, since there's no issue with type compatibility
	 * checks on program arrays.
	 */
	fp = bpf_prog_select_runtime(fp, &err);

	kfree(old_prog);
	return fp;

out_err_free:
	kfree(old_prog);
out_err:
	__bpf_prog_release(fp);
	return ERR_PTR(err);
}
static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
					   bpf_aux_classic_check_t trans)
{
	int err;

	fp->bpf_func = NULL;

	err = bpf_check_classic(fp->insns, fp->len);
	if (err) {
		__bpf_prog_release(fp);
		return ERR_PTR(err);
	}

	/* There might be additional checks and transformations
	 * needed on classic filters, f.e. in case of seccomp.
	 */
	if (trans) {
		err = trans(fp->insns, fp->len);
		if (err) {
			__bpf_prog_release(fp);
			return ERR_PTR(err);
		}
	}

	/* Probe if we can JIT compile the filter and if so, do
	 * the compilation of the filter.
	 */
	bpf_jit_compile(fp);

	/* JIT compiler couldn't process this filter, so do the
	 * internal BPF translation for the optimized interpreter.
	 */
	if (!fp->jited)
		fp = bpf_migrate_filter(fp);

	return fp;
}
/**
 *	bpf_prog_create - create an unattached filter
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *
 * Create a filter independent of any socket. We first run some
 * sanity checks on it to make sure it does not explode on us later.
 * If an error occurs or there is insufficient memory for the filter
 * a negative errno code is returned. On success the return is zero.
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	memcpy(fp->insns, fprog->filter, fsize);

	fp->len = fprog->len;
	/* Since unattached filters are not copied back to user
	 * space through sk_get_filter(), we do not need to hold
	 * a copy here, and can spare us the work.
	 */
	fp->orig_prog = NULL;

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, NULL);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create);
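/* Usage sketch for in-kernel callers (illustrative, error handling trimmed):
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffff),
 *	};
 *	struct sock_fprog_kern fprog = {
 *		.len	= ARRAY_SIZE(insns),
 *		.filter	= insns,
 *	};
 *	struct bpf_prog *prog;
 *
 *	err = bpf_prog_create(&prog, &fprog);
 *	...
 *	res = BPF_PROG_RUN(prog, skb);
 *	...
 *	bpf_prog_destroy(prog);
 */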
/**
 *	bpf_prog_create_from_user - create an unattached filter from user buffer
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *	@trans: post-classic verifier transformation handler
 *	@save_orig: save classic BPF program
 *
 * This function effectively does the same as bpf_prog_create(), only
 * that it builds up its insns buffer from user space provided buffer.
 * It also allows for passing a bpf_aux_classic_check_t handler.
 */
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
			      bpf_aux_classic_check_t trans, bool save_orig)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;
	int err;

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
		__bpf_prog_free(fp);
		return -EFAULT;
	}

	fp->len = fprog->len;
	fp->orig_prog = NULL;

	if (save_orig) {
		err = bpf_prog_store_orig_filter(fp, fprog);
		if (err) {
			__bpf_prog_free(fp);
			return -ENOMEM;
		}
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, trans);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);

void bpf_prog_destroy(struct bpf_prog *fp)
{
	__bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct sk_filter *fp, *old_fp;

	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->prog = prog;
	atomic_set(&fp->refcnt, 0);

	if (!sk_filter_charge(sk, fp)) {
		kfree(fp);
		return -ENOMEM;
	}

	old_fp = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	rcu_assign_pointer(sk->sk_filter, fp);

	if (old_fp)
		sk_filter_uncharge(sk, old_fp);

	return 0;
}
static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct bpf_prog *old_prog;
	int err;

	if (bpf_prog_size(prog->len) > sysctl_optmem_max)
		return -ENOMEM;

	if (sk_unhashed(sk) && sk->sk_reuseport) {
		err = reuseport_alloc(sk);
		if (err)
			return err;
	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
		/* The socket wasn't bound with SO_REUSEPORT */
		return -EINVAL;
	}

	old_prog = reuseport_attach_prog(sk, prog);
	if (old_prog)
		bpf_prog_destroy(old_prog);

	return 0;
}
static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *prog;
	int err;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return ERR_PTR(-EINVAL);

	prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!prog)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
		__bpf_prog_free(prog);
		return ERR_PTR(-EFAULT);
	}

	prog->len = fprog->len;

	err = bpf_prog_store_orig_filter(prog, fprog);
	if (err) {
		__bpf_prog_free(prog);
		return ERR_PTR(-ENOMEM);
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	return bpf_prepare_filter(prog, NULL);
}
/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);
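/* Userspace counterpart (illustrative only): this path is reached through
 * setsockopt(SO_ATTACH_FILTER), e.g.
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffff },
 *	};
 *	struct sock_fprog bpf = {
 *		.len	= sizeof(code) / sizeof(code[0]),
 *		.filter	= code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf));
 */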
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __reuseport_attach_prog(prog, sk);
	if (err < 0) {
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
}
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}
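/* Userspace counterpart (illustrative only): an eBPF socket filter that was
 * loaded beforehand with bpf(BPF_PROG_LOAD, ...) as BPF_PROG_TYPE_SOCKET_FILTER
 * is attached by passing its file descriptor:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd));
 */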
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __reuseport_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}
struct bpf_scratchpad {
	union {
		__be32 diff[MAX_BPF_STACK / sizeof(__be32)];
		u8     buff[MAX_BPF_STACK];
	};
};

static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);

static inline int __bpf_try_make_writable(struct sk_buff *skb,
					  unsigned int write_len)
{
	return skb_ensure_writable(skb, write_len);
}

static inline int bpf_try_make_writable(struct sk_buff *skb,
					unsigned int write_len)
{
	int err = __bpf_try_make_writable(skb, write_len);

	bpf_compute_data_end(skb);
	return err;
}

static int bpf_try_make_head_writable(struct sk_buff *skb)
{
	return bpf_try_make_writable(skb, skb_headlen(skb));
}

static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
	if (skb_at_tc_ingress(skb))
		skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
{
	if (skb_at_tc_ingress(skb))
		skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}
BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
	   const void *, from, u32, len, u64, flags)
{
	void *ptr;

	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
		return -EINVAL;
	if (unlikely(offset > 0xffff))
		return -EFAULT;
	if (unlikely(bpf_try_make_writable(skb, offset + len)))
		return -EFAULT;

	ptr = skb->data + offset;
	if (flags & BPF_F_RECOMPUTE_CSUM)
		__skb_postpull_rcsum(skb, ptr, len, offset);

	memcpy(ptr, from, len);

	if (flags & BPF_F_RECOMPUTE_CSUM)
		__skb_postpush_rcsum(skb, ptr, len, offset);
	if (flags & BPF_F_INVALIDATE_HASH)
		skb_clear_hash(skb);

	return 0;
}

static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
	.func		= bpf_skb_store_bytes,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_STACK,
	.arg4_type	= ARG_CONST_STACK_SIZE,
	.arg5_type	= ARG_ANYTHING,
};
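/* BPF program side (illustrative sketch only): from a tc or socket filter
 * program the helper is invoked as, e.g., overwriting the destination MAC:
 *
 *	char dmac[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 0x01 };
 *	bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest),
 *			    dmac, ETH_ALEN, 0);
 *
 * where skb is the program's context pointer; offset and length are validated
 * against the linear data made writable above before the copy happens.
 */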
1424 BPF_CALL_4(bpf_skb_load_bytes
, const struct sk_buff
*, skb
, u32
, offset
,
1425 void *, to
, u32
, len
)
1429 if (unlikely(offset
> 0xffff))
1432 ptr
= skb_header_pointer(skb
, offset
, len
, to
);
1436 memcpy(to
, ptr
, len
);
1444 static const struct bpf_func_proto bpf_skb_load_bytes_proto
= {
1445 .func
= bpf_skb_load_bytes
,
1447 .ret_type
= RET_INTEGER
,
1448 .arg1_type
= ARG_PTR_TO_CTX
,
1449 .arg2_type
= ARG_ANYTHING
,
1450 .arg3_type
= ARG_PTR_TO_RAW_STACK
,
1451 .arg4_type
= ARG_CONST_STACK_SIZE
,
1454 BPF_CALL_2(bpf_skb_pull_data
, struct sk_buff
*, skb
, u32
, len
)
1456 /* Idea is the following: should the needed direct read/write
1457 * test fail during runtime, we can pull in more data and redo
1458 * again, since implicitly, we invalidate previous checks here.
1460 * Or, since we know how much we need to make read/writeable,
1461 * this can be done once at the program beginning for direct
1462 * access case. By this we overcome limitations of only current
1463 * headroom being accessible.
1465 return bpf_try_make_writable(skb
, len
? : skb_headlen(skb
));
1468 static const struct bpf_func_proto bpf_skb_pull_data_proto
= {
1469 .func
= bpf_skb_pull_data
,
1471 .ret_type
= RET_INTEGER
,
1472 .arg1_type
= ARG_PTR_TO_CTX
,
1473 .arg2_type
= ARG_ANYTHING
,
1476 BPF_CALL_5(bpf_l3_csum_replace
, struct sk_buff
*, skb
, u32
, offset
,
1477 u64
, from
, u64
, to
, u64
, flags
)
1481 if (unlikely(flags
& ~(BPF_F_HDR_FIELD_MASK
)))
1483 if (unlikely(offset
> 0xffff || offset
& 1))
1485 if (unlikely(bpf_try_make_writable(skb
, offset
+ sizeof(*ptr
))))
1488 ptr
= (__sum16
*)(skb
->data
+ offset
);
1489 switch (flags
& BPF_F_HDR_FIELD_MASK
) {
1491 if (unlikely(from
!= 0))
1494 csum_replace_by_diff(ptr
, to
);
1497 csum_replace2(ptr
, from
, to
);
1500 csum_replace4(ptr
, from
, to
);
1509 static const struct bpf_func_proto bpf_l3_csum_replace_proto
= {
1510 .func
= bpf_l3_csum_replace
,
1512 .ret_type
= RET_INTEGER
,
1513 .arg1_type
= ARG_PTR_TO_CTX
,
1514 .arg2_type
= ARG_ANYTHING
,
1515 .arg3_type
= ARG_ANYTHING
,
1516 .arg4_type
= ARG_ANYTHING
,
1517 .arg5_type
= ARG_ANYTHING
,
1520 BPF_CALL_5(bpf_l4_csum_replace
, struct sk_buff
*, skb
, u32
, offset
,
1521 u64
, from
, u64
, to
, u64
, flags
)
1523 bool is_pseudo
= flags
& BPF_F_PSEUDO_HDR
;
1524 bool is_mmzero
= flags
& BPF_F_MARK_MANGLED_0
;
1527 if (unlikely(flags
& ~(BPF_F_MARK_MANGLED_0
| BPF_F_PSEUDO_HDR
|
1528 BPF_F_HDR_FIELD_MASK
)))
1530 if (unlikely(offset
> 0xffff || offset
& 1))
1532 if (unlikely(bpf_try_make_writable(skb
, offset
+ sizeof(*ptr
))))
1535 ptr
= (__sum16
*)(skb
->data
+ offset
);
1536 if (is_mmzero
&& !*ptr
)
1539 switch (flags
& BPF_F_HDR_FIELD_MASK
) {
1541 if (unlikely(from
!= 0))
1544 inet_proto_csum_replace_by_diff(ptr
, skb
, to
, is_pseudo
);
1547 inet_proto_csum_replace2(ptr
, skb
, from
, to
, is_pseudo
);
1550 inet_proto_csum_replace4(ptr
, skb
, from
, to
, is_pseudo
);
1556 if (is_mmzero
&& !*ptr
)
1557 *ptr
= CSUM_MANGLED_0
;
1561 static const struct bpf_func_proto bpf_l4_csum_replace_proto
= {
1562 .func
= bpf_l4_csum_replace
,
1564 .ret_type
= RET_INTEGER
,
1565 .arg1_type
= ARG_PTR_TO_CTX
,
1566 .arg2_type
= ARG_ANYTHING
,
1567 .arg3_type
= ARG_ANYTHING
,
1568 .arg4_type
= ARG_ANYTHING
,
1569 .arg5_type
= ARG_ANYTHING
,
1572 BPF_CALL_5(bpf_csum_diff
, __be32
*, from
, u32
, from_size
,
1573 __be32
*, to
, u32
, to_size
, __wsum
, seed
)
1575 struct bpf_scratchpad
*sp
= this_cpu_ptr(&bpf_sp
);
1576 u32 diff_size
= from_size
+ to_size
;
1579 /* This is quite flexible, some examples:
1581 * from_size == 0, to_size > 0, seed := csum --> pushing data
1582 * from_size > 0, to_size == 0, seed := csum --> pulling data
1583 * from_size > 0, to_size > 0, seed := 0 --> diffing data
1585 * Even for diffing, from_size and to_size don't need to be equal.
1587 if (unlikely(((from_size
| to_size
) & (sizeof(__be32
) - 1)) ||
1588 diff_size
> sizeof(sp
->diff
)))
1591 for (i
= 0; i
< from_size
/ sizeof(__be32
); i
++, j
++)
1592 sp
->diff
[j
] = ~from
[i
];
1593 for (i
= 0; i
< to_size
/ sizeof(__be32
); i
++, j
++)
1594 sp
->diff
[j
] = to
[i
];
1596 return csum_partial(sp
->diff
, diff_size
, seed
);
1599 static const struct bpf_func_proto bpf_csum_diff_proto
= {
1600 .func
= bpf_csum_diff
,
1603 .ret_type
= RET_INTEGER
,
1604 .arg1_type
= ARG_PTR_TO_STACK
,
1605 .arg2_type
= ARG_CONST_STACK_SIZE_OR_ZERO
,
1606 .arg3_type
= ARG_PTR_TO_STACK
,
1607 .arg4_type
= ARG_CONST_STACK_SIZE_OR_ZERO
,
1608 .arg5_type
= ARG_ANYTHING
,
1611 BPF_CALL_2(bpf_csum_update
, struct sk_buff
*, skb
, __wsum
, csum
)
1613 /* The interface is to be used in combination with bpf_csum_diff()
1614 * for direct packet writes. csum rotation for alignment as well
1615 * as emulating csum_sub() can be done from the eBPF program.
1617 if (skb
->ip_summed
== CHECKSUM_COMPLETE
)
1618 return (skb
->csum
= csum_add(skb
->csum
, csum
));
1623 static const struct bpf_func_proto bpf_csum_update_proto
= {
1624 .func
= bpf_csum_update
,
1626 .ret_type
= RET_INTEGER
,
1627 .arg1_type
= ARG_PTR_TO_CTX
,
1628 .arg2_type
= ARG_ANYTHING
,
1631 static inline int __bpf_rx_skb(struct net_device
*dev
, struct sk_buff
*skb
)
1633 return dev_forward_skb(dev
, skb
);
1636 static inline int __bpf_rx_skb_no_mac(struct net_device
*dev
,
1637 struct sk_buff
*skb
)
1639 int ret
= ____dev_forward_skb(dev
, skb
);
1643 ret
= netif_rx(skb
);
1649 static inline int __bpf_tx_skb(struct net_device
*dev
, struct sk_buff
*skb
)
1653 if (unlikely(__this_cpu_read(xmit_recursion
) > XMIT_RECURSION_LIMIT
)) {
1654 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
1661 __this_cpu_inc(xmit_recursion
);
1662 ret
= dev_queue_xmit(skb
);
1663 __this_cpu_dec(xmit_recursion
);
1668 static int __bpf_redirect_no_mac(struct sk_buff
*skb
, struct net_device
*dev
,
1671 /* skb->mac_len is not set on normal egress */
1672 unsigned int mlen
= skb
->network_header
- skb
->mac_header
;
1674 __skb_pull(skb
, mlen
);
1676 /* At ingress, the mac header has already been pulled once.
1677 * At egress, skb_pospull_rcsum has to be done in case that
1678 * the skb is originated from ingress (i.e. a forwarded skb)
1679 * to ensure that rcsum starts at net header.
1681 if (!skb_at_tc_ingress(skb
))
1682 skb_postpull_rcsum(skb
, skb_mac_header(skb
), mlen
);
1683 skb_pop_mac_header(skb
);
1684 skb_reset_mac_len(skb
);
1685 return flags
& BPF_F_INGRESS
?
1686 __bpf_rx_skb_no_mac(dev
, skb
) : __bpf_tx_skb(dev
, skb
);
1689 static int __bpf_redirect_common(struct sk_buff
*skb
, struct net_device
*dev
,
1692 /* Verify that a link layer header is carried */
1693 if (unlikely(skb
->mac_header
>= skb
->network_header
)) {
1698 bpf_push_mac_rcsum(skb
);
1699 return flags
& BPF_F_INGRESS
?
1700 __bpf_rx_skb(dev
, skb
) : __bpf_tx_skb(dev
, skb
);
1703 static int __bpf_redirect(struct sk_buff
*skb
, struct net_device
*dev
,
1706 if (dev_is_mac_header_xmit(dev
))
1707 return __bpf_redirect_common(skb
, dev
, flags
);
1709 return __bpf_redirect_no_mac(skb
, dev
, flags
);
1712 BPF_CALL_3(bpf_clone_redirect
, struct sk_buff
*, skb
, u32
, ifindex
, u64
, flags
)
1714 struct net_device
*dev
;
1715 struct sk_buff
*clone
;
1718 if (unlikely(flags
& ~(BPF_F_INGRESS
)))
1721 dev
= dev_get_by_index_rcu(dev_net(skb
->dev
), ifindex
);
1725 clone
= skb_clone(skb
, GFP_ATOMIC
);
1726 if (unlikely(!clone
))
1729 /* For direct write, we need to keep the invariant that the skbs
1730 * we're dealing with need to be uncloned. Should uncloning fail
1731 * here, we need to free the just generated clone to unclone once
1734 ret
= bpf_try_make_head_writable(skb
);
1735 if (unlikely(ret
)) {
1740 return __bpf_redirect(clone
, dev
, flags
);
1743 static const struct bpf_func_proto bpf_clone_redirect_proto
= {
1744 .func
= bpf_clone_redirect
,
1746 .ret_type
= RET_INTEGER
,
1747 .arg1_type
= ARG_PTR_TO_CTX
,
1748 .arg2_type
= ARG_ANYTHING
,
1749 .arg3_type
= ARG_ANYTHING
,
1752 struct redirect_info
{
1757 static DEFINE_PER_CPU(struct redirect_info
, redirect_info
);
1759 BPF_CALL_2(bpf_redirect
, u32
, ifindex
, u64
, flags
)
1761 struct redirect_info
*ri
= this_cpu_ptr(&redirect_info
);
1763 if (unlikely(flags
& ~(BPF_F_INGRESS
)))
1766 ri
->ifindex
= ifindex
;
1769 return TC_ACT_REDIRECT
;
1772 int skb_do_redirect(struct sk_buff
*skb
)
1774 struct redirect_info
*ri
= this_cpu_ptr(&redirect_info
);
1775 struct net_device
*dev
;
1777 dev
= dev_get_by_index_rcu(dev_net(skb
->dev
), ri
->ifindex
);
1779 if (unlikely(!dev
)) {
1784 return __bpf_redirect(skb
, dev
, ri
->flags
);
1787 static const struct bpf_func_proto bpf_redirect_proto
= {
1788 .func
= bpf_redirect
,
1790 .ret_type
= RET_INTEGER
,
1791 .arg1_type
= ARG_ANYTHING
,
1792 .arg2_type
= ARG_ANYTHING
,
1795 BPF_CALL_1(bpf_get_cgroup_classid
, const struct sk_buff
*, skb
)
1797 return task_get_classid(skb
);
1800 static const struct bpf_func_proto bpf_get_cgroup_classid_proto
= {
1801 .func
= bpf_get_cgroup_classid
,
1803 .ret_type
= RET_INTEGER
,
1804 .arg1_type
= ARG_PTR_TO_CTX
,
1807 BPF_CALL_1(bpf_get_route_realm
, const struct sk_buff
*, skb
)
1809 return dst_tclassid(skb
);
1812 static const struct bpf_func_proto bpf_get_route_realm_proto
= {
1813 .func
= bpf_get_route_realm
,
1815 .ret_type
= RET_INTEGER
,
1816 .arg1_type
= ARG_PTR_TO_CTX
,
1819 BPF_CALL_1(bpf_get_hash_recalc
, struct sk_buff
*, skb
)
1821 /* If skb_clear_hash() was called due to mangling, we can
1822 * trigger SW recalculation here. Later access to hash
1823 * can then use the inline skb->hash via context directly
1824 * instead of calling this helper again.
1826 return skb_get_hash(skb
);
1829 static const struct bpf_func_proto bpf_get_hash_recalc_proto
= {
1830 .func
= bpf_get_hash_recalc
,
1832 .ret_type
= RET_INTEGER
,
1833 .arg1_type
= ARG_PTR_TO_CTX
,
1836 BPF_CALL_1(bpf_set_hash_invalid
, struct sk_buff
*, skb
)
1838 /* After all direct packet write, this can be used once for
1839 * triggering a lazy recalc on next skb_get_hash() invocation.
1841 skb_clear_hash(skb
);
1845 static const struct bpf_func_proto bpf_set_hash_invalid_proto
= {
1846 .func
= bpf_set_hash_invalid
,
1848 .ret_type
= RET_INTEGER
,
1849 .arg1_type
= ARG_PTR_TO_CTX
,
1852 BPF_CALL_3(bpf_skb_vlan_push
, struct sk_buff
*, skb
, __be16
, vlan_proto
,
1857 if (unlikely(vlan_proto
!= htons(ETH_P_8021Q
) &&
1858 vlan_proto
!= htons(ETH_P_8021AD
)))
1859 vlan_proto
= htons(ETH_P_8021Q
);
1861 bpf_push_mac_rcsum(skb
);
1862 ret
= skb_vlan_push(skb
, vlan_proto
, vlan_tci
);
1863 bpf_pull_mac_rcsum(skb
);
1865 bpf_compute_data_end(skb
);
1869 const struct bpf_func_proto bpf_skb_vlan_push_proto
= {
1870 .func
= bpf_skb_vlan_push
,
1872 .ret_type
= RET_INTEGER
,
1873 .arg1_type
= ARG_PTR_TO_CTX
,
1874 .arg2_type
= ARG_ANYTHING
,
1875 .arg3_type
= ARG_ANYTHING
,
1877 EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto
);
1879 BPF_CALL_1(bpf_skb_vlan_pop
, struct sk_buff
*, skb
)
1883 bpf_push_mac_rcsum(skb
);
1884 ret
= skb_vlan_pop(skb
);
1885 bpf_pull_mac_rcsum(skb
);
1887 bpf_compute_data_end(skb
);
1891 const struct bpf_func_proto bpf_skb_vlan_pop_proto
= {
1892 .func
= bpf_skb_vlan_pop
,
1894 .ret_type
= RET_INTEGER
,
1895 .arg1_type
= ARG_PTR_TO_CTX
,
1897 EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto
);
1899 static int bpf_skb_generic_push(struct sk_buff
*skb
, u32 off
, u32 len
)
1901 /* Caller already did skb_cow() with len as headroom,
1902 * so no need to do it here.
1905 memmove(skb
->data
, skb
->data
+ len
, off
);
1906 memset(skb
->data
+ off
, 0, len
);
1908 /* No skb_postpush_rcsum(skb, skb->data + off, len)
1909 * needed here as it does not change the skb->csum
1910 * result for checksum complete when summing over
1916 static int bpf_skb_generic_pop(struct sk_buff
*skb
, u32 off
, u32 len
)
1918 /* skb_ensure_writable() is not needed here, as we're
1919 * already working on an uncloned skb.
1921 if (unlikely(!pskb_may_pull(skb
, off
+ len
)))
1924 skb_postpull_rcsum(skb
, skb
->data
+ off
, len
);
1925 memmove(skb
->data
+ len
, skb
->data
, off
);
1926 __skb_pull(skb
, len
);
1931 static int bpf_skb_net_hdr_push(struct sk_buff
*skb
, u32 off
, u32 len
)
1933 bool trans_same
= skb
->transport_header
== skb
->network_header
;
1936 /* There's no need for __skb_push()/__skb_pull() pair to
1937 * get to the start of the mac header as we're guaranteed
1938 * to always start from here under eBPF.
1940 ret
= bpf_skb_generic_push(skb
, off
, len
);
1942 skb
->mac_header
-= len
;
1943 skb
->network_header
-= len
;
1945 skb
->transport_header
= skb
->network_header
;
1951 static int bpf_skb_net_hdr_pop(struct sk_buff
*skb
, u32 off
, u32 len
)
1953 bool trans_same
= skb
->transport_header
== skb
->network_header
;
1956 /* Same here, __skb_push()/__skb_pull() pair not needed. */
1957 ret
= bpf_skb_generic_pop(skb
, off
, len
);
1959 skb
->mac_header
+= len
;
1960 skb
->network_header
+= len
;
1962 skb
->transport_header
= skb
->network_header
;
1968 static int bpf_skb_proto_4_to_6(struct sk_buff
*skb
)
1970 const u32 len_diff
= sizeof(struct ipv6hdr
) - sizeof(struct iphdr
);
1971 u32 off
= skb
->network_header
- skb
->mac_header
;
1974 ret
= skb_cow(skb
, len_diff
);
1975 if (unlikely(ret
< 0))
1978 ret
= bpf_skb_net_hdr_push(skb
, off
, len_diff
);
1979 if (unlikely(ret
< 0))
1982 if (skb_is_gso(skb
)) {
1983 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to
1984 * be changed into SKB_GSO_TCPV6.
1986 if (skb_shinfo(skb
)->gso_type
& SKB_GSO_TCPV4
) {
1987 skb_shinfo(skb
)->gso_type
&= ~SKB_GSO_TCPV4
;
1988 skb_shinfo(skb
)->gso_type
|= SKB_GSO_TCPV6
;
1991 /* Due to IPv6 header, MSS needs to be downgraded. */
1992 skb_shinfo(skb
)->gso_size
-= len_diff
;
1993 /* Header must be checked, and gso_segs recomputed. */
1994 skb_shinfo(skb
)->gso_type
|= SKB_GSO_DODGY
;
1995 skb_shinfo(skb
)->gso_segs
= 0;
1998 skb
->protocol
= htons(ETH_P_IPV6
);
1999 skb_clear_hash(skb
);
2004 static int bpf_skb_proto_6_to_4(struct sk_buff
*skb
)
2006 const u32 len_diff
= sizeof(struct ipv6hdr
) - sizeof(struct iphdr
);
2007 u32 off
= skb
->network_header
- skb
->mac_header
;
2010 ret
= skb_unclone(skb
, GFP_ATOMIC
);
2011 if (unlikely(ret
< 0))
2014 ret
= bpf_skb_net_hdr_pop(skb
, off
, len_diff
);
2015 if (unlikely(ret
< 0))
2018 if (skb_is_gso(skb
)) {
2019 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to
2020 * be changed into SKB_GSO_TCPV4.
2022 if (skb_shinfo(skb
)->gso_type
& SKB_GSO_TCPV6
) {
2023 skb_shinfo(skb
)->gso_type
&= ~SKB_GSO_TCPV6
;
2024 skb_shinfo(skb
)->gso_type
|= SKB_GSO_TCPV4
;
2027 /* Due to IPv4 header, MSS can be upgraded. */
2028 skb_shinfo(skb
)->gso_size
+= len_diff
;
2029 /* Header must be checked, and gso_segs recomputed. */
2030 skb_shinfo(skb
)->gso_type
|= SKB_GSO_DODGY
;
2031 skb_shinfo(skb
)->gso_segs
= 0;
2034 skb
->protocol
= htons(ETH_P_IP
);
2035 skb_clear_hash(skb
);
2040 static int bpf_skb_proto_xlat(struct sk_buff
*skb
, __be16 to_proto
)
2042 __be16 from_proto
= skb
->protocol
;
2044 if (from_proto
== htons(ETH_P_IP
) &&
2045 to_proto
== htons(ETH_P_IPV6
))
2046 return bpf_skb_proto_4_to_6(skb
);
2048 if (from_proto
== htons(ETH_P_IPV6
) &&
2049 to_proto
== htons(ETH_P_IP
))
2050 return bpf_skb_proto_6_to_4(skb
);
2055 BPF_CALL_3(bpf_skb_change_proto
, struct sk_buff
*, skb
, __be16
, proto
,
2060 if (unlikely(flags
))
2063 /* General idea is that this helper does the basic groundwork
2064 * needed for changing the protocol, and eBPF program fills the
2065 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
2066 * and other helpers, rather than passing a raw buffer here.
2068 * The rationale is to keep this minimal and without a need to
2069 * deal with raw packet data. F.e. even if we would pass buffers
2070 * here, the program still needs to call the bpf_lX_csum_replace()
2071 * helpers anyway. Plus, this way we keep also separation of
2072 * concerns, since f.e. bpf_skb_store_bytes() should only take
2075 * Currently, additional options and extension header space are
2076 * not supported, but flags register is reserved so we can adapt
2077 * that. For offloads, we mark packet as dodgy, so that headers
2078 * need to be verified first.
2080 ret
= bpf_skb_proto_xlat(skb
, proto
);
2081 bpf_compute_data_end(skb
);
2085 static const struct bpf_func_proto bpf_skb_change_proto_proto
= {
2086 .func
= bpf_skb_change_proto
,
2088 .ret_type
= RET_INTEGER
,
2089 .arg1_type
= ARG_PTR_TO_CTX
,
2090 .arg2_type
= ARG_ANYTHING
,
2091 .arg3_type
= ARG_ANYTHING
,
2094 BPF_CALL_2(bpf_skb_change_type
, struct sk_buff
*, skb
, u32
, pkt_type
)
2096 /* We only allow a restricted subset to be changed for now. */
2097 if (unlikely(!skb_pkt_type_ok(skb
->pkt_type
) ||
2098 !skb_pkt_type_ok(pkt_type
)))
2101 skb
->pkt_type
= pkt_type
;
2105 static const struct bpf_func_proto bpf_skb_change_type_proto
= {
2106 .func
= bpf_skb_change_type
,
2108 .ret_type
= RET_INTEGER
,
2109 .arg1_type
= ARG_PTR_TO_CTX
,
2110 .arg2_type
= ARG_ANYTHING
,
2113 static u32
__bpf_skb_min_len(const struct sk_buff
*skb
)
2115 u32 min_len
= skb_network_offset(skb
);
2117 if (skb_transport_header_was_set(skb
))
2118 min_len
= skb_transport_offset(skb
);
2119 if (skb
->ip_summed
== CHECKSUM_PARTIAL
)
2120 min_len
= skb_checksum_start_offset(skb
) +
2121 skb
->csum_offset
+ sizeof(__sum16
);
2125 static u32
__bpf_skb_max_len(const struct sk_buff
*skb
)
2127 return skb
->dev
->mtu
+ skb
->dev
->hard_header_len
;
2130 static int bpf_skb_grow_rcsum(struct sk_buff
*skb
, unsigned int new_len
)
2132 unsigned int old_len
= skb
->len
;
2135 ret
= __skb_grow_rcsum(skb
, new_len
);
2137 memset(skb
->data
+ old_len
, 0, new_len
- old_len
);
2141 static int bpf_skb_trim_rcsum(struct sk_buff
*skb
, unsigned int new_len
)
2143 return __skb_trim_rcsum(skb
, new_len
);
2146 BPF_CALL_3(bpf_skb_change_tail
, struct sk_buff
*, skb
, u32
, new_len
,
2149 u32 max_len
= __bpf_skb_max_len(skb
);
2150 u32 min_len
= __bpf_skb_min_len(skb
);
2153 if (unlikely(flags
|| new_len
> max_len
|| new_len
< min_len
))
2155 if (skb
->encapsulation
)
2158 /* The basic idea of this helper is that it's performing the
2159 * needed work to either grow or trim an skb, and eBPF program
2160 * rewrites the rest via helpers like bpf_skb_store_bytes(),
2161 * bpf_lX_csum_replace() and others rather than passing a raw
2162 * buffer here. This one is a slow path helper and intended
2163 * for replies with control messages.
2165 * Like in bpf_skb_change_proto(), we want to keep this rather
2166 * minimal and without protocol specifics so that we are able
2167 * to separate concerns as in bpf_skb_store_bytes() should only
2168 * be the one responsible for writing buffers.
2170 * It's really expected to be a slow path operation here for
2171 * control message replies, so we're implicitly linearizing,
2172 * uncloning and drop offloads from the skb by this.
2174 ret
= __bpf_try_make_writable(skb
, skb
->len
);
2176 if (new_len
> skb
->len
)
2177 ret
= bpf_skb_grow_rcsum(skb
, new_len
);
2178 else if (new_len
< skb
->len
)
2179 ret
= bpf_skb_trim_rcsum(skb
, new_len
);
2180 if (!ret
&& skb_is_gso(skb
))
2184 bpf_compute_data_end(skb
);
2188 static const struct bpf_func_proto bpf_skb_change_tail_proto
= {
2189 .func
= bpf_skb_change_tail
,
2191 .ret_type
= RET_INTEGER
,
2192 .arg1_type
= ARG_PTR_TO_CTX
,
2193 .arg2_type
= ARG_ANYTHING
,
2194 .arg3_type
= ARG_ANYTHING
,
2197 BPF_CALL_3(bpf_skb_change_head
, struct sk_buff
*, skb
, u32
, head_room
,
2200 u32 max_len
= __bpf_skb_max_len(skb
);
2201 u32 new_len
= skb
->len
+ head_room
;
2204 if (unlikely(flags
|| (!skb_is_gso(skb
) && new_len
> max_len
) ||
2205 new_len
< skb
->len
))
2208 ret
= skb_cow(skb
, head_room
);
2210 /* Idea for this helper is that we currently only
2211 * allow to expand on mac header. This means that
2212 * skb->protocol network header, etc, stay as is.
2213 * Compared to bpf_skb_change_tail(), we're more
2214 * flexible due to not needing to linearize or
2215 * reset GSO. Intention for this helper is to be
2216 * used by an L3 skb that needs to push mac header
2217 * for redirection into L2 device.
2219 __skb_push(skb
, head_room
);
2220 memset(skb
->data
, 0, head_room
);
2221 skb_reset_mac_header(skb
);
2224 bpf_compute_data_end(skb
);
2228 static const struct bpf_func_proto bpf_skb_change_head_proto
= {
2229 .func
= bpf_skb_change_head
,
2231 .ret_type
= RET_INTEGER
,
2232 .arg1_type
= ARG_PTR_TO_CTX
,
2233 .arg2_type
= ARG_ANYTHING
,
2234 .arg3_type
= ARG_ANYTHING
,
2237 BPF_CALL_2(bpf_xdp_adjust_head
, struct xdp_buff
*, xdp
, int, offset
)
2239 void *data
= xdp
->data
+ offset
;
2241 if (unlikely(data
< xdp
->data_hard_start
||
2242 data
> xdp
->data_end
- ETH_HLEN
))
2250 static const struct bpf_func_proto bpf_xdp_adjust_head_proto
= {
2251 .func
= bpf_xdp_adjust_head
,
2253 .ret_type
= RET_INTEGER
,
2254 .arg1_type
= ARG_PTR_TO_CTX
,
2255 .arg2_type
= ARG_ANYTHING
,
2258 bool bpf_helper_changes_pkt_data(void *func
)
2260 if (func
== bpf_skb_vlan_push
||
2261 func
== bpf_skb_vlan_pop
||
2262 func
== bpf_skb_store_bytes
||
2263 func
== bpf_skb_change_proto
||
2264 func
== bpf_skb_change_head
||
2265 func
== bpf_skb_change_tail
||
2266 func
== bpf_skb_pull_data
||
2267 func
== bpf_l3_csum_replace
||
2268 func
== bpf_l4_csum_replace
||
2269 func
== bpf_xdp_adjust_head
)
2275 static unsigned long bpf_skb_copy(void *dst_buff
, const void *skb
,
2276 unsigned long off
, unsigned long len
)
2278 void *ptr
= skb_header_pointer(skb
, off
, len
, dst_buff
);
2282 if (ptr
!= dst_buff
)
2283 memcpy(dst_buff
, ptr
, len
);
2288 BPF_CALL_5(bpf_skb_event_output
, struct sk_buff
*, skb
, struct bpf_map
*, map
,
2289 u64
, flags
, void *, meta
, u64
, meta_size
)
2291 u64 skb_size
= (flags
& BPF_F_CTXLEN_MASK
) >> 32;
2293 if (unlikely(flags
& ~(BPF_F_CTXLEN_MASK
| BPF_F_INDEX_MASK
)))
2295 if (unlikely(skb_size
> skb
->len
))
2298 return bpf_event_output(map
, flags
, meta
, meta_size
, skb
, skb_size
,
2302 static const struct bpf_func_proto bpf_skb_event_output_proto
= {
2303 .func
= bpf_skb_event_output
,
2305 .ret_type
= RET_INTEGER
,
2306 .arg1_type
= ARG_PTR_TO_CTX
,
2307 .arg2_type
= ARG_CONST_MAP_PTR
,
2308 .arg3_type
= ARG_ANYTHING
,
2309 .arg4_type
= ARG_PTR_TO_STACK
,
2310 .arg5_type
= ARG_CONST_STACK_SIZE
,
static unsigned short bpf_tunnel_key_af(u64 flags)
{
	return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
}

BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
	   u32, size, u64, flags)
{
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
	u8 compat[sizeof(struct bpf_tunnel_key)];
	void *to_orig = to;
	int err;

	if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
		err = -EINVAL;
		goto err_clear;
	}
	if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
		err = -EPROTO;
		goto err_clear;
	}
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		err = -EINVAL;
		switch (size) {
		case offsetof(struct bpf_tunnel_key, tunnel_label):
		case offsetof(struct bpf_tunnel_key, tunnel_ext):
			goto set_compat;
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			if (ip_tunnel_info_af(info) != AF_INET)
				goto err_clear;
set_compat:
			to = (struct bpf_tunnel_key *)compat;
			break;
		default:
			goto err_clear;
		}
	}

	to->tunnel_id = be64_to_cpu(info->key.tun_id);
	to->tunnel_tos = info->key.tos;
	to->tunnel_ttl = info->key.ttl;

	if (flags & BPF_F_TUNINFO_IPV6) {
		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
		       sizeof(to->remote_ipv6));
		to->tunnel_label = be32_to_cpu(info->key.label);
	} else {
		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
	}

	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
		memcpy(to_orig, to, size);

	return 0;
err_clear:
	memset(to_orig, 0, size);
	return err;
}

static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
	.func		= bpf_skb_get_tunnel_key,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_RAW_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
	.arg4_type	= ARG_ANYTHING,
};
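/* Illustrative sketch, not part of this file: a hypothetical tc classifier
 * attached to a collect_md tunnel device reading the receive-side tunnel
 * metadata. The tunnel id and address below are placeholders; remote_ipv4
 * is returned in host byte order.
 *
 *	SEC("classifier")
 *	int cls_tun_rx(struct __sk_buff *skb)
 *	{
 *		struct bpf_tunnel_key key = {};
 *
 *		if (bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0))
 *			return TC_ACT_SHOT;
 *		if (key.tunnel_id == 42 && key.remote_ipv4 == 0x0a000001)
 *			return TC_ACT_OK;
 *		return TC_ACT_SHOT;
 *	}
 */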
BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
{
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
	int err;

	if (unlikely(!info ||
		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
		err = -ENOENT;
		goto err_clear;
	}
	if (unlikely(size < info->options_len)) {
		err = -ENOMEM;
		goto err_clear;
	}

	ip_tunnel_info_opts_get(to, info);
	if (size > info->options_len)
		memset(to + info->options_len, 0, size - info->options_len);

	return info->options_len;
err_clear:
	memset(to, 0, size);
	return err;
}

static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
	.func		= bpf_skb_get_tunnel_opt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_RAW_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
};
static struct metadata_dst __percpu *md_dst;
BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
	   const struct bpf_tunnel_key *, from, u32, size, u64, flags)
{
	struct metadata_dst *md = this_cpu_ptr(md_dst);
	u8 compat[sizeof(struct bpf_tunnel_key)];
	struct ip_tunnel_info *info;

	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
			       BPF_F_DONT_FRAGMENT)))
		return -EINVAL;
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		switch (size) {
		case offsetof(struct bpf_tunnel_key, tunnel_label):
		case offsetof(struct bpf_tunnel_key, tunnel_ext):
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			memcpy(compat, from, size);
			memset(compat + size, 0, sizeof(compat) - size);
			from = (const struct bpf_tunnel_key *) compat;
			break;
		default:
			return -EINVAL;
		}
	}

	if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
		     from->tunnel_ext))
		return -EINVAL;

	skb_dst_drop(skb);
	dst_hold((struct dst_entry *) md);
	skb_dst_set(skb, (struct dst_entry *) md);

	info = &md->u.tun_info;
	info->mode = IP_TUNNEL_INFO_TX;

	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
	if (flags & BPF_F_DONT_FRAGMENT)
		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;

	info->key.tun_id = cpu_to_be64(from->tunnel_id);
	info->key.tos = from->tunnel_tos;
	info->key.ttl = from->tunnel_ttl;

	if (flags & BPF_F_TUNINFO_IPV6) {
		info->mode |= IP_TUNNEL_INFO_IPV6;
		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
		       sizeof(from->remote_ipv6));
		info->key.label = cpu_to_be32(from->tunnel_label) &
				  IPV6_FLOWLABEL_MASK;
	} else {
		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
		if (flags & BPF_F_ZERO_CSUM_TX)
			info->key.tun_flags &= ~TUNNEL_CSUM;
	}

	return 0;
}

static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
	.func		= bpf_skb_set_tunnel_key,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
	.arg4_type	= ARG_ANYTHING,
};
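/* Illustrative sketch, not part of this file: the egress counterpart sets
 * the transmit tunnel metadata before the packet reaches a collect_md
 * tunnel device. Values are placeholders; remote_ipv4 is given in host
 * byte order (here 10.0.0.1) and converted by the helper.
 *
 *	SEC("classifier")
 *	int cls_tun_tx(struct __sk_buff *skb)
 *	{
 *		struct bpf_tunnel_key key = {
 *			.tunnel_id	= 42,
 *			.remote_ipv4	= 0x0a000001,
 *			.tunnel_ttl	= 64,
 *		};
 *
 *		if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
 *					   BPF_F_ZERO_CSUM_TX))
 *			return TC_ACT_SHOT;
 *		return TC_ACT_OK;
 *	}
 */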
BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
	   const u8 *, from, u32, size)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	const struct metadata_dst *md = this_cpu_ptr(md_dst);

	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
		return -EINVAL;
	if (unlikely(size > IP_TUNNEL_OPTS_MAX))
		return -ENOMEM;

	ip_tunnel_info_opts_set(info, from, size);

	return 0;
}

static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
	.func		= bpf_skb_set_tunnel_opt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
};
static const struct bpf_func_proto *
bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
	if (!md_dst) {
		/* Race is not possible, since it's called from verifier
		 * that is holding verifier mutex.
		 */
		md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
						   GFP_KERNEL);
		if (!md_dst)
			return NULL;
	}

	switch (which) {
	case BPF_FUNC_skb_set_tunnel_key:
		return &bpf_skb_set_tunnel_key_proto;
	case BPF_FUNC_skb_set_tunnel_opt:
		return &bpf_skb_set_tunnel_opt_proto;
	default:
		return NULL;
	}
}
BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
	   u32, idx)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct cgroup *cgrp;
	struct sock *sk;

	sk = skb_to_full_sk(skb);
	if (!sk || !sk_fullsock(sk))
		return -ENOENT;
	if (unlikely(idx >= array->map.max_entries))
		return -E2BIG;

	cgrp = READ_ONCE(array->ptrs[idx]);
	if (unlikely(!cgrp))
		return -EAGAIN;

	return sk_under_cgroup_hierarchy(sk, cgrp);
}

static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
	.func		= bpf_skb_under_cgroup,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
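/* Illustrative sketch, not part of this file: a hypothetical tc program
 * using a BPF_MAP_TYPE_CGROUP_ARRAY (populated from user space with a
 * cgroup2 fd at index 0) to test whether the skb's full socket sits under
 * that cgroup hierarchy. The map definition uses the sample/libbpf style.
 *
 *	struct bpf_map_def SEC("maps") cgrp_map = {
 *		.type		= BPF_MAP_TYPE_CGROUP_ARRAY,
 *		.key_size	= sizeof(u32),
 *		.value_size	= sizeof(u32),
 *		.max_entries	= 1,
 *	};
 *
 *	SEC("classifier")
 *	int cls_cgrp(struct __sk_buff *skb)
 *	{
 *		return bpf_skb_under_cgroup(skb, &cgrp_map, 0) == 1 ?
 *		       TC_ACT_OK : TC_ACT_SHOT;
 *	}
 */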
static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
				  unsigned long off, unsigned long len)
{
	memcpy(dst_buff, src_buff + off, len);
	return 0;
}

BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
	   u64, flags, void *, meta, u64, meta_size)
{
	u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;

	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
		return -EINVAL;
	if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
		return -EFAULT;

	return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size,
				bpf_xdp_copy);
}

static const struct bpf_func_proto bpf_xdp_event_output_proto = {
	.func		= bpf_xdp_event_output,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_STACK,
	.arg5_type	= ARG_CONST_STACK_SIZE,
};
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_get_prandom_u32:
		return &bpf_get_prandom_u32_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_raw_smp_processor_id_proto;
	case BPF_FUNC_get_numa_node_id:
		return &bpf_get_numa_node_id_proto;
	case BPF_FUNC_tail_call:
		return &bpf_tail_call_proto;
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
	case BPF_FUNC_trace_printk:
		if (capable(CAP_SYS_ADMIN))
			return bpf_get_trace_printk_proto();
	default:
		return NULL;
	}
}
static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_skb_store_bytes:
		return &bpf_skb_store_bytes_proto;
	case BPF_FUNC_skb_load_bytes:
		return &bpf_skb_load_bytes_proto;
	case BPF_FUNC_skb_pull_data:
		return &bpf_skb_pull_data_proto;
	case BPF_FUNC_csum_diff:
		return &bpf_csum_diff_proto;
	case BPF_FUNC_csum_update:
		return &bpf_csum_update_proto;
	case BPF_FUNC_l3_csum_replace:
		return &bpf_l3_csum_replace_proto;
	case BPF_FUNC_l4_csum_replace:
		return &bpf_l4_csum_replace_proto;
	case BPF_FUNC_clone_redirect:
		return &bpf_clone_redirect_proto;
	case BPF_FUNC_get_cgroup_classid:
		return &bpf_get_cgroup_classid_proto;
	case BPF_FUNC_skb_vlan_push:
		return &bpf_skb_vlan_push_proto;
	case BPF_FUNC_skb_vlan_pop:
		return &bpf_skb_vlan_pop_proto;
	case BPF_FUNC_skb_change_proto:
		return &bpf_skb_change_proto_proto;
	case BPF_FUNC_skb_change_type:
		return &bpf_skb_change_type_proto;
	case BPF_FUNC_skb_change_tail:
		return &bpf_skb_change_tail_proto;
	case BPF_FUNC_skb_get_tunnel_key:
		return &bpf_skb_get_tunnel_key_proto;
	case BPF_FUNC_skb_set_tunnel_key:
		return bpf_get_skb_set_tunnel_proto(func_id);
	case BPF_FUNC_skb_get_tunnel_opt:
		return &bpf_skb_get_tunnel_opt_proto;
	case BPF_FUNC_skb_set_tunnel_opt:
		return bpf_get_skb_set_tunnel_proto(func_id);
	case BPF_FUNC_redirect:
		return &bpf_redirect_proto;
	case BPF_FUNC_get_route_realm:
		return &bpf_get_route_realm_proto;
	case BPF_FUNC_get_hash_recalc:
		return &bpf_get_hash_recalc_proto;
	case BPF_FUNC_set_hash_invalid:
		return &bpf_set_hash_invalid_proto;
	case BPF_FUNC_perf_event_output:
		return &bpf_skb_event_output_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
	case BPF_FUNC_skb_under_cgroup:
		return &bpf_skb_under_cgroup_proto;
	default:
		return sk_filter_func_proto(func_id);
	}
}
static const struct bpf_func_proto *
xdp_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_xdp_event_output_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
	case BPF_FUNC_xdp_adjust_head:
		return &bpf_xdp_adjust_head_proto;
	default:
		return sk_filter_func_proto(func_id);
	}
}

static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_skb_load_bytes:
		return &bpf_skb_load_bytes_proto;
	default:
		return sk_filter_func_proto(func_id);
	}
}

static const struct bpf_func_proto *
lwt_inout_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_skb_load_bytes:
		return &bpf_skb_load_bytes_proto;
	case BPF_FUNC_skb_pull_data:
		return &bpf_skb_pull_data_proto;
	case BPF_FUNC_csum_diff:
		return &bpf_csum_diff_proto;
	case BPF_FUNC_get_cgroup_classid:
		return &bpf_get_cgroup_classid_proto;
	case BPF_FUNC_get_route_realm:
		return &bpf_get_route_realm_proto;
	case BPF_FUNC_get_hash_recalc:
		return &bpf_get_hash_recalc_proto;
	case BPF_FUNC_perf_event_output:
		return &bpf_skb_event_output_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
	case BPF_FUNC_skb_under_cgroup:
		return &bpf_skb_under_cgroup_proto;
	default:
		return sk_filter_func_proto(func_id);
	}
}
static const struct bpf_func_proto *
lwt_xmit_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_skb_get_tunnel_key:
		return &bpf_skb_get_tunnel_key_proto;
	case BPF_FUNC_skb_set_tunnel_key:
		return bpf_get_skb_set_tunnel_proto(func_id);
	case BPF_FUNC_skb_get_tunnel_opt:
		return &bpf_skb_get_tunnel_opt_proto;
	case BPF_FUNC_skb_set_tunnel_opt:
		return bpf_get_skb_set_tunnel_proto(func_id);
	case BPF_FUNC_redirect:
		return &bpf_redirect_proto;
	case BPF_FUNC_clone_redirect:
		return &bpf_clone_redirect_proto;
	case BPF_FUNC_skb_change_tail:
		return &bpf_skb_change_tail_proto;
	case BPF_FUNC_skb_change_head:
		return &bpf_skb_change_head_proto;
	case BPF_FUNC_skb_store_bytes:
		return &bpf_skb_store_bytes_proto;
	case BPF_FUNC_csum_update:
		return &bpf_csum_update_proto;
	case BPF_FUNC_l3_csum_replace:
		return &bpf_l3_csum_replace_proto;
	case BPF_FUNC_l4_csum_replace:
		return &bpf_l4_csum_replace_proto;
	case BPF_FUNC_set_hash_invalid:
		return &bpf_set_hash_invalid_proto;
	default:
		return lwt_inout_func_proto(func_id);
	}
}
static bool __is_valid_access(int off, int size)
{
	if (off < 0 || off >= sizeof(struct __sk_buff))
		return false;

	/* The verifier guarantees that size > 0. */
	if (off % size != 0)
		return false;
	if (size != sizeof(__u32))
		return false;

	return true;
}

static bool sk_filter_is_valid_access(int off, int size,
				      enum bpf_access_type type,
				      enum bpf_reg_type *reg_type)
{
	switch (off) {
	case offsetof(struct __sk_buff, tc_classid):
	case offsetof(struct __sk_buff, data):
	case offsetof(struct __sk_buff, data_end):
		return false;
	}

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
			break;
		default:
			return false;
		}
	}

	return __is_valid_access(off, size);
}

static bool lwt_is_valid_access(int off, int size,
				enum bpf_access_type type,
				enum bpf_reg_type *reg_type)
{
	switch (off) {
	case offsetof(struct __sk_buff, tc_classid):
		return false;
	}

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, mark):
		case offsetof(struct __sk_buff, priority):
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
			break;
		default:
			return false;
		}
	}

	switch (off) {
	case offsetof(struct __sk_buff, data):
		*reg_type = PTR_TO_PACKET;
		break;
	case offsetof(struct __sk_buff, data_end):
		*reg_type = PTR_TO_PACKET_END;
		break;
	}

	return __is_valid_access(off, size);
}

static bool sock_filter_is_valid_access(int off, int size,
					enum bpf_access_type type,
					enum bpf_reg_type *reg_type)
{
	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct bpf_sock, bound_dev_if):
			break;
		default:
			return false;
		}
	}

	if (off < 0 || off + size > sizeof(struct bpf_sock))
		return false;
	/* The verifier guarantees that size > 0. */
	if (off % size != 0)
		return false;
	if (size != sizeof(__u32))
		return false;

	return true;
}
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
			       const struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	if (!direct_write)
		return 0;

	/* if (!skb->cloned)
	 *       goto start;
	 *
	 * (Fast-path, otherwise approximation that we might be
	 *  a clone, do the rest in helper.)
	 */
	*insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
	*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);

	/* ret = bpf_skb_pull_data(skb, 0); */
	*insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
	*insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
	*insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
			       BPF_FUNC_skb_pull_data);
	/* if (!ret)
	 *      goto restore;
	 * return TC_ACT_SHOT;
	 */
	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
	*insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
	*insn++ = BPF_EXIT_INSN();

	/* restore: */
	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
	/* start: */
	*insn++ = prog->insnsi[0];

	return insn - insn_buf;
}
static bool tc_cls_act_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       enum bpf_reg_type *reg_type)
{
	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, mark):
		case offsetof(struct __sk_buff, tc_index):
		case offsetof(struct __sk_buff, priority):
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
		case offsetof(struct __sk_buff, tc_classid):
			break;
		default:
			return false;
		}
	}

	switch (off) {
	case offsetof(struct __sk_buff, data):
		*reg_type = PTR_TO_PACKET;
		break;
	case offsetof(struct __sk_buff, data_end):
		*reg_type = PTR_TO_PACKET_END;
		break;
	}

	return __is_valid_access(off, size);
}

static bool __is_valid_xdp_access(int off, int size)
{
	if (off < 0 || off >= sizeof(struct xdp_md))
		return false;
	if (off % size != 0)
		return false;
	if (size != sizeof(__u32))
		return false;

	return true;
}

static bool xdp_is_valid_access(int off, int size,
				enum bpf_access_type type,
				enum bpf_reg_type *reg_type)
{
	if (type == BPF_WRITE)
		return false;

	switch (off) {
	case offsetof(struct xdp_md, data):
		*reg_type = PTR_TO_PACKET;
		break;
	case offsetof(struct xdp_md, data_end):
		*reg_type = PTR_TO_PACKET_END;
		break;
	}

	return __is_valid_xdp_access(off, size);
}
void bpf_warn_invalid_xdp_action(u32 act)
{
	WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act);
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
					int src_reg, int ctx_off,
					struct bpf_insn *insn_buf,
					struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct __sk_buff, len):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, len));
		break;

	case offsetof(struct __sk_buff, protocol):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, protocol));
		break;

	case offsetof(struct __sk_buff, vlan_proto):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_proto));
		break;

	case offsetof(struct __sk_buff, priority):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, priority));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, priority));
		break;

	case offsetof(struct __sk_buff, ingress_ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, skb_iif));
		break;

	case offsetof(struct __sk_buff, ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      dst_reg, src_reg,
				      offsetof(struct sk_buff, dev));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
				      offsetof(struct net_device, ifindex));
		break;

	case offsetof(struct __sk_buff, hash):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, hash));
		break;

	case offsetof(struct __sk_buff, mark):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		break;

	case offsetof(struct __sk_buff, pkt_type):
		return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, queue_mapping):
		return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_present):
		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_tci):
		return convert_skb_access(SKF_AD_VLAN_TAG,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, cb[0]) ...
	     offsetof(struct __sk_buff, cb[4]):
		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);

		prog->cb_access = 1;
		ctx_off -= offsetof(struct __sk_buff, cb[0]);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct qdisc_skb_cb, data);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, tc_classid):
		ctx_off -= offsetof(struct __sk_buff, tc_classid);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
		else
			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
				      dst_reg, src_reg,
				      offsetof(struct sk_buff, data));
		break;

	case offsetof(struct __sk_buff, data_end):
		ctx_off -= offsetof(struct __sk_buff, data_end);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct bpf_skb_data_end, data_end);
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
				      ctx_off);
		break;

	case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		else
			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		break;
#else
		if (type == BPF_WRITE)
			*insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
		else
			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
		break;
#endif
	}

	return insn - insn_buf;
}
static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
					  int dst_reg, int src_reg,
					  int ctx_off,
					  struct bpf_insn *insn_buf,
					  struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct bpf_sock, bound_dev_if):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					offsetof(struct sock, sk_bound_dev_if));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					offsetof(struct sock, sk_bound_dev_if));
		break;

	case offsetof(struct bpf_sock, family):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sock, sk_family));
		break;

	case offsetof(struct bpf_sock, type):
		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sock, __sk_flags_offset));
		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK);
		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT);
		break;

	case offsetof(struct bpf_sock, protocol):
		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sock, __sk_flags_offset));
		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK);
		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT);
		break;
	}

	return insn - insn_buf;
}
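/* Illustrative note: from a BPF_PROG_TYPE_CGROUP_SOCK program the fields
 * rewritten above are accessed directly on the context; bound_dev_if is
 * the only writable one. A hypothetical sketch (the ifindex 3 is a
 * placeholder, return 1 allows and 0 rejects the socket):
 *
 *	SEC("cgroup/sock")
 *	int sock_prog(struct bpf_sock *sk)
 *	{
 *		if (sk->family == AF_INET6 && sk->type == SOCK_RAW)
 *			return 0;
 *		sk->bound_dev_if = 3;
 *		return 1;
 *	}
 */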
static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
					 int src_reg, int ctx_off,
					 struct bpf_insn *insn_buf,
					 struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct __sk_buff, ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      dst_reg, src_reg,
				      offsetof(struct sk_buff, dev));
		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
				      offsetof(struct net_device, ifindex));
		break;
	default:
		return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
						    ctx_off, insn_buf, prog);
	}

	return insn - insn_buf;
}

static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
				  int src_reg, int ctx_off,
				  struct bpf_insn *insn_buf,
				  struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct xdp_md, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
				      dst_reg, src_reg,
				      offsetof(struct xdp_buff, data));
		break;
	case offsetof(struct xdp_md, data_end):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
				      dst_reg, src_reg,
				      offsetof(struct xdp_buff, data_end));
		break;
	}

	return insn - insn_buf;
}
static const struct bpf_verifier_ops sk_filter_ops = {
	.get_func_proto		= sk_filter_func_proto,
	.is_valid_access	= sk_filter_is_valid_access,
	.convert_ctx_access	= sk_filter_convert_ctx_access,
};

static const struct bpf_verifier_ops tc_cls_act_ops = {
	.get_func_proto		= tc_cls_act_func_proto,
	.is_valid_access	= tc_cls_act_is_valid_access,
	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
	.gen_prologue		= tc_cls_act_prologue,
};

static const struct bpf_verifier_ops xdp_ops = {
	.get_func_proto		= xdp_func_proto,
	.is_valid_access	= xdp_is_valid_access,
	.convert_ctx_access	= xdp_convert_ctx_access,
};

static const struct bpf_verifier_ops cg_skb_ops = {
	.get_func_proto		= cg_skb_func_proto,
	.is_valid_access	= sk_filter_is_valid_access,
	.convert_ctx_access	= sk_filter_convert_ctx_access,
};

static const struct bpf_verifier_ops lwt_inout_ops = {
	.get_func_proto		= lwt_inout_func_proto,
	.is_valid_access	= lwt_is_valid_access,
	.convert_ctx_access	= sk_filter_convert_ctx_access,
};

static const struct bpf_verifier_ops lwt_xmit_ops = {
	.get_func_proto		= lwt_xmit_func_proto,
	.is_valid_access	= lwt_is_valid_access,
	.convert_ctx_access	= sk_filter_convert_ctx_access,
	.gen_prologue		= tc_cls_act_prologue,
};

static const struct bpf_verifier_ops cg_sock_ops = {
	.get_func_proto		= sk_filter_func_proto,
	.is_valid_access	= sock_filter_is_valid_access,
	.convert_ctx_access	= sock_filter_convert_ctx_access,
};
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
	.ops	= &sk_filter_ops,
	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
};

static struct bpf_prog_type_list sched_cls_type __read_mostly = {
	.ops	= &tc_cls_act_ops,
	.type	= BPF_PROG_TYPE_SCHED_CLS,
};

static struct bpf_prog_type_list sched_act_type __read_mostly = {
	.ops	= &tc_cls_act_ops,
	.type	= BPF_PROG_TYPE_SCHED_ACT,
};

static struct bpf_prog_type_list xdp_type __read_mostly = {
	.ops	= &xdp_ops,
	.type	= BPF_PROG_TYPE_XDP,
};

static struct bpf_prog_type_list cg_skb_type __read_mostly = {
	.ops	= &cg_skb_ops,
	.type	= BPF_PROG_TYPE_CGROUP_SKB,
};

static struct bpf_prog_type_list lwt_in_type __read_mostly = {
	.ops	= &lwt_inout_ops,
	.type	= BPF_PROG_TYPE_LWT_IN,
};

static struct bpf_prog_type_list lwt_out_type __read_mostly = {
	.ops	= &lwt_inout_ops,
	.type	= BPF_PROG_TYPE_LWT_OUT,
};

static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
	.ops	= &lwt_xmit_ops,
	.type	= BPF_PROG_TYPE_LWT_XMIT,
};

static struct bpf_prog_type_list cg_sock_type __read_mostly = {
	.ops	= &cg_sock_ops,
	.type	= BPF_PROG_TYPE_CGROUP_SOCK,
};
static int __init register_sk_filter_ops(void)
{
	bpf_register_prog_type(&sk_filter_type);
	bpf_register_prog_type(&sched_cls_type);
	bpf_register_prog_type(&sched_act_type);
	bpf_register_prog_type(&xdp_type);
	bpf_register_prog_type(&cg_skb_type);
	bpf_register_prog_type(&cg_sock_type);
	bpf_register_prog_type(&lwt_in_type);
	bpf_register_prog_type(&lwt_out_type);
	bpf_register_prog_type(&lwt_xmit_type);

	return 0;
}
late_initcall(register_sk_filter_ops);
int sk_detach_filter(struct sock *sk)
{
	int ret = -ENOENT;
	struct sk_filter *filter;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	filter = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	if (filter) {
		RCU_INIT_POINTER(sk->sk_filter, NULL);
		sk_filter_uncharge(sk, filter);
		ret = 0;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);
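/* Illustrative note: user space reaches sk_detach_filter() through
 * setsockopt(); a hypothetical sketch of detaching a classic filter:
 *
 *	int val = 0;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)))
 *		perror("SO_DETACH_FILTER");
 */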
int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
		  unsigned int len)
{
	struct sock_fprog_kern *fprog;
	struct sk_filter *filter;
	int ret = 0;

	lock_sock(sk);
	filter = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	if (!filter)
		goto out;

	/* We're copying the filter that has been originally attached,
	 * so no conversion/decode needed anymore. eBPF programs that
	 * have no original program cannot be dumped through this.
	 */
	ret = -EACCES;
	fprog = filter->prog->orig_prog;
	if (!fprog)
		goto out;

	ret = fprog->len;
	if (!len)
		/* User space only enquires number of filter blocks. */
		goto out;

	ret = -EINVAL;
	if (len < fprog->len)
		goto out;

	ret = -EFAULT;
	if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
		goto out;

	/* Instead of bytes, the API requests to return the number
	 * of filter blocks.
	 */