net/core/filter.c
1da177e4
LT
1/*
2 * Linux Socket Filter - Kernel level socket filtering
3 *
bd4cf0ed
AS
4 * Based on the design of the Berkeley Packet Filter. The new
5 * internal format has been designed by PLUMgrid:
1da177e4 6 *
bd4cf0ed
AS
7 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
8 *
9 * Authors:
10 *
11 * Jay Schulist <jschlst@samba.org>
12 * Alexei Starovoitov <ast@plumgrid.com>
13 * Daniel Borkmann <dborkman@redhat.com>
1da177e4
LT
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version
18 * 2 of the License, or (at your option) any later version.
19 *
20 * Andi Kleen - Fix a few bad bugs and races.
4df95ff4 21 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
1da177e4
LT
22 */
23
24#include <linux/module.h>
25#include <linux/types.h>
1da177e4
LT
26#include <linux/mm.h>
27#include <linux/fcntl.h>
28#include <linux/socket.h>
29#include <linux/in.h>
30#include <linux/inet.h>
31#include <linux/netdevice.h>
32#include <linux/if_packet.h>
5a0e3ad6 33#include <linux/gfp.h>
1da177e4
LT
34#include <net/ip.h>
35#include <net/protocol.h>
4738c1db 36#include <net/netlink.h>
1da177e4
LT
37#include <linux/skbuff.h>
38#include <net/sock.h>
39#include <linux/errno.h>
40#include <linux/timer.h>
1da177e4 41#include <asm/uaccess.h>
40daafc8 42#include <asm/unaligned.h>
1da177e4 43#include <linux/filter.h>
86e4ca66 44#include <linux/ratelimit.h>
46b325c7 45#include <linux/seccomp.h>
f3335031 46#include <linux/if_vlan.h>
89aa0758 47#include <linux/bpf.h>
1da177e4 48
43db6d65
SH
49/**
50 * sk_filter - run a packet through a socket filter
51 * @sk: sock associated with &sk_buff
52 * @skb: buffer to filter
43db6d65
SH
53 *
54 * Run the filter code and then cut skb->data to the correct size returned by
8ea6e345 55 * SK_RUN_FILTER. If pkt_len is 0 we toss the packet. If skb->len is smaller
43db6d65 56 * than pkt_len we keep the whole skb->data. This is the socket-level
8ea6e345 57 * wrapper to SK_RUN_FILTER. It returns 0 if the packet should
43db6d65
SH
58 * be accepted or -EPERM if the packet should be tossed.
59 *
60 */
61int sk_filter(struct sock *sk, struct sk_buff *skb)
62{
63 int err;
64 struct sk_filter *filter;
65
c93bdd0e
MG
66 /*
67 * If the skb was allocated from pfmemalloc reserves, only
68 * allow SOCK_MEMALLOC sockets to use it as this socket is
69 * helping free memory
70 */
71 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
72 return -ENOMEM;
73
43db6d65
SH
74 err = security_sock_rcv_skb(sk, skb);
75 if (err)
76 return err;
77
80f8f102
ED
78 rcu_read_lock();
79 filter = rcu_dereference(sk->sk_filter);
43db6d65 80 if (filter) {
0a14842f 81 unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
0d7da9dd 82
43db6d65
SH
83 err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
84 }
80f8f102 85 rcu_read_unlock();
43db6d65
SH
86
87 return err;
88}
89EXPORT_SYMBOL(sk_filter);
90
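/* Usage sketch (an illustrative addition, not part of the original file):
 * receive paths run the socket filter before queueing an skb; for
 * instance, sock_queue_rcv_skb() begins roughly with
 *
 *	err = sk_filter(sk, skb);
 *	if (err)
 *		return err;
 *
 * so an skb the filter rejects (pkt_len == 0 above) is dropped with
 * -EPERM before it ever reaches the socket's receive queue.
 */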
30743837 91static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
bd4cf0ed 92{
56193d1b 93 return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
bd4cf0ed
AS
94}
95
30743837 96static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
bd4cf0ed 97{
eb9672f4 98 struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
bd4cf0ed
AS
99 struct nlattr *nla;
100
101 if (skb_is_nonlinear(skb))
102 return 0;
103
05ab8f26
MK
104 if (skb->len < sizeof(struct nlattr))
105 return 0;
106
30743837 107 if (a > skb->len - sizeof(struct nlattr))
bd4cf0ed
AS
108 return 0;
109
30743837 110 nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
bd4cf0ed
AS
111 if (nla)
112 return (void *) nla - (void *) skb->data;
113
114 return 0;
115}
116
30743837 117static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
bd4cf0ed 118{
eb9672f4 119 struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
bd4cf0ed
AS
120 struct nlattr *nla;
121
122 if (skb_is_nonlinear(skb))
123 return 0;
124
05ab8f26
MK
125 if (skb->len < sizeof(struct nlattr))
126 return 0;
127
30743837 128 if (a > skb->len - sizeof(struct nlattr))
bd4cf0ed
AS
129 return 0;
130
30743837
DB
131 nla = (struct nlattr *) &skb->data[a];
132 if (nla->nla_len > skb->len - a)
bd4cf0ed
AS
133 return 0;
134
30743837 135 nla = nla_find_nested(nla, x);
bd4cf0ed
AS
136 if (nla)
137 return (void *) nla - (void *) skb->data;
138
139 return 0;
140}
141
30743837 142static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
bd4cf0ed
AS
143{
144 return raw_smp_processor_id();
145}
146
4cd3675e 147/* note that this only generates 32-bit random numbers */
30743837 148static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
4cd3675e 149{
eb9672f4 150 return prandom_u32();
4cd3675e
CG
151}
152
9bac3d6d
AS
153static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
154 struct bpf_insn *insn_buf)
155{
156 struct bpf_insn *insn = insn_buf;
157
158 switch (skb_field) {
159 case SKF_AD_MARK:
160 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
161
162 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
163 offsetof(struct sk_buff, mark));
164 break;
165
166 case SKF_AD_PKTTYPE:
167 *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
168 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
169#ifdef __BIG_ENDIAN_BITFIELD
170 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
171#endif
172 break;
173
174 case SKF_AD_QUEUE:
175 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
176
177 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
178 offsetof(struct sk_buff, queue_mapping));
179 break;
c2497395
AS
180
181 case SKF_AD_PROTOCOL:
182 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
183
184 /* dst_reg = *(u16 *) (src_reg + offsetof(protocol)) */
185 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
186 offsetof(struct sk_buff, protocol));
187 /* dst_reg = ntohs(dst_reg) [emitting a nop or swap16] */
188 *insn++ = BPF_ENDIAN(BPF_FROM_BE, dst_reg, 16);
189 break;
190
191 case SKF_AD_VLAN_TAG:
192 case SKF_AD_VLAN_TAG_PRESENT:
193 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
194 BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
195
196 /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
197 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
198 offsetof(struct sk_buff, vlan_tci));
199 if (skb_field == SKF_AD_VLAN_TAG) {
200 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
201 ~VLAN_TAG_PRESENT);
202 } else {
203 /* dst_reg >>= 12 */
204 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
205 /* dst_reg &= 1 */
206 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
207 }
208 break;
9bac3d6d
AS
209 }
210
211 return insn - insn_buf;
212}
213
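/* Illustrative note (an added sketch, not from the original file): for
 * SKF_AD_VLAN_TAG_PRESENT the helper above emits three instructions,
 * equivalent to
 *
 *	dst_reg = *(u16 *) (src_reg + offsetof(struct sk_buff, vlan_tci));
 *	dst_reg >>= 12;
 *	dst_reg &= 1;
 *
 * which is why callers advance their insn pointer by the returned count
 * rather than by a fixed amount.
 */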
bd4cf0ed 214static bool convert_bpf_extensions(struct sock_filter *fp,
2695fb55 215 struct bpf_insn **insnp)
bd4cf0ed 216{
2695fb55 217 struct bpf_insn *insn = *insnp;
9bac3d6d 218 u32 cnt;
bd4cf0ed
AS
219
220 switch (fp->k) {
221 case SKF_AD_OFF + SKF_AD_PROTOCOL:
c2497395
AS
222 cnt = convert_skb_access(SKF_AD_PROTOCOL, BPF_REG_A, BPF_REG_CTX, insn);
223 insn += cnt - 1;
bd4cf0ed
AS
224 break;
225
226 case SKF_AD_OFF + SKF_AD_PKTTYPE:
9bac3d6d
AS
227 cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
228 insn += cnt - 1;
bd4cf0ed
AS
229 break;
230
231 case SKF_AD_OFF + SKF_AD_IFINDEX:
232 case SKF_AD_OFF + SKF_AD_HATYPE:
bd4cf0ed
AS
233 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
234 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
f8f6d679
DB
235 BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
236
237 *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
238 BPF_REG_TMP, BPF_REG_CTX,
239 offsetof(struct sk_buff, dev));
240 /* if (tmp != 0) goto pc + 1 */
241 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
242 *insn++ = BPF_EXIT_INSN();
243 if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
244 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
245 offsetof(struct net_device, ifindex));
246 else
247 *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
248 offsetof(struct net_device, type));
bd4cf0ed
AS
249 break;
250
251 case SKF_AD_OFF + SKF_AD_MARK:
9bac3d6d
AS
252 cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
253 insn += cnt - 1;
bd4cf0ed
AS
254 break;
255
256 case SKF_AD_OFF + SKF_AD_RXHASH:
257 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
258
9739eef1
AS
259 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
260 offsetof(struct sk_buff, hash));
bd4cf0ed
AS
261 break;
262
263 case SKF_AD_OFF + SKF_AD_QUEUE:
9bac3d6d
AS
264 cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
265 insn += cnt - 1;
bd4cf0ed
AS
266 break;
267
268 case SKF_AD_OFF + SKF_AD_VLAN_TAG:
c2497395
AS
269 cnt = convert_skb_access(SKF_AD_VLAN_TAG,
270 BPF_REG_A, BPF_REG_CTX, insn);
271 insn += cnt - 1;
272 break;
bd4cf0ed 273
c2497395
AS
274 case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
275 cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
276 BPF_REG_A, BPF_REG_CTX, insn);
277 insn += cnt - 1;
bd4cf0ed
AS
278 break;
279
280 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
281 case SKF_AD_OFF + SKF_AD_NLATTR:
282 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
283 case SKF_AD_OFF + SKF_AD_CPU:
4cd3675e 284 case SKF_AD_OFF + SKF_AD_RANDOM:
e430f34e 285 /* arg1 = CTX */
f8f6d679 286 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
bd4cf0ed 287 /* arg2 = A */
f8f6d679 288 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
bd4cf0ed 289 /* arg3 = X */
f8f6d679 290 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
e430f34e 291 /* Emit call(arg1=CTX, arg2=A, arg3=X) */
bd4cf0ed
AS
292 switch (fp->k) {
293 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
f8f6d679 294 *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
bd4cf0ed
AS
295 break;
296 case SKF_AD_OFF + SKF_AD_NLATTR:
f8f6d679 297 *insn = BPF_EMIT_CALL(__skb_get_nlattr);
bd4cf0ed
AS
298 break;
299 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
f8f6d679 300 *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
bd4cf0ed
AS
301 break;
302 case SKF_AD_OFF + SKF_AD_CPU:
f8f6d679 303 *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
bd4cf0ed 304 break;
4cd3675e 305 case SKF_AD_OFF + SKF_AD_RANDOM:
f8f6d679 306 *insn = BPF_EMIT_CALL(__get_random_u32);
4cd3675e 307 break;
bd4cf0ed
AS
308 }
309 break;
310
311 case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
9739eef1
AS
312 /* A ^= X */
313 *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
bd4cf0ed
AS
314 break;
315
316 default:
317 /* This is just a dummy call to avoid letting the compiler
318 * evict __bpf_call_base() as an optimization. Placed here
319 * where no-one bothers.
320 */
321 BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
322 return false;
323 }
324
325 *insnp = insn;
326 return true;
327}
328
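/* Illustrative example (an added sketch, not from the original file): a
 * classic ancillary load such as
 *
 *	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_CPU)
 *
 * is expanded by convert_bpf_extensions() into
 *
 *	BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX),
 *	BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A),
 *	BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X),
 *	BPF_EMIT_CALL(__get_raw_cpu_id),
 *
 * i.e. three moves to set up the call arguments plus the call itself.
 */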
329/**
8fb575ca 330 * bpf_convert_filter - convert filter program
bd4cf0ed
AS
331 * @prog: the user passed filter program
332 * @len: the length of the user passed filter program
333 * @new_prog: buffer where converted program will be stored
334 * @new_len: pointer to store length of converted program
335 *
336 * Remap 'sock_filter' style classic BPF instruction set to 'bpf_insn' style.
337 * Conversion workflow:
338 *
339 * 1) First pass for calculating the new program length:
8fb575ca 340 * bpf_convert_filter(old_prog, old_len, NULL, &new_len)
bd4cf0ed
AS
341 *
342 * 2) 2nd pass to remap in two passes: 1st pass finds new
343 * jump offsets, 2nd pass remapping:
2695fb55 344 * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
8fb575ca 345 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
bd4cf0ed
AS
346 *
347 * User BPF's register A is mapped to our BPF register 6, user BPF
348 * register X is mapped to BPF register 7; frame pointer is always
349 * register 10; Context 'void *ctx' is stored in register 1, that is,
350 * for socket filters: ctx == 'struct sk_buff *', for seccomp:
351 * ctx == 'struct seccomp_data *'.
352 */
8fb575ca
AS
353int bpf_convert_filter(struct sock_filter *prog, int len,
354 struct bpf_insn *new_prog, int *new_len)
bd4cf0ed
AS
355{
356 int new_flen = 0, pass = 0, target, i;
2695fb55 357 struct bpf_insn *new_insn;
bd4cf0ed
AS
358 struct sock_filter *fp;
359 int *addrs = NULL;
360 u8 bpf_src;
361
362 BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
30743837 363 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
bd4cf0ed 364
6f9a093b 365 if (len <= 0 || len > BPF_MAXINSNS)
bd4cf0ed
AS
366 return -EINVAL;
367
368 if (new_prog) {
99e72a0f 369 addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL);
bd4cf0ed
AS
370 if (!addrs)
371 return -ENOMEM;
372 }
373
374do_pass:
375 new_insn = new_prog;
376 fp = prog;
377
f8f6d679
DB
378 if (new_insn)
379 *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
bd4cf0ed
AS
380 new_insn++;
381
382 for (i = 0; i < len; fp++, i++) {
2695fb55
AS
383 struct bpf_insn tmp_insns[6] = { };
384 struct bpf_insn *insn = tmp_insns;
bd4cf0ed
AS
385
386 if (addrs)
387 addrs[i] = new_insn - new_prog;
388
389 switch (fp->code) {
390 /* All arithmetic insns and skb loads map as-is. */
391 case BPF_ALU | BPF_ADD | BPF_X:
392 case BPF_ALU | BPF_ADD | BPF_K:
393 case BPF_ALU | BPF_SUB | BPF_X:
394 case BPF_ALU | BPF_SUB | BPF_K:
395 case BPF_ALU | BPF_AND | BPF_X:
396 case BPF_ALU | BPF_AND | BPF_K:
397 case BPF_ALU | BPF_OR | BPF_X:
398 case BPF_ALU | BPF_OR | BPF_K:
399 case BPF_ALU | BPF_LSH | BPF_X:
400 case BPF_ALU | BPF_LSH | BPF_K:
401 case BPF_ALU | BPF_RSH | BPF_X:
402 case BPF_ALU | BPF_RSH | BPF_K:
403 case BPF_ALU | BPF_XOR | BPF_X:
404 case BPF_ALU | BPF_XOR | BPF_K:
405 case BPF_ALU | BPF_MUL | BPF_X:
406 case BPF_ALU | BPF_MUL | BPF_K:
407 case BPF_ALU | BPF_DIV | BPF_X:
408 case BPF_ALU | BPF_DIV | BPF_K:
409 case BPF_ALU | BPF_MOD | BPF_X:
410 case BPF_ALU | BPF_MOD | BPF_K:
411 case BPF_ALU | BPF_NEG:
412 case BPF_LD | BPF_ABS | BPF_W:
413 case BPF_LD | BPF_ABS | BPF_H:
414 case BPF_LD | BPF_ABS | BPF_B:
415 case BPF_LD | BPF_IND | BPF_W:
416 case BPF_LD | BPF_IND | BPF_H:
417 case BPF_LD | BPF_IND | BPF_B:
418 /* Check for overloaded BPF extension and
419 * directly convert it if found, otherwise
420 * just move on with mapping.
421 */
422 if (BPF_CLASS(fp->code) == BPF_LD &&
423 BPF_MODE(fp->code) == BPF_ABS &&
424 convert_bpf_extensions(fp, &insn))
425 break;
426
f8f6d679 427 *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
bd4cf0ed
AS
428 break;
429
f8f6d679
DB
430 /* Jump transformation cannot use BPF block macros
431 * everywhere as offset calculation and target updates
432 * require a bit more work than the rest, i.e. jump
433 * opcodes map as-is, but offsets need adjustment.
434 */
435
436#define BPF_EMIT_JMP \
bd4cf0ed
AS
437 do { \
438 if (target >= len || target < 0) \
439 goto err; \
440 insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
441 /* Adjust pc relative offset for 2nd or 3rd insn. */ \
442 insn->off -= insn - tmp_insns; \
443 } while (0)
444
f8f6d679
DB
445 case BPF_JMP | BPF_JA:
446 target = i + fp->k + 1;
447 insn->code = fp->code;
448 BPF_EMIT_JMP;
bd4cf0ed
AS
449 break;
450
451 case BPF_JMP | BPF_JEQ | BPF_K:
452 case BPF_JMP | BPF_JEQ | BPF_X:
453 case BPF_JMP | BPF_JSET | BPF_K:
454 case BPF_JMP | BPF_JSET | BPF_X:
455 case BPF_JMP | BPF_JGT | BPF_K:
456 case BPF_JMP | BPF_JGT | BPF_X:
457 case BPF_JMP | BPF_JGE | BPF_K:
458 case BPF_JMP | BPF_JGE | BPF_X:
459 if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
460 /* BPF immediates are signed, zero extend
461 * immediate into tmp register and use it
462 * in compare insn.
463 */
f8f6d679 464 *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
bd4cf0ed 465
e430f34e
AS
466 insn->dst_reg = BPF_REG_A;
467 insn->src_reg = BPF_REG_TMP;
bd4cf0ed
AS
468 bpf_src = BPF_X;
469 } else {
e430f34e
AS
470 insn->dst_reg = BPF_REG_A;
471 insn->src_reg = BPF_REG_X;
bd4cf0ed
AS
472 insn->imm = fp->k;
473 bpf_src = BPF_SRC(fp->code);
1da177e4 474 }
bd4cf0ed
AS
475
476 /* Common case where 'jump_false' is next insn. */
477 if (fp->jf == 0) {
478 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
479 target = i + fp->jt + 1;
f8f6d679 480 BPF_EMIT_JMP;
bd4cf0ed 481 break;
1da177e4 482 }
bd4cf0ed
AS
483
484 /* Convert JEQ into JNE when 'jump_true' is next insn. */
485 if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
486 insn->code = BPF_JMP | BPF_JNE | bpf_src;
487 target = i + fp->jf + 1;
f8f6d679 488 BPF_EMIT_JMP;
bd4cf0ed 489 break;
0b05b2a4 490 }
bd4cf0ed
AS
491
492 /* Other jumps are mapped into two insns: Jxx and JA. */
493 target = i + fp->jt + 1;
494 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
f8f6d679 495 BPF_EMIT_JMP;
bd4cf0ed
AS
496 insn++;
497
498 insn->code = BPF_JMP | BPF_JA;
499 target = i + fp->jf + 1;
f8f6d679 500 BPF_EMIT_JMP;
bd4cf0ed
AS
501 break;
502
503 /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
504 case BPF_LDX | BPF_MSH | BPF_B:
9739eef1 505 /* tmp = A */
f8f6d679 506 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
1268e253 507 /* A = BPF_R0 = *(u8 *) (skb->data + K) */
f8f6d679 508 *insn++ = BPF_LD_ABS(BPF_B, fp->k);
9739eef1 509 /* A &= 0xf */
f8f6d679 510 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
9739eef1 511 /* A <<= 2 */
f8f6d679 512 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
9739eef1 513 /* X = A */
f8f6d679 514 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
9739eef1 515 /* A = tmp */
f8f6d679 516 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
bd4cf0ed
AS
517 break;
518
519 /* RET_K, RET_A are remapped into 2 insns. */
520 case BPF_RET | BPF_A:
521 case BPF_RET | BPF_K:
f8f6d679
DB
522 *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
523 BPF_K : BPF_X, BPF_REG_0,
524 BPF_REG_A, fp->k);
9739eef1 525 *insn = BPF_EXIT_INSN();
bd4cf0ed
AS
526 break;
527
528 /* Store to stack. */
529 case BPF_ST:
530 case BPF_STX:
f8f6d679
DB
531 *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
532 BPF_ST ? BPF_REG_A : BPF_REG_X,
533 -(BPF_MEMWORDS - fp->k) * 4);
bd4cf0ed
AS
534 break;
535
536 /* Load from stack. */
537 case BPF_LD | BPF_MEM:
538 case BPF_LDX | BPF_MEM:
f8f6d679
DB
539 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
540 BPF_REG_A : BPF_REG_X, BPF_REG_FP,
541 -(BPF_MEMWORDS - fp->k) * 4);
bd4cf0ed
AS
542 break;
543
544 /* A = K or X = K */
545 case BPF_LD | BPF_IMM:
546 case BPF_LDX | BPF_IMM:
f8f6d679
DB
547 *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
548 BPF_REG_A : BPF_REG_X, fp->k);
bd4cf0ed
AS
549 break;
550
551 /* X = A */
552 case BPF_MISC | BPF_TAX:
f8f6d679 553 *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
bd4cf0ed
AS
554 break;
555
556 /* A = X */
557 case BPF_MISC | BPF_TXA:
f8f6d679 558 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
bd4cf0ed
AS
559 break;
560
561 /* A = skb->len or X = skb->len */
562 case BPF_LD | BPF_W | BPF_LEN:
563 case BPF_LDX | BPF_W | BPF_LEN:
f8f6d679
DB
564 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
565 BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
566 offsetof(struct sk_buff, len));
bd4cf0ed
AS
567 break;
568
f8f6d679 569 /* Access seccomp_data fields. */
bd4cf0ed 570 case BPF_LDX | BPF_ABS | BPF_W:
9739eef1
AS
571 /* A = *(u32 *) (ctx + K) */
572 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
bd4cf0ed
AS
573 break;
574
ca9f1fd2 575 /* Unknown instruction. */
1da177e4 576 default:
bd4cf0ed 577 goto err;
1da177e4 578 }
bd4cf0ed
AS
579
580 insn++;
581 if (new_prog)
582 memcpy(new_insn, tmp_insns,
583 sizeof(*insn) * (insn - tmp_insns));
bd4cf0ed 584 new_insn += insn - tmp_insns;
1da177e4
LT
585 }
586
bd4cf0ed
AS
587 if (!new_prog) {
588 /* Only calculating new length. */
589 *new_len = new_insn - new_prog;
590 return 0;
591 }
592
593 pass++;
594 if (new_flen != new_insn - new_prog) {
595 new_flen = new_insn - new_prog;
596 if (pass > 2)
597 goto err;
bd4cf0ed
AS
598 goto do_pass;
599 }
600
601 kfree(addrs);
602 BUG_ON(*new_len != new_flen);
1da177e4 603 return 0;
bd4cf0ed
AS
604err:
605 kfree(addrs);
606 return -EINVAL;
1da177e4
LT
607}
608
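/* Usage sketch (an added illustration following the kernel-doc above,
 * nothing new): callers invoke the conversion twice, once to size the
 * new image and once to emit it:
 *
 *	int new_len;
 *
 *	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
 *	if (err)
 *		goto out;
 *	new_insns = kmalloc_array(new_len, sizeof(struct bpf_insn), GFP_KERNEL);
 *	err = bpf_convert_filter(old_prog, old_len, new_insns, &new_len);
 */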
bd4cf0ed 609/* Security:
bd4cf0ed 610 *
2d5311e4 611 * As we don't want to clear the mem[] array for each packet going through
8ea6e345 612 * __bpf_prog_run(), we check that a filter loaded by the user never tries to read
2d5311e4 613 * a cell that was not previously written, and we check all branches to be sure
25985edc 614 * a malicious user doesn't try to abuse us.
2d5311e4 615 */
ec31a05c 616static int check_load_and_stores(const struct sock_filter *filter, int flen)
2d5311e4 617{
34805931 618 u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
2d5311e4
ED
619 int pc, ret = 0;
620
621 BUILD_BUG_ON(BPF_MEMWORDS > 16);
34805931 622
99e72a0f 623 masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
2d5311e4
ED
624 if (!masks)
625 return -ENOMEM;
34805931 626
2d5311e4
ED
627 memset(masks, 0xff, flen * sizeof(*masks));
628
629 for (pc = 0; pc < flen; pc++) {
630 memvalid &= masks[pc];
631
632 switch (filter[pc].code) {
34805931
DB
633 case BPF_ST:
634 case BPF_STX:
2d5311e4
ED
635 memvalid |= (1 << filter[pc].k);
636 break;
34805931
DB
637 case BPF_LD | BPF_MEM:
638 case BPF_LDX | BPF_MEM:
2d5311e4
ED
639 if (!(memvalid & (1 << filter[pc].k))) {
640 ret = -EINVAL;
641 goto error;
642 }
643 break;
34805931
DB
644 case BPF_JMP | BPF_JA:
645 /* A jump must set masks on target */
2d5311e4
ED
646 masks[pc + 1 + filter[pc].k] &= memvalid;
647 memvalid = ~0;
648 break;
34805931
DB
649 case BPF_JMP | BPF_JEQ | BPF_K:
650 case BPF_JMP | BPF_JEQ | BPF_X:
651 case BPF_JMP | BPF_JGE | BPF_K:
652 case BPF_JMP | BPF_JGE | BPF_X:
653 case BPF_JMP | BPF_JGT | BPF_K:
654 case BPF_JMP | BPF_JGT | BPF_X:
655 case BPF_JMP | BPF_JSET | BPF_K:
656 case BPF_JMP | BPF_JSET | BPF_X:
657 /* A jump must set masks on targets */
2d5311e4
ED
658 masks[pc + 1 + filter[pc].jt] &= memvalid;
659 masks[pc + 1 + filter[pc].jf] &= memvalid;
660 memvalid = ~0;
661 break;
662 }
663 }
664error:
665 kfree(masks);
666 return ret;
667}
668
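/* Illustrative example (an added sketch, not from the original file):
 * the check above makes bpf_check_classic() reject a program that reads
 * a scratch cell before ever writing it, e.g.
 *
 *	BPF_STMT(BPF_LD | BPF_MEM, 0),
 *	BPF_STMT(BPF_RET | BPF_K, 0),
 *
 * because M[0] is read without ever having been stored to; adding a
 * BPF_ST to cell 0 in front of the load makes the same program pass.
 */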
34805931
DB
669static bool chk_code_allowed(u16 code_to_probe)
670{
671 static const bool codes[] = {
672 /* 32 bit ALU operations */
673 [BPF_ALU | BPF_ADD | BPF_K] = true,
674 [BPF_ALU | BPF_ADD | BPF_X] = true,
675 [BPF_ALU | BPF_SUB | BPF_K] = true,
676 [BPF_ALU | BPF_SUB | BPF_X] = true,
677 [BPF_ALU | BPF_MUL | BPF_K] = true,
678 [BPF_ALU | BPF_MUL | BPF_X] = true,
679 [BPF_ALU | BPF_DIV | BPF_K] = true,
680 [BPF_ALU | BPF_DIV | BPF_X] = true,
681 [BPF_ALU | BPF_MOD | BPF_K] = true,
682 [BPF_ALU | BPF_MOD | BPF_X] = true,
683 [BPF_ALU | BPF_AND | BPF_K] = true,
684 [BPF_ALU | BPF_AND | BPF_X] = true,
685 [BPF_ALU | BPF_OR | BPF_K] = true,
686 [BPF_ALU | BPF_OR | BPF_X] = true,
687 [BPF_ALU | BPF_XOR | BPF_K] = true,
688 [BPF_ALU | BPF_XOR | BPF_X] = true,
689 [BPF_ALU | BPF_LSH | BPF_K] = true,
690 [BPF_ALU | BPF_LSH | BPF_X] = true,
691 [BPF_ALU | BPF_RSH | BPF_K] = true,
692 [BPF_ALU | BPF_RSH | BPF_X] = true,
693 [BPF_ALU | BPF_NEG] = true,
694 /* Load instructions */
695 [BPF_LD | BPF_W | BPF_ABS] = true,
696 [BPF_LD | BPF_H | BPF_ABS] = true,
697 [BPF_LD | BPF_B | BPF_ABS] = true,
698 [BPF_LD | BPF_W | BPF_LEN] = true,
699 [BPF_LD | BPF_W | BPF_IND] = true,
700 [BPF_LD | BPF_H | BPF_IND] = true,
701 [BPF_LD | BPF_B | BPF_IND] = true,
702 [BPF_LD | BPF_IMM] = true,
703 [BPF_LD | BPF_MEM] = true,
704 [BPF_LDX | BPF_W | BPF_LEN] = true,
705 [BPF_LDX | BPF_B | BPF_MSH] = true,
706 [BPF_LDX | BPF_IMM] = true,
707 [BPF_LDX | BPF_MEM] = true,
708 /* Store instructions */
709 [BPF_ST] = true,
710 [BPF_STX] = true,
711 /* Misc instructions */
712 [BPF_MISC | BPF_TAX] = true,
713 [BPF_MISC | BPF_TXA] = true,
714 /* Return instructions */
715 [BPF_RET | BPF_K] = true,
716 [BPF_RET | BPF_A] = true,
717 /* Jump instructions */
718 [BPF_JMP | BPF_JA] = true,
719 [BPF_JMP | BPF_JEQ | BPF_K] = true,
720 [BPF_JMP | BPF_JEQ | BPF_X] = true,
721 [BPF_JMP | BPF_JGE | BPF_K] = true,
722 [BPF_JMP | BPF_JGE | BPF_X] = true,
723 [BPF_JMP | BPF_JGT | BPF_K] = true,
724 [BPF_JMP | BPF_JGT | BPF_X] = true,
725 [BPF_JMP | BPF_JSET | BPF_K] = true,
726 [BPF_JMP | BPF_JSET | BPF_X] = true,
727 };
728
729 if (code_to_probe >= ARRAY_SIZE(codes))
730 return false;
731
732 return codes[code_to_probe];
733}
734
1da177e4 735/**
4df95ff4 736 * bpf_check_classic - verify socket filter code
1da177e4
LT
737 * @filter: filter to verify
738 * @flen: length of filter
739 *
740 * Check the user's filter code. If we let some ugly
741 * filter code slip through kaboom! The filter must contain
93699863
KK
742 * no references or jumps that are out of range, no illegal
743 * instructions, and must end with a RET instruction.
1da177e4 744 *
7b11f69f
KK
745 * All jumps are forward as they are not signed.
746 *
747 * Returns 0 if the rule set is legal or -EINVAL if not.
1da177e4 748 */
4df95ff4 749int bpf_check_classic(const struct sock_filter *filter, unsigned int flen)
1da177e4 750{
aa1113d9 751 bool anc_found;
34805931 752 int pc;
1da177e4 753
1b93ae64 754 if (flen == 0 || flen > BPF_MAXINSNS)
1da177e4
LT
755 return -EINVAL;
756
34805931 757 /* Check the filter code now */
1da177e4 758 for (pc = 0; pc < flen; pc++) {
ec31a05c 759 const struct sock_filter *ftest = &filter[pc];
93699863 760
34805931
DB
761 /* May we actually operate on this code? */
762 if (!chk_code_allowed(ftest->code))
cba328fc 763 return -EINVAL;
34805931 764
93699863 765 /* Some instructions need special checks */
34805931
DB
766 switch (ftest->code) {
767 case BPF_ALU | BPF_DIV | BPF_K:
768 case BPF_ALU | BPF_MOD | BPF_K:
769 /* Check for division by zero */
b6069a95
ED
770 if (ftest->k == 0)
771 return -EINVAL;
772 break;
34805931
DB
773 case BPF_LD | BPF_MEM:
774 case BPF_LDX | BPF_MEM:
775 case BPF_ST:
776 case BPF_STX:
777 /* Check for invalid memory addresses */
93699863
KK
778 if (ftest->k >= BPF_MEMWORDS)
779 return -EINVAL;
780 break;
34805931
DB
781 case BPF_JMP | BPF_JA:
782 /* Note, the large ftest->k might cause loops.
93699863
KK
783 * Compare this with conditional jumps below,
784 * where offsets are limited. --ANK (981016)
785 */
34805931 786 if (ftest->k >= (unsigned int)(flen - pc - 1))
93699863 787 return -EINVAL;
01f2f3f6 788 break;
34805931
DB
789 case BPF_JMP | BPF_JEQ | BPF_K:
790 case BPF_JMP | BPF_JEQ | BPF_X:
791 case BPF_JMP | BPF_JGE | BPF_K:
792 case BPF_JMP | BPF_JGE | BPF_X:
793 case BPF_JMP | BPF_JGT | BPF_K:
794 case BPF_JMP | BPF_JGT | BPF_X:
795 case BPF_JMP | BPF_JSET | BPF_K:
796 case BPF_JMP | BPF_JSET | BPF_X:
797 /* Both conditionals must be safe */
e35bedf3 798 if (pc + ftest->jt + 1 >= flen ||
93699863
KK
799 pc + ftest->jf + 1 >= flen)
800 return -EINVAL;
cba328fc 801 break;
34805931
DB
802 case BPF_LD | BPF_W | BPF_ABS:
803 case BPF_LD | BPF_H | BPF_ABS:
804 case BPF_LD | BPF_B | BPF_ABS:
aa1113d9 805 anc_found = false;
34805931
DB
806 if (bpf_anc_helper(ftest) & BPF_ANC)
807 anc_found = true;
808 /* Ancillary operation unknown or unsupported */
aa1113d9
DB
809 if (anc_found == false && ftest->k >= SKF_AD_OFF)
810 return -EINVAL;
01f2f3f6
HPP
811 }
812 }
93699863 813
34805931 814 /* Last instruction must be a RET code */
01f2f3f6 815 switch (filter[flen - 1].code) {
34805931
DB
816 case BPF_RET | BPF_K:
817 case BPF_RET | BPF_A:
2d5311e4 818 return check_load_and_stores(filter, flen);
cba328fc 819 }
34805931 820
cba328fc 821 return -EINVAL;
1da177e4 822}
4df95ff4 823EXPORT_SYMBOL(bpf_check_classic);
1da177e4 824
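/* Illustrative example (an added sketch, not from the original file):
 * the smallest program that passes bpf_check_classic() is a single
 * "accept everything" return, using the hypothetical name accept_all:
 *
 *	struct sock_filter accept_all[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 *
 *	err = bpf_check_classic(accept_all, ARRAY_SIZE(accept_all));
 */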
7ae457c1
AS
825static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
826 const struct sock_fprog *fprog)
a3ea269b 827{
009937e7 828 unsigned int fsize = bpf_classic_proglen(fprog);
a3ea269b
DB
829 struct sock_fprog_kern *fkprog;
830
831 fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
832 if (!fp->orig_prog)
833 return -ENOMEM;
834
835 fkprog = fp->orig_prog;
836 fkprog->len = fprog->len;
837 fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL);
838 if (!fkprog->filter) {
839 kfree(fp->orig_prog);
840 return -ENOMEM;
841 }
842
843 return 0;
844}
845
7ae457c1 846static void bpf_release_orig_filter(struct bpf_prog *fp)
a3ea269b
DB
847{
848 struct sock_fprog_kern *fprog = fp->orig_prog;
849
850 if (fprog) {
851 kfree(fprog->filter);
852 kfree(fprog);
853 }
854}
855
7ae457c1
AS
856static void __bpf_prog_release(struct bpf_prog *prog)
857{
24701ece 858 if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
89aa0758
AS
859 bpf_prog_put(prog);
860 } else {
861 bpf_release_orig_filter(prog);
862 bpf_prog_free(prog);
863 }
7ae457c1
AS
864}
865
34c5bd66
PN
866static void __sk_filter_release(struct sk_filter *fp)
867{
7ae457c1
AS
868 __bpf_prog_release(fp->prog);
869 kfree(fp);
34c5bd66
PN
870}
871
47e958ea 872/**
46bcf14f 873 * sk_filter_release_rcu - Release a socket filter by rcu_head
47e958ea
PE
874 * @rcu: rcu_head that contains the sk_filter to free
875 */
fbc907f0 876static void sk_filter_release_rcu(struct rcu_head *rcu)
47e958ea
PE
877{
878 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
879
34c5bd66 880 __sk_filter_release(fp);
47e958ea 881}
fbc907f0
DB
882
883/**
884 * sk_filter_release - release a socket filter
885 * @fp: filter to remove
886 *
887 * Remove a filter from a socket and release its resources.
888 */
889static void sk_filter_release(struct sk_filter *fp)
890{
891 if (atomic_dec_and_test(&fp->refcnt))
892 call_rcu(&fp->rcu, sk_filter_release_rcu);
893}
894
895void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
896{
7ae457c1 897 u32 filter_size = bpf_prog_size(fp->prog->len);
fbc907f0 898
278571ba
AS
899 atomic_sub(filter_size, &sk->sk_omem_alloc);
900 sk_filter_release(fp);
fbc907f0 901}
47e958ea 902
278571ba
AS
903/* try to charge the socket memory if there is space available
904 * return true on success
905 */
906bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
bd4cf0ed 907{
7ae457c1 908 u32 filter_size = bpf_prog_size(fp->prog->len);
278571ba
AS
909
910 /* same check as in sock_kmalloc() */
911 if (filter_size <= sysctl_optmem_max &&
912 atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
913 atomic_inc(&fp->refcnt);
914 atomic_add(filter_size, &sk->sk_omem_alloc);
915 return true;
bd4cf0ed 916 }
278571ba 917 return false;
bd4cf0ed
AS
918}
919
7ae457c1 920static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
bd4cf0ed
AS
921{
922 struct sock_filter *old_prog;
7ae457c1 923 struct bpf_prog *old_fp;
34805931 924 int err, new_len, old_len = fp->len;
bd4cf0ed
AS
925
926 /* We are free to overwrite insns et al right here as it
927 * won't be used at this point in time anymore internally
928 * after the migration to the internal BPF instruction
929 * representation.
930 */
931 BUILD_BUG_ON(sizeof(struct sock_filter) !=
2695fb55 932 sizeof(struct bpf_insn));
bd4cf0ed 933
bd4cf0ed
AS
934 /* Conversion cannot happen on overlapping memory areas,
935 * so we need to keep the user BPF around until the 2nd
936 * pass. At this time, the user BPF is stored in fp->insns.
937 */
938 old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
939 GFP_KERNEL);
940 if (!old_prog) {
941 err = -ENOMEM;
942 goto out_err;
943 }
944
945 /* 1st pass: calculate the new program length. */
8fb575ca 946 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
bd4cf0ed
AS
947 if (err)
948 goto out_err_free;
949
950 /* Expand fp for appending the new filter representation. */
951 old_fp = fp;
60a3b225 952 fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
bd4cf0ed
AS
953 if (!fp) {
954 /* The old_fp is still around in case we couldn't
955 * allocate new memory, so uncharge on that one.
956 */
957 fp = old_fp;
958 err = -ENOMEM;
959 goto out_err_free;
960 }
961
bd4cf0ed
AS
962 fp->len = new_len;
963
2695fb55 964 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
8fb575ca 965 err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
bd4cf0ed 966 if (err)
8fb575ca 967 /* 2nd bpf_convert_filter() can fail only if it fails
bd4cf0ed
AS
968 * to allocate memory, remapping must succeed. Note
969 * that at this time old_fp has already been released
278571ba 970 * by bpf_prog_realloc().
bd4cf0ed
AS
971 */
972 goto out_err_free;
973
7ae457c1 974 bpf_prog_select_runtime(fp);
5fe821a9 975
bd4cf0ed
AS
976 kfree(old_prog);
977 return fp;
978
979out_err_free:
980 kfree(old_prog);
981out_err:
7ae457c1 982 __bpf_prog_release(fp);
bd4cf0ed
AS
983 return ERR_PTR(err);
984}
985
7ae457c1 986static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
302d6637
JP
987{
988 int err;
989
bd4cf0ed 990 fp->bpf_func = NULL;
286aad3c 991 fp->jited = false;
302d6637 992
4df95ff4 993 err = bpf_check_classic(fp->insns, fp->len);
418c96ac 994 if (err) {
7ae457c1 995 __bpf_prog_release(fp);
bd4cf0ed 996 return ERR_PTR(err);
418c96ac 997 }
302d6637 998
bd4cf0ed
AS
999 /* Probe if we can JIT compile the filter and if so, do
1000 * the compilation of the filter.
1001 */
302d6637 1002 bpf_jit_compile(fp);
bd4cf0ed
AS
1003
1004 /* JIT compiler couldn't process this filter, so do the
1005 * internal BPF translation for the optimized interpreter.
1006 */
5fe821a9 1007 if (!fp->jited)
7ae457c1 1008 fp = bpf_migrate_filter(fp);
bd4cf0ed
AS
1009
1010 return fp;
302d6637
JP
1011}
1012
1013/**
7ae457c1 1014 * bpf_prog_create - create an unattached filter
c6c4b97c 1015 * @pfp: the unattached filter that is created
677a9fd3 1016 * @fprog: the filter program
302d6637 1017 *
c6c4b97c 1018 * Create a filter independent of any socket. We first run some
302d6637
JP
1019 * sanity checks on it to make sure it does not explode on us later.
1020 * If an error occurs or there is insufficient memory for the filter,
1021 * a negative errno code is returned. On success the return is zero.
1022 */
7ae457c1 1023int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
302d6637 1024{
009937e7 1025 unsigned int fsize = bpf_classic_proglen(fprog);
7ae457c1 1026 struct bpf_prog *fp;
302d6637
JP
1027
1028 /* Make sure new filter is there and in the right amounts. */
1029 if (fprog->filter == NULL)
1030 return -EINVAL;
1031
60a3b225 1032 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
302d6637
JP
1033 if (!fp)
1034 return -ENOMEM;
a3ea269b 1035
302d6637
JP
1036 memcpy(fp->insns, fprog->filter, fsize);
1037
302d6637 1038 fp->len = fprog->len;
a3ea269b
DB
1039 /* Since unattached filters are not copied back to user
1040 * space through sk_get_filter(), we do not need to hold
1041 * a copy here, and can spare us the work.
1042 */
1043 fp->orig_prog = NULL;
302d6637 1044
7ae457c1 1045 /* bpf_prepare_filter() already takes care of freeing
bd4cf0ed
AS
1046 * memory in case something goes wrong.
1047 */
7ae457c1 1048 fp = bpf_prepare_filter(fp);
bd4cf0ed
AS
1049 if (IS_ERR(fp))
1050 return PTR_ERR(fp);
302d6637
JP
1051
1052 *pfp = fp;
1053 return 0;
302d6637 1054}
7ae457c1 1055EXPORT_SYMBOL_GPL(bpf_prog_create);
302d6637 1056
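/* Usage sketch (an added illustration, not from the original file): an
 * in-kernel user can build an unattached program from a classic filter;
 * the names insns, fprog and prog below are hypothetical:
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 *	struct sock_fprog_kern fprog = {
 *		.len	= ARRAY_SIZE(insns),
 *		.filter	= insns,
 *	};
 *	struct bpf_prog *prog;
 *
 *	err = bpf_prog_create(&prog, &fprog);
 *
 * The resulting program is later released with bpf_prog_destroy().
 */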
7ae457c1 1057void bpf_prog_destroy(struct bpf_prog *fp)
302d6637 1058{
7ae457c1 1059 __bpf_prog_release(fp);
302d6637 1060}
7ae457c1 1061EXPORT_SYMBOL_GPL(bpf_prog_destroy);
302d6637 1062
49b31e57
DB
1063static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
1064{
1065 struct sk_filter *fp, *old_fp;
1066
1067 fp = kmalloc(sizeof(*fp), GFP_KERNEL);
1068 if (!fp)
1069 return -ENOMEM;
1070
1071 fp->prog = prog;
1072 atomic_set(&fp->refcnt, 0);
1073
1074 if (!sk_filter_charge(sk, fp)) {
1075 kfree(fp);
1076 return -ENOMEM;
1077 }
1078
1079 old_fp = rcu_dereference_protected(sk->sk_filter,
1080 sock_owned_by_user(sk));
1081 rcu_assign_pointer(sk->sk_filter, fp);
1082
1083 if (old_fp)
1084 sk_filter_uncharge(sk, old_fp);
1085
1086 return 0;
1087}
1088
1da177e4
LT
1089/**
1090 * sk_attach_filter - attach a socket filter
1091 * @fprog: the filter program
1092 * @sk: the socket to use
1093 *
1094 * Attach the user's filter code. We first run some sanity checks on
1095 * it to make sure it does not explode on us later. If an error
1096 * occurs or there is insufficient memory for the filter, a negative
1097 * errno code is returned. On success the return is zero.
1098 */
1099int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1100{
009937e7 1101 unsigned int fsize = bpf_classic_proglen(fprog);
7ae457c1
AS
1102 unsigned int bpf_fsize = bpf_prog_size(fprog->len);
1103 struct bpf_prog *prog;
1da177e4
LT
1104 int err;
1105
d59577b6
VB
1106 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1107 return -EPERM;
1108
1da177e4 1109 /* Make sure new filter is there and in the right amounts. */
e35bedf3
KK
1110 if (fprog->filter == NULL)
1111 return -EINVAL;
1da177e4 1112
60a3b225 1113 prog = bpf_prog_alloc(bpf_fsize, 0);
7ae457c1 1114 if (!prog)
1da177e4 1115 return -ENOMEM;
a3ea269b 1116
7ae457c1 1117 if (copy_from_user(prog->insns, fprog->filter, fsize)) {
c0d1379a 1118 __bpf_prog_free(prog);
1da177e4
LT
1119 return -EFAULT;
1120 }
1121
7ae457c1 1122 prog->len = fprog->len;
1da177e4 1123
7ae457c1 1124 err = bpf_prog_store_orig_filter(prog, fprog);
a3ea269b 1125 if (err) {
c0d1379a 1126 __bpf_prog_free(prog);
a3ea269b
DB
1127 return -ENOMEM;
1128 }
1129
7ae457c1 1130 /* bpf_prepare_filter() already takes care of freeing
bd4cf0ed
AS
1131 * memory in case something goes wrong.
1132 */
7ae457c1
AS
1133 prog = bpf_prepare_filter(prog);
1134 if (IS_ERR(prog))
1135 return PTR_ERR(prog);
1136
49b31e57
DB
1137 err = __sk_attach_prog(prog, sk);
1138 if (err < 0) {
7ae457c1 1139 __bpf_prog_release(prog);
49b31e57 1140 return err;
278571ba
AS
1141 }
1142
d3904b73 1143 return 0;
1da177e4 1144}
5ff3f073 1145EXPORT_SYMBOL_GPL(sk_attach_filter);
1da177e4 1146
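/* Illustrative sketch (an added example, not from the original file):
 * from user space this function is reached via setsockopt(), e.g.
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },
 *	};
 *	struct sock_fprog bpf = {
 *		.len	= 1,
 *		.filter	= code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf));
 *
 * where fd is the caller's socket; SO_DETACH_FILTER undoes the attach
 * via sk_detach_filter() further down in this file.
 */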
89aa0758
AS
1147int sk_attach_bpf(u32 ufd, struct sock *sk)
1148{
89aa0758 1149 struct bpf_prog *prog;
49b31e57 1150 int err;
89aa0758
AS
1151
1152 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1153 return -EPERM;
1154
1155 prog = bpf_prog_get(ufd);
198bf1b0
AS
1156 if (IS_ERR(prog))
1157 return PTR_ERR(prog);
89aa0758 1158
24701ece 1159 if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
89aa0758
AS
1160 bpf_prog_put(prog);
1161 return -EINVAL;
1162 }
1163
49b31e57
DB
1164 err = __sk_attach_prog(prog, sk);
1165 if (err < 0) {
89aa0758 1166 bpf_prog_put(prog);
49b31e57 1167 return err;
89aa0758
AS
1168 }
1169
89aa0758
AS
1170 return 0;
1171}
1172
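/* Illustrative sketch (an added example, not from the original file):
 * the eBPF counterpart takes a program fd obtained from the bpf(2)
 * syscall (BPF_PROG_LOAD with type BPF_PROG_TYPE_SOCKET_FILTER) and
 * attaches it with
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd));
 */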
d4052c4a
DB
1173static const struct bpf_func_proto *
1174sk_filter_func_proto(enum bpf_func_id func_id)
89aa0758
AS
1175{
1176 switch (func_id) {
1177 case BPF_FUNC_map_lookup_elem:
1178 return &bpf_map_lookup_elem_proto;
1179 case BPF_FUNC_map_update_elem:
1180 return &bpf_map_update_elem_proto;
1181 case BPF_FUNC_map_delete_elem:
1182 return &bpf_map_delete_elem_proto;
03e69b50
DB
1183 case BPF_FUNC_get_prandom_u32:
1184 return &bpf_get_prandom_u32_proto;
c04167ce
DB
1185 case BPF_FUNC_get_smp_processor_id:
1186 return &bpf_get_smp_processor_id_proto;
89aa0758
AS
1187 default:
1188 return NULL;
1189 }
1190}
1191
d4052c4a
DB
1192static bool sk_filter_is_valid_access(int off, int size,
1193 enum bpf_access_type type)
89aa0758 1194{
9bac3d6d
AS
1195 /* only read is allowed */
1196 if (type != BPF_READ)
1197 return false;
1198
1199 /* check bounds */
1200 if (off < 0 || off >= sizeof(struct __sk_buff))
1201 return false;
1202
1203 /* disallow misaligned access */
1204 if (off % size != 0)
1205 return false;
1206
1207 /* all __sk_buff fields are __u32 */
1208 if (size != 4)
1209 return false;
1210
1211 return true;
1212}
1213
1214static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
1215 struct bpf_insn *insn_buf)
1216{
1217 struct bpf_insn *insn = insn_buf;
1218
1219 switch (ctx_off) {
1220 case offsetof(struct __sk_buff, len):
1221 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
1222
1223 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
1224 offsetof(struct sk_buff, len));
1225 break;
1226
1227 case offsetof(struct __sk_buff, mark):
1228 return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn);
1229
1230 case offsetof(struct __sk_buff, pkt_type):
1231 return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
1232
1233 case offsetof(struct __sk_buff, queue_mapping):
1234 return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
c2497395
AS
1235
1236 case offsetof(struct __sk_buff, protocol):
1237 return convert_skb_access(SKF_AD_PROTOCOL, dst_reg, src_reg, insn);
1238
1239 case offsetof(struct __sk_buff, vlan_present):
1240 return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
1241 dst_reg, src_reg, insn);
1242
1243 case offsetof(struct __sk_buff, vlan_tci):
1244 return convert_skb_access(SKF_AD_VLAN_TAG,
1245 dst_reg, src_reg, insn);
9bac3d6d
AS
1246 }
1247
1248 return insn - insn_buf;
89aa0758
AS
1249}
1250
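/* Illustrative example (an added sketch, not from the original file): an
 * eBPF socket filter that does
 *
 *	r0 = *(u32 *) (r1 + offsetof(struct __sk_buff, queue_mapping));
 *
 * has that load rewritten by the verifier, through the callback above,
 * into a BPF_H load of skb->queue_mapping at its real offset (see
 * convert_skb_access()), so programs never depend on the layout of
 * struct sk_buff itself.
 */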
d4052c4a
DB
1251static const struct bpf_verifier_ops sk_filter_ops = {
1252 .get_func_proto = sk_filter_func_proto,
1253 .is_valid_access = sk_filter_is_valid_access,
9bac3d6d 1254 .convert_ctx_access = sk_filter_convert_ctx_access,
89aa0758
AS
1255};
1256
d4052c4a
DB
1257static struct bpf_prog_type_list sk_filter_type __read_mostly = {
1258 .ops = &sk_filter_ops,
89aa0758
AS
1259 .type = BPF_PROG_TYPE_SOCKET_FILTER,
1260};
1261
96be4325
DB
1262static struct bpf_prog_type_list sched_cls_type __read_mostly = {
1263 .ops = &sk_filter_ops,
1264 .type = BPF_PROG_TYPE_SCHED_CLS,
1265};
1266
d4052c4a 1267static int __init register_sk_filter_ops(void)
89aa0758 1268{
d4052c4a 1269 bpf_register_prog_type(&sk_filter_type);
96be4325
DB
1270 bpf_register_prog_type(&sched_cls_type);
1271
89aa0758
AS
1272 return 0;
1273}
d4052c4a
DB
1274late_initcall(register_sk_filter_ops);
1275
55b33325
PE
1276int sk_detach_filter(struct sock *sk)
1277{
1278 int ret = -ENOENT;
1279 struct sk_filter *filter;
1280
d59577b6
VB
1281 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1282 return -EPERM;
1283
f91ff5b9
ED
1284 filter = rcu_dereference_protected(sk->sk_filter,
1285 sock_owned_by_user(sk));
55b33325 1286 if (filter) {
a9b3cd7f 1287 RCU_INIT_POINTER(sk->sk_filter, NULL);
46bcf14f 1288 sk_filter_uncharge(sk, filter);
55b33325
PE
1289 ret = 0;
1290 }
a3ea269b 1291
55b33325
PE
1292 return ret;
1293}
5ff3f073 1294EXPORT_SYMBOL_GPL(sk_detach_filter);
a8fc9277 1295
a3ea269b
DB
1296int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
1297 unsigned int len)
a8fc9277 1298{
a3ea269b 1299 struct sock_fprog_kern *fprog;
a8fc9277 1300 struct sk_filter *filter;
a3ea269b 1301 int ret = 0;
a8fc9277
PE
1302
1303 lock_sock(sk);
1304 filter = rcu_dereference_protected(sk->sk_filter,
a3ea269b 1305 sock_owned_by_user(sk));
a8fc9277
PE
1306 if (!filter)
1307 goto out;
a3ea269b
DB
1308
1309 /* We're copying the filter that has been originally attached,
1310 * so no conversion/decode needed anymore.
1311 */
7ae457c1 1312 fprog = filter->prog->orig_prog;
a3ea269b
DB
1313
1314 ret = fprog->len;
a8fc9277 1315 if (!len)
a3ea269b 1316 /* User space only enquires about the number of filter blocks. */
a8fc9277 1317 goto out;
a3ea269b 1318
a8fc9277 1319 ret = -EINVAL;
a3ea269b 1320 if (len < fprog->len)
a8fc9277
PE
1321 goto out;
1322
1323 ret = -EFAULT;
009937e7 1324 if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
a3ea269b 1325 goto out;
a8fc9277 1326
a3ea269b
DB
1327 /* Instead of bytes, the API expects us to return the number
1328 * of filter blocks.
1329 */
1330 ret = fprog->len;
a8fc9277
PE
1331out:
1332 release_sock(sk);
1333 return ret;
1334}
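/* Illustrative sketch (an added example, not from the original file):
 * user space typically retrieves the filter in two getsockopt() calls,
 * first with a zero length to learn the number of blocks, then with a
 * buffer of that many sock_filter entries:
 *
 *	socklen_t len = 0;
 *
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &len);
 *	buf = calloc(len, sizeof(struct sock_filter));
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, buf, &len);
 */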