/*
 * Berkeley Packet Filter based traffic classifier
 *
 * Might be used to classify traffic through flexible, user-defined and
 * possibly JIT-ed BPF filters for traffic control as an alternative to
 * ematches.
 *
 * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

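/* Example userspace usage (illustrative only; assumes iproute2's tc with
 * eBPF support, the object file and section name are hypothetical):
 *
 *   tc qdisc add dev em1 clsact
 *   tc filter add dev em1 ingress bpf da obj cls.o sec classifier
 *
 * The "da" (direct-action) flag corresponds to TCA_BPF_FLAG_ACT_DIRECT
 * below and lets the BPF program return TC action opcodes directly.
 */
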
#include <linux/module.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/idr.h>

#include <net/rtnetlink.h>
#include <net/pkt_cls.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_DESCRIPTION("TC BPF based classifier");

#define CLS_BPF_NAME_LEN	256
#define CLS_BPF_SUPPORTED_GEN_FLAGS \
	(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)

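/* Per-tcf_proto state: the head holds the RCU-protected list of installed
 * programs and the IDR used for handle allocation; each cls_bpf_prog wraps
 * one classic or extended BPF program together with its TC metadata.
 */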
struct cls_bpf_head {
	struct list_head plist;
	struct idr handle_idr;
	struct rcu_head rcu;
};

struct cls_bpf_prog {
	struct bpf_prog *filter;
	struct list_head link;
	struct tcf_result res;
	bool exts_integrated;
	bool offloaded;
	u32 gen_flags;
	struct tcf_exts exts;
	u32 handle;
	u16 bpf_num_ops;
	struct sock_filter *bpf_ops;
	const char *bpf_name;
	struct tcf_proto *tp;
	union {
		struct work_struct work;
		struct rcu_head rcu;
	};
};

static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
	[TCA_BPF_CLASSID]	= { .type = NLA_U32 },
	[TCA_BPF_FLAGS]		= { .type = NLA_U32 },
	[TCA_BPF_FLAGS_GEN]	= { .type = NLA_U32 },
	[TCA_BPF_FD]		= { .type = NLA_U32 },
	[TCA_BPF_NAME]		= { .type = NLA_NUL_STRING,
				    .len = CLS_BPF_NAME_LEN },
	[TCA_BPF_OPS_LEN]	= { .type = NLA_U16 },
	[TCA_BPF_OPS]		= { .type = NLA_BINARY,
				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
};

static int cls_bpf_exec_opcode(int code)
{
	switch (code) {
	case TC_ACT_OK:
	case TC_ACT_SHOT:
	case TC_ACT_STOLEN:
	case TC_ACT_TRAP:
	case TC_ACT_REDIRECT:
	case TC_ACT_UNSPEC:
		return code;
	default:
		return TC_ACT_UNSPEC;
	}
}

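/* The classify path runs per packet: each attached program is executed in
 * list order until one returns a verdict. With direct-action
 * (exts_integrated), the BPF return value is itself a TC action opcode;
 * otherwise it selects a classid and the regular TC actions attached to
 * the filter are run via tcf_exts_exec().
 */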
static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
			    struct tcf_result *res)
{
	struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
	bool at_ingress = skb_at_tc_ingress(skb);
	struct cls_bpf_prog *prog;
	int ret = -1;

	/* Needed here for accessing maps. */
	rcu_read_lock();
	list_for_each_entry_rcu(prog, &head->plist, link) {
		int filter_res;

		qdisc_skb_cb(skb)->tc_classid = prog->res.classid;

		if (tc_skip_sw(prog->gen_flags)) {
			filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0;
		} else if (at_ingress) {
			/* It is safe to push/pull even if skb_shared() */
			__skb_push(skb, skb->mac_len);
			bpf_compute_data_pointers(skb);
			filter_res = BPF_PROG_RUN(prog->filter, skb);
			__skb_pull(skb, skb->mac_len);
		} else {
			bpf_compute_data_pointers(skb);
			filter_res = BPF_PROG_RUN(prog->filter, skb);
		}

		if (prog->exts_integrated) {
			res->class   = 0;
			res->classid = TC_H_MAJ(prog->res.classid) |
				       qdisc_skb_cb(skb)->tc_classid;

			ret = cls_bpf_exec_opcode(filter_res);
			if (ret == TC_ACT_UNSPEC)
				continue;
			break;
		}

		if (filter_res == 0)
			continue;
		if (filter_res != -1) {
			res->class   = 0;
			res->classid = filter_res;
		} else {
			*res = prog->res;
		}

		ret = tcf_exts_exec(skb, &prog->exts, res);
		if (ret < 0)
			continue;

		break;
	}
	rcu_read_unlock();

	return ret;
}

static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
{
	return !prog->bpf_ops;
}

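/* Hardware offload plumbing: the classifier is mirrored to capable drivers
 * through tc_setup_cb_call(). TCA_CLS_FLAGS_SKIP_HW/SKIP_SW decide whether
 * the program must run in software, in hardware, or in both.
 */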
static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
			       enum tc_clsbpf_command cmd)
{
	bool addorrep = cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE;
	struct tcf_block *block = tp->chain->block;
	bool skip_sw = tc_skip_sw(prog->gen_flags);
	struct tc_cls_bpf_offload cls_bpf = {};
	int err;

	tc_cls_common_offload_init(&cls_bpf.common, tp);
	cls_bpf.command = cmd;
	cls_bpf.exts = &prog->exts;
	cls_bpf.prog = prog->filter;
	cls_bpf.name = prog->bpf_name;
	cls_bpf.exts_integrated = prog->exts_integrated;
	cls_bpf.gen_flags = prog->gen_flags;

	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
	if (addorrep) {
		if (err < 0) {
			cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
			return err;
		} else if (err > 0) {
			prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;
		}
	}

	if (addorrep && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW))
		return -EINVAL;

	return 0;
}

static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
			   struct cls_bpf_prog *oldprog)
{
	struct cls_bpf_prog *obj = prog;
	enum tc_clsbpf_command cmd;
	bool skip_sw;
	int ret;

	skip_sw = tc_skip_sw(prog->gen_flags) ||
		(oldprog && tc_skip_sw(oldprog->gen_flags));

	if (oldprog && oldprog->offloaded) {
		if (!tc_skip_hw(prog->gen_flags)) {
			cmd = TC_CLSBPF_REPLACE;
		} else if (!tc_skip_sw(prog->gen_flags)) {
			obj = oldprog;
			cmd = TC_CLSBPF_DESTROY;
		} else {
			return -EINVAL;
		}
	} else {
		if (tc_skip_hw(prog->gen_flags))
			return skip_sw ? -EINVAL : 0;
		cmd = TC_CLSBPF_ADD;
	}

	ret = cls_bpf_offload_cmd(tp, obj, cmd);
	if (ret)
		return ret;

	obj->offloaded = true;
	if (oldprog)
		oldprog->offloaded = false;

	return 0;
}

static void cls_bpf_stop_offload(struct tcf_proto *tp,
				 struct cls_bpf_prog *prog)
{
	int err;

	if (!prog->offloaded)
		return;

	err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
	if (err) {
		pr_err("Stopping hardware offload failed: %d\n", err);
		return;
	}

	prog->offloaded = false;
}

static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
					 struct cls_bpf_prog *prog)
{
	if (!prog->offloaded)
		return;

	cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS);
}

static int cls_bpf_init(struct tcf_proto *tp)
{
	struct cls_bpf_head *head;

	head = kzalloc(sizeof(*head), GFP_KERNEL);
	if (head == NULL)
		return -ENOBUFS;

	INIT_LIST_HEAD_RCU(&head->plist);
	idr_init(&head->handle_idr);
	rcu_assign_pointer(tp->root, head);

	return 0;
}

static void cls_bpf_free_parms(struct cls_bpf_prog *prog)
{
	if (cls_bpf_is_ebpf(prog))
		bpf_prog_put(prog->filter);
	else
		bpf_prog_destroy(prog->filter);

	kfree(prog->bpf_name);
	kfree(prog->bpf_ops);
}

static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
{
	tcf_exts_destroy(&prog->exts);
	tcf_exts_put_net(&prog->exts);

	cls_bpf_free_parms(prog);
	kfree(prog);
}

static void cls_bpf_delete_prog_work(struct work_struct *work)
{
	struct cls_bpf_prog *prog = container_of(work, struct cls_bpf_prog, work);

	rtnl_lock();
	__cls_bpf_delete_prog(prog);
	rtnl_unlock();
}

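/* Filters are freed in two stages: call_rcu() first waits out readers, then
 * the actual teardown is deferred to a workqueue via tcf_queue_work(), since
 * it needs rtnl_lock() and may sleep, neither of which is allowed from an
 * RCU callback.
 */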
static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
{
	struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);

	INIT_WORK(&prog->work, cls_bpf_delete_prog_work);
	tcf_queue_work(&prog->work);
}

static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);

	idr_remove_ext(&head->handle_idr, prog->handle);
	cls_bpf_stop_offload(tp, prog);
	list_del_rcu(&prog->link);
	tcf_unbind_filter(tp, &prog->res);
	if (tcf_exts_get_net(&prog->exts))
		call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu);
	else
		__cls_bpf_delete_prog(prog);
}

static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);

	__cls_bpf_delete(tp, arg);
	*last = list_empty(&head->plist);
	return 0;
}

static void cls_bpf_destroy(struct tcf_proto *tp)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *prog, *tmp;

	list_for_each_entry_safe(prog, tmp, &head->plist, link)
		__cls_bpf_delete(tp, prog);

	idr_destroy(&head->handle_idr);
	kfree_rcu(head, rcu);
}

static void *cls_bpf_get(struct tcf_proto *tp, u32 handle)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *prog;

	list_for_each_entry(prog, &head->plist, link) {
		if (prog->handle == handle)
			return prog;
	}

	return NULL;
}

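/* Classic BPF path: userspace passed raw sock_filter instructions via
 * TCA_BPF_OPS; they are copied and translated into an internal bpf_prog
 * with bpf_prog_create().
 */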
static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
{
	struct sock_filter *bpf_ops;
	struct sock_fprog_kern fprog_tmp;
	struct bpf_prog *fp;
	u16 bpf_size, bpf_num_ops;
	int ret;

	bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
	if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0)
		return -EINVAL;

	bpf_size = bpf_num_ops * sizeof(*bpf_ops);
	if (bpf_size != nla_len(tb[TCA_BPF_OPS]))
		return -EINVAL;

	bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
	if (bpf_ops == NULL)
		return -ENOMEM;

	memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);

	fprog_tmp.len = bpf_num_ops;
	fprog_tmp.filter = bpf_ops;

	ret = bpf_prog_create(&fp, &fprog_tmp);
	if (ret < 0) {
		kfree(bpf_ops);
		return ret;
	}

	prog->bpf_ops = bpf_ops;
	prog->bpf_num_ops = bpf_num_ops;
	prog->bpf_name = NULL;
	prog->filter = fp;

	return 0;
}

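/* Extended BPF path: userspace passed a program fd via TCA_BPF_FD; the
 * program must have been loaded as BPF_PROG_TYPE_SCHED_CLS via bpf(2).
 */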
static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
				 u32 gen_flags, const struct tcf_proto *tp)
{
	struct bpf_prog *fp;
	char *name = NULL;
	bool skip_sw;
	u32 bpf_fd;

	bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);
	skip_sw = gen_flags & TCA_CLS_FLAGS_SKIP_SW;

	fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, skip_sw);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	if (tb[TCA_BPF_NAME]) {
		name = nla_memdup(tb[TCA_BPF_NAME], GFP_KERNEL);
		if (!name) {
			bpf_prog_put(fp);
			return -ENOMEM;
		}
	}

	prog->bpf_ops = NULL;
	prog->bpf_name = name;
	prog->filter = fp;

	if (fp->dst_needed && !(tp->q->flags & TCQ_F_INGRESS))
		netif_keep_dst(qdisc_dev(tp->q));

	return 0;
}

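/* Common parameter handling for both flavours. Exactly one of the classic
 * (TCA_BPF_OPS) and extended (TCA_BPF_FD) attribute sets may be given.
 */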
static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
			     struct cls_bpf_prog *prog, unsigned long base,
			     struct nlattr **tb, struct nlattr *est, bool ovr)
{
	bool is_bpf, is_ebpf, have_exts = false;
	u32 gen_flags = 0;
	int ret;

	is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
	is_ebpf = tb[TCA_BPF_FD];
	if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf))
		return -EINVAL;

	ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr);
	if (ret < 0)
		return ret;

	if (tb[TCA_BPF_FLAGS]) {
		u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);

		if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT)
			return -EINVAL;

		have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
	}
	if (tb[TCA_BPF_FLAGS_GEN]) {
		gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
		if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS ||
		    !tc_flags_valid(gen_flags))
			return -EINVAL;
	}

	prog->exts_integrated = have_exts;
	prog->gen_flags = gen_flags;

	ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
		       cls_bpf_prog_from_efd(tb, prog, gen_flags, tp);
	if (ret < 0)
		return ret;

	if (tb[TCA_BPF_CLASSID]) {
		prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
		tcf_bind_filter(tp, &prog->res, base);
	}

	return 0;
}

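/* Create or replace a filter. Handles are managed through the IDR: a zero
 * handle requests auto-allocation, a non-zero one must either be free or
 * match the filter being replaced.
 */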
static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
			  struct tcf_proto *tp, unsigned long base,
			  u32 handle, struct nlattr **tca,
			  void **arg, bool ovr)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *oldprog = *arg;
	struct nlattr *tb[TCA_BPF_MAX + 1];
	struct cls_bpf_prog *prog;
	unsigned long idr_index;
	int ret;

	if (tca[TCA_OPTIONS] == NULL)
		return -EINVAL;

	ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy,
			       NULL);
	if (ret < 0)
		return ret;

	prog = kzalloc(sizeof(*prog), GFP_KERNEL);
	if (!prog)
		return -ENOBUFS;

	ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
	if (ret < 0)
		goto errout;

	if (oldprog) {
		if (handle && oldprog->handle != handle) {
			ret = -EINVAL;
			goto errout;
		}
	}

	if (handle == 0) {
		ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index,
				    1, 0x7FFFFFFF, GFP_KERNEL);
		if (ret)
			goto errout;
		prog->handle = idr_index;
	} else {
		if (!oldprog) {
			ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index,
					    handle, handle + 1, GFP_KERNEL);
			if (ret)
				goto errout;
		}
		prog->handle = handle;
	}

	ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
	if (ret < 0)
		goto errout_idr;

	ret = cls_bpf_offload(tp, prog, oldprog);
	if (ret)
		goto errout_parms;

	if (!tc_in_hw(prog->gen_flags))
		prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;

	if (oldprog) {
		idr_replace_ext(&head->handle_idr, prog, handle);
		list_replace_rcu(&oldprog->link, &prog->link);
		tcf_unbind_filter(tp, &oldprog->res);
		tcf_exts_get_net(&oldprog->exts);
		call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
	} else {
		list_add_rcu(&prog->link, &head->plist);
	}

	*arg = prog;
	return 0;

errout_parms:
	cls_bpf_free_parms(prog);
errout_idr:
	if (!oldprog)
		idr_remove_ext(&head->handle_idr, prog->handle);
errout:
	tcf_exts_destroy(&prog->exts);
	kfree(prog);
	return ret;
}

static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
				 struct sk_buff *skb)
{
	struct nlattr *nla;

	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops))
		return -EMSGSIZE;

	nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops *
			  sizeof(struct sock_filter));
	if (nla == NULL)
		return -EMSGSIZE;

	memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));

	return 0;
}

static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
				  struct sk_buff *skb)
{
	struct nlattr *nla;

	if (prog->bpf_name &&
	    nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
		return -EMSGSIZE;

	if (nla_put_u32(skb, TCA_BPF_ID, prog->filter->aux->id))
		return -EMSGSIZE;

	nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag));
	if (nla == NULL)
		return -EMSGSIZE;

	memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));

	return 0;
}

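/* Dump one filter back to userspace, mirroring the attributes accepted by
 * cls_bpf_change(); hardware counters are synced in first so the stats
 * also reflect offloaded traffic.
 */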
static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, void *fh,
			struct sk_buff *skb, struct tcmsg *tm)
{
	struct cls_bpf_prog *prog = fh;
	struct nlattr *nest;
	u32 bpf_flags = 0;
	int ret;

	if (prog == NULL)
		return skb->len;

	tm->tcm_handle = prog->handle;

	cls_bpf_offload_update_stats(tp, prog);

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	if (prog->res.classid &&
	    nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
		goto nla_put_failure;

	if (cls_bpf_is_ebpf(prog))
		ret = cls_bpf_dump_ebpf_info(prog, skb);
	else
		ret = cls_bpf_dump_bpf_info(prog, skb);
	if (ret)
		goto nla_put_failure;

	if (tcf_exts_dump(skb, &prog->exts) < 0)
		goto nla_put_failure;

	if (prog->exts_integrated)
		bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
	if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags))
		goto nla_put_failure;
	if (prog->gen_flags &&
	    nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags))
		goto nla_put_failure;

	nla_nest_end(skb, nest);

	if (tcf_exts_dump_stats(skb, &prog->exts) < 0)
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl)
{
	struct cls_bpf_prog *prog = fh;

	if (prog && prog->res.classid == classid)
		prog->res.class = cl;
}

static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *prog;

	list_for_each_entry(prog, &head->plist, link) {
		if (arg->count < arg->skip)
			goto skip;
		if (arg->fn(tp, prog, arg) < 0) {
			arg->stop = 1;
			break;
		}
skip:
		arg->count++;
	}
}

static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
	.kind		=	"bpf",
	.owner		=	THIS_MODULE,
	.classify	=	cls_bpf_classify,
	.init		=	cls_bpf_init,
	.destroy	=	cls_bpf_destroy,
	.get		=	cls_bpf_get,
	.change		=	cls_bpf_change,
	.delete		=	cls_bpf_delete,
	.walk		=	cls_bpf_walk,
	.dump		=	cls_bpf_dump,
	.bind_class	=	cls_bpf_bind_class,
};

static int __init cls_bpf_init_mod(void)
{
	return register_tcf_proto_ops(&cls_bpf_ops);
}

static void __exit cls_bpf_exit_mod(void)
{
	unregister_tcf_proto_ops(&cls_bpf_ops);
}

module_init(cls_bpf_init_mod);
module_exit(cls_bpf_exit_mod);