/*
 * drivers/net/team/team_mode_loadbalance.c - Load-balancing mode for team
 * Copyright (c) 2012 Jiri Pirko <jpirko@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
11 #include <linux/kernel.h>
12 #include <linux/types.h>
13 #include <linux/module.h>
14 #include <linux/init.h>
15 #include <linux/errno.h>
16 #include <linux/netdevice.h>
17 #include <linux/etherdevice.h>
18 #include <linux/filter.h>
19 #include <linux/if_team.h>
21 static rx_handler_result_t
lb_receive(struct team
*team
, struct team_port
*port
,
24 if (unlikely(skb
->protocol
== htons(ETH_P_SLOW
))) {
25 /* LACPDU packets should go to exact delivery */
26 const unsigned char *dest
= eth_hdr(skb
)->h_dest
;
28 if (is_link_local_ether_addr(dest
) && dest
[5] == 0x02)
29 return RX_HANDLER_EXACT
;
31 return RX_HANDLER_ANOTHER
;
/* Tx port selector: picks an egress port for a given skb and hash value. */
typedef struct team_port *lb_select_tx_port_func_t(struct team *,
						   struct lb_priv *,
						   struct sk_buff *,
						   unsigned char);

#define LB_TX_HASHTABLE_SIZE 256 /* hash is a char */
47 struct lb_pcpu_stats
{
48 struct lb_stats hash_stats
[LB_TX_HASHTABLE_SIZE
];
49 struct u64_stats_sync syncp
;
52 struct lb_stats_info
{
53 struct lb_stats stats
;
54 struct lb_stats last_stats
;
55 struct team_option_inst_info
*opt_inst_info
;
58 struct lb_port_mapping
{
59 struct team_port __rcu
*port
;
60 struct team_option_inst_info
*opt_inst_info
;
65 struct lb_port_mapping tx_hash_to_port_mapping
[LB_TX_HASHTABLE_SIZE
];
66 struct sock_fprog_kern
*orig_fprog
;
68 unsigned int refresh_interval
; /* in tenths of second */
69 struct delayed_work refresh_dw
;
70 struct lb_stats_info info
[LB_TX_HASHTABLE_SIZE
];
75 struct bpf_prog __rcu
*fp
;
76 lb_select_tx_port_func_t __rcu
*select_tx_port_func
;
77 struct lb_pcpu_stats __percpu
*pcpu_stats
;
78 struct lb_priv_ex
*ex
; /* priv extension */
81 static struct lb_priv
*get_lb_priv(struct team
*team
)
83 return (struct lb_priv
*) &team
->mode_priv
;
87 struct lb_stats __percpu
*pcpu_stats
;
88 struct lb_stats_info stats_info
;
91 static struct lb_port_priv
*get_lb_port_priv(struct team_port
*port
)
93 return (struct lb_port_priv
*) &port
->mode_priv
;
/* Accessors for the hash -> port mapping table slot of a given hash.
 * Note: the macro parameter was previously named "lp_priv" while the body
 * referenced "lb_priv"; that only worked because every caller happened to
 * pass a variable named lb_priv. Parameter renamed to match the body.
 */
#define LB_HTPM_PORT_BY_HASH(lb_priv, hash) \
	(lb_priv)->ex->tx_hash_to_port_mapping[hash].port

#define LB_HTPM_OPT_INST_INFO_BY_HASH(lb_priv, hash) \
	(lb_priv)->ex->tx_hash_to_port_mapping[hash].opt_inst_info
102 static void lb_tx_hash_to_port_mapping_null_port(struct team
*team
,
103 struct team_port
*port
)
105 struct lb_priv
*lb_priv
= get_lb_priv(team
);
106 bool changed
= false;
109 for (i
= 0; i
< LB_TX_HASHTABLE_SIZE
; i
++) {
110 struct lb_port_mapping
*pm
;
112 pm
= &lb_priv
->ex
->tx_hash_to_port_mapping
[i
];
113 if (rcu_access_pointer(pm
->port
) == port
) {
114 RCU_INIT_POINTER(pm
->port
, NULL
);
115 team_option_inst_set_change(pm
->opt_inst_info
);
120 team_options_change_check(team
);
/* Basic tx selection based solely by hash */
static struct team_port *lb_hash_select_tx_port(struct team *team,
						struct lb_priv *lb_priv,
						struct sk_buff *skb,
						unsigned char hash)
{
	int port_index = team_num_to_port_index(team, hash);

	return team_get_port_by_index_rcu(team, port_index);
}
/* Hash to port mapping select tx port */
static struct team_port *lb_htpm_select_tx_port(struct team *team,
						struct lb_priv *lb_priv,
						struct sk_buff *skb,
						unsigned char hash)
{
	struct team_port *port;

	port = rcu_dereference_bh(LB_HTPM_PORT_BY_HASH(lb_priv, hash));
	if (likely(port))
		return port;
	/* If no valid port in the table, fall back to simple hash */
	return lb_hash_select_tx_port(team, lb_priv, skb, hash);
}
149 struct lb_select_tx_port
{
151 lb_select_tx_port_func_t
*func
;
154 static const struct lb_select_tx_port lb_select_tx_port_list
[] = {
157 .func
= lb_hash_select_tx_port
,
160 .name
= "hash_to_port_mapping",
161 .func
= lb_htpm_select_tx_port
,
164 #define LB_SELECT_TX_PORT_LIST_COUNT ARRAY_SIZE(lb_select_tx_port_list)
166 static char *lb_select_tx_port_get_name(lb_select_tx_port_func_t
*func
)
170 for (i
= 0; i
< LB_SELECT_TX_PORT_LIST_COUNT
; i
++) {
171 const struct lb_select_tx_port
*item
;
173 item
= &lb_select_tx_port_list
[i
];
174 if (item
->func
== func
)
180 static lb_select_tx_port_func_t
*lb_select_tx_port_get_func(const char *name
)
184 for (i
= 0; i
< LB_SELECT_TX_PORT_LIST_COUNT
; i
++) {
185 const struct lb_select_tx_port
*item
;
187 item
= &lb_select_tx_port_list
[i
];
188 if (!strcmp(item
->name
, name
))
194 static unsigned int lb_get_skb_hash(struct lb_priv
*lb_priv
,
201 fp
= rcu_dereference_bh(lb_priv
->fp
);
204 lhash
= BPF_PROG_RUN(fp
, skb
);
206 return c
[0] ^ c
[1] ^ c
[2] ^ c
[3];
209 static void lb_update_tx_stats(unsigned int tx_bytes
, struct lb_priv
*lb_priv
,
210 struct lb_port_priv
*lb_port_priv
,
213 struct lb_pcpu_stats
*pcpu_stats
;
214 struct lb_stats
*port_stats
;
215 struct lb_stats
*hash_stats
;
217 pcpu_stats
= this_cpu_ptr(lb_priv
->pcpu_stats
);
218 port_stats
= this_cpu_ptr(lb_port_priv
->pcpu_stats
);
219 hash_stats
= &pcpu_stats
->hash_stats
[hash
];
220 u64_stats_update_begin(&pcpu_stats
->syncp
);
221 port_stats
->tx_bytes
+= tx_bytes
;
222 hash_stats
->tx_bytes
+= tx_bytes
;
223 u64_stats_update_end(&pcpu_stats
->syncp
);
226 static bool lb_transmit(struct team
*team
, struct sk_buff
*skb
)
228 struct lb_priv
*lb_priv
= get_lb_priv(team
);
229 lb_select_tx_port_func_t
*select_tx_port_func
;
230 struct team_port
*port
;
232 unsigned int tx_bytes
= skb
->len
;
234 hash
= lb_get_skb_hash(lb_priv
, skb
);
235 select_tx_port_func
= rcu_dereference_bh(lb_priv
->select_tx_port_func
);
236 port
= select_tx_port_func(team
, lb_priv
, skb
, hash
);
239 if (team_dev_queue_xmit(team
, port
, skb
))
241 lb_update_tx_stats(tx_bytes
, lb_priv
, get_lb_port_priv(port
), hash
);
245 dev_kfree_skb_any(skb
);
249 static int lb_bpf_func_get(struct team
*team
, struct team_gsetter_ctx
*ctx
)
251 struct lb_priv
*lb_priv
= get_lb_priv(team
);
253 if (!lb_priv
->ex
->orig_fprog
) {
254 ctx
->data
.bin_val
.len
= 0;
255 ctx
->data
.bin_val
.ptr
= NULL
;
258 ctx
->data
.bin_val
.len
= lb_priv
->ex
->orig_fprog
->len
*
259 sizeof(struct sock_filter
);
260 ctx
->data
.bin_val
.ptr
= lb_priv
->ex
->orig_fprog
->filter
;
264 static int __fprog_create(struct sock_fprog_kern
**pfprog
, u32 data_len
,
267 struct sock_fprog_kern
*fprog
;
268 struct sock_filter
*filter
= (struct sock_filter
*) data
;
270 if (data_len
% sizeof(struct sock_filter
))
272 fprog
= kmalloc(sizeof(*fprog
), GFP_KERNEL
);
275 fprog
->filter
= kmemdup(filter
, data_len
, GFP_KERNEL
);
276 if (!fprog
->filter
) {
280 fprog
->len
= data_len
/ sizeof(struct sock_filter
);
285 static void __fprog_destroy(struct sock_fprog_kern
*fprog
)
287 kfree(fprog
->filter
);
291 static int lb_bpf_func_set(struct team
*team
, struct team_gsetter_ctx
*ctx
)
293 struct lb_priv
*lb_priv
= get_lb_priv(team
);
294 struct bpf_prog
*fp
= NULL
;
295 struct bpf_prog
*orig_fp
= NULL
;
296 struct sock_fprog_kern
*fprog
= NULL
;
299 if (ctx
->data
.bin_val
.len
) {
300 err
= __fprog_create(&fprog
, ctx
->data
.bin_val
.len
,
301 ctx
->data
.bin_val
.ptr
);
304 err
= bpf_prog_create(&fp
, fprog
);
306 __fprog_destroy(fprog
);
311 if (lb_priv
->ex
->orig_fprog
) {
312 /* Clear old filter data */
313 __fprog_destroy(lb_priv
->ex
->orig_fprog
);
314 orig_fp
= rcu_dereference_protected(lb_priv
->fp
,
315 lockdep_is_held(&team
->lock
));
318 rcu_assign_pointer(lb_priv
->fp
, fp
);
319 lb_priv
->ex
->orig_fprog
= fprog
;
323 bpf_prog_destroy(orig_fp
);
328 static void lb_bpf_func_free(struct team
*team
)
330 struct lb_priv
*lb_priv
= get_lb_priv(team
);
333 if (!lb_priv
->ex
->orig_fprog
)
336 __fprog_destroy(lb_priv
->ex
->orig_fprog
);
337 fp
= rcu_dereference_protected(lb_priv
->fp
,
338 lockdep_is_held(&team
->lock
));
339 bpf_prog_destroy(fp
);
342 static int lb_tx_method_get(struct team
*team
, struct team_gsetter_ctx
*ctx
)
344 struct lb_priv
*lb_priv
= get_lb_priv(team
);
345 lb_select_tx_port_func_t
*func
;
348 func
= rcu_dereference_protected(lb_priv
->select_tx_port_func
,
349 lockdep_is_held(&team
->lock
));
350 name
= lb_select_tx_port_get_name(func
);
352 ctx
->data
.str_val
= name
;
356 static int lb_tx_method_set(struct team
*team
, struct team_gsetter_ctx
*ctx
)
358 struct lb_priv
*lb_priv
= get_lb_priv(team
);
359 lb_select_tx_port_func_t
*func
;
361 func
= lb_select_tx_port_get_func(ctx
->data
.str_val
);
364 rcu_assign_pointer(lb_priv
->select_tx_port_func
, func
);
368 static int lb_tx_hash_to_port_mapping_init(struct team
*team
,
369 struct team_option_inst_info
*info
)
371 struct lb_priv
*lb_priv
= get_lb_priv(team
);
372 unsigned char hash
= info
->array_index
;
374 LB_HTPM_OPT_INST_INFO_BY_HASH(lb_priv
, hash
) = info
;
378 static int lb_tx_hash_to_port_mapping_get(struct team
*team
,
379 struct team_gsetter_ctx
*ctx
)
381 struct lb_priv
*lb_priv
= get_lb_priv(team
);
382 struct team_port
*port
;
383 unsigned char hash
= ctx
->info
->array_index
;
385 port
= LB_HTPM_PORT_BY_HASH(lb_priv
, hash
);
386 ctx
->data
.u32_val
= port
? port
->dev
->ifindex
: 0;
390 static int lb_tx_hash_to_port_mapping_set(struct team
*team
,
391 struct team_gsetter_ctx
*ctx
)
393 struct lb_priv
*lb_priv
= get_lb_priv(team
);
394 struct team_port
*port
;
395 unsigned char hash
= ctx
->info
->array_index
;
397 list_for_each_entry(port
, &team
->port_list
, list
) {
398 if (ctx
->data
.u32_val
== port
->dev
->ifindex
&&
399 team_port_enabled(port
)) {
400 rcu_assign_pointer(LB_HTPM_PORT_BY_HASH(lb_priv
, hash
),
408 static int lb_hash_stats_init(struct team
*team
,
409 struct team_option_inst_info
*info
)
411 struct lb_priv
*lb_priv
= get_lb_priv(team
);
412 unsigned char hash
= info
->array_index
;
414 lb_priv
->ex
->stats
.info
[hash
].opt_inst_info
= info
;
418 static int lb_hash_stats_get(struct team
*team
, struct team_gsetter_ctx
*ctx
)
420 struct lb_priv
*lb_priv
= get_lb_priv(team
);
421 unsigned char hash
= ctx
->info
->array_index
;
423 ctx
->data
.bin_val
.ptr
= &lb_priv
->ex
->stats
.info
[hash
].stats
;
424 ctx
->data
.bin_val
.len
= sizeof(struct lb_stats
);
428 static int lb_port_stats_init(struct team
*team
,
429 struct team_option_inst_info
*info
)
431 struct team_port
*port
= info
->port
;
432 struct lb_port_priv
*lb_port_priv
= get_lb_port_priv(port
);
434 lb_port_priv
->stats_info
.opt_inst_info
= info
;
438 static int lb_port_stats_get(struct team
*team
, struct team_gsetter_ctx
*ctx
)
440 struct team_port
*port
= ctx
->info
->port
;
441 struct lb_port_priv
*lb_port_priv
= get_lb_port_priv(port
);
443 ctx
->data
.bin_val
.ptr
= &lb_port_priv
->stats_info
.stats
;
444 ctx
->data
.bin_val
.len
= sizeof(struct lb_stats
);
448 static void __lb_stats_info_refresh_prepare(struct lb_stats_info
*s_info
)
450 memcpy(&s_info
->last_stats
, &s_info
->stats
, sizeof(struct lb_stats
));
451 memset(&s_info
->stats
, 0, sizeof(struct lb_stats
));
454 static bool __lb_stats_info_refresh_check(struct lb_stats_info
*s_info
,
457 if (memcmp(&s_info
->last_stats
, &s_info
->stats
,
458 sizeof(struct lb_stats
))) {
459 team_option_inst_set_change(s_info
->opt_inst_info
);
465 static void __lb_one_cpu_stats_add(struct lb_stats
*acc_stats
,
466 struct lb_stats
*cpu_stats
,
467 struct u64_stats_sync
*syncp
)
473 start
= u64_stats_fetch_begin_irq(syncp
);
474 tmp
.tx_bytes
= cpu_stats
->tx_bytes
;
475 } while (u64_stats_fetch_retry_irq(syncp
, start
));
476 acc_stats
->tx_bytes
+= tmp
.tx_bytes
;
479 static void lb_stats_refresh(struct work_struct
*work
)
482 struct lb_priv
*lb_priv
;
483 struct lb_priv_ex
*lb_priv_ex
;
484 struct lb_pcpu_stats
*pcpu_stats
;
485 struct lb_stats
*stats
;
486 struct lb_stats_info
*s_info
;
487 struct team_port
*port
;
488 bool changed
= false;
492 lb_priv_ex
= container_of(work
, struct lb_priv_ex
,
493 stats
.refresh_dw
.work
);
495 team
= lb_priv_ex
->team
;
496 lb_priv
= get_lb_priv(team
);
498 if (!mutex_trylock(&team
->lock
)) {
499 schedule_delayed_work(&lb_priv_ex
->stats
.refresh_dw
, 0);
503 for (j
= 0; j
< LB_TX_HASHTABLE_SIZE
; j
++) {
504 s_info
= &lb_priv
->ex
->stats
.info
[j
];
505 __lb_stats_info_refresh_prepare(s_info
);
506 for_each_possible_cpu(i
) {
507 pcpu_stats
= per_cpu_ptr(lb_priv
->pcpu_stats
, i
);
508 stats
= &pcpu_stats
->hash_stats
[j
];
509 __lb_one_cpu_stats_add(&s_info
->stats
, stats
,
512 changed
|= __lb_stats_info_refresh_check(s_info
, team
);
515 list_for_each_entry(port
, &team
->port_list
, list
) {
516 struct lb_port_priv
*lb_port_priv
= get_lb_port_priv(port
);
518 s_info
= &lb_port_priv
->stats_info
;
519 __lb_stats_info_refresh_prepare(s_info
);
520 for_each_possible_cpu(i
) {
521 pcpu_stats
= per_cpu_ptr(lb_priv
->pcpu_stats
, i
);
522 stats
= per_cpu_ptr(lb_port_priv
->pcpu_stats
, i
);
523 __lb_one_cpu_stats_add(&s_info
->stats
, stats
,
526 changed
|= __lb_stats_info_refresh_check(s_info
, team
);
530 team_options_change_check(team
);
532 schedule_delayed_work(&lb_priv_ex
->stats
.refresh_dw
,
533 (lb_priv_ex
->stats
.refresh_interval
* HZ
) / 10);
535 mutex_unlock(&team
->lock
);
538 static int lb_stats_refresh_interval_get(struct team
*team
,
539 struct team_gsetter_ctx
*ctx
)
541 struct lb_priv
*lb_priv
= get_lb_priv(team
);
543 ctx
->data
.u32_val
= lb_priv
->ex
->stats
.refresh_interval
;
547 static int lb_stats_refresh_interval_set(struct team
*team
,
548 struct team_gsetter_ctx
*ctx
)
550 struct lb_priv
*lb_priv
= get_lb_priv(team
);
551 unsigned int interval
;
553 interval
= ctx
->data
.u32_val
;
554 if (lb_priv
->ex
->stats
.refresh_interval
== interval
)
556 lb_priv
->ex
->stats
.refresh_interval
= interval
;
558 schedule_delayed_work(&lb_priv
->ex
->stats
.refresh_dw
, 0);
560 cancel_delayed_work(&lb_priv
->ex
->stats
.refresh_dw
);
564 static const struct team_option lb_options
[] = {
566 .name
= "bpf_hash_func",
567 .type
= TEAM_OPTION_TYPE_BINARY
,
568 .getter
= lb_bpf_func_get
,
569 .setter
= lb_bpf_func_set
,
572 .name
= "lb_tx_method",
573 .type
= TEAM_OPTION_TYPE_STRING
,
574 .getter
= lb_tx_method_get
,
575 .setter
= lb_tx_method_set
,
578 .name
= "lb_tx_hash_to_port_mapping",
579 .array_size
= LB_TX_HASHTABLE_SIZE
,
580 .type
= TEAM_OPTION_TYPE_U32
,
581 .init
= lb_tx_hash_to_port_mapping_init
,
582 .getter
= lb_tx_hash_to_port_mapping_get
,
583 .setter
= lb_tx_hash_to_port_mapping_set
,
586 .name
= "lb_hash_stats",
587 .array_size
= LB_TX_HASHTABLE_SIZE
,
588 .type
= TEAM_OPTION_TYPE_BINARY
,
589 .init
= lb_hash_stats_init
,
590 .getter
= lb_hash_stats_get
,
593 .name
= "lb_port_stats",
595 .type
= TEAM_OPTION_TYPE_BINARY
,
596 .init
= lb_port_stats_init
,
597 .getter
= lb_port_stats_get
,
600 .name
= "lb_stats_refresh_interval",
601 .type
= TEAM_OPTION_TYPE_U32
,
602 .getter
= lb_stats_refresh_interval_get
,
603 .setter
= lb_stats_refresh_interval_set
,
607 static int lb_init(struct team
*team
)
609 struct lb_priv
*lb_priv
= get_lb_priv(team
);
610 lb_select_tx_port_func_t
*func
;
613 /* set default tx port selector */
614 func
= lb_select_tx_port_get_func("hash");
616 rcu_assign_pointer(lb_priv
->select_tx_port_func
, func
);
618 lb_priv
->ex
= kzalloc(sizeof(*lb_priv
->ex
), GFP_KERNEL
);
621 lb_priv
->ex
->team
= team
;
623 lb_priv
->pcpu_stats
= alloc_percpu(struct lb_pcpu_stats
);
624 if (!lb_priv
->pcpu_stats
) {
626 goto err_alloc_pcpu_stats
;
629 for_each_possible_cpu(i
) {
630 struct lb_pcpu_stats
*team_lb_stats
;
631 team_lb_stats
= per_cpu_ptr(lb_priv
->pcpu_stats
, i
);
632 u64_stats_init(&team_lb_stats
->syncp
);
636 INIT_DELAYED_WORK(&lb_priv
->ex
->stats
.refresh_dw
, lb_stats_refresh
);
638 err
= team_options_register(team
, lb_options
, ARRAY_SIZE(lb_options
));
640 goto err_options_register
;
643 err_options_register
:
644 free_percpu(lb_priv
->pcpu_stats
);
645 err_alloc_pcpu_stats
:
650 static void lb_exit(struct team
*team
)
652 struct lb_priv
*lb_priv
= get_lb_priv(team
);
654 team_options_unregister(team
, lb_options
,
655 ARRAY_SIZE(lb_options
));
656 lb_bpf_func_free(team
);
657 cancel_delayed_work_sync(&lb_priv
->ex
->stats
.refresh_dw
);
658 free_percpu(lb_priv
->pcpu_stats
);
662 static int lb_port_enter(struct team
*team
, struct team_port
*port
)
664 struct lb_port_priv
*lb_port_priv
= get_lb_port_priv(port
);
666 lb_port_priv
->pcpu_stats
= alloc_percpu(struct lb_stats
);
667 if (!lb_port_priv
->pcpu_stats
)
672 static void lb_port_leave(struct team
*team
, struct team_port
*port
)
674 struct lb_port_priv
*lb_port_priv
= get_lb_port_priv(port
);
676 free_percpu(lb_port_priv
->pcpu_stats
);
/* A disabled port must not stay in the hash -> port mapping table. */
static void lb_port_disabled(struct team *team, struct team_port *port)
{
	lb_tx_hash_to_port_mapping_null_port(team, port);
}
684 static const struct team_mode_ops lb_mode_ops
= {
687 .port_enter
= lb_port_enter
,
688 .port_leave
= lb_port_leave
,
689 .port_disabled
= lb_port_disabled
,
690 .receive
= lb_receive
,
691 .transmit
= lb_transmit
,
694 static const struct team_mode lb_mode
= {
695 .kind
= "loadbalance",
696 .owner
= THIS_MODULE
,
697 .priv_size
= sizeof(struct lb_priv
),
698 .port_priv_size
= sizeof(struct lb_port_priv
),
700 .lag_tx_type
= NETDEV_LAG_TX_TYPE_HASH
,
703 static int __init
lb_init_module(void)
705 return team_mode_register(&lb_mode
);
708 static void __exit
lb_cleanup_module(void)
710 team_mode_unregister(&lb_mode
);
713 module_init(lb_init_module
);
714 module_exit(lb_cleanup_module
);
716 MODULE_LICENSE("GPL v2");
717 MODULE_AUTHOR("Jiri Pirko <jpirko@redhat.com>");
718 MODULE_DESCRIPTION("Load-balancing mode for team");
719 MODULE_ALIAS_TEAM_MODE("loadbalance");