/*
 * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/netdevice.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/vport.h>
#include "mlx5_core.h"
enum {
	MLX5_LAG_FLAG_BONDED = 1 << 0,
};

struct lag_func {
	struct mlx5_core_dev *dev;
	struct net_device    *netdev;
};

/* Used for collection of netdev event info. */
struct lag_tracker {
	enum   netdev_lag_tx_type           tx_type;
	struct netdev_lag_lower_state_info  netdev_state[MLX5_MAX_PORTS];
	bool is_bonded;
};
/* LAG data of a ConnectX card.
 * It serves both its phys functions.
 */
struct mlx5_lag {
	u8                        flags;
	u8                        v2p_map[MLX5_MAX_PORTS];
	struct lag_func           pf[MLX5_MAX_PORTS];
	struct lag_tracker        tracker;
	struct delayed_work       bond_work;
	struct notifier_block     nb;

	/* Admin state. Allow lag only if allowed is true
	 * even if network conditions for lag were met.
	 */
	bool                      allowed;
};
/* General purpose, use for short periods of time.
 * Beware of lock dependencies (preferably, no locks should be acquired
 * under it).
 */
static DEFINE_MUTEX(lag_mutex);
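
/* The mlx5_cmd_*() helpers below issue firmware commands.  Each one sizes
 * its input/output mailboxes with MLX5_ST_SZ_DW(), fills individual fields
 * with MLX5_SET() according to the command layouts in mlx5_ifc.h, and then
 * executes the command synchronously through mlx5_cmd_exec().
 */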
static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 remap_port1,
			       u8 remap_port2)
{
	u32   in[MLX5_ST_SZ_DW(create_lag_in)]   = {0};
	u32   out[MLX5_ST_SZ_DW(create_lag_out)] = {0};
	void *lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx);

	MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG);

	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1);
	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 remap_port1,
			       u8 remap_port2)
{
	u32   in[MLX5_ST_SZ_DW(modify_lag_in)]   = {0};
	u32   out[MLX5_ST_SZ_DW(modify_lag_out)] = {0};
	void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);

	MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
	MLX5_SET(modify_lag_in, in, field_select, 0x1);

	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1);
	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
static int mlx5_cmd_destroy_lag(struct mlx5_core_dev *dev)
{
	u32  in[MLX5_ST_SZ_DW(destroy_lag_in)]  = {0};
	u32 out[MLX5_ST_SZ_DW(destroy_lag_out)] = {0};

	MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
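
/* Unlike the static LAG commands above, the vport LAG commands are
 * exported because they are invoked from outside this file; the IB/RoCE
 * side of the driver is the expected consumer when RoCE runs on top of a
 * bonded port.
 */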
int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev)
{
	u32  in[MLX5_ST_SZ_DW(create_vport_lag_in)]  = {0};
	u32 out[MLX5_ST_SZ_DW(create_vport_lag_out)] = {0};

	MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
EXPORT_SYMBOL(mlx5_cmd_create_vport_lag);
int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
{
	u32  in[MLX5_ST_SZ_DW(destroy_vport_lag_in)]  = {0};
	u32 out[MLX5_ST_SZ_DW(destroy_vport_lag_out)] = {0};

	MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);
static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev)
{
	return dev->priv.lag;
}
static int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
				       struct net_device *ndev)
{
	int i;

	for (i = 0; i < MLX5_MAX_PORTS; i++)
		if (ldev->pf[i].netdev == ndev)
			return i;

	return -1;
}
static bool mlx5_lag_is_bonded(struct mlx5_lag *ldev)
{
	return !!(ldev->flags & MLX5_LAG_FLAG_BONDED);
}
static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
					   u8 *port1, u8 *port2)
{
	*port1 = 1;
	*port2 = 2;
	if (!tracker->netdev_state[0].tx_enabled ||
	    !tracker->netdev_state[0].link_up) {
		*port1 = 2;
		return;
	}

	if (!tracker->netdev_state[1].tx_enabled ||
	    !tracker->netdev_state[1].link_up)
		*port2 = 1;
}
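
/* Worked example of the mapping above (ports are 1-based for firmware):
 *   both netdevs up and tx_enabled  -> *port1 = 1, *port2 = 2 (no remap)
 *   netdev 0 down or tx disabled    -> *port1 = 2, *port2 = 2 (all TX on 2)
 *   netdev 1 down or tx disabled    -> *port1 = 1, *port2 = 1 (all TX on 1)
 * When both are down, the first branch wins and everything maps to port 2.
 */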
static void mlx5_activate_lag(struct mlx5_lag *ldev,
			      struct lag_tracker *tracker)
{
	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
	int err;

	ldev->flags |= MLX5_LAG_FLAG_BONDED;

	mlx5_infer_tx_affinity_mapping(tracker, &ldev->v2p_map[0],
				       &ldev->v2p_map[1]);

	err = mlx5_cmd_create_lag(dev0, ldev->v2p_map[0], ldev->v2p_map[1]);
	if (err)
		mlx5_core_err(dev0,
			      "Failed to create LAG (%d)\n",
			      err);
}
static void mlx5_deactivate_lag(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
	int err;

	ldev->flags &= ~MLX5_LAG_FLAG_BONDED;

	err = mlx5_cmd_destroy_lag(dev0);
	if (err)
		mlx5_core_err(dev0,
			      "Failed to destroy LAG (%d)\n",
			      err);
}
static void mlx5_do_bond(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[1].dev;
	struct lag_tracker tracker;
	u8 v2p_port1, v2p_port2;
	int i, err;
	bool do_bond;

	if (!dev0 || !dev1)
		return;

	mutex_lock(&lag_mutex);
	tracker = ldev->tracker;
	mutex_unlock(&lag_mutex);

	do_bond = tracker.is_bonded && ldev->allowed;

	if (do_bond && !mlx5_lag_is_bonded(ldev)) {
		for (i = 0; i < MLX5_MAX_PORTS; i++)
			mlx5_remove_dev_by_protocol(ldev->pf[i].dev,
						    MLX5_INTERFACE_PROTOCOL_IB);

		mlx5_activate_lag(ldev, &tracker);

		mlx5_add_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB);
		mlx5_nic_vport_enable_roce(dev1);
	} else if (do_bond && mlx5_lag_is_bonded(ldev)) {
		mlx5_infer_tx_affinity_mapping(&tracker, &v2p_port1,
					       &v2p_port2);

		if ((v2p_port1 != ldev->v2p_map[0]) ||
		    (v2p_port2 != ldev->v2p_map[1])) {
			ldev->v2p_map[0] = v2p_port1;
			ldev->v2p_map[1] = v2p_port2;

			err = mlx5_cmd_modify_lag(dev0, v2p_port1, v2p_port2);
			if (err)
				mlx5_core_err(dev0,
					      "Failed to modify LAG (%d)\n",
					      err);
		}
	} else if (!do_bond && mlx5_lag_is_bonded(ldev)) {
		mlx5_remove_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB);
		mlx5_nic_vport_disable_roce(dev1);

		mlx5_deactivate_lag(ldev);

		for (i = 0; i < MLX5_MAX_PORTS; i++)
			if (ldev->pf[i].dev)
				mlx5_add_dev_by_protocol(ldev->pf[i].dev,
							 MLX5_INTERFACE_PROTOCOL_IB);
	}
}
static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
{
	schedule_delayed_work(&ldev->bond_work, delay);
}
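
/* Deferred work: mlx5_do_bond_work() runs from the shared workqueue.  If
 * the device-list lock cannot be taken right away, it requeues itself
 * with a one second delay instead of blocking the workqueue.
 */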
static void mlx5_do_bond_work(struct work_struct *work)
{
	struct delayed_work *delayed_work = to_delayed_work(work);
	struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
					     bond_work);
	int status;

	status = mlx5_dev_list_trylock();
	if (!status) {
		/* 1 sec delay. */
		mlx5_queue_bond_work(ldev, HZ);
		return;
	}

	mlx5_do_bond(ldev);
	mlx5_dev_list_unlock();
}
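
/* NETDEV_CHANGEUPPER handling.  bond_status is a bitmask with bit i set
 * when our netdev at pf[i] is a slave of the upper (bond) device; 0x3
 * means both physical ports are enslaved to the same master.  num_slaves
 * counts all slaves of that master, ours or not.
 */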
static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
					 struct lag_tracker *tracker,
					 struct net_device *ndev,
					 struct netdev_notifier_changeupper_info *info)
{
	struct net_device *upper = info->upper_dev, *ndev_tmp;
	struct netdev_lag_upper_info *lag_upper_info = NULL;
	bool is_bonded;
	int bond_status = 0;
	int num_slaves = 0;
	int idx;

	if (!netif_is_lag_master(upper))
		return 0;

	if (info->linking)
		lag_upper_info = info->upper_info;

	/* The event may still be of interest if the slave does not belong to
	 * us, but is enslaved to a master which has one or more of our netdevs
	 * as slaves (e.g., if a new slave is added to a master that bonds two
	 * of our netdevs, we should unbond).
	 */
	rcu_read_lock();
	for_each_netdev_in_bond_rcu(upper, ndev_tmp) {
		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
		if (idx > -1)
			bond_status |= (1 << idx);

		num_slaves++;
	}
	rcu_read_unlock();

	/* None of this lagdev's netdevs are slaves of this master. */
	if (!(bond_status & 0x3))
		return 0;

	if (lag_upper_info)
		tracker->tx_type = lag_upper_info->tx_type;

	/* Determine bonding status:
	 * A device is considered bonded if both its physical ports are slaves
	 * of the same lag master, and only them.
	 * Lag mode must be activebackup or hash.
	 */
	is_bonded = (num_slaves == MLX5_MAX_PORTS) &&
		    (bond_status == 0x3) &&
		    ((tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ||
		     (tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH));

	if (tracker->is_bonded != is_bonded) {
		tracker->is_bonded = is_bonded;
		return 1;
	}

	return 0;
}
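
/* NETDEV_CHANGELOWERSTATE handling: records the per-slave link and TX
 * state into the tracker so the TX affinity mapping can be recomputed.
 */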
static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev,
					      struct lag_tracker *tracker,
					      struct net_device *ndev,
					      struct netdev_notifier_changelowerstate_info *info)
{
	struct netdev_lag_lower_state_info *lag_lower_info;
	int idx;

	if (!netif_is_lag_port(ndev))
		return 0;

	idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev);
	if (idx == -1)
		return 0;

	/* This information is used to determine virtual to physical
	 * port mapping.
	 */
	lag_lower_info = info->lower_state_info;
	if (!lag_lower_info)
		return 0;

	tracker->netdev_state[idx] = *lag_lower_info;

	return 1;
}
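
/* Notifier entry point.  The tracker is copied out, updated by the event
 * handlers, and written back under lag_mutex, so readers always observe a
 * consistent snapshot.  The actual (de)activation is deferred to the work
 * item, which is allowed to sleep.
 */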
static int mlx5_lag_netdev_event(struct notifier_block *this,
				 unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
	struct lag_tracker tracker;
	struct mlx5_lag *ldev;
	int changed = 0;

	if (!net_eq(dev_net(ndev), &init_net))
		return NOTIFY_DONE;

	if ((event != NETDEV_CHANGEUPPER) && (event != NETDEV_CHANGELOWERSTATE))
		return NOTIFY_DONE;

	ldev    = container_of(this, struct mlx5_lag, nb);
	tracker = ldev->tracker;

	switch (event) {
	case NETDEV_CHANGEUPPER:
		changed = mlx5_handle_changeupper_event(ldev, &tracker, ndev,
							ptr);
		break;
	case NETDEV_CHANGELOWERSTATE:
		changed = mlx5_handle_changelowerstate_event(ldev, &tracker,
							     ndev, ptr);
		break;
	}

	mutex_lock(&lag_mutex);
	ldev->tracker = tracker;
	mutex_unlock(&lag_mutex);

	if (changed)
		mlx5_queue_bond_work(ldev, 0);

	return NOTIFY_DONE;
}
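
/* LAG and SRIOV are mutually exclusive here: bonding is disallowed while
 * SRIOV is enabled on either PF.
 */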
static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
{
	if ((ldev->pf[0].dev && mlx5_sriov_is_enabled(ldev->pf[0].dev)) ||
	    (ldev->pf[1].dev && mlx5_sriov_is_enabled(ldev->pf[1].dev)))
		return false;
	else
		return true;
}
static struct mlx5_lag *mlx5_lag_dev_alloc(void)
{
	struct mlx5_lag *ldev;

	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
	if (!ldev)
		return NULL;

	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
	ldev->allowed = mlx5_lag_check_prereq(ldev);

	return ldev;
}
static void mlx5_lag_dev_free(struct mlx5_lag *ldev)
{
	kfree(ldev);
}
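
/* Both PFs of the card share one mlx5_lag; the PCI function number of
 * each PF selects its slot in the pf[] array.
 */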
static void mlx5_lag_dev_add_pf(struct mlx5_lag *ldev,
				struct mlx5_core_dev *dev,
				struct net_device *netdev)
{
	unsigned int fn = PCI_FUNC(dev->pdev->devfn);

	if (fn >= MLX5_MAX_PORTS)
		return;

	mutex_lock(&lag_mutex);
	ldev->pf[fn].dev    = dev;
	ldev->pf[fn].netdev = netdev;
	ldev->tracker.netdev_state[fn].link_up = 0;
	ldev->tracker.netdev_state[fn].tx_enabled = 0;

	ldev->allowed = mlx5_lag_check_prereq(ldev);
	dev->priv.lag = ldev;

	mutex_unlock(&lag_mutex);
}
static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev,
				   struct mlx5_core_dev *dev)
{
	int i;

	for (i = 0; i < MLX5_MAX_PORTS; i++)
		if (ldev->pf[i].dev == dev)
			break;

	if (i == MLX5_MAX_PORTS)
		return;

	mutex_lock(&lag_mutex);
	memset(&ldev->pf[i], 0, sizeof(*ldev->pf));

	dev->priv.lag = NULL;
	ldev->allowed = mlx5_lag_check_prereq(ldev);
	mutex_unlock(&lag_mutex);
}
/* Must be called with intf_mutex held */
void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
{
	struct mlx5_lag *ldev = NULL;
	struct mlx5_core_dev *tmp_dev;

	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
	    !MLX5_CAP_GEN(dev, lag_master) ||
	    (MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_MAX_PORTS))
		return;

	tmp_dev = mlx5_get_next_phys_dev(dev);
	if (tmp_dev)
		ldev = tmp_dev->priv.lag;

	if (!ldev) {
		ldev = mlx5_lag_dev_alloc();
		if (!ldev) {
			mlx5_core_err(dev, "Failed to alloc lag dev\n");
			return;
		}
	}

	mlx5_lag_dev_add_pf(ldev, dev, netdev);

	if (!ldev->nb.notifier_call) {
		ldev->nb.notifier_call = mlx5_lag_netdev_event;
		if (register_netdevice_notifier(&ldev->nb)) {
			ldev->nb.notifier_call = NULL;
			mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
		}
	}
}
/* Must be called with intf_mutex held */
void mlx5_lag_remove(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	int i;

	ldev = mlx5_lag_dev_get(dev);
	if (!ldev)
		return;

	if (mlx5_lag_is_bonded(ldev))
		mlx5_deactivate_lag(ldev);

	mlx5_lag_dev_remove_pf(ldev, dev);

	for (i = 0; i < MLX5_MAX_PORTS; i++)
		if (ldev->pf[i].dev)
			break;

	if (i == MLX5_MAX_PORTS) {
		if (ldev->nb.notifier_call)
			unregister_netdevice_notifier(&ldev->nb);
		cancel_delayed_work_sync(&ldev->bond_work);
		mlx5_lag_dev_free(ldev);
	}
}
bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	bool res;

	mutex_lock(&lag_mutex);
	ldev = mlx5_lag_dev_get(dev);
	res  = ldev && mlx5_lag_is_bonded(ldev);
	mutex_unlock(&lag_mutex);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_active);
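
/* Admin control: mlx5_lag_forbid()/mlx5_lag_allow() toggle ldev->allowed
 * under the device-list lock and run mlx5_do_bond() synchronously, so a
 * forbidden LAG is torn down before the call returns.
 */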
static int mlx5_lag_set_state(struct mlx5_core_dev *dev, bool allow)
{
	struct mlx5_lag *ldev;
	int ret = 0;
	bool lag_active;

	mlx5_dev_list_lock();

	ldev = mlx5_lag_dev_get(dev);
	if (!ldev) {
		ret = -ENODEV;
		goto unlock;
	}
	lag_active = mlx5_lag_is_bonded(ldev);
	if (!mlx5_lag_check_prereq(ldev) && allow) {
		ret = -EINVAL;
		goto unlock;
	}
	if (ldev->allowed == allow)
		goto unlock;
	ldev->allowed = allow;
	if ((lag_active && !allow) || allow)
		mlx5_do_bond(ldev);
unlock:
	mlx5_dev_list_unlock();
	return ret;
}
int mlx5_lag_forbid(struct mlx5_core_dev *dev)
{
	return mlx5_lag_set_state(dev, false);
}

int mlx5_lag_allow(struct mlx5_core_dev *dev)
{
	return mlx5_lag_set_state(dev, true);
}
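
/* For RoCE, a single netdev represents the bond: in active-backup mode
 * the currently transmitting slave, otherwise (hash mode) PF0's netdev.
 * The returned netdev is held via dev_hold(); the caller must release it.
 */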
struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
{
	struct net_device *ndev = NULL;
	struct mlx5_lag *ldev;

	mutex_lock(&lag_mutex);
	ldev = mlx5_lag_dev_get(dev);

	if (!(ldev && mlx5_lag_is_bonded(ldev)))
		goto unlock;

	if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
		ndev = ldev->tracker.netdev_state[0].tx_enabled ?
		       ldev->pf[0].netdev : ldev->pf[1].netdev;
	} else {
		ndev = ldev->pf[0].netdev;
	}
	if (ndev)
		dev_hold(ndev);

unlock:
	mutex_unlock(&lag_mutex);

	return ndev;
}
EXPORT_SYMBOL(mlx5_lag_get_roce_netdev);
bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv)
{
	struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev,
						 priv);
	struct mlx5_lag *ldev;

	if (intf->protocol != MLX5_INTERFACE_PROTOCOL_IB)
		return true;

	ldev = mlx5_lag_dev_get(dev);
	if (!ldev || !mlx5_lag_is_bonded(ldev) || ldev->pf[0].dev == dev)
		return true;

	/* If bonded, we do not add an IB device for PF1. */
	return false;
}