/*
 * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/netdevice.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/vport.h>
#include "mlx5_core.h"

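/* MLX5_LAG_FLAG_BONDED marks that the firmware LAG object currently
 * exists for this pair of physical functions.
 */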
enum {
	MLX5_LAG_FLAG_BONDED = 1 << 0,
};

struct lag_func {
	struct mlx5_core_dev *dev;
	struct net_device    *netdev;
};

/* Used for collection of netdev event info. */
struct lag_tracker {
	enum   netdev_lag_tx_type           tx_type;
	struct netdev_lag_lower_state_info  netdev_state[MLX5_MAX_PORTS];
	bool is_bonded;
};

/* LAG data of a ConnectX card.
 * It serves both its phys functions.
 */
struct mlx5_lag {
	u8                        flags;
	u8                        v2p_map[MLX5_MAX_PORTS];
	struct lag_func           pf[MLX5_MAX_PORTS];
	struct lag_tracker        tracker;
	struct delayed_work       bond_work;
	struct notifier_block     nb;
};

/* General purpose, use for short periods of time.
 * Beware of lock dependencies (preferably, no locks should be acquired
 * under it).
 */
static DEFINE_MUTEX(lag_mutex);

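/* Firmware command wrappers. CREATE_LAG binds the card's two ports into
 * one LAG object; the tx_remap_affinity_{1,2} fields of the lagc context
 * select which physical port carries the traffic of virtual ports 1 and 2.
 */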
static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 remap_port1,
			       u8 remap_port2)
{
	u32   in[MLX5_ST_SZ_DW(create_lag_in)]   = {0};
	u32   out[MLX5_ST_SZ_DW(create_lag_out)] = {0};
	void *lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx);

	MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG);

	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1);
	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}

static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 remap_port1,
			       u8 remap_port2)
{
	u32   in[MLX5_ST_SZ_DW(modify_lag_in)]   = {0};
	u32   out[MLX5_ST_SZ_DW(modify_lag_out)] = {0};
	void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);

	MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
	MLX5_SET(modify_lag_in, in, field_select, 0x1);

	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1);
	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}

static int mlx5_cmd_destroy_lag(struct mlx5_core_dev *dev)
{
	u32  in[MLX5_ST_SZ_DW(destroy_lag_in)]  = {0};
	u32 out[MLX5_ST_SZ_DW(destroy_lag_out)] = {0};

	MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}

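/* The vport LAG commands are exported rather than static; the expected
 * caller is the IB driver, which issues them when attaching to a bonded
 * device.
 */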
int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev)
{
	u32  in[MLX5_ST_SZ_DW(create_vport_lag_in)]  = {0};
	u32 out[MLX5_ST_SZ_DW(create_vport_lag_out)] = {0};

	MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
EXPORT_SYMBOL(mlx5_cmd_create_vport_lag);

int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
{
	u32  in[MLX5_ST_SZ_DW(destroy_vport_lag_in)]  = {0};
	u32 out[MLX5_ST_SZ_DW(destroy_vport_lag_out)] = {0};

	MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);

static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev)
{
	return dev->priv.lag;
}

static int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
				       struct net_device *ndev)
{
	int i;

	for (i = 0; i < MLX5_MAX_PORTS; i++)
		if (ldev->pf[i].netdev == ndev)
			return i;

	return -1;
}

static bool mlx5_lag_is_bonded(struct mlx5_lag *ldev)
{
	return !!(ldev->flags & MLX5_LAG_FLAG_BONDED);
}

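/* Derive the virtual-to-physical port mapping from the tracker state.
 * In active-backup mode both virtual ports follow the active slave
 * (e.g. slave 0 active: *port1 = *port2 = 1); otherwise the mapping is
 * 1:1 unless one link is down, in which case both virtual ports are
 * steered to the surviving physical port.
 */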
static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
					   u8 *port1, u8 *port2)
{
	if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
		if (tracker->netdev_state[0].tx_enabled) {
			*port1 = 1;
			*port2 = 1;
		} else {
			*port1 = 2;
			*port2 = 2;
		}
	} else {
		*port1 = 1;
		*port2 = 2;
		if (!tracker->netdev_state[0].link_up)
			*port1 = 2;
		else if (!tracker->netdev_state[1].link_up)
			*port2 = 1;
	}
}

static void mlx5_activate_lag(struct mlx5_lag *ldev,
			      struct lag_tracker *tracker)
{
	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
	int err;

	ldev->flags |= MLX5_LAG_FLAG_BONDED;

	mlx5_infer_tx_affinity_mapping(tracker, &ldev->v2p_map[0],
				       &ldev->v2p_map[1]);

	err = mlx5_cmd_create_lag(dev0, ldev->v2p_map[0], ldev->v2p_map[1]);
	if (err)
		mlx5_core_err(dev0,
			      "Failed to create LAG (%d)\n",
			      err);
}

static void mlx5_deactivate_lag(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
	int err;

	ldev->flags &= ~MLX5_LAG_FLAG_BONDED;

	err = mlx5_cmd_destroy_lag(dev0);
	if (err)
		mlx5_core_err(dev0,
			      "Failed to destroy LAG (%d)\n",
			      err);
}

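/* Reconcile hardware state with the latest tracker snapshot. Three
 * transitions are handled: becoming bonded (create the LAG and collapse
 * the two IB devices into one on PF0), staying bonded (update the port
 * affinity mapping if it changed), and unbonding (destroy the LAG and
 * restore the per-port IB devices).
 */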
static void mlx5_do_bond(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[1].dev;
	struct lag_tracker tracker;
	u8 v2p_port1, v2p_port2;
	int i, err;

	if (!dev0 || !dev1)
		return;

	mutex_lock(&lag_mutex);
	tracker = ldev->tracker;
	mutex_unlock(&lag_mutex);

	if (tracker.is_bonded && !mlx5_lag_is_bonded(ldev)) {
		if (mlx5_sriov_is_enabled(dev0) ||
		    mlx5_sriov_is_enabled(dev1)) {
			mlx5_core_warn(dev0, "LAG is not supported with SRIOV");
			return;
		}

		for (i = 0; i < MLX5_MAX_PORTS; i++)
			mlx5_remove_dev_by_protocol(ldev->pf[i].dev,
						    MLX5_INTERFACE_PROTOCOL_IB);

		mlx5_activate_lag(ldev, &tracker);

		mlx5_add_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB);
		mlx5_nic_vport_enable_roce(dev1);
	} else if (tracker.is_bonded && mlx5_lag_is_bonded(ldev)) {
		mlx5_infer_tx_affinity_mapping(&tracker, &v2p_port1,
					       &v2p_port2);

		if ((v2p_port1 != ldev->v2p_map[0]) ||
		    (v2p_port2 != ldev->v2p_map[1])) {
			ldev->v2p_map[0] = v2p_port1;
			ldev->v2p_map[1] = v2p_port2;

			err = mlx5_cmd_modify_lag(dev0, v2p_port1, v2p_port2);
			if (err)
				mlx5_core_err(dev0,
					      "Failed to modify LAG (%d)\n",
					      err);
		}
	} else if (!tracker.is_bonded && mlx5_lag_is_bonded(ldev)) {
		mlx5_remove_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB);
		mlx5_nic_vport_disable_roce(dev1);

		mlx5_deactivate_lag(ldev);

		for (i = 0; i < MLX5_MAX_PORTS; i++)
			if (ldev->pf[i].dev)
				mlx5_add_dev_by_protocol(ldev->pf[i].dev,
							 MLX5_INTERFACE_PROTOCOL_IB);
	}
}

static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
{
	schedule_delayed_work(&ldev->bond_work, delay);
}

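/* Workqueue handler. If the device-list lock cannot be taken right away,
 * the work is re-queued with a one second delay rather than blocking.
 */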
static void mlx5_do_bond_work(struct work_struct *work)
{
	struct delayed_work *delayed_work = to_delayed_work(work);
	struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
					     bond_work);
	int status;

	status = mlx5_dev_list_trylock();
	if (!status) {
		/* 1 sec delay. */
		mlx5_queue_bond_work(ldev, HZ);
		return;
	}

	mlx5_do_bond(ldev);
	mlx5_dev_list_unlock();
}

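/* bond_status is a bitmap of our netdevs found among the upper device's
 * slaves; 0x3 means both of this card's ports are enslaved, and together
 * with num_slaves == MLX5_MAX_PORTS it means they are the only slaves.
 */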
static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
					 struct lag_tracker *tracker,
					 struct net_device *ndev,
					 struct netdev_notifier_changeupper_info *info)
{
	struct net_device *upper = info->upper_dev, *ndev_tmp;
	struct netdev_lag_upper_info *lag_upper_info = NULL;
	bool is_bonded;
	int bond_status = 0;
	int num_slaves = 0;
	int idx;

	if (!netif_is_lag_master(upper))
		return 0;

	if (info->linking)
		lag_upper_info = info->upper_info;

	/* The event may still be of interest if the slave does not belong to
	 * us, but is enslaved to a master which has one or more of our netdevs
	 * as slaves (e.g., if a new slave is added to a master that bonds two
	 * of our netdevs, we should unbond).
	 */
	rcu_read_lock();
	for_each_netdev_in_bond_rcu(upper, ndev_tmp) {
		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
		if (idx > -1)
			bond_status |= (1 << idx);

		num_slaves++;
	}
	rcu_read_unlock();

	/* None of this lagdev's netdevs are slaves of this master. */
	if (!(bond_status & 0x3))
		return 0;

	if (lag_upper_info)
		tracker->tx_type = lag_upper_info->tx_type;

	/* Determine bonding status:
	 * A device is considered bonded if both its physical ports are slaves
	 * of the same lag master, and only them.
	 * Lag mode must be activebackup or hash.
	 */
	is_bonded = (num_slaves == MLX5_MAX_PORTS) &&
		    (bond_status == 0x3) &&
		    ((tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ||
		     (tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH));

	if (tracker->is_bonded != is_bonded) {
		tracker->is_bonded = is_bonded;
		return 1;
	}

	return 0;
}

static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev,
					      struct lag_tracker *tracker,
					      struct net_device *ndev,
					      struct netdev_notifier_changelowerstate_info *info)
{
	struct netdev_lag_lower_state_info *lag_lower_info;
	int idx;

	if (!netif_is_lag_port(ndev))
		return 0;

	idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev);
	if (idx == -1)
		return 0;

	/* This information is used to determine virtual to physical
	 * port mapping.
	 */
	lag_lower_info = info->lower_state_info;
	if (!lag_lower_info)
		return 0;

	tracker->netdev_state[idx] = *lag_lower_info;

	return 1;
}

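/* Notifier entry point. The handlers operate on a local copy of the
 * tracker, which is then published under lag_mutex; the actual bonding
 * work is deferred to the workqueue, presumably to keep firmware commands
 * and device-list locking out of the notifier call chain.
 */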
static int mlx5_lag_netdev_event(struct notifier_block *this,
				 unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
	struct lag_tracker tracker;
	struct mlx5_lag *ldev;
	int changed = 0;

	if (!net_eq(dev_net(ndev), &init_net))
		return NOTIFY_DONE;

	if ((event != NETDEV_CHANGEUPPER) && (event != NETDEV_CHANGELOWERSTATE))
		return NOTIFY_DONE;

	ldev    = container_of(this, struct mlx5_lag, nb);
	tracker = ldev->tracker;

	switch (event) {
	case NETDEV_CHANGEUPPER:
		changed = mlx5_handle_changeupper_event(ldev, &tracker, ndev,
							ptr);
		break;
	case NETDEV_CHANGELOWERSTATE:
		changed = mlx5_handle_changelowerstate_event(ldev, &tracker,
							     ndev, ptr);
		break;
	}

	mutex_lock(&lag_mutex);
	ldev->tracker = tracker;
	mutex_unlock(&lag_mutex);

	if (changed)
		mlx5_queue_bond_work(ldev, 0);

	return NOTIFY_DONE;
}

static struct mlx5_lag *mlx5_lag_dev_alloc(void)
{
	struct mlx5_lag *ldev;

	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
	if (!ldev)
		return NULL;

	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);

	return ldev;
}

static void mlx5_lag_dev_free(struct mlx5_lag *ldev)
{
	kfree(ldev);
}

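/* A PF's slot in the LAG structure is keyed by its PCI function number,
 * relying on the two PFs of a card being functions 0 and 1.
 */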
static void mlx5_lag_dev_add_pf(struct mlx5_lag *ldev,
				struct mlx5_core_dev *dev,
				struct net_device *netdev)
{
	unsigned int fn = PCI_FUNC(dev->pdev->devfn);

	if (fn >= MLX5_MAX_PORTS)
		return;

	mutex_lock(&lag_mutex);
	ldev->pf[fn].dev    = dev;
	ldev->pf[fn].netdev = netdev;
	ldev->tracker.netdev_state[fn].link_up = 0;
	ldev->tracker.netdev_state[fn].tx_enabled = 0;

	dev->priv.lag = ldev;
	mutex_unlock(&lag_mutex);
}

static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev,
				   struct mlx5_core_dev *dev)
{
	int i;

	for (i = 0; i < MLX5_MAX_PORTS; i++)
		if (ldev->pf[i].dev == dev)
			break;

	if (i == MLX5_MAX_PORTS)
		return;

	mutex_lock(&lag_mutex);
	memset(&ldev->pf[i], 0, sizeof(*ldev->pf));

	dev->priv.lag = NULL;
	mutex_unlock(&lag_mutex);
}

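/* One mlx5_lag structure serves both PFs of a card: the second PF to
 * probe finds it through its sibling (mlx5_get_next_phys_dev) instead of
 * allocating anew, and the netdev notifier is registered only once.
 */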
/* Must be called with intf_mutex held */
void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
{
	struct mlx5_lag *ldev = NULL;
	struct mlx5_core_dev *tmp_dev;

	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
	    !MLX5_CAP_GEN(dev, lag_master) ||
	    (MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_MAX_PORTS))
		return;

	tmp_dev = mlx5_get_next_phys_dev(dev);
	if (tmp_dev)
		ldev = tmp_dev->priv.lag;

	if (!ldev) {
		ldev = mlx5_lag_dev_alloc();
		if (!ldev) {
			mlx5_core_err(dev, "Failed to alloc lag dev\n");
			return;
		}
	}

	mlx5_lag_dev_add_pf(ldev, dev, netdev);

	if (!ldev->nb.notifier_call) {
		ldev->nb.notifier_call = mlx5_lag_netdev_event;
		if (register_netdevice_notifier(&ldev->nb)) {
			ldev->nb.notifier_call = NULL;
			mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
		}
	}
}

/* Must be called with intf_mutex held */
void mlx5_lag_remove(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	int i;

	ldev = mlx5_lag_dev_get(dev);
	if (!ldev)
		return;

	if (mlx5_lag_is_bonded(ldev))
		mlx5_deactivate_lag(ldev);

	mlx5_lag_dev_remove_pf(ldev, dev);

	for (i = 0; i < MLX5_MAX_PORTS; i++)
		if (ldev->pf[i].dev)
			break;

	if (i == MLX5_MAX_PORTS) {
		if (ldev->nb.notifier_call)
			unregister_netdevice_notifier(&ldev->nb);
		cancel_delayed_work_sync(&ldev->bond_work);
		mlx5_lag_dev_free(ldev);
	}
}

bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	bool res;

	mutex_lock(&lag_mutex);
	ldev = mlx5_lag_dev_get(dev);
	res  = ldev && mlx5_lag_is_bonded(ldev);
	mutex_unlock(&lag_mutex);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_active);

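/* Return the netdev currently carrying RoCE traffic: the active slave in
 * active-backup mode, PF0's netdev otherwise. A reference is taken on
 * the returned netdev; the caller is expected to dev_put() it.
 */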
struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
{
	struct net_device *ndev = NULL;
	struct mlx5_lag *ldev;

	mutex_lock(&lag_mutex);
	ldev = mlx5_lag_dev_get(dev);

	if (!(ldev && mlx5_lag_is_bonded(ldev)))
		goto unlock;

	if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
		ndev = ldev->tracker.netdev_state[0].tx_enabled ?
		       ldev->pf[0].netdev : ldev->pf[1].netdev;
	} else {
		ndev = ldev->pf[0].netdev;
	}
	if (ndev)
		dev_hold(ndev);

unlock:
	mutex_unlock(&lag_mutex);

	return ndev;
}
EXPORT_SYMBOL(mlx5_lag_get_roce_netdev);

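/* Gate interface attachment: while bonded, only PF0 exposes an IB
 * device, so IB attachment to PF1 is refused here.
 */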
573 bool mlx5_lag_intf_add(struct mlx5_interface
*intf
, struct mlx5_priv
*priv
)
575 struct mlx5_core_dev
*dev
= container_of(priv
, struct mlx5_core_dev
,
577 struct mlx5_lag
*ldev
;
579 if (intf
->protocol
!= MLX5_INTERFACE_PROTOCOL_IB
)
582 ldev
= mlx5_lag_dev_get(dev
);
583 if (!ldev
|| !mlx5_lag_is_bonded(ldev
) || ldev
->pf
[0].dev
== dev
)
586 /* If bonded, we do not add an IB device for PF1. */