2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "ofp-actions.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "ofproto/ofproto-dpif-rid.h"
32 #include "connectivity.h"
34 #include "dynamic-string.h"
43 #include "dp-packet.h"
44 #include "poll-loop.h"
50 #include "openvswitch/vlog.h"
52 VLOG_DEFINE_THIS_MODULE(bond
);
54 static struct ovs_rwlock rwlock
= OVS_RWLOCK_INITIALIZER
;
55 static struct hmap all_bonds__
= HMAP_INITIALIZER(&all_bonds__
);
56 static struct hmap
*const all_bonds
OVS_GUARDED_BY(rwlock
) = &all_bonds__
;
58 /* Bit-mask for hashing a flow down to a bucket. */
59 #define BOND_MASK 0xff
60 #define BOND_BUCKETS (BOND_MASK + 1)
62 /* A hash bucket for mapping a flow to a slave.
63 * "struct bond" has an array of BOND_BUCKETS of these. */
65 struct bond_slave
*slave
; /* Assigned slave, NULL if unassigned. */
66 uint64_t tx_bytes
/* Count of bytes recently transmitted. */
67 OVS_GUARDED_BY(rwlock
);
68 struct ovs_list list_node
; /* In bond_slave's 'entries' list. */
72 * 'pr_rule' is the post-recirculation rule for this entry.
73 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
74 * is used to determine delta (applied to 'tx_bytes' above.) */
76 uint64_t pr_tx_bytes
OVS_GUARDED_BY(rwlock
);
79 /* A bond slave, that is, one of the links comprising a bond. */
81 struct hmap_node hmap_node
; /* In struct bond's slaves hmap. */
82 struct ovs_list list_node
; /* In struct bond's enabled_slaves list. */
83 struct bond
*bond
; /* The bond that contains this slave. */
84 void *aux
; /* Client-provided handle for this slave. */
86 struct netdev
*netdev
; /* Network device, owned by the client. */
87 unsigned int change_seq
; /* Tracks changes in 'netdev'. */
88 ofp_port_t ofp_port
; /* OpenFlow port number. */
89 char *name
; /* Name (a copy of netdev_get_name(netdev)). */
92 long long delay_expires
; /* Time after which 'enabled' may change. */
93 bool enabled
; /* May be chosen for flows? */
94 bool may_enable
; /* Client considers this slave bondable. */
96 /* Rebalancing info. Used only by bond_rebalance(). */
97 struct ovs_list bal_node
; /* In bond_rebalance()'s 'bals' list. */
98 struct ovs_list entries
; /* 'struct bond_entry's assigned here. */
99 uint64_t tx_bytes
; /* Sum across 'tx_bytes' of entries. */
102 /* A bond, that is, a set of network devices grouped to improve performance or
105 struct hmap_node hmap_node
; /* In 'all_bonds' hmap. */
106 char *name
; /* Name provided by client. */
107 struct ofproto_dpif
*ofproto
; /* The bridge this bond belongs to. */
114 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
115 * (To prevent the bond_slave from disappearing they must also hold
117 struct ovs_mutex mutex
OVS_ACQ_AFTER(rwlock
);
118 struct ovs_list enabled_slaves OVS_GUARDED
; /* Contains struct bond_slaves. */
121 enum bond_mode balance
; /* Balancing mode, one of BM_*. */
122 struct bond_slave
*active_slave
;
123 int updelay
, downdelay
; /* Delay before slave goes up/down, in ms. */
124 enum lacp_status lacp_status
; /* Status of LACP negotiations. */
125 bool bond_revalidate
; /* True if flows need revalidation. */
126 uint32_t basis
; /* Basis for flow hash function. */
128 /* SLB specific bonding info. */
129 struct bond_entry
*hash
; /* An array of BOND_BUCKETS elements. */
130 int rebalance_interval
; /* Interval between rebalances, in ms. */
131 long long int next_rebalance
; /* Next rebalancing time. */
132 bool send_learning_packets
;
133 uint32_t recirc_id
; /* Non zero if recirculation can be used.*/
134 struct hmap pr_rule_ops
; /* Helps to maintain post recirculation rules.*/
136 /* Store active slave to OVSDB. */
137 bool active_slave_changed
; /* Set to true whenever the bond changes
138 active slave. It will be reset to false
139 after it is stored into OVSDB */
/* Interface name may not be persistent across an OS reboot, use
 * MAC address for identifying the active slave */
143 uint8_t active_slave_mac
[ETH_ADDR_LEN
];
144 /* The MAC address of the active interface. */
145 /* Legacy compatibility. */
146 bool lacp_fallback_ab
; /* Fallback to active-backup on LACP failure. */
148 struct ovs_refcount ref_cnt
;
/* What to do with a bond_pr_rule_op. */
153 ADD
, /* Add the rule to ofproto's flow table. */
154 DEL
, /* Delete the rule from the ofproto's flow table. */
157 /* A rule to add to or delete from ofproto's internal flow table. */
158 struct bond_pr_rule_op
{
159 struct hmap_node hmap_node
;
161 ofp_port_t out_ofport
;
163 struct rule
**pr_rule
;
166 static void bond_entry_reset(struct bond
*) OVS_REQ_WRLOCK(rwlock
);
167 static struct bond_slave
*bond_slave_lookup(struct bond
*, const void *slave_
)
168 OVS_REQ_RDLOCK(rwlock
);
169 static void bond_enable_slave(struct bond_slave
*, bool enable
)
170 OVS_REQ_WRLOCK(rwlock
);
171 static void bond_link_status_update(struct bond_slave
*)
172 OVS_REQ_WRLOCK(rwlock
);
173 static void bond_choose_active_slave(struct bond
*)
174 OVS_REQ_WRLOCK(rwlock
);
175 static unsigned int bond_hash_src(const uint8_t mac
[ETH_ADDR_LEN
],
176 uint16_t vlan
, uint32_t basis
);
177 static unsigned int bond_hash_tcp(const struct flow
*, uint16_t vlan
,
179 static struct bond_entry
*lookup_bond_entry(const struct bond
*,
182 OVS_REQ_RDLOCK(rwlock
);
183 static struct bond_slave
*get_enabled_slave(struct bond
*)
184 OVS_REQ_RDLOCK(rwlock
);
185 static struct bond_slave
*choose_output_slave(const struct bond
*,
187 struct flow_wildcards
*,
189 OVS_REQ_RDLOCK(rwlock
);
191 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
192 * stores the mode in '*balance' and returns true. Otherwise returns false
193 * without modifying '*balance'. */
195 bond_mode_from_string(enum bond_mode
*balance
, const char *s
)
197 if (!strcmp(s
, bond_mode_to_string(BM_TCP
))) {
199 } else if (!strcmp(s
, bond_mode_to_string(BM_SLB
))) {
201 } else if (!strcmp(s
, bond_mode_to_string(BM_AB
))) {
209 /* Returns a string representing 'balance'. */
211 bond_mode_to_string(enum bond_mode balance
) {
214 return "balance-tcp";
216 return "balance-slb";
218 return "active-backup";
224 /* Creates and returns a new bond whose configuration is initially taken from
227 * The caller should register each slave on the new bond by calling
228 * bond_slave_register(). */
230 bond_create(const struct bond_settings
*s
, struct ofproto_dpif
*ofproto
)
234 bond
= xzalloc(sizeof *bond
);
235 bond
->ofproto
= ofproto
;
236 hmap_init(&bond
->slaves
);
237 list_init(&bond
->enabled_slaves
);
238 ovs_mutex_init(&bond
->mutex
);
239 ovs_refcount_init(&bond
->ref_cnt
);
242 hmap_init(&bond
->pr_rule_ops
);
244 bond_reconfigure(bond
, s
);
249 bond_ref(const struct bond
*bond_
)
251 struct bond
*bond
= CONST_CAST(struct bond
*, bond_
);
254 ovs_refcount_ref(&bond
->ref_cnt
);
261 bond_unref(struct bond
*bond
)
263 struct bond_slave
*slave
, *next_slave
;
264 struct bond_pr_rule_op
*pr_op
, *next_op
;
266 if (!bond
|| ovs_refcount_unref_relaxed(&bond
->ref_cnt
) != 1) {
270 ovs_rwlock_wrlock(&rwlock
);
271 hmap_remove(all_bonds
, &bond
->hmap_node
);
272 ovs_rwlock_unlock(&rwlock
);
274 HMAP_FOR_EACH_SAFE (slave
, next_slave
, hmap_node
, &bond
->slaves
) {
275 hmap_remove(&bond
->slaves
, &slave
->hmap_node
);
276 /* Client owns 'slave->netdev'. */
280 hmap_destroy(&bond
->slaves
);
282 ovs_mutex_destroy(&bond
->mutex
);
286 HMAP_FOR_EACH_SAFE(pr_op
, next_op
, hmap_node
, &bond
->pr_rule_ops
) {
287 hmap_remove(&bond
->pr_rule_ops
, &pr_op
->hmap_node
);
290 hmap_destroy(&bond
->pr_rule_ops
);
292 if (bond
->recirc_id
) {
293 recirc_free_id(bond
->recirc_id
);
300 add_pr_rule(struct bond
*bond
, const struct match
*match
,
301 ofp_port_t out_ofport
, struct rule
**rule
)
303 uint32_t hash
= match_hash(match
, 0);
304 struct bond_pr_rule_op
*pr_op
;
306 HMAP_FOR_EACH_WITH_HASH(pr_op
, hmap_node
, hash
, &bond
->pr_rule_ops
) {
307 if (match_equal(&pr_op
->match
, match
)) {
309 pr_op
->out_ofport
= out_ofport
;
310 pr_op
->pr_rule
= rule
;
315 pr_op
= xmalloc(sizeof *pr_op
);
316 pr_op
->match
= *match
;
318 pr_op
->out_ofport
= out_ofport
;
319 pr_op
->pr_rule
= rule
;
320 hmap_insert(&bond
->pr_rule_ops
, &pr_op
->hmap_node
, hash
);
324 update_recirc_rules(struct bond
*bond
)
325 OVS_REQ_WRLOCK(rwlock
)
328 struct bond_pr_rule_op
*pr_op
, *next_op
;
329 uint64_t ofpacts_stub
[128 / 8];
330 struct ofpbuf ofpacts
;
333 ofpbuf_use_stub(&ofpacts
, ofpacts_stub
, sizeof ofpacts_stub
);
335 HMAP_FOR_EACH(pr_op
, hmap_node
, &bond
->pr_rule_ops
) {
339 if (bond
->hash
&& bond
->recirc_id
) {
340 for (i
= 0; i
< BOND_BUCKETS
; i
++) {
341 struct bond_slave
*slave
= bond
->hash
[i
].slave
;
344 match_init_catchall(&match
);
345 match_set_recirc_id(&match
, bond
->recirc_id
);
346 match_set_dp_hash_masked(&match
, i
, BOND_MASK
);
348 add_pr_rule(bond
, &match
, slave
->ofp_port
,
349 &bond
->hash
[i
].pr_rule
);
354 HMAP_FOR_EACH_SAFE(pr_op
, next_op
, hmap_node
, &bond
->pr_rule_ops
) {
358 ofpbuf_clear(&ofpacts
);
359 ofpact_put_OUTPUT(&ofpacts
)->port
= pr_op
->out_ofport
;
360 error
= ofproto_dpif_add_internal_flow(bond
->ofproto
,
362 RECIRC_RULE_PRIORITY
, 0,
363 &ofpacts
, pr_op
->pr_rule
);
365 char *err_s
= match_to_string(&pr_op
->match
,
366 RECIRC_RULE_PRIORITY
);
368 VLOG_ERR("failed to add post recirculation flow %s", err_s
);
374 error
= ofproto_dpif_delete_internal_flow(bond
->ofproto
,
376 RECIRC_RULE_PRIORITY
);
378 char *err_s
= match_to_string(&pr_op
->match
,
379 RECIRC_RULE_PRIORITY
);
381 VLOG_ERR("failed to remove post recirculation flow %s", err_s
);
385 hmap_remove(&bond
->pr_rule_ops
, &pr_op
->hmap_node
);
386 *pr_op
->pr_rule
= NULL
;
392 ofpbuf_uninit(&ofpacts
);
396 /* Updates 'bond''s overall configuration to 's'.
398 * The caller should register each slave on 'bond' by calling
399 * bond_slave_register(). This is optional if none of the slaves'
400 * configuration has changed. In any case it can't hurt.
402 * Returns true if the configuration has changed in such a way that requires
406 bond_reconfigure(struct bond
*bond
, const struct bond_settings
*s
)
408 bool revalidate
= false;
410 ovs_rwlock_wrlock(&rwlock
);
411 if (!bond
->name
|| strcmp(bond
->name
, s
->name
)) {
413 hmap_remove(all_bonds
, &bond
->hmap_node
);
416 bond
->name
= xstrdup(s
->name
);
417 hmap_insert(all_bonds
, &bond
->hmap_node
, hash_string(bond
->name
, 0));
420 bond
->updelay
= s
->up_delay
;
421 bond
->downdelay
= s
->down_delay
;
423 if (bond
->lacp_fallback_ab
!= s
->lacp_fallback_ab_cfg
) {
424 bond
->lacp_fallback_ab
= s
->lacp_fallback_ab_cfg
;
428 if (bond
->rebalance_interval
!= s
->rebalance_interval
) {
429 bond
->rebalance_interval
= s
->rebalance_interval
;
433 if (bond
->balance
!= s
->balance
) {
434 bond
->balance
= s
->balance
;
438 if (bond
->basis
!= s
->basis
) {
439 bond
->basis
= s
->basis
;
443 if (bond
->bond_revalidate
) {
445 bond
->bond_revalidate
= false;
448 if (bond
->balance
!= BM_AB
) {
449 if (!bond
->recirc_id
) {
450 bond
->recirc_id
= recirc_alloc_id(bond
->ofproto
);
452 } else if (bond
->recirc_id
) {
453 recirc_free_id(bond
->recirc_id
);
457 if (bond
->balance
== BM_AB
|| !bond
->hash
|| revalidate
) {
458 bond_entry_reset(bond
);
461 memcpy(bond
->active_slave_mac
, s
->active_slave_mac
,
462 sizeof s
->active_slave_mac
);
464 bond
->active_slave_changed
= false;
466 ovs_rwlock_unlock(&rwlock
);
470 static struct bond_slave
*
471 bond_find_slave_by_mac(const struct bond
*bond
, const uint8_t mac
[ETH_ADDR_LEN
])
473 struct bond_slave
*slave
;
475 /* Find the last active slave */
476 HMAP_FOR_EACH(slave
, hmap_node
, &bond
->slaves
) {
477 uint8_t slave_mac
[ETH_ADDR_LEN
];
479 if (netdev_get_etheraddr(slave
->netdev
, slave_mac
)) {
483 if (!memcmp(slave_mac
, mac
, sizeof(slave_mac
))) {
492 bond_active_slave_changed(struct bond
*bond
)
494 uint8_t mac
[ETH_ADDR_LEN
];
496 netdev_get_etheraddr(bond
->active_slave
->netdev
, mac
);
497 memcpy(bond
->active_slave_mac
, mac
, sizeof bond
->active_slave_mac
);
498 bond
->active_slave_changed
= true;
499 seq_change(connectivity_seq_get());
503 bond_slave_set_netdev__(struct bond_slave
*slave
, struct netdev
*netdev
)
504 OVS_REQ_WRLOCK(rwlock
)
506 if (slave
->netdev
!= netdev
) {
507 slave
->netdev
= netdev
;
508 slave
->change_seq
= 0;
512 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
513 * arbitrary client-provided pointer that uniquely identifies a slave within a
514 * bond. If 'slave_' already exists within 'bond' then this function
515 * reconfigures the existing slave.
517 * 'netdev' must be the network device that 'slave_' represents. It is owned
518 * by the client, so the client must not close it before either unregistering
519 * 'slave_' or destroying 'bond'.
522 bond_slave_register(struct bond
*bond
, void *slave_
,
523 ofp_port_t ofport
, struct netdev
*netdev
)
525 struct bond_slave
*slave
;
527 ovs_rwlock_wrlock(&rwlock
);
528 slave
= bond_slave_lookup(bond
, slave_
);
530 slave
= xzalloc(sizeof *slave
);
532 hmap_insert(&bond
->slaves
, &slave
->hmap_node
, hash_pointer(slave_
, 0));
535 slave
->ofp_port
= ofport
;
536 slave
->delay_expires
= LLONG_MAX
;
537 slave
->name
= xstrdup(netdev_get_name(netdev
));
538 bond
->bond_revalidate
= true;
540 slave
->enabled
= false;
541 bond_enable_slave(slave
, netdev_get_carrier(netdev
));
544 bond_slave_set_netdev__(slave
, netdev
);
547 slave
->name
= xstrdup(netdev_get_name(netdev
));
548 ovs_rwlock_unlock(&rwlock
);
551 /* Updates the network device to be used with 'slave_' to 'netdev'.
553 * This is useful if the caller closes and re-opens the network device
554 * registered with bond_slave_register() but doesn't need to change anything
557 bond_slave_set_netdev(struct bond
*bond
, void *slave_
, struct netdev
*netdev
)
559 struct bond_slave
*slave
;
561 ovs_rwlock_wrlock(&rwlock
);
562 slave
= bond_slave_lookup(bond
, slave_
);
564 bond_slave_set_netdev__(slave
, netdev
);
566 ovs_rwlock_unlock(&rwlock
);
569 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
570 * then this function has no effect.
572 * Unregistering a slave invalidates all flows. */
574 bond_slave_unregister(struct bond
*bond
, const void *slave_
)
576 struct bond_slave
*slave
;
579 ovs_rwlock_wrlock(&rwlock
);
580 slave
= bond_slave_lookup(bond
, slave_
);
585 bond
->bond_revalidate
= true;
586 bond_enable_slave(slave
, false);
588 del_active
= bond
->active_slave
== slave
;
590 struct bond_entry
*e
;
591 for (e
= bond
->hash
; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
592 if (e
->slave
== slave
) {
600 hmap_remove(&bond
->slaves
, &slave
->hmap_node
);
601 /* Client owns 'slave->netdev'. */
605 bond_choose_active_slave(bond
);
606 bond
->send_learning_packets
= true;
609 ovs_rwlock_unlock(&rwlock
);
612 /* Should be called on each slave in 'bond' before bond_run() to indicate
613 * whether or not 'slave_' may be enabled. This function is intended to allow
614 * other protocols to have some impact on bonding decisions. For example LACP
615 * or high level link monitoring protocols may decide that a given slave should
616 * not be able to send traffic. */
618 bond_slave_set_may_enable(struct bond
*bond
, void *slave_
, bool may_enable
)
620 ovs_rwlock_wrlock(&rwlock
);
621 bond_slave_lookup(bond
, slave_
)->may_enable
= may_enable
;
622 ovs_rwlock_unlock(&rwlock
);
625 /* Performs periodic maintenance on 'bond'.
627 * Returns true if the caller should revalidate its flows.
629 * The caller should check bond_should_send_learning_packets() afterward. */
631 bond_run(struct bond
*bond
, enum lacp_status lacp_status
)
633 struct bond_slave
*slave
;
636 ovs_rwlock_wrlock(&rwlock
);
637 if (bond
->lacp_status
!= lacp_status
) {
638 bond
->lacp_status
= lacp_status
;
639 bond
->bond_revalidate
= true;
642 /* Enable slaves based on link status and LACP feedback. */
643 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
644 bond_link_status_update(slave
);
645 slave
->change_seq
= seq_read(connectivity_seq_get());
647 if (!bond
->active_slave
|| !bond
->active_slave
->enabled
) {
648 bond_choose_active_slave(bond
);
651 revalidate
= bond
->bond_revalidate
;
652 bond
->bond_revalidate
= false;
653 ovs_rwlock_unlock(&rwlock
);
658 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
660 bond_wait(struct bond
*bond
)
662 struct bond_slave
*slave
;
664 ovs_rwlock_rdlock(&rwlock
);
665 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
666 if (slave
->delay_expires
!= LLONG_MAX
) {
667 poll_timer_wait_until(slave
->delay_expires
);
670 seq_wait(connectivity_seq_get(), slave
->change_seq
);
673 if (bond
->bond_revalidate
) {
674 poll_immediate_wake();
676 ovs_rwlock_unlock(&rwlock
);
678 /* We don't wait for bond->next_rebalance because rebalancing can only run
679 * at a flow account checkpoint. ofproto does checkpointing on its own
680 * schedule and bond_rebalance() gets called afterward, so we'd just be
681 * waking up for no purpose. */
684 /* MAC learning table interaction. */
687 may_send_learning_packets(const struct bond
*bond
)
689 return ((bond
->lacp_status
== LACP_DISABLED
690 && (bond
->balance
== BM_SLB
|| bond
->balance
== BM_AB
))
691 || (bond
->lacp_fallback_ab
&& bond
->lacp_status
== LACP_CONFIGURED
))
692 && bond
->active_slave
;
695 /* Returns true if 'bond' needs the client to send out packets to assist with
696 * MAC learning on 'bond'. If this function returns true, then the client
697 * should iterate through its MAC learning table for the bridge on which 'bond'
698 * is located. For each MAC that has been learned on a port other than 'bond',
699 * it should call bond_compose_learning_packet().
701 * This function will only return true if 'bond' is in SLB or active-backup
702 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
705 * Calling this function resets the state that it checks. */
707 bond_should_send_learning_packets(struct bond
*bond
)
711 ovs_rwlock_wrlock(&rwlock
);
712 send
= bond
->send_learning_packets
&& may_send_learning_packets(bond
);
713 bond
->send_learning_packets
= false;
714 ovs_rwlock_unlock(&rwlock
);
718 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
720 * See bond_should_send_learning_packets() for description of usage. The
721 * caller should send the composed packet on the port associated with
722 * port_aux and takes ownership of the returned ofpbuf. */
724 bond_compose_learning_packet(struct bond
*bond
,
725 const uint8_t eth_src
[ETH_ADDR_LEN
],
726 uint16_t vlan
, void **port_aux
)
728 struct bond_slave
*slave
;
729 struct dp_packet
*packet
;
732 ovs_rwlock_rdlock(&rwlock
);
733 ovs_assert(may_send_learning_packets(bond
));
734 memset(&flow
, 0, sizeof flow
);
735 memcpy(flow
.dl_src
, eth_src
, ETH_ADDR_LEN
);
736 slave
= choose_output_slave(bond
, &flow
, NULL
, vlan
);
738 packet
= dp_packet_new(0);
739 compose_rarp(packet
, eth_src
);
741 eth_push_vlan(packet
, htons(ETH_TYPE_VLAN
), htons(vlan
));
744 *port_aux
= slave
->aux
;
745 ovs_rwlock_unlock(&rwlock
);
749 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
750 * Ethernet destination address of 'eth_dst', should be admitted.
752 * The return value is one of the following:
754 * - BV_ACCEPT: Admit the packet.
756 * - BV_DROP: Drop the packet.
758 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
759 * Ethernet source address and VLAN. If there is none, or if the packet
760 * is on the learned port, then admit the packet. If a different port has
761 * been learned, however, drop the packet (and do not use it for MAC
765 bond_check_admissibility(struct bond
*bond
, const void *slave_
,
766 const uint8_t eth_dst
[ETH_ADDR_LEN
])
768 enum bond_verdict verdict
= BV_DROP
;
769 struct bond_slave
*slave
;
771 ovs_rwlock_rdlock(&rwlock
);
772 slave
= bond_slave_lookup(bond
, slave_
);
777 /* LACP bonds have very loose admissibility restrictions because we can
778 * assume the remote switch is aware of the bond and will "do the right
779 * thing". However, as a precaution we drop packets on disabled slaves
780 * because no correctly implemented partner switch should be sending
783 * If LACP is configured, but LACP negotiations have been unsuccessful, we
784 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
785 switch (bond
->lacp_status
) {
786 case LACP_NEGOTIATED
:
787 verdict
= slave
->enabled
? BV_ACCEPT
: BV_DROP
;
789 case LACP_CONFIGURED
:
790 if (!bond
->lacp_fallback_ab
) {
797 /* Drop all multicast packets on inactive slaves. */
798 if (eth_addr_is_multicast(eth_dst
)) {
799 if (bond
->active_slave
!= slave
) {
804 switch (bond
->balance
) {
806 /* TCP balanced bonds require successful LACP negotiations. Based on the
807 * above check, LACP is off or lacp_fallback_ab is true on this bond.
808 * If lacp_fallback_ab is true fall through to BM_AB case else, we
809 * drop all incoming traffic. */
810 if (!bond
->lacp_fallback_ab
) {
815 /* Drop all packets which arrive on backup slaves. This is similar to
816 * how Linux bonding handles active-backup bonds. */
817 if (bond
->active_slave
!= slave
) {
818 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
820 VLOG_DBG_RL(&rl
, "active-backup bond received packet on backup"
821 " slave (%s) destined for " ETH_ADDR_FMT
,
822 slave
->name
, ETH_ADDR_ARGS(eth_dst
));
829 /* Drop all packets for which we have learned a different input port,
830 * because we probably sent the packet on one slave and got it back on
831 * the other. Gratuitous ARP packets are an exception to this rule:
832 * the host has moved to another switch. The exception to the
833 * exception is if we locked the learning table to avoid reflections on
835 verdict
= BV_DROP_IF_MOVED
;
841 ovs_rwlock_unlock(&rwlock
);
846 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
847 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
848 * NULL if the packet should be dropped because no slaves are enabled.
850 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
851 * should be a VID only (i.e. excluding the PCP bits). Second,
852 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
853 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
854 * packet belongs to (so for an access port it will be the access port's VLAN).
856 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
857 * significant in the selection. At some point earlier, 'wc' should
858 * have been initialized (e.g., by flow_wildcards_init_catchall()).
861 bond_choose_output_slave(struct bond
*bond
, const struct flow
*flow
,
862 struct flow_wildcards
*wc
, uint16_t vlan
)
864 struct bond_slave
*slave
;
867 ovs_rwlock_rdlock(&rwlock
);
868 slave
= choose_output_slave(bond
, flow
, wc
, vlan
);
869 aux
= slave
? slave
->aux
: NULL
;
870 ovs_rwlock_unlock(&rwlock
);
877 bond_entry_account(struct bond_entry
*entry
, uint64_t rule_tx_bytes
)
878 OVS_REQ_WRLOCK(rwlock
)
883 delta
= rule_tx_bytes
- entry
->pr_tx_bytes
;
884 entry
->tx_bytes
+= delta
;
885 entry
->pr_tx_bytes
= rule_tx_bytes
;
889 /* Maintain bond stats using post recirculation rule byte counters.*/
891 bond_recirculation_account(struct bond
*bond
)
892 OVS_REQ_WRLOCK(rwlock
)
896 for (i
=0; i
<=BOND_MASK
; i
++) {
897 struct bond_entry
*entry
= &bond
->hash
[i
];
898 struct rule
*rule
= entry
->pr_rule
;
901 uint64_t n_packets OVS_UNUSED
;
902 long long int used OVS_UNUSED
;
905 rule
->ofproto
->ofproto_class
->rule_get_stats(
906 rule
, &n_packets
, &n_bytes
, &used
);
907 bond_entry_account(entry
, n_bytes
);
913 bond_may_recirc(const struct bond
*bond
, uint32_t *recirc_id
,
916 if (bond
->balance
== BM_TCP
&& bond
->recirc_id
) {
918 *recirc_id
= bond
->recirc_id
;
921 *hash_bias
= bond
->basis
;
930 bond_update_post_recirc_rules__(struct bond
* bond
, const bool force
)
931 OVS_REQ_WRLOCK(rwlock
)
933 struct bond_entry
*e
;
934 bool update_rules
= force
; /* Always update rules if caller forces it. */
936 /* Make sure all bond entries are populated */
937 for (e
= bond
->hash
; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
938 if (!e
->slave
|| !e
->slave
->enabled
) {
940 e
->slave
= CONTAINER_OF(hmap_random_node(&bond
->slaves
),
941 struct bond_slave
, hmap_node
);
942 if (!e
->slave
->enabled
) {
943 e
->slave
= bond
->active_slave
;
949 update_recirc_rules(bond
);
954 bond_update_post_recirc_rules(struct bond
* bond
, const bool force
)
956 ovs_rwlock_wrlock(&rwlock
);
957 bond_update_post_recirc_rules__(bond
, force
);
958 ovs_rwlock_unlock(&rwlock
);
964 bond_is_balanced(const struct bond
*bond
) OVS_REQ_RDLOCK(rwlock
)
966 return bond
->rebalance_interval
967 && (bond
->balance
== BM_SLB
|| bond
->balance
== BM_TCP
);
970 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
972 bond_account(struct bond
*bond
, const struct flow
*flow
, uint16_t vlan
,
975 ovs_rwlock_wrlock(&rwlock
);
976 if (bond_is_balanced(bond
)) {
977 lookup_bond_entry(bond
, flow
, vlan
)->tx_bytes
+= n_bytes
;
979 ovs_rwlock_unlock(&rwlock
);
982 static struct bond_slave
*
983 bond_slave_from_bal_node(struct ovs_list
*bal
) OVS_REQ_RDLOCK(rwlock
)
985 return CONTAINER_OF(bal
, struct bond_slave
, bal_node
);
989 log_bals(struct bond
*bond
, const struct ovs_list
*bals
)
990 OVS_REQ_RDLOCK(rwlock
)
992 if (VLOG_IS_DBG_ENABLED()) {
993 struct ds ds
= DS_EMPTY_INITIALIZER
;
994 const struct bond_slave
*slave
;
996 LIST_FOR_EACH (slave
, bal_node
, bals
) {
998 ds_put_char(&ds
, ',');
1000 ds_put_format(&ds
, " %s %"PRIu64
"kB",
1001 slave
->name
, slave
->tx_bytes
/ 1024);
1003 if (!slave
->enabled
) {
1004 ds_put_cstr(&ds
, " (disabled)");
1006 if (!list_is_empty(&slave
->entries
)) {
1007 struct bond_entry
*e
;
1009 ds_put_cstr(&ds
, " (");
1010 LIST_FOR_EACH (e
, list_node
, &slave
->entries
) {
1011 if (&e
->list_node
!= list_front(&slave
->entries
)) {
1012 ds_put_cstr(&ds
, " + ");
1014 ds_put_format(&ds
, "h%"PRIdPTR
": %"PRIu64
"kB",
1015 e
- bond
->hash
, e
->tx_bytes
/ 1024);
1017 ds_put_cstr(&ds
, ")");
1020 VLOG_DBG("bond %s:%s", bond
->name
, ds_cstr(&ds
));
1025 /* Shifts 'hash' from its current slave to 'to'. */
1027 bond_shift_load(struct bond_entry
*hash
, struct bond_slave
*to
)
1028 OVS_REQ_WRLOCK(rwlock
)
1030 struct bond_slave
*from
= hash
->slave
;
1031 struct bond
*bond
= from
->bond
;
1032 uint64_t delta
= hash
->tx_bytes
;
1034 VLOG_INFO("bond %s: shift %"PRIu64
"kB of load (with hash %"PRIdPTR
") "
1035 "from %s to %s (now carrying %"PRIu64
"kB and "
1036 "%"PRIu64
"kB load, respectively)",
1037 bond
->name
, delta
/ 1024, hash
- bond
->hash
,
1038 from
->name
, to
->name
,
1039 (from
->tx_bytes
- delta
) / 1024,
1040 (to
->tx_bytes
+ delta
) / 1024);
1042 /* Shift load away from 'from' to 'to'. */
1043 from
->tx_bytes
-= delta
;
1044 to
->tx_bytes
+= delta
;
1046 /* Arrange for flows to be revalidated. */
1048 bond
->bond_revalidate
= true;
1051 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1052 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1053 * given that doing so must decrease the ratio of the load on the two slaves by
1054 * at least 0.1. Returns NULL if there is no appropriate entry.
1056 * The list of entries isn't sorted. I don't know of a reason to prefer to
1057 * shift away small hashes or large hashes. */
1058 static struct bond_entry
*
1059 choose_entry_to_migrate(const struct bond_slave
*from
, uint64_t to_tx_bytes
)
1060 OVS_REQ_WRLOCK(rwlock
)
1062 struct bond_entry
*e
;
1064 if (list_is_short(&from
->entries
)) {
1065 /* 'from' carries no more than one MAC hash, so shifting load away from
1066 * it would be pointless. */
1070 LIST_FOR_EACH (e
, list_node
, &from
->entries
) {
1071 double old_ratio
, new_ratio
;
1074 if (to_tx_bytes
== 0) {
1075 /* Nothing on the new slave, move it. */
1079 delta
= e
->tx_bytes
;
1080 old_ratio
= (double)from
->tx_bytes
/ to_tx_bytes
;
1081 new_ratio
= (double)(from
->tx_bytes
- delta
) / (to_tx_bytes
+ delta
);
1082 if (old_ratio
- new_ratio
> 0.1
1083 && fabs(new_ratio
- 1.0) < fabs(old_ratio
- 1.0)) {
1084 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1085 and 'to' slave have the same load. Therefore, we only move an
1086 entry if it decreases the load on 'from', and brings us closer
1087 to equal traffic load. */
1095 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1098 insert_bal(struct ovs_list
*bals
, struct bond_slave
*slave
)
1100 struct bond_slave
*pos
;
1102 LIST_FOR_EACH (pos
, bal_node
, bals
) {
1103 if (slave
->tx_bytes
> pos
->tx_bytes
) {
1107 list_insert(&pos
->bal_node
, &slave
->bal_node
);
1110 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1111 * that descending order of 'tx_bytes' is maintained. */
1113 reinsert_bal(struct ovs_list
*bals
, struct bond_slave
*slave
)
1115 list_remove(&slave
->bal_node
);
1116 insert_bal(bals
, slave
);
1119 /* If 'bond' needs rebalancing, does so.
1121 * The caller should have called bond_account() for each active flow, or in case
1122 * of recirculation is used, have called bond_recirculation_account(bond),
1123 * to ensure that flow data is consistently accounted at this point.
1126 bond_rebalance(struct bond
*bond
)
1128 struct bond_slave
*slave
;
1129 struct bond_entry
*e
;
1130 struct ovs_list bals
;
1131 bool rebalanced
= false;
1134 ovs_rwlock_wrlock(&rwlock
);
1135 if (!bond_is_balanced(bond
) || time_msec() < bond
->next_rebalance
) {
1138 bond
->next_rebalance
= time_msec() + bond
->rebalance_interval
;
1140 use_recirc
= ofproto_dpif_get_support(bond
->ofproto
)->odp
.recirc
&&
1141 bond_may_recirc(bond
, NULL
, NULL
);
1144 bond_recirculation_account(bond
);
1147 /* Add each bond_entry to its slave's 'entries' list.
1148 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1149 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1150 slave
->tx_bytes
= 0;
1151 list_init(&slave
->entries
);
1153 for (e
= &bond
->hash
[0]; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
1154 if (e
->slave
&& e
->tx_bytes
) {
1155 e
->slave
->tx_bytes
+= e
->tx_bytes
;
1156 list_push_back(&e
->slave
->entries
, &e
->list_node
);
1160 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1162 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1163 * with a proper list sort algorithm. */
1165 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1166 if (slave
->enabled
) {
1167 insert_bal(&bals
, slave
);
1170 log_bals(bond
, &bals
);
1172 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1173 while (!list_is_short(&bals
)) {
1174 struct bond_slave
*from
= bond_slave_from_bal_node(list_front(&bals
));
1175 struct bond_slave
*to
= bond_slave_from_bal_node(list_back(&bals
));
1178 overload
= from
->tx_bytes
- to
->tx_bytes
;
1179 if (overload
< to
->tx_bytes
>> 5 || overload
< 100000) {
1180 /* The extra load on 'from' (and all less-loaded slaves), compared
1181 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1182 * it is less than ~1Mbps. No point in rebalancing. */
1186 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1187 * to move from 'from' to 'to'. */
1188 e
= choose_entry_to_migrate(from
, to
->tx_bytes
);
1190 bond_shift_load(e
, to
);
1192 /* Delete element from from->entries.
1194 * We don't add the element to to->hashes. That would only allow
1195 * 'e' to be migrated to another slave in this rebalancing run, and
1196 * there is no point in doing that. */
1197 list_remove(&e
->list_node
);
1199 /* Re-sort 'bals'. */
1200 reinsert_bal(&bals
, from
);
1201 reinsert_bal(&bals
, to
);
1204 /* Can't usefully migrate anything away from 'from'.
1205 * Don't reconsider it. */
1206 list_remove(&from
->bal_node
);
1210 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1211 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1212 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1213 for (e
= &bond
->hash
[0]; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
1217 if (use_recirc
&& rebalanced
) {
1218 bond_update_post_recirc_rules__(bond
,true);
1222 ovs_rwlock_unlock(&rwlock
);
1225 /* Bonding unixctl user interface functions. */
/* Looks up a bond by 'name'.
 *
 * Only the hash-map entries whose hash equals hash_string(name, 0) are
 * visited, and each candidate is confirmed with strcmp() on 'bond->name'.
 * The OVS_REQ_RDLOCK(rwlock) annotation declares that the caller holds
 * 'rwlock'.
 *
 * NOTE(review): the map argument of the iteration macro and the return
 * statements are not visible in this excerpt; presumably the matching bond
 * is returned, or NULL when none matches -- confirm. */
1227 static struct bond
*
1228 bond_find(const char *name
) OVS_REQ_RDLOCK(rwlock
)
1232 HMAP_FOR_EACH_WITH_HASH (bond
, hmap_node
, hash_string(name
, 0),
/* A hash match alone is not enough: compare the names as well. */
1234 if (!strcmp(bond
->name
, name
)) {
/* Linear search of 'bond->slaves' for the slave whose 'name' equals
 * 'slave_name' (compared with strcmp()).
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the matching slave is returned, or NULL when none matches --
 * confirm. */
1241 static struct bond_slave
*
1242 bond_lookup_slave(struct bond
*bond
, const char *slave_name
)
1244 struct bond_slave
*slave
;
1246 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1247 if (!strcmp(slave
->name
, slave_name
)) {
/* unixctl handler that replies with a tab-separated table of bonds.
 *
 * The reply starts with the header "bond\ttype\trecircID\tslaves" and then
 * has one line per bond in 'all_bonds': name, bond_mode_to_string() of its
 * balance mode, its recirc_id, and the names of its slaves.  'rwlock' is
 * read-locked while the bonds are walked and released before replying.
 * 'argc', 'argv' and 'aux' are unused.
 *
 * NOTE(review): no release of 'ds' is visible in this excerpt -- confirm that
 * the dynamic string is destroyed after the reply is sent. */
1255 bond_unixctl_list(struct unixctl_conn
*conn
,
1256 int argc OVS_UNUSED
, const char *argv
[] OVS_UNUSED
,
1257 void *aux OVS_UNUSED
)
1259 struct ds ds
= DS_EMPTY_INITIALIZER
;
1260 const struct bond
*bond
;
/* Column header. */
1262 ds_put_cstr(&ds
, "bond\ttype\trecircID\tslaves\n");
1264 ovs_rwlock_rdlock(&rwlock
);
1265 HMAP_FOR_EACH (bond
, hmap_node
, all_bonds
) {
1266 const struct bond_slave
*slave
;
/* First three columns: name, mode, recirculation ID. */
1269 ds_put_format(&ds
, "%s\t%s\t%d\t", bond
->name
,
1270 bond_mode_to_string(bond
->balance
), bond
->recirc_id
);
/* Last column: the slave names. */
1273 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
/* Separator between names.  NOTE(review): whatever guard keeps it from being
 * emitted before the first name is not visible here -- confirm. */
1275 ds_put_cstr(&ds
, ", ");
1277 ds_put_cstr(&ds
, slave
->name
);
1279 ds_put_char(&ds
, '\n');
1281 ovs_rwlock_unlock(&rwlock
);
1282 unixctl_command_reply(conn
, ds_cstr(&ds
));
/* Appends a human-readable description of 'bond' to 'ds'.
 *
 * The output consists of a "---- name ----" banner, the bond-level settings
 * (mode, recirculation, hash basis, updelay/downdelay, time until the next
 * rebalance for balanced bonds, LACP status, active slave MAC), and then one
 * section per slave.  Slaves are printed in the order produced by
 * shash_sort() on a temporary shash keyed by slave name.  For balanced
 * bonds each slave section also lists hash buckets assigned to that slave.
 *
 * The OVS_REQ_RDLOCK(rwlock) annotation declares that the caller holds
 * 'rwlock'.
 *
 * NOTE(review): several declarations and control-flow lines (break/continue,
 * some case labels) are not visible in this excerpt. */
1287 bond_print_details(struct ds
*ds
, const struct bond
*bond
)
1288 OVS_REQ_RDLOCK(rwlock
)
1290 struct shash slave_shash
= SHASH_INITIALIZER(&slave_shash
);
1291 const struct shash_node
**sorted_slaves
= NULL
;
1292 const struct bond_slave
*slave
;
/* Bond-level summary. */
1297 ds_put_format(ds
, "---- %s ----\n", bond
->name
);
1298 ds_put_format(ds
, "bond_mode: %s\n",
1299 bond_mode_to_string(bond
->balance
));
1301 may_recirc
= bond_may_recirc(bond
, &recirc_id
, NULL
);
/* Prints -1 as the ID when recirculation is not in use.
 * NOTE(review): the declaration of 'recirc_id' is not visible here; confirm
 * that the type of the conditional expression matches the %d conversion. */
1302 ds_put_format(ds
, "bond may use recirculation: %s, Recirc-ID : %d\n",
1303 may_recirc
? "yes" : "no", may_recirc
? recirc_id
: -1);
1305 ds_put_format(ds
, "bond-hash-basis: %"PRIu32
"\n", bond
->basis
);
1307 ds_put_format(ds
, "updelay: %d ms\n", bond
->updelay
);
1308 ds_put_format(ds
, "downdelay: %d ms\n", bond
->downdelay
);
/* The time remaining until the next rebalance is printed for balanced
 * bonds. */
1310 if (bond_is_balanced(bond
)) {
1311 ds_put_format(ds
, "next rebalance: %lld ms\n",
1312 bond
->next_rebalance
- time_msec());
1315 ds_put_cstr(ds
, "lacp_status: ");
1316 switch (bond
->lacp_status
) {
1317 case LACP_NEGOTIATED
:
1318 ds_put_cstr(ds
, "negotiated\n");
1320 case LACP_CONFIGURED
:
1321 ds_put_cstr(ds
, "configured\n");
/* NOTE(review): the labels selecting the next two outputs are not visible in
 * this excerpt. */
1324 ds_put_cstr(ds
, "off\n");
1327 ds_put_cstr(ds
, "<unknown>\n");
/* Active slave MAC, followed by the name of the slave that
 * bond_find_slave_by_mac() returns for it, or "none". */
1331 ds_put_cstr(ds
, "active slave mac: ");
1332 ds_put_format(ds
, ETH_ADDR_FMT
, ETH_ADDR_ARGS(bond
->active_slave_mac
));
1333 slave
= bond_find_slave_by_mac(bond
, bond
->active_slave_mac
);
1334 ds_put_format(ds
,"(%s)\n", slave
? slave
->name
: "none");
/* Collect the slaves into a name-keyed shash so that they can be sorted. */
1336 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1337 shash_add(&slave_shash
, slave
->name
, slave
);
1339 sorted_slaves
= shash_sort(&slave_shash
);
/* One section per slave, in sorted order. */
1341 for (i
= 0; i
< shash_count(&slave_shash
); i
++) {
1342 struct bond_entry
*be
;
1344 slave
= sorted_slaves
[i
]->data
;
1347 ds_put_format(ds
, "\nslave %s: %s\n",
1348 slave
->name
, slave
->enabled
? "enabled" : "disabled");
1349 if (slave
== bond
->active_slave
) {
1350 ds_put_cstr(ds
, "\tactive slave\n");
/* delay_expires == LLONG_MAX means that no updelay/downdelay is pending. */
1352 if (slave
->delay_expires
!= LLONG_MAX
) {
1353 ds_put_format(ds
, "\t%s expires in %lld ms\n",
1354 slave
->enabled
? "downdelay" : "updelay",
1355 slave
->delay_expires
- time_msec());
1358 ds_put_format(ds
, "\tmay_enable: %s\n",
1359 slave
->may_enable
? "true" : "false");
/* NOTE(review): the body of this test is not visible; presumably the hash
 * listing below is skipped for bonds that are not balanced -- confirm. */
1361 if (!bond_is_balanced(bond
)) {
/* Walk all BOND_BUCKETS hash entries and report those owned by 'slave'. */
1366 for (be
= bond
->hash
; be
<= &bond
->hash
[BOND_MASK
]; be
++) {
/* Index of 'be' within bond->hash. */
1367 int hash
= be
- bond
->hash
;
1370 if (be
->slave
!= slave
) {
/* Load in units of 1024 bytes. */
1374 be_tx_k
= be
->tx_bytes
/ 1024;
1376 ds_put_format(ds
, "\thash %d: %"PRIu64
" kB load\n",
1380 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
/* Release the temporary sorting structures. */
1383 shash_destroy(&slave_shash
);
1384 free(sorted_slaves
);
1385 ds_put_cstr(ds
, "\n");
/* unixctl handler that replies with the bond_print_details() output, either
 * for the single bond named by argv[1] or for every bond in 'all_bonds'.
 *
 * Replies with the error "no such bond" when the lookup by argv[1] fails.
 * 'rwlock' is read-locked for the duration of the handler.
 *
 * NOTE(review): the condition that selects between the single-bond and
 * all-bonds paths, and any release of 'ds', are not visible in this
 * excerpt -- confirm. */
1389 bond_unixctl_show(struct unixctl_conn
*conn
,
1390 int argc
, const char *argv
[],
1391 void *aux OVS_UNUSED
)
1393 struct ds ds
= DS_EMPTY_INITIALIZER
;
1395 ovs_rwlock_rdlock(&rwlock
);
/* Single-bond path. */
1397 const struct bond
*bond
= bond_find(argv
[1]);
1400 unixctl_command_reply_error(conn
, "no such bond");
1403 bond_print_details(&ds
, bond
);
/* All-bonds path. */
1405 const struct bond
*bond
;
1407 HMAP_FOR_EACH (bond
, hmap_node
, all_bonds
) {
1408 bond_print_details(&ds
, bond
);
1412 unixctl_command_reply(conn
, ds_cstr(&ds
));
1416 ovs_rwlock_unlock(&rwlock
);
/* unixctl handler that reassigns one hash bucket of a bond to a given slave.
 *
 * Arguments: argv[1] is the bond name, argv[2] the hash, argv[3] the slave
 * name.  The handler takes 'rwlock' for writing and replies with one of the
 * errors "no such bond", "not an SLB bond", "bad hash", "no such slave" or
 * "cannot migrate to disabled slave", or with "migrated" after pointing
 * bond->hash[hash] at the slave and setting 'bond_revalidate'.
 *
 * NOTE(review): the tests guarding the "no such ..." replies and the jumps to
 * the unlock at the end are not visible in this excerpt. */
1420 bond_unixctl_migrate(struct unixctl_conn
*conn
,
1421 int argc OVS_UNUSED
, const char *argv
[],
1422 void *aux OVS_UNUSED
)
1424 const char *bond_s
= argv
[1];
1425 const char *hash_s
= argv
[2];
1426 const char *slave_s
= argv
[3];
1428 struct bond_slave
*slave
;
1429 struct bond_entry
*entry
;
1432 ovs_rwlock_wrlock(&rwlock
);
1433 bond
= bond_find(bond_s
);
1435 unixctl_command_reply_error(conn
, "no such bond");
/* Buckets can only be moved by hand on SLB bonds. */
1439 if (bond
->balance
!= BM_SLB
) {
1440 unixctl_command_reply_error(conn
, "not an SLB bond");
/* Accept 'hash_s' only if it consists solely of decimal digits, then reduce
 * it to a bucket index with BOND_MASK.
 * NOTE(review): an empty 'hash_s' also satisfies this test (both sides are
 * 0) and atoi("") yields 0; atoi() is also not safe for digit strings too
 * large for an int.  Confirm whether either input can reach this point. */
1444 if (strspn(hash_s
, "0123456789") == strlen(hash_s
)) {
1445 hash
= atoi(hash_s
) & BOND_MASK
;
1447 unixctl_command_reply_error(conn
, "bad hash");
1451 slave
= bond_lookup_slave(bond
, slave_s
);
1453 unixctl_command_reply_error(conn
, "no such slave");
1457 if (!slave
->enabled
) {
1458 unixctl_command_reply_error(conn
, "cannot migrate to disabled slave");
/* Perform the migration and flag the bond for revalidation. */
1462 entry
= &bond
->hash
[hash
];
1463 bond
->bond_revalidate
= true;
1464 entry
->slave
= slave
;
1465 unixctl_command_reply(conn
, "migrated");
1468 ovs_rwlock_unlock(&rwlock
);
/* unixctl handler that makes a named slave the active slave of a bond.
 *
 * Arguments: argv[1] is the bond name, argv[2] the slave name.  The handler
 * takes 'rwlock' for writing and replies with one of the errors "no such
 * bond", "no such slave" or "cannot make disabled slave active".  When the
 * slave differs from the current active slave it is installed, the change is
 * logged, 'bond_revalidate' and 'send_learning_packets' are set, the reply is
 * "done" and bond_active_slave_changed() is called; otherwise the reply is
 * "no change".
 *
 * NOTE(review): the tests guarding the "no such ..." replies and the jumps to
 * the unlock at the end are not visible in this excerpt. */
1472 bond_unixctl_set_active_slave(struct unixctl_conn
*conn
,
1473 int argc OVS_UNUSED
, const char *argv
[],
1474 void *aux OVS_UNUSED
)
1476 const char *bond_s
= argv
[1];
1477 const char *slave_s
= argv
[2];
1479 struct bond_slave
*slave
;
1481 ovs_rwlock_wrlock(&rwlock
);
1482 bond
= bond_find(bond_s
);
1484 unixctl_command_reply_error(conn
, "no such bond");
1488 slave
= bond_lookup_slave(bond
, slave_s
);
1490 unixctl_command_reply_error(conn
, "no such slave");
1494 if (!slave
->enabled
) {
1495 unixctl_command_reply_error(conn
, "cannot make disabled slave active");
/* Only act when the requested slave is not already the active one. */
1499 if (bond
->active_slave
!= slave
) {
1500 bond
->bond_revalidate
= true;
1501 bond
->active_slave
= slave
;
1502 VLOG_INFO("bond %s: active interface is now %s",
1503 bond
->name
, slave
->name
);
1504 bond
->send_learning_packets
= true;
1505 unixctl_command_reply(conn
, "done");
1506 bond_active_slave_changed(bond
);
1508 unixctl_command_reply(conn
, "no change");
1511 ovs_rwlock_unlock(&rwlock
);
/* Shared implementation of the enable-slave and disable-slave unixctl
 * handlers.
 *
 * argv[1] is the bond name and argv[2] the slave name.  With 'rwlock' held
 * for writing, looks up the bond and the slave, calls
 * bond_enable_slave(slave, enable) and replies "enabled" or "disabled"
 * according to 'enable'.  The error replies are "no such bond" and "no such
 * slave".
 *
 * NOTE(review): the tests guarding the error replies and the jumps to the
 * unlock at the end are not visible in this excerpt. */
1515 enable_slave(struct unixctl_conn
*conn
, const char *argv
[], bool enable
)
1517 const char *bond_s
= argv
[1];
1518 const char *slave_s
= argv
[2];
1520 struct bond_slave
*slave
;
1522 ovs_rwlock_wrlock(&rwlock
);
1523 bond
= bond_find(bond_s
);
1525 unixctl_command_reply_error(conn
, "no such bond");
1529 slave
= bond_lookup_slave(bond
, slave_s
);
1531 unixctl_command_reply_error(conn
, "no such slave");
1535 bond_enable_slave(slave
, enable
);
1536 unixctl_command_reply(conn
, enable
? "enabled" : "disabled");
1539 ovs_rwlock_unlock(&rwlock
);
/* unixctl handler: forwards to enable_slave() with 'enable' set to true.
 * 'argc' and 'aux' are unused. */
1543 bond_unixctl_enable_slave(struct unixctl_conn
*conn
,
1544 int argc OVS_UNUSED
, const char *argv
[],
1545 void *aux OVS_UNUSED
)
1547 enable_slave(conn
, argv
, true);
/* unixctl handler: forwards to enable_slave() with 'enable' set to false.
 * 'argc' and 'aux' are unused. */
1551 bond_unixctl_disable_slave(struct unixctl_conn
*conn
,
1552 int argc OVS_UNUSED
, const char *argv
[],
1553 void *aux OVS_UNUSED
)
1555 enable_slave(conn
, argv
, false);
/* unixctl handler that reports the hash bucket for a MAC address.
 *
 * argv[1] is the MAC; argv[2] (vlan) and argv[3] (basis) are optional and are
 * taken only when 'argc' is large enough.  Each argument is parsed with
 * ovs_scan(); a parse failure produces the error reply "invalid vlan",
 * "invalid basis" or "invalid mac".  On success the reply is
 * bond_hash_src(mac, vlan, basis) & BOND_MASK printed as an unsigned decimal.
 *
 * NOTE(review): the handling of absent vlan/basis arguments and any release
 * of 'hash_cstr' (allocated by xasprintf()) are not visible in this
 * excerpt -- confirm. */
1559 bond_unixctl_hash(struct unixctl_conn
*conn
, int argc
, const char *argv
[],
1560 void *aux OVS_UNUSED
)
1562 const char *mac_s
= argv
[1];
/* Optional arguments are NULL when not supplied. */
1563 const char *vlan_s
= argc
> 2 ? argv
[2] : NULL
;
1564 const char *basis_s
= argc
> 3 ? argv
[3] : NULL
;
1565 uint8_t mac
[ETH_ADDR_LEN
];
1572 if (!ovs_scan(vlan_s
, "%u", &vlan
)) {
1573 unixctl_command_reply_error(conn
, "invalid vlan");
1581 if (!ovs_scan(basis_s
, "%"SCNu32
, &basis
)) {
1582 unixctl_command_reply_error(conn
, "invalid basis");
1589 if (ovs_scan(mac_s
, ETH_ADDR_SCAN_FMT
, ETH_ADDR_SCAN_ARGS(mac
))) {
/* Reduce the source-MAC hash to a bucket index. */
1590 hash
= bond_hash_src(mac
, vlan
, basis
) & BOND_MASK
;
1592 hash_cstr
= xasprintf("%u", hash
);
1593 unixctl_command_reply(conn
, hash_cstr
);
1596 unixctl_command_reply_error(conn
, "invalid mac");
/* Registers the bond/... unixctl commands.  Each call gives the command name,
 * its usage string, the minimum and maximum argument counts, the handler and
 * an aux pointer:
 *
 *   bond/list                                   0..0 args
 *   bond/show [port]                            0..1 args
 *   bond/migrate port hash slave                3..3 args
 *   bond/set-active-slave port slave            2..2 args
 *   bond/enable-slave port slave                2..2 args
 *   bond/disable-slave port slave               2..2 args
 *   bond/hash mac [vlan] [basis]                1..3 args
 *
 * NOTE(review): the header of the enclosing function and the final argument
 * of the bond/show registration are not visible in this excerpt. */
1603 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list
, NULL
);
1604 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show
,
1606 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1607 bond_unixctl_migrate
, NULL
);
1608 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1609 bond_unixctl_set_active_slave
, NULL
);
1610 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1611 bond_unixctl_enable_slave
, NULL
);
1612 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1613 bond_unixctl_disable_slave
, NULL
);
1614 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1615 bond_unixctl_hash
, NULL
);
/* Resets the hash-bucket table of 'bond'.
 *
 * For any balance mode other than BM_AB the visible statements size the
 * table as BOND_BUCKETS entries, allocate it with xmalloc(), zero it with
 * memset() and schedule the next rebalance 'rebalance_interval' ms from now.
 *
 * NOTE(review): lines are missing from this excerpt, including whatever
 * condition governs the xmalloc() call and the handling of the BM_AB case --
 * confirm against the full source before relying on this summary. */
1619 bond_entry_reset(struct bond
*bond
)
1621 if (bond
->balance
!= BM_AB
) {
/* Size in bytes of the whole bucket array. */
1622 size_t hash_len
= BOND_BUCKETS
* sizeof *bond
->hash
;
1625 bond
->hash
= xmalloc(hash_len
);
/* Zeroing the table leaves every bucket with a null 'slave' pointer and a
 * zero byte count. */
1627 memset(bond
->hash
, 0, hash_len
);
1629 bond
->next_rebalance
= time_msec() + bond
->rebalance_interval
;
/* Finds the slave of 'bond' whose 'aux' pointer equals 'slave_'.
 *
 * Only the hash bucket selected by hash_pointer(slave_, 0) is scanned.
 *
 * NOTE(review): the map argument of the iteration macro and the return
 * statements are not visible in this excerpt; presumably the matching slave
 * is returned, or NULL when none matches -- confirm. */
1636 static struct bond_slave
*
1637 bond_slave_lookup(struct bond
*bond
, const void *slave_
)
1639 struct bond_slave
*slave
;
1641 HMAP_FOR_EACH_IN_BUCKET (slave
, hmap_node
, hash_pointer(slave_
, 0),
/* Bucket-only iteration can yield other slaves, so compare 'aux' itself. */
1643 if (slave
->aux
== slave_
) {
/* Sets the enabled state of 'slave' to 'enable'.
 *
 * Any pending delay is cancelled first by setting 'delay_expires' to
 * LLONG_MAX.  If the state actually changes, the owning bond is flagged with
 * 'bond_revalidate', 'slave->enabled' is updated, the slave's 'list_node' is
 * inserted into or removed from the bond's 'enabled_slaves' list while
 * 'bond->mutex' is held, and the new state is logged.
 *
 * NOTE(review): the test that chooses between list_insert() and
 * list_remove() is not visible in this excerpt; presumably it is 'enable' --
 * confirm. */
1652 bond_enable_slave(struct bond_slave
*slave
, bool enable
)
/* LLONG_MAX marks "no delay pending". */
1654 slave
->delay_expires
= LLONG_MAX
;
1655 if (enable
!= slave
->enabled
) {
1656 slave
->bond
->bond_revalidate
= true;
1657 slave
->enabled
= enable
;
/* 'enabled_slaves' is modified only with bond->mutex held. */
1659 ovs_mutex_lock(&slave
->bond
->mutex
);
1661 list_insert(&slave
->bond
->enabled_slaves
, &slave
->list_node
);
1663 list_remove(&slave
->list_node
);
1665 ovs_mutex_unlock(&slave
->bond
->mutex
);
1667 VLOG_INFO("interface %s: %s", slave
->name
,
1668 slave
->enabled
? "enabled" : "disabled");
/* Re-evaluates the link state of 'slave' and starts, cancels or completes its
 * updelay/downdelay timer.
 *
 * 'up' is true when the netdev reports carrier and 'may_enable' is set.  The
 * timer is touched only when the pair (up == enabled, timer idle) is
 * inconsistent:
 *   - 'up' agrees with 'enabled' but a timer is pending: cancel the timer.
 *   - 'up' disagrees with 'enabled' and no timer is pending: start one.  The
 *     delay is 0 when LACP is not disabled, otherwise 'updelay' or
 *     'downdelay' depending on 'up'.
 * Finally, once the current time has reached 'delay_expires',
 * bond_enable_slave(slave, up) is called.
 *
 * NOTE(review): some lines (declarations, else branches, parts of the last
 * log statement) are not visible in this excerpt. */
1673 bond_link_status_update(struct bond_slave
*slave
)
1675 struct bond
*bond
= slave
->bond
;
1678 up
= netdev_get_carrier(slave
->netdev
) && slave
->may_enable
;
/* True exactly when the timer state does not match the link state. */
1679 if ((up
== slave
->enabled
) != (slave
->delay_expires
== LLONG_MAX
)) {
1680 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
1681 VLOG_INFO_RL(&rl
, "interface %s: link state %s",
1682 slave
->name
, up
? "up" : "down");
/* Link returned to its previous state before the delay ran out: cancel. */
1683 if (up
== slave
->enabled
) {
1684 slave
->delay_expires
= LLONG_MAX
;
1685 VLOG_INFO_RL(&rl
, "interface %s: will not be %s",
1686 slave
->name
, up
? "disabled" : "enabled");
/* Link changed: arm the timer. */
1688 int delay
= (bond
->lacp_status
!= LACP_DISABLED
? 0
1689 : up
? bond
->updelay
: bond
->downdelay
);
1690 slave
->delay_expires
= time_msec() + delay
;
1692 VLOG_INFO_RL(&rl
, "interface %s: will be %s if it stays %s "
1695 up
? "enabled" : "disabled",
/* Timer expired: apply the new state. */
1702 if (time_msec() >= slave
->delay_expires
) {
1703 bond_enable_slave(slave
, up
);
/* Returns hash_mac(mac, vlan, basis), i.e. a hash of the Ethernet address
 * 'mac' combined with 'vlan' and seeded by 'basis'.  Callers in this file
 * mask the result with BOND_MASK to obtain a bucket index. */
1708 bond_hash_src(const uint8_t mac
[ETH_ADDR_LEN
], uint16_t vlan
, uint32_t basis
)
1710 return hash_mac(mac
, vlan
, basis
);
/* Returns flow_hash_symmetric_l4() of a copy of 'flow' whose 'vlan_tci' has
 * been replaced by 'vlan' (converted with htons()), seeded by 'basis'.  The
 * caller's 'flow' is not modified. */
1714 bond_hash_tcp(const struct flow
*flow
, uint16_t vlan
, uint32_t basis
)
/* Work on a copy so that the VLAN can be overridden. */
1716 struct flow hash_flow
= *flow
;
1717 hash_flow
.vlan_tci
= htons(vlan
);
1719 /* The symmetric quality of this hash function is not required, but
1720 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1721 * purposes, so we use it out of convenience. */
1722 return flow_hash_symmetric_l4(&hash_flow
, basis
);
/* Hashes 'flow' according to the balance mode of 'bond', using 'bond->basis'
 * as the seed: bond_hash_tcp() for BM_TCP, bond_hash_src() on the flow's
 * 'dl_src' for BM_SLB.  Any other balance mode fails the assertion. */
1726 bond_hash(const struct bond
*bond
, const struct flow
*flow
, uint16_t vlan
)
1728 ovs_assert(bond
->balance
== BM_TCP
|| bond
->balance
== BM_SLB
);
1730 return (bond
->balance
== BM_TCP
1731 ? bond_hash_tcp(flow
, vlan
, bond
->basis
)
1732 : bond_hash_src(flow
->dl_src
, vlan
, bond
->basis
));
/* Returns the hash bucket of 'bond' that 'flow' maps to: bond_hash() reduced
 * to a bucket index with BOND_MASK.
 *
 * NOTE(review): the remainder of the parameter list (which supplies 'vlan')
 * is not visible in this excerpt. */
1735 static struct bond_entry
*
1736 lookup_bond_entry(const struct bond
*bond
, const struct flow
*flow
,
1739 return &bond
->hash
[bond_hash(bond
, flow
, vlan
) & BOND_MASK
];
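1742 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1743 * in a round-robin fashion: the slave at the front of the list is moved
 * to the back and returned.  The case of an empty 'enabled_slaves' list is
 * checked first, with 'bond->mutex' held. */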
1745 static struct bond_slave
*
1746 get_enabled_slave(struct bond
*bond
)
1748 struct ovs_list
*node
;
/* 'enabled_slaves' is examined and rotated only with bond->mutex held. */
1750 ovs_mutex_lock(&bond
->mutex
);
/* NOTE(review): the statement that follows the unlock on the empty-list path
 * is not visible in this excerpt. */
1751 if (list_is_empty(&bond
->enabled_slaves
)) {
1752 ovs_mutex_unlock(&bond
->mutex
);
/* Rotate: take the front node and append it at the back, so that the next
 * call picks the following slave. */
1756 node
= list_pop_front(&bond
->enabled_slaves
);
1757 list_push_back(&bond
->enabled_slaves
, node
);
1758 ovs_mutex_unlock(&bond
->mutex
);
/* Convert the list node back to its containing bond_slave. */
1760 return CONTAINER_OF(node
, struct bond_slave
, list_node
);
/* Chooses the slave on which 'flow' (in VLAN 'vlan') should be output.
 *
 * The decision starts from 'bond->balance' and takes the LACP status into
 * account.  On the hashing paths the fields that feed the hash are recorded
 * through flow_mask_hash_fields() (symmetric L4 fields or Ethernet source),
 * the flow's bucket is looked up, and a bucket that has no slave or whose
 * slave is disabled is reassigned via get_enabled_slave().
 *
 * NOTE(review): the switch/case structure and most return statements are not
 * visible in this excerpt, so which statement belongs to which balance mode
 * is an inference -- confirm against the full source. */
1763 static struct bond_slave
*
1764 choose_output_slave(const struct bond
*bond
, const struct flow
*flow
,
1765 struct flow_wildcards
*wc
, uint16_t vlan
)
1767 struct bond_entry
*e
;
1770 balance
= bond
->balance
;
1771 if (bond
->lacp_status
== LACP_CONFIGURED
) {
1772 /* LACP has been configured on this bond but negotiations were
1773 * unsuccessful. If lacp_fallback_ab is enabled use active-
1774 * backup mode else drop all traffic. */
1775 if (!bond
->lacp_fallback_ab
) {
1783 return bond
->active_slave
;
1786 if (bond
->lacp_status
!= LACP_NEGOTIATED
) {
1787 /* Must have LACP negotiations for TCP balanced bonds. */
1791 flow_mask_hash_fields(flow
, wc
, NX_HASH_FIELDS_SYMMETRIC_L4
);
1796 flow_mask_hash_fields(flow
, wc
, NX_HASH_FIELDS_ETH_SRC
);
1798 e
= lookup_bond_entry(bond
, flow
, vlan
);
/* (Re)assign the bucket when it has no usable slave.  The CONST_CAST is
 * needed because get_enabled_slave() takes a non-const bond. */
1799 if (!e
->slave
|| !e
->slave
->enabled
) {
1800 e
->slave
= get_enabled_slave(CONST_CAST(struct bond
*, bond
));
/* Picks a candidate active slave for 'bond', trying in order:
 *   1. the slave whose MAC equals 'bond->active_slave_mac', if it is enabled;
 *   2. an enabled slave found while walking 'bond->slaves';
 *   3. among slaves with a pending delay and 'may_enable' set, the one whose
 *      'delay_expires' is earliest.
 *
 * NOTE(review): the return statements and the initialisation/assignment of
 * 'best' are not visible in this excerpt. */
1809 static struct bond_slave
*
1810 bond_choose_slave(const struct bond
*bond
)
1812 struct bond_slave
*slave
, *best
;
1814 /* Find the last active slave. */
1815 slave
= bond_find_slave_by_mac(bond
, bond
->active_slave_mac
);
1816 if (slave
&& slave
->enabled
) {
1820 /* Find an enabled slave. */
1821 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1822 if (slave
->enabled
) {
1827 /* All interfaces are disabled. Find an interface that will be enabled
1828 * after its updelay expires. */
1830 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
/* Candidate: has a delay pending, may be enabled, and expires sooner than
 * the best candidate so far. */
1831 if (slave
->delay_expires
!= LLONG_MAX
1832 && slave
->may_enable
1833 && (!best
|| slave
->delay_expires
< best
->delay_expires
)) {
/* Re-selects the active slave of 'bond' using bond_choose_slave().
 *
 * When a slave is chosen: if it is already enabled the choice is just logged;
 * otherwise the log message reports the remaining updelay being skipped and
 * the slave is enabled immediately with bond_enable_slave().
 * 'send_learning_packets' is set, and bond_active_slave_changed() is called
 * if the choice differs from the previous active slave.
 *
 * When no slave is chosen but there was one before, "all interfaces disabled"
 * is logged.  All log messages are rate-limited.
 *
 * NOTE(review): some closing braces and else lines are not visible in this
 * excerpt, so the exact nesting of the statements after the log messages is
 * inferred -- confirm. */
1841 bond_choose_active_slave(struct bond
*bond
)
1843 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
/* Remember the previous choice so that a change can be detected below. */
1844 struct bond_slave
*old_active_slave
= bond
->active_slave
;
1846 bond
->active_slave
= bond_choose_slave(bond
);
1847 if (bond
->active_slave
) {
1848 if (bond
->active_slave
->enabled
) {
1849 VLOG_INFO_RL(&rl
, "bond %s: active interface is now %s",
1850 bond
->name
, bond
->active_slave
->name
);
/* Chosen slave is still waiting out its updelay: enable it right away. */
1852 VLOG_INFO_RL(&rl
, "bond %s: active interface is now %s, skipping "
1853 "remaining %lld ms updelay (since no interface was "
1854 "enabled)", bond
->name
, bond
->active_slave
->name
,
1855 bond
->active_slave
->delay_expires
- time_msec());
1856 bond_enable_slave(bond
->active_slave
, true);
1859 bond
->send_learning_packets
= true;
1861 if (bond
->active_slave
!= old_active_slave
) {
1862 bond_active_slave_changed(bond
);
1864 } else if (old_active_slave
) {
1865 VLOG_INFO_RL(&rl
, "bond %s: all interfaces disabled", bond
->name
);
1870 * Return true if bond has unstored active slave change.
1871 * If return true, 'mac' will store the bond's current active slave's
1874 bond_get_changed_active_slave(const char *name
, uint8_t* mac
, bool force
)
1878 ovs_rwlock_wrlock(&rwlock
);
1879 bond
= bond_find(name
);
1881 if (bond
->active_slave_changed
|| force
) {
1882 memcpy(mac
, bond
->active_slave_mac
, ETH_ADDR_LEN
);
1883 bond
->active_slave_changed
= false;
1884 ovs_rwlock_unlock(&rwlock
);
1888 ovs_rwlock_unlock(&rwlock
);