2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "ofp-actions.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
33 #include "dynamic-string.h"
42 #include "poll-loop.h"
50 VLOG_DEFINE_THIS_MODULE(bond
);
/* 'rwlock' is the file-wide read/write lock named by the OVS_GUARDED_BY,
 * OVS_REQ_RDLOCK and OVS_REQ_WRLOCK annotations throughout this file.
 *
 * 'all_bonds__' is the backing hmap; 'all_bonds' is a constant pointer to it
 * that is annotated as guarded by 'rwlock', so code should go through
 * 'all_bonds' rather than touching 'all_bonds__' directly. */
52 static struct ovs_rwlock rwlock
= OVS_RWLOCK_INITIALIZER
;
53 static struct hmap all_bonds__
= HMAP_INITIALIZER(&all_bonds__
);
54 static struct hmap
*const all_bonds
OVS_GUARDED_BY(rwlock
) = &all_bonds__
;
56 /* Bit-mask for hashing a flow down to a bucket. */
57 #define BOND_MASK 0xff
/* Number of hash buckets: one for each value that fits in BOND_MASK. */
58 #define BOND_BUCKETS (BOND_MASK + 1)
60 /* A hash bucket for mapping a flow to a slave.
61 * "struct bond" has an array of BOND_BUCKETS of these. */
63 struct bond_slave
*slave
; /* Assigned slave, NULL if unassigned. */
64 uint64_t tx_bytes
/* Count of bytes recently transmitted. */
65 OVS_GUARDED_BY(rwlock
);
66 struct list list_node
; /* In bond_slave's 'entries' list. */
70 * 'pr_rule' is the post-recirculation rule for this entry.
71 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
72 * is used to determine delta (applied to 'tx_bytes' above.) */
74 uint64_t pr_tx_bytes
OVS_GUARDED_BY(rwlock
);
77 /* A bond slave, that is, one of the links comprising a bond. */
79 struct hmap_node hmap_node
; /* In struct bond's slaves hmap. */
80 struct list list_node
; /* In struct bond's enabled_slaves list. */
81 struct bond
*bond
; /* The bond that contains this slave. */
82 void *aux
; /* Client-provided handle for this slave. */
84 struct netdev
*netdev
; /* Network device, owned by the client. */
85 unsigned int change_seq
; /* Tracks changes in 'netdev'. */
86 ofp_port_t ofp_port
; /* OpenFlow port number. */
87 char *name
; /* Name (a copy of netdev_get_name(netdev)). */
90 long long delay_expires
; /* Time after which 'enabled' may change. */
91 bool enabled
; /* May be chosen for flows? */
92 bool may_enable
; /* Client considers this slave bondable. */
94 /* Rebalancing info. Used only by bond_rebalance(). */
95 struct list bal_node
; /* In bond_rebalance()'s 'bals' list. */
96 struct list entries
; /* 'struct bond_entry's assigned here. */
97 uint64_t tx_bytes
; /* Sum across 'tx_bytes' of entries. */
100 /* A bond, that is, a set of network devices grouped to improve performance or
103 struct hmap_node hmap_node
; /* In 'all_bonds' hmap. */
104 char *name
; /* Name provided by client. */
105 struct ofproto_dpif
*ofproto
; /* The bridge this bond belongs to. */
112 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
113 * (To prevent the bond_slave from disappearing they must also hold
115 struct ovs_mutex mutex
OVS_ACQ_AFTER(rwlock
);
116 struct list enabled_slaves OVS_GUARDED
; /* Contains struct bond_slaves. */
119 enum bond_mode balance
; /* Balancing mode, one of BM_*. */
120 struct bond_slave
*active_slave
;
121 int updelay
, downdelay
; /* Delay before slave goes up/down, in ms. */
122 enum lacp_status lacp_status
; /* Status of LACP negotiations. */
123 bool bond_revalidate
; /* True if flows need revalidation. */
124 uint32_t basis
; /* Basis for flow hash function. */
126 /* SLB specific bonding info. */
127 struct bond_entry
*hash
; /* An array of BOND_BUCKETS elements. */
128 int rebalance_interval
; /* Interval between rebalances, in ms. */
129 long long int next_rebalance
; /* Next rebalancing time. */
130 bool send_learning_packets
;
131 uint32_t recirc_id
; /* Non zero if recirculation can be used.*/
132 struct hmap pr_rule_ops
; /* Helps to maintain post recirculation rules.*/
134 /* Store active slave to OVSDB. */
135 bool active_slave_changed
; /* Set to true whenever the bond changes
136 active slave. It will be reset to false
137 after it is stored into OVSDB */
139 /* Interface name may not be persistent across an OS reboot, so use
140 * the MAC address for identifying the active slave. */
141 uint8_t active_slave_mac
[ETH_ADDR_LEN
];
142 /* The MAC address of the active interface. */
143 /* Legacy compatibility. */
144 bool lacp_fallback_ab
; /* Fallback to active-backup on LACP failure. */
146 struct ovs_refcount ref_cnt
;
149 /* What to do with a bond_pr_rule_op. */
151 ADD
, /* Add the rule to ofproto's flow table. */
152 DEL
, /* Delete the rule from the ofproto's flow table. */
155 /* A rule to add to or delete from ofproto's internal flow table. */
156 struct bond_pr_rule_op
{
157 struct hmap_node hmap_node
;
159 ofp_port_t out_ofport
;
161 struct rule
**pr_rule
;
164 static void bond_entry_reset(struct bond
*) OVS_REQ_WRLOCK(rwlock
);
165 static struct bond_slave
*bond_slave_lookup(struct bond
*, const void *slave_
)
166 OVS_REQ_RDLOCK(rwlock
);
167 static void bond_enable_slave(struct bond_slave
*, bool enable
)
168 OVS_REQ_WRLOCK(rwlock
);
169 static void bond_link_status_update(struct bond_slave
*)
170 OVS_REQ_WRLOCK(rwlock
);
171 static void bond_choose_active_slave(struct bond
*)
172 OVS_REQ_WRLOCK(rwlock
);
173 static unsigned int bond_hash_src(const uint8_t mac
[ETH_ADDR_LEN
],
174 uint16_t vlan
, uint32_t basis
);
175 static unsigned int bond_hash_tcp(const struct flow
*, uint16_t vlan
,
177 static struct bond_entry
*lookup_bond_entry(const struct bond
*,
180 OVS_REQ_RDLOCK(rwlock
);
181 static struct bond_slave
*get_enabled_slave(struct bond
*)
182 OVS_REQ_RDLOCK(rwlock
);
183 static struct bond_slave
*choose_output_slave(const struct bond
*,
185 struct flow_wildcards
*,
187 OVS_REQ_RDLOCK(rwlock
);
189 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
190 * stores the mode in '*balance' and returns true. Otherwise returns false
191 * without modifying '*balance'. */
193 bond_mode_from_string(enum bond_mode
*balance
, const char *s
)
195 if (!strcmp(s
, bond_mode_to_string(BM_TCP
))) {
197 } else if (!strcmp(s
, bond_mode_to_string(BM_SLB
))) {
199 } else if (!strcmp(s
, bond_mode_to_string(BM_AB
))) {
207 /* Returns a string representing 'balance'. */
209 bond_mode_to_string(enum bond_mode balance
) {
212 return "balance-tcp";
214 return "balance-slb";
216 return "active-backup";
222 /* Creates and returns a new bond whose configuration is initially taken from
225 * The caller should register each slave on the new bond by calling
226 * bond_slave_register(). */
228 bond_create(const struct bond_settings
*s
, struct ofproto_dpif
*ofproto
)
232 bond
= xzalloc(sizeof *bond
);
233 bond
->ofproto
= ofproto
;
234 hmap_init(&bond
->slaves
);
235 list_init(&bond
->enabled_slaves
);
236 ovs_mutex_init(&bond
->mutex
);
237 ovs_refcount_init(&bond
->ref_cnt
);
240 hmap_init(&bond
->pr_rule_ops
);
242 bond_reconfigure(bond
, s
);
247 bond_ref(const struct bond
*bond_
)
249 struct bond
*bond
= CONST_CAST(struct bond
*, bond_
);
252 ovs_refcount_ref(&bond
->ref_cnt
);
259 bond_unref(struct bond
*bond
)
261 struct bond_slave
*slave
, *next_slave
;
262 struct bond_pr_rule_op
*pr_op
, *next_op
;
264 if (!bond
|| ovs_refcount_unref_relaxed(&bond
->ref_cnt
) != 1) {
268 ovs_rwlock_wrlock(&rwlock
);
269 hmap_remove(all_bonds
, &bond
->hmap_node
);
270 ovs_rwlock_unlock(&rwlock
);
272 HMAP_FOR_EACH_SAFE (slave
, next_slave
, hmap_node
, &bond
->slaves
) {
273 hmap_remove(&bond
->slaves
, &slave
->hmap_node
);
274 /* Client owns 'slave->netdev'. */
278 hmap_destroy(&bond
->slaves
);
280 ovs_mutex_destroy(&bond
->mutex
);
284 HMAP_FOR_EACH_SAFE(pr_op
, next_op
, hmap_node
, &bond
->pr_rule_ops
) {
285 hmap_remove(&bond
->pr_rule_ops
, &pr_op
->hmap_node
);
288 hmap_destroy(&bond
->pr_rule_ops
);
290 if (bond
->recirc_id
) {
291 ofproto_dpif_free_recirc_id(bond
->ofproto
, bond
->recirc_id
);
/* Records in 'bond->pr_rule_ops' that the post-recirculation rule for 'match'
 * should output to 'out_ofport', with 'rule' as the location where the
 * corresponding rule pointer is kept.
 *
 * Ops are keyed by match_hash(match, 0).  If an op whose match equals 'match'
 * is already present, its 'out_ofport' and 'pr_rule' are overwritten
 * (presumably followed by an early return -- TODO confirm); otherwise a new
 * op is allocated with xmalloc(), filled in from the arguments, and inserted
 * into 'bond->pr_rule_ops'. */
298 add_pr_rule(struct bond
*bond
, const struct match
*match
,
299 ofp_port_t out_ofport
, struct rule
**rule
)
301 uint32_t hash
= match_hash(match
, 0);
302 struct bond_pr_rule_op
*pr_op
;
304 HMAP_FOR_EACH_WITH_HASH(pr_op
, hmap_node
, hash
, &bond
->pr_rule_ops
) {
305 if (match_equal(&pr_op
->match
, match
)) {
307 pr_op
->out_ofport
= out_ofport
;
308 pr_op
->pr_rule
= rule
;
313 pr_op
= xmalloc(sizeof *pr_op
);
314 pr_op
->match
= *match
;
316 pr_op
->out_ofport
= out_ofport
;
317 pr_op
->pr_rule
= rule
;
318 hmap_insert(&bond
->pr_rule_ops
, &pr_op
->hmap_node
, hash
);
322 update_recirc_rules(struct bond
*bond
)
325 struct bond_pr_rule_op
*pr_op
, *next_op
;
326 uint64_t ofpacts_stub
[128 / 8];
327 struct ofpbuf ofpacts
;
330 ofpbuf_use_stub(&ofpacts
, ofpacts_stub
, sizeof ofpacts_stub
);
332 HMAP_FOR_EACH(pr_op
, hmap_node
, &bond
->pr_rule_ops
) {
336 if (bond
->hash
&& bond
->recirc_id
) {
337 for (i
= 0; i
< BOND_BUCKETS
; i
++) {
338 struct bond_slave
*slave
= bond
->hash
[i
].slave
;
341 match_init_catchall(&match
);
342 match_set_recirc_id(&match
, bond
->recirc_id
);
343 match_set_dp_hash_masked(&match
, i
, BOND_MASK
);
345 add_pr_rule(bond
, &match
, slave
->ofp_port
,
346 &bond
->hash
[i
].pr_rule
);
351 HMAP_FOR_EACH_SAFE(pr_op
, next_op
, hmap_node
, &bond
->pr_rule_ops
) {
355 ofpbuf_clear(&ofpacts
);
356 ofpact_put_OUTPUT(&ofpacts
)->port
= pr_op
->out_ofport
;
357 error
= ofproto_dpif_add_internal_flow(bond
->ofproto
,
359 RECIRC_RULE_PRIORITY
, 0,
360 &ofpacts
, pr_op
->pr_rule
);
362 char *err_s
= match_to_string(&pr_op
->match
,
363 RECIRC_RULE_PRIORITY
);
365 VLOG_ERR("failed to add post recirculation flow %s", err_s
);
371 error
= ofproto_dpif_delete_internal_flow(bond
->ofproto
,
373 RECIRC_RULE_PRIORITY
);
375 char *err_s
= match_to_string(&pr_op
->match
,
376 RECIRC_RULE_PRIORITY
);
378 VLOG_ERR("failed to remove post recirculation flow %s", err_s
);
382 hmap_remove(&bond
->pr_rule_ops
, &pr_op
->hmap_node
);
383 *pr_op
->pr_rule
= NULL
;
389 ofpbuf_uninit(&ofpacts
);
393 /* Updates 'bond''s overall configuration to 's'.
395 * The caller should register each slave on 'bond' by calling
396 * bond_slave_register(). This is optional if none of the slaves'
397 * configuration has changed. In any case it can't hurt.
399 * Returns true if the configuration has changed in such a way that requires
403 bond_reconfigure(struct bond
*bond
, const struct bond_settings
*s
)
405 bool revalidate
= false;
407 ovs_rwlock_wrlock(&rwlock
);
408 if (!bond
->name
|| strcmp(bond
->name
, s
->name
)) {
410 hmap_remove(all_bonds
, &bond
->hmap_node
);
413 bond
->name
= xstrdup(s
->name
);
414 hmap_insert(all_bonds
, &bond
->hmap_node
, hash_string(bond
->name
, 0));
417 bond
->updelay
= s
->up_delay
;
418 bond
->downdelay
= s
->down_delay
;
420 if (bond
->lacp_fallback_ab
!= s
->lacp_fallback_ab_cfg
) {
421 bond
->lacp_fallback_ab
= s
->lacp_fallback_ab_cfg
;
425 if (bond
->rebalance_interval
!= s
->rebalance_interval
) {
426 bond
->rebalance_interval
= s
->rebalance_interval
;
430 if (bond
->balance
!= s
->balance
) {
431 bond
->balance
= s
->balance
;
435 if (bond
->basis
!= s
->basis
) {
436 bond
->basis
= s
->basis
;
440 if (bond
->bond_revalidate
) {
442 bond
->bond_revalidate
= false;
445 if (bond
->balance
!= BM_AB
) {
446 if (!bond
->recirc_id
) {
447 bond
->recirc_id
= ofproto_dpif_alloc_recirc_id(bond
->ofproto
);
449 } else if (bond
->recirc_id
) {
450 ofproto_dpif_free_recirc_id(bond
->ofproto
, bond
->recirc_id
);
454 if (bond
->balance
== BM_AB
|| !bond
->hash
|| revalidate
) {
455 bond_entry_reset(bond
);
458 memcpy(bond
->active_slave_mac
, s
->active_slave_mac
,
459 sizeof s
->active_slave_mac
);
461 bond
->active_slave_changed
= false;
463 ovs_rwlock_unlock(&rwlock
);
467 static struct bond_slave
*
468 bond_find_slave_by_mac(const struct bond
*bond
, const uint8_t mac
[ETH_ADDR_LEN
])
470 struct bond_slave
*slave
;
472 /* Find the last active slave */
473 HMAP_FOR_EACH(slave
, hmap_node
, &bond
->slaves
) {
474 uint8_t slave_mac
[ETH_ADDR_LEN
];
476 if (netdev_get_etheraddr(slave
->netdev
, slave_mac
)) {
480 if (!memcmp(slave_mac
, mac
, sizeof(slave_mac
))) {
/* Records that 'bond' has a new active slave: copies the active slave's
 * Ethernet address into 'bond->active_slave_mac', sets
 * 'bond->active_slave_changed', and changes the connectivity sequence number
 * (the one bond_wait() waits on with seq_wait()).
 *
 * NOTE(review): the return value of netdev_get_etheraddr() is not checked
 * here, so on failure 'mac' may be copied uninitialized; 'bond->active_slave'
 * is also dereferenced without a null check -- confirm that callers only
 * invoke this with a non-null active slave. */
489 bond_active_slave_changed(struct bond
*bond
)
491 uint8_t mac
[ETH_ADDR_LEN
];
493 netdev_get_etheraddr(bond
->active_slave
->netdev
, mac
);
494 memcpy(bond
->active_slave_mac
, mac
, sizeof bond
->active_slave_mac
);
495 bond
->active_slave_changed
= true;
496 seq_change(connectivity_seq_get());
/* Points 'slave' at 'netdev'.  Does nothing if 'slave' already uses 'netdev';
 * otherwise stores the new device and resets 'slave->change_seq' to 0.
 *
 * The caller must hold 'rwlock' for writing. */
500 bond_slave_set_netdev__(struct bond_slave
*slave
, struct netdev
*netdev
)
501 OVS_REQ_WRLOCK(rwlock
)
503 if (slave
->netdev
!= netdev
) {
504 slave
->netdev
= netdev
;
505 slave
->change_seq
= 0;
509 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
510 * arbitrary client-provided pointer that uniquely identifies a slave within a
511 * bond. If 'slave_' already exists within 'bond' then this function
512 * reconfigures the existing slave.
514 * 'netdev' must be the network device that 'slave_' represents. It is owned
515 * by the client, so the client must not close it before either unregistering
516 * 'slave_' or destroying 'bond'.
519 bond_slave_register(struct bond
*bond
, void *slave_
,
520 ofp_port_t ofport
, struct netdev
*netdev
)
522 struct bond_slave
*slave
;
524 ovs_rwlock_wrlock(&rwlock
);
525 slave
= bond_slave_lookup(bond
, slave_
);
527 slave
= xzalloc(sizeof *slave
);
529 hmap_insert(&bond
->slaves
, &slave
->hmap_node
, hash_pointer(slave_
, 0));
532 slave
->ofp_port
= ofport
;
533 slave
->delay_expires
= LLONG_MAX
;
534 slave
->name
= xstrdup(netdev_get_name(netdev
));
535 bond
->bond_revalidate
= true;
537 slave
->enabled
= false;
538 bond_enable_slave(slave
, netdev_get_carrier(netdev
));
541 bond_slave_set_netdev__(slave
, netdev
);
544 slave
->name
= xstrdup(netdev_get_name(netdev
));
545 ovs_rwlock_unlock(&rwlock
);
548 /* Updates the network device to be used with 'slave_' to 'netdev'.
550 * This is useful if the caller closes and re-opens the network device
551 * registered with bond_slave_register() but doesn't need to change anything
554 bond_slave_set_netdev(struct bond
*bond
, void *slave_
, struct netdev
*netdev
)
556 struct bond_slave
*slave
;
558 ovs_rwlock_wrlock(&rwlock
);
559 slave
= bond_slave_lookup(bond
, slave_
);
561 bond_slave_set_netdev__(slave
, netdev
);
563 ovs_rwlock_unlock(&rwlock
);
566 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
567 * then this function has no effect.
569 * Unregistering a slave invalidates all flows. */
571 bond_slave_unregister(struct bond
*bond
, const void *slave_
)
573 struct bond_slave
*slave
;
576 ovs_rwlock_wrlock(&rwlock
);
577 slave
= bond_slave_lookup(bond
, slave_
);
582 bond
->bond_revalidate
= true;
583 bond_enable_slave(slave
, false);
585 del_active
= bond
->active_slave
== slave
;
587 struct bond_entry
*e
;
588 for (e
= bond
->hash
; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
589 if (e
->slave
== slave
) {
597 hmap_remove(&bond
->slaves
, &slave
->hmap_node
);
598 /* Client owns 'slave->netdev'. */
602 bond_choose_active_slave(bond
);
603 bond
->send_learning_packets
= true;
606 ovs_rwlock_unlock(&rwlock
);
609 /* Should be called on each slave in 'bond' before bond_run() to indicate
610 * whether or not 'slave_' may be enabled. This function is intended to allow
611 * other protocols to have some impact on bonding decisions. For example LACP
612 * or high level link monitoring protocols may decide that a given slave should
613 * not be able to send traffic. */
615 bond_slave_set_may_enable(struct bond
*bond
, void *slave_
, bool may_enable
)
617 ovs_rwlock_wrlock(&rwlock
);
/* NOTE(review): the lookup result is dereferenced without a null check, so
 * 'slave_' must already be registered on 'bond'.  bond_slave_unregister()
 * documents that a lookup may find no slave -- confirm every caller of this
 * function guarantees registration first. */
618 bond_slave_lookup(bond
, slave_
)->may_enable
= may_enable
;
619 ovs_rwlock_unlock(&rwlock
);
622 /* Performs periodic maintenance on 'bond'.
624 * Returns true if the caller should revalidate its flows.
626 * The caller should check bond_should_send_learning_packets() afterward. */
628 bond_run(struct bond
*bond
, enum lacp_status lacp_status
)
630 struct bond_slave
*slave
;
633 ovs_rwlock_wrlock(&rwlock
);
634 if (bond
->lacp_status
!= lacp_status
) {
635 bond
->lacp_status
= lacp_status
;
636 bond
->bond_revalidate
= true;
639 /* Enable slaves based on link status and LACP feedback. */
640 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
641 bond_link_status_update(slave
);
642 slave
->change_seq
= seq_read(connectivity_seq_get());
644 if (!bond
->active_slave
|| !bond
->active_slave
->enabled
) {
645 bond_choose_active_slave(bond
);
648 revalidate
= bond
->bond_revalidate
;
649 bond
->bond_revalidate
= false;
650 ovs_rwlock_unlock(&rwlock
);
655 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
657 bond_wait(struct bond
*bond
)
659 struct bond_slave
*slave
;
661 ovs_rwlock_rdlock(&rwlock
);
662 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
663 if (slave
->delay_expires
!= LLONG_MAX
) {
664 poll_timer_wait_until(slave
->delay_expires
);
667 seq_wait(connectivity_seq_get(), slave
->change_seq
);
670 if (bond
->bond_revalidate
) {
671 poll_immediate_wake();
673 ovs_rwlock_unlock(&rwlock
);
675 /* We don't wait for bond->next_rebalance because rebalancing can only run
676 * at a flow account checkpoint. ofproto does checkpointing on its own
677 * schedule and bond_rebalance() gets called afterward, so we'd just be
678 * waking up for no purpose. */
681 /* MAC learning table interaction. */
/* Returns true if learning packets may be sent on 'bond'.  That requires
 * 'bond' to have an active slave and one of the following to hold:
 *
 *   - LACP is disabled and the balancing mode is SLB or active-backup, or
 *
 *   - 'lacp_fallback_ab' is set and the LACP status is LACP_CONFIGURED. */
684 may_send_learning_packets(const struct bond
*bond
)
686 return ((bond
->lacp_status
== LACP_DISABLED
687 && (bond
->balance
== BM_SLB
|| bond
->balance
== BM_AB
))
688 || (bond
->lacp_fallback_ab
&& bond
->lacp_status
== LACP_CONFIGURED
))
689 && bond
->active_slave
;
692 /* Returns true if 'bond' needs the client to send out packets to assist with
693 * MAC learning on 'bond'. If this function returns true, then the client
694 * should iterate through its MAC learning table for the bridge on which 'bond'
695 * is located. For each MAC that has been learned on a port other than 'bond',
696 * it should call bond_compose_learning_packet().
698 * This function will only return true if 'bond' is in SLB or active-backup
699 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
702 * Calling this function resets the state that it checks. */
704 bond_should_send_learning_packets(struct bond
*bond
)
708 ovs_rwlock_wrlock(&rwlock
);
709 send
= bond
->send_learning_packets
&& may_send_learning_packets(bond
);
710 bond
->send_learning_packets
= false;
711 ovs_rwlock_unlock(&rwlock
);
715 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
717 * See bond_should_send_learning_packets() for description of usage. The
718 * caller should send the composed packet on the port associated with
719 * port_aux and takes ownership of the returned ofpbuf. */
721 bond_compose_learning_packet(struct bond
*bond
,
722 const uint8_t eth_src
[ETH_ADDR_LEN
],
723 uint16_t vlan
, void **port_aux
)
725 struct bond_slave
*slave
;
726 struct ofpbuf
*packet
;
729 ovs_rwlock_rdlock(&rwlock
);
730 ovs_assert(may_send_learning_packets(bond
));
731 memset(&flow
, 0, sizeof flow
);
732 memcpy(flow
.dl_src
, eth_src
, ETH_ADDR_LEN
);
733 slave
= choose_output_slave(bond
, &flow
, NULL
, vlan
);
735 packet
= ofpbuf_new(0);
736 compose_rarp(packet
, eth_src
);
738 eth_push_vlan(packet
, htons(ETH_TYPE_VLAN
), htons(vlan
));
741 *port_aux
= slave
->aux
;
742 ovs_rwlock_unlock(&rwlock
);
746 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
747 * Ethernet destination address of 'eth_dst', should be admitted.
749 * The return value is one of the following:
751 * - BV_ACCEPT: Admit the packet.
753 * - BV_DROP: Drop the packet.
755 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
756 * Ethernet source address and VLAN. If there is none, or if the packet
757 * is on the learned port, then admit the packet. If a different port has
758 * been learned, however, drop the packet (and do not use it for MAC
762 bond_check_admissibility(struct bond
*bond
, const void *slave_
,
763 const uint8_t eth_dst
[ETH_ADDR_LEN
])
765 enum bond_verdict verdict
= BV_DROP
;
766 struct bond_slave
*slave
;
768 ovs_rwlock_rdlock(&rwlock
);
769 slave
= bond_slave_lookup(bond
, slave_
);
774 /* LACP bonds have very loose admissibility restrictions because we can
775 * assume the remote switch is aware of the bond and will "do the right
776 * thing". However, as a precaution we drop packets on disabled slaves
777 * because no correctly implemented partner switch should be sending
780 * If LACP is configured, but LACP negotiations have been unsuccessful, we
781 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
782 switch (bond
->lacp_status
) {
783 case LACP_NEGOTIATED
:
784 verdict
= slave
->enabled
? BV_ACCEPT
: BV_DROP
;
786 case LACP_CONFIGURED
:
787 if (!bond
->lacp_fallback_ab
) {
794 /* Drop all multicast packets on inactive slaves. */
795 if (eth_addr_is_multicast(eth_dst
)) {
796 if (bond
->active_slave
!= slave
) {
801 switch (bond
->balance
) {
803 /* TCP balanced bonds require successful LACP negotiations. Based on the
804 * above check, LACP is off or lacp_fallback_ab is true on this bond.
805 * If lacp_fallback_ab is true, fall through to the BM_AB case; otherwise,
806 * drop all incoming traffic. */
807 if (!bond
->lacp_fallback_ab
) {
812 /* Drop all packets which arrive on backup slaves. This is similar to
813 * how Linux bonding handles active-backup bonds. */
814 if (bond
->active_slave
!= slave
) {
815 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
817 VLOG_DBG_RL(&rl
, "active-backup bond received packet on backup"
818 " slave (%s) destined for " ETH_ADDR_FMT
,
819 slave
->name
, ETH_ADDR_ARGS(eth_dst
));
826 /* Drop all packets for which we have learned a different input port,
827 * because we probably sent the packet on one slave and got it back on
828 * the other. Gratuitous ARP packets are an exception to this rule:
829 * the host has moved to another switch. The exception to the
830 * exception is if we locked the learning table to avoid reflections on
832 verdict
= BV_DROP_IF_MOVED
;
838 ovs_rwlock_unlock(&rwlock
);
843 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
844 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
845 * NULL if the packet should be dropped because no slaves are enabled.
847 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
848 * should be a VID only (i.e. excluding the PCP bits). Second,
849 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
850 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
851 * packet belongs to (so for an access port it will be the access port's VLAN).
853 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
854 * significant in the selection. At some point earlier, 'wc' should
855 * have been initialized (e.g., by flow_wildcards_init_catchall()).
858 bond_choose_output_slave(struct bond
*bond
, const struct flow
*flow
,
859 struct flow_wildcards
*wc
, uint16_t vlan
)
861 struct bond_slave
*slave
;
864 ovs_rwlock_rdlock(&rwlock
);
865 slave
= choose_output_slave(bond
, flow
, wc
, vlan
);
866 aux
= slave
? slave
->aux
: NULL
;
867 ovs_rwlock_unlock(&rwlock
);
/* Folds a new reading of the post-recirculation rule's byte counter into
 * 'entry': the increase of 'rule_tx_bytes' over the previously recorded
 * 'entry->pr_tx_bytes' is added to 'entry->tx_bytes', and 'rule_tx_bytes'
 * then becomes the new 'entry->pr_tx_bytes'.
 *
 * The caller must hold 'rwlock' for writing. */
874 bond_entry_account(struct bond_entry
*entry
, uint64_t rule_tx_bytes
)
875 OVS_REQ_WRLOCK(rwlock
)
880 delta
= rule_tx_bytes
- entry
->pr_tx_bytes
;
881 entry
->tx_bytes
+= delta
;
882 entry
->pr_tx_bytes
= rule_tx_bytes
;
886 /* Maintain bond stats using post recirculation rule byte counters.*/
888 bond_recirculation_account(struct bond
*bond
)
889 OVS_REQ_WRLOCK(rwlock
)
893 for (i
=0; i
<=BOND_MASK
; i
++) {
894 struct bond_entry
*entry
= &bond
->hash
[i
];
895 struct rule
*rule
= entry
->pr_rule
;
898 uint64_t n_packets OVS_UNUSED
;
899 long long int used OVS_UNUSED
;
902 rule
->ofproto
->ofproto_class
->rule_get_stats(
903 rule
, &n_packets
, &n_bytes
, &used
);
904 bond_entry_account(entry
, n_bytes
);
910 bond_may_recirc(const struct bond
*bond
, uint32_t *recirc_id
,
913 if (bond
->balance
== BM_TCP
&& bond
->recirc_id
) {
915 *recirc_id
= bond
->recirc_id
;
918 *hash_bias
= bond
->basis
;
927 bond_update_post_recirc_rules(struct bond
* bond
, const bool force
)
929 struct bond_entry
*e
;
930 bool update_rules
= force
; /* Always update rules if caller forces it. */
932 /* Make sure all bond entries are populated */
933 for (e
= bond
->hash
; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
934 if (!e
->slave
|| !e
->slave
->enabled
) {
936 e
->slave
= CONTAINER_OF(hmap_random_node(&bond
->slaves
),
937 struct bond_slave
, hmap_node
);
938 if (!e
->slave
->enabled
) {
939 e
->slave
= bond
->active_slave
;
945 update_recirc_rules(bond
);
/* Returns true if 'bond' is subject to rebalancing, that is, if it has a
 * nonzero rebalance interval and its balancing mode is SLB or TCP. */
952 bond_is_balanced(const struct bond
*bond
) OVS_REQ_RDLOCK(rwlock
)
954 return bond
->rebalance_interval
955 && (bond
->balance
== BM_SLB
|| bond
->balance
== BM_TCP
);
958 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
960 bond_account(struct bond
*bond
, const struct flow
*flow
, uint16_t vlan
,
963 ovs_rwlock_wrlock(&rwlock
);
964 if (bond_is_balanced(bond
)) {
965 lookup_bond_entry(bond
, flow
, vlan
)->tx_bytes
+= n_bytes
;
967 ovs_rwlock_unlock(&rwlock
);
/* Given 'bal', a pointer to the 'bal_node' member embedded in a
 * struct bond_slave, returns the enclosing bond_slave. */
970 static struct bond_slave
*
971 bond_slave_from_bal_node(struct list
*bal
) OVS_REQ_RDLOCK(rwlock
)
973 return CONTAINER_OF(bal
, struct bond_slave
, bal_node
);
977 log_bals(struct bond
*bond
, const struct list
*bals
)
978 OVS_REQ_RDLOCK(rwlock
)
980 if (VLOG_IS_DBG_ENABLED()) {
981 struct ds ds
= DS_EMPTY_INITIALIZER
;
982 const struct bond_slave
*slave
;
984 LIST_FOR_EACH (slave
, bal_node
, bals
) {
986 ds_put_char(&ds
, ',');
988 ds_put_format(&ds
, " %s %"PRIu64
"kB",
989 slave
->name
, slave
->tx_bytes
/ 1024);
991 if (!slave
->enabled
) {
992 ds_put_cstr(&ds
, " (disabled)");
994 if (!list_is_empty(&slave
->entries
)) {
995 struct bond_entry
*e
;
997 ds_put_cstr(&ds
, " (");
998 LIST_FOR_EACH (e
, list_node
, &slave
->entries
) {
999 if (&e
->list_node
!= list_front(&slave
->entries
)) {
1000 ds_put_cstr(&ds
, " + ");
1002 ds_put_format(&ds
, "h%"PRIdPTR
": %"PRIu64
"kB",
1003 e
- bond
->hash
, e
->tx_bytes
/ 1024);
1005 ds_put_cstr(&ds
, ")");
1008 VLOG_DBG("bond %s:%s", bond
->name
, ds_cstr(&ds
));
1013 /* Shifts 'hash' from its current slave to 'to'. */
1015 bond_shift_load(struct bond_entry
*hash
, struct bond_slave
*to
)
1016 OVS_REQ_WRLOCK(rwlock
)
1018 struct bond_slave
*from
= hash
->slave
;
1019 struct bond
*bond
= from
->bond
;
1020 uint64_t delta
= hash
->tx_bytes
;
1022 VLOG_INFO("bond %s: shift %"PRIu64
"kB of load (with hash %"PRIdPTR
") "
1023 "from %s to %s (now carrying %"PRIu64
"kB and "
1024 "%"PRIu64
"kB load, respectively)",
1025 bond
->name
, delta
/ 1024, hash
- bond
->hash
,
1026 from
->name
, to
->name
,
1027 (from
->tx_bytes
- delta
) / 1024,
1028 (to
->tx_bytes
+ delta
) / 1024);
1030 /* Shift load away from 'from' to 'to'. */
1031 from
->tx_bytes
-= delta
;
1032 to
->tx_bytes
+= delta
;
1034 /* Arrange for flows to be revalidated. */
1036 bond
->bond_revalidate
= true;
1039 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1040 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1041 * given that doing so must decrease the ratio of the load on the two slaves by
1042 * at least 0.1. Returns NULL if there is no appropriate entry.
1044 * The list of entries isn't sorted. I don't know of a reason to prefer to
1045 * shift away small hashes or large hashes. */
1046 static struct bond_entry
*
1047 choose_entry_to_migrate(const struct bond_slave
*from
, uint64_t to_tx_bytes
)
1048 OVS_REQ_WRLOCK(rwlock
)
1050 struct bond_entry
*e
;
1052 if (list_is_short(&from
->entries
)) {
1053 /* 'from' carries no more than one MAC hash, so shifting load away from
1054 * it would be pointless. */
1058 LIST_FOR_EACH (e
, list_node
, &from
->entries
) {
1059 double old_ratio
, new_ratio
;
1062 if (to_tx_bytes
== 0) {
1063 /* Nothing on the new slave, move it. */
1067 delta
= e
->tx_bytes
;
1068 old_ratio
= (double)from
->tx_bytes
/ to_tx_bytes
;
1069 new_ratio
= (double)(from
->tx_bytes
- delta
) / (to_tx_bytes
+ delta
);
1070 if (old_ratio
- new_ratio
> 0.1
1071 && fabs(new_ratio
- 1.0) < fabs(old_ratio
- 1.0)) {
1072 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1073 and 'to' slave have the same load. Therefore, we only move an
1074 entry if it decreases the load on 'from', and brings us closer
1075 to equal traffic load. */
1083 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1086 insert_bal(struct list
*bals
, struct bond_slave
*slave
)
1088 struct bond_slave
*pos
;
1090 LIST_FOR_EACH (pos
, bal_node
, bals
) {
1091 if (slave
->tx_bytes
> pos
->tx_bytes
) {
1095 list_insert(&pos
->bal_node
, &slave
->bal_node
);
1098 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1099 * that descending order of 'tx_bytes' is maintained. */
1101 reinsert_bal(struct list
*bals
, struct bond_slave
*slave
)
1103 list_remove(&slave
->bal_node
);
1104 insert_bal(bals
, slave
);
1107 /* If 'bond' needs rebalancing, does so.
1109 * The caller should have called bond_account() for each active flow or, if
1110 * recirculation is used, bond_recirculation_account(bond), to ensure that
1111 * flow data is consistently accounted at this point.
1114 bond_rebalance(struct bond
*bond
)
1116 struct bond_slave
*slave
;
1117 struct bond_entry
*e
;
1119 bool rebalanced
= false;
1122 ovs_rwlock_wrlock(&rwlock
);
1123 if (!bond_is_balanced(bond
) || time_msec() < bond
->next_rebalance
) {
1126 bond
->next_rebalance
= time_msec() + bond
->rebalance_interval
;
1128 use_recirc
= ofproto_dpif_get_enable_recirc(bond
->ofproto
) &&
1129 bond_may_recirc(bond
, NULL
, NULL
);
1132 bond_recirculation_account(bond
);
1135 /* Add each bond_entry to its slave's 'entries' list.
1136 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1137 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1138 slave
->tx_bytes
= 0;
1139 list_init(&slave
->entries
);
1141 for (e
= &bond
->hash
[0]; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
1142 if (e
->slave
&& e
->tx_bytes
) {
1143 e
->slave
->tx_bytes
+= e
->tx_bytes
;
1144 list_push_back(&e
->slave
->entries
, &e
->list_node
);
1148 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1150 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1151 * with a proper list sort algorithm. */
1153 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1154 if (slave
->enabled
) {
1155 insert_bal(&bals
, slave
);
1158 log_bals(bond
, &bals
);
1160 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1161 while (!list_is_short(&bals
)) {
1162 struct bond_slave
*from
= bond_slave_from_bal_node(list_front(&bals
));
1163 struct bond_slave
*to
= bond_slave_from_bal_node(list_back(&bals
));
1166 overload
= from
->tx_bytes
- to
->tx_bytes
;
1167 if (overload
< to
->tx_bytes
>> 5 || overload
< 100000) {
1168 /* The extra load on 'from' (and all less-loaded slaves), compared
1169 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1170 * it is less than ~1Mbps. No point in rebalancing. */
1174 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1175 * to move from 'from' to 'to'. */
1176 e
= choose_entry_to_migrate(from
, to
->tx_bytes
);
1178 bond_shift_load(e
, to
);
1180 /* Delete element from from->entries.
1182 * We don't add the element to to->entries. That would only allow
1183 * 'e' to be migrated to another slave in this rebalancing run, and
1184 * there is no point in doing that. */
1185 list_remove(&e
->list_node
);
1187 /* Re-sort 'bals'. */
1188 reinsert_bal(&bals
, from
);
1189 reinsert_bal(&bals
, to
);
1192 /* Can't usefully migrate anything away from 'from'.
1193 * Don't reconsider it. */
1194 list_remove(&from
->bal_node
);
1198 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1199 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1200 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1201 for (e
= &bond
->hash
[0]; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
1205 if (use_recirc
&& rebalanced
) {
1206 bond_update_post_recirc_rules(bond
,true);
1210 ovs_rwlock_unlock(&rwlock
);
1213 /* Bonding unixctl user interface functions. */
1215 static struct bond
*
1216 bond_find(const char *name
) OVS_REQ_RDLOCK(rwlock
)
1220 HMAP_FOR_EACH_WITH_HASH (bond
, hmap_node
, hash_string(name
, 0),
1222 if (!strcmp(bond
->name
, name
)) {
1229 static struct bond_slave
*
1230 bond_lookup_slave(struct bond
*bond
, const char *slave_name
)
1232 struct bond_slave
*slave
;
1234 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1235 if (!strcmp(slave
->name
, slave_name
)) {
1243 bond_unixctl_list(struct unixctl_conn
*conn
,
1244 int argc OVS_UNUSED
, const char *argv
[] OVS_UNUSED
,
1245 void *aux OVS_UNUSED
)
1247 struct ds ds
= DS_EMPTY_INITIALIZER
;
1248 const struct bond
*bond
;
1250 ds_put_cstr(&ds
, "bond\ttype\trecircID\tslaves\n");
1252 ovs_rwlock_rdlock(&rwlock
);
1253 HMAP_FOR_EACH (bond
, hmap_node
, all_bonds
) {
1254 const struct bond_slave
*slave
;
1257 ds_put_format(&ds
, "%s\t%s\t%d\t", bond
->name
,
1258 bond_mode_to_string(bond
->balance
), bond
->recirc_id
);
1261 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1263 ds_put_cstr(&ds
, ", ");
1265 ds_put_cstr(&ds
, slave
->name
);
1267 ds_put_char(&ds
, '\n');
1269 ovs_rwlock_unlock(&rwlock
);
1270 unixctl_command_reply(conn
, ds_cstr(&ds
));
1275 bond_print_details(struct ds
*ds
, const struct bond
*bond
)
1276 OVS_REQ_RDLOCK(rwlock
)
1278 struct shash slave_shash
= SHASH_INITIALIZER(&slave_shash
);
1279 const struct shash_node
**sorted_slaves
= NULL
;
1280 const struct bond_slave
*slave
;
1285 ds_put_format(ds
, "---- %s ----\n", bond
->name
);
1286 ds_put_format(ds
, "bond_mode: %s\n",
1287 bond_mode_to_string(bond
->balance
));
1289 may_recirc
= bond_may_recirc(bond
, &recirc_id
, NULL
);
1290 ds_put_format(ds
, "bond may use recirculation: %s, Recirc-ID : %d\n",
1291 may_recirc
? "yes" : "no", may_recirc
? recirc_id
: -1);
1293 ds_put_format(ds
, "bond-hash-basis: %"PRIu32
"\n", bond
->basis
);
1295 ds_put_format(ds
, "updelay: %d ms\n", bond
->updelay
);
1296 ds_put_format(ds
, "downdelay: %d ms\n", bond
->downdelay
);
1298 if (bond_is_balanced(bond
)) {
1299 ds_put_format(ds
, "next rebalance: %lld ms\n",
1300 bond
->next_rebalance
- time_msec());
1303 ds_put_cstr(ds
, "lacp_status: ");
1304 switch (bond
->lacp_status
) {
1305 case LACP_NEGOTIATED
:
1306 ds_put_cstr(ds
, "negotiated\n");
1308 case LACP_CONFIGURED
:
1309 ds_put_cstr(ds
, "configured\n");
1312 ds_put_cstr(ds
, "off\n");
1315 ds_put_cstr(ds
, "<unknown>\n");
1319 ds_put_cstr(ds
, "active slave mac: ");
1320 ds_put_format(ds
, ETH_ADDR_FMT
, ETH_ADDR_ARGS(bond
->active_slave_mac
));
1321 slave
= bond_find_slave_by_mac(bond
, bond
->active_slave_mac
);
1322 ds_put_format(ds
,"(%s)\n", slave
? slave
->name
: "none");
1324 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1325 shash_add(&slave_shash
, slave
->name
, slave
);
1327 sorted_slaves
= shash_sort(&slave_shash
);
1329 for (i
= 0; i
< shash_count(&slave_shash
); i
++) {
1330 struct bond_entry
*be
;
1332 slave
= sorted_slaves
[i
]->data
;
1335 ds_put_format(ds
, "\nslave %s: %s\n",
1336 slave
->name
, slave
->enabled
? "enabled" : "disabled");
1337 if (slave
== bond
->active_slave
) {
1338 ds_put_cstr(ds
, "\tactive slave\n");
1340 if (slave
->delay_expires
!= LLONG_MAX
) {
1341 ds_put_format(ds
, "\t%s expires in %lld ms\n",
1342 slave
->enabled
? "downdelay" : "updelay",
1343 slave
->delay_expires
- time_msec());
1346 ds_put_format(ds
, "\tmay_enable: %s\n",
1347 slave
->may_enable
? "true" : "false");
1349 if (!bond_is_balanced(bond
)) {
1354 for (be
= bond
->hash
; be
<= &bond
->hash
[BOND_MASK
]; be
++) {
1355 int hash
= be
- bond
->hash
;
1358 if (be
->slave
!= slave
) {
1362 be_tx_k
= be
->tx_bytes
/ 1024;
1364 ds_put_format(ds
, "\thash %d: %"PRIu64
" kB load\n",
1368 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1371 shash_destroy(&slave_shash
);
1372 free(sorted_slaves
);
1373 ds_put_cstr(ds
, "\n");
1377 bond_unixctl_show(struct unixctl_conn
*conn
,
1378 int argc
, const char *argv
[],
1379 void *aux OVS_UNUSED
)
1381 struct ds ds
= DS_EMPTY_INITIALIZER
;
1383 ovs_rwlock_rdlock(&rwlock
);
1385 const struct bond
*bond
= bond_find(argv
[1]);
1388 unixctl_command_reply_error(conn
, "no such bond");
1391 bond_print_details(&ds
, bond
);
1393 const struct bond
*bond
;
1395 HMAP_FOR_EACH (bond
, hmap_node
, all_bonds
) {
1396 bond_print_details(&ds
, bond
);
1400 unixctl_command_reply(conn
, ds_cstr(&ds
));
1404 ovs_rwlock_unlock(&rwlock
);
1408 bond_unixctl_migrate(struct unixctl_conn
*conn
,
1409 int argc OVS_UNUSED
, const char *argv
[],
1410 void *aux OVS_UNUSED
)
1412 const char *bond_s
= argv
[1];
1413 const char *hash_s
= argv
[2];
1414 const char *slave_s
= argv
[3];
1416 struct bond_slave
*slave
;
1417 struct bond_entry
*entry
;
1420 ovs_rwlock_wrlock(&rwlock
);
1421 bond
= bond_find(bond_s
);
1423 unixctl_command_reply_error(conn
, "no such bond");
1427 if (bond
->balance
!= BM_SLB
) {
1428 unixctl_command_reply_error(conn
, "not an SLB bond");
1432 if (strspn(hash_s
, "0123456789") == strlen(hash_s
)) {
1433 hash
= atoi(hash_s
) & BOND_MASK
;
1435 unixctl_command_reply_error(conn
, "bad hash");
1439 slave
= bond_lookup_slave(bond
, slave_s
);
1441 unixctl_command_reply_error(conn
, "no such slave");
1445 if (!slave
->enabled
) {
1446 unixctl_command_reply_error(conn
, "cannot migrate to disabled slave");
1450 entry
= &bond
->hash
[hash
];
1451 bond
->bond_revalidate
= true;
1452 entry
->slave
= slave
;
1453 unixctl_command_reply(conn
, "migrated");
1456 ovs_rwlock_unlock(&rwlock
);
1460 bond_unixctl_set_active_slave(struct unixctl_conn
*conn
,
1461 int argc OVS_UNUSED
, const char *argv
[],
1462 void *aux OVS_UNUSED
)
1464 const char *bond_s
= argv
[1];
1465 const char *slave_s
= argv
[2];
1467 struct bond_slave
*slave
;
1469 ovs_rwlock_wrlock(&rwlock
);
1470 bond
= bond_find(bond_s
);
1472 unixctl_command_reply_error(conn
, "no such bond");
1476 slave
= bond_lookup_slave(bond
, slave_s
);
1478 unixctl_command_reply_error(conn
, "no such slave");
1482 if (!slave
->enabled
) {
1483 unixctl_command_reply_error(conn
, "cannot make disabled slave active");
1487 if (bond
->active_slave
!= slave
) {
1488 bond
->bond_revalidate
= true;
1489 bond
->active_slave
= slave
;
1490 VLOG_INFO("bond %s: active interface is now %s",
1491 bond
->name
, slave
->name
);
1492 bond
->send_learning_packets
= true;
1493 unixctl_command_reply(conn
, "done");
1494 bond_active_slave_changed(bond
);
1496 unixctl_command_reply(conn
, "no change");
1499 ovs_rwlock_unlock(&rwlock
);
1503 enable_slave(struct unixctl_conn
*conn
, const char *argv
[], bool enable
)
1505 const char *bond_s
= argv
[1];
1506 const char *slave_s
= argv
[2];
1508 struct bond_slave
*slave
;
1510 ovs_rwlock_wrlock(&rwlock
);
1511 bond
= bond_find(bond_s
);
1513 unixctl_command_reply_error(conn
, "no such bond");
1517 slave
= bond_lookup_slave(bond
, slave_s
);
1519 unixctl_command_reply_error(conn
, "no such slave");
1523 bond_enable_slave(slave
, enable
);
1524 unixctl_command_reply(conn
, enable
? "enabled" : "disabled");
1527 ovs_rwlock_unlock(&rwlock
);
1531 bond_unixctl_enable_slave(struct unixctl_conn
*conn
,
1532 int argc OVS_UNUSED
, const char *argv
[],
1533 void *aux OVS_UNUSED
)
1535 enable_slave(conn
, argv
, true);
1539 bond_unixctl_disable_slave(struct unixctl_conn
*conn
,
1540 int argc OVS_UNUSED
, const char *argv
[],
1541 void *aux OVS_UNUSED
)
1543 enable_slave(conn
, argv
, false);
1547 bond_unixctl_hash(struct unixctl_conn
*conn
, int argc
, const char *argv
[],
1548 void *aux OVS_UNUSED
)
1550 const char *mac_s
= argv
[1];
1551 const char *vlan_s
= argc
> 2 ? argv
[2] : NULL
;
1552 const char *basis_s
= argc
> 3 ? argv
[3] : NULL
;
1553 uint8_t mac
[ETH_ADDR_LEN
];
1560 if (!ovs_scan(vlan_s
, "%u", &vlan
)) {
1561 unixctl_command_reply_error(conn
, "invalid vlan");
1569 if (!ovs_scan(basis_s
, "%"SCNu32
, &basis
)) {
1570 unixctl_command_reply_error(conn
, "invalid basis");
1577 if (ovs_scan(mac_s
, ETH_ADDR_SCAN_FMT
, ETH_ADDR_SCAN_ARGS(mac
))) {
1578 hash
= bond_hash_src(mac
, vlan
, basis
) & BOND_MASK
;
1580 hash_cstr
= xasprintf("%u", hash
);
1581 unixctl_command_reply(conn
, hash_cstr
);
1584 unixctl_command_reply_error(conn
, "invalid mac");
1591 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list
, NULL
);
1592 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show
,
1594 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1595 bond_unixctl_migrate
, NULL
);
1596 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1597 bond_unixctl_set_active_slave
, NULL
);
1598 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1599 bond_unixctl_enable_slave
, NULL
);
1600 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1601 bond_unixctl_disable_slave
, NULL
);
1602 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1603 bond_unixctl_hash
, NULL
);
1607 bond_entry_reset(struct bond
*bond
)
1609 if (bond
->balance
!= BM_AB
) {
1610 size_t hash_len
= BOND_BUCKETS
* sizeof *bond
->hash
;
1613 bond
->hash
= xmalloc(hash_len
);
1615 memset(bond
->hash
, 0, hash_len
);
1617 bond
->next_rebalance
= time_msec() + bond
->rebalance_interval
;
1624 static struct bond_slave
*
1625 bond_slave_lookup(struct bond
*bond
, const void *slave_
)
1627 struct bond_slave
*slave
;
1629 HMAP_FOR_EACH_IN_BUCKET (slave
, hmap_node
, hash_pointer(slave_
, 0),
1631 if (slave
->aux
== slave_
) {
1640 bond_enable_slave(struct bond_slave
*slave
, bool enable
)
1642 slave
->delay_expires
= LLONG_MAX
;
1643 if (enable
!= slave
->enabled
) {
1644 slave
->bond
->bond_revalidate
= true;
1645 slave
->enabled
= enable
;
1647 ovs_mutex_lock(&slave
->bond
->mutex
);
1649 list_insert(&slave
->bond
->enabled_slaves
, &slave
->list_node
);
1651 list_remove(&slave
->list_node
);
1653 ovs_mutex_unlock(&slave
->bond
->mutex
);
1655 VLOG_INFO("interface %s: %s", slave
->name
,
1656 slave
->enabled
? "enabled" : "disabled");
1661 bond_link_status_update(struct bond_slave
*slave
)
1663 struct bond
*bond
= slave
->bond
;
1666 up
= netdev_get_carrier(slave
->netdev
) && slave
->may_enable
;
1667 if ((up
== slave
->enabled
) != (slave
->delay_expires
== LLONG_MAX
)) {
1668 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
1669 VLOG_INFO_RL(&rl
, "interface %s: link state %s",
1670 slave
->name
, up
? "up" : "down");
1671 if (up
== slave
->enabled
) {
1672 slave
->delay_expires
= LLONG_MAX
;
1673 VLOG_INFO_RL(&rl
, "interface %s: will not be %s",
1674 slave
->name
, up
? "disabled" : "enabled");
1676 int delay
= (bond
->lacp_status
!= LACP_DISABLED
? 0
1677 : up
? bond
->updelay
: bond
->downdelay
);
1678 slave
->delay_expires
= time_msec() + delay
;
1680 VLOG_INFO_RL(&rl
, "interface %s: will be %s if it stays %s "
1683 up
? "enabled" : "disabled",
1690 if (time_msec() >= slave
->delay_expires
) {
1691 bond_enable_slave(slave
, up
);
1696 bond_hash_src(const uint8_t mac
[ETH_ADDR_LEN
], uint16_t vlan
, uint32_t basis
)
1698 return hash_mac(mac
, vlan
, basis
);
1702 bond_hash_tcp(const struct flow
*flow
, uint16_t vlan
, uint32_t basis
)
1704 struct flow hash_flow
= *flow
;
1705 hash_flow
.vlan_tci
= htons(vlan
);
1707 /* The symmetric quality of this hash function is not required, but
1708 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1709 * purposes, so we use it out of convenience. */
1710 return flow_hash_symmetric_l4(&hash_flow
, basis
);
1714 bond_hash(const struct bond
*bond
, const struct flow
*flow
, uint16_t vlan
)
1716 ovs_assert(bond
->balance
== BM_TCP
|| bond
->balance
== BM_SLB
);
1718 return (bond
->balance
== BM_TCP
1719 ? bond_hash_tcp(flow
, vlan
, bond
->basis
)
1720 : bond_hash_src(flow
->dl_src
, vlan
, bond
->basis
));
1723 static struct bond_entry
*
1724 lookup_bond_entry(const struct bond
*bond
, const struct flow
*flow
,
1727 return &bond
->hash
[bond_hash(bond
, flow
, vlan
) & BOND_MASK
];
1730 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1731 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1733 static struct bond_slave
*
1734 get_enabled_slave(struct bond
*bond
)
1738 ovs_mutex_lock(&bond
->mutex
);
1739 if (list_is_empty(&bond
->enabled_slaves
)) {
1740 ovs_mutex_unlock(&bond
->mutex
);
1744 node
= list_pop_front(&bond
->enabled_slaves
);
1745 list_push_back(&bond
->enabled_slaves
, node
);
1746 ovs_mutex_unlock(&bond
->mutex
);
1748 return CONTAINER_OF(node
, struct bond_slave
, list_node
);
1751 static struct bond_slave
*
1752 choose_output_slave(const struct bond
*bond
, const struct flow
*flow
,
1753 struct flow_wildcards
*wc
, uint16_t vlan
)
1755 struct bond_entry
*e
;
1758 balance
= bond
->balance
;
1759 if (bond
->lacp_status
== LACP_CONFIGURED
) {
1760 /* LACP has been configured on this bond but negotiations were
1761 * unsuccussful. If lacp_fallback_ab is enabled use active-
1762 * backup mode else drop all traffic. */
1763 if (!bond
->lacp_fallback_ab
) {
1771 return bond
->active_slave
;
1774 if (bond
->lacp_status
!= LACP_NEGOTIATED
) {
1775 /* Must have LACP negotiations for TCP balanced bonds. */
1779 flow_mask_hash_fields(flow
, wc
, NX_HASH_FIELDS_SYMMETRIC_L4
);
1784 flow_mask_hash_fields(flow
, wc
, NX_HASH_FIELDS_ETH_SRC
);
1786 e
= lookup_bond_entry(bond
, flow
, vlan
);
1787 if (!e
->slave
|| !e
->slave
->enabled
) {
1788 e
->slave
= get_enabled_slave(CONST_CAST(struct bond
*, bond
));
1797 static struct bond_slave
*
1798 bond_choose_slave(const struct bond
*bond
)
1800 struct bond_slave
*slave
, *best
;
1802 /* Find the last active slave. */
1803 slave
= bond_find_slave_by_mac(bond
, bond
->active_slave_mac
);
1804 if (slave
&& slave
->enabled
) {
1808 /* Find an enabled slave. */
1809 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1810 if (slave
->enabled
) {
1815 /* All interfaces are disabled. Find an interface that will be enabled
1816 * after its updelay expires. */
1818 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1819 if (slave
->delay_expires
!= LLONG_MAX
1820 && slave
->may_enable
1821 && (!best
|| slave
->delay_expires
< best
->delay_expires
)) {
1829 bond_choose_active_slave(struct bond
*bond
)
1831 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
1832 struct bond_slave
*old_active_slave
= bond
->active_slave
;
1834 bond
->active_slave
= bond_choose_slave(bond
);
1835 if (bond
->active_slave
) {
1836 if (bond
->active_slave
->enabled
) {
1837 VLOG_INFO_RL(&rl
, "bond %s: active interface is now %s",
1838 bond
->name
, bond
->active_slave
->name
);
1840 VLOG_INFO_RL(&rl
, "bond %s: active interface is now %s, skipping "
1841 "remaining %lld ms updelay (since no interface was "
1842 "enabled)", bond
->name
, bond
->active_slave
->name
,
1843 bond
->active_slave
->delay_expires
- time_msec());
1844 bond_enable_slave(bond
->active_slave
, true);
1847 bond
->send_learning_packets
= true;
1849 if (bond
->active_slave
!= old_active_slave
) {
1850 bond_active_slave_changed(bond
);
1852 } else if (old_active_slave
) {
1853 VLOG_INFO_RL(&rl
, "bond %s: all interfaces disabled", bond
->name
);
1858 * Return true if bond has unstored active slave change.
1859 * If return true, 'mac' will store the bond's current active slave's
1862 bond_get_changed_active_slave(const char *name
, uint8_t* mac
, bool force
)
1866 ovs_rwlock_wrlock(&rwlock
);
1867 bond
= bond_find(name
);
1869 if (bond
->active_slave_changed
|| force
) {
1870 memcpy(mac
, bond
->active_slave_mac
, ETH_ADDR_LEN
);
1871 bond
->active_slave_changed
= false;
1872 ovs_rwlock_unlock(&rwlock
);
1876 ovs_rwlock_unlock(&rwlock
);