2 * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
26 #include "dynamic-string.h"
35 #include "poll-loop.h"
41 VLOG_DEFINE_THIS_MODULE(bond
);
43 COVERAGE_DEFINE(bond_process_lacp
);
45 /* Bit-mask for hashing a flow down to a bucket.
46 * There are (BOND_MASK + 1) buckets. */
47 #define BOND_MASK 0xff
49 /* A hash bucket for mapping a flow to a slave.
50 * "struct bond" has an array of (BOND_MASK + 1) of these. */
52 struct bond_slave
*slave
; /* Assigned slave, NULL if unassigned. */
53 uint64_t tx_bytes
; /* Count of bytes recently transmitted. */
54 tag_type tag
; /* Tag for entry<->slave association. */
55 struct list list_node
; /* In bond_slave's 'entries' list. */
58 /* A bond slave, that is, one of the links comprising a bond. */
60 struct hmap_node hmap_node
; /* In struct bond's slaves hmap. */
61 struct bond
*bond
; /* The bond that contains this slave. */
62 void *aux
; /* Client-provided handle for this slave. */
64 struct netdev
*netdev
; /* Network device, owned by the client. */
65 char *name
; /* Name (a copy of netdev_get_name(netdev)). */
68 long long delay_expires
; /* Time after which 'enabled' may change. */
69 bool up
; /* Last link status read from netdev. */
70 bool enabled
; /* May be chosen for flows? */
71 tag_type tag
; /* Tag associated with this slave. */
73 /* Rebalancing info. Used only by bond_rebalance(). */
74 struct list bal_node
; /* In bond_rebalance()'s 'bals' list. */
75 struct list entries
; /* 'struct bond_entry's assigned here. */
76 uint64_t tx_bytes
; /* Sum across 'tx_bytes' of entries. */
79 /* A bond, that is, a set of network devices grouped to improve performance or
82 struct hmap_node hmap_node
; /* In 'all_bonds' hmap. */
83 char *name
; /* Name provided by client. */
89 enum bond_mode balance
; /* Balancing mode, one of BM_*. */
90 struct bond_slave
*active_slave
;
91 tag_type no_slaves_tag
; /* Tag for flows when all slaves disabled. */
92 int updelay
, downdelay
; /* Delay before slave goes up/down, in ms. */
94 /* SLB specific bonding info. */
95 struct bond_entry
*hash
; /* An array of (BOND_MASK + 1) elements. */
96 int rebalance_interval
; /* Interval between rebalances, in ms. */
97 long long int next_rebalance
; /* Next rebalancing time. */
98 bool send_learning_packets
;
101 struct lacp
*lacp
; /* LACP object. NULL if LACP is disabled. */
104 enum bond_detect_mode detect
; /* Link status mode, one of BLSM_*. */
105 struct netdev_monitor
*monitor
; /* detect == BLSM_CARRIER only. */
106 long long int miimon_interval
; /* Miimon status refresh interval. */
107 long long int miimon_next_update
; /* Time of next miimon update. */
109 /* Legacy compatibility. */
110 long long int next_fake_iface_update
; /* LLONG_MAX if disabled. */
112 /* Tag set saved for next bond_run(). This tag set is a kluge for cases
113 * where we can't otherwise provide revalidation feedback to the client.
114 * That's only unixctl commands now; I hope no other cases will arise. */
115 struct tag_set unixctl_tags
;
118 static struct hmap all_bonds
= HMAP_INITIALIZER(&all_bonds
);
120 static void bond_entry_reset(struct bond
*);
121 static struct bond_slave
*bond_slave_lookup(struct bond
*, const void *slave_
);
122 static bool bond_is_link_up(struct bond
*, struct netdev
*);
123 static void bond_enable_slave(struct bond_slave
*, bool enable
,
125 static void bond_link_status_update(struct bond_slave
*, struct tag_set
*);
126 static void bond_choose_active_slave(struct bond
*, struct tag_set
*);
127 static bool bond_is_tcp_hash(const struct bond
*);
128 static unsigned int bond_hash_src(const uint8_t mac
[ETH_ADDR_LEN
],
130 static unsigned int bond_hash_tcp(const struct flow
*, uint16_t vlan
);
131 static struct bond_entry
*lookup_bond_entry(const struct bond
*,
134 static tag_type
bond_get_active_slave_tag(const struct bond
*);
135 static struct bond_slave
*choose_output_slave(const struct bond
*,
138 static void bond_update_fake_slave_stats(struct bond
*);
140 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
141 * stores the mode in '*balance' and returns true. Otherwise returns false
142 * without modifying '*balance'. */
144 bond_mode_from_string(enum bond_mode
*balance
, const char *s
)
146 if (!strcmp(s
, bond_mode_to_string(BM_TCP
))) {
148 } else if (!strcmp(s
, bond_mode_to_string(BM_SLB
))) {
150 } else if (!strcmp(s
, bond_mode_to_string(BM_AB
))) {
158 /* Returns a string representing 'balance'. */
160 bond_mode_to_string(enum bond_mode balance
) {
163 return "balance-tcp";
165 return "balance-slb";
167 return "active-backup";
172 /* Attempts to parse 's' as the name of a bond link status detection mode. If
173 * successful, stores the mode in '*detect' and returns true. Otherwise
174 * returns false without modifying '*detect'. */
176 bond_detect_mode_from_string(enum bond_detect_mode
*detect
, const char *s
)
178 if (!strcmp(s
, bond_detect_mode_to_string(BLSM_CARRIER
))) {
179 *detect
= BLSM_CARRIER
;
180 } else if (!strcmp(s
, bond_detect_mode_to_string(BLSM_MIIMON
))) {
181 *detect
= BLSM_MIIMON
;
188 /* Returns a string representing 'detect'. */
190 bond_detect_mode_to_string(enum bond_detect_mode detect
)
201 /* Creates and returns a new bond whose configuration is initially taken from
204 * The caller should register each slave on the new bond by calling
205 * bond_slave_register(). */
207 bond_create(const struct bond_settings
*s
)
211 bond
= xzalloc(sizeof *bond
);
212 hmap_init(&bond
->slaves
);
213 bond
->no_slaves_tag
= tag_create_random();
214 bond
->miimon_next_update
= LLONG_MAX
;
215 bond
->next_fake_iface_update
= LLONG_MAX
;
217 bond_reconfigure(bond
, s
);
219 tag_set_init(&bond
->unixctl_tags
);
226 bond_destroy(struct bond
*bond
)
228 struct bond_slave
*slave
, *next_slave
;
234 hmap_remove(&all_bonds
, &bond
->hmap_node
);
236 HMAP_FOR_EACH_SAFE (slave
, next_slave
, hmap_node
, &bond
->slaves
) {
237 hmap_remove(&bond
->slaves
, &slave
->hmap_node
);
238 /* Client owns 'slave->netdev'. */
242 hmap_destroy(&bond
->slaves
);
246 lacp_destroy(bond
->lacp
);
248 netdev_monitor_destroy(bond
->monitor
);
254 /* Updates 'bond''s overall configuration to 's'.
256 * The caller should register each slave on 'bond' by calling
257 * bond_slave_register(). This is optional if none of the slaves'
258 * configuration has changed, except that it is mandatory if 's' enables LACP
259 * and 'bond' previously didn't have LACP enabled. In any case it can't
262 * Returns true if the configuration has changed in such a way that requires
266 bond_reconfigure(struct bond
*bond
, const struct bond_settings
*s
)
268 bool revalidate
= false;
270 if (!bond
->name
|| strcmp(bond
->name
, s
->name
)) {
272 hmap_remove(&all_bonds
, &bond
->hmap_node
);
275 bond
->name
= xstrdup(s
->name
);
276 hmap_insert(&all_bonds
, &bond
->hmap_node
, hash_string(bond
->name
, 0));
279 bond
->detect
= s
->detect
;
280 bond
->miimon_interval
= s
->miimon_interval
;
281 bond
->updelay
= s
->up_delay
;
282 bond
->downdelay
= s
->down_delay
;
283 bond
->rebalance_interval
= s
->rebalance_interval
;
285 if (bond
->balance
!= s
->balance
) {
286 bond
->balance
= s
->balance
;
290 if (bond
->detect
== BLSM_CARRIER
) {
291 struct bond_slave
*slave
;
293 if (!bond
->monitor
) {
294 bond
->monitor
= netdev_monitor_create();
297 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
298 netdev_monitor_add(bond
->monitor
, slave
->netdev
);
301 netdev_monitor_destroy(bond
->monitor
);
302 bond
->monitor
= NULL
;
304 if (bond
->miimon_next_update
== LLONG_MAX
) {
305 bond
->miimon_next_update
= time_msec() + bond
->miimon_interval
;
311 bond
->lacp
= lacp_create();
313 lacp_configure(bond
->lacp
, s
->lacp
);
315 lacp_destroy(bond
->lacp
);
320 if (bond
->next_fake_iface_update
== LLONG_MAX
) {
321 bond
->next_fake_iface_update
= time_msec();
324 bond
->next_fake_iface_update
= LLONG_MAX
;
327 if (bond
->balance
== BM_AB
|| !bond
->hash
|| revalidate
) {
328 bond_entry_reset(bond
);
334 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
335 * arbitrary client-provided pointer that uniquely identifies a slave within a
336 * bond. If 'slave_' already exists within 'bond' then this function
337 * reconfigures the existing slave.
339 * 'netdev' must be the network device that 'slave_' represents. It is owned
340 * by the client, so the client must not close it before either unregistering
341 * 'slave_' or destroying 'bond'.
343 * If 'bond' has a LACP configuration then 'lacp_settings' must point to LACP
344 * settings for 'slave_'; otherwise 'lacp_settings' is ignored.
347 bond_slave_register(struct bond
*bond
, void *slave_
, struct netdev
*netdev
,
348 const struct lacp_slave_settings
*lacp_settings
)
350 struct bond_slave
*slave
= bond_slave_lookup(bond
, slave_
);
353 slave
= xzalloc(sizeof *slave
);
355 hmap_insert(&bond
->slaves
, &slave
->hmap_node
, hash_pointer(slave_
, 0));
358 slave
->delay_expires
= LLONG_MAX
;
359 slave
->up
= bond_is_link_up(bond
, netdev
);
360 slave
->enabled
= false;
361 bond_enable_slave(slave
, slave
->up
, NULL
);
364 slave
->netdev
= netdev
;
366 slave
->name
= xstrdup(netdev_get_name(netdev
));
369 assert(lacp_settings
!= NULL
);
370 lacp_slave_register(bond
->lacp
, slave
, lacp_settings
);
374 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
375 * then this function has no effect.
377 * Unregistering a slave invalidates all flows. */
379 bond_slave_unregister(struct bond
*bond
, const void *slave_
)
381 struct bond_slave
*slave
= bond_slave_lookup(bond
, slave_
);
388 bond_enable_slave(slave
, false, NULL
);
390 del_active
= bond
->active_slave
== slave
;
392 struct bond_entry
*e
;
393 for (e
= bond
->hash
; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
394 if (e
->slave
== slave
) {
402 hmap_remove(&bond
->slaves
, &slave
->hmap_node
);
403 /* Client owns 'slave->netdev'. */
410 bond_choose_active_slave(bond
, &tags
);
411 bond
->send_learning_packets
= true;
415 /* Callback for lacp_run(). */
417 bond_send_pdu_cb(void *slave_
, const struct lacp_pdu
*pdu
)
419 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 10);
420 struct bond_slave
*slave
= slave_
;
421 uint8_t ea
[ETH_ADDR_LEN
];
424 error
= netdev_get_etheraddr(slave
->netdev
, ea
);
426 struct lacp_pdu
*packet_pdu
;
427 struct ofpbuf packet
;
429 ofpbuf_init(&packet
, 0);
430 packet_pdu
= eth_compose(&packet
, eth_addr_lacp
, ea
, ETH_TYPE_LACP
,
433 error
= netdev_send(slave
->netdev
, &packet
);
435 VLOG_WARN_RL(&rl
, "bond %s: sending LACP PDU on slave %s failed "
437 slave
->bond
->name
, slave
->name
, strerror(error
));
439 ofpbuf_uninit(&packet
);
441 VLOG_ERR_RL(&rl
, "bond %s: cannot obtain Ethernet address of slave "
443 slave
->bond
->name
, slave
->name
, strerror(error
));
447 /* Performs periodic maintenance on 'bond'. The caller must provide 'tags' to
448 * allow tagged flows to be invalidated.
450 * The caller should check bond_should_send_learning_packets() afterward. */
452 bond_run(struct bond
*bond
, struct tag_set
*tags
)
454 struct bond_slave
*slave
;
455 bool is_tcp_hash
= bond_is_tcp_hash(bond
);
457 /* Update link status. */
458 if (bond
->detect
== BLSM_CARRIER
459 || time_msec() >= bond
->miimon_next_update
)
461 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
462 slave
->up
= bond_is_link_up(bond
, slave
->netdev
);
464 bond
->miimon_next_update
= time_msec() + bond
->miimon_interval
;
469 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
470 lacp_slave_enable(bond
->lacp
, slave
, slave
->enabled
);
473 lacp_run(bond
->lacp
, bond_send_pdu_cb
);
476 /* Enable slaves based on link status and LACP feedback. */
477 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
478 bond_link_status_update(slave
, tags
);
480 if (!bond
->active_slave
|| !bond
->active_slave
->enabled
) {
481 bond_choose_active_slave(bond
, tags
);
484 /* Update fake bond interface stats. */
485 if (time_msec() >= bond
->next_fake_iface_update
) {
486 bond_update_fake_slave_stats(bond
);
487 bond
->next_fake_iface_update
= time_msec() + 1000;
490 if (is_tcp_hash
!= bond_is_tcp_hash(bond
)) {
491 struct bond_slave
*slave
;
493 bond_entry_reset(bond
);
494 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
495 tag_set_add(tags
, slave
->tag
);
499 /* Invalidate any tags saved in 'unixctl_tags' by unixctl commands. */
500 tag_set_union(tags
, &bond
->unixctl_tags
);
501 tag_set_init(&bond
->unixctl_tags
);
504 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
506 bond_wait(struct bond
*bond
)
508 struct bond_slave
*slave
;
510 if (bond
->detect
== BLSM_CARRIER
) {
511 netdev_monitor_poll_wait(bond
->monitor
);
513 poll_timer_wait_until(bond
->miimon_next_update
);
516 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
517 if (slave
->delay_expires
!= LLONG_MAX
) {
518 poll_timer_wait_until(slave
->delay_expires
);
522 if (bond
->next_fake_iface_update
!= LLONG_MAX
) {
523 poll_timer_wait_until(bond
->next_fake_iface_update
);
526 /* Ensure that any saved tags get revalidated right away. */
527 if (!tag_set_is_empty(&bond
->unixctl_tags
)) {
528 poll_immediate_wake();
531 /* We don't wait for bond->next_rebalance because rebalancing can only run
532 * at a flow account checkpoint. ofproto does checkpointing on its own
533 * schedule and bond_rebalance() gets called afterward, so we'd just be
534 * waking up for no purpose. */
537 /* MAC learning table interaction. */
540 may_send_learning_packets(const struct bond
*bond
)
542 return !lacp_negotiated(bond
->lacp
) && bond
->balance
!= BM_AB
;
545 /* Returns true if 'bond' needs the client to send out packets to assist with
546 * MAC learning on 'bond'. If this function returns true, then the client
547 * should iterate through its MAC learning table for the bridge on which 'bond'
548 * is located. For each MAC that has been learned on a port other than 'bond',
549 * it should call bond_send_learning_packet().
551 * This function will only return true if 'bond' is not in active-backup mode
552 * and LACP is not negotiated. Otherwise sending learning packets isn't necessary.
554 * Calling this function resets the state that it checks. */
556 bond_should_send_learning_packets(struct bond
*bond
)
558 bool send
= bond
->send_learning_packets
&& may_send_learning_packets(bond
);
559 bond
->send_learning_packets
= false;
563 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
565 * See bond_should_send_learning_packets() for description of usage. */
567 bond_send_learning_packet(struct bond
*bond
,
568 const uint8_t eth_src
[ETH_ADDR_LEN
],
571 struct bond_slave
*slave
;
572 struct ofpbuf packet
;
576 assert(may_send_learning_packets(bond
));
577 if (!bond
->active_slave
) {
578 /* Nowhere to send the learning packet. */
582 memset(&flow
, 0, sizeof flow
);
583 memcpy(flow
.dl_src
, eth_src
, ETH_ADDR_LEN
);
584 slave
= choose_output_slave(bond
, &flow
, vlan
);
586 ofpbuf_init(&packet
, 0);
587 compose_benign_packet(&packet
, "Open vSwitch Bond Failover", 0xf177,
590 eth_set_vlan_tci(&packet
, htons(vlan
));
592 error
= netdev_send(slave
->netdev
, &packet
);
593 ofpbuf_uninit(&packet
);
598 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
599 * Ethernet destination address of 'eth_dst', should be admitted.
601 * The return value is one of the following:
603 * - BV_ACCEPT: Admit the packet.
605 * - BV_DROP: Drop the packet.
607 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
608 * Ethernet source address and VLAN. If there is none, or if the packet
609 * is on the learned port, then admit the packet. If a different port has
610 * been learned, however, drop the packet (and do not use it for MAC
614 bond_check_admissibility(struct bond
*bond
, const void *slave_
,
615 const uint8_t eth_dst
[ETH_ADDR_LEN
], tag_type
*tags
)
617 /* Admit all packets if LACP has been negotiated, because that means that
618 * the remote switch is aware of the bond and will "do the right thing". */
619 if (lacp_negotiated(bond
->lacp
)) {
623 /* Drop all multicast packets on inactive slaves. */
624 if (eth_addr_is_multicast(eth_dst
)) {
625 *tags
|= bond_get_active_slave_tag(bond
);
626 if (bond
->active_slave
!= bond_slave_lookup(bond
, slave_
)) {
631 /* Drop all packets for which we have learned a different input port,
632 * because we probably sent the packet on one slave and got it back on the
633 * other. Gratuitous ARP packets are an exception to this rule: the host
634 * has moved to another switch. The exception to the exception is if we
635 * locked the learning table to avoid reflections on bond slaves. */
636 return BV_DROP_IF_MOVED
;
639 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
640 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
641 * NULL if the packet should be dropped because no slaves are enabled.
643 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
644 * should be a VID only (i.e. excluding the PCP bits). Second,
645 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
646 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
647 * packet belongs to (so for an access port it will be the access port's VLAN).
649 * Adds a tag to '*tags' that associates the flow with the returned slave.
652 bond_choose_output_slave(struct bond
*bond
, const struct flow
*flow
,
653 uint16_t vlan
, tag_type
*tags
)
655 struct bond_slave
*slave
= choose_output_slave(bond
, flow
, vlan
);
660 *tags
|= bond
->no_slaves_tag
;
665 /* Processes LACP packet 'packet', which was received on 'slave_' within
668 * The client should use this function to pass along LACP messages received on
669 * any of 'bond''s slaves. */
671 bond_process_lacp(struct bond
*bond
, void *slave_
, const struct ofpbuf
*packet
)
674 struct bond_slave
*slave
= bond_slave_lookup(bond
, slave_
);
675 const struct lacp_pdu
*pdu
= parse_lacp_packet(packet
);
677 COVERAGE_INC(bond_process_lacp
);
678 lacp_process_pdu(bond
->lacp
, slave
, pdu
);
685 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
687 bond_account(struct bond
*bond
, const struct flow
*flow
, uint16_t vlan
,
690 switch (bond
->balance
) {
697 lookup_bond_entry(bond
, flow
, vlan
)->tx_bytes
+= n_bytes
;
705 static struct bond_slave
*
706 bond_slave_from_bal_node(struct list
*bal
)
708 return CONTAINER_OF(bal
, struct bond_slave
, bal_node
);
712 log_bals(struct bond
*bond
, const struct list
*bals
)
714 if (VLOG_IS_DBG_ENABLED()) {
715 struct ds ds
= DS_EMPTY_INITIALIZER
;
716 const struct bond_slave
*slave
;
718 LIST_FOR_EACH (slave
, bal_node
, bals
) {
720 ds_put_char(&ds
, ',');
722 ds_put_format(&ds
, " %s %"PRIu64
"kB",
723 slave
->name
, slave
->tx_bytes
/ 1024);
725 if (!slave
->enabled
) {
726 ds_put_cstr(&ds
, " (disabled)");
728 if (!list_is_empty(&slave
->entries
)) {
729 struct bond_entry
*e
;
731 ds_put_cstr(&ds
, " (");
732 LIST_FOR_EACH (e
, list_node
, &slave
->entries
) {
733 if (&e
->list_node
!= list_front(&slave
->entries
)) {
734 ds_put_cstr(&ds
, " + ");
736 ds_put_format(&ds
, "h%td: %"PRIu64
"kB",
737 e
- bond
->hash
, e
->tx_bytes
/ 1024);
739 ds_put_cstr(&ds
, ")");
742 VLOG_DBG("bond %s:%s", bond
->name
, ds_cstr(&ds
));
747 /* Shifts 'hash' from its current slave to 'to'. */
749 bond_shift_load(struct bond_entry
*hash
, struct bond_slave
*to
,
752 struct bond_slave
*from
= hash
->slave
;
753 struct bond
*bond
= from
->bond
;
754 uint64_t delta
= hash
->tx_bytes
;
756 VLOG_INFO("bond %s: shift %"PRIu64
"kB of load (with hash %td) "
757 "from %s to %s (now carrying %"PRIu64
"kB and "
758 "%"PRIu64
"kB load, respectively)",
759 bond
->name
, delta
/ 1024, hash
- bond
->hash
,
760 from
->name
, to
->name
,
761 (from
->tx_bytes
- delta
) / 1024,
762 (to
->tx_bytes
+ delta
) / 1024);
764 /* Shift load away from 'from' to 'to'. */
765 from
->tx_bytes
-= delta
;
766 to
->tx_bytes
+= delta
;
768 /* Arrange for flows to be revalidated. */
769 tag_set_add(set
, hash
->tag
);
771 hash
->tag
= tag_create_random();
774 /* Picks and returns a bond_entry to migrate to 'to' (the least-loaded slave),
775 * given that doing so must decrease the ratio of the load on the two slaves by
776 * at least 0.1. Returns NULL if there is no appropriate entry.
778 * The list of entries isn't sorted. I don't know of a reason to prefer to
779 * shift away small hashes or large hashes. */
780 static struct bond_entry
*
781 choose_entry_to_migrate(const struct bond_slave
*from
, uint64_t to_tx_bytes
)
783 struct bond_entry
*e
;
785 if (list_is_short(&from
->entries
)) {
786 /* 'from' carries no more than one MAC hash, so shifting load away from
787 * it would be pointless. */
791 LIST_FOR_EACH (e
, list_node
, &from
->entries
) {
792 double old_ratio
, new_ratio
;
795 if (to_tx_bytes
== 0) {
796 /* Nothing on the new slave, move it. */
801 old_ratio
= (double)from
->tx_bytes
/ to_tx_bytes
;
802 new_ratio
= (double)(from
->tx_bytes
- delta
) / (to_tx_bytes
+ delta
);
803 if (old_ratio
- new_ratio
> 0.1) {
804 /* Would decrease the ratio, move it. */
812 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
815 insert_bal(struct list
*bals
, struct bond_slave
*slave
)
817 struct bond_slave
*pos
;
819 LIST_FOR_EACH (pos
, bal_node
, bals
) {
820 if (slave
->tx_bytes
> pos
->tx_bytes
) {
824 list_insert(&pos
->bal_node
, &slave
->bal_node
);
827 /* Removes 'slave' from its current list and then inserts it into 'bals' so
828 * that descending order of 'tx_bytes' is maintained. */
830 reinsert_bal(struct list
*bals
, struct bond_slave
*slave
)
832 list_remove(&slave
->bal_node
);
833 insert_bal(bals
, slave
);
836 /* If 'bond' needs rebalancing, does so.
838 * The caller should have called bond_account() for each active flow, to ensure
839 * that flow data is consistently accounted at this point. */
841 bond_rebalance(struct bond
*bond
, struct tag_set
*tags
)
843 struct bond_slave
*slave
;
844 struct bond_entry
*e
;
847 if (bond
->balance
== BM_AB
|| time_msec() < bond
->next_rebalance
) {
850 bond
->next_rebalance
= time_msec() + bond
->rebalance_interval
;
852 /* Add each bond_entry to its slave's 'entries' list.
853 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
854 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
856 list_init(&slave
->entries
);
858 for (e
= &bond
->hash
[0]; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
859 if (e
->slave
&& e
->tx_bytes
) {
860 e
->slave
->tx_bytes
+= e
->tx_bytes
;
861 list_push_back(&e
->slave
->entries
, &e
->list_node
);
865 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
867 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
868 * with a proper list sort algorithm. */
870 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
871 if (slave
->enabled
) {
872 insert_bal(&bals
, slave
);
875 log_bals(bond
, &bals
);
877 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
878 while (!list_is_short(&bals
)) {
879 struct bond_slave
*from
= bond_slave_from_bal_node(list_front(&bals
));
880 struct bond_slave
*to
= bond_slave_from_bal_node(list_back(&bals
));
883 overload
= from
->tx_bytes
- to
->tx_bytes
;
884 if (overload
< to
->tx_bytes
>> 5 || overload
< 100000) {
885 /* The extra load on 'from' (and all less-loaded slaves), compared
886 * to that of 'to' (the least-loaded slave), is less than ~3%, or
887 * it is less than ~1Mbps. No point in rebalancing. */
891 /* 'from' is carrying significantly more load than 'to', and that load
892 * is split across at least two different hashes. */
893 e
= choose_entry_to_migrate(from
, to
->tx_bytes
);
895 bond_shift_load(e
, to
, tags
);
897 /* Delete element from from->entries.
899 * We don't add the element to to->entries. That would only allow
900 * 'e' to be migrated to another slave in this rebalancing run, and
901 * there is no point in doing that. */
902 list_remove(&e
->list_node
);
904 /* Re-sort 'bals'. */
905 reinsert_bal(&bals
, from
);
906 reinsert_bal(&bals
, to
);
908 /* Can't usefully migrate anything away from 'from'.
909 * Don't reconsider it. */
910 list_remove(&from
->bal_node
);
914 /* Implement exponentially weighted moving average. A weight of 1/2 causes
915 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
916 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
917 for (e
= &bond
->hash
[0]; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
925 /* Bonding unixctl user interface functions. */
928 bond_find(const char *name
)
932 HMAP_FOR_EACH_WITH_HASH (bond
, hmap_node
, hash_string(name
, 0),
934 if (!strcmp(bond
->name
, name
)) {
941 static struct bond_slave
*
942 bond_lookup_slave(struct bond
*bond
, const char *slave_name
)
944 struct bond_slave
*slave
;
946 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
947 if (!strcmp(slave
->name
, slave_name
)) {
955 bond_unixctl_list(struct unixctl_conn
*conn
,
956 const char *args OVS_UNUSED
, void *aux OVS_UNUSED
)
958 struct ds ds
= DS_EMPTY_INITIALIZER
;
959 const struct bond
*bond
;
961 ds_put_cstr(&ds
, "bond\ttype\tslaves\n");
963 HMAP_FOR_EACH (bond
, hmap_node
, &all_bonds
) {
964 const struct bond_slave
*slave
;
967 ds_put_format(&ds
, "%s\t%s\t",
968 bond
->name
, bond_mode_to_string(bond
->balance
));
971 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
973 ds_put_cstr(&ds
, ", ");
975 ds_put_cstr(&ds
, slave
->name
);
977 ds_put_char(&ds
, '\n');
979 unixctl_command_reply(conn
, 200, ds_cstr(&ds
));
984 bond_unixctl_show(struct unixctl_conn
*conn
,
985 const char *args
, void *aux OVS_UNUSED
)
987 struct ds ds
= DS_EMPTY_INITIALIZER
;
988 const struct bond_slave
*slave
;
989 const struct bond
*bond
;
991 bond
= bond_find(args
);
993 unixctl_command_reply(conn
, 501, "no such bond");
997 ds_put_format(&ds
, "bond_mode: %s\n",
998 bond_mode_to_string(bond
->balance
));
1001 ds_put_format(&ds
, "lacp: %s\n",
1002 lacp_is_active(bond
->lacp
) ? "active" : "passive");
1004 ds_put_cstr(&ds
, "lacp: off\n");
1007 if (bond
->balance
!= BM_AB
) {
1008 ds_put_format(&ds
, "bond-hash-algorithm: %s\n",
1009 bond_is_tcp_hash(bond
) ? "balance-tcp" : "balance-slb");
1012 ds_put_format(&ds
, "bond-detect-mode: %s\n",
1013 bond
->monitor
? "carrier" : "miimon");
1015 if (!bond
->monitor
) {
1016 ds_put_format(&ds
, "bond-miimon-interval: %lld\n",
1017 bond
->miimon_interval
);
1020 ds_put_format(&ds
, "updelay: %d ms\n", bond
->updelay
);
1021 ds_put_format(&ds
, "downdelay: %d ms\n", bond
->downdelay
);
1023 if (bond
->balance
!= BM_AB
) {
1024 ds_put_format(&ds
, "next rebalance: %lld ms\n",
1025 bond
->next_rebalance
- time_msec());
1028 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1029 struct bond_entry
*be
;
1033 ds_put_format(&ds
, "\nslave %s: %s\n",
1034 slave
->name
, slave
->enabled
? "enabled" : "disabled");
1035 if (slave
== bond
->active_slave
) {
1036 ds_put_cstr(&ds
, "\tactive slave\n");
1038 if (slave
->delay_expires
!= LLONG_MAX
) {
1039 ds_put_format(&ds
, "\t%s expires in %lld ms\n",
1040 slave
->enabled
? "downdelay" : "updelay",
1041 slave
->delay_expires
- time_msec());
1044 if (bond
->balance
== BM_AB
) {
1049 memset(&flow
, 0, sizeof flow
);
1050 for (be
= bond
->hash
; be
<= &bond
->hash
[BOND_MASK
]; be
++) {
1051 int hash
= be
- bond
->hash
;
1053 if (be
->slave
!= slave
) {
1057 ds_put_format(&ds
, "\thash %d: %"PRIu64
" kB load\n",
1058 hash
, be
->tx_bytes
/ 1024);
1060 if (bond
->balance
!= BM_SLB
) {
1064 /* XXX How can we list the MACs assigned to hashes? */
1067 unixctl_command_reply(conn
, 200, ds_cstr(&ds
));
1072 bond_unixctl_migrate(struct unixctl_conn
*conn
, const char *args_
,
1073 void *aux OVS_UNUSED
)
1075 char *args
= (char *) args_
;
1076 char *save_ptr
= NULL
;
1077 char *bond_s
, *hash_s
, *slave_s
;
1079 struct bond_slave
*slave
;
1080 struct bond_entry
*entry
;
1083 bond_s
= strtok_r(args
, " ", &save_ptr
);
1084 hash_s
= strtok_r(NULL
, " ", &save_ptr
);
1085 slave_s
= strtok_r(NULL
, " ", &save_ptr
);
1087 unixctl_command_reply(conn
, 501,
1088 "usage: bond/migrate BOND HASH SLAVE");
1092 bond
= bond_find(bond_s
);
1094 unixctl_command_reply(conn
, 501, "no such bond");
1098 if (bond
->balance
!= BM_SLB
) {
1099 unixctl_command_reply(conn
, 501, "not an SLB bond");
1103 if (strspn(hash_s
, "0123456789") == strlen(hash_s
)) {
1104 hash
= atoi(hash_s
) & BOND_MASK
;
1106 unixctl_command_reply(conn
, 501, "bad hash");
1110 slave
= bond_lookup_slave(bond
, slave_s
);
1112 unixctl_command_reply(conn
, 501, "no such slave");
1116 if (!slave
->enabled
) {
1117 unixctl_command_reply(conn
, 501, "cannot migrate to disabled slave");
1121 entry
= &bond
->hash
[hash
];
1122 tag_set_add(&bond
->unixctl_tags
, entry
->tag
);
1123 entry
->slave
= slave
;
1124 entry
->tag
= tag_create_random();
1125 unixctl_command_reply(conn
, 200, "migrated");
1129 bond_unixctl_set_active_slave(struct unixctl_conn
*conn
, const char *args_
,
1130 void *aux OVS_UNUSED
)
1132 char *args
= (char *) args_
;
1133 char *save_ptr
= NULL
;
1134 char *bond_s
, *slave_s
;
1136 struct bond_slave
*slave
;
1138 bond_s
= strtok_r(args
, " ", &save_ptr
);
1139 slave_s
= strtok_r(NULL
, " ", &save_ptr
);
1141 unixctl_command_reply(conn
, 501,
1142 "usage: bond/set-active-slave BOND SLAVE");
1146 bond
= bond_find(bond_s
);
1148 unixctl_command_reply(conn
, 501, "no such bond");
1152 slave
= bond_lookup_slave(bond
, slave_s
);
1154 unixctl_command_reply(conn
, 501, "no such slave");
1158 if (!slave
->enabled
) {
1159 unixctl_command_reply(conn
, 501, "cannot make disabled slave active");
1163 if (bond
->active_slave
!= slave
) {
1164 tag_set_add(&bond
->unixctl_tags
, bond_get_active_slave_tag(bond
));
1165 bond
->active_slave
= slave
;
1166 bond
->active_slave
->tag
= tag_create_random();
1167 VLOG_INFO("bond %s: active interface is now %s",
1168 bond
->name
, slave
->name
);
1169 bond
->send_learning_packets
= true;
1170 unixctl_command_reply(conn
, 200, "done");
1172 unixctl_command_reply(conn
, 200, "no change");
1177 enable_slave(struct unixctl_conn
*conn
, const char *args_
, bool enable
)
1179 char *args
= (char *) args_
;
1180 char *save_ptr
= NULL
;
1181 char *bond_s
, *slave_s
;
1183 struct bond_slave
*slave
;
1185 bond_s
= strtok_r(args
, " ", &save_ptr
);
1186 slave_s
= strtok_r(NULL
, " ", &save_ptr
);
1188 char *usage
= xasprintf("usage: bond/%s-slave BOND SLAVE",
1189 enable
? "enable" : "disable");
1190 unixctl_command_reply(conn
, 501, usage
);
1195 bond
= bond_find(bond_s
);
1197 unixctl_command_reply(conn
, 501, "no such bond");
1201 slave
= bond_lookup_slave(bond
, slave_s
);
1203 unixctl_command_reply(conn
, 501, "no such slave");
1207 bond_enable_slave(slave
, enable
, &bond
->unixctl_tags
);
1208 unixctl_command_reply(conn
, 501, enable
? "enabled" : "disabled");
1212 bond_unixctl_enable_slave(struct unixctl_conn
*conn
, const char *args
,
1213 void *aux OVS_UNUSED
)
1215 enable_slave(conn
, args
, true);
1219 bond_unixctl_disable_slave(struct unixctl_conn
*conn
, const char *args
,
1220 void *aux OVS_UNUSED
)
1222 enable_slave(conn
, args
, false);
1226 bond_unixctl_hash(struct unixctl_conn
*conn
, const char *args_
,
1227 void *aux OVS_UNUSED
)
1229 char *args
= (char *) args_
;
1230 uint8_t mac
[ETH_ADDR_LEN
];
1234 char *mac_s
, *vlan_s
;
1235 char *save_ptr
= NULL
;
1237 mac_s
= strtok_r(args
, " ", &save_ptr
);
1238 vlan_s
= strtok_r(NULL
, " ", &save_ptr
);
1241 if (sscanf(vlan_s
, "%u", &vlan
) != 1) {
1242 unixctl_command_reply(conn
, 501, "invalid vlan");
1246 vlan
= OFP_VLAN_NONE
;
1249 if (sscanf(mac_s
, ETH_ADDR_SCAN_FMT
, ETH_ADDR_SCAN_ARGS(mac
))
1250 == ETH_ADDR_SCAN_COUNT
) {
1251 hash
= bond_hash_src(mac
, vlan
) & BOND_MASK
;
1253 hash_cstr
= xasprintf("%u", hash
);
1254 unixctl_command_reply(conn
, 200, hash_cstr
);
1257 unixctl_command_reply(conn
, 501, "invalid mac");
/* Registers the bond unixctl commands ("bond/list", "bond/show",
 * "bond/migrate", "bond/set-active-slave", "bond/enable-slave",
 * "bond/disable-slave" and "bond/hash"), each with its handler function.
 * NOTE(review): the enclosing function header and the final argument of the
 * enable-slave/disable-slave registrations are not visible in this excerpt;
 * confirm against the full file. */
1266 unixctl_command_register("bond/list", bond_unixctl_list
, NULL
);
1267 unixctl_command_register("bond/show", bond_unixctl_show
, NULL
);
1268 unixctl_command_register("bond/migrate", bond_unixctl_migrate
, NULL
);
1269 unixctl_command_register("bond/set-active-slave",
1270 bond_unixctl_set_active_slave
, NULL
);
1271 unixctl_command_register("bond/enable-slave", bond_unixctl_enable_slave
,
1273 unixctl_command_register("bond/disable-slave", bond_unixctl_disable_slave
,
1275 unixctl_command_register("bond/hash", bond_unixctl_hash
, NULL
);
/* Resets 'bond''s hash table.  When the balance mode is not BM_AB, the table
 * of (BOND_MASK + 1) entries is zeroed and the next rebalance is scheduled
 * at time_msec() plus 'rebalance_interval'.
 * NOTE(review): the condition guarding the xmalloc() call and the handling
 * of the BM_AB case are not visible in this excerpt; confirm against the
 * full file. */
1279 bond_entry_reset(struct bond
*bond
)
1281 if (bond
->balance
!= BM_AB
) {
/* Size in bytes of the whole bucket array. */
1282 size_t hash_len
= (BOND_MASK
+ 1) * sizeof *bond
->hash
;
1285 bond
->hash
= xmalloc(hash_len
);
/* Clear every bucket. */
1287 memset(bond
->hash
, 0, hash_len
);
1289 bond
->next_rebalance
= time_msec() + bond
->rebalance_interval
;
1296 static struct bond_slave
*
1297 bond_slave_lookup(struct bond
*bond
, const void *slave_
)
1299 struct bond_slave
*slave
;
1301 HMAP_FOR_EACH_IN_BUCKET (slave
, hmap_node
, hash_pointer(slave_
, 0),
1303 if (slave
->aux
== slave_
) {
1312 bond_is_link_up(struct bond
*bond
, struct netdev
*netdev
)
1314 return (bond
->detect
== BLSM_CARRIER
1315 ? netdev_get_carrier(netdev
)
1316 : netdev_get_miimon(netdev
));
/* Sets 'slave''s enabled state to 'enable'.  Always resets 'delay_expires'
 * to LLONG_MAX.  If the state actually changes, logs the new state: the
 * "disabled" path adds the slave's current tag to 'tags', and the "enabled"
 * path assigns the slave a fresh random tag.
 * NOTE(review): the line(s) between the "disabled" log message and the
 * tag_set_add() call are not visible in this excerpt (presumably a guard on
 * 'tags'); confirm against the full file. */
1320 bond_enable_slave(struct bond_slave
*slave
, bool enable
, struct tag_set
*tags
)
/* No delayed state change is pending after this call. */
1322 slave
->delay_expires
= LLONG_MAX
;
1323 if (enable
!= slave
->enabled
) {
1324 slave
->enabled
= enable
;
1325 if (!slave
->enabled
) {
1326 VLOG_WARN("interface %s: disabled", slave
->name
);
1328 tag_set_add(tags
, slave
->tag
);
1331 VLOG_WARN("interface %s: enabled", slave
->name
);
1332 slave
->tag
= tag_create_random();
/* Re-evaluates whether 'slave' should be enabled or disabled.
 *
 * 'up' is the slave's last-read link status combined with
 * lacp_slave_may_enable().  If 'up' disagrees with 'enabled' while no change
 * is pending, or agrees with it while a change is pending, the link state is
 * logged and the pending change is either cancelled ('delay_expires' set to
 * LLONG_MAX) or scheduled.  The scheduling delay is 0 when LACP has been
 * negotiated, otherwise 'updelay' or 'downdelay' depending on 'up'.
 *
 * Finally, once time_msec() has reached 'delay_expires', the new state is
 * applied through bond_enable_slave(), which receives 'tags'.
 * NOTE(review): several lines (including part of the last log call) are not
 * visible in this excerpt; confirm against the full file. */
1338 bond_link_status_update(struct bond_slave
*slave
, struct tag_set
*tags
)
1340 struct bond
*bond
= slave
->bond
;
1343 up
= slave
->up
&& lacp_slave_may_enable(bond
->lacp
, slave
);
/* True when the pending-change bookkeeping is out of step with 'up'. */
1344 if ((up
== slave
->enabled
) != (slave
->delay_expires
== LLONG_MAX
)) {
1345 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
1346 VLOG_INFO_RL(&rl
, "interface %s: link state %s",
1347 slave
->name
, up
? "up" : "down");
1348 if (up
== slave
->enabled
) {
/* The link returned to its previous state: cancel the pending change. */
1349 slave
->delay_expires
= LLONG_MAX
;
1350 VLOG_INFO_RL(&rl
, "interface %s: will not be %s",
1351 slave
->name
, up
? "disabled" : "enabled");
1353 int delay
= (lacp_negotiated(bond
->lacp
) ? 0
1354 : up
? bond
->updelay
: bond
->downdelay
);
1355 slave
->delay_expires
= time_msec() + delay
;
1357 VLOG_INFO_RL(&rl
, "interface %s: will be %s if it stays %s "
1360 up
? "enabled" : "disabled",
/* Apply the state change once its delay has elapsed. */
1367 if (time_msec() >= slave
->delay_expires
) {
1368 bond_enable_slave(slave
, up
, tags
);
1373 bond_is_tcp_hash(const struct bond
*bond
)
1375 return bond
->balance
== BM_TCP
&& lacp_negotiated(bond
->lacp
);
1379 bond_hash_src(const uint8_t mac
[ETH_ADDR_LEN
], uint16_t vlan
)
1381 return hash_bytes(mac
, ETH_ADDR_LEN
, vlan
);
1385 bond_hash_tcp(const struct flow
*flow
, uint16_t vlan
)
1387 struct flow hash_flow
= *flow
;
1388 hash_flow
.vlan_tci
= vlan
;
1390 /* The symmetric quality of this hash function is not required, but
1391 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1392 * purposes, so we use it out of convenience. */
1393 return flow_hash_symmetric_l4(&hash_flow
, 0);
1396 static struct bond_entry
*
1397 lookup_bond_entry(const struct bond
*bond
, const struct flow
*flow
,
1400 assert(bond
->balance
!= BM_AB
);
1401 return &bond
->hash
[(bond_is_tcp_hash(bond
)
1402 ? bond_hash_tcp(flow
, vlan
)
1403 : bond_hash_src(flow
->dl_src
, vlan
)) & BOND_MASK
];
/* Chooses the slave of 'bond' that should carry 'flow', dispatching on the
 * bond's balance mode.  One path returns the active slave directly.  The
 * other looks up the flow's hash entry and, if that entry has no slave or a
 * disabled one, reassigns it to a randomly picked slave (falling back to the
 * active slave if the random pick is disabled) and gives the entry a fresh
 * random tag.
 * NOTE(review): the last parameter, the case labels and the final return are
 * not visible in this excerpt; confirm against the full file. */
1406 static struct bond_slave
*
1407 choose_output_slave(const struct bond
*bond
, const struct flow
*flow
,
1410 struct bond_entry
*e
;
1412 switch (bond
->balance
) {
1414 return bond
->active_slave
;
1418 e
= lookup_bond_entry(bond
, flow
, vlan
);
/* Re-home the entry if it is unassigned or points at a disabled slave. */
1419 if (!e
->slave
|| !e
->slave
->enabled
) {
1420 e
->slave
= CONTAINER_OF(hmap_random_node(&bond
->slaves
),
1421 struct bond_slave
, hmap_node
);
1422 if (!e
->slave
->enabled
) {
1423 e
->slave
= bond
->active_slave
;
1425 e
->tag
= tag_create_random();
1434 static struct bond_slave
*
1435 bond_choose_slave(const struct bond
*bond
)
1437 struct bond_slave
*slave
, *best
;
1439 /* Find an enabled slave. */
1440 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1441 if (slave
->enabled
) {
1446 /* All interfaces are disabled. Find an interface that will be enabled
1447 * after its updelay expires. */
1449 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1450 if (slave
->delay_expires
!= LLONG_MAX
1451 && lacp_slave_may_enable(bond
->lacp
, slave
)
1452 && (!best
|| slave
->delay_expires
< best
->delay_expires
)) {
1460 bond_choose_active_slave(struct bond
*bond
, struct tag_set
*tags
)
1462 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
1463 struct bond_slave
*old_active_slave
= bond
->active_slave
;
1465 bond
->active_slave
= bond_choose_slave(bond
);
1466 if (bond
->active_slave
) {
1467 if (bond
->active_slave
->enabled
) {
1468 VLOG_INFO_RL(&rl
, "bond %s: active interface is now %s",
1469 bond
->name
, bond
->active_slave
->name
);
1471 VLOG_INFO_RL(&rl
, "bond %s: active interface is now %s, skipping "
1472 "remaining %lld ms updelay (since no interface was "
1473 "enabled)", bond
->name
, bond
->active_slave
->name
,
1474 bond
->active_slave
->delay_expires
- time_msec());
1475 bond_enable_slave(bond
->active_slave
, true, tags
);
1478 if (!old_active_slave
) {
1479 tag_set_add(tags
, bond
->no_slaves_tag
);
1482 bond
->send_learning_packets
= true;
1483 } else if (old_active_slave
) {
1484 VLOG_WARN_RL(&rl
, "bond %s: all interfaces disabled", bond
->name
);
1488 /* Returns the tag for 'bond''s active slave, or 'bond''s no_slaves_tag if
1489 * there is no active slave. */
1491 bond_get_active_slave_tag(const struct bond
*bond
)
1493 return (bond
->active_slave
1494 ? bond
->active_slave
->tag
1495 : bond
->no_slaves_tag
);
1498 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1499 * bond interface. */
1501 bond_update_fake_slave_stats(struct bond
*bond
)
1503 struct netdev_stats bond_stats
;
1504 struct bond_slave
*slave
;
1505 struct netdev
*bond_dev
;
/* Start the totals from zero. */
1507 memset(&bond_stats
, 0, sizeof bond_stats
);
/* Accumulate the counters of every slave whose netdev_get_stats() call
 * returns zero; other slaves contribute nothing. */
1509 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1510 struct netdev_stats slave_stats
;
1512 if (!netdev_get_stats(slave
->netdev
, &slave_stats
)) {
1513 /* XXX: We swap the stats here because they are swapped back when
1514 * reported by the internal device. The reason for this is
1515 * internal devices normally represent packets going into the
1516 * system but when used as fake bond device they represent packets
1517 * leaving the system. We really should do this in the internal
1518 * device itself because changing it here reverses the counts from
1519 * the perspective of the switch. However, the internal device
1520 * doesn't know what type of device it represents so we have to do
1521 * it here for now. */
1522 bond_stats
.tx_packets
+= slave_stats
.rx_packets
;
1523 bond_stats
.tx_bytes
+= slave_stats
.rx_bytes
;
1524 bond_stats
.rx_packets
+= slave_stats
.tx_packets
;
1525 bond_stats
.rx_bytes
+= slave_stats
.tx_bytes
;
/* Open the netdev named after the bond; if that returns zero, store the
 * totals on it and close it again. */
1529 if (!netdev_open_default(bond
->name
, &bond_dev
)) {
1530 netdev_set_stats(bond_dev
, &bond_stats
);
1531 netdev_close(bond_dev
);