2 * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
26 #include "dynamic-string.h"
34 #include "poll-loop.h"
40 VLOG_DEFINE_THIS_MODULE(bond
);
42 /* Bit-mask for hashing a flow down to a bucket.
43 * There are (BOND_MASK + 1) buckets. */
44 #define BOND_MASK 0xff
46 /* A hash bucket for mapping a flow to a slave.
47 * "struct bond" has an array of (BOND_MASK + 1) of these. */
49 struct bond_slave
*slave
; /* Assigned slave, NULL if unassigned. */
50 uint64_t tx_bytes
; /* Count of bytes recently transmitted. */
51 tag_type tag
; /* Tag for entry<->slave association. */
52 struct list list_node
; /* In bond_slave's 'entries' list. */
55 /* A bond slave, that is, one of the links comprising a bond. */
57 struct hmap_node hmap_node
; /* In struct bond's slaves hmap. */
58 struct bond
*bond
; /* The bond that contains this slave. */
59 void *aux
; /* Client-provided handle for this slave. */
61 struct netdev
*netdev
; /* Network device, owned by the client. */
62 unsigned int change_seq
; /* Tracks changes in 'netdev'. */
63 char *name
; /* Name (a copy of netdev_get_name(netdev)). */
66 long long delay_expires
; /* Time after which 'enabled' may change. */
67 bool enabled
; /* May be chosen for flows? */
68 bool may_enable
; /* Client considers this slave bondable. */
69 tag_type tag
; /* Tag associated with this slave. */
71 /* Rebalancing info. Used only by bond_rebalance(). */
72 struct list bal_node
; /* In bond_rebalance()'s 'bals' list. */
73 struct list entries
; /* 'struct bond_entry's assigned here. */
74 uint64_t tx_bytes
; /* Sum across 'tx_bytes' of entries. */
76 /* BM_STABLE specific bonding info. */
77 uint32_t stb_id
; /* ID used for 'stb_slaves' ordering. */
80 /* A bond, that is, a set of network devices grouped to improve performance or
83 struct hmap_node hmap_node
; /* In 'all_bonds' hmap. */
84 char *name
; /* Name provided by client. */
90 enum bond_mode balance
; /* Balancing mode, one of BM_*. */
91 struct bond_slave
*active_slave
;
92 tag_type no_slaves_tag
; /* Tag for flows when all slaves disabled. */
93 int updelay
, downdelay
; /* Delay before slave goes up/down, in ms. */
94 bool lacp_negotiated
; /* LACP negotiations were successful. */
95 bool bond_revalidate
; /* True if flows need revalidation. */
96 uint32_t basis
; /* Basis for flow hash function. */
98 /* SLB specific bonding info. */
99 struct bond_entry
*hash
; /* An array of (BOND_MASK + 1) elements. */
100 int rebalance_interval
; /* Interval between rebalances, in ms. */
101 long long int next_rebalance
; /* Next rebalancing time. */
102 bool send_learning_packets
;
104 /* BM_STABLE specific bonding info. */
105 tag_type stb_tag
; /* Tag associated with this bond. */
107 /* Legacy compatibility. */
108 long long int next_fake_iface_update
; /* LLONG_MAX if disabled. */
110 /* Tag set saved for next bond_run(). This tag set is a kluge for cases
111 * where we can't otherwise provide revalidation feedback to the client.
112 * That's only unixctl commands now; I hope no other cases will arise. */
113 struct tag_set unixctl_tags
;
116 static struct hmap all_bonds
= HMAP_INITIALIZER(&all_bonds
);
118 static void bond_entry_reset(struct bond
*);
119 static struct bond_slave
*bond_slave_lookup(struct bond
*, const void *slave_
);
120 static void bond_enable_slave(struct bond_slave
*, bool enable
,
122 static void bond_link_status_update(struct bond_slave
*, struct tag_set
*);
123 static void bond_choose_active_slave(struct bond
*, struct tag_set
*);
124 static bool bond_is_tcp_hash(const struct bond
*);
125 static unsigned int bond_hash_src(const uint8_t mac
[ETH_ADDR_LEN
],
126 uint16_t vlan
, uint32_t basis
);
127 static unsigned int bond_hash_tcp(const struct flow
*, uint16_t vlan
,
129 static struct bond_entry
*lookup_bond_entry(const struct bond
*,
132 static tag_type
bond_get_active_slave_tag(const struct bond
*);
133 static struct bond_slave
*choose_output_slave(const struct bond
*,
136 static void bond_update_fake_slave_stats(struct bond
*);
138 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
139 * stores the mode in '*balance' and returns true. Otherwise returns false
140 * without modifying '*balance'. */
142 bond_mode_from_string(enum bond_mode
*balance
, const char *s
)
144 if (!strcmp(s
, bond_mode_to_string(BM_TCP
))) {
146 } else if (!strcmp(s
, bond_mode_to_string(BM_SLB
))) {
148 } else if (!strcmp(s
, bond_mode_to_string(BM_STABLE
))) {
149 *balance
= BM_STABLE
;
150 } else if (!strcmp(s
, bond_mode_to_string(BM_AB
))) {
158 /* Returns a string representing 'balance'. */
160 bond_mode_to_string(enum bond_mode balance
) {
163 return "balance-tcp";
165 return "balance-slb";
169 return "active-backup";
175 /* Creates and returns a new bond whose configuration is initially taken from
178 * The caller should register each slave on the new bond by calling
179 * bond_slave_register(). */
181 bond_create(const struct bond_settings
*s
)
185 bond
= xzalloc(sizeof *bond
);
186 hmap_init(&bond
->slaves
);
187 bond
->no_slaves_tag
= tag_create_random();
188 bond
->stb_tag
= tag_create_random();
189 bond
->next_fake_iface_update
= LLONG_MAX
;
191 bond_reconfigure(bond
, s
);
193 tag_set_init(&bond
->unixctl_tags
);
200 bond_destroy(struct bond
*bond
)
202 struct bond_slave
*slave
, *next_slave
;
208 hmap_remove(&all_bonds
, &bond
->hmap_node
);
210 HMAP_FOR_EACH_SAFE (slave
, next_slave
, hmap_node
, &bond
->slaves
) {
211 hmap_remove(&bond
->slaves
, &slave
->hmap_node
);
212 /* Client owns 'slave->netdev'. */
216 hmap_destroy(&bond
->slaves
);
223 /* Updates 'bond''s overall configuration to 's'.
225 * The caller should register each slave on 'bond' by calling
226 * bond_slave_register(). This is optional if none of the slaves'
227 * configuration has changed. In any case it can't hurt.
229 * Returns true if the configuration has changed in such a way that requires
233 bond_reconfigure(struct bond
*bond
, const struct bond_settings
*s
)
235 bool revalidate
= false;
237 if (!bond
->name
|| strcmp(bond
->name
, s
->name
)) {
239 hmap_remove(&all_bonds
, &bond
->hmap_node
);
242 bond
->name
= xstrdup(s
->name
);
243 hmap_insert(&all_bonds
, &bond
->hmap_node
, hash_string(bond
->name
, 0));
246 bond
->updelay
= s
->up_delay
;
247 bond
->downdelay
= s
->down_delay
;
248 bond
->rebalance_interval
= s
->rebalance_interval
;
250 if (bond
->balance
!= s
->balance
) {
251 bond
->balance
= s
->balance
;
255 if (bond
->basis
!= s
->basis
) {
256 bond
->basis
= s
->basis
;
261 if (bond
->next_fake_iface_update
== LLONG_MAX
) {
262 bond
->next_fake_iface_update
= time_msec();
265 bond
->next_fake_iface_update
= LLONG_MAX
;
268 if (bond
->bond_revalidate
) {
270 bond
->bond_revalidate
= false;
273 if (bond
->balance
== BM_AB
|| !bond
->hash
|| revalidate
) {
274 bond_entry_reset(bond
);
281 bond_slave_set_netdev__(struct bond_slave
*slave
, struct netdev
*netdev
)
283 if (slave
->netdev
!= netdev
) {
284 slave
->netdev
= netdev
;
285 slave
->change_seq
= 0;
289 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
290 * arbitrary client-provided pointer that uniquely identifies a slave within a
291 * bond. If 'slave_' already exists within 'bond' then this function
292 * reconfigures the existing slave.
294 * 'stb_id' is used in BM_STABLE bonds to guarantee consistent slave choices
295 * across restarts and distributed vswitch instances. It should be unique per
296 * slave, and preferably consistent across restarts and reconfigurations.
298 * 'netdev' must be the network device that 'slave_' represents. It is owned
299 * by the client, so the client must not close it before either unregistering
300 * 'slave_' or destroying 'bond'.
303 bond_slave_register(struct bond
*bond
, void *slave_
, uint32_t stb_id
,
304 struct netdev
*netdev
)
306 struct bond_slave
*slave
= bond_slave_lookup(bond
, slave_
);
309 slave
= xzalloc(sizeof *slave
);
311 hmap_insert(&bond
->slaves
, &slave
->hmap_node
, hash_pointer(slave_
, 0));
314 slave
->delay_expires
= LLONG_MAX
;
315 slave
->name
= xstrdup(netdev_get_name(netdev
));
316 bond
->bond_revalidate
= true;
318 slave
->enabled
= false;
319 bond_enable_slave(slave
, netdev_get_carrier(netdev
), NULL
);
322 if (slave
->stb_id
!= stb_id
) {
323 slave
->stb_id
= stb_id
;
324 bond
->bond_revalidate
= true;
327 bond_slave_set_netdev__(slave
, netdev
);
330 slave
->name
= xstrdup(netdev_get_name(netdev
));
333 /* Updates the network device to be used with 'slave_' to 'netdev'.
335 * This is useful if the caller closes and re-opens the network device
336 * registered with bond_slave_register() but doesn't need to change anything
339 bond_slave_set_netdev(struct bond
*bond
, void *slave_
, struct netdev
*netdev
)
341 struct bond_slave
*slave
= bond_slave_lookup(bond
, slave_
);
343 bond_slave_set_netdev__(slave
, netdev
);
347 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
348 * then this function has no effect.
350 * Unregistering a slave invalidates all flows. */
352 bond_slave_unregister(struct bond
*bond
, const void *slave_
)
354 struct bond_slave
*slave
= bond_slave_lookup(bond
, slave_
);
361 bond_enable_slave(slave
, false, NULL
);
363 del_active
= bond
->active_slave
== slave
;
365 struct bond_entry
*e
;
366 for (e
= bond
->hash
; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
367 if (e
->slave
== slave
) {
375 hmap_remove(&bond
->slaves
, &slave
->hmap_node
);
376 /* Client owns 'slave->netdev'. */
383 bond_choose_active_slave(bond
, &tags
);
384 bond
->send_learning_packets
= true;
388 /* Should be called on each slave in 'bond' before bond_run() to indicate
389 * whether or not 'slave_' may be enabled. This function is intended to allow
390 * other protocols to have some impact on bonding decisions. For example LACP
391 * or high level link monitoring protocols may decide that a given slave should
392 * not be able to send traffic. */
394 bond_slave_set_may_enable(struct bond
*bond
, void *slave_
, bool may_enable
)
396 bond_slave_lookup(bond
, slave_
)->may_enable
= may_enable
;
399 /* Performs periodic maintenance on 'bond'. The caller must provide 'tags' to
400 * allow tagged flows to be invalidated.
402 * The caller should check bond_should_send_learning_packets() afterward. */
404 bond_run(struct bond
*bond
, struct tag_set
*tags
, bool lacp_negotiated
)
406 struct bond_slave
*slave
;
407 bool is_tcp_hash
= bond_is_tcp_hash(bond
);
409 bond
->lacp_negotiated
= lacp_negotiated
;
411 /* Enable slaves based on link status and LACP feedback. */
412 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
413 bond_link_status_update(slave
, tags
);
414 slave
->change_seq
= netdev_change_seq(slave
->netdev
);
416 if (!bond
->active_slave
|| !bond
->active_slave
->enabled
) {
417 bond_choose_active_slave(bond
, tags
);
420 /* Update fake bond interface stats. */
421 if (time_msec() >= bond
->next_fake_iface_update
) {
422 bond_update_fake_slave_stats(bond
);
423 bond
->next_fake_iface_update
= time_msec() + 1000;
426 if (is_tcp_hash
!= bond_is_tcp_hash(bond
)) {
427 bond
->bond_revalidate
= true;
430 if (bond
->bond_revalidate
) {
431 bond
->bond_revalidate
= false;
433 bond_entry_reset(bond
);
434 if (bond
->balance
!= BM_STABLE
) {
435 struct bond_slave
*slave
;
437 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
438 tag_set_add(tags
, slave
->tag
);
441 tag_set_add(tags
, bond
->stb_tag
);
443 tag_set_add(tags
, bond
->no_slaves_tag
);
446 /* Invalidate any tags saved by unixctl commands since the last run. */
447 tag_set_union(tags
, &bond
->unixctl_tags
);
448 tag_set_init(&bond
->unixctl_tags
);
451 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
453 bond_wait(struct bond
*bond
)
455 struct bond_slave
*slave
;
457 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
458 if (slave
->delay_expires
!= LLONG_MAX
) {
459 poll_timer_wait_until(slave
->delay_expires
);
462 if (slave
->change_seq
!= netdev_change_seq(slave
->netdev
)) {
463 poll_immediate_wake();
467 if (bond
->next_fake_iface_update
!= LLONG_MAX
) {
468 poll_timer_wait_until(bond
->next_fake_iface_update
);
471 /* Ensure that any saved tags get revalidated right away. */
472 if (!tag_set_is_empty(&bond
->unixctl_tags
)) {
473 poll_immediate_wake();
476 /* We don't wait for bond->next_rebalance because rebalancing can only run
477 * at a flow account checkpoint. ofproto does checkpointing on its own
478 * schedule and bond_rebalance() gets called afterward, so we'd just be
479 * waking up for no purpose. */
482 /* MAC learning table interaction. */
485 may_send_learning_packets(const struct bond
*bond
)
487 return !bond
->lacp_negotiated
&& bond
->balance
!= BM_AB
;
490 /* Returns true if 'bond' needs the client to send out packets to assist with
491 * MAC learning on 'bond'. If this function returns true, then the client
492 * should iterate through its MAC learning table for the bridge on which 'bond'
493 * is located. For each MAC that has been learned on a port other than 'bond',
494 * it should call bond_send_learning_packet().
496 * This function will only return true if LACP is not negotiated and 'bond' is
497 * not in active-backup mode. Otherwise sending learning packets isn't necessary.
499 * Calling this function resets the state that it checks. */
501 bond_should_send_learning_packets(struct bond
*bond
)
503 bool send
= bond
->send_learning_packets
&& may_send_learning_packets(bond
);
504 bond
->send_learning_packets
= false;
508 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
510 * See bond_should_send_learning_packets() for description of usage. */
512 bond_send_learning_packet(struct bond
*bond
,
513 const uint8_t eth_src
[ETH_ADDR_LEN
],
516 struct bond_slave
*slave
;
517 struct ofpbuf packet
;
521 assert(may_send_learning_packets(bond
));
522 if (!bond
->active_slave
) {
523 /* Nowhere to send the learning packet. */
527 memset(&flow
, 0, sizeof flow
);
528 memcpy(flow
.dl_src
, eth_src
, ETH_ADDR_LEN
);
529 slave
= choose_output_slave(bond
, &flow
, vlan
);
531 ofpbuf_init(&packet
, 0);
532 compose_benign_packet(&packet
, "Open vSwitch Bond Failover", 0xf177,
535 eth_push_vlan(&packet
, htons(vlan
));
537 error
= netdev_send(slave
->netdev
, &packet
);
538 ofpbuf_uninit(&packet
);
543 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
544 * Ethernet destination address of 'eth_dst', should be admitted.
546 * The return value is one of the following:
548 * - BV_ACCEPT: Admit the packet.
550 * - BV_DROP: Drop the packet.
552 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
553 * Ethernet source address and VLAN. If there is none, or if the packet
554 * is on the learned port, then admit the packet. If a different port has
555 * been learned, however, drop the packet (and do not use it for MAC
559 bond_check_admissibility(struct bond
*bond
, const void *slave_
,
560 const uint8_t eth_dst
[ETH_ADDR_LEN
], tag_type
*tags
)
562 struct bond_slave
*slave
= bond_slave_lookup(bond
, slave_
);
564 /* LACP bonds have very loose admissibility restrictions because we can
565 * assume the remote switch is aware of the bond and will "do the right
566 * thing". However, as a precaution we drop packets on disabled slaves
567 * because no correctly implemented partner switch should be sending
568 * packets to them. */
569 if (bond
->lacp_negotiated
) {
570 return slave
->enabled
? BV_ACCEPT
: BV_DROP
;
573 /* Drop all multicast packets on inactive slaves. */
574 if (eth_addr_is_multicast(eth_dst
)) {
575 *tags
|= bond_get_active_slave_tag(bond
);
576 if (bond
->active_slave
!= bond_slave_lookup(bond
, slave_
)) {
581 /* Drop all packets which arrive on backup slaves. This is similar to how
582 * Linux bonding handles active-backup bonds. */
583 if (bond
->balance
== BM_AB
) {
584 *tags
|= bond_get_active_slave_tag(bond
);
585 if (bond
->active_slave
!= slave
) {
586 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
588 VLOG_DBG_RL(&rl
, "active-backup bond received packet on backup"
589 " slave (%s) destined for " ETH_ADDR_FMT
,
590 slave
->name
, ETH_ADDR_ARGS(eth_dst
));
595 /* Drop all packets for which we have learned a different input port,
596 * because we probably sent the packet on one slave and got it back on the
597 * other. Gratuitous ARP packets are an exception to this rule: the host
598 * has moved to another switch. The exception to the exception is if we
599 * locked the learning table to avoid reflections on bond slaves. */
600 return BV_DROP_IF_MOVED
;
603 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
604 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
605 * NULL if the packet should be dropped because no slaves are enabled.
607 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
608 * should be a VID only (i.e. excluding the PCP bits). Second,
609 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
610 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
611 * packet belongs to (so for an access port it will be the access port's VLAN).
613 * Adds a tag to '*tags' that associates the flow with the returned slave.
616 bond_choose_output_slave(struct bond
*bond
, const struct flow
*flow
,
617 uint16_t vlan
, tag_type
*tags
)
619 struct bond_slave
*slave
= choose_output_slave(bond
, flow
, vlan
);
621 *tags
|= bond
->balance
== BM_STABLE
? bond
->stb_tag
: slave
->tag
;
624 *tags
|= bond
->no_slaves_tag
;
632 bond_is_balanced(const struct bond
*bond
)
634 return bond
->balance
== BM_SLB
|| bond
->balance
== BM_TCP
;
637 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
639 bond_account(struct bond
*bond
, const struct flow
*flow
, uint16_t vlan
,
642 if (bond_is_balanced(bond
)) {
643 lookup_bond_entry(bond
, flow
, vlan
)->tx_bytes
+= n_bytes
;
647 static struct bond_slave
*
648 bond_slave_from_bal_node(struct list
*bal
)
650 return CONTAINER_OF(bal
, struct bond_slave
, bal_node
);
654 log_bals(struct bond
*bond
, const struct list
*bals
)
656 if (VLOG_IS_DBG_ENABLED()) {
657 struct ds ds
= DS_EMPTY_INITIALIZER
;
658 const struct bond_slave
*slave
;
660 LIST_FOR_EACH (slave
, bal_node
, bals
) {
662 ds_put_char(&ds
, ',');
664 ds_put_format(&ds
, " %s %"PRIu64
"kB",
665 slave
->name
, slave
->tx_bytes
/ 1024);
667 if (!slave
->enabled
) {
668 ds_put_cstr(&ds
, " (disabled)");
670 if (!list_is_empty(&slave
->entries
)) {
671 struct bond_entry
*e
;
673 ds_put_cstr(&ds
, " (");
674 LIST_FOR_EACH (e
, list_node
, &slave
->entries
) {
675 if (&e
->list_node
!= list_front(&slave
->entries
)) {
676 ds_put_cstr(&ds
, " + ");
678 ds_put_format(&ds
, "h%td: %"PRIu64
"kB",
679 e
- bond
->hash
, e
->tx_bytes
/ 1024);
681 ds_put_cstr(&ds
, ")");
684 VLOG_DBG("bond %s:%s", bond
->name
, ds_cstr(&ds
));
689 /* Shifts 'hash' from its current slave to 'to'. */
691 bond_shift_load(struct bond_entry
*hash
, struct bond_slave
*to
,
694 struct bond_slave
*from
= hash
->slave
;
695 struct bond
*bond
= from
->bond
;
696 uint64_t delta
= hash
->tx_bytes
;
698 VLOG_INFO("bond %s: shift %"PRIu64
"kB of load (with hash %td) "
699 "from %s to %s (now carrying %"PRIu64
"kB and "
700 "%"PRIu64
"kB load, respectively)",
701 bond
->name
, delta
/ 1024, hash
- bond
->hash
,
702 from
->name
, to
->name
,
703 (from
->tx_bytes
- delta
) / 1024,
704 (to
->tx_bytes
+ delta
) / 1024);
706 /* Shift load away from 'from' to 'to'. */
707 from
->tx_bytes
-= delta
;
708 to
->tx_bytes
+= delta
;
710 /* Arrange for flows to be revalidated. */
711 tag_set_add(set
, hash
->tag
);
713 hash
->tag
= tag_create_random();
716 /* Pick and returns a bond_entry to migrate to 'to' (the least-loaded slave),
717 * given that doing so must decrease the ratio of the load on the two slaves by
718 * at least 0.1. Returns NULL if there is no appropriate entry.
720 * The list of entries isn't sorted. I don't know of a reason to prefer to
721 * shift away small hashes or large hashes. */
722 static struct bond_entry
*
723 choose_entry_to_migrate(const struct bond_slave
*from
, uint64_t to_tx_bytes
)
725 struct bond_entry
*e
;
727 if (list_is_short(&from
->entries
)) {
728 /* 'from' carries no more than one MAC hash, so shifting load away from
729 * it would be pointless. */
733 LIST_FOR_EACH (e
, list_node
, &from
->entries
) {
734 double old_ratio
, new_ratio
;
737 if (to_tx_bytes
== 0) {
738 /* Nothing on the new slave, move it. */
743 old_ratio
= (double)from
->tx_bytes
/ to_tx_bytes
;
744 new_ratio
= (double)(from
->tx_bytes
- delta
) / (to_tx_bytes
+ delta
);
745 if (old_ratio
- new_ratio
> 0.1) {
746 /* Would decrease the ratio, move it. */
754 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
757 insert_bal(struct list
*bals
, struct bond_slave
*slave
)
759 struct bond_slave
*pos
;
761 LIST_FOR_EACH (pos
, bal_node
, bals
) {
762 if (slave
->tx_bytes
> pos
->tx_bytes
) {
766 list_insert(&pos
->bal_node
, &slave
->bal_node
);
769 /* Removes 'slave' from its current list and then inserts it into 'bals' so
770 * that descending order of 'tx_bytes' is maintained. */
772 reinsert_bal(struct list
*bals
, struct bond_slave
*slave
)
774 list_remove(&slave
->bal_node
);
775 insert_bal(bals
, slave
);
778 /* If 'bond' needs rebalancing, does so.
780 * The caller should have called bond_account() for each active flow, to ensure
781 * that flow data is consistently accounted at this point. */
783 bond_rebalance(struct bond
*bond
, struct tag_set
*tags
)
785 struct bond_slave
*slave
;
786 struct bond_entry
*e
;
789 if (!bond_is_balanced(bond
) || time_msec() < bond
->next_rebalance
) {
792 bond
->next_rebalance
= time_msec() + bond
->rebalance_interval
;
794 /* Add each bond_entry to its slave's 'entries' list.
795 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
796 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
798 list_init(&slave
->entries
);
800 for (e
= &bond
->hash
[0]; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
801 if (e
->slave
&& e
->tx_bytes
) {
802 e
->slave
->tx_bytes
+= e
->tx_bytes
;
803 list_push_back(&e
->slave
->entries
, &e
->list_node
);
807 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
809 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
810 * with a proper list sort algorithm. */
812 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
813 if (slave
->enabled
) {
814 insert_bal(&bals
, slave
);
817 log_bals(bond
, &bals
);
819 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
820 while (!list_is_short(&bals
)) {
821 struct bond_slave
*from
= bond_slave_from_bal_node(list_front(&bals
));
822 struct bond_slave
*to
= bond_slave_from_bal_node(list_back(&bals
));
825 overload
= from
->tx_bytes
- to
->tx_bytes
;
826 if (overload
< to
->tx_bytes
>> 5 || overload
< 100000) {
827 /* The extra load on 'from' (and all less-loaded slaves), compared
828 * to that of 'to' (the least-loaded slave), is less than ~3%, or
829 * it is less than ~1Mbps. No point in rebalancing. */
833 /* 'from' is carrying significantly more load than 'to', and that load
834 * is split across at least two different hashes. */
835 e
= choose_entry_to_migrate(from
, to
->tx_bytes
);
837 bond_shift_load(e
, to
, tags
);
839 /* Delete element from from->entries.
841 * We don't add the element to to->hashes. That would only allow
842 * 'e' to be migrated to another slave in this rebalancing run, and
843 * there is no point in doing that. */
844 list_remove(&e
->list_node
);
846 /* Re-sort 'bals'. */
847 reinsert_bal(&bals
, from
);
848 reinsert_bal(&bals
, to
);
850 /* Can't usefully migrate anything away from 'from'.
851 * Don't reconsider it. */
852 list_remove(&from
->bal_node
);
856 /* Implement exponentially weighted moving average. A weight of 1/2 causes
857 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
858 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
859 for (e
= &bond
->hash
[0]; e
<= &bond
->hash
[BOND_MASK
]; e
++) {
867 /* Bonding unixctl user interface functions. */
870 bond_find(const char *name
)
874 HMAP_FOR_EACH_WITH_HASH (bond
, hmap_node
, hash_string(name
, 0),
876 if (!strcmp(bond
->name
, name
)) {
883 static struct bond_slave
*
884 bond_lookup_slave(struct bond
*bond
, const char *slave_name
)
886 struct bond_slave
*slave
;
888 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
889 if (!strcmp(slave
->name
, slave_name
)) {
897 bond_unixctl_list(struct unixctl_conn
*conn
,
898 const char *args OVS_UNUSED
, void *aux OVS_UNUSED
)
900 struct ds ds
= DS_EMPTY_INITIALIZER
;
901 const struct bond
*bond
;
903 ds_put_cstr(&ds
, "bond\ttype\tslaves\n");
905 HMAP_FOR_EACH (bond
, hmap_node
, &all_bonds
) {
906 const struct bond_slave
*slave
;
909 ds_put_format(&ds
, "%s\t%s\t",
910 bond
->name
, bond_mode_to_string(bond
->balance
));
913 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
915 ds_put_cstr(&ds
, ", ");
917 ds_put_cstr(&ds
, slave
->name
);
919 ds_put_char(&ds
, '\n');
921 unixctl_command_reply(conn
, 200, ds_cstr(&ds
));
926 bond_unixctl_show(struct unixctl_conn
*conn
,
927 const char *args
, void *aux OVS_UNUSED
)
929 struct ds ds
= DS_EMPTY_INITIALIZER
;
930 const struct bond_slave
*slave
;
931 const struct bond
*bond
;
933 bond
= bond_find(args
);
935 unixctl_command_reply(conn
, 501, "no such bond");
939 ds_put_format(&ds
, "bond_mode: %s\n",
940 bond_mode_to_string(bond
->balance
));
942 if (bond
->balance
!= BM_AB
) {
943 ds_put_format(&ds
, "bond-hash-algorithm: %s\n",
944 bond_is_tcp_hash(bond
) ? "balance-tcp" : "balance-slb");
947 ds_put_format(&ds
, "bond-hash-basis: %"PRIu32
"\n", bond
->basis
);
949 ds_put_format(&ds
, "updelay: %d ms\n", bond
->updelay
);
950 ds_put_format(&ds
, "downdelay: %d ms\n", bond
->downdelay
);
952 if (bond_is_balanced(bond
)) {
953 ds_put_format(&ds
, "next rebalance: %lld ms\n",
954 bond
->next_rebalance
- time_msec());
957 ds_put_format(&ds
, "lacp_negotiated: %s\n",
958 bond
->lacp_negotiated
? "true" : "false");
960 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
961 struct bond_entry
*be
;
964 ds_put_format(&ds
, "\nslave %s: %s\n",
965 slave
->name
, slave
->enabled
? "enabled" : "disabled");
966 if (slave
== bond
->active_slave
) {
967 ds_put_cstr(&ds
, "\tactive slave\n");
969 if (slave
->delay_expires
!= LLONG_MAX
) {
970 ds_put_format(&ds
, "\t%s expires in %lld ms\n",
971 slave
->enabled
? "downdelay" : "updelay",
972 slave
->delay_expires
- time_msec());
975 ds_put_format(&ds
, "\tmay_enable: %s\n",
976 slave
->may_enable
? "true" : "false");
978 if (!bond_is_balanced(bond
)) {
983 for (be
= bond
->hash
; be
<= &bond
->hash
[BOND_MASK
]; be
++) {
984 int hash
= be
- bond
->hash
;
986 if (be
->slave
!= slave
) {
990 ds_put_format(&ds
, "\thash %d: %"PRIu64
" kB load\n",
991 hash
, be
->tx_bytes
/ 1024);
993 if (bond
->balance
!= BM_SLB
) {
997 /* XXX How can we list the MACs assigned to hashes? */
1000 unixctl_command_reply(conn
, 200, ds_cstr(&ds
));
1005 bond_unixctl_migrate(struct unixctl_conn
*conn
, const char *args_
,
1006 void *aux OVS_UNUSED
)
1008 char *args
= (char *) args_
;
1009 char *save_ptr
= NULL
;
1010 char *bond_s
, *hash_s
, *slave_s
;
1012 struct bond_slave
*slave
;
1013 struct bond_entry
*entry
;
1016 bond_s
= strtok_r(args
, " ", &save_ptr
);
1017 hash_s
= strtok_r(NULL
, " ", &save_ptr
);
1018 slave_s
= strtok_r(NULL
, " ", &save_ptr
);
1020 unixctl_command_reply(conn
, 501,
1021 "usage: bond/migrate BOND HASH SLAVE");
1025 bond
= bond_find(bond_s
);
1027 unixctl_command_reply(conn
, 501, "no such bond");
1031 if (bond
->balance
!= BM_SLB
) {
1032 unixctl_command_reply(conn
, 501, "not an SLB bond");
1036 if (strspn(hash_s
, "0123456789") == strlen(hash_s
)) {
1037 hash
= atoi(hash_s
) & BOND_MASK
;
1039 unixctl_command_reply(conn
, 501, "bad hash");
1043 slave
= bond_lookup_slave(bond
, slave_s
);
1045 unixctl_command_reply(conn
, 501, "no such slave");
1049 if (!slave
->enabled
) {
1050 unixctl_command_reply(conn
, 501, "cannot migrate to disabled slave");
1054 entry
= &bond
->hash
[hash
];
1055 tag_set_add(&bond
->unixctl_tags
, entry
->tag
);
1056 entry
->slave
= slave
;
1057 entry
->tag
= tag_create_random();
1058 unixctl_command_reply(conn
, 200, "migrated");
1062 bond_unixctl_set_active_slave(struct unixctl_conn
*conn
, const char *args_
,
1063 void *aux OVS_UNUSED
)
1065 char *args
= (char *) args_
;
1066 char *save_ptr
= NULL
;
1067 char *bond_s
, *slave_s
;
1069 struct bond_slave
*slave
;
1071 bond_s
= strtok_r(args
, " ", &save_ptr
);
1072 slave_s
= strtok_r(NULL
, " ", &save_ptr
);
1074 unixctl_command_reply(conn
, 501,
1075 "usage: bond/set-active-slave BOND SLAVE");
1079 bond
= bond_find(bond_s
);
1081 unixctl_command_reply(conn
, 501, "no such bond");
1085 slave
= bond_lookup_slave(bond
, slave_s
);
1087 unixctl_command_reply(conn
, 501, "no such slave");
1091 if (!slave
->enabled
) {
1092 unixctl_command_reply(conn
, 501, "cannot make disabled slave active");
1096 if (bond
->active_slave
!= slave
) {
1097 tag_set_add(&bond
->unixctl_tags
, bond_get_active_slave_tag(bond
));
1098 bond
->active_slave
= slave
;
1099 bond
->active_slave
->tag
= tag_create_random();
1100 VLOG_INFO("bond %s: active interface is now %s",
1101 bond
->name
, slave
->name
);
1102 bond
->send_learning_packets
= true;
1103 unixctl_command_reply(conn
, 200, "done");
1105 unixctl_command_reply(conn
, 200, "no change");
1110 enable_slave(struct unixctl_conn
*conn
, const char *args_
, bool enable
)
1112 char *args
= (char *) args_
;
1113 char *save_ptr
= NULL
;
1114 char *bond_s
, *slave_s
;
1116 struct bond_slave
*slave
;
1118 bond_s
= strtok_r(args
, " ", &save_ptr
);
1119 slave_s
= strtok_r(NULL
, " ", &save_ptr
);
1121 char *usage
= xasprintf("usage: bond/%s-slave BOND SLAVE",
1122 enable
? "enable" : "disable");
1123 unixctl_command_reply(conn
, 501, usage
);
1128 bond
= bond_find(bond_s
);
1130 unixctl_command_reply(conn
, 501, "no such bond");
1134 slave
= bond_lookup_slave(bond
, slave_s
);
1136 unixctl_command_reply(conn
, 501, "no such slave");
1140 bond_enable_slave(slave
, enable
, &bond
->unixctl_tags
);
1141 unixctl_command_reply(conn
, 501, enable
? "enabled" : "disabled");
1145 bond_unixctl_enable_slave(struct unixctl_conn
*conn
, const char *args
,
1146 void *aux OVS_UNUSED
)
1148 enable_slave(conn
, args
, true);
1152 bond_unixctl_disable_slave(struct unixctl_conn
*conn
, const char *args
,
1153 void *aux OVS_UNUSED
)
1155 enable_slave(conn
, args
, false);
1159 bond_unixctl_hash(struct unixctl_conn
*conn
, const char *args_
,
1160 void *aux OVS_UNUSED
)
1162 char *args
= (char *) args_
;
1163 uint8_t mac
[ETH_ADDR_LEN
];
1168 char *mac_s
, *vlan_s
, *basis_s
;
1169 char *save_ptr
= NULL
;
1171 mac_s
= strtok_r(args
, " ", &save_ptr
);
1172 vlan_s
= strtok_r(NULL
, " ", &save_ptr
);
1173 basis_s
= strtok_r(NULL
, " ", &save_ptr
);
1176 if (sscanf(vlan_s
, "%u", &vlan
) != 1) {
1177 unixctl_command_reply(conn
, 501, "invalid vlan");
1185 if (sscanf(basis_s
, "%"PRIu32
, &basis
) != 1) {
1186 unixctl_command_reply(conn
, 501, "invalid basis");
1193 if (sscanf(mac_s
, ETH_ADDR_SCAN_FMT
, ETH_ADDR_SCAN_ARGS(mac
))
1194 == ETH_ADDR_SCAN_COUNT
) {
1195 hash
= bond_hash_src(mac
, vlan
, basis
) & BOND_MASK
;
1197 hash_cstr
= xasprintf("%u", hash
);
1198 unixctl_command_reply(conn
, 200, hash_cstr
);
1201 unixctl_command_reply(conn
, 501, "invalid mac");
/* Registers the bond unixctl commands.  Each call passes the command name, a
 * string listing the command's arguments, the handler function, and NULL as
 * the last argument (the handlers below mark their 'aux' parameter as
 * unused). */
1208 unixctl_command_register("bond/list", "", bond_unixctl_list
, NULL
);
1209 unixctl_command_register("bond/show", "port", bond_unixctl_show
, NULL
);
1210 unixctl_command_register("bond/migrate", "port hash slave",
1211 bond_unixctl_migrate
, NULL
);
1212 unixctl_command_register("bond/set-active-slave", "port slave",
1213 bond_unixctl_set_active_slave
, NULL
);
1214 unixctl_command_register("bond/enable-slave", "port slave",
1215 bond_unixctl_enable_slave
, NULL
);
1216 unixctl_command_register("bond/disable-slave", "port slave",
1217 bond_unixctl_disable_slave
, NULL
);
1218 unixctl_command_register("bond/hash", "mac [vlan] [basis]",
1219 bond_unixctl_hash
, NULL
);
1223 bond_entry_reset(struct bond
*bond
)
1225 if (bond
->balance
!= BM_AB
) {
1226 size_t hash_len
= (BOND_MASK
+ 1) * sizeof *bond
->hash
;
1229 bond
->hash
= xmalloc(hash_len
);
1231 memset(bond
->hash
, 0, hash_len
);
1233 bond
->next_rebalance
= time_msec() + bond
->rebalance_interval
;
1240 static struct bond_slave
*
1241 bond_slave_lookup(struct bond
*bond
, const void *slave_
)
1243 struct bond_slave
*slave
;
1245 HMAP_FOR_EACH_IN_BUCKET (slave
, hmap_node
, hash_pointer(slave_
, 0),
1247 if (slave
->aux
== slave_
) {
1256 bond_enable_slave(struct bond_slave
*slave
, bool enable
, struct tag_set
*tags
)
1258 struct bond
*bond
= slave
->bond
;
1259 slave
->delay_expires
= LLONG_MAX
;
1260 if (enable
!= slave
->enabled
) {
1261 slave
->enabled
= enable
;
1262 if (!slave
->enabled
) {
1263 VLOG_WARN("interface %s: disabled", slave
->name
);
1265 tag_set_add(tags
, slave
->tag
);
1268 VLOG_WARN("interface %s: enabled", slave
->name
);
1269 slave
->tag
= tag_create_random();
1272 if (bond
->balance
== BM_STABLE
) {
1273 bond
->bond_revalidate
= true;
1279 bond_link_status_update(struct bond_slave
*slave
, struct tag_set
*tags
)
1281 struct bond
*bond
= slave
->bond
;
1284 up
= netdev_get_carrier(slave
->netdev
) && slave
->may_enable
;
1285 if ((up
== slave
->enabled
) != (slave
->delay_expires
== LLONG_MAX
)) {
1286 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
1287 VLOG_INFO_RL(&rl
, "interface %s: link state %s",
1288 slave
->name
, up
? "up" : "down");
1289 if (up
== slave
->enabled
) {
1290 slave
->delay_expires
= LLONG_MAX
;
1291 VLOG_INFO_RL(&rl
, "interface %s: will not be %s",
1292 slave
->name
, up
? "disabled" : "enabled");
1294 int delay
= (bond
->lacp_negotiated
? 0
1295 : up
? bond
->updelay
: bond
->downdelay
);
1296 slave
->delay_expires
= time_msec() + delay
;
1298 VLOG_INFO_RL(&rl
, "interface %s: will be %s if it stays %s "
1301 up
? "enabled" : "disabled",
1308 if (time_msec() >= slave
->delay_expires
) {
1309 bond_enable_slave(slave
, up
, tags
);
1314 bond_is_tcp_hash(const struct bond
*bond
)
1316 return (bond
->balance
== BM_TCP
&& bond
->lacp_negotiated
)
1317 || bond
->balance
== BM_STABLE
;
1321 bond_hash_src(const uint8_t mac
[ETH_ADDR_LEN
], uint16_t vlan
, uint32_t basis
)
1323 return hash_3words(hash_bytes(mac
, ETH_ADDR_LEN
, 0), vlan
, basis
);
1327 bond_hash_tcp(const struct flow
*flow
, uint16_t vlan
, uint32_t basis
)
1329 struct flow hash_flow
= *flow
;
1330 hash_flow
.vlan_tci
= htons(vlan
);
1332 /* The symmetric quality of this hash function is not required, but
1333 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1334 * purposes, so we use it out of convenience. */
1335 return flow_hash_symmetric_l4(&hash_flow
, basis
);
1339 bond_hash(const struct bond
*bond
, const struct flow
*flow
, uint16_t vlan
)
1341 assert(bond
->balance
!= BM_AB
);
1343 return (bond_is_tcp_hash(bond
)
1344 ? bond_hash_tcp(flow
, vlan
, bond
->basis
)
1345 : bond_hash_src(flow
->dl_src
, vlan
, bond
->basis
));
1348 static struct bond_entry
*
1349 lookup_bond_entry(const struct bond
*bond
, const struct flow
*flow
,
1352 return &bond
->hash
[bond_hash(bond
, flow
, vlan
) & BOND_MASK
];
1355 /* This function uses Highest Random Weight hashing to choose an output slave.
1356 * This approach only reassigns a minimal number of flows when slaves are
1357 * enabled or disabled. Unfortunately, it has O(n) performance against the
1358 * number of slaves. There exist algorithms which are O(1), but have slightly
1359 * more complex implementations and require the use of memory. This may need
1360 * to be reimplemented if it becomes a performance bottleneck. */
1361 static struct bond_slave
*
1362 choose_stb_slave(const struct bond
*bond
, const struct flow
*flow
,
1365 struct bond_slave
*best
, *slave
;
1366 uint32_t best_hash
, flow_hash
;
1370 flow_hash
= bond_hash(bond
, flow
, vlan
);
1371 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1372 if (slave
->enabled
) {
1375 hash
= hash_2words(flow_hash
, slave
->stb_id
);
1376 if (!best
|| hash
> best_hash
) {
1386 static struct bond_slave
*
1387 choose_output_slave(const struct bond
*bond
, const struct flow
*flow
,
1390 struct bond_entry
*e
;
1392 switch (bond
->balance
) {
1394 return bond
->active_slave
;
1397 return choose_stb_slave(bond
, flow
, vlan
);
1400 e
= lookup_bond_entry(bond
, flow
, vlan
);
1401 if (!e
->slave
|| !e
->slave
->enabled
) {
1402 e
->slave
= CONTAINER_OF(hmap_random_node(&bond
->slaves
),
1403 struct bond_slave
, hmap_node
);
1404 if (!e
->slave
->enabled
) {
1405 e
->slave
= bond
->active_slave
;
1407 e
->tag
= tag_create_random();
1416 static struct bond_slave
*
1417 bond_choose_slave(const struct bond
*bond
)
1419 struct bond_slave
*slave
, *best
;
1421 /* Find an enabled slave. */
1422 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1423 if (slave
->enabled
) {
1428 /* All interfaces are disabled. Find an interface that will be enabled
1429 * after its updelay expires. */
1431 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1432 if (slave
->delay_expires
!= LLONG_MAX
1433 && slave
->may_enable
1434 && (!best
|| slave
->delay_expires
< best
->delay_expires
)) {
1442 bond_choose_active_slave(struct bond
*bond
, struct tag_set
*tags
)
1444 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
1445 struct bond_slave
*old_active_slave
= bond
->active_slave
;
1447 bond
->active_slave
= bond_choose_slave(bond
);
1448 if (bond
->active_slave
) {
1449 if (bond
->active_slave
->enabled
) {
1450 VLOG_INFO_RL(&rl
, "bond %s: active interface is now %s",
1451 bond
->name
, bond
->active_slave
->name
);
1453 VLOG_INFO_RL(&rl
, "bond %s: active interface is now %s, skipping "
1454 "remaining %lld ms updelay (since no interface was "
1455 "enabled)", bond
->name
, bond
->active_slave
->name
,
1456 bond
->active_slave
->delay_expires
- time_msec());
1457 bond_enable_slave(bond
->active_slave
, true, tags
);
1460 if (!old_active_slave
) {
1461 tag_set_add(tags
, bond
->no_slaves_tag
);
1464 bond
->send_learning_packets
= true;
1465 } else if (old_active_slave
) {
1466 VLOG_WARN_RL(&rl
, "bond %s: all interfaces disabled", bond
->name
);
1470 /* Returns the tag for 'bond''s active slave, or 'bond''s no_slaves_tag if
1471 * there is no active slave. */
1473 bond_get_active_slave_tag(const struct bond
*bond
)
1475 return (bond
->active_slave
1476 ? bond
->active_slave
->tag
1477 : bond
->no_slaves_tag
);
1480 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1481 * bond interface. */
1483 bond_update_fake_slave_stats(struct bond
*bond
)
1485 struct netdev_stats bond_stats
;
1486 struct bond_slave
*slave
;
1487 struct netdev
*bond_dev
;
1489 memset(&bond_stats
, 0, sizeof bond_stats
);
1491 HMAP_FOR_EACH (slave
, hmap_node
, &bond
->slaves
) {
1492 struct netdev_stats slave_stats
;
1494 if (!netdev_get_stats(slave
->netdev
, &slave_stats
)) {
1495 /* XXX: We swap the stats here because they are swapped back when
1496 * reported by the internal device. The reason for this is
1497 * internal devices normally represent packets going into the
1498 * system but when used as fake bond device they represent packets
1499 * leaving the system. We really should do this in the internal
1500 * device itself because changing it here reverses the counts from
1501 * the perspective of the switch. However, the internal device
1502 * doesn't know what type of device it represents so we have to do
1503 * it here for now. */
1504 bond_stats
.tx_packets
+= slave_stats
.rx_packets
;
1505 bond_stats
.tx_bytes
+= slave_stats
.rx_bytes
;
1506 bond_stats
.rx_packets
+= slave_stats
.tx_packets
;
1507 bond_stats
.rx_bytes
+= slave_stats
.tx_bytes
;
1511 if (!netdev_open(bond
->name
, "system", &bond_dev
)) {
1512 netdev_set_stats(bond_dev
, &bond_stats
);
1513 netdev_close(bond_dev
);