]> git.proxmox.com Git - ovs.git/blob - lib/bond.c
Replace most uses of assert by ovs_assert.
[ovs.git] / lib / bond.c
1 /*
2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "bond.h"
20
21 #include <limits.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <math.h>
25
26 #include "coverage.h"
27 #include "dynamic-string.h"
28 #include "flow.h"
29 #include "hmap.h"
30 #include "lacp.h"
31 #include "list.h"
32 #include "netdev.h"
33 #include "odp-util.h"
34 #include "ofpbuf.h"
35 #include "packets.h"
36 #include "poll-loop.h"
37 #include "shash.h"
38 #include "tag.h"
39 #include "timeval.h"
40 #include "unixctl.h"
41 #include "vlog.h"
42
VLOG_DEFINE_THIS_MODULE(bond);

/* Bit-mask for hashing a flow down to a bucket.
 * There are (BOND_MASK + 1) buckets, so every walk over a bond's 'hash' array
 * in this file covers indexes 0 through BOND_MASK inclusive. */
#define BOND_MASK 0xff
48
/* A hash bucket for mapping a flow to a slave.
 * "struct bond" has an array of (BOND_MASK + 1) of these.
 *
 * bond_rebalance() halves 'tx_bytes' on every run and clears 'slave' once the
 * count has decayed to zero. */
struct bond_entry {
    struct bond_slave *slave;   /* Assigned slave, NULL if unassigned. */
    uint64_t tx_bytes;          /* Count of bytes recently transmitted. */
    tag_type tag;               /* Tag for entry<->facet association. */
    struct list list_node;      /* In bond_slave's 'entries' list. */
};
57
/* A bond slave, that is, one of the links comprising a bond.
 *
 * Instances are allocated by bond_slave_register() and freed by
 * bond_slave_unregister() or bond_destroy(). */
struct bond_slave {
    struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
    struct bond *bond;          /* The bond that contains this slave. */
    void *aux;                  /* Client-provided handle for this slave. */

    struct netdev *netdev;      /* Network device, owned by the client. */
    unsigned int change_seq;    /* Tracks changes in 'netdev'. */
    char *name;                 /* Name (a copy of netdev_get_name(netdev)). */

    /* Link status. */
    long long delay_expires;    /* Time after which 'enabled' may change.
                                 * LLONG_MAX when no timer is armed. */
    bool enabled;               /* May be chosen for flows? */
    bool may_enable;            /* Client considers this slave bondable. */
    tag_type tag;               /* Tag associated with this slave. */

    /* Rebalancing info.  Used only by bond_rebalance(). */
    struct list bal_node;       /* In bond_rebalance()'s 'bals' list. */
    struct list entries;        /* 'struct bond_entry's assigned here. */
    uint64_t tx_bytes;          /* Sum across 'tx_bytes' of entries. */

    /* BM_STABLE specific bonding info. */
    uint32_t stb_id;            /* ID used for 'stb_slaves' ordering. */
};
82
/* A bond, that is, a set of network devices grouped to improve performance or
 * robustness.
 *
 * Created by bond_create(), configured by bond_reconfigure() and released by
 * bond_destroy(). */
struct bond {
    struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
    char *name;                 /* Name provided by client. */

    /* Slaves. */
    struct hmap slaves;         /* Contains "struct bond_slave"s, hashed on
                                 * the client's slave pointer. */

    /* Bonding info. */
    enum bond_mode balance;     /* Balancing mode, one of BM_*. */
    struct bond_slave *active_slave;
    tag_type no_slaves_tag;     /* Tag for flows when all slaves disabled. */
    int updelay, downdelay;     /* Delay before slave goes up/down, in ms. */
    enum lacp_status lacp_status; /* Status of LACP negotiations. */
    bool bond_revalidate;       /* True if flows need revalidation. */
    uint32_t basis;             /* Basis for flow hash function. */

    /* SLB specific bonding info. */
    struct bond_entry *hash;    /* An array of (BOND_MASK + 1) elements. */
    int rebalance_interval;     /* Interval between rebalances, in ms. */
    long long int next_rebalance; /* Next rebalancing time. */
    bool send_learning_packets; /* Read and cleared by
                                 * bond_should_send_learning_packets(). */

    /* BM_STABLE specific bonding info. */
    tag_type stb_tag;           /* Tag associated with this bond. */

    /* Legacy compatibility. */
    long long int next_fake_iface_update; /* LLONG_MAX if disabled. */

    /* Tag set saved for next bond_run().  This tag set is a kluge for cases
     * where we can't otherwise provide revalidation feedback to the client.
     * That's only unixctl commands now; I hope no other cases will arise. */
    struct tag_set unixctl_tags;
};
118
/* Every bond, hashed on hash_string(name, 0).  Bonds are inserted by
 * bond_reconfigure() and removed by bond_destroy(). */
static struct hmap all_bonds = HMAP_INITIALIZER(&all_bonds);

/* Forward declarations of file-local helpers that are used before the point
 * where they are defined. */
static void bond_entry_reset(struct bond *);
static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_);
static void bond_enable_slave(struct bond_slave *, bool enable,
                              struct tag_set *);
static void bond_link_status_update(struct bond_slave *, struct tag_set *);
static void bond_choose_active_slave(struct bond *, struct tag_set *);
static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
                                  uint16_t vlan, uint32_t basis);
static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
                                  uint32_t basis);
static struct bond_entry *lookup_bond_entry(const struct bond *,
                                            const struct flow *,
                                            uint16_t vlan);
static tag_type bond_get_active_slave_tag(const struct bond *);
static struct bond_slave *choose_output_slave(const struct bond *,
                                              const struct flow *,
                                              uint16_t vlan, tag_type *tags);
static void bond_update_fake_slave_stats(struct bond *);
139
140 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
141 * stores the mode in '*balance' and returns true. Otherwise returns false
142 * without modifying '*balance'. */
143 bool
144 bond_mode_from_string(enum bond_mode *balance, const char *s)
145 {
146 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
147 *balance = BM_TCP;
148 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
149 *balance = BM_SLB;
150 } else if (!strcmp(s, bond_mode_to_string(BM_STABLE))) {
151 *balance = BM_STABLE;
152 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
153 *balance = BM_AB;
154 } else {
155 return false;
156 }
157 return true;
158 }
159
160 /* Returns a string representing 'balance'. */
161 const char *
162 bond_mode_to_string(enum bond_mode balance) {
163 switch (balance) {
164 case BM_TCP:
165 return "balance-tcp";
166 case BM_SLB:
167 return "balance-slb";
168 case BM_STABLE:
169 return "stable";
170 case BM_AB:
171 return "active-backup";
172 }
173 NOT_REACHED();
174 }
175
176 \f
177 /* Creates and returns a new bond whose configuration is initially taken from
178 * 's'.
179 *
180 * The caller should register each slave on the new bond by calling
181 * bond_slave_register(). */
182 struct bond *
183 bond_create(const struct bond_settings *s)
184 {
185 struct bond *bond;
186
187 bond = xzalloc(sizeof *bond);
188 hmap_init(&bond->slaves);
189 bond->no_slaves_tag = tag_create_random();
190 bond->stb_tag = tag_create_random();
191 bond->next_fake_iface_update = LLONG_MAX;
192
193 bond_reconfigure(bond, s);
194
195 tag_set_init(&bond->unixctl_tags);
196
197 return bond;
198 }
199
200 /* Frees 'bond'. */
201 void
202 bond_destroy(struct bond *bond)
203 {
204 struct bond_slave *slave, *next_slave;
205
206 if (!bond) {
207 return;
208 }
209
210 hmap_remove(&all_bonds, &bond->hmap_node);
211
212 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
213 hmap_remove(&bond->slaves, &slave->hmap_node);
214 /* Client owns 'slave->netdev'. */
215 free(slave->name);
216 free(slave);
217 }
218 hmap_destroy(&bond->slaves);
219
220 free(bond->hash);
221 free(bond->name);
222 free(bond);
223 }
224
225 /* Updates 'bond''s overall configuration to 's'.
226 *
227 * The caller should register each slave on 'bond' by calling
228 * bond_slave_register(). This is optional if none of the slaves'
229 * configuration has changed. In any case it can't hurt.
230 *
231 * Returns true if the configuration has changed in such a way that requires
232 * flow revalidation.
233 * */
234 bool
235 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
236 {
237 bool revalidate = false;
238
239 if (!bond->name || strcmp(bond->name, s->name)) {
240 if (bond->name) {
241 hmap_remove(&all_bonds, &bond->hmap_node);
242 free(bond->name);
243 }
244 bond->name = xstrdup(s->name);
245 hmap_insert(&all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
246 }
247
248 bond->updelay = s->up_delay;
249 bond->downdelay = s->down_delay;
250
251 if (bond->rebalance_interval != s->rebalance_interval) {
252 bond->rebalance_interval = s->rebalance_interval;
253 revalidate = true;
254 }
255
256 if (bond->balance != s->balance) {
257 bond->balance = s->balance;
258 revalidate = true;
259
260 if (bond->balance == BM_STABLE) {
261 VLOG_WARN_ONCE("Stable bond mode is deprecated and may be removed"
262 " in February 2013. Please email"
263 " dev@openvswitch.org with concerns.");
264 }
265 }
266
267 if (bond->basis != s->basis) {
268 bond->basis = s->basis;
269 revalidate = true;
270 }
271
272 if (s->fake_iface) {
273 if (bond->next_fake_iface_update == LLONG_MAX) {
274 bond->next_fake_iface_update = time_msec();
275 }
276 } else {
277 bond->next_fake_iface_update = LLONG_MAX;
278 }
279
280 if (bond->bond_revalidate) {
281 revalidate = true;
282 bond->bond_revalidate = false;
283 }
284
285 if (bond->balance == BM_AB || !bond->hash || revalidate) {
286 bond_entry_reset(bond);
287 }
288
289 return revalidate;
290 }
291
292 static void
293 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
294 {
295 if (slave->netdev != netdev) {
296 slave->netdev = netdev;
297 slave->change_seq = 0;
298 }
299 }
300
301 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
302 * arbitrary client-provided pointer that uniquely identifies a slave within a
303 * bond. If 'slave_' already exists within 'bond' then this function
304 * reconfigures the existing slave.
305 *
306 * 'stb_id' is used in BM_STABLE bonds to guarantee consistent slave choices
307 * across restarts and distributed vswitch instances. It should be unique per
308 * slave, and preferably consistent across restarts and reconfigurations.
309 *
310 * 'netdev' must be the network device that 'slave_' represents. It is owned
311 * by the client, so the client must not close it before either unregistering
312 * 'slave_' or destroying 'bond'.
313 */
314 void
315 bond_slave_register(struct bond *bond, void *slave_, uint32_t stb_id,
316 struct netdev *netdev)
317 {
318 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
319
320 if (!slave) {
321 slave = xzalloc(sizeof *slave);
322
323 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
324 slave->bond = bond;
325 slave->aux = slave_;
326 slave->delay_expires = LLONG_MAX;
327 slave->name = xstrdup(netdev_get_name(netdev));
328 bond->bond_revalidate = true;
329
330 slave->enabled = false;
331 bond_enable_slave(slave, netdev_get_carrier(netdev), NULL);
332 }
333
334 if (slave->stb_id != stb_id) {
335 slave->stb_id = stb_id;
336 bond->bond_revalidate = true;
337 }
338
339 bond_slave_set_netdev__(slave, netdev);
340
341 free(slave->name);
342 slave->name = xstrdup(netdev_get_name(netdev));
343 }
344
/* Replaces the network device associated with 'slave_' by 'netdev'.  Has no
 * effect if 'slave_' is not registered on 'bond'.
 *
 * Intended for callers that close and re-open the device they passed to
 * bond_slave_register() without changing anything else. */
void
bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
{
    struct bond_slave *slave = bond_slave_lookup(bond, slave_);

    if (slave == NULL) {
        return;
    }
    bond_slave_set_netdev__(slave, netdev);
}
358
359 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
360 * then this function has no effect.
361 *
362 * Unregistering a slave invalidates all flows. */
363 void
364 bond_slave_unregister(struct bond *bond, const void *slave_)
365 {
366 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
367 bool del_active;
368
369 if (!slave) {
370 return;
371 }
372
373 bond_enable_slave(slave, false, NULL);
374
375 del_active = bond->active_slave == slave;
376 if (bond->hash) {
377 struct bond_entry *e;
378 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
379 if (e->slave == slave) {
380 e->slave = NULL;
381 }
382 }
383 }
384
385 free(slave->name);
386
387 hmap_remove(&bond->slaves, &slave->hmap_node);
388 /* Client owns 'slave->netdev'. */
389 free(slave);
390
391 if (del_active) {
392 struct tag_set tags;
393
394 tag_set_init(&tags);
395 bond_choose_active_slave(bond, &tags);
396 bond->send_learning_packets = true;
397 }
398 }
399
400 /* Should be called on each slave in 'bond' before bond_run() to indicate
401 * whether or not 'slave_' may be enabled. This function is intended to allow
402 * other protocols to have some impact on bonding decisions. For example LACP
403 * or high level link monitoring protocols may decide that a given slave should
404 * not be able to send traffic. */
405 void
406 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
407 {
408 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
409 }
410
411 /* Performs periodic maintenance on 'bond'. The caller must provide 'tags' to
412 * allow tagged flows to be invalidated.
413 *
414 * The caller should check bond_should_send_learning_packets() afterward. */
415 void
416 bond_run(struct bond *bond, struct tag_set *tags, enum lacp_status lacp_status)
417 {
418 struct bond_slave *slave;
419
420 if (bond->lacp_status != lacp_status) {
421 bond->lacp_status = lacp_status;
422 bond->bond_revalidate = true;
423 }
424
425 /* Enable slaves based on link status and LACP feedback. */
426 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
427 bond_link_status_update(slave, tags);
428 slave->change_seq = netdev_change_seq(slave->netdev);
429 }
430 if (!bond->active_slave || !bond->active_slave->enabled) {
431 bond_choose_active_slave(bond, tags);
432 }
433
434 /* Update fake bond interface stats. */
435 if (time_msec() >= bond->next_fake_iface_update) {
436 bond_update_fake_slave_stats(bond);
437 bond->next_fake_iface_update = time_msec() + 1000;
438 }
439
440 if (bond->bond_revalidate) {
441 bond->bond_revalidate = false;
442
443 bond_entry_reset(bond);
444 if (bond->balance != BM_STABLE) {
445 struct bond_slave *slave;
446
447 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
448 tag_set_add(tags, slave->tag);
449 }
450 } else {
451 tag_set_add(tags, bond->stb_tag);
452 }
453 tag_set_add(tags, bond->no_slaves_tag);
454 }
455
456 /* Invalidate any tags required by */
457 tag_set_union(tags, &bond->unixctl_tags);
458 tag_set_init(&bond->unixctl_tags);
459 }
460
461 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
462 void
463 bond_wait(struct bond *bond)
464 {
465 struct bond_slave *slave;
466
467 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
468 if (slave->delay_expires != LLONG_MAX) {
469 poll_timer_wait_until(slave->delay_expires);
470 }
471
472 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
473 poll_immediate_wake();
474 }
475 }
476
477 if (bond->next_fake_iface_update != LLONG_MAX) {
478 poll_timer_wait_until(bond->next_fake_iface_update);
479 }
480
481 /* Ensure that any saved tags get revalidated right away. */
482 if (!tag_set_is_empty(&bond->unixctl_tags)) {
483 poll_immediate_wake();
484 }
485
486 /* We don't wait for bond->next_rebalance because rebalancing can only run
487 * at a flow account checkpoint. ofproto does checkpointing on its own
488 * schedule and bond_rebalance() gets called afterward, so we'd just be
489 * waking up for no purpose. */
490 }
491 \f
492 /* MAC learning table interaction. */
493
494 static bool
495 may_send_learning_packets(const struct bond *bond)
496 {
497 return bond->lacp_status == LACP_DISABLED
498 && (bond->balance == BM_SLB || bond->balance == BM_AB)
499 && bond->active_slave;
500 }
501
502 /* Returns true if 'bond' needs the client to send out packets to assist with
503 * MAC learning on 'bond'. If this function returns true, then the client
504 * should iterate through its MAC learning table for the bridge on which 'bond'
505 * is located. For each MAC that has been learned on a port other than 'bond',
506 * it should call bond_compose_learning_packet().
507 *
508 * This function will only return true if 'bond' is in SLB or active-backup
509 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
510 * necessary.
511 *
512 * Calling this function resets the state that it checks. */
513 bool
514 bond_should_send_learning_packets(struct bond *bond)
515 {
516 bool send = bond->send_learning_packets && may_send_learning_packets(bond);
517 bond->send_learning_packets = false;
518 return send;
519 }
520
521 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
522 *
523 * See bond_should_send_learning_packets() for description of usage. The
524 * caller should send the composed packet on the port associated with
525 * port_aux and takes ownership of the returned ofpbuf. */
526 struct ofpbuf *
527 bond_compose_learning_packet(struct bond *bond,
528 const uint8_t eth_src[ETH_ADDR_LEN],
529 uint16_t vlan, void **port_aux)
530 {
531 struct bond_slave *slave;
532 struct ofpbuf *packet;
533 tag_type tags = 0;
534 struct flow flow;
535
536 ovs_assert(may_send_learning_packets(bond));
537
538 memset(&flow, 0, sizeof flow);
539 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
540 slave = choose_output_slave(bond, &flow, vlan, &tags);
541
542 packet = ofpbuf_new(0);
543 compose_rarp(packet, eth_src);
544 if (vlan) {
545 eth_push_vlan(packet, htons(vlan));
546 }
547
548 *port_aux = slave->aux;
549 return packet;
550 }
551 \f
552 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
553 * Ethernet destination address of 'eth_dst', should be admitted.
554 *
555 * The return value is one of the following:
556 *
557 * - BV_ACCEPT: Admit the packet.
558 *
559 * - BV_DROP: Drop the packet.
560 *
561 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
562 * Ethernet source address and VLAN. If there is none, or if the packet
563 * is on the learned port, then admit the packet. If a different port has
564 * been learned, however, drop the packet (and do not use it for MAC
565 * learning).
566 */
567 enum bond_verdict
568 bond_check_admissibility(struct bond *bond, const void *slave_,
569 const uint8_t eth_dst[ETH_ADDR_LEN], tag_type *tags)
570 {
571 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
572
573 /* LACP bonds have very loose admissibility restrictions because we can
574 * assume the remote switch is aware of the bond and will "do the right
575 * thing". However, as a precaution we drop packets on disabled slaves
576 * because no correctly implemented partner switch should be sending
577 * packets to them.
578 *
579 * If LACP is configured, but LACP negotiations have been unsuccessful, we
580 * drop all incoming traffic. */
581 switch (bond->lacp_status) {
582 case LACP_NEGOTIATED: return slave->enabled ? BV_ACCEPT : BV_DROP;
583 case LACP_CONFIGURED: return BV_DROP;
584 case LACP_DISABLED: break;
585 }
586
587 /* Drop all multicast packets on inactive slaves. */
588 if (eth_addr_is_multicast(eth_dst)) {
589 *tags |= bond_get_active_slave_tag(bond);
590 if (bond->active_slave != bond_slave_lookup(bond, slave_)) {
591 return BV_DROP;
592 }
593 }
594
595 switch (bond->balance) {
596 case BM_AB:
597 /* Drop all packets which arrive on backup slaves. This is similar to
598 * how Linux bonding handles active-backup bonds. */
599 *tags |= bond_get_active_slave_tag(bond);
600 if (bond->active_slave != slave) {
601 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
602
603 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
604 " slave (%s) destined for " ETH_ADDR_FMT,
605 slave->name, ETH_ADDR_ARGS(eth_dst));
606 return BV_DROP;
607 }
608 return BV_ACCEPT;
609
610 case BM_TCP:
611 /* TCP balanced bonds require successful LACP negotiated. Based on the
612 * above check, LACP is off on this bond. Therfore, we drop all
613 * incoming traffic. */
614 return BV_DROP;
615
616 case BM_SLB:
617 /* Drop all packets for which we have learned a different input port,
618 * because we probably sent the packet on one slave and got it back on
619 * the other. Gratuitous ARP packets are an exception to this rule:
620 * the host has moved to another switch. The exception to the
621 * exception is if we locked the learning table to avoid reflections on
622 * bond slaves. */
623 return BV_DROP_IF_MOVED;
624
625 case BM_STABLE:
626 return BV_ACCEPT;
627 }
628
629 NOT_REACHED();
630 }
631
632 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
633 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
634 * NULL if the packet should be dropped because no slaves are enabled.
635 *
636 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
637 * should be a VID only (i.e. excluding the PCP bits). Second,
638 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
639 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
640 * packet belongs to (so for an access port it will be the access port's VLAN).
641 *
642 * Adds a tag to '*tags' that associates the flow with the returned slave.
643 */
644 void *
645 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
646 uint16_t vlan, tag_type *tags)
647 {
648 struct bond_slave *slave = choose_output_slave(bond, flow, vlan, tags);
649 if (slave) {
650 *tags |= bond->balance == BM_STABLE ? bond->stb_tag : slave->tag;
651 return slave->aux;
652 } else {
653 *tags |= bond->no_slaves_tag;
654 return NULL;
655 }
656 }
657 \f
658 /* Rebalancing. */
659
660 static bool
661 bond_is_balanced(const struct bond *bond)
662 {
663 return bond->rebalance_interval
664 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
665 }
666
667 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
668 void
669 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
670 uint64_t n_bytes)
671 {
672 if (bond_is_balanced(bond)) {
673 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
674 }
675 }
676
677 static struct bond_slave *
678 bond_slave_from_bal_node(struct list *bal)
679 {
680 return CONTAINER_OF(bal, struct bond_slave, bal_node);
681 }
682
683 static void
684 log_bals(struct bond *bond, const struct list *bals)
685 {
686 if (VLOG_IS_DBG_ENABLED()) {
687 struct ds ds = DS_EMPTY_INITIALIZER;
688 const struct bond_slave *slave;
689
690 LIST_FOR_EACH (slave, bal_node, bals) {
691 if (ds.length) {
692 ds_put_char(&ds, ',');
693 }
694 ds_put_format(&ds, " %s %"PRIu64"kB",
695 slave->name, slave->tx_bytes / 1024);
696
697 if (!slave->enabled) {
698 ds_put_cstr(&ds, " (disabled)");
699 }
700 if (!list_is_empty(&slave->entries)) {
701 struct bond_entry *e;
702
703 ds_put_cstr(&ds, " (");
704 LIST_FOR_EACH (e, list_node, &slave->entries) {
705 if (&e->list_node != list_front(&slave->entries)) {
706 ds_put_cstr(&ds, " + ");
707 }
708 ds_put_format(&ds, "h%td: %"PRIu64"kB",
709 e - bond->hash, e->tx_bytes / 1024);
710 }
711 ds_put_cstr(&ds, ")");
712 }
713 }
714 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
715 ds_destroy(&ds);
716 }
717 }
718
719 /* Shifts 'hash' from its current slave to 'to'. */
720 static void
721 bond_shift_load(struct bond_entry *hash, struct bond_slave *to,
722 struct tag_set *set)
723 {
724 struct bond_slave *from = hash->slave;
725 struct bond *bond = from->bond;
726 uint64_t delta = hash->tx_bytes;
727
728 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
729 "from %s to %s (now carrying %"PRIu64"kB and "
730 "%"PRIu64"kB load, respectively)",
731 bond->name, delta / 1024, hash - bond->hash,
732 from->name, to->name,
733 (from->tx_bytes - delta) / 1024,
734 (to->tx_bytes + delta) / 1024);
735
736 /* Shift load away from 'from' to 'to'. */
737 from->tx_bytes -= delta;
738 to->tx_bytes += delta;
739
740 /* Arrange for flows to be revalidated. */
741 tag_set_add(set, hash->tag);
742 hash->slave = to;
743 hash->tag = tag_create_random();
744 }
745
746 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
747 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
748 * given that doing so must decrease the ratio of the load on the two slaves by
749 * at least 0.1. Returns NULL if there is no appropriate entry.
750 *
751 * The list of entries isn't sorted. I don't know of a reason to prefer to
752 * shift away small hashes or large hashes. */
753 static struct bond_entry *
754 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
755 {
756 struct bond_entry *e;
757
758 if (list_is_short(&from->entries)) {
759 /* 'from' carries no more than one MAC hash, so shifting load away from
760 * it would be pointless. */
761 return NULL;
762 }
763
764 LIST_FOR_EACH (e, list_node, &from->entries) {
765 double old_ratio, new_ratio;
766 uint64_t delta;
767
768 if (to_tx_bytes == 0) {
769 /* Nothing on the new slave, move it. */
770 return e;
771 }
772
773 delta = e->tx_bytes;
774 old_ratio = (double)from->tx_bytes / to_tx_bytes;
775 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
776 if (old_ratio - new_ratio > 0.1
777 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
778 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
779 and 'to' slave have the same load. Therefore, we only move an
780 entry if it decreases the load on 'from', and brings us closer
781 to equal traffic load. */
782 return e;
783 }
784 }
785
786 return NULL;
787 }
788
789 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
790 * maintained. */
791 static void
792 insert_bal(struct list *bals, struct bond_slave *slave)
793 {
794 struct bond_slave *pos;
795
796 LIST_FOR_EACH (pos, bal_node, bals) {
797 if (slave->tx_bytes > pos->tx_bytes) {
798 break;
799 }
800 }
801 list_insert(&pos->bal_node, &slave->bal_node);
802 }
803
804 /* Removes 'slave' from its current list and then inserts it into 'bals' so
805 * that descending order of 'tx_bytes' is maintained. */
806 static void
807 reinsert_bal(struct list *bals, struct bond_slave *slave)
808 {
809 list_remove(&slave->bal_node);
810 insert_bal(bals, slave);
811 }
812
813 /* If 'bond' needs rebalancing, does so.
814 *
815 * The caller should have called bond_account() for each active flow, to ensure
816 * that flow data is consistently accounted at this point. */
817 void
818 bond_rebalance(struct bond *bond, struct tag_set *tags)
819 {
820 struct bond_slave *slave;
821 struct bond_entry *e;
822 struct list bals;
823
824 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
825 return;
826 }
827 bond->next_rebalance = time_msec() + bond->rebalance_interval;
828
829 /* Add each bond_entry to its slave's 'entries' list.
830 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
831 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
832 slave->tx_bytes = 0;
833 list_init(&slave->entries);
834 }
835 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
836 if (e->slave && e->tx_bytes) {
837 e->slave->tx_bytes += e->tx_bytes;
838 list_push_back(&e->slave->entries, &e->list_node);
839 }
840 }
841
842 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
843 *
844 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
845 * with a proper list sort algorithm. */
846 list_init(&bals);
847 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
848 if (slave->enabled) {
849 insert_bal(&bals, slave);
850 }
851 }
852 log_bals(bond, &bals);
853
854 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
855 while (!list_is_short(&bals)) {
856 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
857 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
858 uint64_t overload;
859
860 overload = from->tx_bytes - to->tx_bytes;
861 if (overload < to->tx_bytes >> 5 || overload < 100000) {
862 /* The extra load on 'from' (and all less-loaded slaves), compared
863 * to that of 'to' (the least-loaded slave), is less than ~3%, or
864 * it is less than ~1Mbps. No point in rebalancing. */
865 break;
866 }
867
868 /* 'from' is carrying significantly more load than 'to'. Pick a hash
869 * to move from 'from' to 'to'. */
870 e = choose_entry_to_migrate(from, to->tx_bytes);
871 if (e) {
872 bond_shift_load(e, to, tags);
873
874 /* Delete element from from->entries.
875 *
876 * We don't add the element to to->hashes. That would only allow
877 * 'e' to be migrated to another slave in this rebalancing run, and
878 * there is no point in doing that. */
879 list_remove(&e->list_node);
880
881 /* Re-sort 'bals'. */
882 reinsert_bal(&bals, from);
883 reinsert_bal(&bals, to);
884 } else {
885 /* Can't usefully migrate anything away from 'from'.
886 * Don't reconsider it. */
887 list_remove(&from->bal_node);
888 }
889 }
890
891 /* Implement exponentially weighted moving average. A weight of 1/2 causes
892 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
893 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
894 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
895 e->tx_bytes /= 2;
896 if (!e->tx_bytes) {
897 e->slave = NULL;
898 }
899 }
900 }
901 \f
902 /* Bonding unixctl user interface functions. */
903
904 static struct bond *
905 bond_find(const char *name)
906 {
907 struct bond *bond;
908
909 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
910 &all_bonds) {
911 if (!strcmp(bond->name, name)) {
912 return bond;
913 }
914 }
915 return NULL;
916 }
917
918 static struct bond_slave *
919 bond_lookup_slave(struct bond *bond, const char *slave_name)
920 {
921 struct bond_slave *slave;
922
923 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
924 if (!strcmp(slave->name, slave_name)) {
925 return slave;
926 }
927 }
928 return NULL;
929 }
930
931 static void
932 bond_unixctl_list(struct unixctl_conn *conn,
933 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
934 void *aux OVS_UNUSED)
935 {
936 struct ds ds = DS_EMPTY_INITIALIZER;
937 const struct bond *bond;
938
939 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
940
941 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
942 const struct bond_slave *slave;
943 size_t i;
944
945 ds_put_format(&ds, "%s\t%s\t",
946 bond->name, bond_mode_to_string(bond->balance));
947
948 i = 0;
949 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
950 if (i++ > 0) {
951 ds_put_cstr(&ds, ", ");
952 }
953 ds_put_cstr(&ds, slave->name);
954 }
955 ds_put_char(&ds, '\n');
956 }
957 unixctl_command_reply(conn, ds_cstr(&ds));
958 ds_destroy(&ds);
959 }
960
961 static void
962 bond_print_details(struct ds *ds, const struct bond *bond)
963 {
964 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
965 const struct shash_node **sorted_slaves = NULL;
966 const struct bond_slave *slave;
967 int i;
968
969 ds_put_format(ds, "---- %s ----\n", bond->name);
970 ds_put_format(ds, "bond_mode: %s\n",
971 bond_mode_to_string(bond->balance));
972
973 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
974
975 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
976 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
977
978 if (bond_is_balanced(bond)) {
979 ds_put_format(ds, "next rebalance: %lld ms\n",
980 bond->next_rebalance - time_msec());
981 }
982
983 ds_put_cstr(ds, "lacp_status: ");
984 switch (bond->lacp_status) {
985 case LACP_NEGOTIATED:
986 ds_put_cstr(ds, "negotiated\n");
987 break;
988 case LACP_CONFIGURED:
989 ds_put_cstr(ds, "configured\n");
990 break;
991 case LACP_DISABLED:
992 ds_put_cstr(ds, "off\n");
993 break;
994 default:
995 ds_put_cstr(ds, "<unknown>\n");
996 break;
997 }
998
999 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1000 shash_add(&slave_shash, slave->name, slave);
1001 }
1002 sorted_slaves = shash_sort(&slave_shash);
1003
1004 for (i = 0; i < shash_count(&slave_shash); i++) {
1005 struct bond_entry *be;
1006
1007 slave = sorted_slaves[i]->data;
1008
1009 /* Basic info. */
1010 ds_put_format(ds, "\nslave %s: %s\n",
1011 slave->name, slave->enabled ? "enabled" : "disabled");
1012 if (slave == bond->active_slave) {
1013 ds_put_cstr(ds, "\tactive slave\n");
1014 }
1015 if (slave->delay_expires != LLONG_MAX) {
1016 ds_put_format(ds, "\t%s expires in %lld ms\n",
1017 slave->enabled ? "downdelay" : "updelay",
1018 slave->delay_expires - time_msec());
1019 }
1020
1021 ds_put_format(ds, "\tmay_enable: %s\n",
1022 slave->may_enable ? "true" : "false");
1023
1024 if (!bond_is_balanced(bond)) {
1025 continue;
1026 }
1027
1028 /* Hashes. */
1029 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1030 int hash = be - bond->hash;
1031
1032 if (be->slave != slave) {
1033 continue;
1034 }
1035
1036 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1037 hash, be->tx_bytes / 1024);
1038
1039 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1040 }
1041 }
1042 shash_destroy(&slave_shash);
1043 free(sorted_slaves);
1044 ds_put_cstr(ds, "\n");
1045 }
1046
1047 static void
1048 bond_unixctl_show(struct unixctl_conn *conn,
1049 int argc, const char *argv[],
1050 void *aux OVS_UNUSED)
1051 {
1052 struct ds ds = DS_EMPTY_INITIALIZER;
1053
1054 if (argc > 1) {
1055 const struct bond *bond = bond_find(argv[1]);
1056
1057 if (!bond) {
1058 unixctl_command_reply_error(conn, "no such bond");
1059 return;
1060 }
1061 bond_print_details(&ds, bond);
1062 } else {
1063 const struct bond *bond;
1064
1065 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
1066 bond_print_details(&ds, bond);
1067 }
1068 }
1069
1070 unixctl_command_reply(conn, ds_cstr(&ds));
1071 ds_destroy(&ds);
1072 }
1073
1074 static void
1075 bond_unixctl_migrate(struct unixctl_conn *conn,
1076 int argc OVS_UNUSED, const char *argv[],
1077 void *aux OVS_UNUSED)
1078 {
1079 const char *bond_s = argv[1];
1080 const char *hash_s = argv[2];
1081 const char *slave_s = argv[3];
1082 struct bond *bond;
1083 struct bond_slave *slave;
1084 struct bond_entry *entry;
1085 int hash;
1086
1087 bond = bond_find(bond_s);
1088 if (!bond) {
1089 unixctl_command_reply_error(conn, "no such bond");
1090 return;
1091 }
1092
1093 if (bond->balance != BM_SLB) {
1094 unixctl_command_reply_error(conn, "not an SLB bond");
1095 return;
1096 }
1097
1098 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1099 hash = atoi(hash_s) & BOND_MASK;
1100 } else {
1101 unixctl_command_reply_error(conn, "bad hash");
1102 return;
1103 }
1104
1105 slave = bond_lookup_slave(bond, slave_s);
1106 if (!slave) {
1107 unixctl_command_reply_error(conn, "no such slave");
1108 return;
1109 }
1110
1111 if (!slave->enabled) {
1112 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1113 return;
1114 }
1115
1116 entry = &bond->hash[hash];
1117 tag_set_add(&bond->unixctl_tags, entry->tag);
1118 entry->slave = slave;
1119 entry->tag = tag_create_random();
1120 unixctl_command_reply(conn, "migrated");
1121 }
1122
1123 static void
1124 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1125 int argc OVS_UNUSED, const char *argv[],
1126 void *aux OVS_UNUSED)
1127 {
1128 const char *bond_s = argv[1];
1129 const char *slave_s = argv[2];
1130 struct bond *bond;
1131 struct bond_slave *slave;
1132
1133 bond = bond_find(bond_s);
1134 if (!bond) {
1135 unixctl_command_reply_error(conn, "no such bond");
1136 return;
1137 }
1138
1139 slave = bond_lookup_slave(bond, slave_s);
1140 if (!slave) {
1141 unixctl_command_reply_error(conn, "no such slave");
1142 return;
1143 }
1144
1145 if (!slave->enabled) {
1146 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1147 return;
1148 }
1149
1150 if (bond->active_slave != slave) {
1151 tag_set_add(&bond->unixctl_tags, bond_get_active_slave_tag(bond));
1152 bond->active_slave = slave;
1153 bond->active_slave->tag = tag_create_random();
1154 VLOG_INFO("bond %s: active interface is now %s",
1155 bond->name, slave->name);
1156 bond->send_learning_packets = true;
1157 unixctl_command_reply(conn, "done");
1158 } else {
1159 unixctl_command_reply(conn, "no change");
1160 }
1161 }
1162
1163 static void
1164 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1165 {
1166 const char *bond_s = argv[1];
1167 const char *slave_s = argv[2];
1168 struct bond *bond;
1169 struct bond_slave *slave;
1170
1171 bond = bond_find(bond_s);
1172 if (!bond) {
1173 unixctl_command_reply_error(conn, "no such bond");
1174 return;
1175 }
1176
1177 slave = bond_lookup_slave(bond, slave_s);
1178 if (!slave) {
1179 unixctl_command_reply_error(conn, "no such slave");
1180 return;
1181 }
1182
1183 bond_enable_slave(slave, enable, &bond->unixctl_tags);
1184 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1185 }
1186
1187 static void
1188 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1189 int argc OVS_UNUSED, const char *argv[],
1190 void *aux OVS_UNUSED)
1191 {
1192 enable_slave(conn, argv, true);
1193 }
1194
1195 static void
1196 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1197 int argc OVS_UNUSED, const char *argv[],
1198 void *aux OVS_UNUSED)
1199 {
1200 enable_slave(conn, argv, false);
1201 }
1202
1203 static void
1204 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1205 void *aux OVS_UNUSED)
1206 {
1207 const char *mac_s = argv[1];
1208 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1209 const char *basis_s = argc > 3 ? argv[3] : NULL;
1210 uint8_t mac[ETH_ADDR_LEN];
1211 uint8_t hash;
1212 char *hash_cstr;
1213 unsigned int vlan;
1214 uint32_t basis;
1215
1216 if (vlan_s) {
1217 if (sscanf(vlan_s, "%u", &vlan) != 1) {
1218 unixctl_command_reply_error(conn, "invalid vlan");
1219 return;
1220 }
1221 } else {
1222 vlan = 0;
1223 }
1224
1225 if (basis_s) {
1226 if (sscanf(basis_s, "%"PRIu32, &basis) != 1) {
1227 unixctl_command_reply_error(conn, "invalid basis");
1228 return;
1229 }
1230 } else {
1231 basis = 0;
1232 }
1233
1234 if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
1235 == ETH_ADDR_SCAN_COUNT) {
1236 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1237
1238 hash_cstr = xasprintf("%u", hash);
1239 unixctl_command_reply(conn, hash_cstr);
1240 free(hash_cstr);
1241 } else {
1242 unixctl_command_reply_error(conn, "invalid mac");
1243 }
1244 }
1245
1246 void
1247 bond_init(void)
1248 {
1249 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1250 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1251 NULL);
1252 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1253 bond_unixctl_migrate, NULL);
1254 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1255 bond_unixctl_set_active_slave, NULL);
1256 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1257 bond_unixctl_enable_slave, NULL);
1258 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1259 bond_unixctl_disable_slave, NULL);
1260 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1261 bond_unixctl_hash, NULL);
1262 }
1263 \f
1264 static void
1265 bond_entry_reset(struct bond *bond)
1266 {
1267 if (bond->balance != BM_AB) {
1268 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1269
1270 if (!bond->hash) {
1271 bond->hash = xmalloc(hash_len);
1272 }
1273 memset(bond->hash, 0, hash_len);
1274
1275 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1276 } else {
1277 free(bond->hash);
1278 bond->hash = NULL;
1279 }
1280 }
1281
1282 static struct bond_slave *
1283 bond_slave_lookup(struct bond *bond, const void *slave_)
1284 {
1285 struct bond_slave *slave;
1286
1287 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1288 &bond->slaves) {
1289 if (slave->aux == slave_) {
1290 return slave;
1291 }
1292 }
1293
1294 return NULL;
1295 }
1296
1297 static void
1298 bond_enable_slave(struct bond_slave *slave, bool enable, struct tag_set *tags)
1299 {
1300 struct bond *bond = slave->bond;
1301 slave->delay_expires = LLONG_MAX;
1302 if (enable != slave->enabled) {
1303 slave->enabled = enable;
1304 if (!slave->enabled) {
1305 VLOG_WARN("interface %s: disabled", slave->name);
1306 if (tags) {
1307 tag_set_add(tags, slave->tag);
1308 }
1309 } else {
1310 VLOG_WARN("interface %s: enabled", slave->name);
1311 slave->tag = tag_create_random();
1312 }
1313
1314 if (bond->balance == BM_STABLE) {
1315 bond->bond_revalidate = true;
1316 }
1317 }
1318 }
1319
1320 static void
1321 bond_link_status_update(struct bond_slave *slave, struct tag_set *tags)
1322 {
1323 struct bond *bond = slave->bond;
1324 bool up;
1325
1326 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1327 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1328 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1329 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1330 slave->name, up ? "up" : "down");
1331 if (up == slave->enabled) {
1332 slave->delay_expires = LLONG_MAX;
1333 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1334 slave->name, up ? "disabled" : "enabled");
1335 } else {
1336 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1337 : up ? bond->updelay : bond->downdelay);
1338 slave->delay_expires = time_msec() + delay;
1339 if (delay) {
1340 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1341 "for %d ms",
1342 slave->name,
1343 up ? "enabled" : "disabled",
1344 up ? "up" : "down",
1345 delay);
1346 }
1347 }
1348 }
1349
1350 if (time_msec() >= slave->delay_expires) {
1351 bond_enable_slave(slave, up, tags);
1352 }
1353 }
1354
1355 static unsigned int
1356 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1357 {
1358 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
1359 }
1360
1361 static unsigned int
1362 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1363 {
1364 struct flow hash_flow = *flow;
1365 hash_flow.vlan_tci = htons(vlan);
1366
1367 /* The symmetric quality of this hash function is not required, but
1368 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1369 * purposes, so we use it out of convenience. */
1370 return flow_hash_symmetric_l4(&hash_flow, basis);
1371 }
1372
1373 static unsigned int
1374 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1375 {
1376 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1377
1378 return (bond->balance == BM_TCP
1379 ? bond_hash_tcp(flow, vlan, bond->basis)
1380 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1381 }
1382
1383 static struct bond_entry *
1384 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1385 uint16_t vlan)
1386 {
1387 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1388 }
1389
1390 /* This function uses Highest Random Weight hashing to choose an output slave.
1391 * This approach only reassigns a minimal number of flows when slaves are
1392 * enabled or disabled. Unfortunately, it has O(n) performance against the
1393 * number of slaves. There exist algorithms which are O(1), but have slightly
1394 * more complex implementations and require the use of memory. This may need
1395 * to be reimplemented if it becomes a performance bottleneck. */
1396 static struct bond_slave *
1397 choose_stb_slave(const struct bond *bond, uint32_t flow_hash)
1398 {
1399 struct bond_slave *best, *slave;
1400 uint32_t best_hash;
1401
1402 best = NULL;
1403 best_hash = 0;
1404 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1405 if (slave->enabled) {
1406 uint32_t hash;
1407
1408 hash = hash_2words(flow_hash, slave->stb_id);
1409 if (!best || hash > best_hash) {
1410 best = slave;
1411 best_hash = hash;
1412 }
1413 }
1414 }
1415
1416 return best;
1417 }
1418
1419 static struct bond_slave *
1420 choose_output_slave(const struct bond *bond, const struct flow *flow,
1421 uint16_t vlan, tag_type *tags)
1422 {
1423 struct bond_entry *e;
1424
1425 if (bond->lacp_status == LACP_CONFIGURED) {
1426 /* LACP has been configured on this bond but negotiations were
1427 * unsuccussful. Drop all traffic. */
1428 return NULL;
1429 }
1430
1431 switch (bond->balance) {
1432 case BM_AB:
1433 return bond->active_slave;
1434
1435 case BM_STABLE:
1436 return choose_stb_slave(bond, bond_hash_tcp(flow, vlan, bond->basis));
1437
1438 case BM_TCP:
1439 if (bond->lacp_status != LACP_NEGOTIATED) {
1440 /* Must have LACP negotiations for TCP balanced bonds. */
1441 return NULL;
1442 }
1443 /* Fall Through. */
1444 case BM_SLB:
1445 if (!bond_is_balanced(bond)) {
1446 return choose_stb_slave(bond, bond_hash(bond, flow, vlan));
1447 }
1448 e = lookup_bond_entry(bond, flow, vlan);
1449 if (!e->slave || !e->slave->enabled) {
1450 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1451 struct bond_slave, hmap_node);
1452 if (!e->slave->enabled) {
1453 e->slave = bond->active_slave;
1454 }
1455 e->tag = tag_create_random();
1456 }
1457 *tags |= e->tag;
1458 return e->slave;
1459
1460 default:
1461 NOT_REACHED();
1462 }
1463 }
1464
1465 static struct bond_slave *
1466 bond_choose_slave(const struct bond *bond)
1467 {
1468 struct bond_slave *slave, *best;
1469
1470 /* Find an enabled slave. */
1471 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1472 if (slave->enabled) {
1473 return slave;
1474 }
1475 }
1476
1477 /* All interfaces are disabled. Find an interface that will be enabled
1478 * after its updelay expires. */
1479 best = NULL;
1480 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1481 if (slave->delay_expires != LLONG_MAX
1482 && slave->may_enable
1483 && (!best || slave->delay_expires < best->delay_expires)) {
1484 best = slave;
1485 }
1486 }
1487 return best;
1488 }
1489
1490 static void
1491 bond_choose_active_slave(struct bond *bond, struct tag_set *tags)
1492 {
1493 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1494 struct bond_slave *old_active_slave = bond->active_slave;
1495
1496 bond->active_slave = bond_choose_slave(bond);
1497 if (bond->active_slave) {
1498 if (bond->active_slave->enabled) {
1499 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1500 bond->name, bond->active_slave->name);
1501 } else {
1502 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1503 "remaining %lld ms updelay (since no interface was "
1504 "enabled)", bond->name, bond->active_slave->name,
1505 bond->active_slave->delay_expires - time_msec());
1506 bond_enable_slave(bond->active_slave, true, tags);
1507 }
1508
1509 if (!old_active_slave) {
1510 tag_set_add(tags, bond->no_slaves_tag);
1511 }
1512
1513 bond->send_learning_packets = true;
1514 } else if (old_active_slave) {
1515 VLOG_WARN_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1516 }
1517 }
1518
1519 /* Returns the tag for 'bond''s active slave, or 'bond''s no_slaves_tag if
1520 * there is no active slave. */
1521 static tag_type
1522 bond_get_active_slave_tag(const struct bond *bond)
1523 {
1524 return (bond->active_slave
1525 ? bond->active_slave->tag
1526 : bond->no_slaves_tag);
1527 }
1528
1529 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1530 * bond interface. */
1531 static void
1532 bond_update_fake_slave_stats(struct bond *bond)
1533 {
1534 struct netdev_stats bond_stats;
1535 struct bond_slave *slave;
1536 struct netdev *bond_dev;
1537
1538 memset(&bond_stats, 0, sizeof bond_stats);
1539
1540 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1541 struct netdev_stats slave_stats;
1542
1543 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1544 /* XXX: We swap the stats here because they are swapped back when
1545 * reported by the internal device. The reason for this is
1546 * internal devices normally represent packets going into the
1547 * system but when used as fake bond device they represent packets
1548 * leaving the system. We really should do this in the internal
1549 * device itself because changing it here reverses the counts from
1550 * the perspective of the switch. However, the internal device
1551 * doesn't know what type of device it represents so we have to do
1552 * it here for now. */
1553 bond_stats.tx_packets += slave_stats.rx_packets;
1554 bond_stats.tx_bytes += slave_stats.rx_bytes;
1555 bond_stats.rx_packets += slave_stats.tx_packets;
1556 bond_stats.rx_bytes += slave_stats.tx_bytes;
1557 }
1558 }
1559
1560 if (!netdev_open(bond->name, "system", &bond_dev)) {
1561 netdev_set_stats(bond_dev, &bond_stats);
1562 netdev_close(bond_dev);
1563 }
1564 }