]> git.proxmox.com Git - mirror_ovs.git/blob - ofproto/bond.c
ofproto: Move bond files to ofproto
[mirror_ovs.git] / ofproto / bond.c
1 /*
2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "bond.h"
20
21 #include <limits.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <math.h>
25
26 #include "coverage.h"
27 #include "dynamic-string.h"
28 #include "flow.h"
29 #include "hmap.h"
30 #include "lacp.h"
31 #include "list.h"
32 #include "netdev.h"
33 #include "odp-util.h"
34 #include "ofpbuf.h"
35 #include "packets.h"
36 #include "poll-loop.h"
37 #include "shash.h"
38 #include "timeval.h"
39 #include "unixctl.h"
40 #include "vlog.h"
41
42 VLOG_DEFINE_THIS_MODULE(bond);
43
/* Mask applied to a flow's hash value to pick one of a bond's hash buckets.
 * A bond's 'hash' array therefore has (BOND_MASK + 1) buckets, and the code
 * in this file iterates over indexes 0 through BOND_MASK inclusive. */
#define BOND_MASK 0xff
47
48 /* A hash bucket for mapping a flow to a slave.
49 * "struct bond" has an array of (BOND_MASK + 1) of these. */
50 struct bond_entry {
51 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
52 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
53 struct list list_node; /* In bond_slave's 'entries' list. */
54 };
55
56 /* A bond slave, that is, one of the links comprising a bond. */
57 struct bond_slave {
58 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
59 struct bond *bond; /* The bond that contains this slave. */
60 void *aux; /* Client-provided handle for this slave. */
61
62 struct netdev *netdev; /* Network device, owned by the client. */
63 unsigned int change_seq; /* Tracks changes in 'netdev'. */
64 char *name; /* Name (a copy of netdev_get_name(netdev)). */
65
66 /* Link status. */
67 long long delay_expires; /* Time after which 'enabled' may change. */
68 bool enabled; /* May be chosen for flows? */
69 bool may_enable; /* Client considers this slave bondable. */
70
71 /* Rebalancing info. Used only by bond_rebalance(). */
72 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
73 struct list entries; /* 'struct bond_entry's assigned here. */
74 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
75 };
76
77 /* A bond, that is, a set of network devices grouped to improve performance or
78 * robustness. */
79 struct bond {
80 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
81 char *name; /* Name provided by client. */
82
83 /* Slaves. */
84 struct hmap slaves;
85
86 /* Bonding info. */
87 enum bond_mode balance; /* Balancing mode, one of BM_*. */
88 struct bond_slave *active_slave;
89 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
90 enum lacp_status lacp_status; /* Status of LACP negotiations. */
91 bool bond_revalidate; /* True if flows need revalidation. */
92 uint32_t basis; /* Basis for flow hash function. */
93
94 /* SLB specific bonding info. */
95 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
96 int rebalance_interval; /* Interval between rebalances, in ms. */
97 long long int next_rebalance; /* Next rebalancing time. */
98 bool send_learning_packets;
99
100 /* Legacy compatibility. */
101 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
102 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
103
104 atomic_int ref_cnt;
105 };
106
/* Lock taken by the public functions in this file before they touch bond
 * state.  Helpers annotated with OVS_REQ_RDLOCK(rwlock) or
 * OVS_REQ_WRLOCK(rwlock) expect their caller to hold it already. */
static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
/* Backing storage for 'all_bonds'. */
static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
/* Every bond, hashed by hash_string() of its name (see bond_reconfigure() and
 * bond_find()). */
static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
110
111 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
112 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
113 OVS_REQ_RDLOCK(rwlock);
114 static void bond_enable_slave(struct bond_slave *, bool enable)
115 OVS_REQ_WRLOCK(rwlock);
116 static void bond_link_status_update(struct bond_slave *)
117 OVS_REQ_WRLOCK(rwlock);
118 static void bond_choose_active_slave(struct bond *)
119 OVS_REQ_WRLOCK(rwlock);;
120 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
121 uint16_t vlan, uint32_t basis);
122 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
123 uint32_t basis);
124 static struct bond_entry *lookup_bond_entry(const struct bond *,
125 const struct flow *,
126 uint16_t vlan)
127 OVS_REQ_RDLOCK(rwlock);
128 static struct bond_slave *choose_output_slave(const struct bond *,
129 const struct flow *,
130 struct flow_wildcards *,
131 uint16_t vlan)
132 OVS_REQ_RDLOCK(rwlock);
133 static void bond_update_fake_slave_stats(struct bond *)
134 OVS_REQ_RDLOCK(rwlock);
135
136 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
137 * stores the mode in '*balance' and returns true. Otherwise returns false
138 * without modifying '*balance'. */
139 bool
140 bond_mode_from_string(enum bond_mode *balance, const char *s)
141 {
142 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
143 *balance = BM_TCP;
144 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
145 *balance = BM_SLB;
146 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
147 *balance = BM_AB;
148 } else {
149 return false;
150 }
151 return true;
152 }
153
154 /* Returns a string representing 'balance'. */
155 const char *
156 bond_mode_to_string(enum bond_mode balance) {
157 switch (balance) {
158 case BM_TCP:
159 return "balance-tcp";
160 case BM_SLB:
161 return "balance-slb";
162 case BM_AB:
163 return "active-backup";
164 }
165 NOT_REACHED();
166 }
167
168 \f
169 /* Creates and returns a new bond whose configuration is initially taken from
170 * 's'.
171 *
172 * The caller should register each slave on the new bond by calling
173 * bond_slave_register(). */
174 struct bond *
175 bond_create(const struct bond_settings *s)
176 {
177 struct bond *bond;
178
179 bond = xzalloc(sizeof *bond);
180 hmap_init(&bond->slaves);
181 bond->next_fake_iface_update = LLONG_MAX;
182 atomic_init(&bond->ref_cnt, 1);
183
184 bond_reconfigure(bond, s);
185 return bond;
186 }
187
188 struct bond *
189 bond_ref(const struct bond *bond_)
190 {
191 struct bond *bond = CONST_CAST(struct bond *, bond_);
192
193 if (bond) {
194 int orig;
195 atomic_add(&bond->ref_cnt, 1, &orig);
196 ovs_assert(orig > 0);
197 }
198 return bond;
199 }
200
201 /* Frees 'bond'. */
202 void
203 bond_unref(struct bond *bond)
204 {
205 struct bond_slave *slave, *next_slave;
206 int orig;
207
208 if (!bond) {
209 return;
210 }
211
212 atomic_sub(&bond->ref_cnt, 1, &orig);
213 ovs_assert(orig > 0);
214 if (orig != 1) {
215 return;
216 }
217
218 ovs_rwlock_wrlock(&rwlock);
219 hmap_remove(all_bonds, &bond->hmap_node);
220 ovs_rwlock_unlock(&rwlock);
221
222 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
223 hmap_remove(&bond->slaves, &slave->hmap_node);
224 /* Client owns 'slave->netdev'. */
225 free(slave->name);
226 free(slave);
227 }
228 hmap_destroy(&bond->slaves);
229
230 free(bond->hash);
231 free(bond->name);
232 free(bond);
233 }
234
235 /* Updates 'bond''s overall configuration to 's'.
236 *
237 * The caller should register each slave on 'bond' by calling
238 * bond_slave_register(). This is optional if none of the slaves'
239 * configuration has changed. In any case it can't hurt.
240 *
241 * Returns true if the configuration has changed in such a way that requires
242 * flow revalidation.
243 * */
244 bool
245 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
246 {
247 bool revalidate = false;
248
249 ovs_rwlock_wrlock(&rwlock);
250 if (!bond->name || strcmp(bond->name, s->name)) {
251 if (bond->name) {
252 hmap_remove(all_bonds, &bond->hmap_node);
253 free(bond->name);
254 }
255 bond->name = xstrdup(s->name);
256 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
257 }
258
259 bond->updelay = s->up_delay;
260 bond->downdelay = s->down_delay;
261
262 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
263 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
264 revalidate = true;
265 }
266
267 if (bond->rebalance_interval != s->rebalance_interval) {
268 bond->rebalance_interval = s->rebalance_interval;
269 revalidate = true;
270 }
271
272 if (bond->balance != s->balance) {
273 bond->balance = s->balance;
274 revalidate = true;
275 }
276
277 if (bond->basis != s->basis) {
278 bond->basis = s->basis;
279 revalidate = true;
280 }
281
282 if (s->fake_iface) {
283 if (bond->next_fake_iface_update == LLONG_MAX) {
284 bond->next_fake_iface_update = time_msec();
285 }
286 } else {
287 bond->next_fake_iface_update = LLONG_MAX;
288 }
289
290 if (bond->bond_revalidate) {
291 revalidate = true;
292 bond->bond_revalidate = false;
293 }
294
295 if (bond->balance == BM_AB || !bond->hash || revalidate) {
296 bond_entry_reset(bond);
297 }
298
299 ovs_rwlock_unlock(&rwlock);
300 return revalidate;
301 }
302
303 static void
304 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
305 OVS_REQ_WRLOCK(rwlock)
306 {
307 if (slave->netdev != netdev) {
308 slave->netdev = netdev;
309 slave->change_seq = 0;
310 }
311 }
312
313 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
314 * arbitrary client-provided pointer that uniquely identifies a slave within a
315 * bond. If 'slave_' already exists within 'bond' then this function
316 * reconfigures the existing slave.
317 *
318 * 'netdev' must be the network device that 'slave_' represents. It is owned
319 * by the client, so the client must not close it before either unregistering
320 * 'slave_' or destroying 'bond'.
321 */
322 void
323 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
324 {
325 struct bond_slave *slave;
326
327 ovs_rwlock_wrlock(&rwlock);
328 slave = bond_slave_lookup(bond, slave_);
329 if (!slave) {
330 slave = xzalloc(sizeof *slave);
331
332 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
333 slave->bond = bond;
334 slave->aux = slave_;
335 slave->delay_expires = LLONG_MAX;
336 slave->name = xstrdup(netdev_get_name(netdev));
337 bond->bond_revalidate = true;
338
339 slave->enabled = false;
340 bond_enable_slave(slave, netdev_get_carrier(netdev));
341 }
342
343 bond_slave_set_netdev__(slave, netdev);
344
345 free(slave->name);
346 slave->name = xstrdup(netdev_get_name(netdev));
347 ovs_rwlock_unlock(&rwlock);
348 }
349
350 /* Updates the network device to be used with 'slave_' to 'netdev'.
351 *
352 * This is useful if the caller closes and re-opens the network device
353 * registered with bond_slave_register() but doesn't need to change anything
354 * else. */
355 void
356 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
357 {
358 struct bond_slave *slave;
359
360 ovs_rwlock_wrlock(&rwlock);
361 slave = bond_slave_lookup(bond, slave_);
362 if (slave) {
363 bond_slave_set_netdev__(slave, netdev);
364 }
365 ovs_rwlock_unlock(&rwlock);
366 }
367
368 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
369 * then this function has no effect.
370 *
371 * Unregistering a slave invalidates all flows. */
372 void
373 bond_slave_unregister(struct bond *bond, const void *slave_)
374 {
375 struct bond_slave *slave;
376 bool del_active;
377
378 ovs_rwlock_wrlock(&rwlock);
379 slave = bond_slave_lookup(bond, slave_);
380 if (!slave) {
381 goto out;
382 }
383
384 bond->bond_revalidate = true;
385 bond_enable_slave(slave, false);
386
387 del_active = bond->active_slave == slave;
388 if (bond->hash) {
389 struct bond_entry *e;
390 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
391 if (e->slave == slave) {
392 e->slave = NULL;
393 }
394 }
395 }
396
397 free(slave->name);
398
399 hmap_remove(&bond->slaves, &slave->hmap_node);
400 /* Client owns 'slave->netdev'. */
401 free(slave);
402
403 if (del_active) {
404 bond_choose_active_slave(bond);
405 bond->send_learning_packets = true;
406 }
407 out:
408 ovs_rwlock_unlock(&rwlock);
409 }
410
411 /* Should be called on each slave in 'bond' before bond_run() to indicate
412 * whether or not 'slave_' may be enabled. This function is intended to allow
413 * other protocols to have some impact on bonding decisions. For example LACP
414 * or high level link monitoring protocols may decide that a given slave should
415 * not be able to send traffic. */
416 void
417 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
418 {
419 ovs_rwlock_wrlock(&rwlock);
420 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
421 ovs_rwlock_unlock(&rwlock);
422 }
423
424 /* Performs periodic maintenance on 'bond'.
425 *
426 * Returns true if the caller should revalidate its flows.
427 *
428 * The caller should check bond_should_send_learning_packets() afterward. */
429 bool
430 bond_run(struct bond *bond, enum lacp_status lacp_status)
431 {
432 struct bond_slave *slave;
433 bool revalidate;
434
435 ovs_rwlock_wrlock(&rwlock);
436 if (bond->lacp_status != lacp_status) {
437 bond->lacp_status = lacp_status;
438 bond->bond_revalidate = true;
439 }
440
441 /* Enable slaves based on link status and LACP feedback. */
442 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
443 bond_link_status_update(slave);
444 slave->change_seq = netdev_change_seq(slave->netdev);
445 }
446 if (!bond->active_slave || !bond->active_slave->enabled) {
447 bond_choose_active_slave(bond);
448 }
449
450 /* Update fake bond interface stats. */
451 if (time_msec() >= bond->next_fake_iface_update) {
452 bond_update_fake_slave_stats(bond);
453 bond->next_fake_iface_update = time_msec() + 1000;
454 }
455
456 revalidate = bond->bond_revalidate;
457 bond->bond_revalidate = false;
458 ovs_rwlock_unlock(&rwlock);
459
460 return revalidate;
461 }
462
463 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
464 void
465 bond_wait(struct bond *bond)
466 {
467 struct bond_slave *slave;
468
469 ovs_rwlock_rdlock(&rwlock);
470 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
471 if (slave->delay_expires != LLONG_MAX) {
472 poll_timer_wait_until(slave->delay_expires);
473 }
474
475 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
476 poll_immediate_wake();
477 }
478 }
479
480 if (bond->next_fake_iface_update != LLONG_MAX) {
481 poll_timer_wait_until(bond->next_fake_iface_update);
482 }
483
484 if (bond->bond_revalidate) {
485 poll_immediate_wake();
486 }
487 ovs_rwlock_unlock(&rwlock);
488
489 /* We don't wait for bond->next_rebalance because rebalancing can only run
490 * at a flow account checkpoint. ofproto does checkpointing on its own
491 * schedule and bond_rebalance() gets called afterward, so we'd just be
492 * waking up for no purpose. */
493 }
494 \f
495 /* MAC learning table interaction. */
496
497 static bool
498 may_send_learning_packets(const struct bond *bond)
499 {
500 return ((bond->lacp_status == LACP_DISABLED
501 && (bond->balance == BM_SLB || bond->balance == BM_AB))
502 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
503 && bond->active_slave;
504 }
505
506 /* Returns true if 'bond' needs the client to send out packets to assist with
507 * MAC learning on 'bond'. If this function returns true, then the client
508 * should iterate through its MAC learning table for the bridge on which 'bond'
509 * is located. For each MAC that has been learned on a port other than 'bond',
510 * it should call bond_compose_learning_packet().
511 *
512 * This function will only return true if 'bond' is in SLB or active-backup
513 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
514 * necessary.
515 *
516 * Calling this function resets the state that it checks. */
517 bool
518 bond_should_send_learning_packets(struct bond *bond)
519 {
520 bool send;
521
522 ovs_rwlock_wrlock(&rwlock);
523 send = bond->send_learning_packets && may_send_learning_packets(bond);
524 bond->send_learning_packets = false;
525 ovs_rwlock_unlock(&rwlock);
526 return send;
527 }
528
529 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
530 *
531 * See bond_should_send_learning_packets() for description of usage. The
532 * caller should send the composed packet on the port associated with
533 * port_aux and takes ownership of the returned ofpbuf. */
534 struct ofpbuf *
535 bond_compose_learning_packet(struct bond *bond,
536 const uint8_t eth_src[ETH_ADDR_LEN],
537 uint16_t vlan, void **port_aux)
538 {
539 struct bond_slave *slave;
540 struct ofpbuf *packet;
541 struct flow flow;
542
543 ovs_rwlock_rdlock(&rwlock);
544 ovs_assert(may_send_learning_packets(bond));
545 memset(&flow, 0, sizeof flow);
546 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
547 slave = choose_output_slave(bond, &flow, NULL, vlan);
548
549 packet = ofpbuf_new(0);
550 compose_rarp(packet, eth_src);
551 if (vlan) {
552 eth_push_vlan(packet, htons(vlan));
553 }
554
555 *port_aux = slave->aux;
556 ovs_rwlock_unlock(&rwlock);
557 return packet;
558 }
559 \f
560 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
561 * Ethernet destination address of 'eth_dst', should be admitted.
562 *
563 * The return value is one of the following:
564 *
565 * - BV_ACCEPT: Admit the packet.
566 *
567 * - BV_DROP: Drop the packet.
568 *
569 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
570 * Ethernet source address and VLAN. If there is none, or if the packet
571 * is on the learned port, then admit the packet. If a different port has
572 * been learned, however, drop the packet (and do not use it for MAC
573 * learning).
574 */
575 enum bond_verdict
576 bond_check_admissibility(struct bond *bond, const void *slave_,
577 const uint8_t eth_dst[ETH_ADDR_LEN])
578 {
579 enum bond_verdict verdict = BV_DROP;
580 struct bond_slave *slave;
581
582 ovs_rwlock_rdlock(&rwlock);
583 slave = bond_slave_lookup(bond, slave_);
584 if (!slave) {
585 goto out;
586 }
587
588 /* LACP bonds have very loose admissibility restrictions because we can
589 * assume the remote switch is aware of the bond and will "do the right
590 * thing". However, as a precaution we drop packets on disabled slaves
591 * because no correctly implemented partner switch should be sending
592 * packets to them.
593 *
594 * If LACP is configured, but LACP negotiations have been unsuccessful, we
595 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
596 switch (bond->lacp_status) {
597 case LACP_NEGOTIATED:
598 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
599 goto out;
600 case LACP_CONFIGURED:
601 if (!bond->lacp_fallback_ab) {
602 goto out;
603 }
604 case LACP_DISABLED:
605 break;
606 }
607
608 /* Drop all multicast packets on inactive slaves. */
609 if (eth_addr_is_multicast(eth_dst)) {
610 if (bond->active_slave != slave) {
611 goto out;
612 }
613 }
614
615 switch (bond->balance) {
616 case BM_TCP:
617 /* TCP balanced bonds require successful LACP negotiations. Based on the
618 * above check, LACP is off or lacp_fallback_ab is true on this bond.
619 * If lacp_fallback_ab is true fall through to BM_AB case else, we
620 * drop all incoming traffic. */
621 if (!bond->lacp_fallback_ab) {
622 goto out;
623 }
624
625 case BM_AB:
626 /* Drop all packets which arrive on backup slaves. This is similar to
627 * how Linux bonding handles active-backup bonds. */
628 if (bond->active_slave != slave) {
629 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
630
631 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
632 " slave (%s) destined for " ETH_ADDR_FMT,
633 slave->name, ETH_ADDR_ARGS(eth_dst));
634 goto out;
635 }
636 verdict = BV_ACCEPT;
637 goto out;
638
639 case BM_SLB:
640 /* Drop all packets for which we have learned a different input port,
641 * because we probably sent the packet on one slave and got it back on
642 * the other. Gratuitous ARP packets are an exception to this rule:
643 * the host has moved to another switch. The exception to the
644 * exception is if we locked the learning table to avoid reflections on
645 * bond slaves. */
646 verdict = BV_DROP_IF_MOVED;
647 goto out;
648 }
649
650 NOT_REACHED();
651 out:
652 ovs_rwlock_unlock(&rwlock);
653 return verdict;
654
655 }
656
657 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
658 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
659 * NULL if the packet should be dropped because no slaves are enabled.
660 *
661 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
662 * should be a VID only (i.e. excluding the PCP bits). Second,
663 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
664 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
665 * packet belongs to (so for an access port it will be the access port's VLAN).
666 *
667 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
668 * significant in the selection. At some point earlier, 'wc' should
669 * have been initialized (e.g., by flow_wildcards_init_catchall()).
670 */
671 void *
672 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
673 struct flow_wildcards *wc, uint16_t vlan)
674 {
675 struct bond_slave *slave;
676 void *aux;
677
678 ovs_rwlock_rdlock(&rwlock);
679 slave = choose_output_slave(bond, flow, wc, vlan);
680 aux = slave ? slave->aux : NULL;
681 ovs_rwlock_unlock(&rwlock);
682
683 return aux;
684 }
685 \f
686 /* Rebalancing. */
687
688 static bool
689 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
690 {
691 return bond->rebalance_interval
692 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
693 }
694
695 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
696 void
697 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
698 uint64_t n_bytes)
699 {
700 ovs_rwlock_wrlock(&rwlock);
701 if (bond_is_balanced(bond)) {
702 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
703 }
704 ovs_rwlock_unlock(&rwlock);
705 }
706
707 static struct bond_slave *
708 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
709 {
710 return CONTAINER_OF(bal, struct bond_slave, bal_node);
711 }
712
713 static void
714 log_bals(struct bond *bond, const struct list *bals)
715 {
716 if (VLOG_IS_DBG_ENABLED()) {
717 struct ds ds = DS_EMPTY_INITIALIZER;
718 const struct bond_slave *slave;
719
720 LIST_FOR_EACH (slave, bal_node, bals) {
721 if (ds.length) {
722 ds_put_char(&ds, ',');
723 }
724 ds_put_format(&ds, " %s %"PRIu64"kB",
725 slave->name, slave->tx_bytes / 1024);
726
727 if (!slave->enabled) {
728 ds_put_cstr(&ds, " (disabled)");
729 }
730 if (!list_is_empty(&slave->entries)) {
731 struct bond_entry *e;
732
733 ds_put_cstr(&ds, " (");
734 LIST_FOR_EACH (e, list_node, &slave->entries) {
735 if (&e->list_node != list_front(&slave->entries)) {
736 ds_put_cstr(&ds, " + ");
737 }
738 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
739 e - bond->hash, e->tx_bytes / 1024);
740 }
741 ds_put_cstr(&ds, ")");
742 }
743 }
744 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
745 ds_destroy(&ds);
746 }
747 }
748
749 /* Shifts 'hash' from its current slave to 'to'. */
750 static void
751 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
752 {
753 struct bond_slave *from = hash->slave;
754 struct bond *bond = from->bond;
755 uint64_t delta = hash->tx_bytes;
756
757 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
758 "from %s to %s (now carrying %"PRIu64"kB and "
759 "%"PRIu64"kB load, respectively)",
760 bond->name, delta / 1024, hash - bond->hash,
761 from->name, to->name,
762 (from->tx_bytes - delta) / 1024,
763 (to->tx_bytes + delta) / 1024);
764
765 /* Shift load away from 'from' to 'to'. */
766 from->tx_bytes -= delta;
767 to->tx_bytes += delta;
768
769 /* Arrange for flows to be revalidated. */
770 hash->slave = to;
771 bond->bond_revalidate = true;
772 }
773
774 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
775 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
776 * given that doing so must decrease the ratio of the load on the two slaves by
777 * at least 0.1. Returns NULL if there is no appropriate entry.
778 *
779 * The list of entries isn't sorted. I don't know of a reason to prefer to
780 * shift away small hashes or large hashes. */
781 static struct bond_entry *
782 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
783 {
784 struct bond_entry *e;
785
786 if (list_is_short(&from->entries)) {
787 /* 'from' carries no more than one MAC hash, so shifting load away from
788 * it would be pointless. */
789 return NULL;
790 }
791
792 LIST_FOR_EACH (e, list_node, &from->entries) {
793 double old_ratio, new_ratio;
794 uint64_t delta;
795
796 if (to_tx_bytes == 0) {
797 /* Nothing on the new slave, move it. */
798 return e;
799 }
800
801 delta = e->tx_bytes;
802 old_ratio = (double)from->tx_bytes / to_tx_bytes;
803 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
804 if (old_ratio - new_ratio > 0.1
805 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
806 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
807 and 'to' slave have the same load. Therefore, we only move an
808 entry if it decreases the load on 'from', and brings us closer
809 to equal traffic load. */
810 return e;
811 }
812 }
813
814 return NULL;
815 }
816
817 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
818 * maintained. */
819 static void
820 insert_bal(struct list *bals, struct bond_slave *slave)
821 {
822 struct bond_slave *pos;
823
824 LIST_FOR_EACH (pos, bal_node, bals) {
825 if (slave->tx_bytes > pos->tx_bytes) {
826 break;
827 }
828 }
829 list_insert(&pos->bal_node, &slave->bal_node);
830 }
831
832 /* Removes 'slave' from its current list and then inserts it into 'bals' so
833 * that descending order of 'tx_bytes' is maintained. */
834 static void
835 reinsert_bal(struct list *bals, struct bond_slave *slave)
836 {
837 list_remove(&slave->bal_node);
838 insert_bal(bals, slave);
839 }
840
841 /* If 'bond' needs rebalancing, does so.
842 *
843 * The caller should have called bond_account() for each active flow, to ensure
844 * that flow data is consistently accounted at this point. */
845 void
846 bond_rebalance(struct bond *bond)
847 {
848 struct bond_slave *slave;
849 struct bond_entry *e;
850 struct list bals;
851
852 ovs_rwlock_wrlock(&rwlock);
853 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
854 ovs_rwlock_unlock(&rwlock);
855 return;
856 }
857 bond->next_rebalance = time_msec() + bond->rebalance_interval;
858
859 /* Add each bond_entry to its slave's 'entries' list.
860 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
861 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
862 slave->tx_bytes = 0;
863 list_init(&slave->entries);
864 }
865 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
866 if (e->slave && e->tx_bytes) {
867 e->slave->tx_bytes += e->tx_bytes;
868 list_push_back(&e->slave->entries, &e->list_node);
869 }
870 }
871
872 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
873 *
874 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
875 * with a proper list sort algorithm. */
876 list_init(&bals);
877 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
878 if (slave->enabled) {
879 insert_bal(&bals, slave);
880 }
881 }
882 log_bals(bond, &bals);
883
884 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
885 while (!list_is_short(&bals)) {
886 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
887 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
888 uint64_t overload;
889
890 overload = from->tx_bytes - to->tx_bytes;
891 if (overload < to->tx_bytes >> 5 || overload < 100000) {
892 /* The extra load on 'from' (and all less-loaded slaves), compared
893 * to that of 'to' (the least-loaded slave), is less than ~3%, or
894 * it is less than ~1Mbps. No point in rebalancing. */
895 break;
896 }
897
898 /* 'from' is carrying significantly more load than 'to'. Pick a hash
899 * to move from 'from' to 'to'. */
900 e = choose_entry_to_migrate(from, to->tx_bytes);
901 if (e) {
902 bond_shift_load(e, to);
903
904 /* Delete element from from->entries.
905 *
906 * We don't add the element to to->hashes. That would only allow
907 * 'e' to be migrated to another slave in this rebalancing run, and
908 * there is no point in doing that. */
909 list_remove(&e->list_node);
910
911 /* Re-sort 'bals'. */
912 reinsert_bal(&bals, from);
913 reinsert_bal(&bals, to);
914 } else {
915 /* Can't usefully migrate anything away from 'from'.
916 * Don't reconsider it. */
917 list_remove(&from->bal_node);
918 }
919 }
920
921 /* Implement exponentially weighted moving average. A weight of 1/2 causes
922 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
923 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
924 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
925 e->tx_bytes /= 2;
926 if (!e->tx_bytes) {
927 e->slave = NULL;
928 }
929 }
930 ovs_rwlock_unlock(&rwlock);
931 }
932 \f
933 /* Bonding unixctl user interface functions. */
934
935 static struct bond *
936 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
937 {
938 struct bond *bond;
939
940 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
941 all_bonds) {
942 if (!strcmp(bond->name, name)) {
943 return bond;
944 }
945 }
946 return NULL;
947 }
948
949 static struct bond_slave *
950 bond_lookup_slave(struct bond *bond, const char *slave_name)
951 {
952 struct bond_slave *slave;
953
954 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
955 if (!strcmp(slave->name, slave_name)) {
956 return slave;
957 }
958 }
959 return NULL;
960 }
961
962 static void
963 bond_unixctl_list(struct unixctl_conn *conn,
964 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
965 void *aux OVS_UNUSED)
966 {
967 struct ds ds = DS_EMPTY_INITIALIZER;
968 const struct bond *bond;
969
970 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
971
972 ovs_rwlock_rdlock(&rwlock);
973 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
974 const struct bond_slave *slave;
975 size_t i;
976
977 ds_put_format(&ds, "%s\t%s\t",
978 bond->name, bond_mode_to_string(bond->balance));
979
980 i = 0;
981 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
982 if (i++ > 0) {
983 ds_put_cstr(&ds, ", ");
984 }
985 ds_put_cstr(&ds, slave->name);
986 }
987 ds_put_char(&ds, '\n');
988 }
989 ovs_rwlock_unlock(&rwlock);
990 unixctl_command_reply(conn, ds_cstr(&ds));
991 ds_destroy(&ds);
992 }
993
994 static void
995 bond_print_details(struct ds *ds, const struct bond *bond)
996 OVS_REQ_RDLOCK(rwlock)
997 {
998 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
999 const struct shash_node **sorted_slaves = NULL;
1000 const struct bond_slave *slave;
1001 int i;
1002
1003 ds_put_format(ds, "---- %s ----\n", bond->name);
1004 ds_put_format(ds, "bond_mode: %s\n",
1005 bond_mode_to_string(bond->balance));
1006
1007 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1008
1009 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1010 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1011
1012 if (bond_is_balanced(bond)) {
1013 ds_put_format(ds, "next rebalance: %lld ms\n",
1014 bond->next_rebalance - time_msec());
1015 }
1016
1017 ds_put_cstr(ds, "lacp_status: ");
1018 switch (bond->lacp_status) {
1019 case LACP_NEGOTIATED:
1020 ds_put_cstr(ds, "negotiated\n");
1021 break;
1022 case LACP_CONFIGURED:
1023 ds_put_cstr(ds, "configured\n");
1024 break;
1025 case LACP_DISABLED:
1026 ds_put_cstr(ds, "off\n");
1027 break;
1028 default:
1029 ds_put_cstr(ds, "<unknown>\n");
1030 break;
1031 }
1032
1033 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1034 shash_add(&slave_shash, slave->name, slave);
1035 }
1036 sorted_slaves = shash_sort(&slave_shash);
1037
1038 for (i = 0; i < shash_count(&slave_shash); i++) {
1039 struct bond_entry *be;
1040
1041 slave = sorted_slaves[i]->data;
1042
1043 /* Basic info. */
1044 ds_put_format(ds, "\nslave %s: %s\n",
1045 slave->name, slave->enabled ? "enabled" : "disabled");
1046 if (slave == bond->active_slave) {
1047 ds_put_cstr(ds, "\tactive slave\n");
1048 }
1049 if (slave->delay_expires != LLONG_MAX) {
1050 ds_put_format(ds, "\t%s expires in %lld ms\n",
1051 slave->enabled ? "downdelay" : "updelay",
1052 slave->delay_expires - time_msec());
1053 }
1054
1055 ds_put_format(ds, "\tmay_enable: %s\n",
1056 slave->may_enable ? "true" : "false");
1057
1058 if (!bond_is_balanced(bond)) {
1059 continue;
1060 }
1061
1062 /* Hashes. */
1063 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1064 int hash = be - bond->hash;
1065
1066 if (be->slave != slave) {
1067 continue;
1068 }
1069
1070 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1071 hash, be->tx_bytes / 1024);
1072
1073 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1074 }
1075 }
1076 shash_destroy(&slave_shash);
1077 free(sorted_slaves);
1078 ds_put_cstr(ds, "\n");
1079 }
1080
1081 static void
1082 bond_unixctl_show(struct unixctl_conn *conn,
1083 int argc, const char *argv[],
1084 void *aux OVS_UNUSED)
1085 {
1086 struct ds ds = DS_EMPTY_INITIALIZER;
1087
1088 ovs_rwlock_rdlock(&rwlock);
1089 if (argc > 1) {
1090 const struct bond *bond = bond_find(argv[1]);
1091
1092 if (!bond) {
1093 unixctl_command_reply_error(conn, "no such bond");
1094 goto out;
1095 }
1096 bond_print_details(&ds, bond);
1097 } else {
1098 const struct bond *bond;
1099
1100 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1101 bond_print_details(&ds, bond);
1102 }
1103 }
1104
1105 unixctl_command_reply(conn, ds_cstr(&ds));
1106 ds_destroy(&ds);
1107
1108 out:
1109 ovs_rwlock_unlock(&rwlock);
1110 }
1111
1112 static void
1113 bond_unixctl_migrate(struct unixctl_conn *conn,
1114 int argc OVS_UNUSED, const char *argv[],
1115 void *aux OVS_UNUSED)
1116 {
1117 const char *bond_s = argv[1];
1118 const char *hash_s = argv[2];
1119 const char *slave_s = argv[3];
1120 struct bond *bond;
1121 struct bond_slave *slave;
1122 struct bond_entry *entry;
1123 int hash;
1124
1125 ovs_rwlock_wrlock(&rwlock);
1126 bond = bond_find(bond_s);
1127 if (!bond) {
1128 unixctl_command_reply_error(conn, "no such bond");
1129 goto out;
1130 }
1131
1132 if (bond->balance != BM_SLB) {
1133 unixctl_command_reply_error(conn, "not an SLB bond");
1134 goto out;
1135 }
1136
1137 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1138 hash = atoi(hash_s) & BOND_MASK;
1139 } else {
1140 unixctl_command_reply_error(conn, "bad hash");
1141 goto out;
1142 }
1143
1144 slave = bond_lookup_slave(bond, slave_s);
1145 if (!slave) {
1146 unixctl_command_reply_error(conn, "no such slave");
1147 goto out;
1148 }
1149
1150 if (!slave->enabled) {
1151 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1152 goto out;
1153 }
1154
1155 entry = &bond->hash[hash];
1156 bond->bond_revalidate = true;
1157 entry->slave = slave;
1158 unixctl_command_reply(conn, "migrated");
1159
1160 out:
1161 ovs_rwlock_unlock(&rwlock);
1162 }
1163
1164 static void
1165 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1166 int argc OVS_UNUSED, const char *argv[],
1167 void *aux OVS_UNUSED)
1168 {
1169 const char *bond_s = argv[1];
1170 const char *slave_s = argv[2];
1171 struct bond *bond;
1172 struct bond_slave *slave;
1173
1174 ovs_rwlock_wrlock(&rwlock);
1175 bond = bond_find(bond_s);
1176 if (!bond) {
1177 unixctl_command_reply_error(conn, "no such bond");
1178 goto out;
1179 }
1180
1181 slave = bond_lookup_slave(bond, slave_s);
1182 if (!slave) {
1183 unixctl_command_reply_error(conn, "no such slave");
1184 goto out;
1185 }
1186
1187 if (!slave->enabled) {
1188 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1189 goto out;
1190 }
1191
1192 if (bond->active_slave != slave) {
1193 bond->bond_revalidate = true;
1194 bond->active_slave = slave;
1195 VLOG_INFO("bond %s: active interface is now %s",
1196 bond->name, slave->name);
1197 bond->send_learning_packets = true;
1198 unixctl_command_reply(conn, "done");
1199 } else {
1200 unixctl_command_reply(conn, "no change");
1201 }
1202 out:
1203 ovs_rwlock_unlock(&rwlock);
1204 }
1205
1206 static void
1207 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1208 {
1209 const char *bond_s = argv[1];
1210 const char *slave_s = argv[2];
1211 struct bond *bond;
1212 struct bond_slave *slave;
1213
1214 ovs_rwlock_wrlock(&rwlock);
1215 bond = bond_find(bond_s);
1216 if (!bond) {
1217 unixctl_command_reply_error(conn, "no such bond");
1218 goto out;
1219 }
1220
1221 slave = bond_lookup_slave(bond, slave_s);
1222 if (!slave) {
1223 unixctl_command_reply_error(conn, "no such slave");
1224 goto out;
1225 }
1226
1227 bond_enable_slave(slave, enable);
1228 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1229
1230 out:
1231 ovs_rwlock_unlock(&rwlock);
1232 }
1233
1234 static void
1235 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1236 int argc OVS_UNUSED, const char *argv[],
1237 void *aux OVS_UNUSED)
1238 {
1239 enable_slave(conn, argv, true);
1240 }
1241
1242 static void
1243 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1244 int argc OVS_UNUSED, const char *argv[],
1245 void *aux OVS_UNUSED)
1246 {
1247 enable_slave(conn, argv, false);
1248 }
1249
1250 static void
1251 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1252 void *aux OVS_UNUSED)
1253 {
1254 const char *mac_s = argv[1];
1255 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1256 const char *basis_s = argc > 3 ? argv[3] : NULL;
1257 uint8_t mac[ETH_ADDR_LEN];
1258 uint8_t hash;
1259 char *hash_cstr;
1260 unsigned int vlan;
1261 uint32_t basis;
1262
1263 if (vlan_s) {
1264 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1265 unixctl_command_reply_error(conn, "invalid vlan");
1266 return;
1267 }
1268 } else {
1269 vlan = 0;
1270 }
1271
1272 if (basis_s) {
1273 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1274 unixctl_command_reply_error(conn, "invalid basis");
1275 return;
1276 }
1277 } else {
1278 basis = 0;
1279 }
1280
1281 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1282 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1283
1284 hash_cstr = xasprintf("%u", hash);
1285 unixctl_command_reply(conn, hash_cstr);
1286 free(hash_cstr);
1287 } else {
1288 unixctl_command_reply_error(conn, "invalid mac");
1289 }
1290 }
1291
1292 void
1293 bond_init(void)
1294 {
1295 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1296 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1297 NULL);
1298 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1299 bond_unixctl_migrate, NULL);
1300 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1301 bond_unixctl_set_active_slave, NULL);
1302 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1303 bond_unixctl_enable_slave, NULL);
1304 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1305 bond_unixctl_disable_slave, NULL);
1306 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1307 bond_unixctl_hash, NULL);
1308 }
1309 \f
1310 static void
1311 bond_entry_reset(struct bond *bond)
1312 {
1313 if (bond->balance != BM_AB) {
1314 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1315
1316 if (!bond->hash) {
1317 bond->hash = xmalloc(hash_len);
1318 }
1319 memset(bond->hash, 0, hash_len);
1320
1321 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1322 } else {
1323 free(bond->hash);
1324 bond->hash = NULL;
1325 }
1326 }
1327
1328 static struct bond_slave *
1329 bond_slave_lookup(struct bond *bond, const void *slave_)
1330 {
1331 struct bond_slave *slave;
1332
1333 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1334 &bond->slaves) {
1335 if (slave->aux == slave_) {
1336 return slave;
1337 }
1338 }
1339
1340 return NULL;
1341 }
1342
1343 static void
1344 bond_enable_slave(struct bond_slave *slave, bool enable)
1345 {
1346 slave->delay_expires = LLONG_MAX;
1347 if (enable != slave->enabled) {
1348 slave->bond->bond_revalidate = true;
1349 slave->enabled = enable;
1350 VLOG_INFO("interface %s: %s", slave->name,
1351 slave->enabled ? "enabled" : "disabled");
1352 }
1353 }
1354
1355 static void
1356 bond_link_status_update(struct bond_slave *slave)
1357 {
1358 struct bond *bond = slave->bond;
1359 bool up;
1360
1361 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1362 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1363 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1364 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1365 slave->name, up ? "up" : "down");
1366 if (up == slave->enabled) {
1367 slave->delay_expires = LLONG_MAX;
1368 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1369 slave->name, up ? "disabled" : "enabled");
1370 } else {
1371 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1372 : up ? bond->updelay : bond->downdelay);
1373 slave->delay_expires = time_msec() + delay;
1374 if (delay) {
1375 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1376 "for %d ms",
1377 slave->name,
1378 up ? "enabled" : "disabled",
1379 up ? "up" : "down",
1380 delay);
1381 }
1382 }
1383 }
1384
1385 if (time_msec() >= slave->delay_expires) {
1386 bond_enable_slave(slave, up);
1387 }
1388 }
1389
1390 static unsigned int
1391 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1392 {
1393 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
1394 }
1395
1396 static unsigned int
1397 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1398 {
1399 struct flow hash_flow = *flow;
1400 hash_flow.vlan_tci = htons(vlan);
1401
1402 /* The symmetric quality of this hash function is not required, but
1403 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1404 * purposes, so we use it out of convenience. */
1405 return flow_hash_symmetric_l4(&hash_flow, basis);
1406 }
1407
1408 static unsigned int
1409 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1410 {
1411 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1412
1413 return (bond->balance == BM_TCP
1414 ? bond_hash_tcp(flow, vlan, bond->basis)
1415 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1416 }
1417
1418 static struct bond_entry *
1419 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1420 uint16_t vlan)
1421 {
1422 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1423 }
1424
1425 static struct bond_slave *
1426 choose_output_slave(const struct bond *bond, const struct flow *flow,
1427 struct flow_wildcards *wc, uint16_t vlan)
1428 {
1429 struct bond_entry *e;
1430 int balance;
1431
1432 balance = bond->balance;
1433 if (bond->lacp_status == LACP_CONFIGURED) {
1434 /* LACP has been configured on this bond but negotiations were
1435 * unsuccussful. If lacp_fallback_ab is enabled use active-
1436 * backup mode else drop all traffic. */
1437 if (!bond->lacp_fallback_ab) {
1438 return NULL;
1439 }
1440 balance = BM_AB;
1441 }
1442
1443 switch (balance) {
1444 case BM_AB:
1445 return bond->active_slave;
1446
1447 case BM_TCP:
1448 if (bond->lacp_status != LACP_NEGOTIATED) {
1449 /* Must have LACP negotiations for TCP balanced bonds. */
1450 return NULL;
1451 }
1452 if (wc) {
1453 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1454 }
1455 /* Fall Through. */
1456 case BM_SLB:
1457 if (wc) {
1458 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1459 }
1460 e = lookup_bond_entry(bond, flow, vlan);
1461 if (!e->slave || !e->slave->enabled) {
1462 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1463 struct bond_slave, hmap_node);
1464 if (!e->slave->enabled) {
1465 e->slave = bond->active_slave;
1466 }
1467 }
1468 return e->slave;
1469
1470 default:
1471 NOT_REACHED();
1472 }
1473 }
1474
1475 static struct bond_slave *
1476 bond_choose_slave(const struct bond *bond)
1477 {
1478 struct bond_slave *slave, *best;
1479
1480 /* Find an enabled slave. */
1481 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1482 if (slave->enabled) {
1483 return slave;
1484 }
1485 }
1486
1487 /* All interfaces are disabled. Find an interface that will be enabled
1488 * after its updelay expires. */
1489 best = NULL;
1490 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1491 if (slave->delay_expires != LLONG_MAX
1492 && slave->may_enable
1493 && (!best || slave->delay_expires < best->delay_expires)) {
1494 best = slave;
1495 }
1496 }
1497 return best;
1498 }
1499
1500 static void
1501 bond_choose_active_slave(struct bond *bond)
1502 {
1503 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1504 struct bond_slave *old_active_slave = bond->active_slave;
1505
1506 bond->active_slave = bond_choose_slave(bond);
1507 if (bond->active_slave) {
1508 if (bond->active_slave->enabled) {
1509 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1510 bond->name, bond->active_slave->name);
1511 } else {
1512 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1513 "remaining %lld ms updelay (since no interface was "
1514 "enabled)", bond->name, bond->active_slave->name,
1515 bond->active_slave->delay_expires - time_msec());
1516 bond_enable_slave(bond->active_slave, true);
1517 }
1518
1519 bond->send_learning_packets = true;
1520 } else if (old_active_slave) {
1521 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1522 }
1523 }
1524
1525 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1526 * bond interface. */
1527 static void
1528 bond_update_fake_slave_stats(struct bond *bond)
1529 {
1530 struct netdev_stats bond_stats;
1531 struct bond_slave *slave;
1532 struct netdev *bond_dev;
1533
1534 memset(&bond_stats, 0, sizeof bond_stats);
1535
1536 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1537 struct netdev_stats slave_stats;
1538
1539 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1540 /* XXX: We swap the stats here because they are swapped back when
1541 * reported by the internal device. The reason for this is
1542 * internal devices normally represent packets going into the
1543 * system but when used as fake bond device they represent packets
1544 * leaving the system. We really should do this in the internal
1545 * device itself because changing it here reverses the counts from
1546 * the perspective of the switch. However, the internal device
1547 * doesn't know what type of device it represents so we have to do
1548 * it here for now. */
1549 bond_stats.tx_packets += slave_stats.rx_packets;
1550 bond_stats.tx_bytes += slave_stats.rx_bytes;
1551 bond_stats.rx_packets += slave_stats.tx_packets;
1552 bond_stats.rx_bytes += slave_stats.tx_bytes;
1553 }
1554 }
1555
1556 if (!netdev_open(bond->name, "system", &bond_dev)) {
1557 netdev_set_stats(bond_dev, &bond_stats);
1558 netdev_close(bond_dev);
1559 }
1560 }