]> git.proxmox.com Git - mirror_ovs.git/blob - ofproto/bond.c
lib: Add tpid parameter to eth_push_vlan()
[mirror_ovs.git] / ofproto / bond.c
1 /*
2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "bond.h"
20
21 #include <limits.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <math.h>
25
26 #include "connectivity.h"
27 #include "coverage.h"
28 #include "dynamic-string.h"
29 #include "flow.h"
30 #include "hmap.h"
31 #include "lacp.h"
32 #include "list.h"
33 #include "netdev.h"
34 #include "odp-util.h"
35 #include "ofpbuf.h"
36 #include "packets.h"
37 #include "poll-loop.h"
38 #include "seq.h"
39 #include "shash.h"
40 #include "timeval.h"
41 #include "unixctl.h"
42 #include "vlog.h"
43
44 VLOG_DEFINE_THIS_MODULE(bond);
45
46 /* Bit-mask for hashing a flow down to a bucket.
47 * There are (BOND_MASK + 1) buckets. */
48 #define BOND_MASK 0xff
49
50 /* A hash bucket for mapping a flow to a slave.
51 * "struct bond" has an array of (BOND_MASK + 1) of these. */
52 struct bond_entry {
53 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
54 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
55 struct list list_node; /* In bond_slave's 'entries' list. */
56 };
57
58 /* A bond slave, that is, one of the links comprising a bond. */
59 struct bond_slave {
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave. */
63
64 struct netdev *netdev; /* Network device, owned by the client. */
65 unsigned int change_seq; /* Tracks changes in 'netdev'. */
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
67
68 /* Link status. */
69 long long delay_expires; /* Time after which 'enabled' may change. */
70 bool enabled; /* May be chosen for flows? */
71 bool may_enable; /* Client considers this slave bondable. */
72
73 /* Rebalancing info. Used only by bond_rebalance(). */
74 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
75 struct list entries; /* 'struct bond_entry's assigned here. */
76 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
77 };
78
79 /* A bond, that is, a set of network devices grouped to improve performance or
80 * robustness. */
81 struct bond {
82 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
83 char *name; /* Name provided by client. */
84
85 /* Slaves. */
86 struct hmap slaves;
87
88 /* Bonding info. */
89 enum bond_mode balance; /* Balancing mode, one of BM_*. */
90 struct bond_slave *active_slave;
91 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
92 enum lacp_status lacp_status; /* Status of LACP negotiations. */
93 bool bond_revalidate; /* True if flows need revalidation. */
94 uint32_t basis; /* Basis for flow hash function. */
95
96 /* SLB specific bonding info. */
97 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
98 int rebalance_interval; /* Interval between rebalances, in ms. */
99 long long int next_rebalance; /* Next rebalancing time. */
100 bool send_learning_packets;
101
102 /* Legacy compatibility. */
103 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
104 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
105
106 struct ovs_refcount ref_cnt;
107 };
108
109 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
110 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
111 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
112
113 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
114 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
115 OVS_REQ_RDLOCK(rwlock);
116 static void bond_enable_slave(struct bond_slave *, bool enable)
117 OVS_REQ_WRLOCK(rwlock);
118 static void bond_link_status_update(struct bond_slave *)
119 OVS_REQ_WRLOCK(rwlock);
120 static void bond_choose_active_slave(struct bond *)
121 OVS_REQ_WRLOCK(rwlock);;
122 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
123 uint16_t vlan, uint32_t basis);
124 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
125 uint32_t basis);
126 static struct bond_entry *lookup_bond_entry(const struct bond *,
127 const struct flow *,
128 uint16_t vlan)
129 OVS_REQ_RDLOCK(rwlock);
130 static struct bond_slave *choose_output_slave(const struct bond *,
131 const struct flow *,
132 struct flow_wildcards *,
133 uint16_t vlan)
134 OVS_REQ_RDLOCK(rwlock);
135 static void bond_update_fake_slave_stats(struct bond *)
136 OVS_REQ_RDLOCK(rwlock);
137
138 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
139 * stores the mode in '*balance' and returns true. Otherwise returns false
140 * without modifying '*balance'. */
141 bool
142 bond_mode_from_string(enum bond_mode *balance, const char *s)
143 {
144 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
145 *balance = BM_TCP;
146 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
147 *balance = BM_SLB;
148 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
149 *balance = BM_AB;
150 } else {
151 return false;
152 }
153 return true;
154 }
155
156 /* Returns a string representing 'balance'. */
157 const char *
158 bond_mode_to_string(enum bond_mode balance) {
159 switch (balance) {
160 case BM_TCP:
161 return "balance-tcp";
162 case BM_SLB:
163 return "balance-slb";
164 case BM_AB:
165 return "active-backup";
166 }
167 OVS_NOT_REACHED();
168 }
169
170 \f
171 /* Creates and returns a new bond whose configuration is initially taken from
172 * 's'.
173 *
174 * The caller should register each slave on the new bond by calling
175 * bond_slave_register(). */
176 struct bond *
177 bond_create(const struct bond_settings *s)
178 {
179 struct bond *bond;
180
181 bond = xzalloc(sizeof *bond);
182 hmap_init(&bond->slaves);
183 bond->next_fake_iface_update = LLONG_MAX;
184 ovs_refcount_init(&bond->ref_cnt);
185
186 bond_reconfigure(bond, s);
187 return bond;
188 }
189
190 struct bond *
191 bond_ref(const struct bond *bond_)
192 {
193 struct bond *bond = CONST_CAST(struct bond *, bond_);
194
195 if (bond) {
196 ovs_refcount_ref(&bond->ref_cnt);
197 }
198 return bond;
199 }
200
201 /* Frees 'bond'. */
202 void
203 bond_unref(struct bond *bond)
204 {
205 struct bond_slave *slave, *next_slave;
206
207 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
208 return;
209 }
210
211 ovs_rwlock_wrlock(&rwlock);
212 hmap_remove(all_bonds, &bond->hmap_node);
213 ovs_rwlock_unlock(&rwlock);
214
215 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
216 hmap_remove(&bond->slaves, &slave->hmap_node);
217 /* Client owns 'slave->netdev'. */
218 free(slave->name);
219 free(slave);
220 }
221 hmap_destroy(&bond->slaves);
222
223 free(bond->hash);
224 free(bond->name);
225 ovs_refcount_destroy(&bond->ref_cnt);
226 free(bond);
227 }
228
229 /* Updates 'bond''s overall configuration to 's'.
230 *
231 * The caller should register each slave on 'bond' by calling
232 * bond_slave_register(). This is optional if none of the slaves'
233 * configuration has changed. In any case it can't hurt.
234 *
235 * Returns true if the configuration has changed in such a way that requires
236 * flow revalidation.
237 * */
238 bool
239 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
240 {
241 bool revalidate = false;
242
243 ovs_rwlock_wrlock(&rwlock);
244 if (!bond->name || strcmp(bond->name, s->name)) {
245 if (bond->name) {
246 hmap_remove(all_bonds, &bond->hmap_node);
247 free(bond->name);
248 }
249 bond->name = xstrdup(s->name);
250 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
251 }
252
253 bond->updelay = s->up_delay;
254 bond->downdelay = s->down_delay;
255
256 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
257 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
258 revalidate = true;
259 }
260
261 if (bond->rebalance_interval != s->rebalance_interval) {
262 bond->rebalance_interval = s->rebalance_interval;
263 revalidate = true;
264 }
265
266 if (bond->balance != s->balance) {
267 bond->balance = s->balance;
268 revalidate = true;
269 }
270
271 if (bond->basis != s->basis) {
272 bond->basis = s->basis;
273 revalidate = true;
274 }
275
276 if (s->fake_iface) {
277 if (bond->next_fake_iface_update == LLONG_MAX) {
278 bond->next_fake_iface_update = time_msec();
279 }
280 } else {
281 bond->next_fake_iface_update = LLONG_MAX;
282 }
283
284 if (bond->bond_revalidate) {
285 revalidate = true;
286 bond->bond_revalidate = false;
287 }
288
289 if (bond->balance == BM_AB || !bond->hash || revalidate) {
290 bond_entry_reset(bond);
291 }
292
293 ovs_rwlock_unlock(&rwlock);
294 return revalidate;
295 }
296
297 static void
298 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
299 OVS_REQ_WRLOCK(rwlock)
300 {
301 if (slave->netdev != netdev) {
302 slave->netdev = netdev;
303 slave->change_seq = 0;
304 }
305 }
306
307 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
308 * arbitrary client-provided pointer that uniquely identifies a slave within a
309 * bond. If 'slave_' already exists within 'bond' then this function
310 * reconfigures the existing slave.
311 *
312 * 'netdev' must be the network device that 'slave_' represents. It is owned
313 * by the client, so the client must not close it before either unregistering
314 * 'slave_' or destroying 'bond'.
315 */
316 void
317 bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
318 {
319 struct bond_slave *slave;
320
321 ovs_rwlock_wrlock(&rwlock);
322 slave = bond_slave_lookup(bond, slave_);
323 if (!slave) {
324 slave = xzalloc(sizeof *slave);
325
326 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
327 slave->bond = bond;
328 slave->aux = slave_;
329 slave->delay_expires = LLONG_MAX;
330 slave->name = xstrdup(netdev_get_name(netdev));
331 bond->bond_revalidate = true;
332
333 slave->enabled = false;
334 bond_enable_slave(slave, netdev_get_carrier(netdev));
335 }
336
337 bond_slave_set_netdev__(slave, netdev);
338
339 free(slave->name);
340 slave->name = xstrdup(netdev_get_name(netdev));
341 ovs_rwlock_unlock(&rwlock);
342 }
343
344 /* Updates the network device to be used with 'slave_' to 'netdev'.
345 *
346 * This is useful if the caller closes and re-opens the network device
347 * registered with bond_slave_register() but doesn't need to change anything
348 * else. */
349 void
350 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
351 {
352 struct bond_slave *slave;
353
354 ovs_rwlock_wrlock(&rwlock);
355 slave = bond_slave_lookup(bond, slave_);
356 if (slave) {
357 bond_slave_set_netdev__(slave, netdev);
358 }
359 ovs_rwlock_unlock(&rwlock);
360 }
361
362 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
363 * then this function has no effect.
364 *
365 * Unregistering a slave invalidates all flows. */
366 void
367 bond_slave_unregister(struct bond *bond, const void *slave_)
368 {
369 struct bond_slave *slave;
370 bool del_active;
371
372 ovs_rwlock_wrlock(&rwlock);
373 slave = bond_slave_lookup(bond, slave_);
374 if (!slave) {
375 goto out;
376 }
377
378 bond->bond_revalidate = true;
379 bond_enable_slave(slave, false);
380
381 del_active = bond->active_slave == slave;
382 if (bond->hash) {
383 struct bond_entry *e;
384 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
385 if (e->slave == slave) {
386 e->slave = NULL;
387 }
388 }
389 }
390
391 free(slave->name);
392
393 hmap_remove(&bond->slaves, &slave->hmap_node);
394 /* Client owns 'slave->netdev'. */
395 free(slave);
396
397 if (del_active) {
398 bond_choose_active_slave(bond);
399 bond->send_learning_packets = true;
400 }
401 out:
402 ovs_rwlock_unlock(&rwlock);
403 }
404
405 /* Should be called on each slave in 'bond' before bond_run() to indicate
406 * whether or not 'slave_' may be enabled. This function is intended to allow
407 * other protocols to have some impact on bonding decisions. For example LACP
408 * or high level link monitoring protocols may decide that a given slave should
409 * not be able to send traffic. */
410 void
411 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
412 {
413 ovs_rwlock_wrlock(&rwlock);
414 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
415 ovs_rwlock_unlock(&rwlock);
416 }
417
418 /* Performs periodic maintenance on 'bond'.
419 *
420 * Returns true if the caller should revalidate its flows.
421 *
422 * The caller should check bond_should_send_learning_packets() afterward. */
423 bool
424 bond_run(struct bond *bond, enum lacp_status lacp_status)
425 {
426 struct bond_slave *slave;
427 bool revalidate;
428
429 ovs_rwlock_wrlock(&rwlock);
430 if (bond->lacp_status != lacp_status) {
431 bond->lacp_status = lacp_status;
432 bond->bond_revalidate = true;
433 }
434
435 /* Enable slaves based on link status and LACP feedback. */
436 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
437 bond_link_status_update(slave);
438 slave->change_seq = seq_read(connectivity_seq_get());
439 }
440 if (!bond->active_slave || !bond->active_slave->enabled) {
441 bond_choose_active_slave(bond);
442 }
443
444 /* Update fake bond interface stats. */
445 if (time_msec() >= bond->next_fake_iface_update) {
446 bond_update_fake_slave_stats(bond);
447 bond->next_fake_iface_update = time_msec() + 1000;
448 }
449
450 revalidate = bond->bond_revalidate;
451 bond->bond_revalidate = false;
452 ovs_rwlock_unlock(&rwlock);
453
454 return revalidate;
455 }
456
457 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
458 void
459 bond_wait(struct bond *bond)
460 {
461 struct bond_slave *slave;
462
463 ovs_rwlock_rdlock(&rwlock);
464 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
465 if (slave->delay_expires != LLONG_MAX) {
466 poll_timer_wait_until(slave->delay_expires);
467 }
468
469 seq_wait(connectivity_seq_get(), slave->change_seq);
470 }
471
472 if (bond->next_fake_iface_update != LLONG_MAX) {
473 poll_timer_wait_until(bond->next_fake_iface_update);
474 }
475
476 if (bond->bond_revalidate) {
477 poll_immediate_wake();
478 }
479 ovs_rwlock_unlock(&rwlock);
480
481 /* We don't wait for bond->next_rebalance because rebalancing can only run
482 * at a flow account checkpoint. ofproto does checkpointing on its own
483 * schedule and bond_rebalance() gets called afterward, so we'd just be
484 * waking up for no purpose. */
485 }
486 \f
487 /* MAC learning table interaction. */
488
489 static bool
490 may_send_learning_packets(const struct bond *bond)
491 {
492 return ((bond->lacp_status == LACP_DISABLED
493 && (bond->balance == BM_SLB || bond->balance == BM_AB))
494 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
495 && bond->active_slave;
496 }
497
498 /* Returns true if 'bond' needs the client to send out packets to assist with
499 * MAC learning on 'bond'. If this function returns true, then the client
500 * should iterate through its MAC learning table for the bridge on which 'bond'
501 * is located. For each MAC that has been learned on a port other than 'bond',
502 * it should call bond_compose_learning_packet().
503 *
504 * This function will only return true if 'bond' is in SLB or active-backup
505 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
506 * necessary.
507 *
508 * Calling this function resets the state that it checks. */
509 bool
510 bond_should_send_learning_packets(struct bond *bond)
511 {
512 bool send;
513
514 ovs_rwlock_wrlock(&rwlock);
515 send = bond->send_learning_packets && may_send_learning_packets(bond);
516 bond->send_learning_packets = false;
517 ovs_rwlock_unlock(&rwlock);
518 return send;
519 }
520
521 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
522 *
523 * See bond_should_send_learning_packets() for description of usage. The
524 * caller should send the composed packet on the port associated with
525 * port_aux and takes ownership of the returned ofpbuf. */
526 struct ofpbuf *
527 bond_compose_learning_packet(struct bond *bond,
528 const uint8_t eth_src[ETH_ADDR_LEN],
529 uint16_t vlan, void **port_aux)
530 {
531 struct bond_slave *slave;
532 struct ofpbuf *packet;
533 struct flow flow;
534
535 ovs_rwlock_rdlock(&rwlock);
536 ovs_assert(may_send_learning_packets(bond));
537 memset(&flow, 0, sizeof flow);
538 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
539 slave = choose_output_slave(bond, &flow, NULL, vlan);
540
541 packet = ofpbuf_new(0);
542 compose_rarp(packet, eth_src);
543 if (vlan) {
544 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
545 }
546
547 *port_aux = slave->aux;
548 ovs_rwlock_unlock(&rwlock);
549 return packet;
550 }
551 \f
552 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
553 * Ethernet destination address of 'eth_dst', should be admitted.
554 *
555 * The return value is one of the following:
556 *
557 * - BV_ACCEPT: Admit the packet.
558 *
559 * - BV_DROP: Drop the packet.
560 *
561 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
562 * Ethernet source address and VLAN. If there is none, or if the packet
563 * is on the learned port, then admit the packet. If a different port has
564 * been learned, however, drop the packet (and do not use it for MAC
565 * learning).
566 */
567 enum bond_verdict
568 bond_check_admissibility(struct bond *bond, const void *slave_,
569 const uint8_t eth_dst[ETH_ADDR_LEN])
570 {
571 enum bond_verdict verdict = BV_DROP;
572 struct bond_slave *slave;
573
574 ovs_rwlock_rdlock(&rwlock);
575 slave = bond_slave_lookup(bond, slave_);
576 if (!slave) {
577 goto out;
578 }
579
580 /* LACP bonds have very loose admissibility restrictions because we can
581 * assume the remote switch is aware of the bond and will "do the right
582 * thing". However, as a precaution we drop packets on disabled slaves
583 * because no correctly implemented partner switch should be sending
584 * packets to them.
585 *
586 * If LACP is configured, but LACP negotiations have been unsuccessful, we
587 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
588 switch (bond->lacp_status) {
589 case LACP_NEGOTIATED:
590 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
591 goto out;
592 case LACP_CONFIGURED:
593 if (!bond->lacp_fallback_ab) {
594 goto out;
595 }
596 case LACP_DISABLED:
597 break;
598 }
599
600 /* Drop all multicast packets on inactive slaves. */
601 if (eth_addr_is_multicast(eth_dst)) {
602 if (bond->active_slave != slave) {
603 goto out;
604 }
605 }
606
607 switch (bond->balance) {
608 case BM_TCP:
609 /* TCP balanced bonds require successful LACP negotiations. Based on the
610 * above check, LACP is off or lacp_fallback_ab is true on this bond.
611 * If lacp_fallback_ab is true fall through to BM_AB case else, we
612 * drop all incoming traffic. */
613 if (!bond->lacp_fallback_ab) {
614 goto out;
615 }
616
617 case BM_AB:
618 /* Drop all packets which arrive on backup slaves. This is similar to
619 * how Linux bonding handles active-backup bonds. */
620 if (bond->active_slave != slave) {
621 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
622
623 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
624 " slave (%s) destined for " ETH_ADDR_FMT,
625 slave->name, ETH_ADDR_ARGS(eth_dst));
626 goto out;
627 }
628 verdict = BV_ACCEPT;
629 goto out;
630
631 case BM_SLB:
632 /* Drop all packets for which we have learned a different input port,
633 * because we probably sent the packet on one slave and got it back on
634 * the other. Gratuitous ARP packets are an exception to this rule:
635 * the host has moved to another switch. The exception to the
636 * exception is if we locked the learning table to avoid reflections on
637 * bond slaves. */
638 verdict = BV_DROP_IF_MOVED;
639 goto out;
640 }
641
642 OVS_NOT_REACHED();
643 out:
644 ovs_rwlock_unlock(&rwlock);
645 return verdict;
646
647 }
648
649 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
650 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
651 * NULL if the packet should be dropped because no slaves are enabled.
652 *
653 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
654 * should be a VID only (i.e. excluding the PCP bits). Second,
655 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
656 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
657 * packet belongs to (so for an access port it will be the access port's VLAN).
658 *
659 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
660 * significant in the selection. At some point earlier, 'wc' should
661 * have been initialized (e.g., by flow_wildcards_init_catchall()).
662 */
663 void *
664 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
665 struct flow_wildcards *wc, uint16_t vlan)
666 {
667 struct bond_slave *slave;
668 void *aux;
669
670 ovs_rwlock_rdlock(&rwlock);
671 slave = choose_output_slave(bond, flow, wc, vlan);
672 aux = slave ? slave->aux : NULL;
673 ovs_rwlock_unlock(&rwlock);
674
675 return aux;
676 }
677 \f
678 /* Rebalancing. */
679
680 static bool
681 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
682 {
683 return bond->rebalance_interval
684 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
685 }
686
687 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
688 void
689 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
690 uint64_t n_bytes)
691 {
692 ovs_rwlock_wrlock(&rwlock);
693 if (bond_is_balanced(bond)) {
694 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
695 }
696 ovs_rwlock_unlock(&rwlock);
697 }
698
699 static struct bond_slave *
700 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
701 {
702 return CONTAINER_OF(bal, struct bond_slave, bal_node);
703 }
704
705 static void
706 log_bals(struct bond *bond, const struct list *bals)
707 {
708 if (VLOG_IS_DBG_ENABLED()) {
709 struct ds ds = DS_EMPTY_INITIALIZER;
710 const struct bond_slave *slave;
711
712 LIST_FOR_EACH (slave, bal_node, bals) {
713 if (ds.length) {
714 ds_put_char(&ds, ',');
715 }
716 ds_put_format(&ds, " %s %"PRIu64"kB",
717 slave->name, slave->tx_bytes / 1024);
718
719 if (!slave->enabled) {
720 ds_put_cstr(&ds, " (disabled)");
721 }
722 if (!list_is_empty(&slave->entries)) {
723 struct bond_entry *e;
724
725 ds_put_cstr(&ds, " (");
726 LIST_FOR_EACH (e, list_node, &slave->entries) {
727 if (&e->list_node != list_front(&slave->entries)) {
728 ds_put_cstr(&ds, " + ");
729 }
730 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
731 e - bond->hash, e->tx_bytes / 1024);
732 }
733 ds_put_cstr(&ds, ")");
734 }
735 }
736 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
737 ds_destroy(&ds);
738 }
739 }
740
741 /* Shifts 'hash' from its current slave to 'to'. */
742 static void
743 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
744 {
745 struct bond_slave *from = hash->slave;
746 struct bond *bond = from->bond;
747 uint64_t delta = hash->tx_bytes;
748
749 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
750 "from %s to %s (now carrying %"PRIu64"kB and "
751 "%"PRIu64"kB load, respectively)",
752 bond->name, delta / 1024, hash - bond->hash,
753 from->name, to->name,
754 (from->tx_bytes - delta) / 1024,
755 (to->tx_bytes + delta) / 1024);
756
757 /* Shift load away from 'from' to 'to'. */
758 from->tx_bytes -= delta;
759 to->tx_bytes += delta;
760
761 /* Arrange for flows to be revalidated. */
762 hash->slave = to;
763 bond->bond_revalidate = true;
764 }
765
766 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
767 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
768 * given that doing so must decrease the ratio of the load on the two slaves by
769 * at least 0.1. Returns NULL if there is no appropriate entry.
770 *
771 * The list of entries isn't sorted. I don't know of a reason to prefer to
772 * shift away small hashes or large hashes. */
773 static struct bond_entry *
774 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
775 {
776 struct bond_entry *e;
777
778 if (list_is_short(&from->entries)) {
779 /* 'from' carries no more than one MAC hash, so shifting load away from
780 * it would be pointless. */
781 return NULL;
782 }
783
784 LIST_FOR_EACH (e, list_node, &from->entries) {
785 double old_ratio, new_ratio;
786 uint64_t delta;
787
788 if (to_tx_bytes == 0) {
789 /* Nothing on the new slave, move it. */
790 return e;
791 }
792
793 delta = e->tx_bytes;
794 old_ratio = (double)from->tx_bytes / to_tx_bytes;
795 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
796 if (old_ratio - new_ratio > 0.1
797 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
798 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
799 and 'to' slave have the same load. Therefore, we only move an
800 entry if it decreases the load on 'from', and brings us closer
801 to equal traffic load. */
802 return e;
803 }
804 }
805
806 return NULL;
807 }
808
809 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
810 * maintained. */
811 static void
812 insert_bal(struct list *bals, struct bond_slave *slave)
813 {
814 struct bond_slave *pos;
815
816 LIST_FOR_EACH (pos, bal_node, bals) {
817 if (slave->tx_bytes > pos->tx_bytes) {
818 break;
819 }
820 }
821 list_insert(&pos->bal_node, &slave->bal_node);
822 }
823
824 /* Removes 'slave' from its current list and then inserts it into 'bals' so
825 * that descending order of 'tx_bytes' is maintained. */
826 static void
827 reinsert_bal(struct list *bals, struct bond_slave *slave)
828 {
829 list_remove(&slave->bal_node);
830 insert_bal(bals, slave);
831 }
832
833 /* If 'bond' needs rebalancing, does so.
834 *
835 * The caller should have called bond_account() for each active flow, to ensure
836 * that flow data is consistently accounted at this point. */
837 void
838 bond_rebalance(struct bond *bond)
839 {
840 struct bond_slave *slave;
841 struct bond_entry *e;
842 struct list bals;
843
844 ovs_rwlock_wrlock(&rwlock);
845 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
846 ovs_rwlock_unlock(&rwlock);
847 return;
848 }
849 bond->next_rebalance = time_msec() + bond->rebalance_interval;
850
851 /* Add each bond_entry to its slave's 'entries' list.
852 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
853 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
854 slave->tx_bytes = 0;
855 list_init(&slave->entries);
856 }
857 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
858 if (e->slave && e->tx_bytes) {
859 e->slave->tx_bytes += e->tx_bytes;
860 list_push_back(&e->slave->entries, &e->list_node);
861 }
862 }
863
864 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
865 *
866 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
867 * with a proper list sort algorithm. */
868 list_init(&bals);
869 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
870 if (slave->enabled) {
871 insert_bal(&bals, slave);
872 }
873 }
874 log_bals(bond, &bals);
875
876 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
877 while (!list_is_short(&bals)) {
878 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
879 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
880 uint64_t overload;
881
882 overload = from->tx_bytes - to->tx_bytes;
883 if (overload < to->tx_bytes >> 5 || overload < 100000) {
884 /* The extra load on 'from' (and all less-loaded slaves), compared
885 * to that of 'to' (the least-loaded slave), is less than ~3%, or
886 * it is less than ~1Mbps. No point in rebalancing. */
887 break;
888 }
889
890 /* 'from' is carrying significantly more load than 'to'. Pick a hash
891 * to move from 'from' to 'to'. */
892 e = choose_entry_to_migrate(from, to->tx_bytes);
893 if (e) {
894 bond_shift_load(e, to);
895
896 /* Delete element from from->entries.
897 *
898 * We don't add the element to to->hashes. That would only allow
899 * 'e' to be migrated to another slave in this rebalancing run, and
900 * there is no point in doing that. */
901 list_remove(&e->list_node);
902
903 /* Re-sort 'bals'. */
904 reinsert_bal(&bals, from);
905 reinsert_bal(&bals, to);
906 } else {
907 /* Can't usefully migrate anything away from 'from'.
908 * Don't reconsider it. */
909 list_remove(&from->bal_node);
910 }
911 }
912
913 /* Implement exponentially weighted moving average. A weight of 1/2 causes
914 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
915 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
916 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
917 e->tx_bytes /= 2;
918 if (!e->tx_bytes) {
919 e->slave = NULL;
920 }
921 }
922 ovs_rwlock_unlock(&rwlock);
923 }
924 \f
925 /* Bonding unixctl user interface functions. */
926
927 static struct bond *
928 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
929 {
930 struct bond *bond;
931
932 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
933 all_bonds) {
934 if (!strcmp(bond->name, name)) {
935 return bond;
936 }
937 }
938 return NULL;
939 }
940
941 static struct bond_slave *
942 bond_lookup_slave(struct bond *bond, const char *slave_name)
943 {
944 struct bond_slave *slave;
945
946 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
947 if (!strcmp(slave->name, slave_name)) {
948 return slave;
949 }
950 }
951 return NULL;
952 }
953
954 static void
955 bond_unixctl_list(struct unixctl_conn *conn,
956 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
957 void *aux OVS_UNUSED)
958 {
959 struct ds ds = DS_EMPTY_INITIALIZER;
960 const struct bond *bond;
961
962 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
963
964 ovs_rwlock_rdlock(&rwlock);
965 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
966 const struct bond_slave *slave;
967 size_t i;
968
969 ds_put_format(&ds, "%s\t%s\t",
970 bond->name, bond_mode_to_string(bond->balance));
971
972 i = 0;
973 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
974 if (i++ > 0) {
975 ds_put_cstr(&ds, ", ");
976 }
977 ds_put_cstr(&ds, slave->name);
978 }
979 ds_put_char(&ds, '\n');
980 }
981 ovs_rwlock_unlock(&rwlock);
982 unixctl_command_reply(conn, ds_cstr(&ds));
983 ds_destroy(&ds);
984 }
985
986 static void
987 bond_print_details(struct ds *ds, const struct bond *bond)
988 OVS_REQ_RDLOCK(rwlock)
989 {
990 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
991 const struct shash_node **sorted_slaves = NULL;
992 const struct bond_slave *slave;
993 int i;
994
995 ds_put_format(ds, "---- %s ----\n", bond->name);
996 ds_put_format(ds, "bond_mode: %s\n",
997 bond_mode_to_string(bond->balance));
998
999 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1000
1001 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1002 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1003
1004 if (bond_is_balanced(bond)) {
1005 ds_put_format(ds, "next rebalance: %lld ms\n",
1006 bond->next_rebalance - time_msec());
1007 }
1008
1009 ds_put_cstr(ds, "lacp_status: ");
1010 switch (bond->lacp_status) {
1011 case LACP_NEGOTIATED:
1012 ds_put_cstr(ds, "negotiated\n");
1013 break;
1014 case LACP_CONFIGURED:
1015 ds_put_cstr(ds, "configured\n");
1016 break;
1017 case LACP_DISABLED:
1018 ds_put_cstr(ds, "off\n");
1019 break;
1020 default:
1021 ds_put_cstr(ds, "<unknown>\n");
1022 break;
1023 }
1024
1025 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1026 shash_add(&slave_shash, slave->name, slave);
1027 }
1028 sorted_slaves = shash_sort(&slave_shash);
1029
1030 for (i = 0; i < shash_count(&slave_shash); i++) {
1031 struct bond_entry *be;
1032
1033 slave = sorted_slaves[i]->data;
1034
1035 /* Basic info. */
1036 ds_put_format(ds, "\nslave %s: %s\n",
1037 slave->name, slave->enabled ? "enabled" : "disabled");
1038 if (slave == bond->active_slave) {
1039 ds_put_cstr(ds, "\tactive slave\n");
1040 }
1041 if (slave->delay_expires != LLONG_MAX) {
1042 ds_put_format(ds, "\t%s expires in %lld ms\n",
1043 slave->enabled ? "downdelay" : "updelay",
1044 slave->delay_expires - time_msec());
1045 }
1046
1047 ds_put_format(ds, "\tmay_enable: %s\n",
1048 slave->may_enable ? "true" : "false");
1049
1050 if (!bond_is_balanced(bond)) {
1051 continue;
1052 }
1053
1054 /* Hashes. */
1055 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1056 int hash = be - bond->hash;
1057
1058 if (be->slave != slave) {
1059 continue;
1060 }
1061
1062 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1063 hash, be->tx_bytes / 1024);
1064
1065 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1066 }
1067 }
1068 shash_destroy(&slave_shash);
1069 free(sorted_slaves);
1070 ds_put_cstr(ds, "\n");
1071 }
1072
1073 static void
1074 bond_unixctl_show(struct unixctl_conn *conn,
1075 int argc, const char *argv[],
1076 void *aux OVS_UNUSED)
1077 {
1078 struct ds ds = DS_EMPTY_INITIALIZER;
1079
1080 ovs_rwlock_rdlock(&rwlock);
1081 if (argc > 1) {
1082 const struct bond *bond = bond_find(argv[1]);
1083
1084 if (!bond) {
1085 unixctl_command_reply_error(conn, "no such bond");
1086 goto out;
1087 }
1088 bond_print_details(&ds, bond);
1089 } else {
1090 const struct bond *bond;
1091
1092 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1093 bond_print_details(&ds, bond);
1094 }
1095 }
1096
1097 unixctl_command_reply(conn, ds_cstr(&ds));
1098 ds_destroy(&ds);
1099
1100 out:
1101 ovs_rwlock_unlock(&rwlock);
1102 }
1103
1104 static void
1105 bond_unixctl_migrate(struct unixctl_conn *conn,
1106 int argc OVS_UNUSED, const char *argv[],
1107 void *aux OVS_UNUSED)
1108 {
1109 const char *bond_s = argv[1];
1110 const char *hash_s = argv[2];
1111 const char *slave_s = argv[3];
1112 struct bond *bond;
1113 struct bond_slave *slave;
1114 struct bond_entry *entry;
1115 int hash;
1116
1117 ovs_rwlock_wrlock(&rwlock);
1118 bond = bond_find(bond_s);
1119 if (!bond) {
1120 unixctl_command_reply_error(conn, "no such bond");
1121 goto out;
1122 }
1123
1124 if (bond->balance != BM_SLB) {
1125 unixctl_command_reply_error(conn, "not an SLB bond");
1126 goto out;
1127 }
1128
1129 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1130 hash = atoi(hash_s) & BOND_MASK;
1131 } else {
1132 unixctl_command_reply_error(conn, "bad hash");
1133 goto out;
1134 }
1135
1136 slave = bond_lookup_slave(bond, slave_s);
1137 if (!slave) {
1138 unixctl_command_reply_error(conn, "no such slave");
1139 goto out;
1140 }
1141
1142 if (!slave->enabled) {
1143 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1144 goto out;
1145 }
1146
1147 entry = &bond->hash[hash];
1148 bond->bond_revalidate = true;
1149 entry->slave = slave;
1150 unixctl_command_reply(conn, "migrated");
1151
1152 out:
1153 ovs_rwlock_unlock(&rwlock);
1154 }
1155
1156 static void
1157 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1158 int argc OVS_UNUSED, const char *argv[],
1159 void *aux OVS_UNUSED)
1160 {
1161 const char *bond_s = argv[1];
1162 const char *slave_s = argv[2];
1163 struct bond *bond;
1164 struct bond_slave *slave;
1165
1166 ovs_rwlock_wrlock(&rwlock);
1167 bond = bond_find(bond_s);
1168 if (!bond) {
1169 unixctl_command_reply_error(conn, "no such bond");
1170 goto out;
1171 }
1172
1173 slave = bond_lookup_slave(bond, slave_s);
1174 if (!slave) {
1175 unixctl_command_reply_error(conn, "no such slave");
1176 goto out;
1177 }
1178
1179 if (!slave->enabled) {
1180 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1181 goto out;
1182 }
1183
1184 if (bond->active_slave != slave) {
1185 bond->bond_revalidate = true;
1186 bond->active_slave = slave;
1187 VLOG_INFO("bond %s: active interface is now %s",
1188 bond->name, slave->name);
1189 bond->send_learning_packets = true;
1190 unixctl_command_reply(conn, "done");
1191 } else {
1192 unixctl_command_reply(conn, "no change");
1193 }
1194 out:
1195 ovs_rwlock_unlock(&rwlock);
1196 }
1197
1198 static void
1199 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1200 {
1201 const char *bond_s = argv[1];
1202 const char *slave_s = argv[2];
1203 struct bond *bond;
1204 struct bond_slave *slave;
1205
1206 ovs_rwlock_wrlock(&rwlock);
1207 bond = bond_find(bond_s);
1208 if (!bond) {
1209 unixctl_command_reply_error(conn, "no such bond");
1210 goto out;
1211 }
1212
1213 slave = bond_lookup_slave(bond, slave_s);
1214 if (!slave) {
1215 unixctl_command_reply_error(conn, "no such slave");
1216 goto out;
1217 }
1218
1219 bond_enable_slave(slave, enable);
1220 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1221
1222 out:
1223 ovs_rwlock_unlock(&rwlock);
1224 }
1225
1226 static void
1227 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1228 int argc OVS_UNUSED, const char *argv[],
1229 void *aux OVS_UNUSED)
1230 {
1231 enable_slave(conn, argv, true);
1232 }
1233
1234 static void
1235 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1236 int argc OVS_UNUSED, const char *argv[],
1237 void *aux OVS_UNUSED)
1238 {
1239 enable_slave(conn, argv, false);
1240 }
1241
1242 static void
1243 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1244 void *aux OVS_UNUSED)
1245 {
1246 const char *mac_s = argv[1];
1247 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1248 const char *basis_s = argc > 3 ? argv[3] : NULL;
1249 uint8_t mac[ETH_ADDR_LEN];
1250 uint8_t hash;
1251 char *hash_cstr;
1252 unsigned int vlan;
1253 uint32_t basis;
1254
1255 if (vlan_s) {
1256 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1257 unixctl_command_reply_error(conn, "invalid vlan");
1258 return;
1259 }
1260 } else {
1261 vlan = 0;
1262 }
1263
1264 if (basis_s) {
1265 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1266 unixctl_command_reply_error(conn, "invalid basis");
1267 return;
1268 }
1269 } else {
1270 basis = 0;
1271 }
1272
1273 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1274 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1275
1276 hash_cstr = xasprintf("%u", hash);
1277 unixctl_command_reply(conn, hash_cstr);
1278 free(hash_cstr);
1279 } else {
1280 unixctl_command_reply_error(conn, "invalid mac");
1281 }
1282 }
1283
1284 void
1285 bond_init(void)
1286 {
1287 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1288 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1289 NULL);
1290 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1291 bond_unixctl_migrate, NULL);
1292 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1293 bond_unixctl_set_active_slave, NULL);
1294 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1295 bond_unixctl_enable_slave, NULL);
1296 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1297 bond_unixctl_disable_slave, NULL);
1298 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1299 bond_unixctl_hash, NULL);
1300 }
1301 \f
1302 static void
1303 bond_entry_reset(struct bond *bond)
1304 {
1305 if (bond->balance != BM_AB) {
1306 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1307
1308 if (!bond->hash) {
1309 bond->hash = xmalloc(hash_len);
1310 }
1311 memset(bond->hash, 0, hash_len);
1312
1313 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1314 } else {
1315 free(bond->hash);
1316 bond->hash = NULL;
1317 }
1318 }
1319
1320 static struct bond_slave *
1321 bond_slave_lookup(struct bond *bond, const void *slave_)
1322 {
1323 struct bond_slave *slave;
1324
1325 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1326 &bond->slaves) {
1327 if (slave->aux == slave_) {
1328 return slave;
1329 }
1330 }
1331
1332 return NULL;
1333 }
1334
1335 static void
1336 bond_enable_slave(struct bond_slave *slave, bool enable)
1337 {
1338 slave->delay_expires = LLONG_MAX;
1339 if (enable != slave->enabled) {
1340 slave->bond->bond_revalidate = true;
1341 slave->enabled = enable;
1342 VLOG_INFO("interface %s: %s", slave->name,
1343 slave->enabled ? "enabled" : "disabled");
1344 }
1345 }
1346
1347 static void
1348 bond_link_status_update(struct bond_slave *slave)
1349 {
1350 struct bond *bond = slave->bond;
1351 bool up;
1352
1353 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1354 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1355 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1356 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1357 slave->name, up ? "up" : "down");
1358 if (up == slave->enabled) {
1359 slave->delay_expires = LLONG_MAX;
1360 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1361 slave->name, up ? "disabled" : "enabled");
1362 } else {
1363 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1364 : up ? bond->updelay : bond->downdelay);
1365 slave->delay_expires = time_msec() + delay;
1366 if (delay) {
1367 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1368 "for %d ms",
1369 slave->name,
1370 up ? "enabled" : "disabled",
1371 up ? "up" : "down",
1372 delay);
1373 }
1374 }
1375 }
1376
1377 if (time_msec() >= slave->delay_expires) {
1378 bond_enable_slave(slave, up);
1379 }
1380 }
1381
1382 static unsigned int
1383 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1384 {
1385 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
1386 }
1387
1388 static unsigned int
1389 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1390 {
1391 struct flow hash_flow = *flow;
1392 hash_flow.vlan_tci = htons(vlan);
1393
1394 /* The symmetric quality of this hash function is not required, but
1395 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1396 * purposes, so we use it out of convenience. */
1397 return flow_hash_symmetric_l4(&hash_flow, basis);
1398 }
1399
1400 static unsigned int
1401 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1402 {
1403 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1404
1405 return (bond->balance == BM_TCP
1406 ? bond_hash_tcp(flow, vlan, bond->basis)
1407 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1408 }
1409
1410 static struct bond_entry *
1411 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1412 uint16_t vlan)
1413 {
1414 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1415 }
1416
1417 static struct bond_slave *
1418 choose_output_slave(const struct bond *bond, const struct flow *flow,
1419 struct flow_wildcards *wc, uint16_t vlan)
1420 {
1421 struct bond_entry *e;
1422 int balance;
1423
1424 balance = bond->balance;
1425 if (bond->lacp_status == LACP_CONFIGURED) {
1426 /* LACP has been configured on this bond but negotiations were
1427 * unsuccussful. If lacp_fallback_ab is enabled use active-
1428 * backup mode else drop all traffic. */
1429 if (!bond->lacp_fallback_ab) {
1430 return NULL;
1431 }
1432 balance = BM_AB;
1433 }
1434
1435 switch (balance) {
1436 case BM_AB:
1437 return bond->active_slave;
1438
1439 case BM_TCP:
1440 if (bond->lacp_status != LACP_NEGOTIATED) {
1441 /* Must have LACP negotiations for TCP balanced bonds. */
1442 return NULL;
1443 }
1444 if (wc) {
1445 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1446 }
1447 /* Fall Through. */
1448 case BM_SLB:
1449 if (wc) {
1450 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1451 }
1452 e = lookup_bond_entry(bond, flow, vlan);
1453 if (!e->slave || !e->slave->enabled) {
1454 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1455 struct bond_slave, hmap_node);
1456 if (!e->slave->enabled) {
1457 e->slave = bond->active_slave;
1458 }
1459 }
1460 return e->slave;
1461
1462 default:
1463 OVS_NOT_REACHED();
1464 }
1465 }
1466
1467 static struct bond_slave *
1468 bond_choose_slave(const struct bond *bond)
1469 {
1470 struct bond_slave *slave, *best;
1471
1472 /* Find an enabled slave. */
1473 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1474 if (slave->enabled) {
1475 return slave;
1476 }
1477 }
1478
1479 /* All interfaces are disabled. Find an interface that will be enabled
1480 * after its updelay expires. */
1481 best = NULL;
1482 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1483 if (slave->delay_expires != LLONG_MAX
1484 && slave->may_enable
1485 && (!best || slave->delay_expires < best->delay_expires)) {
1486 best = slave;
1487 }
1488 }
1489 return best;
1490 }
1491
1492 static void
1493 bond_choose_active_slave(struct bond *bond)
1494 {
1495 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1496 struct bond_slave *old_active_slave = bond->active_slave;
1497
1498 bond->active_slave = bond_choose_slave(bond);
1499 if (bond->active_slave) {
1500 if (bond->active_slave->enabled) {
1501 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1502 bond->name, bond->active_slave->name);
1503 } else {
1504 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1505 "remaining %lld ms updelay (since no interface was "
1506 "enabled)", bond->name, bond->active_slave->name,
1507 bond->active_slave->delay_expires - time_msec());
1508 bond_enable_slave(bond->active_slave, true);
1509 }
1510
1511 bond->send_learning_packets = true;
1512 } else if (old_active_slave) {
1513 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1514 }
1515 }
1516
1517 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1518 * bond interface. */
1519 static void
1520 bond_update_fake_slave_stats(struct bond *bond)
1521 {
1522 struct netdev_stats bond_stats;
1523 struct bond_slave *slave;
1524 struct netdev *bond_dev;
1525
1526 memset(&bond_stats, 0, sizeof bond_stats);
1527
1528 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1529 struct netdev_stats slave_stats;
1530
1531 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1532 /* XXX: We swap the stats here because they are swapped back when
1533 * reported by the internal device. The reason for this is
1534 * internal devices normally represent packets going into the
1535 * system but when used as fake bond device they represent packets
1536 * leaving the system. We really should do this in the internal
1537 * device itself because changing it here reverses the counts from
1538 * the perspective of the switch. However, the internal device
1539 * doesn't know what type of device it represents so we have to do
1540 * it here for now. */
1541 bond_stats.tx_packets += slave_stats.rx_packets;
1542 bond_stats.tx_bytes += slave_stats.rx_bytes;
1543 bond_stats.rx_packets += slave_stats.tx_packets;
1544 bond_stats.rx_bytes += slave_stats.tx_bytes;
1545 }
1546 }
1547
1548 if (!netdev_open(bond->name, "system", &bond_dev)) {
1549 netdev_set_stats(bond_dev, &bond_stats);
1550 netdev_close(bond_dev);
1551 }
1552 }