]> git.proxmox.com Git - ovs.git/blob - ofproto/bond.c
Use magic ETH_ADDR_LEN instead of 6 for Ethernet address length.
[ovs.git] / ofproto / bond.c
1 /*
2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "bond.h"
20
21 #include <limits.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <math.h>
25
26 #include "ofp-util.h"
27 #include "ofp-actions.h"
28 #include "ofpbuf.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
32 #include "coverage.h"
33 #include "dynamic-string.h"
34 #include "flow.h"
35 #include "hmap.h"
36 #include "lacp.h"
37 #include "list.h"
38 #include "netdev.h"
39 #include "odp-util.h"
40 #include "ofpbuf.h"
41 #include "packets.h"
42 #include "poll-loop.h"
43 #include "seq.h"
44 #include "match.h"
45 #include "shash.h"
46 #include "timeval.h"
47 #include "unixctl.h"
48 #include "vlog.h"
49
50 VLOG_DEFINE_THIS_MODULE(bond);
51
52 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
53 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
55
56 /* Bit-mask for hashing a flow down to a bucket. */
57 #define BOND_MASK 0xff
58 #define BOND_BUCKETS (BOND_MASK + 1)
59
60 /* A hash bucket for mapping a flow to a slave.
61 * "struct bond" has an array of BOND_BUCKETS of these. */
62 struct bond_entry {
63 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
64 uint64_t tx_bytes /* Count of bytes recently transmitted. */
65 OVS_GUARDED_BY(rwlock);
66 struct list list_node; /* In bond_slave's 'entries' list. */
67
68 /* Recirculation.
69 *
70 * 'pr_rule' is the post-recirculation rule for this entry.
71 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
72 * is used to determine delta (applied to 'tx_bytes' above.) */
73 struct rule *pr_rule;
74 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
75 };
76
77 /* A bond slave, that is, one of the links comprising a bond. */
78 struct bond_slave {
79 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
80 struct list list_node; /* In struct bond's enabled_slaves list. */
81 struct bond *bond; /* The bond that contains this slave. */
82 void *aux; /* Client-provided handle for this slave. */
83
84 struct netdev *netdev; /* Network device, owned by the client. */
85 unsigned int change_seq; /* Tracks changes in 'netdev'. */
86 ofp_port_t ofp_port; /* Open flow port number */
87 char *name; /* Name (a copy of netdev_get_name(netdev)). */
88
89 /* Link status. */
90 long long delay_expires; /* Time after which 'enabled' may change. */
91 bool enabled; /* May be chosen for flows? */
92 bool may_enable; /* Client considers this slave bondable. */
93
94 /* Rebalancing info. Used only by bond_rebalance(). */
95 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
96 struct list entries; /* 'struct bond_entry's assigned here. */
97 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
98 };
99
100 /* A bond, that is, a set of network devices grouped to improve performance or
101 * robustness. */
102 struct bond {
103 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
104 char *name; /* Name provided by client. */
105 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
106
107 /* Slaves. */
108 struct hmap slaves;
109
110 /* Enabled slaves.
111 *
112 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
113 * (To prevent the bond_slave from disappearing they must also hold
114 * 'rwlock'.) */
115 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
116 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
117
118 /* Bonding info. */
119 enum bond_mode balance; /* Balancing mode, one of BM_*. */
120 struct bond_slave *active_slave;
121 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
122 enum lacp_status lacp_status; /* Status of LACP negotiations. */
123 bool bond_revalidate; /* True if flows need revalidation. */
124 uint32_t basis; /* Basis for flow hash function. */
125
126 /* SLB specific bonding info. */
127 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
128 int rebalance_interval; /* Interval between rebalances, in ms. */
129 long long int next_rebalance; /* Next rebalancing time. */
130 bool send_learning_packets;
131 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
132 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
133
134 /* Store active slave to OVSDB. */
135 bool active_slave_changed; /* Set to true whenever the bond changes
136 active slave. It will be reset to false
137 after it is stored into OVSDB */
138
139 /* Interface name may not be persistent across an OS reboot, use
140 * MAC address for identifing the active slave */
141 uint8_t active_slave_mac[ETH_ADDR_LEN];
142 /* The MAC address of the active interface. */
143 /* Legacy compatibility. */
144 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
145
146 struct ovs_refcount ref_cnt;
147 };
148
/* What to do with a 'struct bond_pr_rule_op'. */
enum bond_op {
    /* Add the rule to ofproto's flow table. */
    ADD,

    /* Delete the rule from ofproto's flow table. */
    DEL,
};
154
155 /* A rule to add to or delete from ofproto's internal flow table. */
156 struct bond_pr_rule_op {
157 struct hmap_node hmap_node;
158 struct match match;
159 ofp_port_t out_ofport;
160 enum bond_op op;
161 struct rule **pr_rule;
162 };
163
164 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
165 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
166 OVS_REQ_RDLOCK(rwlock);
167 static void bond_enable_slave(struct bond_slave *, bool enable)
168 OVS_REQ_WRLOCK(rwlock);
169 static void bond_link_status_update(struct bond_slave *)
170 OVS_REQ_WRLOCK(rwlock);
171 static void bond_choose_active_slave(struct bond *)
172 OVS_REQ_WRLOCK(rwlock);
173 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
174 uint16_t vlan, uint32_t basis);
175 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
176 uint32_t basis);
177 static struct bond_entry *lookup_bond_entry(const struct bond *,
178 const struct flow *,
179 uint16_t vlan)
180 OVS_REQ_RDLOCK(rwlock);
181 static struct bond_slave *get_enabled_slave(struct bond *)
182 OVS_REQ_RDLOCK(rwlock);
183 static struct bond_slave *choose_output_slave(const struct bond *,
184 const struct flow *,
185 struct flow_wildcards *,
186 uint16_t vlan)
187 OVS_REQ_RDLOCK(rwlock);
188
189 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
190 * stores the mode in '*balance' and returns true. Otherwise returns false
191 * without modifying '*balance'. */
192 bool
193 bond_mode_from_string(enum bond_mode *balance, const char *s)
194 {
195 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
196 *balance = BM_TCP;
197 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
198 *balance = BM_SLB;
199 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
200 *balance = BM_AB;
201 } else {
202 return false;
203 }
204 return true;
205 }
206
207 /* Returns a string representing 'balance'. */
208 const char *
209 bond_mode_to_string(enum bond_mode balance) {
210 switch (balance) {
211 case BM_TCP:
212 return "balance-tcp";
213 case BM_SLB:
214 return "balance-slb";
215 case BM_AB:
216 return "active-backup";
217 }
218 OVS_NOT_REACHED();
219 }
220
221 \f
222 /* Creates and returns a new bond whose configuration is initially taken from
223 * 's'.
224 *
225 * The caller should register each slave on the new bond by calling
226 * bond_slave_register(). */
227 struct bond *
228 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
229 {
230 struct bond *bond;
231
232 bond = xzalloc(sizeof *bond);
233 bond->ofproto = ofproto;
234 hmap_init(&bond->slaves);
235 list_init(&bond->enabled_slaves);
236 ovs_mutex_init(&bond->mutex);
237 ovs_refcount_init(&bond->ref_cnt);
238
239 bond->recirc_id = 0;
240 hmap_init(&bond->pr_rule_ops);
241
242 bond_reconfigure(bond, s);
243 return bond;
244 }
245
246 struct bond *
247 bond_ref(const struct bond *bond_)
248 {
249 struct bond *bond = CONST_CAST(struct bond *, bond_);
250
251 if (bond) {
252 ovs_refcount_ref(&bond->ref_cnt);
253 }
254 return bond;
255 }
256
257 /* Frees 'bond'. */
258 void
259 bond_unref(struct bond *bond)
260 {
261 struct bond_slave *slave, *next_slave;
262 struct bond_pr_rule_op *pr_op, *next_op;
263
264 if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) {
265 return;
266 }
267
268 ovs_rwlock_wrlock(&rwlock);
269 hmap_remove(all_bonds, &bond->hmap_node);
270 ovs_rwlock_unlock(&rwlock);
271
272 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
273 hmap_remove(&bond->slaves, &slave->hmap_node);
274 /* Client owns 'slave->netdev'. */
275 free(slave->name);
276 free(slave);
277 }
278 hmap_destroy(&bond->slaves);
279
280 ovs_mutex_destroy(&bond->mutex);
281 free(bond->hash);
282 free(bond->name);
283
284 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
285 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
286 free(pr_op);
287 }
288 hmap_destroy(&bond->pr_rule_ops);
289
290 if (bond->recirc_id) {
291 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
292 }
293
294 free(bond);
295 }
296
297 static void
298 add_pr_rule(struct bond *bond, const struct match *match,
299 ofp_port_t out_ofport, struct rule **rule)
300 {
301 uint32_t hash = match_hash(match, 0);
302 struct bond_pr_rule_op *pr_op;
303
304 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
305 if (match_equal(&pr_op->match, match)) {
306 pr_op->op = ADD;
307 pr_op->out_ofport = out_ofport;
308 pr_op->pr_rule = rule;
309 return;
310 }
311 }
312
313 pr_op = xmalloc(sizeof *pr_op);
314 pr_op->match = *match;
315 pr_op->op = ADD;
316 pr_op->out_ofport = out_ofport;
317 pr_op->pr_rule = rule;
318 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
319 }
320
321 static void
322 update_recirc_rules(struct bond *bond)
323 {
324 struct match match;
325 struct bond_pr_rule_op *pr_op, *next_op;
326 uint64_t ofpacts_stub[128 / 8];
327 struct ofpbuf ofpacts;
328 int i;
329
330 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
331
332 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
333 pr_op->op = DEL;
334 }
335
336 if (bond->hash && bond->recirc_id) {
337 for (i = 0; i < BOND_BUCKETS; i++) {
338 struct bond_slave *slave = bond->hash[i].slave;
339
340 if (slave) {
341 match_init_catchall(&match);
342 match_set_recirc_id(&match, bond->recirc_id);
343 match_set_dp_hash_masked(&match, i, BOND_MASK);
344
345 add_pr_rule(bond, &match, slave->ofp_port,
346 &bond->hash[i].pr_rule);
347 }
348 }
349 }
350
351 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
352 int error;
353 switch (pr_op->op) {
354 case ADD:
355 ofpbuf_clear(&ofpacts);
356 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
357 error = ofproto_dpif_add_internal_flow(bond->ofproto,
358 &pr_op->match,
359 RECIRC_RULE_PRIORITY, 0,
360 &ofpacts, pr_op->pr_rule);
361 if (error) {
362 char *err_s = match_to_string(&pr_op->match,
363 RECIRC_RULE_PRIORITY);
364
365 VLOG_ERR("failed to add post recirculation flow %s", err_s);
366 free(err_s);
367 }
368 break;
369
370 case DEL:
371 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
372 &pr_op->match,
373 RECIRC_RULE_PRIORITY);
374 if (error) {
375 char *err_s = match_to_string(&pr_op->match,
376 RECIRC_RULE_PRIORITY);
377
378 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
379 free(err_s);
380 }
381
382 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
383 *pr_op->pr_rule = NULL;
384 free(pr_op);
385 break;
386 }
387 }
388
389 ofpbuf_uninit(&ofpacts);
390 }
391
392
393 /* Updates 'bond''s overall configuration to 's'.
394 *
395 * The caller should register each slave on 'bond' by calling
396 * bond_slave_register(). This is optional if none of the slaves'
397 * configuration has changed. In any case it can't hurt.
398 *
399 * Returns true if the configuration has changed in such a way that requires
400 * flow revalidation.
401 * */
402 bool
403 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
404 {
405 bool revalidate = false;
406
407 ovs_rwlock_wrlock(&rwlock);
408 if (!bond->name || strcmp(bond->name, s->name)) {
409 if (bond->name) {
410 hmap_remove(all_bonds, &bond->hmap_node);
411 free(bond->name);
412 }
413 bond->name = xstrdup(s->name);
414 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
415 }
416
417 bond->updelay = s->up_delay;
418 bond->downdelay = s->down_delay;
419
420 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
421 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
422 revalidate = true;
423 }
424
425 if (bond->rebalance_interval != s->rebalance_interval) {
426 bond->rebalance_interval = s->rebalance_interval;
427 revalidate = true;
428 }
429
430 if (bond->balance != s->balance) {
431 bond->balance = s->balance;
432 revalidate = true;
433 }
434
435 if (bond->basis != s->basis) {
436 bond->basis = s->basis;
437 revalidate = true;
438 }
439
440 if (bond->bond_revalidate) {
441 revalidate = true;
442 bond->bond_revalidate = false;
443 }
444
445 if (bond->balance != BM_AB) {
446 if (!bond->recirc_id) {
447 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
448 }
449 } else if (bond->recirc_id) {
450 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
451 bond->recirc_id = 0;
452 }
453
454 if (bond->balance == BM_AB || !bond->hash || revalidate) {
455 bond_entry_reset(bond);
456 }
457
458 memcpy(bond->active_slave_mac, s->active_slave_mac,
459 sizeof s->active_slave_mac);
460
461 bond->active_slave_changed = false;
462
463 ovs_rwlock_unlock(&rwlock);
464 return revalidate;
465 }
466
467 static struct bond_slave *
468 bond_find_slave_by_mac(const struct bond *bond, const uint8_t mac[ETH_ADDR_LEN])
469 {
470 struct bond_slave *slave;
471
472 /* Find the last active slave */
473 HMAP_FOR_EACH(slave, hmap_node, &bond->slaves) {
474 uint8_t slave_mac[ETH_ADDR_LEN];
475
476 if (netdev_get_etheraddr(slave->netdev, slave_mac)) {
477 continue;
478 }
479
480 if (!memcmp(slave_mac, mac, sizeof(slave_mac))) {
481 return slave;
482 }
483 }
484
485 return NULL;
486 }
487
488 static void
489 bond_active_slave_changed(struct bond *bond)
490 {
491 uint8_t mac[ETH_ADDR_LEN];
492
493 netdev_get_etheraddr(bond->active_slave->netdev, mac);
494 memcpy(bond->active_slave_mac, mac, sizeof bond->active_slave_mac);
495 bond->active_slave_changed = true;
496 seq_change(connectivity_seq_get());
497 }
498
499 static void
500 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
501 OVS_REQ_WRLOCK(rwlock)
502 {
503 if (slave->netdev != netdev) {
504 slave->netdev = netdev;
505 slave->change_seq = 0;
506 }
507 }
508
509 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
510 * arbitrary client-provided pointer that uniquely identifies a slave within a
511 * bond. If 'slave_' already exists within 'bond' then this function
512 * reconfigures the existing slave.
513 *
514 * 'netdev' must be the network device that 'slave_' represents. It is owned
515 * by the client, so the client must not close it before either unregistering
516 * 'slave_' or destroying 'bond'.
517 */
518 void
519 bond_slave_register(struct bond *bond, void *slave_,
520 ofp_port_t ofport, struct netdev *netdev)
521 {
522 struct bond_slave *slave;
523
524 ovs_rwlock_wrlock(&rwlock);
525 slave = bond_slave_lookup(bond, slave_);
526 if (!slave) {
527 slave = xzalloc(sizeof *slave);
528
529 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
530 slave->bond = bond;
531 slave->aux = slave_;
532 slave->ofp_port = ofport;
533 slave->delay_expires = LLONG_MAX;
534 slave->name = xstrdup(netdev_get_name(netdev));
535 bond->bond_revalidate = true;
536
537 slave->enabled = false;
538 bond_enable_slave(slave, netdev_get_carrier(netdev));
539 }
540
541 bond_slave_set_netdev__(slave, netdev);
542
543 free(slave->name);
544 slave->name = xstrdup(netdev_get_name(netdev));
545 ovs_rwlock_unlock(&rwlock);
546 }
547
548 /* Updates the network device to be used with 'slave_' to 'netdev'.
549 *
550 * This is useful if the caller closes and re-opens the network device
551 * registered with bond_slave_register() but doesn't need to change anything
552 * else. */
553 void
554 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
555 {
556 struct bond_slave *slave;
557
558 ovs_rwlock_wrlock(&rwlock);
559 slave = bond_slave_lookup(bond, slave_);
560 if (slave) {
561 bond_slave_set_netdev__(slave, netdev);
562 }
563 ovs_rwlock_unlock(&rwlock);
564 }
565
566 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
567 * then this function has no effect.
568 *
569 * Unregistering a slave invalidates all flows. */
570 void
571 bond_slave_unregister(struct bond *bond, const void *slave_)
572 {
573 struct bond_slave *slave;
574 bool del_active;
575
576 ovs_rwlock_wrlock(&rwlock);
577 slave = bond_slave_lookup(bond, slave_);
578 if (!slave) {
579 goto out;
580 }
581
582 bond->bond_revalidate = true;
583 bond_enable_slave(slave, false);
584
585 del_active = bond->active_slave == slave;
586 if (bond->hash) {
587 struct bond_entry *e;
588 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
589 if (e->slave == slave) {
590 e->slave = NULL;
591 }
592 }
593 }
594
595 free(slave->name);
596
597 hmap_remove(&bond->slaves, &slave->hmap_node);
598 /* Client owns 'slave->netdev'. */
599 free(slave);
600
601 if (del_active) {
602 bond_choose_active_slave(bond);
603 bond->send_learning_packets = true;
604 }
605 out:
606 ovs_rwlock_unlock(&rwlock);
607 }
608
609 /* Should be called on each slave in 'bond' before bond_run() to indicate
610 * whether or not 'slave_' may be enabled. This function is intended to allow
611 * other protocols to have some impact on bonding decisions. For example LACP
612 * or high level link monitoring protocols may decide that a given slave should
613 * not be able to send traffic. */
614 void
615 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
616 {
617 ovs_rwlock_wrlock(&rwlock);
618 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
619 ovs_rwlock_unlock(&rwlock);
620 }
621
622 /* Performs periodic maintenance on 'bond'.
623 *
624 * Returns true if the caller should revalidate its flows.
625 *
626 * The caller should check bond_should_send_learning_packets() afterward. */
627 bool
628 bond_run(struct bond *bond, enum lacp_status lacp_status)
629 {
630 struct bond_slave *slave;
631 bool revalidate;
632
633 ovs_rwlock_wrlock(&rwlock);
634 if (bond->lacp_status != lacp_status) {
635 bond->lacp_status = lacp_status;
636 bond->bond_revalidate = true;
637 }
638
639 /* Enable slaves based on link status and LACP feedback. */
640 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
641 bond_link_status_update(slave);
642 slave->change_seq = seq_read(connectivity_seq_get());
643 }
644 if (!bond->active_slave || !bond->active_slave->enabled) {
645 bond_choose_active_slave(bond);
646 }
647
648 revalidate = bond->bond_revalidate;
649 bond->bond_revalidate = false;
650 ovs_rwlock_unlock(&rwlock);
651
652 return revalidate;
653 }
654
655 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
656 void
657 bond_wait(struct bond *bond)
658 {
659 struct bond_slave *slave;
660
661 ovs_rwlock_rdlock(&rwlock);
662 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
663 if (slave->delay_expires != LLONG_MAX) {
664 poll_timer_wait_until(slave->delay_expires);
665 }
666
667 seq_wait(connectivity_seq_get(), slave->change_seq);
668 }
669
670 if (bond->bond_revalidate) {
671 poll_immediate_wake();
672 }
673 ovs_rwlock_unlock(&rwlock);
674
675 /* We don't wait for bond->next_rebalance because rebalancing can only run
676 * at a flow account checkpoint. ofproto does checkpointing on its own
677 * schedule and bond_rebalance() gets called afterward, so we'd just be
678 * waking up for no purpose. */
679 }
680 \f
681 /* MAC learning table interaction. */
682
683 static bool
684 may_send_learning_packets(const struct bond *bond)
685 {
686 return ((bond->lacp_status == LACP_DISABLED
687 && (bond->balance == BM_SLB || bond->balance == BM_AB))
688 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
689 && bond->active_slave;
690 }
691
692 /* Returns true if 'bond' needs the client to send out packets to assist with
693 * MAC learning on 'bond'. If this function returns true, then the client
694 * should iterate through its MAC learning table for the bridge on which 'bond'
695 * is located. For each MAC that has been learned on a port other than 'bond',
696 * it should call bond_compose_learning_packet().
697 *
698 * This function will only return true if 'bond' is in SLB or active-backup
699 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
700 * necessary.
701 *
702 * Calling this function resets the state that it checks. */
703 bool
704 bond_should_send_learning_packets(struct bond *bond)
705 {
706 bool send;
707
708 ovs_rwlock_wrlock(&rwlock);
709 send = bond->send_learning_packets && may_send_learning_packets(bond);
710 bond->send_learning_packets = false;
711 ovs_rwlock_unlock(&rwlock);
712 return send;
713 }
714
715 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
716 *
717 * See bond_should_send_learning_packets() for description of usage. The
718 * caller should send the composed packet on the port associated with
719 * port_aux and takes ownership of the returned ofpbuf. */
720 struct ofpbuf *
721 bond_compose_learning_packet(struct bond *bond,
722 const uint8_t eth_src[ETH_ADDR_LEN],
723 uint16_t vlan, void **port_aux)
724 {
725 struct bond_slave *slave;
726 struct ofpbuf *packet;
727 struct flow flow;
728
729 ovs_rwlock_rdlock(&rwlock);
730 ovs_assert(may_send_learning_packets(bond));
731 memset(&flow, 0, sizeof flow);
732 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
733 slave = choose_output_slave(bond, &flow, NULL, vlan);
734
735 packet = ofpbuf_new(0);
736 compose_rarp(packet, eth_src);
737 if (vlan) {
738 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
739 }
740
741 *port_aux = slave->aux;
742 ovs_rwlock_unlock(&rwlock);
743 return packet;
744 }
745 \f
746 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
747 * Ethernet destination address of 'eth_dst', should be admitted.
748 *
749 * The return value is one of the following:
750 *
751 * - BV_ACCEPT: Admit the packet.
752 *
753 * - BV_DROP: Drop the packet.
754 *
755 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
756 * Ethernet source address and VLAN. If there is none, or if the packet
757 * is on the learned port, then admit the packet. If a different port has
758 * been learned, however, drop the packet (and do not use it for MAC
759 * learning).
760 */
761 enum bond_verdict
762 bond_check_admissibility(struct bond *bond, const void *slave_,
763 const uint8_t eth_dst[ETH_ADDR_LEN])
764 {
765 enum bond_verdict verdict = BV_DROP;
766 struct bond_slave *slave;
767
768 ovs_rwlock_rdlock(&rwlock);
769 slave = bond_slave_lookup(bond, slave_);
770 if (!slave) {
771 goto out;
772 }
773
774 /* LACP bonds have very loose admissibility restrictions because we can
775 * assume the remote switch is aware of the bond and will "do the right
776 * thing". However, as a precaution we drop packets on disabled slaves
777 * because no correctly implemented partner switch should be sending
778 * packets to them.
779 *
780 * If LACP is configured, but LACP negotiations have been unsuccessful, we
781 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
782 switch (bond->lacp_status) {
783 case LACP_NEGOTIATED:
784 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
785 goto out;
786 case LACP_CONFIGURED:
787 if (!bond->lacp_fallback_ab) {
788 goto out;
789 }
790 case LACP_DISABLED:
791 break;
792 }
793
794 /* Drop all multicast packets on inactive slaves. */
795 if (eth_addr_is_multicast(eth_dst)) {
796 if (bond->active_slave != slave) {
797 goto out;
798 }
799 }
800
801 switch (bond->balance) {
802 case BM_TCP:
803 /* TCP balanced bonds require successful LACP negotiations. Based on the
804 * above check, LACP is off or lacp_fallback_ab is true on this bond.
805 * If lacp_fallback_ab is true fall through to BM_AB case else, we
806 * drop all incoming traffic. */
807 if (!bond->lacp_fallback_ab) {
808 goto out;
809 }
810
811 case BM_AB:
812 /* Drop all packets which arrive on backup slaves. This is similar to
813 * how Linux bonding handles active-backup bonds. */
814 if (bond->active_slave != slave) {
815 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
816
817 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
818 " slave (%s) destined for " ETH_ADDR_FMT,
819 slave->name, ETH_ADDR_ARGS(eth_dst));
820 goto out;
821 }
822 verdict = BV_ACCEPT;
823 goto out;
824
825 case BM_SLB:
826 /* Drop all packets for which we have learned a different input port,
827 * because we probably sent the packet on one slave and got it back on
828 * the other. Gratuitous ARP packets are an exception to this rule:
829 * the host has moved to another switch. The exception to the
830 * exception is if we locked the learning table to avoid reflections on
831 * bond slaves. */
832 verdict = BV_DROP_IF_MOVED;
833 goto out;
834 }
835
836 OVS_NOT_REACHED();
837 out:
838 ovs_rwlock_unlock(&rwlock);
839 return verdict;
840
841 }
842
843 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
844 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
845 * NULL if the packet should be dropped because no slaves are enabled.
846 *
847 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
848 * should be a VID only (i.e. excluding the PCP bits). Second,
849 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
850 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
851 * packet belongs to (so for an access port it will be the access port's VLAN).
852 *
853 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
854 * significant in the selection. At some point earlier, 'wc' should
855 * have been initialized (e.g., by flow_wildcards_init_catchall()).
856 */
857 void *
858 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
859 struct flow_wildcards *wc, uint16_t vlan)
860 {
861 struct bond_slave *slave;
862 void *aux;
863
864 ovs_rwlock_rdlock(&rwlock);
865 slave = choose_output_slave(bond, flow, wc, vlan);
866 aux = slave ? slave->aux : NULL;
867 ovs_rwlock_unlock(&rwlock);
868
869 return aux;
870 }
871 \f
872 /* Recirculation. */
873 static void
874 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
875 OVS_REQ_WRLOCK(rwlock)
876 {
877 if (entry->slave) {
878 uint64_t delta;
879
880 delta = rule_tx_bytes - entry->pr_tx_bytes;
881 entry->tx_bytes += delta;
882 entry->pr_tx_bytes = rule_tx_bytes;
883 }
884 }
885
886 /* Maintain bond stats using post recirculation rule byte counters.*/
887 static void
888 bond_recirculation_account(struct bond *bond)
889 OVS_REQ_WRLOCK(rwlock)
890 {
891 int i;
892
893 for (i=0; i<=BOND_MASK; i++) {
894 struct bond_entry *entry = &bond->hash[i];
895 struct rule *rule = entry->pr_rule;
896
897 if (rule) {
898 uint64_t n_packets OVS_UNUSED;
899 long long int used OVS_UNUSED;
900 uint64_t n_bytes;
901
902 rule->ofproto->ofproto_class->rule_get_stats(
903 rule, &n_packets, &n_bytes, &used);
904 bond_entry_account(entry, n_bytes);
905 }
906 }
907 }
908
909 bool
910 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
911 uint32_t *hash_bias)
912 {
913 if (bond->balance == BM_TCP && bond->recirc_id) {
914 if (recirc_id) {
915 *recirc_id = bond->recirc_id;
916 }
917 if (hash_bias) {
918 *hash_bias = bond->basis;
919 }
920 return true;
921 } else {
922 return false;
923 }
924 }
925
926 void
927 bond_update_post_recirc_rules(struct bond* bond, const bool force)
928 {
929 struct bond_entry *e;
930 bool update_rules = force; /* Always update rules if caller forces it. */
931
932 /* Make sure all bond entries are populated */
933 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
934 if (!e->slave || !e->slave->enabled) {
935 update_rules = true;
936 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
937 struct bond_slave, hmap_node);
938 if (!e->slave->enabled) {
939 e->slave = bond->active_slave;
940 }
941 }
942 }
943
944 if (update_rules) {
945 update_recirc_rules(bond);
946 }
947 }
948 \f
949 /* Rebalancing. */
950
951 static bool
952 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
953 {
954 return bond->rebalance_interval
955 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
956 }
957
958 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
959 void
960 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
961 uint64_t n_bytes)
962 {
963 ovs_rwlock_wrlock(&rwlock);
964 if (bond_is_balanced(bond)) {
965 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
966 }
967 ovs_rwlock_unlock(&rwlock);
968 }
969
970 static struct bond_slave *
971 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
972 {
973 return CONTAINER_OF(bal, struct bond_slave, bal_node);
974 }
975
976 static void
977 log_bals(struct bond *bond, const struct list *bals)
978 OVS_REQ_RDLOCK(rwlock)
979 {
980 if (VLOG_IS_DBG_ENABLED()) {
981 struct ds ds = DS_EMPTY_INITIALIZER;
982 const struct bond_slave *slave;
983
984 LIST_FOR_EACH (slave, bal_node, bals) {
985 if (ds.length) {
986 ds_put_char(&ds, ',');
987 }
988 ds_put_format(&ds, " %s %"PRIu64"kB",
989 slave->name, slave->tx_bytes / 1024);
990
991 if (!slave->enabled) {
992 ds_put_cstr(&ds, " (disabled)");
993 }
994 if (!list_is_empty(&slave->entries)) {
995 struct bond_entry *e;
996
997 ds_put_cstr(&ds, " (");
998 LIST_FOR_EACH (e, list_node, &slave->entries) {
999 if (&e->list_node != list_front(&slave->entries)) {
1000 ds_put_cstr(&ds, " + ");
1001 }
1002 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
1003 e - bond->hash, e->tx_bytes / 1024);
1004 }
1005 ds_put_cstr(&ds, ")");
1006 }
1007 }
1008 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
1009 ds_destroy(&ds);
1010 }
1011 }
1012
1013 /* Shifts 'hash' from its current slave to 'to'. */
1014 static void
1015 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
1016 OVS_REQ_WRLOCK(rwlock)
1017 {
1018 struct bond_slave *from = hash->slave;
1019 struct bond *bond = from->bond;
1020 uint64_t delta = hash->tx_bytes;
1021
1022 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1023 "from %s to %s (now carrying %"PRIu64"kB and "
1024 "%"PRIu64"kB load, respectively)",
1025 bond->name, delta / 1024, hash - bond->hash,
1026 from->name, to->name,
1027 (from->tx_bytes - delta) / 1024,
1028 (to->tx_bytes + delta) / 1024);
1029
1030 /* Shift load away from 'from' to 'to'. */
1031 from->tx_bytes -= delta;
1032 to->tx_bytes += delta;
1033
1034 /* Arrange for flows to be revalidated. */
1035 hash->slave = to;
1036 bond->bond_revalidate = true;
1037 }
1038
1039 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1040 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1041 * given that doing so must decrease the ratio of the load on the two slaves by
1042 * at least 0.1. Returns NULL if there is no appropriate entry.
1043 *
1044 * The list of entries isn't sorted. I don't know of a reason to prefer to
1045 * shift away small hashes or large hashes. */
1046 static struct bond_entry *
1047 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1048 OVS_REQ_WRLOCK(rwlock)
1049 {
1050 struct bond_entry *e;
1051
1052 if (list_is_short(&from->entries)) {
1053 /* 'from' carries no more than one MAC hash, so shifting load away from
1054 * it would be pointless. */
1055 return NULL;
1056 }
1057
1058 LIST_FOR_EACH (e, list_node, &from->entries) {
1059 double old_ratio, new_ratio;
1060 uint64_t delta;
1061
1062 if (to_tx_bytes == 0) {
1063 /* Nothing on the new slave, move it. */
1064 return e;
1065 }
1066
1067 delta = e->tx_bytes;
1068 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1069 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1070 if (old_ratio - new_ratio > 0.1
1071 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1072 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1073 and 'to' slave have the same load. Therefore, we only move an
1074 entry if it decreases the load on 'from', and brings us closer
1075 to equal traffic load. */
1076 return e;
1077 }
1078 }
1079
1080 return NULL;
1081 }
1082
1083 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1084 * maintained. */
1085 static void
1086 insert_bal(struct list *bals, struct bond_slave *slave)
1087 {
1088 struct bond_slave *pos;
1089
1090 LIST_FOR_EACH (pos, bal_node, bals) {
1091 if (slave->tx_bytes > pos->tx_bytes) {
1092 break;
1093 }
1094 }
1095 list_insert(&pos->bal_node, &slave->bal_node);
1096 }
1097
1098 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1099 * that descending order of 'tx_bytes' is maintained. */
1100 static void
1101 reinsert_bal(struct list *bals, struct bond_slave *slave)
1102 {
1103 list_remove(&slave->bal_node);
1104 insert_bal(bals, slave);
1105 }
1106
1107 /* If 'bond' needs rebalancing, does so.
1108 *
1109 * The caller should have called bond_account() for each active flow, or in case
1110 * of recirculation is used, have called bond_recirculation_account(bond),
1111 * to ensure that flow data is consistently accounted at this point.
1112 */
1113 void
1114 bond_rebalance(struct bond *bond)
1115 {
1116 struct bond_slave *slave;
1117 struct bond_entry *e;
1118 struct list bals;
1119 bool rebalanced = false;
1120 bool use_recirc;
1121
1122 ovs_rwlock_wrlock(&rwlock);
1123 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1124 goto done;
1125 }
1126 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1127
1128 use_recirc = ofproto_dpif_get_enable_recirc(bond->ofproto) &&
1129 bond_may_recirc(bond, NULL, NULL);
1130
1131 if (use_recirc) {
1132 bond_recirculation_account(bond);
1133 }
1134
1135 /* Add each bond_entry to its slave's 'entries' list.
1136 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1137 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1138 slave->tx_bytes = 0;
1139 list_init(&slave->entries);
1140 }
1141 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1142 if (e->slave && e->tx_bytes) {
1143 e->slave->tx_bytes += e->tx_bytes;
1144 list_push_back(&e->slave->entries, &e->list_node);
1145 }
1146 }
1147
1148 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1149 *
1150 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1151 * with a proper list sort algorithm. */
1152 list_init(&bals);
1153 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1154 if (slave->enabled) {
1155 insert_bal(&bals, slave);
1156 }
1157 }
1158 log_bals(bond, &bals);
1159
1160 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1161 while (!list_is_short(&bals)) {
1162 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1163 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1164 uint64_t overload;
1165
1166 overload = from->tx_bytes - to->tx_bytes;
1167 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1168 /* The extra load on 'from' (and all less-loaded slaves), compared
1169 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1170 * it is less than ~1Mbps. No point in rebalancing. */
1171 break;
1172 }
1173
1174 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1175 * to move from 'from' to 'to'. */
1176 e = choose_entry_to_migrate(from, to->tx_bytes);
1177 if (e) {
1178 bond_shift_load(e, to);
1179
1180 /* Delete element from from->entries.
1181 *
1182 * We don't add the element to to->hashes. That would only allow
1183 * 'e' to be migrated to another slave in this rebalancing run, and
1184 * there is no point in doing that. */
1185 list_remove(&e->list_node);
1186
1187 /* Re-sort 'bals'. */
1188 reinsert_bal(&bals, from);
1189 reinsert_bal(&bals, to);
1190 rebalanced = true;
1191 } else {
1192 /* Can't usefully migrate anything away from 'from'.
1193 * Don't reconsider it. */
1194 list_remove(&from->bal_node);
1195 }
1196 }
1197
1198 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1199 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1200 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1201 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1202 e->tx_bytes /= 2;
1203 }
1204
1205 if (use_recirc && rebalanced) {
1206 bond_update_post_recirc_rules(bond,true);
1207 }
1208
1209 done:
1210 ovs_rwlock_unlock(&rwlock);
1211 }
1212 \f
1213 /* Bonding unixctl user interface functions. */
1214
1215 static struct bond *
1216 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
1217 {
1218 struct bond *bond;
1219
1220 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1221 all_bonds) {
1222 if (!strcmp(bond->name, name)) {
1223 return bond;
1224 }
1225 }
1226 return NULL;
1227 }
1228
1229 static struct bond_slave *
1230 bond_lookup_slave(struct bond *bond, const char *slave_name)
1231 {
1232 struct bond_slave *slave;
1233
1234 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1235 if (!strcmp(slave->name, slave_name)) {
1236 return slave;
1237 }
1238 }
1239 return NULL;
1240 }
1241
1242 static void
1243 bond_unixctl_list(struct unixctl_conn *conn,
1244 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1245 void *aux OVS_UNUSED)
1246 {
1247 struct ds ds = DS_EMPTY_INITIALIZER;
1248 const struct bond *bond;
1249
1250 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
1251
1252 ovs_rwlock_rdlock(&rwlock);
1253 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1254 const struct bond_slave *slave;
1255 size_t i;
1256
1257 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1258 bond_mode_to_string(bond->balance), bond->recirc_id);
1259
1260 i = 0;
1261 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1262 if (i++ > 0) {
1263 ds_put_cstr(&ds, ", ");
1264 }
1265 ds_put_cstr(&ds, slave->name);
1266 }
1267 ds_put_char(&ds, '\n');
1268 }
1269 ovs_rwlock_unlock(&rwlock);
1270 unixctl_command_reply(conn, ds_cstr(&ds));
1271 ds_destroy(&ds);
1272 }
1273
1274 static void
1275 bond_print_details(struct ds *ds, const struct bond *bond)
1276 OVS_REQ_RDLOCK(rwlock)
1277 {
1278 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1279 const struct shash_node **sorted_slaves = NULL;
1280 const struct bond_slave *slave;
1281 bool may_recirc;
1282 uint32_t recirc_id;
1283 int i;
1284
1285 ds_put_format(ds, "---- %s ----\n", bond->name);
1286 ds_put_format(ds, "bond_mode: %s\n",
1287 bond_mode_to_string(bond->balance));
1288
1289 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1290 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1291 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1292
1293 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1294
1295 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1296 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1297
1298 if (bond_is_balanced(bond)) {
1299 ds_put_format(ds, "next rebalance: %lld ms\n",
1300 bond->next_rebalance - time_msec());
1301 }
1302
1303 ds_put_cstr(ds, "lacp_status: ");
1304 switch (bond->lacp_status) {
1305 case LACP_NEGOTIATED:
1306 ds_put_cstr(ds, "negotiated\n");
1307 break;
1308 case LACP_CONFIGURED:
1309 ds_put_cstr(ds, "configured\n");
1310 break;
1311 case LACP_DISABLED:
1312 ds_put_cstr(ds, "off\n");
1313 break;
1314 default:
1315 ds_put_cstr(ds, "<unknown>\n");
1316 break;
1317 }
1318
1319 ds_put_cstr(ds, "active slave mac: ");
1320 ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_slave_mac));
1321 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1322 ds_put_format(ds,"(%s)\n", slave ? slave->name : "none");
1323
1324 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1325 shash_add(&slave_shash, slave->name, slave);
1326 }
1327 sorted_slaves = shash_sort(&slave_shash);
1328
1329 for (i = 0; i < shash_count(&slave_shash); i++) {
1330 struct bond_entry *be;
1331
1332 slave = sorted_slaves[i]->data;
1333
1334 /* Basic info. */
1335 ds_put_format(ds, "\nslave %s: %s\n",
1336 slave->name, slave->enabled ? "enabled" : "disabled");
1337 if (slave == bond->active_slave) {
1338 ds_put_cstr(ds, "\tactive slave\n");
1339 }
1340 if (slave->delay_expires != LLONG_MAX) {
1341 ds_put_format(ds, "\t%s expires in %lld ms\n",
1342 slave->enabled ? "downdelay" : "updelay",
1343 slave->delay_expires - time_msec());
1344 }
1345
1346 ds_put_format(ds, "\tmay_enable: %s\n",
1347 slave->may_enable ? "true" : "false");
1348
1349 if (!bond_is_balanced(bond)) {
1350 continue;
1351 }
1352
1353 /* Hashes. */
1354 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1355 int hash = be - bond->hash;
1356 uint64_t be_tx_k;
1357
1358 if (be->slave != slave) {
1359 continue;
1360 }
1361
1362 be_tx_k = be->tx_bytes / 1024;
1363 if (be_tx_k) {
1364 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1365 hash, be_tx_k);
1366 }
1367
1368 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1369 }
1370 }
1371 shash_destroy(&slave_shash);
1372 free(sorted_slaves);
1373 ds_put_cstr(ds, "\n");
1374 }
1375
1376 static void
1377 bond_unixctl_show(struct unixctl_conn *conn,
1378 int argc, const char *argv[],
1379 void *aux OVS_UNUSED)
1380 {
1381 struct ds ds = DS_EMPTY_INITIALIZER;
1382
1383 ovs_rwlock_rdlock(&rwlock);
1384 if (argc > 1) {
1385 const struct bond *bond = bond_find(argv[1]);
1386
1387 if (!bond) {
1388 unixctl_command_reply_error(conn, "no such bond");
1389 goto out;
1390 }
1391 bond_print_details(&ds, bond);
1392 } else {
1393 const struct bond *bond;
1394
1395 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1396 bond_print_details(&ds, bond);
1397 }
1398 }
1399
1400 unixctl_command_reply(conn, ds_cstr(&ds));
1401 ds_destroy(&ds);
1402
1403 out:
1404 ovs_rwlock_unlock(&rwlock);
1405 }
1406
1407 static void
1408 bond_unixctl_migrate(struct unixctl_conn *conn,
1409 int argc OVS_UNUSED, const char *argv[],
1410 void *aux OVS_UNUSED)
1411 {
1412 const char *bond_s = argv[1];
1413 const char *hash_s = argv[2];
1414 const char *slave_s = argv[3];
1415 struct bond *bond;
1416 struct bond_slave *slave;
1417 struct bond_entry *entry;
1418 int hash;
1419
1420 ovs_rwlock_wrlock(&rwlock);
1421 bond = bond_find(bond_s);
1422 if (!bond) {
1423 unixctl_command_reply_error(conn, "no such bond");
1424 goto out;
1425 }
1426
1427 if (bond->balance != BM_SLB) {
1428 unixctl_command_reply_error(conn, "not an SLB bond");
1429 goto out;
1430 }
1431
1432 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1433 hash = atoi(hash_s) & BOND_MASK;
1434 } else {
1435 unixctl_command_reply_error(conn, "bad hash");
1436 goto out;
1437 }
1438
1439 slave = bond_lookup_slave(bond, slave_s);
1440 if (!slave) {
1441 unixctl_command_reply_error(conn, "no such slave");
1442 goto out;
1443 }
1444
1445 if (!slave->enabled) {
1446 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1447 goto out;
1448 }
1449
1450 entry = &bond->hash[hash];
1451 bond->bond_revalidate = true;
1452 entry->slave = slave;
1453 unixctl_command_reply(conn, "migrated");
1454
1455 out:
1456 ovs_rwlock_unlock(&rwlock);
1457 }
1458
1459 static void
1460 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1461 int argc OVS_UNUSED, const char *argv[],
1462 void *aux OVS_UNUSED)
1463 {
1464 const char *bond_s = argv[1];
1465 const char *slave_s = argv[2];
1466 struct bond *bond;
1467 struct bond_slave *slave;
1468
1469 ovs_rwlock_wrlock(&rwlock);
1470 bond = bond_find(bond_s);
1471 if (!bond) {
1472 unixctl_command_reply_error(conn, "no such bond");
1473 goto out;
1474 }
1475
1476 slave = bond_lookup_slave(bond, slave_s);
1477 if (!slave) {
1478 unixctl_command_reply_error(conn, "no such slave");
1479 goto out;
1480 }
1481
1482 if (!slave->enabled) {
1483 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1484 goto out;
1485 }
1486
1487 if (bond->active_slave != slave) {
1488 bond->bond_revalidate = true;
1489 bond->active_slave = slave;
1490 VLOG_INFO("bond %s: active interface is now %s",
1491 bond->name, slave->name);
1492 bond->send_learning_packets = true;
1493 unixctl_command_reply(conn, "done");
1494 bond_active_slave_changed(bond);
1495 } else {
1496 unixctl_command_reply(conn, "no change");
1497 }
1498 out:
1499 ovs_rwlock_unlock(&rwlock);
1500 }
1501
1502 static void
1503 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1504 {
1505 const char *bond_s = argv[1];
1506 const char *slave_s = argv[2];
1507 struct bond *bond;
1508 struct bond_slave *slave;
1509
1510 ovs_rwlock_wrlock(&rwlock);
1511 bond = bond_find(bond_s);
1512 if (!bond) {
1513 unixctl_command_reply_error(conn, "no such bond");
1514 goto out;
1515 }
1516
1517 slave = bond_lookup_slave(bond, slave_s);
1518 if (!slave) {
1519 unixctl_command_reply_error(conn, "no such slave");
1520 goto out;
1521 }
1522
1523 bond_enable_slave(slave, enable);
1524 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1525
1526 out:
1527 ovs_rwlock_unlock(&rwlock);
1528 }
1529
1530 static void
1531 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1532 int argc OVS_UNUSED, const char *argv[],
1533 void *aux OVS_UNUSED)
1534 {
1535 enable_slave(conn, argv, true);
1536 }
1537
1538 static void
1539 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1540 int argc OVS_UNUSED, const char *argv[],
1541 void *aux OVS_UNUSED)
1542 {
1543 enable_slave(conn, argv, false);
1544 }
1545
1546 static void
1547 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1548 void *aux OVS_UNUSED)
1549 {
1550 const char *mac_s = argv[1];
1551 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1552 const char *basis_s = argc > 3 ? argv[3] : NULL;
1553 uint8_t mac[ETH_ADDR_LEN];
1554 uint8_t hash;
1555 char *hash_cstr;
1556 unsigned int vlan;
1557 uint32_t basis;
1558
1559 if (vlan_s) {
1560 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1561 unixctl_command_reply_error(conn, "invalid vlan");
1562 return;
1563 }
1564 } else {
1565 vlan = 0;
1566 }
1567
1568 if (basis_s) {
1569 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1570 unixctl_command_reply_error(conn, "invalid basis");
1571 return;
1572 }
1573 } else {
1574 basis = 0;
1575 }
1576
1577 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1578 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1579
1580 hash_cstr = xasprintf("%u", hash);
1581 unixctl_command_reply(conn, hash_cstr);
1582 free(hash_cstr);
1583 } else {
1584 unixctl_command_reply_error(conn, "invalid mac");
1585 }
1586 }
1587
1588 void
1589 bond_init(void)
1590 {
1591 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1592 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1593 NULL);
1594 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1595 bond_unixctl_migrate, NULL);
1596 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1597 bond_unixctl_set_active_slave, NULL);
1598 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1599 bond_unixctl_enable_slave, NULL);
1600 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1601 bond_unixctl_disable_slave, NULL);
1602 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1603 bond_unixctl_hash, NULL);
1604 }
1605 \f
1606 static void
1607 bond_entry_reset(struct bond *bond)
1608 {
1609 if (bond->balance != BM_AB) {
1610 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1611
1612 if (!bond->hash) {
1613 bond->hash = xmalloc(hash_len);
1614 }
1615 memset(bond->hash, 0, hash_len);
1616
1617 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1618 } else {
1619 free(bond->hash);
1620 bond->hash = NULL;
1621 }
1622 }
1623
1624 static struct bond_slave *
1625 bond_slave_lookup(struct bond *bond, const void *slave_)
1626 {
1627 struct bond_slave *slave;
1628
1629 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1630 &bond->slaves) {
1631 if (slave->aux == slave_) {
1632 return slave;
1633 }
1634 }
1635
1636 return NULL;
1637 }
1638
1639 static void
1640 bond_enable_slave(struct bond_slave *slave, bool enable)
1641 {
1642 slave->delay_expires = LLONG_MAX;
1643 if (enable != slave->enabled) {
1644 slave->bond->bond_revalidate = true;
1645 slave->enabled = enable;
1646
1647 ovs_mutex_lock(&slave->bond->mutex);
1648 if (enable) {
1649 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1650 } else {
1651 list_remove(&slave->list_node);
1652 }
1653 ovs_mutex_unlock(&slave->bond->mutex);
1654
1655 VLOG_INFO("interface %s: %s", slave->name,
1656 slave->enabled ? "enabled" : "disabled");
1657 }
1658 }
1659
1660 static void
1661 bond_link_status_update(struct bond_slave *slave)
1662 {
1663 struct bond *bond = slave->bond;
1664 bool up;
1665
1666 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1667 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1668 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1669 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1670 slave->name, up ? "up" : "down");
1671 if (up == slave->enabled) {
1672 slave->delay_expires = LLONG_MAX;
1673 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1674 slave->name, up ? "disabled" : "enabled");
1675 } else {
1676 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1677 : up ? bond->updelay : bond->downdelay);
1678 slave->delay_expires = time_msec() + delay;
1679 if (delay) {
1680 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1681 "for %d ms",
1682 slave->name,
1683 up ? "enabled" : "disabled",
1684 up ? "up" : "down",
1685 delay);
1686 }
1687 }
1688 }
1689
1690 if (time_msec() >= slave->delay_expires) {
1691 bond_enable_slave(slave, up);
1692 }
1693 }
1694
1695 static unsigned int
1696 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1697 {
1698 return hash_mac(mac, vlan, basis);
1699 }
1700
1701 static unsigned int
1702 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1703 {
1704 struct flow hash_flow = *flow;
1705 hash_flow.vlan_tci = htons(vlan);
1706
1707 /* The symmetric quality of this hash function is not required, but
1708 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1709 * purposes, so we use it out of convenience. */
1710 return flow_hash_symmetric_l4(&hash_flow, basis);
1711 }
1712
1713 static unsigned int
1714 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1715 {
1716 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1717
1718 return (bond->balance == BM_TCP
1719 ? bond_hash_tcp(flow, vlan, bond->basis)
1720 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1721 }
1722
1723 static struct bond_entry *
1724 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1725 uint16_t vlan)
1726 {
1727 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1728 }
1729
1730 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1731 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1732 * returns NULL. */
1733 static struct bond_slave *
1734 get_enabled_slave(struct bond *bond)
1735 {
1736 struct list *node;
1737
1738 ovs_mutex_lock(&bond->mutex);
1739 if (list_is_empty(&bond->enabled_slaves)) {
1740 ovs_mutex_unlock(&bond->mutex);
1741 return NULL;
1742 }
1743
1744 node = list_pop_front(&bond->enabled_slaves);
1745 list_push_back(&bond->enabled_slaves, node);
1746 ovs_mutex_unlock(&bond->mutex);
1747
1748 return CONTAINER_OF(node, struct bond_slave, list_node);
1749 }
1750
1751 static struct bond_slave *
1752 choose_output_slave(const struct bond *bond, const struct flow *flow,
1753 struct flow_wildcards *wc, uint16_t vlan)
1754 {
1755 struct bond_entry *e;
1756 int balance;
1757
1758 balance = bond->balance;
1759 if (bond->lacp_status == LACP_CONFIGURED) {
1760 /* LACP has been configured on this bond but negotiations were
1761 * unsuccussful. If lacp_fallback_ab is enabled use active-
1762 * backup mode else drop all traffic. */
1763 if (!bond->lacp_fallback_ab) {
1764 return NULL;
1765 }
1766 balance = BM_AB;
1767 }
1768
1769 switch (balance) {
1770 case BM_AB:
1771 return bond->active_slave;
1772
1773 case BM_TCP:
1774 if (bond->lacp_status != LACP_NEGOTIATED) {
1775 /* Must have LACP negotiations for TCP balanced bonds. */
1776 return NULL;
1777 }
1778 if (wc) {
1779 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1780 }
1781 /* Fall Through. */
1782 case BM_SLB:
1783 if (wc) {
1784 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1785 }
1786 e = lookup_bond_entry(bond, flow, vlan);
1787 if (!e->slave || !e->slave->enabled) {
1788 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
1789 }
1790 return e->slave;
1791
1792 default:
1793 OVS_NOT_REACHED();
1794 }
1795 }
1796
1797 static struct bond_slave *
1798 bond_choose_slave(const struct bond *bond)
1799 {
1800 struct bond_slave *slave, *best;
1801
1802 /* Find the last active slave. */
1803 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1804 if (slave && slave->enabled) {
1805 return slave;
1806 }
1807
1808 /* Find an enabled slave. */
1809 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1810 if (slave->enabled) {
1811 return slave;
1812 }
1813 }
1814
1815 /* All interfaces are disabled. Find an interface that will be enabled
1816 * after its updelay expires. */
1817 best = NULL;
1818 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1819 if (slave->delay_expires != LLONG_MAX
1820 && slave->may_enable
1821 && (!best || slave->delay_expires < best->delay_expires)) {
1822 best = slave;
1823 }
1824 }
1825 return best;
1826 }
1827
1828 static void
1829 bond_choose_active_slave(struct bond *bond)
1830 {
1831 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1832 struct bond_slave *old_active_slave = bond->active_slave;
1833
1834 bond->active_slave = bond_choose_slave(bond);
1835 if (bond->active_slave) {
1836 if (bond->active_slave->enabled) {
1837 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1838 bond->name, bond->active_slave->name);
1839 } else {
1840 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1841 "remaining %lld ms updelay (since no interface was "
1842 "enabled)", bond->name, bond->active_slave->name,
1843 bond->active_slave->delay_expires - time_msec());
1844 bond_enable_slave(bond->active_slave, true);
1845 }
1846
1847 bond->send_learning_packets = true;
1848
1849 if (bond->active_slave != old_active_slave) {
1850 bond_active_slave_changed(bond);
1851 }
1852 } else if (old_active_slave) {
1853 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1854 }
1855 }
1856
1857 /*
1858 * Return true if bond has unstored active slave change.
1859 * If return true, 'mac' will store the bond's current active slave's
1860 * MAC address. */
1861 bool
1862 bond_get_changed_active_slave(const char *name, uint8_t* mac, bool force)
1863 {
1864 struct bond *bond;
1865
1866 ovs_rwlock_wrlock(&rwlock);
1867 bond = bond_find(name);
1868 if (bond) {
1869 if (bond->active_slave_changed || force) {
1870 memcpy(mac, bond->active_slave_mac, ETH_ADDR_LEN);
1871 bond->active_slave_changed = false;
1872 ovs_rwlock_unlock(&rwlock);
1873 return true;
1874 }
1875 }
1876 ovs_rwlock_unlock(&rwlock);
1877
1878 return false;
1879 }