]> git.proxmox.com Git - ovs.git/blob - ofproto/bond.c
odp-util: Share fields between odp and dpif_backer.
[ovs.git] / ofproto / bond.c
1 /*
2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "bond.h"
20
21 #include <limits.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <math.h>
25
26 #include "ofp-util.h"
27 #include "ofp-actions.h"
28 #include "ofpbuf.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "ofproto/ofproto-dpif-rid.h"
32 #include "connectivity.h"
33 #include "coverage.h"
34 #include "dynamic-string.h"
35 #include "flow.h"
36 #include "hmap.h"
37 #include "lacp.h"
38 #include "list.h"
39 #include "netdev.h"
40 #include "odp-util.h"
41 #include "ofpbuf.h"
42 #include "packets.h"
43 #include "dp-packet.h"
44 #include "poll-loop.h"
45 #include "seq.h"
46 #include "match.h"
47 #include "shash.h"
48 #include "timeval.h"
49 #include "unixctl.h"
50 #include "openvswitch/vlog.h"
51
52 VLOG_DEFINE_THIS_MODULE(bond);
53
54 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
55 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
56 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
57
58 /* Bit-mask for hashing a flow down to a bucket. */
59 #define BOND_MASK 0xff
60 #define BOND_BUCKETS (BOND_MASK + 1)
61
62 /* A hash bucket for mapping a flow to a slave.
63 * "struct bond" has an array of BOND_BUCKETS of these. */
64 struct bond_entry {
65 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
66 uint64_t tx_bytes /* Count of bytes recently transmitted. */
67 OVS_GUARDED_BY(rwlock);
68 struct ovs_list list_node; /* In bond_slave's 'entries' list. */
69
70 /* Recirculation.
71 *
72 * 'pr_rule' is the post-recirculation rule for this entry.
73 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
74 * is used to determine delta (applied to 'tx_bytes' above.) */
75 struct rule *pr_rule;
76 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
77 };
78
79 /* A bond slave, that is, one of the links comprising a bond. */
80 struct bond_slave {
81 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
82 struct ovs_list list_node; /* In struct bond's enabled_slaves list. */
83 struct bond *bond; /* The bond that contains this slave. */
84 void *aux; /* Client-provided handle for this slave. */
85
86 struct netdev *netdev; /* Network device, owned by the client. */
87 unsigned int change_seq; /* Tracks changes in 'netdev'. */
88 ofp_port_t ofp_port; /* OpenFlow port number. */
89 char *name; /* Name (a copy of netdev_get_name(netdev)). */
90
91 /* Link status. */
92 long long delay_expires; /* Time after which 'enabled' may change. */
93 bool enabled; /* May be chosen for flows? */
94 bool may_enable; /* Client considers this slave bondable. */
95
96 /* Rebalancing info. Used only by bond_rebalance(). */
97 struct ovs_list bal_node; /* In bond_rebalance()'s 'bals' list. */
98 struct ovs_list entries; /* 'struct bond_entry's assigned here. */
99 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
100 };
101
102 /* A bond, that is, a set of network devices grouped to improve performance or
103 * robustness. */
104 struct bond {
105 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
106 char *name; /* Name provided by client. */
107 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
108
109 /* Slaves. */
110 struct hmap slaves;
111
112 /* Enabled slaves.
113 *
114 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
115 * (To prevent the bond_slave from disappearing they must also hold
116 * 'rwlock'.) */
117 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
118 struct ovs_list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
119
120 /* Bonding info. */
121 enum bond_mode balance; /* Balancing mode, one of BM_*. */
122 struct bond_slave *active_slave;
123 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
124 enum lacp_status lacp_status; /* Status of LACP negotiations. */
125 bool bond_revalidate; /* True if flows need revalidation. */
126 uint32_t basis; /* Basis for flow hash function. */
127
128 /* SLB specific bonding info. */
129 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
130 int rebalance_interval; /* Interval between rebalances, in ms. */
131 long long int next_rebalance; /* Next rebalancing time. */
132 bool send_learning_packets;
133 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
134 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
135
136 /* Store active slave to OVSDB. */
137 bool active_slave_changed; /* Set to true whenever the bond changes
138 active slave. It will be reset to false
139 after it is stored into OVSDB */
140
141 /* Interface name may not be persistent across an OS reboot, use
142 * MAC address for identifing the active slave */
143 uint8_t active_slave_mac[ETH_ADDR_LEN];
144 /* The MAC address of the active interface. */
145 /* Legacy compatibility. */
146 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
147
148 struct ovs_refcount ref_cnt;
149 };
150
/* Action to take for a bond_pr_rule_op. */
enum bond_op {
    ADD,                        /* Install the rule in ofproto's flow table. */
    DEL,                        /* Remove the rule from ofproto's flow table. */
};
156
157 /* A rule to add to or delete from ofproto's internal flow table. */
158 struct bond_pr_rule_op {
159 struct hmap_node hmap_node;
160 struct match match;
161 ofp_port_t out_ofport;
162 enum bond_op op;
163 struct rule **pr_rule;
164 };
165
166 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
167 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
168 OVS_REQ_RDLOCK(rwlock);
169 static void bond_enable_slave(struct bond_slave *, bool enable)
170 OVS_REQ_WRLOCK(rwlock);
171 static void bond_link_status_update(struct bond_slave *)
172 OVS_REQ_WRLOCK(rwlock);
173 static void bond_choose_active_slave(struct bond *)
174 OVS_REQ_WRLOCK(rwlock);
175 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
176 uint16_t vlan, uint32_t basis);
177 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
178 uint32_t basis);
179 static struct bond_entry *lookup_bond_entry(const struct bond *,
180 const struct flow *,
181 uint16_t vlan)
182 OVS_REQ_RDLOCK(rwlock);
183 static struct bond_slave *get_enabled_slave(struct bond *)
184 OVS_REQ_RDLOCK(rwlock);
185 static struct bond_slave *choose_output_slave(const struct bond *,
186 const struct flow *,
187 struct flow_wildcards *,
188 uint16_t vlan)
189 OVS_REQ_RDLOCK(rwlock);
190
191 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
192 * stores the mode in '*balance' and returns true. Otherwise returns false
193 * without modifying '*balance'. */
194 bool
195 bond_mode_from_string(enum bond_mode *balance, const char *s)
196 {
197 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
198 *balance = BM_TCP;
199 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
200 *balance = BM_SLB;
201 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
202 *balance = BM_AB;
203 } else {
204 return false;
205 }
206 return true;
207 }
208
209 /* Returns a string representing 'balance'. */
210 const char *
211 bond_mode_to_string(enum bond_mode balance) {
212 switch (balance) {
213 case BM_TCP:
214 return "balance-tcp";
215 case BM_SLB:
216 return "balance-slb";
217 case BM_AB:
218 return "active-backup";
219 }
220 OVS_NOT_REACHED();
221 }
222
223 \f
224 /* Creates and returns a new bond whose configuration is initially taken from
225 * 's'.
226 *
227 * The caller should register each slave on the new bond by calling
228 * bond_slave_register(). */
229 struct bond *
230 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
231 {
232 struct bond *bond;
233
234 bond = xzalloc(sizeof *bond);
235 bond->ofproto = ofproto;
236 hmap_init(&bond->slaves);
237 list_init(&bond->enabled_slaves);
238 ovs_mutex_init(&bond->mutex);
239 ovs_refcount_init(&bond->ref_cnt);
240
241 bond->recirc_id = 0;
242 hmap_init(&bond->pr_rule_ops);
243
244 bond_reconfigure(bond, s);
245 return bond;
246 }
247
248 struct bond *
249 bond_ref(const struct bond *bond_)
250 {
251 struct bond *bond = CONST_CAST(struct bond *, bond_);
252
253 if (bond) {
254 ovs_refcount_ref(&bond->ref_cnt);
255 }
256 return bond;
257 }
258
259 /* Frees 'bond'. */
260 void
261 bond_unref(struct bond *bond)
262 {
263 struct bond_slave *slave, *next_slave;
264 struct bond_pr_rule_op *pr_op, *next_op;
265
266 if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) {
267 return;
268 }
269
270 ovs_rwlock_wrlock(&rwlock);
271 hmap_remove(all_bonds, &bond->hmap_node);
272 ovs_rwlock_unlock(&rwlock);
273
274 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
275 hmap_remove(&bond->slaves, &slave->hmap_node);
276 /* Client owns 'slave->netdev'. */
277 free(slave->name);
278 free(slave);
279 }
280 hmap_destroy(&bond->slaves);
281
282 ovs_mutex_destroy(&bond->mutex);
283 free(bond->hash);
284 free(bond->name);
285
286 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
287 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
288 free(pr_op);
289 }
290 hmap_destroy(&bond->pr_rule_ops);
291
292 if (bond->recirc_id) {
293 recirc_free_id(bond->recirc_id);
294 }
295
296 free(bond);
297 }
298
299 static void
300 add_pr_rule(struct bond *bond, const struct match *match,
301 ofp_port_t out_ofport, struct rule **rule)
302 {
303 uint32_t hash = match_hash(match, 0);
304 struct bond_pr_rule_op *pr_op;
305
306 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
307 if (match_equal(&pr_op->match, match)) {
308 pr_op->op = ADD;
309 pr_op->out_ofport = out_ofport;
310 pr_op->pr_rule = rule;
311 return;
312 }
313 }
314
315 pr_op = xmalloc(sizeof *pr_op);
316 pr_op->match = *match;
317 pr_op->op = ADD;
318 pr_op->out_ofport = out_ofport;
319 pr_op->pr_rule = rule;
320 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
321 }
322
323 static void
324 update_recirc_rules(struct bond *bond)
325 OVS_REQ_WRLOCK(rwlock)
326 {
327 struct match match;
328 struct bond_pr_rule_op *pr_op, *next_op;
329 uint64_t ofpacts_stub[128 / 8];
330 struct ofpbuf ofpacts;
331 int i;
332
333 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
334
335 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
336 pr_op->op = DEL;
337 }
338
339 if (bond->hash && bond->recirc_id) {
340 for (i = 0; i < BOND_BUCKETS; i++) {
341 struct bond_slave *slave = bond->hash[i].slave;
342
343 if (slave) {
344 match_init_catchall(&match);
345 match_set_recirc_id(&match, bond->recirc_id);
346 match_set_dp_hash_masked(&match, i, BOND_MASK);
347
348 add_pr_rule(bond, &match, slave->ofp_port,
349 &bond->hash[i].pr_rule);
350 }
351 }
352 }
353
354 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
355 int error;
356 switch (pr_op->op) {
357 case ADD:
358 ofpbuf_clear(&ofpacts);
359 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
360 error = ofproto_dpif_add_internal_flow(bond->ofproto,
361 &pr_op->match,
362 RECIRC_RULE_PRIORITY, 0,
363 &ofpacts, pr_op->pr_rule);
364 if (error) {
365 char *err_s = match_to_string(&pr_op->match,
366 RECIRC_RULE_PRIORITY);
367
368 VLOG_ERR("failed to add post recirculation flow %s", err_s);
369 free(err_s);
370 }
371 break;
372
373 case DEL:
374 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
375 &pr_op->match,
376 RECIRC_RULE_PRIORITY);
377 if (error) {
378 char *err_s = match_to_string(&pr_op->match,
379 RECIRC_RULE_PRIORITY);
380
381 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
382 free(err_s);
383 }
384
385 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
386 *pr_op->pr_rule = NULL;
387 free(pr_op);
388 break;
389 }
390 }
391
392 ofpbuf_uninit(&ofpacts);
393 }
394
395
396 /* Updates 'bond''s overall configuration to 's'.
397 *
398 * The caller should register each slave on 'bond' by calling
399 * bond_slave_register(). This is optional if none of the slaves'
400 * configuration has changed. In any case it can't hurt.
401 *
402 * Returns true if the configuration has changed in such a way that requires
403 * flow revalidation.
404 * */
405 bool
406 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
407 {
408 bool revalidate = false;
409
410 ovs_rwlock_wrlock(&rwlock);
411 if (!bond->name || strcmp(bond->name, s->name)) {
412 if (bond->name) {
413 hmap_remove(all_bonds, &bond->hmap_node);
414 free(bond->name);
415 }
416 bond->name = xstrdup(s->name);
417 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
418 }
419
420 bond->updelay = s->up_delay;
421 bond->downdelay = s->down_delay;
422
423 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
424 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
425 revalidate = true;
426 }
427
428 if (bond->rebalance_interval != s->rebalance_interval) {
429 bond->rebalance_interval = s->rebalance_interval;
430 revalidate = true;
431 }
432
433 if (bond->balance != s->balance) {
434 bond->balance = s->balance;
435 revalidate = true;
436 }
437
438 if (bond->basis != s->basis) {
439 bond->basis = s->basis;
440 revalidate = true;
441 }
442
443 if (bond->bond_revalidate) {
444 revalidate = true;
445 bond->bond_revalidate = false;
446 }
447
448 if (bond->balance != BM_AB) {
449 if (!bond->recirc_id) {
450 bond->recirc_id = recirc_alloc_id(bond->ofproto);
451 }
452 } else if (bond->recirc_id) {
453 recirc_free_id(bond->recirc_id);
454 bond->recirc_id = 0;
455 }
456
457 if (bond->balance == BM_AB || !bond->hash || revalidate) {
458 bond_entry_reset(bond);
459 }
460
461 memcpy(bond->active_slave_mac, s->active_slave_mac,
462 sizeof s->active_slave_mac);
463
464 bond->active_slave_changed = false;
465
466 ovs_rwlock_unlock(&rwlock);
467 return revalidate;
468 }
469
470 static struct bond_slave *
471 bond_find_slave_by_mac(const struct bond *bond, const uint8_t mac[ETH_ADDR_LEN])
472 {
473 struct bond_slave *slave;
474
475 /* Find the last active slave */
476 HMAP_FOR_EACH(slave, hmap_node, &bond->slaves) {
477 uint8_t slave_mac[ETH_ADDR_LEN];
478
479 if (netdev_get_etheraddr(slave->netdev, slave_mac)) {
480 continue;
481 }
482
483 if (!memcmp(slave_mac, mac, sizeof(slave_mac))) {
484 return slave;
485 }
486 }
487
488 return NULL;
489 }
490
491 static void
492 bond_active_slave_changed(struct bond *bond)
493 {
494 uint8_t mac[ETH_ADDR_LEN];
495
496 netdev_get_etheraddr(bond->active_slave->netdev, mac);
497 memcpy(bond->active_slave_mac, mac, sizeof bond->active_slave_mac);
498 bond->active_slave_changed = true;
499 seq_change(connectivity_seq_get());
500 }
501
502 static void
503 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
504 OVS_REQ_WRLOCK(rwlock)
505 {
506 if (slave->netdev != netdev) {
507 slave->netdev = netdev;
508 slave->change_seq = 0;
509 }
510 }
511
512 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
513 * arbitrary client-provided pointer that uniquely identifies a slave within a
514 * bond. If 'slave_' already exists within 'bond' then this function
515 * reconfigures the existing slave.
516 *
517 * 'netdev' must be the network device that 'slave_' represents. It is owned
518 * by the client, so the client must not close it before either unregistering
519 * 'slave_' or destroying 'bond'.
520 */
521 void
522 bond_slave_register(struct bond *bond, void *slave_,
523 ofp_port_t ofport, struct netdev *netdev)
524 {
525 struct bond_slave *slave;
526
527 ovs_rwlock_wrlock(&rwlock);
528 slave = bond_slave_lookup(bond, slave_);
529 if (!slave) {
530 slave = xzalloc(sizeof *slave);
531
532 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
533 slave->bond = bond;
534 slave->aux = slave_;
535 slave->ofp_port = ofport;
536 slave->delay_expires = LLONG_MAX;
537 slave->name = xstrdup(netdev_get_name(netdev));
538 bond->bond_revalidate = true;
539
540 slave->enabled = false;
541 bond_enable_slave(slave, netdev_get_carrier(netdev));
542 }
543
544 bond_slave_set_netdev__(slave, netdev);
545
546 free(slave->name);
547 slave->name = xstrdup(netdev_get_name(netdev));
548 ovs_rwlock_unlock(&rwlock);
549 }
550
551 /* Updates the network device to be used with 'slave_' to 'netdev'.
552 *
553 * This is useful if the caller closes and re-opens the network device
554 * registered with bond_slave_register() but doesn't need to change anything
555 * else. */
556 void
557 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
558 {
559 struct bond_slave *slave;
560
561 ovs_rwlock_wrlock(&rwlock);
562 slave = bond_slave_lookup(bond, slave_);
563 if (slave) {
564 bond_slave_set_netdev__(slave, netdev);
565 }
566 ovs_rwlock_unlock(&rwlock);
567 }
568
569 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
570 * then this function has no effect.
571 *
572 * Unregistering a slave invalidates all flows. */
573 void
574 bond_slave_unregister(struct bond *bond, const void *slave_)
575 {
576 struct bond_slave *slave;
577 bool del_active;
578
579 ovs_rwlock_wrlock(&rwlock);
580 slave = bond_slave_lookup(bond, slave_);
581 if (!slave) {
582 goto out;
583 }
584
585 bond->bond_revalidate = true;
586 bond_enable_slave(slave, false);
587
588 del_active = bond->active_slave == slave;
589 if (bond->hash) {
590 struct bond_entry *e;
591 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
592 if (e->slave == slave) {
593 e->slave = NULL;
594 }
595 }
596 }
597
598 free(slave->name);
599
600 hmap_remove(&bond->slaves, &slave->hmap_node);
601 /* Client owns 'slave->netdev'. */
602 free(slave);
603
604 if (del_active) {
605 bond_choose_active_slave(bond);
606 bond->send_learning_packets = true;
607 }
608 out:
609 ovs_rwlock_unlock(&rwlock);
610 }
611
612 /* Should be called on each slave in 'bond' before bond_run() to indicate
613 * whether or not 'slave_' may be enabled. This function is intended to allow
614 * other protocols to have some impact on bonding decisions. For example LACP
615 * or high level link monitoring protocols may decide that a given slave should
616 * not be able to send traffic. */
617 void
618 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
619 {
620 ovs_rwlock_wrlock(&rwlock);
621 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
622 ovs_rwlock_unlock(&rwlock);
623 }
624
625 /* Performs periodic maintenance on 'bond'.
626 *
627 * Returns true if the caller should revalidate its flows.
628 *
629 * The caller should check bond_should_send_learning_packets() afterward. */
630 bool
631 bond_run(struct bond *bond, enum lacp_status lacp_status)
632 {
633 struct bond_slave *slave;
634 bool revalidate;
635
636 ovs_rwlock_wrlock(&rwlock);
637 if (bond->lacp_status != lacp_status) {
638 bond->lacp_status = lacp_status;
639 bond->bond_revalidate = true;
640 }
641
642 /* Enable slaves based on link status and LACP feedback. */
643 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
644 bond_link_status_update(slave);
645 slave->change_seq = seq_read(connectivity_seq_get());
646 }
647 if (!bond->active_slave || !bond->active_slave->enabled) {
648 bond_choose_active_slave(bond);
649 }
650
651 revalidate = bond->bond_revalidate;
652 bond->bond_revalidate = false;
653 ovs_rwlock_unlock(&rwlock);
654
655 return revalidate;
656 }
657
658 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
659 void
660 bond_wait(struct bond *bond)
661 {
662 struct bond_slave *slave;
663
664 ovs_rwlock_rdlock(&rwlock);
665 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
666 if (slave->delay_expires != LLONG_MAX) {
667 poll_timer_wait_until(slave->delay_expires);
668 }
669
670 seq_wait(connectivity_seq_get(), slave->change_seq);
671 }
672
673 if (bond->bond_revalidate) {
674 poll_immediate_wake();
675 }
676 ovs_rwlock_unlock(&rwlock);
677
678 /* We don't wait for bond->next_rebalance because rebalancing can only run
679 * at a flow account checkpoint. ofproto does checkpointing on its own
680 * schedule and bond_rebalance() gets called afterward, so we'd just be
681 * waking up for no purpose. */
682 }
683 \f
684 /* MAC learning table interaction. */
685
686 static bool
687 may_send_learning_packets(const struct bond *bond)
688 {
689 return ((bond->lacp_status == LACP_DISABLED
690 && (bond->balance == BM_SLB || bond->balance == BM_AB))
691 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
692 && bond->active_slave;
693 }
694
695 /* Returns true if 'bond' needs the client to send out packets to assist with
696 * MAC learning on 'bond'. If this function returns true, then the client
697 * should iterate through its MAC learning table for the bridge on which 'bond'
698 * is located. For each MAC that has been learned on a port other than 'bond',
699 * it should call bond_compose_learning_packet().
700 *
701 * This function will only return true if 'bond' is in SLB or active-backup
702 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
703 * necessary.
704 *
705 * Calling this function resets the state that it checks. */
706 bool
707 bond_should_send_learning_packets(struct bond *bond)
708 {
709 bool send;
710
711 ovs_rwlock_wrlock(&rwlock);
712 send = bond->send_learning_packets && may_send_learning_packets(bond);
713 bond->send_learning_packets = false;
714 ovs_rwlock_unlock(&rwlock);
715 return send;
716 }
717
718 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
719 *
720 * See bond_should_send_learning_packets() for description of usage. The
721 * caller should send the composed packet on the port associated with
722 * port_aux and takes ownership of the returned ofpbuf. */
723 struct dp_packet *
724 bond_compose_learning_packet(struct bond *bond,
725 const uint8_t eth_src[ETH_ADDR_LEN],
726 uint16_t vlan, void **port_aux)
727 {
728 struct bond_slave *slave;
729 struct dp_packet *packet;
730 struct flow flow;
731
732 ovs_rwlock_rdlock(&rwlock);
733 ovs_assert(may_send_learning_packets(bond));
734 memset(&flow, 0, sizeof flow);
735 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
736 slave = choose_output_slave(bond, &flow, NULL, vlan);
737
738 packet = dp_packet_new(0);
739 compose_rarp(packet, eth_src);
740 if (vlan) {
741 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
742 }
743
744 *port_aux = slave->aux;
745 ovs_rwlock_unlock(&rwlock);
746 return packet;
747 }
748 \f
749 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
750 * Ethernet destination address of 'eth_dst', should be admitted.
751 *
752 * The return value is one of the following:
753 *
754 * - BV_ACCEPT: Admit the packet.
755 *
756 * - BV_DROP: Drop the packet.
757 *
758 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
759 * Ethernet source address and VLAN. If there is none, or if the packet
760 * is on the learned port, then admit the packet. If a different port has
761 * been learned, however, drop the packet (and do not use it for MAC
762 * learning).
763 */
764 enum bond_verdict
765 bond_check_admissibility(struct bond *bond, const void *slave_,
766 const uint8_t eth_dst[ETH_ADDR_LEN])
767 {
768 enum bond_verdict verdict = BV_DROP;
769 struct bond_slave *slave;
770
771 ovs_rwlock_rdlock(&rwlock);
772 slave = bond_slave_lookup(bond, slave_);
773 if (!slave) {
774 goto out;
775 }
776
777 /* LACP bonds have very loose admissibility restrictions because we can
778 * assume the remote switch is aware of the bond and will "do the right
779 * thing". However, as a precaution we drop packets on disabled slaves
780 * because no correctly implemented partner switch should be sending
781 * packets to them.
782 *
783 * If LACP is configured, but LACP negotiations have been unsuccessful, we
784 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
785 switch (bond->lacp_status) {
786 case LACP_NEGOTIATED:
787 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
788 goto out;
789 case LACP_CONFIGURED:
790 if (!bond->lacp_fallback_ab) {
791 goto out;
792 }
793 case LACP_DISABLED:
794 break;
795 }
796
797 /* Drop all multicast packets on inactive slaves. */
798 if (eth_addr_is_multicast(eth_dst)) {
799 if (bond->active_slave != slave) {
800 goto out;
801 }
802 }
803
804 switch (bond->balance) {
805 case BM_TCP:
806 /* TCP balanced bonds require successful LACP negotiations. Based on the
807 * above check, LACP is off or lacp_fallback_ab is true on this bond.
808 * If lacp_fallback_ab is true fall through to BM_AB case else, we
809 * drop all incoming traffic. */
810 if (!bond->lacp_fallback_ab) {
811 goto out;
812 }
813
814 case BM_AB:
815 /* Drop all packets which arrive on backup slaves. This is similar to
816 * how Linux bonding handles active-backup bonds. */
817 if (bond->active_slave != slave) {
818 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
819
820 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
821 " slave (%s) destined for " ETH_ADDR_FMT,
822 slave->name, ETH_ADDR_ARGS(eth_dst));
823 goto out;
824 }
825 verdict = BV_ACCEPT;
826 goto out;
827
828 case BM_SLB:
829 /* Drop all packets for which we have learned a different input port,
830 * because we probably sent the packet on one slave and got it back on
831 * the other. Gratuitous ARP packets are an exception to this rule:
832 * the host has moved to another switch. The exception to the
833 * exception is if we locked the learning table to avoid reflections on
834 * bond slaves. */
835 verdict = BV_DROP_IF_MOVED;
836 goto out;
837 }
838
839 OVS_NOT_REACHED();
840 out:
841 ovs_rwlock_unlock(&rwlock);
842 return verdict;
843
844 }
845
846 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
847 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
848 * NULL if the packet should be dropped because no slaves are enabled.
849 *
850 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
851 * should be a VID only (i.e. excluding the PCP bits). Second,
852 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
853 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
854 * packet belongs to (so for an access port it will be the access port's VLAN).
855 *
856 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
857 * significant in the selection. At some point earlier, 'wc' should
858 * have been initialized (e.g., by flow_wildcards_init_catchall()).
859 */
860 void *
861 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
862 struct flow_wildcards *wc, uint16_t vlan)
863 {
864 struct bond_slave *slave;
865 void *aux;
866
867 ovs_rwlock_rdlock(&rwlock);
868 slave = choose_output_slave(bond, flow, wc, vlan);
869 aux = slave ? slave->aux : NULL;
870 ovs_rwlock_unlock(&rwlock);
871
872 return aux;
873 }
874 \f
875 /* Recirculation. */
876 static void
877 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
878 OVS_REQ_WRLOCK(rwlock)
879 {
880 if (entry->slave) {
881 uint64_t delta;
882
883 delta = rule_tx_bytes - entry->pr_tx_bytes;
884 entry->tx_bytes += delta;
885 entry->pr_tx_bytes = rule_tx_bytes;
886 }
887 }
888
889 /* Maintain bond stats using post recirculation rule byte counters.*/
890 static void
891 bond_recirculation_account(struct bond *bond)
892 OVS_REQ_WRLOCK(rwlock)
893 {
894 int i;
895
896 for (i=0; i<=BOND_MASK; i++) {
897 struct bond_entry *entry = &bond->hash[i];
898 struct rule *rule = entry->pr_rule;
899
900 if (rule) {
901 uint64_t n_packets OVS_UNUSED;
902 long long int used OVS_UNUSED;
903 uint64_t n_bytes;
904
905 rule->ofproto->ofproto_class->rule_get_stats(
906 rule, &n_packets, &n_bytes, &used);
907 bond_entry_account(entry, n_bytes);
908 }
909 }
910 }
911
912 bool
913 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
914 uint32_t *hash_bias)
915 {
916 if (bond->balance == BM_TCP && bond->recirc_id) {
917 if (recirc_id) {
918 *recirc_id = bond->recirc_id;
919 }
920 if (hash_bias) {
921 *hash_bias = bond->basis;
922 }
923 return true;
924 } else {
925 return false;
926 }
927 }
928
929 static void
930 bond_update_post_recirc_rules__(struct bond* bond, const bool force)
931 OVS_REQ_WRLOCK(rwlock)
932 {
933 struct bond_entry *e;
934 bool update_rules = force; /* Always update rules if caller forces it. */
935
936 /* Make sure all bond entries are populated */
937 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
938 if (!e->slave || !e->slave->enabled) {
939 update_rules = true;
940 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
941 struct bond_slave, hmap_node);
942 if (!e->slave->enabled) {
943 e->slave = bond->active_slave;
944 }
945 }
946 }
947
948 if (update_rules) {
949 update_recirc_rules(bond);
950 }
951 }
952
953 void
954 bond_update_post_recirc_rules(struct bond* bond, const bool force)
955 {
956 ovs_rwlock_wrlock(&rwlock);
957 bond_update_post_recirc_rules__(bond, force);
958 ovs_rwlock_unlock(&rwlock);
959 }
960 \f
961 /* Rebalancing. */
962
963 static bool
964 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
965 {
966 return bond->rebalance_interval
967 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
968 }
969
970 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
971 void
972 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
973 uint64_t n_bytes)
974 {
975 ovs_rwlock_wrlock(&rwlock);
976 if (bond_is_balanced(bond)) {
977 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
978 }
979 ovs_rwlock_unlock(&rwlock);
980 }
981
982 static struct bond_slave *
983 bond_slave_from_bal_node(struct ovs_list *bal) OVS_REQ_RDLOCK(rwlock)
984 {
985 return CONTAINER_OF(bal, struct bond_slave, bal_node);
986 }
987
988 static void
989 log_bals(struct bond *bond, const struct ovs_list *bals)
990 OVS_REQ_RDLOCK(rwlock)
991 {
992 if (VLOG_IS_DBG_ENABLED()) {
993 struct ds ds = DS_EMPTY_INITIALIZER;
994 const struct bond_slave *slave;
995
996 LIST_FOR_EACH (slave, bal_node, bals) {
997 if (ds.length) {
998 ds_put_char(&ds, ',');
999 }
1000 ds_put_format(&ds, " %s %"PRIu64"kB",
1001 slave->name, slave->tx_bytes / 1024);
1002
1003 if (!slave->enabled) {
1004 ds_put_cstr(&ds, " (disabled)");
1005 }
1006 if (!list_is_empty(&slave->entries)) {
1007 struct bond_entry *e;
1008
1009 ds_put_cstr(&ds, " (");
1010 LIST_FOR_EACH (e, list_node, &slave->entries) {
1011 if (&e->list_node != list_front(&slave->entries)) {
1012 ds_put_cstr(&ds, " + ");
1013 }
1014 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
1015 e - bond->hash, e->tx_bytes / 1024);
1016 }
1017 ds_put_cstr(&ds, ")");
1018 }
1019 }
1020 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
1021 ds_destroy(&ds);
1022 }
1023 }
1024
1025 /* Shifts 'hash' from its current slave to 'to'. */
1026 static void
1027 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
1028 OVS_REQ_WRLOCK(rwlock)
1029 {
1030 struct bond_slave *from = hash->slave;
1031 struct bond *bond = from->bond;
1032 uint64_t delta = hash->tx_bytes;
1033
1034 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1035 "from %s to %s (now carrying %"PRIu64"kB and "
1036 "%"PRIu64"kB load, respectively)",
1037 bond->name, delta / 1024, hash - bond->hash,
1038 from->name, to->name,
1039 (from->tx_bytes - delta) / 1024,
1040 (to->tx_bytes + delta) / 1024);
1041
1042 /* Shift load away from 'from' to 'to'. */
1043 from->tx_bytes -= delta;
1044 to->tx_bytes += delta;
1045
1046 /* Arrange for flows to be revalidated. */
1047 hash->slave = to;
1048 bond->bond_revalidate = true;
1049 }
1050
1051 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1052 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1053 * given that doing so must decrease the ratio of the load on the two slaves by
1054 * at least 0.1. Returns NULL if there is no appropriate entry.
1055 *
1056 * The list of entries isn't sorted. I don't know of a reason to prefer to
1057 * shift away small hashes or large hashes. */
1058 static struct bond_entry *
1059 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1060 OVS_REQ_WRLOCK(rwlock)
1061 {
1062 struct bond_entry *e;
1063
1064 if (list_is_short(&from->entries)) {
1065 /* 'from' carries no more than one MAC hash, so shifting load away from
1066 * it would be pointless. */
1067 return NULL;
1068 }
1069
1070 LIST_FOR_EACH (e, list_node, &from->entries) {
1071 double old_ratio, new_ratio;
1072 uint64_t delta;
1073
1074 if (to_tx_bytes == 0) {
1075 /* Nothing on the new slave, move it. */
1076 return e;
1077 }
1078
1079 delta = e->tx_bytes;
1080 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1081 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1082 if (old_ratio - new_ratio > 0.1
1083 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1084 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1085 and 'to' slave have the same load. Therefore, we only move an
1086 entry if it decreases the load on 'from', and brings us closer
1087 to equal traffic load. */
1088 return e;
1089 }
1090 }
1091
1092 return NULL;
1093 }
1094
1095 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1096 * maintained. */
1097 static void
1098 insert_bal(struct ovs_list *bals, struct bond_slave *slave)
1099 {
1100 struct bond_slave *pos;
1101
1102 LIST_FOR_EACH (pos, bal_node, bals) {
1103 if (slave->tx_bytes > pos->tx_bytes) {
1104 break;
1105 }
1106 }
1107 list_insert(&pos->bal_node, &slave->bal_node);
1108 }
1109
1110 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1111 * that descending order of 'tx_bytes' is maintained. */
1112 static void
1113 reinsert_bal(struct ovs_list *bals, struct bond_slave *slave)
1114 {
1115 list_remove(&slave->bal_node);
1116 insert_bal(bals, slave);
1117 }
1118
1119 /* If 'bond' needs rebalancing, does so.
1120 *
1121 * The caller should have called bond_account() for each active flow, or in case
1122 * of recirculation is used, have called bond_recirculation_account(bond),
1123 * to ensure that flow data is consistently accounted at this point.
1124 */
1125 void
1126 bond_rebalance(struct bond *bond)
1127 {
1128 struct bond_slave *slave;
1129 struct bond_entry *e;
1130 struct ovs_list bals;
1131 bool rebalanced = false;
1132 bool use_recirc;
1133
1134 ovs_rwlock_wrlock(&rwlock);
1135 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1136 goto done;
1137 }
1138 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1139
1140 use_recirc = ofproto_dpif_get_support(bond->ofproto)->odp.recirc &&
1141 bond_may_recirc(bond, NULL, NULL);
1142
1143 if (use_recirc) {
1144 bond_recirculation_account(bond);
1145 }
1146
1147 /* Add each bond_entry to its slave's 'entries' list.
1148 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1149 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1150 slave->tx_bytes = 0;
1151 list_init(&slave->entries);
1152 }
1153 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1154 if (e->slave && e->tx_bytes) {
1155 e->slave->tx_bytes += e->tx_bytes;
1156 list_push_back(&e->slave->entries, &e->list_node);
1157 }
1158 }
1159
1160 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1161 *
1162 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1163 * with a proper list sort algorithm. */
1164 list_init(&bals);
1165 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1166 if (slave->enabled) {
1167 insert_bal(&bals, slave);
1168 }
1169 }
1170 log_bals(bond, &bals);
1171
1172 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1173 while (!list_is_short(&bals)) {
1174 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1175 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1176 uint64_t overload;
1177
1178 overload = from->tx_bytes - to->tx_bytes;
1179 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1180 /* The extra load on 'from' (and all less-loaded slaves), compared
1181 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1182 * it is less than ~1Mbps. No point in rebalancing. */
1183 break;
1184 }
1185
1186 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1187 * to move from 'from' to 'to'. */
1188 e = choose_entry_to_migrate(from, to->tx_bytes);
1189 if (e) {
1190 bond_shift_load(e, to);
1191
1192 /* Delete element from from->entries.
1193 *
1194 * We don't add the element to to->hashes. That would only allow
1195 * 'e' to be migrated to another slave in this rebalancing run, and
1196 * there is no point in doing that. */
1197 list_remove(&e->list_node);
1198
1199 /* Re-sort 'bals'. */
1200 reinsert_bal(&bals, from);
1201 reinsert_bal(&bals, to);
1202 rebalanced = true;
1203 } else {
1204 /* Can't usefully migrate anything away from 'from'.
1205 * Don't reconsider it. */
1206 list_remove(&from->bal_node);
1207 }
1208 }
1209
1210 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1211 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1212 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1213 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1214 e->tx_bytes /= 2;
1215 }
1216
1217 if (use_recirc && rebalanced) {
1218 bond_update_post_recirc_rules__(bond,true);
1219 }
1220
1221 done:
1222 ovs_rwlock_unlock(&rwlock);
1223 }
1224 \f
1225 /* Bonding unixctl user interface functions. */
1226
1227 static struct bond *
1228 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
1229 {
1230 struct bond *bond;
1231
1232 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1233 all_bonds) {
1234 if (!strcmp(bond->name, name)) {
1235 return bond;
1236 }
1237 }
1238 return NULL;
1239 }
1240
1241 static struct bond_slave *
1242 bond_lookup_slave(struct bond *bond, const char *slave_name)
1243 {
1244 struct bond_slave *slave;
1245
1246 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1247 if (!strcmp(slave->name, slave_name)) {
1248 return slave;
1249 }
1250 }
1251 return NULL;
1252 }
1253
1254 static void
1255 bond_unixctl_list(struct unixctl_conn *conn,
1256 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1257 void *aux OVS_UNUSED)
1258 {
1259 struct ds ds = DS_EMPTY_INITIALIZER;
1260 const struct bond *bond;
1261
1262 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
1263
1264 ovs_rwlock_rdlock(&rwlock);
1265 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1266 const struct bond_slave *slave;
1267 size_t i;
1268
1269 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1270 bond_mode_to_string(bond->balance), bond->recirc_id);
1271
1272 i = 0;
1273 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1274 if (i++ > 0) {
1275 ds_put_cstr(&ds, ", ");
1276 }
1277 ds_put_cstr(&ds, slave->name);
1278 }
1279 ds_put_char(&ds, '\n');
1280 }
1281 ovs_rwlock_unlock(&rwlock);
1282 unixctl_command_reply(conn, ds_cstr(&ds));
1283 ds_destroy(&ds);
1284 }
1285
1286 static void
1287 bond_print_details(struct ds *ds, const struct bond *bond)
1288 OVS_REQ_RDLOCK(rwlock)
1289 {
1290 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1291 const struct shash_node **sorted_slaves = NULL;
1292 const struct bond_slave *slave;
1293 bool may_recirc;
1294 uint32_t recirc_id;
1295 int i;
1296
1297 ds_put_format(ds, "---- %s ----\n", bond->name);
1298 ds_put_format(ds, "bond_mode: %s\n",
1299 bond_mode_to_string(bond->balance));
1300
1301 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1302 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1303 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1304
1305 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1306
1307 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1308 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1309
1310 if (bond_is_balanced(bond)) {
1311 ds_put_format(ds, "next rebalance: %lld ms\n",
1312 bond->next_rebalance - time_msec());
1313 }
1314
1315 ds_put_cstr(ds, "lacp_status: ");
1316 switch (bond->lacp_status) {
1317 case LACP_NEGOTIATED:
1318 ds_put_cstr(ds, "negotiated\n");
1319 break;
1320 case LACP_CONFIGURED:
1321 ds_put_cstr(ds, "configured\n");
1322 break;
1323 case LACP_DISABLED:
1324 ds_put_cstr(ds, "off\n");
1325 break;
1326 default:
1327 ds_put_cstr(ds, "<unknown>\n");
1328 break;
1329 }
1330
1331 ds_put_cstr(ds, "active slave mac: ");
1332 ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_slave_mac));
1333 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1334 ds_put_format(ds,"(%s)\n", slave ? slave->name : "none");
1335
1336 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1337 shash_add(&slave_shash, slave->name, slave);
1338 }
1339 sorted_slaves = shash_sort(&slave_shash);
1340
1341 for (i = 0; i < shash_count(&slave_shash); i++) {
1342 struct bond_entry *be;
1343
1344 slave = sorted_slaves[i]->data;
1345
1346 /* Basic info. */
1347 ds_put_format(ds, "\nslave %s: %s\n",
1348 slave->name, slave->enabled ? "enabled" : "disabled");
1349 if (slave == bond->active_slave) {
1350 ds_put_cstr(ds, "\tactive slave\n");
1351 }
1352 if (slave->delay_expires != LLONG_MAX) {
1353 ds_put_format(ds, "\t%s expires in %lld ms\n",
1354 slave->enabled ? "downdelay" : "updelay",
1355 slave->delay_expires - time_msec());
1356 }
1357
1358 ds_put_format(ds, "\tmay_enable: %s\n",
1359 slave->may_enable ? "true" : "false");
1360
1361 if (!bond_is_balanced(bond)) {
1362 continue;
1363 }
1364
1365 /* Hashes. */
1366 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1367 int hash = be - bond->hash;
1368 uint64_t be_tx_k;
1369
1370 if (be->slave != slave) {
1371 continue;
1372 }
1373
1374 be_tx_k = be->tx_bytes / 1024;
1375 if (be_tx_k) {
1376 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1377 hash, be_tx_k);
1378 }
1379
1380 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1381 }
1382 }
1383 shash_destroy(&slave_shash);
1384 free(sorted_slaves);
1385 ds_put_cstr(ds, "\n");
1386 }
1387
1388 static void
1389 bond_unixctl_show(struct unixctl_conn *conn,
1390 int argc, const char *argv[],
1391 void *aux OVS_UNUSED)
1392 {
1393 struct ds ds = DS_EMPTY_INITIALIZER;
1394
1395 ovs_rwlock_rdlock(&rwlock);
1396 if (argc > 1) {
1397 const struct bond *bond = bond_find(argv[1]);
1398
1399 if (!bond) {
1400 unixctl_command_reply_error(conn, "no such bond");
1401 goto out;
1402 }
1403 bond_print_details(&ds, bond);
1404 } else {
1405 const struct bond *bond;
1406
1407 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1408 bond_print_details(&ds, bond);
1409 }
1410 }
1411
1412 unixctl_command_reply(conn, ds_cstr(&ds));
1413 ds_destroy(&ds);
1414
1415 out:
1416 ovs_rwlock_unlock(&rwlock);
1417 }
1418
1419 static void
1420 bond_unixctl_migrate(struct unixctl_conn *conn,
1421 int argc OVS_UNUSED, const char *argv[],
1422 void *aux OVS_UNUSED)
1423 {
1424 const char *bond_s = argv[1];
1425 const char *hash_s = argv[2];
1426 const char *slave_s = argv[3];
1427 struct bond *bond;
1428 struct bond_slave *slave;
1429 struct bond_entry *entry;
1430 int hash;
1431
1432 ovs_rwlock_wrlock(&rwlock);
1433 bond = bond_find(bond_s);
1434 if (!bond) {
1435 unixctl_command_reply_error(conn, "no such bond");
1436 goto out;
1437 }
1438
1439 if (bond->balance != BM_SLB) {
1440 unixctl_command_reply_error(conn, "not an SLB bond");
1441 goto out;
1442 }
1443
1444 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1445 hash = atoi(hash_s) & BOND_MASK;
1446 } else {
1447 unixctl_command_reply_error(conn, "bad hash");
1448 goto out;
1449 }
1450
1451 slave = bond_lookup_slave(bond, slave_s);
1452 if (!slave) {
1453 unixctl_command_reply_error(conn, "no such slave");
1454 goto out;
1455 }
1456
1457 if (!slave->enabled) {
1458 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1459 goto out;
1460 }
1461
1462 entry = &bond->hash[hash];
1463 bond->bond_revalidate = true;
1464 entry->slave = slave;
1465 unixctl_command_reply(conn, "migrated");
1466
1467 out:
1468 ovs_rwlock_unlock(&rwlock);
1469 }
1470
1471 static void
1472 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1473 int argc OVS_UNUSED, const char *argv[],
1474 void *aux OVS_UNUSED)
1475 {
1476 const char *bond_s = argv[1];
1477 const char *slave_s = argv[2];
1478 struct bond *bond;
1479 struct bond_slave *slave;
1480
1481 ovs_rwlock_wrlock(&rwlock);
1482 bond = bond_find(bond_s);
1483 if (!bond) {
1484 unixctl_command_reply_error(conn, "no such bond");
1485 goto out;
1486 }
1487
1488 slave = bond_lookup_slave(bond, slave_s);
1489 if (!slave) {
1490 unixctl_command_reply_error(conn, "no such slave");
1491 goto out;
1492 }
1493
1494 if (!slave->enabled) {
1495 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1496 goto out;
1497 }
1498
1499 if (bond->active_slave != slave) {
1500 bond->bond_revalidate = true;
1501 bond->active_slave = slave;
1502 VLOG_INFO("bond %s: active interface is now %s",
1503 bond->name, slave->name);
1504 bond->send_learning_packets = true;
1505 unixctl_command_reply(conn, "done");
1506 bond_active_slave_changed(bond);
1507 } else {
1508 unixctl_command_reply(conn, "no change");
1509 }
1510 out:
1511 ovs_rwlock_unlock(&rwlock);
1512 }
1513
1514 static void
1515 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1516 {
1517 const char *bond_s = argv[1];
1518 const char *slave_s = argv[2];
1519 struct bond *bond;
1520 struct bond_slave *slave;
1521
1522 ovs_rwlock_wrlock(&rwlock);
1523 bond = bond_find(bond_s);
1524 if (!bond) {
1525 unixctl_command_reply_error(conn, "no such bond");
1526 goto out;
1527 }
1528
1529 slave = bond_lookup_slave(bond, slave_s);
1530 if (!slave) {
1531 unixctl_command_reply_error(conn, "no such slave");
1532 goto out;
1533 }
1534
1535 bond_enable_slave(slave, enable);
1536 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1537
1538 out:
1539 ovs_rwlock_unlock(&rwlock);
1540 }
1541
1542 static void
1543 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1544 int argc OVS_UNUSED, const char *argv[],
1545 void *aux OVS_UNUSED)
1546 {
1547 enable_slave(conn, argv, true);
1548 }
1549
1550 static void
1551 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1552 int argc OVS_UNUSED, const char *argv[],
1553 void *aux OVS_UNUSED)
1554 {
1555 enable_slave(conn, argv, false);
1556 }
1557
1558 static void
1559 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1560 void *aux OVS_UNUSED)
1561 {
1562 const char *mac_s = argv[1];
1563 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1564 const char *basis_s = argc > 3 ? argv[3] : NULL;
1565 uint8_t mac[ETH_ADDR_LEN];
1566 uint8_t hash;
1567 char *hash_cstr;
1568 unsigned int vlan;
1569 uint32_t basis;
1570
1571 if (vlan_s) {
1572 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1573 unixctl_command_reply_error(conn, "invalid vlan");
1574 return;
1575 }
1576 } else {
1577 vlan = 0;
1578 }
1579
1580 if (basis_s) {
1581 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1582 unixctl_command_reply_error(conn, "invalid basis");
1583 return;
1584 }
1585 } else {
1586 basis = 0;
1587 }
1588
1589 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1590 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1591
1592 hash_cstr = xasprintf("%u", hash);
1593 unixctl_command_reply(conn, hash_cstr);
1594 free(hash_cstr);
1595 } else {
1596 unixctl_command_reply_error(conn, "invalid mac");
1597 }
1598 }
1599
1600 void
1601 bond_init(void)
1602 {
1603 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1604 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1605 NULL);
1606 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1607 bond_unixctl_migrate, NULL);
1608 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1609 bond_unixctl_set_active_slave, NULL);
1610 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1611 bond_unixctl_enable_slave, NULL);
1612 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1613 bond_unixctl_disable_slave, NULL);
1614 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1615 bond_unixctl_hash, NULL);
1616 }
1617 \f
1618 static void
1619 bond_entry_reset(struct bond *bond)
1620 {
1621 if (bond->balance != BM_AB) {
1622 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1623
1624 if (!bond->hash) {
1625 bond->hash = xmalloc(hash_len);
1626 }
1627 memset(bond->hash, 0, hash_len);
1628
1629 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1630 } else {
1631 free(bond->hash);
1632 bond->hash = NULL;
1633 }
1634 }
1635
1636 static struct bond_slave *
1637 bond_slave_lookup(struct bond *bond, const void *slave_)
1638 {
1639 struct bond_slave *slave;
1640
1641 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1642 &bond->slaves) {
1643 if (slave->aux == slave_) {
1644 return slave;
1645 }
1646 }
1647
1648 return NULL;
1649 }
1650
1651 static void
1652 bond_enable_slave(struct bond_slave *slave, bool enable)
1653 {
1654 slave->delay_expires = LLONG_MAX;
1655 if (enable != slave->enabled) {
1656 slave->bond->bond_revalidate = true;
1657 slave->enabled = enable;
1658
1659 ovs_mutex_lock(&slave->bond->mutex);
1660 if (enable) {
1661 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1662 } else {
1663 list_remove(&slave->list_node);
1664 }
1665 ovs_mutex_unlock(&slave->bond->mutex);
1666
1667 VLOG_INFO("interface %s: %s", slave->name,
1668 slave->enabled ? "enabled" : "disabled");
1669 }
1670 }
1671
1672 static void
1673 bond_link_status_update(struct bond_slave *slave)
1674 {
1675 struct bond *bond = slave->bond;
1676 bool up;
1677
1678 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1679 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1680 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1681 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1682 slave->name, up ? "up" : "down");
1683 if (up == slave->enabled) {
1684 slave->delay_expires = LLONG_MAX;
1685 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1686 slave->name, up ? "disabled" : "enabled");
1687 } else {
1688 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1689 : up ? bond->updelay : bond->downdelay);
1690 slave->delay_expires = time_msec() + delay;
1691 if (delay) {
1692 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1693 "for %d ms",
1694 slave->name,
1695 up ? "enabled" : "disabled",
1696 up ? "up" : "down",
1697 delay);
1698 }
1699 }
1700 }
1701
1702 if (time_msec() >= slave->delay_expires) {
1703 bond_enable_slave(slave, up);
1704 }
1705 }
1706
1707 static unsigned int
1708 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1709 {
1710 return hash_mac(mac, vlan, basis);
1711 }
1712
1713 static unsigned int
1714 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1715 {
1716 struct flow hash_flow = *flow;
1717 hash_flow.vlan_tci = htons(vlan);
1718
1719 /* The symmetric quality of this hash function is not required, but
1720 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1721 * purposes, so we use it out of convenience. */
1722 return flow_hash_symmetric_l4(&hash_flow, basis);
1723 }
1724
1725 static unsigned int
1726 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1727 {
1728 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1729
1730 return (bond->balance == BM_TCP
1731 ? bond_hash_tcp(flow, vlan, bond->basis)
1732 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1733 }
1734
1735 static struct bond_entry *
1736 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1737 uint16_t vlan)
1738 {
1739 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1740 }
1741
1742 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1743 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1744 * returns NULL. */
1745 static struct bond_slave *
1746 get_enabled_slave(struct bond *bond)
1747 {
1748 struct ovs_list *node;
1749
1750 ovs_mutex_lock(&bond->mutex);
1751 if (list_is_empty(&bond->enabled_slaves)) {
1752 ovs_mutex_unlock(&bond->mutex);
1753 return NULL;
1754 }
1755
1756 node = list_pop_front(&bond->enabled_slaves);
1757 list_push_back(&bond->enabled_slaves, node);
1758 ovs_mutex_unlock(&bond->mutex);
1759
1760 return CONTAINER_OF(node, struct bond_slave, list_node);
1761 }
1762
1763 static struct bond_slave *
1764 choose_output_slave(const struct bond *bond, const struct flow *flow,
1765 struct flow_wildcards *wc, uint16_t vlan)
1766 {
1767 struct bond_entry *e;
1768 int balance;
1769
1770 balance = bond->balance;
1771 if (bond->lacp_status == LACP_CONFIGURED) {
1772 /* LACP has been configured on this bond but negotiations were
1773 * unsuccussful. If lacp_fallback_ab is enabled use active-
1774 * backup mode else drop all traffic. */
1775 if (!bond->lacp_fallback_ab) {
1776 return NULL;
1777 }
1778 balance = BM_AB;
1779 }
1780
1781 switch (balance) {
1782 case BM_AB:
1783 return bond->active_slave;
1784
1785 case BM_TCP:
1786 if (bond->lacp_status != LACP_NEGOTIATED) {
1787 /* Must have LACP negotiations for TCP balanced bonds. */
1788 return NULL;
1789 }
1790 if (wc) {
1791 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1792 }
1793 /* Fall Through. */
1794 case BM_SLB:
1795 if (wc) {
1796 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1797 }
1798 e = lookup_bond_entry(bond, flow, vlan);
1799 if (!e->slave || !e->slave->enabled) {
1800 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
1801 }
1802 return e->slave;
1803
1804 default:
1805 OVS_NOT_REACHED();
1806 }
1807 }
1808
1809 static struct bond_slave *
1810 bond_choose_slave(const struct bond *bond)
1811 {
1812 struct bond_slave *slave, *best;
1813
1814 /* Find the last active slave. */
1815 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1816 if (slave && slave->enabled) {
1817 return slave;
1818 }
1819
1820 /* Find an enabled slave. */
1821 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1822 if (slave->enabled) {
1823 return slave;
1824 }
1825 }
1826
1827 /* All interfaces are disabled. Find an interface that will be enabled
1828 * after its updelay expires. */
1829 best = NULL;
1830 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1831 if (slave->delay_expires != LLONG_MAX
1832 && slave->may_enable
1833 && (!best || slave->delay_expires < best->delay_expires)) {
1834 best = slave;
1835 }
1836 }
1837 return best;
1838 }
1839
1840 static void
1841 bond_choose_active_slave(struct bond *bond)
1842 {
1843 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1844 struct bond_slave *old_active_slave = bond->active_slave;
1845
1846 bond->active_slave = bond_choose_slave(bond);
1847 if (bond->active_slave) {
1848 if (bond->active_slave->enabled) {
1849 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1850 bond->name, bond->active_slave->name);
1851 } else {
1852 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1853 "remaining %lld ms updelay (since no interface was "
1854 "enabled)", bond->name, bond->active_slave->name,
1855 bond->active_slave->delay_expires - time_msec());
1856 bond_enable_slave(bond->active_slave, true);
1857 }
1858
1859 bond->send_learning_packets = true;
1860
1861 if (bond->active_slave != old_active_slave) {
1862 bond_active_slave_changed(bond);
1863 }
1864 } else if (old_active_slave) {
1865 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1866 }
1867 }
1868
1869 /*
1870 * Return true if bond has unstored active slave change.
1871 * If return true, 'mac' will store the bond's current active slave's
1872 * MAC address. */
1873 bool
1874 bond_get_changed_active_slave(const char *name, uint8_t* mac, bool force)
1875 {
1876 struct bond *bond;
1877
1878 ovs_rwlock_wrlock(&rwlock);
1879 bond = bond_find(name);
1880 if (bond) {
1881 if (bond->active_slave_changed || force) {
1882 memcpy(mac, bond->active_slave_mac, ETH_ADDR_LEN);
1883 bond->active_slave_changed = false;
1884 ovs_rwlock_unlock(&rwlock);
1885 return true;
1886 }
1887 }
1888 ovs_rwlock_unlock(&rwlock);
1889
1890 return false;
1891 }