]> git.proxmox.com Git - mirror_ovs.git/blob - ofproto/bond.c
ofproto/bond: only display hash entries with tx_bytes > 1KB
[mirror_ovs.git] / ofproto / bond.c
1 /*
2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "bond.h"
20
21 #include <limits.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <math.h>
25
26 #include "ofp-util.h"
27 #include "ofp-actions.h"
28 #include "ofpbuf.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
32 #include "coverage.h"
33 #include "dynamic-string.h"
34 #include "flow.h"
35 #include "hmap.h"
36 #include "lacp.h"
37 #include "list.h"
38 #include "netdev.h"
39 #include "odp-util.h"
40 #include "ofpbuf.h"
41 #include "packets.h"
42 #include "poll-loop.h"
43 #include "seq.h"
44 #include "match.h"
45 #include "shash.h"
46 #include "timeval.h"
47 #include "unixctl.h"
48 #include "vlog.h"
49
50 VLOG_DEFINE_THIS_MODULE(bond);
51
52 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
53 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
55
56 /* Bit-mask for hashing a flow down to a bucket. */
57 #define BOND_MASK 0xff
58 #define BOND_BUCKETS (BOND_MASK + 1)
59 #define RECIRC_RULE_PRIORITY 20 /* Priority level for internal rules */
60
/* A hash bucket for mapping a flow to a slave.
 * "struct bond" has an array of BOND_BUCKETS of these.
 *
 * 'tx_bytes' is increased both by bond_account() (per-flow accounting) and by
 * bond_entry_account() (post-recirculation rule statistics). */
struct bond_entry {
    struct bond_slave *slave;   /* Assigned slave, NULL if unassigned. */
    uint64_t tx_bytes           /* Count of bytes recently transmitted. */
        OVS_GUARDED_BY(rwlock);
    struct list list_node;      /* In bond_slave's 'entries' list. */

    /* Recirculation.
     *
     * 'pr_rule' is the post-recirculation rule for this entry.
     * 'pr_tx_bytes' is the byte count most recently read from 'pr_rule'; the
     * difference between a new reading and this value is the delta that
     * bond_entry_account() adds to 'tx_bytes' above. */
    struct rule *pr_rule;
    uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
};
77
/* A bond slave, that is, one of the links comprising a bond. */
struct bond_slave {
    struct hmap_node hmap_node; /* In struct bond's slaves hmap, hashed on the
                                 * client handle 'aux'. */
    struct list list_node;      /* In struct bond's enabled_slaves list. */
    struct bond *bond;          /* The bond that contains this slave. */
    void *aux;                  /* Client-provided handle for this slave. */

    struct netdev *netdev;      /* Network device, owned by the client. */
    unsigned int change_seq;    /* Tracks changes in 'netdev'; reset to 0 when
                                 * 'netdev' is replaced. */
    ofp_port_t ofp_port;        /* OpenFlow port number. */
    char *name;                 /* Name (a copy of netdev_get_name(netdev)). */

    /* Link status. */
    long long delay_expires;    /* Time after which 'enabled' may change;
                                 * LLONG_MAX when no change is pending. */
    bool enabled;               /* May be chosen for flows? */
    bool may_enable;            /* Client considers this slave bondable. */

    /* Rebalancing info.  Used only by bond_rebalance(). */
    struct list bal_node;       /* In bond_rebalance()'s 'bals' list. */
    struct list entries;        /* 'struct bond_entry's assigned here. */
    uint64_t tx_bytes;          /* Sum across 'tx_bytes' of entries. */
};
100
/* A bond, that is, a set of network devices grouped to improve performance or
 * robustness. */
struct bond {
    struct hmap_node hmap_node; /* In 'all_bonds' hmap, hashed on 'name'. */
    char *name;                 /* Name provided by client. */
    struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */

    /* Slaves, as 'struct bond_slave's hashed on their client handle. */
    struct hmap slaves;

    /* Enabled slaves.
     *
     * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
     * (To prevent the bond_slave from disappearing they must also hold
     * 'rwlock'.) */
    struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
    struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */

    /* Bonding info. */
    enum bond_mode balance;     /* Balancing mode, one of BM_*. */
    struct bond_slave *active_slave;
    int updelay, downdelay;     /* Delay before slave goes up/down, in ms. */
    enum lacp_status lacp_status; /* Status of LACP negotiations. */
    bool bond_revalidate;       /* True if flows need revalidation. */
    uint32_t basis;             /* Basis for flow hash function. */

    /* SLB specific bonding info. */
    struct bond_entry *hash;     /* An array of BOND_BUCKETS elements, or NULL
                                  * when no table is allocated. */
    int rebalance_interval;      /* Interval between rebalances, in ms. */
    long long int next_rebalance; /* Next rebalancing time. */
    bool send_learning_packets;
    uint32_t recirc_id;          /* Non zero if recirculation can be used. */
    struct hmap pr_rule_ops;     /* Helps to maintain post recirculation
                                  * rules; holds 'struct bond_pr_rule_op's. */

    /* Legacy compatibility. */
    long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
    bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */

    struct ovs_refcount ref_cnt;
};
141
/* What to do with a 'struct bond_pr_rule_op'. */
enum bond_op {
    ADD,       /* Add the rule to ofproto's flow table. */
    DEL,       /* Delete the rule from the ofproto's flow table. */
};
147
/* A rule to add to or delete from ofproto's internal flow table.
 *
 * Instances live in a bond's 'pr_rule_ops' hmap, hashed on 'match'. */
struct bond_pr_rule_op {
    struct hmap_node hmap_node; /* In struct bond's 'pr_rule_ops' hmap. */
    struct match match;         /* Match of the post-recirculation flow. */
    ofp_port_t out_ofport;      /* Port used in the flow's output action. */
    enum bond_op op;            /* Pending operation, ADD or DEL. */
    struct rule *pr_rule;       /* Rule produced by the last ADD, or NULL. */
};
156
/* Forward declarations of static helpers that are used before their
 * definitions.  The OVS_REQ_* annotations state which kind of hold on the
 * global 'rwlock' each helper expects from its caller. */
static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
    OVS_REQ_RDLOCK(rwlock);
static void bond_enable_slave(struct bond_slave *, bool enable)
    OVS_REQ_WRLOCK(rwlock);
static void bond_link_status_update(struct bond_slave *)
    OVS_REQ_WRLOCK(rwlock);
static void bond_choose_active_slave(struct bond *)
    OVS_REQ_WRLOCK(rwlock);
static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
                                  uint16_t vlan, uint32_t basis);
static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
                                  uint32_t basis);
static struct bond_entry *lookup_bond_entry(const struct bond *,
                                            const struct flow *,
                                            uint16_t vlan)
    OVS_REQ_RDLOCK(rwlock);
static struct bond_slave *get_enabled_slave(struct bond *)
    OVS_REQ_RDLOCK(rwlock);
static struct bond_slave *choose_output_slave(const struct bond *,
                                              const struct flow *,
                                              struct flow_wildcards *,
                                              uint16_t vlan)
    OVS_REQ_RDLOCK(rwlock);
static void bond_update_fake_slave_stats(struct bond *)
    OVS_REQ_RDLOCK(rwlock);
183
184 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
185 * stores the mode in '*balance' and returns true. Otherwise returns false
186 * without modifying '*balance'. */
187 bool
188 bond_mode_from_string(enum bond_mode *balance, const char *s)
189 {
190 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
191 *balance = BM_TCP;
192 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
193 *balance = BM_SLB;
194 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
195 *balance = BM_AB;
196 } else {
197 return false;
198 }
199 return true;
200 }
201
202 /* Returns a string representing 'balance'. */
203 const char *
204 bond_mode_to_string(enum bond_mode balance) {
205 switch (balance) {
206 case BM_TCP:
207 return "balance-tcp";
208 case BM_SLB:
209 return "balance-slb";
210 case BM_AB:
211 return "active-backup";
212 }
213 OVS_NOT_REACHED();
214 }
215
216 \f
217 /* Creates and returns a new bond whose configuration is initially taken from
218 * 's'.
219 *
220 * The caller should register each slave on the new bond by calling
221 * bond_slave_register(). */
222 struct bond *
223 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
224 {
225 struct bond *bond;
226
227 bond = xzalloc(sizeof *bond);
228 bond->ofproto = ofproto;
229 hmap_init(&bond->slaves);
230 list_init(&bond->enabled_slaves);
231 ovs_mutex_init(&bond->mutex);
232 bond->next_fake_iface_update = LLONG_MAX;
233 ovs_refcount_init(&bond->ref_cnt);
234
235 bond->recirc_id = 0;
236 hmap_init(&bond->pr_rule_ops);
237
238 bond_reconfigure(bond, s);
239 return bond;
240 }
241
242 struct bond *
243 bond_ref(const struct bond *bond_)
244 {
245 struct bond *bond = CONST_CAST(struct bond *, bond_);
246
247 if (bond) {
248 ovs_refcount_ref(&bond->ref_cnt);
249 }
250 return bond;
251 }
252
253 /* Frees 'bond'. */
254 void
255 bond_unref(struct bond *bond)
256 {
257 struct bond_slave *slave, *next_slave;
258 struct bond_pr_rule_op *pr_op, *next_op;
259
260 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
261 return;
262 }
263
264 ovs_rwlock_wrlock(&rwlock);
265 hmap_remove(all_bonds, &bond->hmap_node);
266 ovs_rwlock_unlock(&rwlock);
267
268 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
269 hmap_remove(&bond->slaves, &slave->hmap_node);
270 /* Client owns 'slave->netdev'. */
271 free(slave->name);
272 free(slave);
273 }
274 hmap_destroy(&bond->slaves);
275
276 ovs_mutex_destroy(&bond->mutex);
277 free(bond->hash);
278 free(bond->name);
279
280 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
281 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
282 free(pr_op);
283 }
284 hmap_destroy(&bond->pr_rule_ops);
285
286 if (bond->recirc_id) {
287 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
288 }
289
290 free(bond);
291 }
292
293 static void
294 add_pr_rule(struct bond *bond, const struct match *match,
295 ofp_port_t out_ofport, struct rule *rule)
296 {
297 uint32_t hash = match_hash(match, 0);
298 struct bond_pr_rule_op *pr_op;
299
300 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
301 if (match_equal(&pr_op->match, match)) {
302 pr_op->op = ADD;
303 pr_op->out_ofport = out_ofport;
304 pr_op->pr_rule = rule;
305 return;
306 }
307 }
308
309 pr_op = xmalloc(sizeof *pr_op);
310 pr_op->match = *match;
311 pr_op->op = ADD;
312 pr_op->out_ofport = out_ofport;
313 pr_op->pr_rule = rule;
314 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
315 }
316
317 static void
318 update_recirc_rules(struct bond *bond)
319 {
320 struct match match;
321 struct bond_pr_rule_op *pr_op, *next_op;
322 uint64_t ofpacts_stub[128 / 8];
323 struct ofpbuf ofpacts;
324 int i;
325
326 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
327
328 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
329 pr_op->op = DEL;
330 }
331
332 if ((bond->hash == NULL) || (!bond->recirc_id)) {
333 return;
334 }
335
336 for (i = 0; i < BOND_BUCKETS; i++) {
337 struct bond_slave *slave = bond->hash[i].slave;
338
339 if (slave) {
340 match_init_catchall(&match);
341 match_set_recirc_id(&match, bond->recirc_id);
342 /* recirc_id -> metadata to speed up look ups. */
343 match_set_metadata(&match, htonll(bond->recirc_id));
344 match_set_dp_hash_masked(&match, i, BOND_MASK);
345
346 add_pr_rule(bond, &match, slave->ofp_port,
347 bond->hash[i].pr_rule);
348 }
349 }
350
351 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
352 int error;
353 struct rule *rule;
354 switch (pr_op->op) {
355 case ADD:
356 ofpbuf_clear(&ofpacts);
357 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
358 error = ofproto_dpif_add_internal_flow(bond->ofproto,
359 &pr_op->match,
360 RECIRC_RULE_PRIORITY,
361 &ofpacts, &rule);
362 if (error) {
363 char *err_s = match_to_string(&pr_op->match,
364 RECIRC_RULE_PRIORITY);
365
366 VLOG_ERR("failed to add post recirculation flow %s", err_s);
367 free(err_s);
368 pr_op->pr_rule = NULL;
369 } else {
370 pr_op->pr_rule = rule;
371 }
372 break;
373
374 case DEL:
375 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
376 &pr_op->match,
377 RECIRC_RULE_PRIORITY);
378 if (error) {
379 char *err_s = match_to_string(&pr_op->match,
380 RECIRC_RULE_PRIORITY);
381
382 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
383 free(err_s);
384 }
385
386 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
387 pr_op->pr_rule = NULL;
388 free(pr_op);
389 break;
390 }
391 }
392
393 ofpbuf_uninit(&ofpacts);
394 }
395
396
397 /* Updates 'bond''s overall configuration to 's'.
398 *
399 * The caller should register each slave on 'bond' by calling
400 * bond_slave_register(). This is optional if none of the slaves'
401 * configuration has changed. In any case it can't hurt.
402 *
403 * Returns true if the configuration has changed in such a way that requires
404 * flow revalidation.
405 * */
406 bool
407 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
408 {
409 bool revalidate = false;
410
411 ovs_rwlock_wrlock(&rwlock);
412 if (!bond->name || strcmp(bond->name, s->name)) {
413 if (bond->name) {
414 hmap_remove(all_bonds, &bond->hmap_node);
415 free(bond->name);
416 }
417 bond->name = xstrdup(s->name);
418 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
419 }
420
421 bond->updelay = s->up_delay;
422 bond->downdelay = s->down_delay;
423
424 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
425 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
426 revalidate = true;
427 }
428
429 if (bond->rebalance_interval != s->rebalance_interval) {
430 bond->rebalance_interval = s->rebalance_interval;
431 revalidate = true;
432 }
433
434 if (bond->balance != s->balance) {
435 bond->balance = s->balance;
436 revalidate = true;
437 }
438
439 if (bond->basis != s->basis) {
440 bond->basis = s->basis;
441 revalidate = true;
442 }
443
444 if (s->fake_iface) {
445 if (bond->next_fake_iface_update == LLONG_MAX) {
446 bond->next_fake_iface_update = time_msec();
447 }
448 } else {
449 bond->next_fake_iface_update = LLONG_MAX;
450 }
451
452 if (bond->bond_revalidate) {
453 revalidate = true;
454 bond->bond_revalidate = false;
455 }
456
457 if (bond->balance != BM_AB) {
458 if (!bond->recirc_id) {
459 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
460 }
461 } else if (bond->recirc_id) {
462 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
463 bond->recirc_id = 0;
464 }
465
466 if (bond->balance == BM_AB || !bond->hash || revalidate) {
467 bond_entry_reset(bond);
468 }
469
470 ovs_rwlock_unlock(&rwlock);
471 return revalidate;
472 }
473
474 static void
475 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
476 OVS_REQ_WRLOCK(rwlock)
477 {
478 if (slave->netdev != netdev) {
479 slave->netdev = netdev;
480 slave->change_seq = 0;
481 }
482 }
483
484 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
485 * arbitrary client-provided pointer that uniquely identifies a slave within a
486 * bond. If 'slave_' already exists within 'bond' then this function
487 * reconfigures the existing slave.
488 *
489 * 'netdev' must be the network device that 'slave_' represents. It is owned
490 * by the client, so the client must not close it before either unregistering
491 * 'slave_' or destroying 'bond'.
492 */
493 void
494 bond_slave_register(struct bond *bond, void *slave_,
495 ofp_port_t ofport, struct netdev *netdev)
496 {
497 struct bond_slave *slave;
498
499 ovs_rwlock_wrlock(&rwlock);
500 slave = bond_slave_lookup(bond, slave_);
501 if (!slave) {
502 slave = xzalloc(sizeof *slave);
503
504 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
505 slave->bond = bond;
506 slave->aux = slave_;
507 slave->ofp_port = ofport;
508 slave->delay_expires = LLONG_MAX;
509 slave->name = xstrdup(netdev_get_name(netdev));
510 bond->bond_revalidate = true;
511
512 slave->enabled = false;
513 bond_enable_slave(slave, netdev_get_carrier(netdev));
514 }
515
516 bond_slave_set_netdev__(slave, netdev);
517
518 free(slave->name);
519 slave->name = xstrdup(netdev_get_name(netdev));
520 ovs_rwlock_unlock(&rwlock);
521 }
522
523 /* Updates the network device to be used with 'slave_' to 'netdev'.
524 *
525 * This is useful if the caller closes and re-opens the network device
526 * registered with bond_slave_register() but doesn't need to change anything
527 * else. */
528 void
529 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
530 {
531 struct bond_slave *slave;
532
533 ovs_rwlock_wrlock(&rwlock);
534 slave = bond_slave_lookup(bond, slave_);
535 if (slave) {
536 bond_slave_set_netdev__(slave, netdev);
537 }
538 ovs_rwlock_unlock(&rwlock);
539 }
540
541 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
542 * then this function has no effect.
543 *
544 * Unregistering a slave invalidates all flows. */
545 void
546 bond_slave_unregister(struct bond *bond, const void *slave_)
547 {
548 struct bond_slave *slave;
549 bool del_active;
550
551 ovs_rwlock_wrlock(&rwlock);
552 slave = bond_slave_lookup(bond, slave_);
553 if (!slave) {
554 goto out;
555 }
556
557 bond->bond_revalidate = true;
558 bond_enable_slave(slave, false);
559
560 del_active = bond->active_slave == slave;
561 if (bond->hash) {
562 struct bond_entry *e;
563 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
564 if (e->slave == slave) {
565 e->slave = NULL;
566 }
567 }
568 }
569
570 free(slave->name);
571
572 hmap_remove(&bond->slaves, &slave->hmap_node);
573 /* Client owns 'slave->netdev'. */
574 free(slave);
575
576 if (del_active) {
577 bond_choose_active_slave(bond);
578 bond->send_learning_packets = true;
579 }
580 out:
581 ovs_rwlock_unlock(&rwlock);
582 }
583
584 /* Should be called on each slave in 'bond' before bond_run() to indicate
585 * whether or not 'slave_' may be enabled. This function is intended to allow
586 * other protocols to have some impact on bonding decisions. For example LACP
587 * or high level link monitoring protocols may decide that a given slave should
588 * not be able to send traffic. */
589 void
590 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
591 {
592 ovs_rwlock_wrlock(&rwlock);
593 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
594 ovs_rwlock_unlock(&rwlock);
595 }
596
597 /* Performs periodic maintenance on 'bond'.
598 *
599 * Returns true if the caller should revalidate its flows.
600 *
601 * The caller should check bond_should_send_learning_packets() afterward. */
602 bool
603 bond_run(struct bond *bond, enum lacp_status lacp_status)
604 {
605 struct bond_slave *slave;
606 bool revalidate;
607
608 ovs_rwlock_wrlock(&rwlock);
609 if (bond->lacp_status != lacp_status) {
610 bond->lacp_status = lacp_status;
611 bond->bond_revalidate = true;
612 }
613
614 /* Enable slaves based on link status and LACP feedback. */
615 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
616 bond_link_status_update(slave);
617 slave->change_seq = seq_read(connectivity_seq_get());
618 }
619 if (!bond->active_slave || !bond->active_slave->enabled) {
620 bond_choose_active_slave(bond);
621 }
622
623 /* Update fake bond interface stats. */
624 if (time_msec() >= bond->next_fake_iface_update) {
625 bond_update_fake_slave_stats(bond);
626 bond->next_fake_iface_update = time_msec() + 1000;
627 }
628
629 revalidate = bond->bond_revalidate;
630 bond->bond_revalidate = false;
631 ovs_rwlock_unlock(&rwlock);
632
633 return revalidate;
634 }
635
636 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
637 void
638 bond_wait(struct bond *bond)
639 {
640 struct bond_slave *slave;
641
642 ovs_rwlock_rdlock(&rwlock);
643 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
644 if (slave->delay_expires != LLONG_MAX) {
645 poll_timer_wait_until(slave->delay_expires);
646 }
647
648 seq_wait(connectivity_seq_get(), slave->change_seq);
649 }
650
651 if (bond->next_fake_iface_update != LLONG_MAX) {
652 poll_timer_wait_until(bond->next_fake_iface_update);
653 }
654
655 if (bond->bond_revalidate) {
656 poll_immediate_wake();
657 }
658 ovs_rwlock_unlock(&rwlock);
659
660 /* We don't wait for bond->next_rebalance because rebalancing can only run
661 * at a flow account checkpoint. ofproto does checkpointing on its own
662 * schedule and bond_rebalance() gets called afterward, so we'd just be
663 * waking up for no purpose. */
664 }
665 \f
666 /* MAC learning table interaction. */
667
668 static bool
669 may_send_learning_packets(const struct bond *bond)
670 {
671 return ((bond->lacp_status == LACP_DISABLED
672 && (bond->balance == BM_SLB || bond->balance == BM_AB))
673 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
674 && bond->active_slave;
675 }
676
677 /* Returns true if 'bond' needs the client to send out packets to assist with
678 * MAC learning on 'bond'. If this function returns true, then the client
679 * should iterate through its MAC learning table for the bridge on which 'bond'
680 * is located. For each MAC that has been learned on a port other than 'bond',
681 * it should call bond_compose_learning_packet().
682 *
683 * This function will only return true if 'bond' is in SLB or active-backup
684 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
685 * necessary.
686 *
687 * Calling this function resets the state that it checks. */
688 bool
689 bond_should_send_learning_packets(struct bond *bond)
690 {
691 bool send;
692
693 ovs_rwlock_wrlock(&rwlock);
694 send = bond->send_learning_packets && may_send_learning_packets(bond);
695 bond->send_learning_packets = false;
696 ovs_rwlock_unlock(&rwlock);
697 return send;
698 }
699
700 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
701 *
702 * See bond_should_send_learning_packets() for description of usage. The
703 * caller should send the composed packet on the port associated with
704 * port_aux and takes ownership of the returned ofpbuf. */
705 struct ofpbuf *
706 bond_compose_learning_packet(struct bond *bond,
707 const uint8_t eth_src[ETH_ADDR_LEN],
708 uint16_t vlan, void **port_aux)
709 {
710 struct bond_slave *slave;
711 struct ofpbuf *packet;
712 struct flow flow;
713
714 ovs_rwlock_rdlock(&rwlock);
715 ovs_assert(may_send_learning_packets(bond));
716 memset(&flow, 0, sizeof flow);
717 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
718 slave = choose_output_slave(bond, &flow, NULL, vlan);
719
720 packet = ofpbuf_new(0);
721 compose_rarp(packet, eth_src);
722 if (vlan) {
723 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
724 }
725
726 *port_aux = slave->aux;
727 ovs_rwlock_unlock(&rwlock);
728 return packet;
729 }
730 \f
731 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
732 * Ethernet destination address of 'eth_dst', should be admitted.
733 *
734 * The return value is one of the following:
735 *
736 * - BV_ACCEPT: Admit the packet.
737 *
738 * - BV_DROP: Drop the packet.
739 *
740 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
741 * Ethernet source address and VLAN. If there is none, or if the packet
742 * is on the learned port, then admit the packet. If a different port has
743 * been learned, however, drop the packet (and do not use it for MAC
744 * learning).
745 */
746 enum bond_verdict
747 bond_check_admissibility(struct bond *bond, const void *slave_,
748 const uint8_t eth_dst[ETH_ADDR_LEN])
749 {
750 enum bond_verdict verdict = BV_DROP;
751 struct bond_slave *slave;
752
753 ovs_rwlock_rdlock(&rwlock);
754 slave = bond_slave_lookup(bond, slave_);
755 if (!slave) {
756 goto out;
757 }
758
759 /* LACP bonds have very loose admissibility restrictions because we can
760 * assume the remote switch is aware of the bond and will "do the right
761 * thing". However, as a precaution we drop packets on disabled slaves
762 * because no correctly implemented partner switch should be sending
763 * packets to them.
764 *
765 * If LACP is configured, but LACP negotiations have been unsuccessful, we
766 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
767 switch (bond->lacp_status) {
768 case LACP_NEGOTIATED:
769 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
770 goto out;
771 case LACP_CONFIGURED:
772 if (!bond->lacp_fallback_ab) {
773 goto out;
774 }
775 case LACP_DISABLED:
776 break;
777 }
778
779 /* Drop all multicast packets on inactive slaves. */
780 if (eth_addr_is_multicast(eth_dst)) {
781 if (bond->active_slave != slave) {
782 goto out;
783 }
784 }
785
786 switch (bond->balance) {
787 case BM_TCP:
788 /* TCP balanced bonds require successful LACP negotiations. Based on the
789 * above check, LACP is off or lacp_fallback_ab is true on this bond.
790 * If lacp_fallback_ab is true fall through to BM_AB case else, we
791 * drop all incoming traffic. */
792 if (!bond->lacp_fallback_ab) {
793 goto out;
794 }
795
796 case BM_AB:
797 /* Drop all packets which arrive on backup slaves. This is similar to
798 * how Linux bonding handles active-backup bonds. */
799 if (bond->active_slave != slave) {
800 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
801
802 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
803 " slave (%s) destined for " ETH_ADDR_FMT,
804 slave->name, ETH_ADDR_ARGS(eth_dst));
805 goto out;
806 }
807 verdict = BV_ACCEPT;
808 goto out;
809
810 case BM_SLB:
811 /* Drop all packets for which we have learned a different input port,
812 * because we probably sent the packet on one slave and got it back on
813 * the other. Gratuitous ARP packets are an exception to this rule:
814 * the host has moved to another switch. The exception to the
815 * exception is if we locked the learning table to avoid reflections on
816 * bond slaves. */
817 verdict = BV_DROP_IF_MOVED;
818 goto out;
819 }
820
821 OVS_NOT_REACHED();
822 out:
823 ovs_rwlock_unlock(&rwlock);
824 return verdict;
825
826 }
827
828 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
829 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
830 * NULL if the packet should be dropped because no slaves are enabled.
831 *
832 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
833 * should be a VID only (i.e. excluding the PCP bits). Second,
834 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
835 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
836 * packet belongs to (so for an access port it will be the access port's VLAN).
837 *
838 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
839 * significant in the selection. At some point earlier, 'wc' should
840 * have been initialized (e.g., by flow_wildcards_init_catchall()).
841 */
842 void *
843 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
844 struct flow_wildcards *wc, uint16_t vlan)
845 {
846 struct bond_slave *slave;
847 void *aux;
848
849 ovs_rwlock_rdlock(&rwlock);
850 slave = choose_output_slave(bond, flow, wc, vlan);
851 aux = slave ? slave->aux : NULL;
852 ovs_rwlock_unlock(&rwlock);
853
854 return aux;
855 }
856 \f
857 /* Recirculation. */
858 static void
859 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
860 OVS_REQ_WRLOCK(rwlock)
861 {
862 if (entry->slave) {
863 uint64_t delta;
864
865 delta = rule_tx_bytes - entry->pr_tx_bytes;
866 entry->tx_bytes += delta;
867 entry->pr_tx_bytes = rule_tx_bytes;
868 }
869 }
870
871 /* Maintain bond stats using post recirculation rule byte counters.*/
872 void
873 bond_recirculation_account(struct bond *bond)
874 {
875 int i;
876
877 ovs_rwlock_wrlock(&rwlock);
878 for (i=0; i<=BOND_MASK; i++) {
879 struct bond_entry *entry = &bond->hash[i];
880 struct rule *rule = entry->pr_rule;
881
882 if (rule) {
883 uint64_t n_packets OVS_UNUSED;
884 long long int used OVS_UNUSED;
885 uint64_t n_bytes;
886
887 rule->ofproto->ofproto_class->rule_get_stats(
888 rule, &n_packets, &n_bytes, &used);
889 bond_entry_account(entry, n_bytes);
890 }
891 }
892 ovs_rwlock_unlock(&rwlock);
893 }
894
895 bool
896 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
897 uint32_t *hash_bias)
898 {
899 if (bond->balance == BM_TCP) {
900 if (recirc_id) {
901 *recirc_id = bond->recirc_id;
902 }
903 if (hash_bias) {
904 *hash_bias = bond->basis;
905 }
906 return true;
907 } else {
908 return false;
909 }
910 }
911
912 void
913 bond_update_post_recirc_rules(struct bond* bond, const bool force)
914 {
915 struct bond_entry *e;
916 bool update_rules = force; /* Always update rules if caller forces it. */
917
918 /* Make sure all bond entries are populated */
919 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
920 if (!e->slave || !e->slave->enabled) {
921 update_rules = true;
922 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
923 struct bond_slave, hmap_node);
924 if (!e->slave->enabled) {
925 e->slave = bond->active_slave;
926 }
927 }
928 }
929
930 if (update_rules) {
931 update_recirc_rules(bond);
932 }
933 }
934 \f
935 /* Rebalancing. */
936
937 static bool
938 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
939 {
940 return bond->rebalance_interval
941 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
942 }
943
944 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
945 void
946 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
947 uint64_t n_bytes)
948 {
949 ovs_rwlock_wrlock(&rwlock);
950 if (bond_is_balanced(bond)) {
951 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
952 }
953 ovs_rwlock_unlock(&rwlock);
954 }
955
956 static struct bond_slave *
957 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
958 {
959 return CONTAINER_OF(bal, struct bond_slave, bal_node);
960 }
961
962 static void
963 log_bals(struct bond *bond, const struct list *bals)
964 OVS_REQ_RDLOCK(rwlock)
965 {
966 if (VLOG_IS_DBG_ENABLED()) {
967 struct ds ds = DS_EMPTY_INITIALIZER;
968 const struct bond_slave *slave;
969
970 LIST_FOR_EACH (slave, bal_node, bals) {
971 if (ds.length) {
972 ds_put_char(&ds, ',');
973 }
974 ds_put_format(&ds, " %s %"PRIu64"kB",
975 slave->name, slave->tx_bytes / 1024);
976
977 if (!slave->enabled) {
978 ds_put_cstr(&ds, " (disabled)");
979 }
980 if (!list_is_empty(&slave->entries)) {
981 struct bond_entry *e;
982
983 ds_put_cstr(&ds, " (");
984 LIST_FOR_EACH (e, list_node, &slave->entries) {
985 if (&e->list_node != list_front(&slave->entries)) {
986 ds_put_cstr(&ds, " + ");
987 }
988 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
989 e - bond->hash, e->tx_bytes / 1024);
990 }
991 ds_put_cstr(&ds, ")");
992 }
993 }
994 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
995 ds_destroy(&ds);
996 }
997 }
998
999 /* Shifts 'hash' from its current slave to 'to'. */
1000 static void
1001 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
1002 OVS_REQ_WRLOCK(rwlock)
1003 {
1004 struct bond_slave *from = hash->slave;
1005 struct bond *bond = from->bond;
1006 uint64_t delta = hash->tx_bytes;
1007
1008 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1009 "from %s to %s (now carrying %"PRIu64"kB and "
1010 "%"PRIu64"kB load, respectively)",
1011 bond->name, delta / 1024, hash - bond->hash,
1012 from->name, to->name,
1013 (from->tx_bytes - delta) / 1024,
1014 (to->tx_bytes + delta) / 1024);
1015
1016 /* Shift load away from 'from' to 'to'. */
1017 from->tx_bytes -= delta;
1018 to->tx_bytes += delta;
1019
1020 /* Arrange for flows to be revalidated. */
1021 hash->slave = to;
1022 bond->bond_revalidate = true;
1023 }
1024
1025 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1026 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1027 * given that doing so must decrease the ratio of the load on the two slaves by
1028 * at least 0.1. Returns NULL if there is no appropriate entry.
1029 *
1030 * The list of entries isn't sorted. I don't know of a reason to prefer to
1031 * shift away small hashes or large hashes. */
1032 static struct bond_entry *
1033 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1034 OVS_REQ_WRLOCK(rwlock)
1035 {
1036 struct bond_entry *e;
1037
1038 if (list_is_short(&from->entries)) {
1039 /* 'from' carries no more than one MAC hash, so shifting load away from
1040 * it would be pointless. */
1041 return NULL;
1042 }
1043
1044 LIST_FOR_EACH (e, list_node, &from->entries) {
1045 double old_ratio, new_ratio;
1046 uint64_t delta;
1047
1048 if (to_tx_bytes == 0) {
1049 /* Nothing on the new slave, move it. */
1050 return e;
1051 }
1052
1053 delta = e->tx_bytes;
1054 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1055 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1056 if (old_ratio - new_ratio > 0.1
1057 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1058 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1059 and 'to' slave have the same load. Therefore, we only move an
1060 entry if it decreases the load on 'from', and brings us closer
1061 to equal traffic load. */
1062 return e;
1063 }
1064 }
1065
1066 return NULL;
1067 }
1068
1069 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1070 * maintained. */
1071 static void
1072 insert_bal(struct list *bals, struct bond_slave *slave)
1073 {
1074 struct bond_slave *pos;
1075
1076 LIST_FOR_EACH (pos, bal_node, bals) {
1077 if (slave->tx_bytes > pos->tx_bytes) {
1078 break;
1079 }
1080 }
1081 list_insert(&pos->bal_node, &slave->bal_node);
1082 }
1083
1084 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1085 * that descending order of 'tx_bytes' is maintained. */
1086 static void
1087 reinsert_bal(struct list *bals, struct bond_slave *slave)
1088 {
1089 list_remove(&slave->bal_node);
1090 insert_bal(bals, slave);
1091 }
1092
1093 /* If 'bond' needs rebalancing, does so.
1094 *
1095 * The caller should have called bond_account() for each active flow, or in case
1096 * of recirculation is used, have called bond_recirculation_account(bond),
1097 * to ensure that flow data is consistently accounted at this point.
1098 *
1099 * Return whether rebalancing took place.*/
1100 bool
1101 bond_rebalance(struct bond *bond)
1102 {
1103 struct bond_slave *slave;
1104 struct bond_entry *e;
1105 struct list bals;
1106 bool rebalanced = false;
1107
1108 ovs_rwlock_wrlock(&rwlock);
1109 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1110 goto done;
1111 }
1112 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1113
1114 /* Add each bond_entry to its slave's 'entries' list.
1115 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1116 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1117 slave->tx_bytes = 0;
1118 list_init(&slave->entries);
1119 }
1120 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1121 if (e->slave && e->tx_bytes) {
1122 e->slave->tx_bytes += e->tx_bytes;
1123 list_push_back(&e->slave->entries, &e->list_node);
1124 }
1125 }
1126
1127 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1128 *
1129 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1130 * with a proper list sort algorithm. */
1131 list_init(&bals);
1132 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1133 if (slave->enabled) {
1134 insert_bal(&bals, slave);
1135 }
1136 }
1137 log_bals(bond, &bals);
1138
1139 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1140 while (!list_is_short(&bals)) {
1141 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1142 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1143 uint64_t overload;
1144
1145 overload = from->tx_bytes - to->tx_bytes;
1146 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1147 /* The extra load on 'from' (and all less-loaded slaves), compared
1148 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1149 * it is less than ~1Mbps. No point in rebalancing. */
1150 break;
1151 }
1152
1153 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1154 * to move from 'from' to 'to'. */
1155 e = choose_entry_to_migrate(from, to->tx_bytes);
1156 if (e) {
1157 bond_shift_load(e, to);
1158
1159 /* Delete element from from->entries.
1160 *
1161 * We don't add the element to to->hashes. That would only allow
1162 * 'e' to be migrated to another slave in this rebalancing run, and
1163 * there is no point in doing that. */
1164 list_remove(&e->list_node);
1165
1166 /* Re-sort 'bals'. */
1167 reinsert_bal(&bals, from);
1168 reinsert_bal(&bals, to);
1169 rebalanced = true;
1170 } else {
1171 /* Can't usefully migrate anything away from 'from'.
1172 * Don't reconsider it. */
1173 list_remove(&from->bal_node);
1174 }
1175 }
1176
1177 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1178 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1179 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1180 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1181 e->tx_bytes /= 2;
1182 if (!e->tx_bytes) {
1183 e->slave = NULL;
1184 }
1185 }
1186
1187 done:
1188 ovs_rwlock_unlock(&rwlock);
1189 return rebalanced;
1190 }
1191 \f
1192 /* Bonding unixctl user interface functions. */
1193
1194 static struct bond *
1195 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
1196 {
1197 struct bond *bond;
1198
1199 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1200 all_bonds) {
1201 if (!strcmp(bond->name, name)) {
1202 return bond;
1203 }
1204 }
1205 return NULL;
1206 }
1207
1208 static struct bond_slave *
1209 bond_lookup_slave(struct bond *bond, const char *slave_name)
1210 {
1211 struct bond_slave *slave;
1212
1213 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1214 if (!strcmp(slave->name, slave_name)) {
1215 return slave;
1216 }
1217 }
1218 return NULL;
1219 }
1220
1221 static void
1222 bond_unixctl_list(struct unixctl_conn *conn,
1223 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1224 void *aux OVS_UNUSED)
1225 {
1226 struct ds ds = DS_EMPTY_INITIALIZER;
1227 const struct bond *bond;
1228
1229 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
1230
1231 ovs_rwlock_rdlock(&rwlock);
1232 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1233 const struct bond_slave *slave;
1234 size_t i;
1235
1236 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1237 bond_mode_to_string(bond->balance), bond->recirc_id);
1238
1239 i = 0;
1240 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1241 if (i++ > 0) {
1242 ds_put_cstr(&ds, ", ");
1243 }
1244 ds_put_cstr(&ds, slave->name);
1245 }
1246 ds_put_char(&ds, '\n');
1247 }
1248 ovs_rwlock_unlock(&rwlock);
1249 unixctl_command_reply(conn, ds_cstr(&ds));
1250 ds_destroy(&ds);
1251 }
1252
1253 static void
1254 bond_print_details(struct ds *ds, const struct bond *bond)
1255 OVS_REQ_RDLOCK(rwlock)
1256 {
1257 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1258 const struct shash_node **sorted_slaves = NULL;
1259 const struct bond_slave *slave;
1260 bool may_recirc;
1261 uint32_t recirc_id;
1262 int i;
1263
1264 ds_put_format(ds, "---- %s ----\n", bond->name);
1265 ds_put_format(ds, "bond_mode: %s\n",
1266 bond_mode_to_string(bond->balance));
1267
1268 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1269 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1270 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1271
1272 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1273
1274 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1275 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1276
1277 if (bond_is_balanced(bond)) {
1278 ds_put_format(ds, "next rebalance: %lld ms\n",
1279 bond->next_rebalance - time_msec());
1280 }
1281
1282 ds_put_cstr(ds, "lacp_status: ");
1283 switch (bond->lacp_status) {
1284 case LACP_NEGOTIATED:
1285 ds_put_cstr(ds, "negotiated\n");
1286 break;
1287 case LACP_CONFIGURED:
1288 ds_put_cstr(ds, "configured\n");
1289 break;
1290 case LACP_DISABLED:
1291 ds_put_cstr(ds, "off\n");
1292 break;
1293 default:
1294 ds_put_cstr(ds, "<unknown>\n");
1295 break;
1296 }
1297
1298 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1299 shash_add(&slave_shash, slave->name, slave);
1300 }
1301 sorted_slaves = shash_sort(&slave_shash);
1302
1303 for (i = 0; i < shash_count(&slave_shash); i++) {
1304 struct bond_entry *be;
1305
1306 slave = sorted_slaves[i]->data;
1307
1308 /* Basic info. */
1309 ds_put_format(ds, "\nslave %s: %s\n",
1310 slave->name, slave->enabled ? "enabled" : "disabled");
1311 if (slave == bond->active_slave) {
1312 ds_put_cstr(ds, "\tactive slave\n");
1313 }
1314 if (slave->delay_expires != LLONG_MAX) {
1315 ds_put_format(ds, "\t%s expires in %lld ms\n",
1316 slave->enabled ? "downdelay" : "updelay",
1317 slave->delay_expires - time_msec());
1318 }
1319
1320 ds_put_format(ds, "\tmay_enable: %s\n",
1321 slave->may_enable ? "true" : "false");
1322
1323 if (!bond_is_balanced(bond)) {
1324 continue;
1325 }
1326
1327 /* Hashes. */
1328 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1329 int hash = be - bond->hash;
1330 uint64_t be_tx_k;
1331
1332 if (be->slave != slave) {
1333 continue;
1334 }
1335
1336 be_tx_k = be->tx_bytes / 1024;
1337 if (be_tx_k) {
1338 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1339 hash, be_tx_k);
1340 }
1341
1342 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1343 }
1344 }
1345 shash_destroy(&slave_shash);
1346 free(sorted_slaves);
1347 ds_put_cstr(ds, "\n");
1348 }
1349
1350 static void
1351 bond_unixctl_show(struct unixctl_conn *conn,
1352 int argc, const char *argv[],
1353 void *aux OVS_UNUSED)
1354 {
1355 struct ds ds = DS_EMPTY_INITIALIZER;
1356
1357 ovs_rwlock_rdlock(&rwlock);
1358 if (argc > 1) {
1359 const struct bond *bond = bond_find(argv[1]);
1360
1361 if (!bond) {
1362 unixctl_command_reply_error(conn, "no such bond");
1363 goto out;
1364 }
1365 bond_print_details(&ds, bond);
1366 } else {
1367 const struct bond *bond;
1368
1369 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1370 bond_print_details(&ds, bond);
1371 }
1372 }
1373
1374 unixctl_command_reply(conn, ds_cstr(&ds));
1375 ds_destroy(&ds);
1376
1377 out:
1378 ovs_rwlock_unlock(&rwlock);
1379 }
1380
1381 static void
1382 bond_unixctl_migrate(struct unixctl_conn *conn,
1383 int argc OVS_UNUSED, const char *argv[],
1384 void *aux OVS_UNUSED)
1385 {
1386 const char *bond_s = argv[1];
1387 const char *hash_s = argv[2];
1388 const char *slave_s = argv[3];
1389 struct bond *bond;
1390 struct bond_slave *slave;
1391 struct bond_entry *entry;
1392 int hash;
1393
1394 ovs_rwlock_wrlock(&rwlock);
1395 bond = bond_find(bond_s);
1396 if (!bond) {
1397 unixctl_command_reply_error(conn, "no such bond");
1398 goto out;
1399 }
1400
1401 if (bond->balance != BM_SLB) {
1402 unixctl_command_reply_error(conn, "not an SLB bond");
1403 goto out;
1404 }
1405
1406 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1407 hash = atoi(hash_s) & BOND_MASK;
1408 } else {
1409 unixctl_command_reply_error(conn, "bad hash");
1410 goto out;
1411 }
1412
1413 slave = bond_lookup_slave(bond, slave_s);
1414 if (!slave) {
1415 unixctl_command_reply_error(conn, "no such slave");
1416 goto out;
1417 }
1418
1419 if (!slave->enabled) {
1420 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1421 goto out;
1422 }
1423
1424 entry = &bond->hash[hash];
1425 bond->bond_revalidate = true;
1426 entry->slave = slave;
1427 unixctl_command_reply(conn, "migrated");
1428
1429 out:
1430 ovs_rwlock_unlock(&rwlock);
1431 }
1432
1433 static void
1434 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1435 int argc OVS_UNUSED, const char *argv[],
1436 void *aux OVS_UNUSED)
1437 {
1438 const char *bond_s = argv[1];
1439 const char *slave_s = argv[2];
1440 struct bond *bond;
1441 struct bond_slave *slave;
1442
1443 ovs_rwlock_wrlock(&rwlock);
1444 bond = bond_find(bond_s);
1445 if (!bond) {
1446 unixctl_command_reply_error(conn, "no such bond");
1447 goto out;
1448 }
1449
1450 slave = bond_lookup_slave(bond, slave_s);
1451 if (!slave) {
1452 unixctl_command_reply_error(conn, "no such slave");
1453 goto out;
1454 }
1455
1456 if (!slave->enabled) {
1457 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1458 goto out;
1459 }
1460
1461 if (bond->active_slave != slave) {
1462 bond->bond_revalidate = true;
1463 bond->active_slave = slave;
1464 VLOG_INFO("bond %s: active interface is now %s",
1465 bond->name, slave->name);
1466 bond->send_learning_packets = true;
1467 unixctl_command_reply(conn, "done");
1468 } else {
1469 unixctl_command_reply(conn, "no change");
1470 }
1471 out:
1472 ovs_rwlock_unlock(&rwlock);
1473 }
1474
1475 static void
1476 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1477 {
1478 const char *bond_s = argv[1];
1479 const char *slave_s = argv[2];
1480 struct bond *bond;
1481 struct bond_slave *slave;
1482
1483 ovs_rwlock_wrlock(&rwlock);
1484 bond = bond_find(bond_s);
1485 if (!bond) {
1486 unixctl_command_reply_error(conn, "no such bond");
1487 goto out;
1488 }
1489
1490 slave = bond_lookup_slave(bond, slave_s);
1491 if (!slave) {
1492 unixctl_command_reply_error(conn, "no such slave");
1493 goto out;
1494 }
1495
1496 bond_enable_slave(slave, enable);
1497 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1498
1499 out:
1500 ovs_rwlock_unlock(&rwlock);
1501 }
1502
1503 static void
1504 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1505 int argc OVS_UNUSED, const char *argv[],
1506 void *aux OVS_UNUSED)
1507 {
1508 enable_slave(conn, argv, true);
1509 }
1510
1511 static void
1512 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1513 int argc OVS_UNUSED, const char *argv[],
1514 void *aux OVS_UNUSED)
1515 {
1516 enable_slave(conn, argv, false);
1517 }
1518
1519 static void
1520 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1521 void *aux OVS_UNUSED)
1522 {
1523 const char *mac_s = argv[1];
1524 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1525 const char *basis_s = argc > 3 ? argv[3] : NULL;
1526 uint8_t mac[ETH_ADDR_LEN];
1527 uint8_t hash;
1528 char *hash_cstr;
1529 unsigned int vlan;
1530 uint32_t basis;
1531
1532 if (vlan_s) {
1533 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1534 unixctl_command_reply_error(conn, "invalid vlan");
1535 return;
1536 }
1537 } else {
1538 vlan = 0;
1539 }
1540
1541 if (basis_s) {
1542 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1543 unixctl_command_reply_error(conn, "invalid basis");
1544 return;
1545 }
1546 } else {
1547 basis = 0;
1548 }
1549
1550 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1551 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1552
1553 hash_cstr = xasprintf("%u", hash);
1554 unixctl_command_reply(conn, hash_cstr);
1555 free(hash_cstr);
1556 } else {
1557 unixctl_command_reply_error(conn, "invalid mac");
1558 }
1559 }
1560
1561 void
1562 bond_init(void)
1563 {
1564 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1565 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1566 NULL);
1567 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1568 bond_unixctl_migrate, NULL);
1569 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1570 bond_unixctl_set_active_slave, NULL);
1571 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1572 bond_unixctl_enable_slave, NULL);
1573 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1574 bond_unixctl_disable_slave, NULL);
1575 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1576 bond_unixctl_hash, NULL);
1577 }
1578 \f
1579 static void
1580 bond_entry_reset(struct bond *bond)
1581 {
1582 if (bond->balance != BM_AB) {
1583 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1584
1585 if (!bond->hash) {
1586 bond->hash = xmalloc(hash_len);
1587 }
1588 memset(bond->hash, 0, hash_len);
1589
1590 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1591 } else {
1592 free(bond->hash);
1593 bond->hash = NULL;
1594 }
1595 }
1596
1597 static struct bond_slave *
1598 bond_slave_lookup(struct bond *bond, const void *slave_)
1599 {
1600 struct bond_slave *slave;
1601
1602 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1603 &bond->slaves) {
1604 if (slave->aux == slave_) {
1605 return slave;
1606 }
1607 }
1608
1609 return NULL;
1610 }
1611
1612 static void
1613 bond_enable_slave(struct bond_slave *slave, bool enable)
1614 {
1615 slave->delay_expires = LLONG_MAX;
1616 if (enable != slave->enabled) {
1617 slave->bond->bond_revalidate = true;
1618 slave->enabled = enable;
1619
1620 ovs_mutex_lock(&slave->bond->mutex);
1621 if (enable) {
1622 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1623 } else {
1624 list_remove(&slave->list_node);
1625 }
1626 ovs_mutex_unlock(&slave->bond->mutex);
1627
1628 VLOG_INFO("interface %s: %s", slave->name,
1629 slave->enabled ? "enabled" : "disabled");
1630 }
1631 }
1632
1633 static void
1634 bond_link_status_update(struct bond_slave *slave)
1635 {
1636 struct bond *bond = slave->bond;
1637 bool up;
1638
1639 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1640 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1641 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1642 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1643 slave->name, up ? "up" : "down");
1644 if (up == slave->enabled) {
1645 slave->delay_expires = LLONG_MAX;
1646 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1647 slave->name, up ? "disabled" : "enabled");
1648 } else {
1649 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1650 : up ? bond->updelay : bond->downdelay);
1651 slave->delay_expires = time_msec() + delay;
1652 if (delay) {
1653 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1654 "for %d ms",
1655 slave->name,
1656 up ? "enabled" : "disabled",
1657 up ? "up" : "down",
1658 delay);
1659 }
1660 }
1661 }
1662
1663 if (time_msec() >= slave->delay_expires) {
1664 bond_enable_slave(slave, up);
1665 }
1666 }
1667
1668 static unsigned int
1669 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1670 {
1671 return hash_mac(mac, vlan, basis);
1672 }
1673
1674 static unsigned int
1675 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1676 {
1677 struct flow hash_flow = *flow;
1678 hash_flow.vlan_tci = htons(vlan);
1679
1680 /* The symmetric quality of this hash function is not required, but
1681 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1682 * purposes, so we use it out of convenience. */
1683 return flow_hash_symmetric_l4(&hash_flow, basis);
1684 }
1685
1686 static unsigned int
1687 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1688 {
1689 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1690
1691 return (bond->balance == BM_TCP
1692 ? bond_hash_tcp(flow, vlan, bond->basis)
1693 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1694 }
1695
1696 static struct bond_entry *
1697 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1698 uint16_t vlan)
1699 {
1700 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1701 }
1702
1703 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1704 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1705 * returns NULL. */
1706 static struct bond_slave *
1707 get_enabled_slave(struct bond *bond)
1708 {
1709 struct list *node;
1710
1711 ovs_mutex_lock(&bond->mutex);
1712 if (list_is_empty(&bond->enabled_slaves)) {
1713 ovs_mutex_unlock(&bond->mutex);
1714 return NULL;
1715 }
1716
1717 node = list_pop_front(&bond->enabled_slaves);
1718 list_push_back(&bond->enabled_slaves, node);
1719 ovs_mutex_unlock(&bond->mutex);
1720
1721 return CONTAINER_OF(node, struct bond_slave, list_node);
1722 }
1723
1724 static struct bond_slave *
1725 choose_output_slave(const struct bond *bond, const struct flow *flow,
1726 struct flow_wildcards *wc, uint16_t vlan)
1727 {
1728 struct bond_entry *e;
1729 int balance;
1730
1731 balance = bond->balance;
1732 if (bond->lacp_status == LACP_CONFIGURED) {
1733 /* LACP has been configured on this bond but negotiations were
1734 * unsuccussful. If lacp_fallback_ab is enabled use active-
1735 * backup mode else drop all traffic. */
1736 if (!bond->lacp_fallback_ab) {
1737 return NULL;
1738 }
1739 balance = BM_AB;
1740 }
1741
1742 switch (balance) {
1743 case BM_AB:
1744 return bond->active_slave;
1745
1746 case BM_TCP:
1747 if (bond->lacp_status != LACP_NEGOTIATED) {
1748 /* Must have LACP negotiations for TCP balanced bonds. */
1749 return NULL;
1750 }
1751 if (wc) {
1752 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1753 }
1754 /* Fall Through. */
1755 case BM_SLB:
1756 if (wc) {
1757 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1758 }
1759 e = lookup_bond_entry(bond, flow, vlan);
1760 if (!e->slave || !e->slave->enabled) {
1761 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
1762 }
1763 return e->slave;
1764
1765 default:
1766 OVS_NOT_REACHED();
1767 }
1768 }
1769
1770 static struct bond_slave *
1771 bond_choose_slave(const struct bond *bond)
1772 {
1773 struct bond_slave *slave, *best;
1774
1775 /* Find an enabled slave. */
1776 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1777 if (slave->enabled) {
1778 return slave;
1779 }
1780 }
1781
1782 /* All interfaces are disabled. Find an interface that will be enabled
1783 * after its updelay expires. */
1784 best = NULL;
1785 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1786 if (slave->delay_expires != LLONG_MAX
1787 && slave->may_enable
1788 && (!best || slave->delay_expires < best->delay_expires)) {
1789 best = slave;
1790 }
1791 }
1792 return best;
1793 }
1794
1795 static void
1796 bond_choose_active_slave(struct bond *bond)
1797 {
1798 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1799 struct bond_slave *old_active_slave = bond->active_slave;
1800
1801 bond->active_slave = bond_choose_slave(bond);
1802 if (bond->active_slave) {
1803 if (bond->active_slave->enabled) {
1804 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1805 bond->name, bond->active_slave->name);
1806 } else {
1807 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1808 "remaining %lld ms updelay (since no interface was "
1809 "enabled)", bond->name, bond->active_slave->name,
1810 bond->active_slave->delay_expires - time_msec());
1811 bond_enable_slave(bond->active_slave, true);
1812 }
1813
1814 bond->send_learning_packets = true;
1815 } else if (old_active_slave) {
1816 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1817 }
1818 }
1819
1820 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1821 * bond interface. */
1822 static void
1823 bond_update_fake_slave_stats(struct bond *bond)
1824 {
1825 struct netdev_stats bond_stats;
1826 struct bond_slave *slave;
1827 struct netdev *bond_dev;
1828
1829 memset(&bond_stats, 0, sizeof bond_stats);
1830
1831 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1832 struct netdev_stats slave_stats;
1833
1834 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1835 /* XXX: We swap the stats here because they are swapped back when
1836 * reported by the internal device. The reason for this is
1837 * internal devices normally represent packets going into the
1838 * system but when used as fake bond device they represent packets
1839 * leaving the system. We really should do this in the internal
1840 * device itself because changing it here reverses the counts from
1841 * the perspective of the switch. However, the internal device
1842 * doesn't know what type of device it represents so we have to do
1843 * it here for now. */
1844 bond_stats.tx_packets += slave_stats.rx_packets;
1845 bond_stats.tx_bytes += slave_stats.rx_bytes;
1846 bond_stats.rx_packets += slave_stats.tx_packets;
1847 bond_stats.rx_bytes += slave_stats.tx_bytes;
1848 }
1849 }
1850
1851 if (!netdev_open(bond->name, "system", &bond_dev)) {
1852 netdev_set_stats(bond_dev, &bond_stats);
1853 netdev_close(bond_dev);
1854 }
1855 }