]> git.proxmox.com Git - mirror_ovs.git/blob - ofproto/bond.c
ofproto-bond: do not allow recirculation when we failed to allocate recirc_id
[mirror_ovs.git] / ofproto / bond.c
1 /*
2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "bond.h"
20
21 #include <limits.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <math.h>
25
26 #include "ofp-util.h"
27 #include "ofp-actions.h"
28 #include "ofpbuf.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
32 #include "coverage.h"
33 #include "dynamic-string.h"
34 #include "flow.h"
35 #include "hmap.h"
36 #include "lacp.h"
37 #include "list.h"
38 #include "netdev.h"
39 #include "odp-util.h"
40 #include "ofpbuf.h"
41 #include "packets.h"
42 #include "poll-loop.h"
43 #include "seq.h"
44 #include "match.h"
45 #include "shash.h"
46 #include "timeval.h"
47 #include "unixctl.h"
48 #include "vlog.h"
49
50 VLOG_DEFINE_THIS_MODULE(bond);
51
52 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
53 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
55
56 /* Bit-mask for hashing a flow down to a bucket. */
57 #define BOND_MASK 0xff
58 #define BOND_BUCKETS (BOND_MASK + 1)
59 #define RECIRC_RULE_PRIORITY 20 /* Priority level for internal rules */
60
61 /* A hash bucket for mapping a flow to a slave.
62 * "struct bond" has an array of BOND_BUCKETS of these. */
63 struct bond_entry {
64 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
65 uint64_t tx_bytes /* Count of bytes recently transmitted. */
66 OVS_GUARDED_BY(rwlock);
67 struct list list_node; /* In bond_slave's 'entries' list. */
68
69 /* Recirculation.
70 *
71 * 'pr_rule' is the post-recirculation rule for this entry.
72 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
73 * is used to determine delta (applied to 'tx_bytes' above.) */
74 struct rule *pr_rule;
75 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
76 };
77
78 /* A bond slave, that is, one of the links comprising a bond. */
79 struct bond_slave {
80 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
81 struct list list_node; /* In struct bond's enabled_slaves list. */
82 struct bond *bond; /* The bond that contains this slave. */
83 void *aux; /* Client-provided handle for this slave. */
84
85 struct netdev *netdev; /* Network device, owned by the client. */
86 unsigned int change_seq; /* Tracks changes in 'netdev'. */
87 ofp_port_t ofp_port; /* Open flow port number */
88 char *name; /* Name (a copy of netdev_get_name(netdev)). */
89
90 /* Link status. */
91 long long delay_expires; /* Time after which 'enabled' may change. */
92 bool enabled; /* May be chosen for flows? */
93 bool may_enable; /* Client considers this slave bondable. */
94
95 /* Rebalancing info. Used only by bond_rebalance(). */
96 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
97 struct list entries; /* 'struct bond_entry's assigned here. */
98 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
99 };
100
101 /* A bond, that is, a set of network devices grouped to improve performance or
102 * robustness. */
103 struct bond {
104 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
105 char *name; /* Name provided by client. */
106 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
107
108 /* Slaves. */
109 struct hmap slaves;
110
111 /* Enabled slaves.
112 *
113 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
114 * (To prevent the bond_slave from disappearing they must also hold
115 * 'rwlock'.) */
116 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
117 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
118
119 /* Bonding info. */
120 enum bond_mode balance; /* Balancing mode, one of BM_*. */
121 struct bond_slave *active_slave;
122 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
123 enum lacp_status lacp_status; /* Status of LACP negotiations. */
124 bool bond_revalidate; /* True if flows need revalidation. */
125 uint32_t basis; /* Basis for flow hash function. */
126
127 /* SLB specific bonding info. */
128 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
129 int rebalance_interval; /* Interval between rebalances, in ms. */
130 long long int next_rebalance; /* Next rebalancing time. */
131 bool send_learning_packets;
132 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
133 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
134
135 /* Legacy compatibility. */
136 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
137 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
138
139 struct ovs_refcount ref_cnt;
140 };
141
/* The action to take for a 'struct bond_pr_rule_op'. */
enum bond_op {
    ADD,    /* Install the rule in ofproto's flow table. */
    DEL,    /* Remove the rule from ofproto's flow table. */
};
147
148 /* A rule to add to or delete from ofproto's internal flow table. */
149 struct bond_pr_rule_op {
150 struct hmap_node hmap_node;
151 struct match match;
152 ofp_port_t out_ofport;
153 enum bond_op op;
154 struct rule **pr_rule;
155 };
156
157 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
158 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
159 OVS_REQ_RDLOCK(rwlock);
160 static void bond_enable_slave(struct bond_slave *, bool enable)
161 OVS_REQ_WRLOCK(rwlock);
162 static void bond_link_status_update(struct bond_slave *)
163 OVS_REQ_WRLOCK(rwlock);
164 static void bond_choose_active_slave(struct bond *)
165 OVS_REQ_WRLOCK(rwlock);
166 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
167 uint16_t vlan, uint32_t basis);
168 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
169 uint32_t basis);
170 static struct bond_entry *lookup_bond_entry(const struct bond *,
171 const struct flow *,
172 uint16_t vlan)
173 OVS_REQ_RDLOCK(rwlock);
174 static struct bond_slave *get_enabled_slave(struct bond *)
175 OVS_REQ_RDLOCK(rwlock);
176 static struct bond_slave *choose_output_slave(const struct bond *,
177 const struct flow *,
178 struct flow_wildcards *,
179 uint16_t vlan)
180 OVS_REQ_RDLOCK(rwlock);
181 static void bond_update_fake_slave_stats(struct bond *)
182 OVS_REQ_RDLOCK(rwlock);
183
184 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
185 * stores the mode in '*balance' and returns true. Otherwise returns false
186 * without modifying '*balance'. */
187 bool
188 bond_mode_from_string(enum bond_mode *balance, const char *s)
189 {
190 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
191 *balance = BM_TCP;
192 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
193 *balance = BM_SLB;
194 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
195 *balance = BM_AB;
196 } else {
197 return false;
198 }
199 return true;
200 }
201
202 /* Returns a string representing 'balance'. */
203 const char *
204 bond_mode_to_string(enum bond_mode balance) {
205 switch (balance) {
206 case BM_TCP:
207 return "balance-tcp";
208 case BM_SLB:
209 return "balance-slb";
210 case BM_AB:
211 return "active-backup";
212 }
213 OVS_NOT_REACHED();
214 }
215
216 \f
217 /* Creates and returns a new bond whose configuration is initially taken from
218 * 's'.
219 *
220 * The caller should register each slave on the new bond by calling
221 * bond_slave_register(). */
222 struct bond *
223 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
224 {
225 struct bond *bond;
226
227 bond = xzalloc(sizeof *bond);
228 bond->ofproto = ofproto;
229 hmap_init(&bond->slaves);
230 list_init(&bond->enabled_slaves);
231 ovs_mutex_init(&bond->mutex);
232 bond->next_fake_iface_update = LLONG_MAX;
233 ovs_refcount_init(&bond->ref_cnt);
234
235 bond->recirc_id = 0;
236 hmap_init(&bond->pr_rule_ops);
237
238 bond_reconfigure(bond, s);
239 return bond;
240 }
241
242 struct bond *
243 bond_ref(const struct bond *bond_)
244 {
245 struct bond *bond = CONST_CAST(struct bond *, bond_);
246
247 if (bond) {
248 ovs_refcount_ref(&bond->ref_cnt);
249 }
250 return bond;
251 }
252
253 /* Frees 'bond'. */
254 void
255 bond_unref(struct bond *bond)
256 {
257 struct bond_slave *slave, *next_slave;
258 struct bond_pr_rule_op *pr_op, *next_op;
259
260 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
261 return;
262 }
263
264 ovs_rwlock_wrlock(&rwlock);
265 hmap_remove(all_bonds, &bond->hmap_node);
266 ovs_rwlock_unlock(&rwlock);
267
268 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
269 hmap_remove(&bond->slaves, &slave->hmap_node);
270 /* Client owns 'slave->netdev'. */
271 free(slave->name);
272 free(slave);
273 }
274 hmap_destroy(&bond->slaves);
275
276 ovs_mutex_destroy(&bond->mutex);
277 free(bond->hash);
278 free(bond->name);
279
280 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
281 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
282 free(pr_op);
283 }
284 hmap_destroy(&bond->pr_rule_ops);
285
286 if (bond->recirc_id) {
287 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
288 }
289
290 free(bond);
291 }
292
293 static void
294 add_pr_rule(struct bond *bond, const struct match *match,
295 ofp_port_t out_ofport, struct rule **rule)
296 {
297 uint32_t hash = match_hash(match, 0);
298 struct bond_pr_rule_op *pr_op;
299
300 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
301 if (match_equal(&pr_op->match, match)) {
302 pr_op->op = ADD;
303 pr_op->out_ofport = out_ofport;
304 pr_op->pr_rule = rule;
305 return;
306 }
307 }
308
309 pr_op = xmalloc(sizeof *pr_op);
310 pr_op->match = *match;
311 pr_op->op = ADD;
312 pr_op->out_ofport = out_ofport;
313 pr_op->pr_rule = rule;
314 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
315 }
316
317 static void
318 update_recirc_rules(struct bond *bond)
319 {
320 struct match match;
321 struct bond_pr_rule_op *pr_op, *next_op;
322 uint64_t ofpacts_stub[128 / 8];
323 struct ofpbuf ofpacts;
324 int i;
325
326 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
327
328 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
329 pr_op->op = DEL;
330 }
331
332 if (bond->hash && bond->recirc_id) {
333 for (i = 0; i < BOND_BUCKETS; i++) {
334 struct bond_slave *slave = bond->hash[i].slave;
335
336 if (slave) {
337 match_init_catchall(&match);
338 match_set_recirc_id(&match, bond->recirc_id);
339 match_set_dp_hash_masked(&match, i, BOND_MASK);
340
341 add_pr_rule(bond, &match, slave->ofp_port,
342 &bond->hash[i].pr_rule);
343 }
344 }
345 }
346
347 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
348 int error;
349 switch (pr_op->op) {
350 case ADD:
351 ofpbuf_clear(&ofpacts);
352 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
353 error = ofproto_dpif_add_internal_flow(bond->ofproto,
354 &pr_op->match,
355 RECIRC_RULE_PRIORITY,
356 &ofpacts, pr_op->pr_rule);
357 if (error) {
358 char *err_s = match_to_string(&pr_op->match,
359 RECIRC_RULE_PRIORITY);
360
361 VLOG_ERR("failed to add post recirculation flow %s", err_s);
362 free(err_s);
363 }
364 break;
365
366 case DEL:
367 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
368 &pr_op->match,
369 RECIRC_RULE_PRIORITY);
370 if (error) {
371 char *err_s = match_to_string(&pr_op->match,
372 RECIRC_RULE_PRIORITY);
373
374 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
375 free(err_s);
376 }
377
378 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
379 *pr_op->pr_rule = NULL;
380 free(pr_op);
381 break;
382 }
383 }
384
385 ofpbuf_uninit(&ofpacts);
386 }
387
388
389 /* Updates 'bond''s overall configuration to 's'.
390 *
391 * The caller should register each slave on 'bond' by calling
392 * bond_slave_register(). This is optional if none of the slaves'
393 * configuration has changed. In any case it can't hurt.
394 *
395 * Returns true if the configuration has changed in such a way that requires
396 * flow revalidation.
397 * */
398 bool
399 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
400 {
401 bool revalidate = false;
402
403 ovs_rwlock_wrlock(&rwlock);
404 if (!bond->name || strcmp(bond->name, s->name)) {
405 if (bond->name) {
406 hmap_remove(all_bonds, &bond->hmap_node);
407 free(bond->name);
408 }
409 bond->name = xstrdup(s->name);
410 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
411 }
412
413 bond->updelay = s->up_delay;
414 bond->downdelay = s->down_delay;
415
416 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
417 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
418 revalidate = true;
419 }
420
421 if (bond->rebalance_interval != s->rebalance_interval) {
422 bond->rebalance_interval = s->rebalance_interval;
423 revalidate = true;
424 }
425
426 if (bond->balance != s->balance) {
427 bond->balance = s->balance;
428 revalidate = true;
429 }
430
431 if (bond->basis != s->basis) {
432 bond->basis = s->basis;
433 revalidate = true;
434 }
435
436 if (s->fake_iface) {
437 if (bond->next_fake_iface_update == LLONG_MAX) {
438 bond->next_fake_iface_update = time_msec();
439 }
440 } else {
441 bond->next_fake_iface_update = LLONG_MAX;
442 }
443
444 if (bond->bond_revalidate) {
445 revalidate = true;
446 bond->bond_revalidate = false;
447 }
448
449 if (bond->balance != BM_AB) {
450 if (!bond->recirc_id) {
451 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
452 }
453 } else if (bond->recirc_id) {
454 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
455 bond->recirc_id = 0;
456 }
457
458 if (bond->balance == BM_AB || !bond->hash || revalidate) {
459 bond_entry_reset(bond);
460 }
461
462 ovs_rwlock_unlock(&rwlock);
463 return revalidate;
464 }
465
466 static void
467 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
468 OVS_REQ_WRLOCK(rwlock)
469 {
470 if (slave->netdev != netdev) {
471 slave->netdev = netdev;
472 slave->change_seq = 0;
473 }
474 }
475
476 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
477 * arbitrary client-provided pointer that uniquely identifies a slave within a
478 * bond. If 'slave_' already exists within 'bond' then this function
479 * reconfigures the existing slave.
480 *
481 * 'netdev' must be the network device that 'slave_' represents. It is owned
482 * by the client, so the client must not close it before either unregistering
483 * 'slave_' or destroying 'bond'.
484 */
485 void
486 bond_slave_register(struct bond *bond, void *slave_,
487 ofp_port_t ofport, struct netdev *netdev)
488 {
489 struct bond_slave *slave;
490
491 ovs_rwlock_wrlock(&rwlock);
492 slave = bond_slave_lookup(bond, slave_);
493 if (!slave) {
494 slave = xzalloc(sizeof *slave);
495
496 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
497 slave->bond = bond;
498 slave->aux = slave_;
499 slave->ofp_port = ofport;
500 slave->delay_expires = LLONG_MAX;
501 slave->name = xstrdup(netdev_get_name(netdev));
502 bond->bond_revalidate = true;
503
504 slave->enabled = false;
505 bond_enable_slave(slave, netdev_get_carrier(netdev));
506 }
507
508 bond_slave_set_netdev__(slave, netdev);
509
510 free(slave->name);
511 slave->name = xstrdup(netdev_get_name(netdev));
512 ovs_rwlock_unlock(&rwlock);
513 }
514
515 /* Updates the network device to be used with 'slave_' to 'netdev'.
516 *
517 * This is useful if the caller closes and re-opens the network device
518 * registered with bond_slave_register() but doesn't need to change anything
519 * else. */
520 void
521 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
522 {
523 struct bond_slave *slave;
524
525 ovs_rwlock_wrlock(&rwlock);
526 slave = bond_slave_lookup(bond, slave_);
527 if (slave) {
528 bond_slave_set_netdev__(slave, netdev);
529 }
530 ovs_rwlock_unlock(&rwlock);
531 }
532
533 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
534 * then this function has no effect.
535 *
536 * Unregistering a slave invalidates all flows. */
537 void
538 bond_slave_unregister(struct bond *bond, const void *slave_)
539 {
540 struct bond_slave *slave;
541 bool del_active;
542
543 ovs_rwlock_wrlock(&rwlock);
544 slave = bond_slave_lookup(bond, slave_);
545 if (!slave) {
546 goto out;
547 }
548
549 bond->bond_revalidate = true;
550 bond_enable_slave(slave, false);
551
552 del_active = bond->active_slave == slave;
553 if (bond->hash) {
554 struct bond_entry *e;
555 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
556 if (e->slave == slave) {
557 e->slave = NULL;
558 }
559 }
560 }
561
562 free(slave->name);
563
564 hmap_remove(&bond->slaves, &slave->hmap_node);
565 /* Client owns 'slave->netdev'. */
566 free(slave);
567
568 if (del_active) {
569 bond_choose_active_slave(bond);
570 bond->send_learning_packets = true;
571 }
572 out:
573 ovs_rwlock_unlock(&rwlock);
574 }
575
576 /* Should be called on each slave in 'bond' before bond_run() to indicate
577 * whether or not 'slave_' may be enabled. This function is intended to allow
578 * other protocols to have some impact on bonding decisions. For example LACP
579 * or high level link monitoring protocols may decide that a given slave should
580 * not be able to send traffic. */
581 void
582 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
583 {
584 ovs_rwlock_wrlock(&rwlock);
585 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
586 ovs_rwlock_unlock(&rwlock);
587 }
588
589 /* Performs periodic maintenance on 'bond'.
590 *
591 * Returns true if the caller should revalidate its flows.
592 *
593 * The caller should check bond_should_send_learning_packets() afterward. */
594 bool
595 bond_run(struct bond *bond, enum lacp_status lacp_status)
596 {
597 struct bond_slave *slave;
598 bool revalidate;
599
600 ovs_rwlock_wrlock(&rwlock);
601 if (bond->lacp_status != lacp_status) {
602 bond->lacp_status = lacp_status;
603 bond->bond_revalidate = true;
604 }
605
606 /* Enable slaves based on link status and LACP feedback. */
607 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
608 bond_link_status_update(slave);
609 slave->change_seq = seq_read(connectivity_seq_get());
610 }
611 if (!bond->active_slave || !bond->active_slave->enabled) {
612 bond_choose_active_slave(bond);
613 }
614
615 /* Update fake bond interface stats. */
616 if (time_msec() >= bond->next_fake_iface_update) {
617 bond_update_fake_slave_stats(bond);
618 bond->next_fake_iface_update = time_msec() + 1000;
619 }
620
621 revalidate = bond->bond_revalidate;
622 bond->bond_revalidate = false;
623 ovs_rwlock_unlock(&rwlock);
624
625 return revalidate;
626 }
627
628 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
629 void
630 bond_wait(struct bond *bond)
631 {
632 struct bond_slave *slave;
633
634 ovs_rwlock_rdlock(&rwlock);
635 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
636 if (slave->delay_expires != LLONG_MAX) {
637 poll_timer_wait_until(slave->delay_expires);
638 }
639
640 seq_wait(connectivity_seq_get(), slave->change_seq);
641 }
642
643 if (bond->next_fake_iface_update != LLONG_MAX) {
644 poll_timer_wait_until(bond->next_fake_iface_update);
645 }
646
647 if (bond->bond_revalidate) {
648 poll_immediate_wake();
649 }
650 ovs_rwlock_unlock(&rwlock);
651
652 /* We don't wait for bond->next_rebalance because rebalancing can only run
653 * at a flow account checkpoint. ofproto does checkpointing on its own
654 * schedule and bond_rebalance() gets called afterward, so we'd just be
655 * waking up for no purpose. */
656 }
657 \f
658 /* MAC learning table interaction. */
659
660 static bool
661 may_send_learning_packets(const struct bond *bond)
662 {
663 return ((bond->lacp_status == LACP_DISABLED
664 && (bond->balance == BM_SLB || bond->balance == BM_AB))
665 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
666 && bond->active_slave;
667 }
668
669 /* Returns true if 'bond' needs the client to send out packets to assist with
670 * MAC learning on 'bond'. If this function returns true, then the client
671 * should iterate through its MAC learning table for the bridge on which 'bond'
672 * is located. For each MAC that has been learned on a port other than 'bond',
673 * it should call bond_compose_learning_packet().
674 *
675 * This function will only return true if 'bond' is in SLB or active-backup
676 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
677 * necessary.
678 *
679 * Calling this function resets the state that it checks. */
680 bool
681 bond_should_send_learning_packets(struct bond *bond)
682 {
683 bool send;
684
685 ovs_rwlock_wrlock(&rwlock);
686 send = bond->send_learning_packets && may_send_learning_packets(bond);
687 bond->send_learning_packets = false;
688 ovs_rwlock_unlock(&rwlock);
689 return send;
690 }
691
692 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
693 *
694 * See bond_should_send_learning_packets() for description of usage. The
695 * caller should send the composed packet on the port associated with
696 * port_aux and takes ownership of the returned ofpbuf. */
697 struct ofpbuf *
698 bond_compose_learning_packet(struct bond *bond,
699 const uint8_t eth_src[ETH_ADDR_LEN],
700 uint16_t vlan, void **port_aux)
701 {
702 struct bond_slave *slave;
703 struct ofpbuf *packet;
704 struct flow flow;
705
706 ovs_rwlock_rdlock(&rwlock);
707 ovs_assert(may_send_learning_packets(bond));
708 memset(&flow, 0, sizeof flow);
709 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
710 slave = choose_output_slave(bond, &flow, NULL, vlan);
711
712 packet = ofpbuf_new(0);
713 compose_rarp(packet, eth_src);
714 if (vlan) {
715 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
716 }
717
718 *port_aux = slave->aux;
719 ovs_rwlock_unlock(&rwlock);
720 return packet;
721 }
722 \f
723 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
724 * Ethernet destination address of 'eth_dst', should be admitted.
725 *
726 * The return value is one of the following:
727 *
728 * - BV_ACCEPT: Admit the packet.
729 *
730 * - BV_DROP: Drop the packet.
731 *
732 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
733 * Ethernet source address and VLAN. If there is none, or if the packet
734 * is on the learned port, then admit the packet. If a different port has
735 * been learned, however, drop the packet (and do not use it for MAC
736 * learning).
737 */
738 enum bond_verdict
739 bond_check_admissibility(struct bond *bond, const void *slave_,
740 const uint8_t eth_dst[ETH_ADDR_LEN])
741 {
742 enum bond_verdict verdict = BV_DROP;
743 struct bond_slave *slave;
744
745 ovs_rwlock_rdlock(&rwlock);
746 slave = bond_slave_lookup(bond, slave_);
747 if (!slave) {
748 goto out;
749 }
750
751 /* LACP bonds have very loose admissibility restrictions because we can
752 * assume the remote switch is aware of the bond and will "do the right
753 * thing". However, as a precaution we drop packets on disabled slaves
754 * because no correctly implemented partner switch should be sending
755 * packets to them.
756 *
757 * If LACP is configured, but LACP negotiations have been unsuccessful, we
758 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
759 switch (bond->lacp_status) {
760 case LACP_NEGOTIATED:
761 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
762 goto out;
763 case LACP_CONFIGURED:
764 if (!bond->lacp_fallback_ab) {
765 goto out;
766 }
767 case LACP_DISABLED:
768 break;
769 }
770
771 /* Drop all multicast packets on inactive slaves. */
772 if (eth_addr_is_multicast(eth_dst)) {
773 if (bond->active_slave != slave) {
774 goto out;
775 }
776 }
777
778 switch (bond->balance) {
779 case BM_TCP:
780 /* TCP balanced bonds require successful LACP negotiations. Based on the
781 * above check, LACP is off or lacp_fallback_ab is true on this bond.
782 * If lacp_fallback_ab is true fall through to BM_AB case else, we
783 * drop all incoming traffic. */
784 if (!bond->lacp_fallback_ab) {
785 goto out;
786 }
787
788 case BM_AB:
789 /* Drop all packets which arrive on backup slaves. This is similar to
790 * how Linux bonding handles active-backup bonds. */
791 if (bond->active_slave != slave) {
792 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
793
794 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
795 " slave (%s) destined for " ETH_ADDR_FMT,
796 slave->name, ETH_ADDR_ARGS(eth_dst));
797 goto out;
798 }
799 verdict = BV_ACCEPT;
800 goto out;
801
802 case BM_SLB:
803 /* Drop all packets for which we have learned a different input port,
804 * because we probably sent the packet on one slave and got it back on
805 * the other. Gratuitous ARP packets are an exception to this rule:
806 * the host has moved to another switch. The exception to the
807 * exception is if we locked the learning table to avoid reflections on
808 * bond slaves. */
809 verdict = BV_DROP_IF_MOVED;
810 goto out;
811 }
812
813 OVS_NOT_REACHED();
814 out:
815 ovs_rwlock_unlock(&rwlock);
816 return verdict;
817
818 }
819
820 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
821 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
822 * NULL if the packet should be dropped because no slaves are enabled.
823 *
824 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
825 * should be a VID only (i.e. excluding the PCP bits). Second,
826 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
827 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
828 * packet belongs to (so for an access port it will be the access port's VLAN).
829 *
830 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
831 * significant in the selection. At some point earlier, 'wc' should
832 * have been initialized (e.g., by flow_wildcards_init_catchall()).
833 */
834 void *
835 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
836 struct flow_wildcards *wc, uint16_t vlan)
837 {
838 struct bond_slave *slave;
839 void *aux;
840
841 ovs_rwlock_rdlock(&rwlock);
842 slave = choose_output_slave(bond, flow, wc, vlan);
843 aux = slave ? slave->aux : NULL;
844 ovs_rwlock_unlock(&rwlock);
845
846 return aux;
847 }
848 \f
849 /* Recirculation. */
850 static void
851 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
852 OVS_REQ_WRLOCK(rwlock)
853 {
854 if (entry->slave) {
855 uint64_t delta;
856
857 delta = rule_tx_bytes - entry->pr_tx_bytes;
858 entry->tx_bytes += delta;
859 entry->pr_tx_bytes = rule_tx_bytes;
860 }
861 }
862
863 /* Maintain bond stats using post recirculation rule byte counters.*/
864 void
865 bond_recirculation_account(struct bond *bond)
866 {
867 int i;
868
869 ovs_rwlock_wrlock(&rwlock);
870 for (i=0; i<=BOND_MASK; i++) {
871 struct bond_entry *entry = &bond->hash[i];
872 struct rule *rule = entry->pr_rule;
873
874 if (rule) {
875 uint64_t n_packets OVS_UNUSED;
876 long long int used OVS_UNUSED;
877 uint64_t n_bytes;
878
879 rule->ofproto->ofproto_class->rule_get_stats(
880 rule, &n_packets, &n_bytes, &used);
881 bond_entry_account(entry, n_bytes);
882 }
883 }
884 ovs_rwlock_unlock(&rwlock);
885 }
886
887 bool
888 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
889 uint32_t *hash_bias)
890 {
891 if (bond->balance == BM_TCP && recirc_id) {
892 if (recirc_id) {
893 *recirc_id = bond->recirc_id;
894 }
895 if (hash_bias) {
896 *hash_bias = bond->basis;
897 }
898 return true;
899 } else {
900 return false;
901 }
902 }
903
904 void
905 bond_update_post_recirc_rules(struct bond* bond, const bool force)
906 {
907 struct bond_entry *e;
908 bool update_rules = force; /* Always update rules if caller forces it. */
909
910 /* Make sure all bond entries are populated */
911 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
912 if (!e->slave || !e->slave->enabled) {
913 update_rules = true;
914 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
915 struct bond_slave, hmap_node);
916 if (!e->slave->enabled) {
917 e->slave = bond->active_slave;
918 }
919 }
920 }
921
922 if (update_rules) {
923 update_recirc_rules(bond);
924 }
925 }
926 \f
927 /* Rebalancing. */
928
929 static bool
930 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
931 {
932 return bond->rebalance_interval
933 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
934 }
935
936 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
937 void
938 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
939 uint64_t n_bytes)
940 {
941 ovs_rwlock_wrlock(&rwlock);
942 if (bond_is_balanced(bond)) {
943 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
944 }
945 ovs_rwlock_unlock(&rwlock);
946 }
947
948 static struct bond_slave *
949 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
950 {
951 return CONTAINER_OF(bal, struct bond_slave, bal_node);
952 }
953
954 static void
955 log_bals(struct bond *bond, const struct list *bals)
956 OVS_REQ_RDLOCK(rwlock)
957 {
958 if (VLOG_IS_DBG_ENABLED()) {
959 struct ds ds = DS_EMPTY_INITIALIZER;
960 const struct bond_slave *slave;
961
962 LIST_FOR_EACH (slave, bal_node, bals) {
963 if (ds.length) {
964 ds_put_char(&ds, ',');
965 }
966 ds_put_format(&ds, " %s %"PRIu64"kB",
967 slave->name, slave->tx_bytes / 1024);
968
969 if (!slave->enabled) {
970 ds_put_cstr(&ds, " (disabled)");
971 }
972 if (!list_is_empty(&slave->entries)) {
973 struct bond_entry *e;
974
975 ds_put_cstr(&ds, " (");
976 LIST_FOR_EACH (e, list_node, &slave->entries) {
977 if (&e->list_node != list_front(&slave->entries)) {
978 ds_put_cstr(&ds, " + ");
979 }
980 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
981 e - bond->hash, e->tx_bytes / 1024);
982 }
983 ds_put_cstr(&ds, ")");
984 }
985 }
986 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
987 ds_destroy(&ds);
988 }
989 }
990
991 /* Shifts 'hash' from its current slave to 'to'. */
992 static void
993 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
994 OVS_REQ_WRLOCK(rwlock)
995 {
996 struct bond_slave *from = hash->slave;
997 struct bond *bond = from->bond;
998 uint64_t delta = hash->tx_bytes;
999
1000 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1001 "from %s to %s (now carrying %"PRIu64"kB and "
1002 "%"PRIu64"kB load, respectively)",
1003 bond->name, delta / 1024, hash - bond->hash,
1004 from->name, to->name,
1005 (from->tx_bytes - delta) / 1024,
1006 (to->tx_bytes + delta) / 1024);
1007
1008 /* Shift load away from 'from' to 'to'. */
1009 from->tx_bytes -= delta;
1010 to->tx_bytes += delta;
1011
1012 /* Arrange for flows to be revalidated. */
1013 hash->slave = to;
1014 bond->bond_revalidate = true;
1015 }
1016
1017 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1018 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1019 * given that doing so must decrease the ratio of the load on the two slaves by
1020 * at least 0.1. Returns NULL if there is no appropriate entry.
1021 *
1022 * The list of entries isn't sorted. I don't know of a reason to prefer to
1023 * shift away small hashes or large hashes. */
1024 static struct bond_entry *
1025 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1026 OVS_REQ_WRLOCK(rwlock)
1027 {
1028 struct bond_entry *e;
1029
1030 if (list_is_short(&from->entries)) {
1031 /* 'from' carries no more than one MAC hash, so shifting load away from
1032 * it would be pointless. */
1033 return NULL;
1034 }
1035
1036 LIST_FOR_EACH (e, list_node, &from->entries) {
1037 double old_ratio, new_ratio;
1038 uint64_t delta;
1039
1040 if (to_tx_bytes == 0) {
1041 /* Nothing on the new slave, move it. */
1042 return e;
1043 }
1044
1045 delta = e->tx_bytes;
1046 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1047 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1048 if (old_ratio - new_ratio > 0.1
1049 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1050 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1051 and 'to' slave have the same load. Therefore, we only move an
1052 entry if it decreases the load on 'from', and brings us closer
1053 to equal traffic load. */
1054 return e;
1055 }
1056 }
1057
1058 return NULL;
1059 }
1060
1061 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1062 * maintained. */
1063 static void
1064 insert_bal(struct list *bals, struct bond_slave *slave)
1065 {
1066 struct bond_slave *pos;
1067
1068 LIST_FOR_EACH (pos, bal_node, bals) {
1069 if (slave->tx_bytes > pos->tx_bytes) {
1070 break;
1071 }
1072 }
1073 list_insert(&pos->bal_node, &slave->bal_node);
1074 }
1075
1076 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1077 * that descending order of 'tx_bytes' is maintained. */
1078 static void
1079 reinsert_bal(struct list *bals, struct bond_slave *slave)
1080 {
1081 list_remove(&slave->bal_node);
1082 insert_bal(bals, slave);
1083 }
1084
1085 /* If 'bond' needs rebalancing, does so.
1086 *
1087 * The caller should have called bond_account() for each active flow, or in case
1088 * of recirculation is used, have called bond_recirculation_account(bond),
1089 * to ensure that flow data is consistently accounted at this point.
1090 *
1091 * Return whether rebalancing took place.*/
1092 bool
1093 bond_rebalance(struct bond *bond)
1094 {
1095 struct bond_slave *slave;
1096 struct bond_entry *e;
1097 struct list bals;
1098 bool rebalanced = false;
1099
1100 ovs_rwlock_wrlock(&rwlock);
1101 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1102 goto done;
1103 }
1104 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1105
1106 /* Add each bond_entry to its slave's 'entries' list.
1107 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1108 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1109 slave->tx_bytes = 0;
1110 list_init(&slave->entries);
1111 }
1112 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1113 if (e->slave && e->tx_bytes) {
1114 e->slave->tx_bytes += e->tx_bytes;
1115 list_push_back(&e->slave->entries, &e->list_node);
1116 }
1117 }
1118
1119 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1120 *
1121 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1122 * with a proper list sort algorithm. */
1123 list_init(&bals);
1124 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1125 if (slave->enabled) {
1126 insert_bal(&bals, slave);
1127 }
1128 }
1129 log_bals(bond, &bals);
1130
1131 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1132 while (!list_is_short(&bals)) {
1133 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1134 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1135 uint64_t overload;
1136
1137 overload = from->tx_bytes - to->tx_bytes;
1138 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1139 /* The extra load on 'from' (and all less-loaded slaves), compared
1140 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1141 * it is less than ~1Mbps. No point in rebalancing. */
1142 break;
1143 }
1144
1145 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1146 * to move from 'from' to 'to'. */
1147 e = choose_entry_to_migrate(from, to->tx_bytes);
1148 if (e) {
1149 bond_shift_load(e, to);
1150
1151 /* Delete element from from->entries.
1152 *
1153 * We don't add the element to to->hashes. That would only allow
1154 * 'e' to be migrated to another slave in this rebalancing run, and
1155 * there is no point in doing that. */
1156 list_remove(&e->list_node);
1157
1158 /* Re-sort 'bals'. */
1159 reinsert_bal(&bals, from);
1160 reinsert_bal(&bals, to);
1161 rebalanced = true;
1162 } else {
1163 /* Can't usefully migrate anything away from 'from'.
1164 * Don't reconsider it. */
1165 list_remove(&from->bal_node);
1166 }
1167 }
1168
1169 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1170 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1171 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1172 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1173 e->tx_bytes /= 2;
1174 }
1175
1176 done:
1177 ovs_rwlock_unlock(&rwlock);
1178 return rebalanced;
1179 }
1180 \f
1181 /* Bonding unixctl user interface functions. */
1182
1183 static struct bond *
1184 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
1185 {
1186 struct bond *bond;
1187
1188 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1189 all_bonds) {
1190 if (!strcmp(bond->name, name)) {
1191 return bond;
1192 }
1193 }
1194 return NULL;
1195 }
1196
1197 static struct bond_slave *
1198 bond_lookup_slave(struct bond *bond, const char *slave_name)
1199 {
1200 struct bond_slave *slave;
1201
1202 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1203 if (!strcmp(slave->name, slave_name)) {
1204 return slave;
1205 }
1206 }
1207 return NULL;
1208 }
1209
1210 static void
1211 bond_unixctl_list(struct unixctl_conn *conn,
1212 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1213 void *aux OVS_UNUSED)
1214 {
1215 struct ds ds = DS_EMPTY_INITIALIZER;
1216 const struct bond *bond;
1217
1218 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
1219
1220 ovs_rwlock_rdlock(&rwlock);
1221 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1222 const struct bond_slave *slave;
1223 size_t i;
1224
1225 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1226 bond_mode_to_string(bond->balance), bond->recirc_id);
1227
1228 i = 0;
1229 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1230 if (i++ > 0) {
1231 ds_put_cstr(&ds, ", ");
1232 }
1233 ds_put_cstr(&ds, slave->name);
1234 }
1235 ds_put_char(&ds, '\n');
1236 }
1237 ovs_rwlock_unlock(&rwlock);
1238 unixctl_command_reply(conn, ds_cstr(&ds));
1239 ds_destroy(&ds);
1240 }
1241
1242 static void
1243 bond_print_details(struct ds *ds, const struct bond *bond)
1244 OVS_REQ_RDLOCK(rwlock)
1245 {
1246 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1247 const struct shash_node **sorted_slaves = NULL;
1248 const struct bond_slave *slave;
1249 bool may_recirc;
1250 uint32_t recirc_id;
1251 int i;
1252
1253 ds_put_format(ds, "---- %s ----\n", bond->name);
1254 ds_put_format(ds, "bond_mode: %s\n",
1255 bond_mode_to_string(bond->balance));
1256
1257 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1258 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1259 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1260
1261 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1262
1263 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1264 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1265
1266 if (bond_is_balanced(bond)) {
1267 ds_put_format(ds, "next rebalance: %lld ms\n",
1268 bond->next_rebalance - time_msec());
1269 }
1270
1271 ds_put_cstr(ds, "lacp_status: ");
1272 switch (bond->lacp_status) {
1273 case LACP_NEGOTIATED:
1274 ds_put_cstr(ds, "negotiated\n");
1275 break;
1276 case LACP_CONFIGURED:
1277 ds_put_cstr(ds, "configured\n");
1278 break;
1279 case LACP_DISABLED:
1280 ds_put_cstr(ds, "off\n");
1281 break;
1282 default:
1283 ds_put_cstr(ds, "<unknown>\n");
1284 break;
1285 }
1286
1287 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1288 shash_add(&slave_shash, slave->name, slave);
1289 }
1290 sorted_slaves = shash_sort(&slave_shash);
1291
1292 for (i = 0; i < shash_count(&slave_shash); i++) {
1293 struct bond_entry *be;
1294
1295 slave = sorted_slaves[i]->data;
1296
1297 /* Basic info. */
1298 ds_put_format(ds, "\nslave %s: %s\n",
1299 slave->name, slave->enabled ? "enabled" : "disabled");
1300 if (slave == bond->active_slave) {
1301 ds_put_cstr(ds, "\tactive slave\n");
1302 }
1303 if (slave->delay_expires != LLONG_MAX) {
1304 ds_put_format(ds, "\t%s expires in %lld ms\n",
1305 slave->enabled ? "downdelay" : "updelay",
1306 slave->delay_expires - time_msec());
1307 }
1308
1309 ds_put_format(ds, "\tmay_enable: %s\n",
1310 slave->may_enable ? "true" : "false");
1311
1312 if (!bond_is_balanced(bond)) {
1313 continue;
1314 }
1315
1316 /* Hashes. */
1317 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1318 int hash = be - bond->hash;
1319 uint64_t be_tx_k;
1320
1321 if (be->slave != slave) {
1322 continue;
1323 }
1324
1325 be_tx_k = be->tx_bytes / 1024;
1326 if (be_tx_k) {
1327 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1328 hash, be_tx_k);
1329 }
1330
1331 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1332 }
1333 }
1334 shash_destroy(&slave_shash);
1335 free(sorted_slaves);
1336 ds_put_cstr(ds, "\n");
1337 }
1338
1339 static void
1340 bond_unixctl_show(struct unixctl_conn *conn,
1341 int argc, const char *argv[],
1342 void *aux OVS_UNUSED)
1343 {
1344 struct ds ds = DS_EMPTY_INITIALIZER;
1345
1346 ovs_rwlock_rdlock(&rwlock);
1347 if (argc > 1) {
1348 const struct bond *bond = bond_find(argv[1]);
1349
1350 if (!bond) {
1351 unixctl_command_reply_error(conn, "no such bond");
1352 goto out;
1353 }
1354 bond_print_details(&ds, bond);
1355 } else {
1356 const struct bond *bond;
1357
1358 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1359 bond_print_details(&ds, bond);
1360 }
1361 }
1362
1363 unixctl_command_reply(conn, ds_cstr(&ds));
1364 ds_destroy(&ds);
1365
1366 out:
1367 ovs_rwlock_unlock(&rwlock);
1368 }
1369
1370 static void
1371 bond_unixctl_migrate(struct unixctl_conn *conn,
1372 int argc OVS_UNUSED, const char *argv[],
1373 void *aux OVS_UNUSED)
1374 {
1375 const char *bond_s = argv[1];
1376 const char *hash_s = argv[2];
1377 const char *slave_s = argv[3];
1378 struct bond *bond;
1379 struct bond_slave *slave;
1380 struct bond_entry *entry;
1381 int hash;
1382
1383 ovs_rwlock_wrlock(&rwlock);
1384 bond = bond_find(bond_s);
1385 if (!bond) {
1386 unixctl_command_reply_error(conn, "no such bond");
1387 goto out;
1388 }
1389
1390 if (bond->balance != BM_SLB) {
1391 unixctl_command_reply_error(conn, "not an SLB bond");
1392 goto out;
1393 }
1394
1395 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1396 hash = atoi(hash_s) & BOND_MASK;
1397 } else {
1398 unixctl_command_reply_error(conn, "bad hash");
1399 goto out;
1400 }
1401
1402 slave = bond_lookup_slave(bond, slave_s);
1403 if (!slave) {
1404 unixctl_command_reply_error(conn, "no such slave");
1405 goto out;
1406 }
1407
1408 if (!slave->enabled) {
1409 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1410 goto out;
1411 }
1412
1413 entry = &bond->hash[hash];
1414 bond->bond_revalidate = true;
1415 entry->slave = slave;
1416 unixctl_command_reply(conn, "migrated");
1417
1418 out:
1419 ovs_rwlock_unlock(&rwlock);
1420 }
1421
1422 static void
1423 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1424 int argc OVS_UNUSED, const char *argv[],
1425 void *aux OVS_UNUSED)
1426 {
1427 const char *bond_s = argv[1];
1428 const char *slave_s = argv[2];
1429 struct bond *bond;
1430 struct bond_slave *slave;
1431
1432 ovs_rwlock_wrlock(&rwlock);
1433 bond = bond_find(bond_s);
1434 if (!bond) {
1435 unixctl_command_reply_error(conn, "no such bond");
1436 goto out;
1437 }
1438
1439 slave = bond_lookup_slave(bond, slave_s);
1440 if (!slave) {
1441 unixctl_command_reply_error(conn, "no such slave");
1442 goto out;
1443 }
1444
1445 if (!slave->enabled) {
1446 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1447 goto out;
1448 }
1449
1450 if (bond->active_slave != slave) {
1451 bond->bond_revalidate = true;
1452 bond->active_slave = slave;
1453 VLOG_INFO("bond %s: active interface is now %s",
1454 bond->name, slave->name);
1455 bond->send_learning_packets = true;
1456 unixctl_command_reply(conn, "done");
1457 } else {
1458 unixctl_command_reply(conn, "no change");
1459 }
1460 out:
1461 ovs_rwlock_unlock(&rwlock);
1462 }
1463
1464 static void
1465 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1466 {
1467 const char *bond_s = argv[1];
1468 const char *slave_s = argv[2];
1469 struct bond *bond;
1470 struct bond_slave *slave;
1471
1472 ovs_rwlock_wrlock(&rwlock);
1473 bond = bond_find(bond_s);
1474 if (!bond) {
1475 unixctl_command_reply_error(conn, "no such bond");
1476 goto out;
1477 }
1478
1479 slave = bond_lookup_slave(bond, slave_s);
1480 if (!slave) {
1481 unixctl_command_reply_error(conn, "no such slave");
1482 goto out;
1483 }
1484
1485 bond_enable_slave(slave, enable);
1486 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1487
1488 out:
1489 ovs_rwlock_unlock(&rwlock);
1490 }
1491
1492 static void
1493 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1494 int argc OVS_UNUSED, const char *argv[],
1495 void *aux OVS_UNUSED)
1496 {
1497 enable_slave(conn, argv, true);
1498 }
1499
1500 static void
1501 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1502 int argc OVS_UNUSED, const char *argv[],
1503 void *aux OVS_UNUSED)
1504 {
1505 enable_slave(conn, argv, false);
1506 }
1507
1508 static void
1509 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1510 void *aux OVS_UNUSED)
1511 {
1512 const char *mac_s = argv[1];
1513 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1514 const char *basis_s = argc > 3 ? argv[3] : NULL;
1515 uint8_t mac[ETH_ADDR_LEN];
1516 uint8_t hash;
1517 char *hash_cstr;
1518 unsigned int vlan;
1519 uint32_t basis;
1520
1521 if (vlan_s) {
1522 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1523 unixctl_command_reply_error(conn, "invalid vlan");
1524 return;
1525 }
1526 } else {
1527 vlan = 0;
1528 }
1529
1530 if (basis_s) {
1531 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1532 unixctl_command_reply_error(conn, "invalid basis");
1533 return;
1534 }
1535 } else {
1536 basis = 0;
1537 }
1538
1539 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1540 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1541
1542 hash_cstr = xasprintf("%u", hash);
1543 unixctl_command_reply(conn, hash_cstr);
1544 free(hash_cstr);
1545 } else {
1546 unixctl_command_reply_error(conn, "invalid mac");
1547 }
1548 }
1549
1550 void
1551 bond_init(void)
1552 {
1553 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1554 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1555 NULL);
1556 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1557 bond_unixctl_migrate, NULL);
1558 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1559 bond_unixctl_set_active_slave, NULL);
1560 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1561 bond_unixctl_enable_slave, NULL);
1562 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1563 bond_unixctl_disable_slave, NULL);
1564 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1565 bond_unixctl_hash, NULL);
1566 }
1567 \f
1568 static void
1569 bond_entry_reset(struct bond *bond)
1570 {
1571 if (bond->balance != BM_AB) {
1572 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1573
1574 if (!bond->hash) {
1575 bond->hash = xmalloc(hash_len);
1576 }
1577 memset(bond->hash, 0, hash_len);
1578
1579 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1580 } else {
1581 free(bond->hash);
1582 bond->hash = NULL;
1583 }
1584 }
1585
1586 static struct bond_slave *
1587 bond_slave_lookup(struct bond *bond, const void *slave_)
1588 {
1589 struct bond_slave *slave;
1590
1591 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1592 &bond->slaves) {
1593 if (slave->aux == slave_) {
1594 return slave;
1595 }
1596 }
1597
1598 return NULL;
1599 }
1600
1601 static void
1602 bond_enable_slave(struct bond_slave *slave, bool enable)
1603 {
1604 slave->delay_expires = LLONG_MAX;
1605 if (enable != slave->enabled) {
1606 slave->bond->bond_revalidate = true;
1607 slave->enabled = enable;
1608
1609 ovs_mutex_lock(&slave->bond->mutex);
1610 if (enable) {
1611 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1612 } else {
1613 list_remove(&slave->list_node);
1614 }
1615 ovs_mutex_unlock(&slave->bond->mutex);
1616
1617 VLOG_INFO("interface %s: %s", slave->name,
1618 slave->enabled ? "enabled" : "disabled");
1619 }
1620 }
1621
1622 static void
1623 bond_link_status_update(struct bond_slave *slave)
1624 {
1625 struct bond *bond = slave->bond;
1626 bool up;
1627
1628 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1629 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1630 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1631 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1632 slave->name, up ? "up" : "down");
1633 if (up == slave->enabled) {
1634 slave->delay_expires = LLONG_MAX;
1635 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1636 slave->name, up ? "disabled" : "enabled");
1637 } else {
1638 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1639 : up ? bond->updelay : bond->downdelay);
1640 slave->delay_expires = time_msec() + delay;
1641 if (delay) {
1642 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1643 "for %d ms",
1644 slave->name,
1645 up ? "enabled" : "disabled",
1646 up ? "up" : "down",
1647 delay);
1648 }
1649 }
1650 }
1651
1652 if (time_msec() >= slave->delay_expires) {
1653 bond_enable_slave(slave, up);
1654 }
1655 }
1656
1657 static unsigned int
1658 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1659 {
1660 return hash_mac(mac, vlan, basis);
1661 }
1662
1663 static unsigned int
1664 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1665 {
1666 struct flow hash_flow = *flow;
1667 hash_flow.vlan_tci = htons(vlan);
1668
1669 /* The symmetric quality of this hash function is not required, but
1670 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1671 * purposes, so we use it out of convenience. */
1672 return flow_hash_symmetric_l4(&hash_flow, basis);
1673 }
1674
1675 static unsigned int
1676 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1677 {
1678 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1679
1680 return (bond->balance == BM_TCP
1681 ? bond_hash_tcp(flow, vlan, bond->basis)
1682 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1683 }
1684
1685 static struct bond_entry *
1686 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1687 uint16_t vlan)
1688 {
1689 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1690 }
1691
1692 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1693 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1694 * returns NULL. */
1695 static struct bond_slave *
1696 get_enabled_slave(struct bond *bond)
1697 {
1698 struct list *node;
1699
1700 ovs_mutex_lock(&bond->mutex);
1701 if (list_is_empty(&bond->enabled_slaves)) {
1702 ovs_mutex_unlock(&bond->mutex);
1703 return NULL;
1704 }
1705
1706 node = list_pop_front(&bond->enabled_slaves);
1707 list_push_back(&bond->enabled_slaves, node);
1708 ovs_mutex_unlock(&bond->mutex);
1709
1710 return CONTAINER_OF(node, struct bond_slave, list_node);
1711 }
1712
1713 static struct bond_slave *
1714 choose_output_slave(const struct bond *bond, const struct flow *flow,
1715 struct flow_wildcards *wc, uint16_t vlan)
1716 {
1717 struct bond_entry *e;
1718 int balance;
1719
1720 balance = bond->balance;
1721 if (bond->lacp_status == LACP_CONFIGURED) {
1722 /* LACP has been configured on this bond but negotiations were
1723 * unsuccussful. If lacp_fallback_ab is enabled use active-
1724 * backup mode else drop all traffic. */
1725 if (!bond->lacp_fallback_ab) {
1726 return NULL;
1727 }
1728 balance = BM_AB;
1729 }
1730
1731 switch (balance) {
1732 case BM_AB:
1733 return bond->active_slave;
1734
1735 case BM_TCP:
1736 if (bond->lacp_status != LACP_NEGOTIATED) {
1737 /* Must have LACP negotiations for TCP balanced bonds. */
1738 return NULL;
1739 }
1740 if (wc) {
1741 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1742 }
1743 /* Fall Through. */
1744 case BM_SLB:
1745 if (wc) {
1746 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1747 }
1748 e = lookup_bond_entry(bond, flow, vlan);
1749 if (!e->slave || !e->slave->enabled) {
1750 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
1751 }
1752 return e->slave;
1753
1754 default:
1755 OVS_NOT_REACHED();
1756 }
1757 }
1758
1759 static struct bond_slave *
1760 bond_choose_slave(const struct bond *bond)
1761 {
1762 struct bond_slave *slave, *best;
1763
1764 /* Find an enabled slave. */
1765 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1766 if (slave->enabled) {
1767 return slave;
1768 }
1769 }
1770
1771 /* All interfaces are disabled. Find an interface that will be enabled
1772 * after its updelay expires. */
1773 best = NULL;
1774 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1775 if (slave->delay_expires != LLONG_MAX
1776 && slave->may_enable
1777 && (!best || slave->delay_expires < best->delay_expires)) {
1778 best = slave;
1779 }
1780 }
1781 return best;
1782 }
1783
1784 static void
1785 bond_choose_active_slave(struct bond *bond)
1786 {
1787 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1788 struct bond_slave *old_active_slave = bond->active_slave;
1789
1790 bond->active_slave = bond_choose_slave(bond);
1791 if (bond->active_slave) {
1792 if (bond->active_slave->enabled) {
1793 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1794 bond->name, bond->active_slave->name);
1795 } else {
1796 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1797 "remaining %lld ms updelay (since no interface was "
1798 "enabled)", bond->name, bond->active_slave->name,
1799 bond->active_slave->delay_expires - time_msec());
1800 bond_enable_slave(bond->active_slave, true);
1801 }
1802
1803 bond->send_learning_packets = true;
1804 } else if (old_active_slave) {
1805 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1806 }
1807 }
1808
1809 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1810 * bond interface. */
1811 static void
1812 bond_update_fake_slave_stats(struct bond *bond)
1813 {
1814 struct netdev_stats bond_stats;
1815 struct bond_slave *slave;
1816 struct netdev *bond_dev;
1817
1818 memset(&bond_stats, 0, sizeof bond_stats);
1819
1820 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1821 struct netdev_stats slave_stats;
1822
1823 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1824 /* XXX: We swap the stats here because they are swapped back when
1825 * reported by the internal device. The reason for this is
1826 * internal devices normally represent packets going into the
1827 * system but when used as fake bond device they represent packets
1828 * leaving the system. We really should do this in the internal
1829 * device itself because changing it here reverses the counts from
1830 * the perspective of the switch. However, the internal device
1831 * doesn't know what type of device it represents so we have to do
1832 * it here for now. */
1833 bond_stats.tx_packets += slave_stats.rx_packets;
1834 bond_stats.tx_bytes += slave_stats.rx_bytes;
1835 bond_stats.rx_packets += slave_stats.tx_packets;
1836 bond_stats.rx_bytes += slave_stats.tx_bytes;
1837 }
1838 }
1839
1840 if (!netdev_open(bond->name, "system", &bond_dev)) {
1841 netdev_set_stats(bond_dev, &bond_stats);
1842 netdev_close(bond_dev);
1843 }
1844 }