]> git.proxmox.com Git - mirror_ovs.git/blob - ofproto/bond.c
49dd49e0b90869ca857681bbf45cb757ccb03cf5
[mirror_ovs.git] / ofproto / bond.c
1 /*
2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "bond.h"
20
21 #include <limits.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <math.h>
25
26 #include "ofp-util.h"
27 #include "ofp-actions.h"
28 #include "ofpbuf.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
32 #include "coverage.h"
33 #include "dynamic-string.h"
34 #include "flow.h"
35 #include "hmap.h"
36 #include "lacp.h"
37 #include "list.h"
38 #include "netdev.h"
39 #include "odp-util.h"
40 #include "ofpbuf.h"
41 #include "packets.h"
42 #include "poll-loop.h"
43 #include "seq.h"
44 #include "match.h"
45 #include "shash.h"
46 #include "timeval.h"
47 #include "unixctl.h"
48 #include "vlog.h"
49
50 VLOG_DEFINE_THIS_MODULE(bond);
51
52 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
53 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
55
56 /* Bit-mask for hashing a flow down to a bucket.
57 * There are (BOND_MASK + 1) buckets. */
58 #define BOND_MASK 0xff
59 #define RECIRC_RULE_PRIORITY 20 /* Priority level for internal rules */
60
/* A hash bucket for mapping a flow to a slave.
 * "struct bond" has an array of (BOND_MASK + 1) of these. */
struct bond_entry {
    struct bond_slave *slave;   /* Assigned slave, NULL if unassigned. */
    uint64_t tx_bytes;          /* Count of bytes recently transmitted. */
    struct list list_node;      /* In bond_slave's 'entries' list. */

    /* Recirculation. */
    struct rule *pr_rule;       /* Post recirculation rule for this entry. */
    uint64_t pr_tx_bytes;       /* Byte count last read from the rule; the
                                 * difference from the next reading is the
                                 * delta added to 'tx_bytes' above. */
};
74
/* A bond slave, that is, one of the links comprising a bond. */
struct bond_slave {
    struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
    struct list list_node;      /* In struct bond's enabled_slaves list. */
    struct bond *bond;          /* The bond that contains this slave. */
    void *aux;                  /* Client-provided handle for this slave;
                                 * also the key hashed into 'slaves'. */

    struct netdev *netdev;      /* Network device, owned by the client. */
    unsigned int change_seq;    /* Tracks changes in 'netdev'; reset to 0
                                 * whenever 'netdev' is replaced. */
    ofp_port_t ofp_port;        /* OpenFlow port number. */
    char *name;                 /* Name (a copy of netdev_get_name(netdev)). */

    /* Link status. */
    long long delay_expires;    /* Time after which 'enabled' may change;
                                 * LLONG_MAX when no change is pending. */
    bool enabled;               /* May be chosen for flows? */
    bool may_enable;            /* Client considers this slave bondable. */

    /* Rebalancing info.  Used only by bond_rebalance(). */
    struct list bal_node;       /* In bond_rebalance()'s 'bals' list. */
    struct list entries;        /* 'struct bond_entry's assigned here. */
    uint64_t tx_bytes;          /* Sum across 'tx_bytes' of entries. */
};
97
/* A bond, that is, a set of network devices grouped to improve performance or
 * robustness. */
struct bond {
    struct hmap_node hmap_node; /* In 'all_bonds' hmap, hashed by 'name'. */
    char *name;                 /* Name provided by client. */
    struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */

    /* Slaves, as 'struct bond_slave's hashed by their client handle. */
    struct hmap slaves;

    /* Enabled slaves.
     *
     * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
     * (To prevent the bond_slave from disappearing they must also hold
     * 'rwlock'.) */
    struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
    struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */

    /* Bonding info. */
    enum bond_mode balance;     /* Balancing mode, one of BM_*. */
    struct bond_slave *active_slave; /* Currently active slave; may be NULL. */
    int updelay, downdelay;     /* Delay before slave goes up/down, in ms. */
    enum lacp_status lacp_status; /* Status of LACP negotiations. */
    bool bond_revalidate;       /* True if flows need revalidation. */
    uint32_t basis;             /* Basis for flow hash function. */

    /* SLB specific bonding info. */
    struct bond_entry *hash;    /* An array of (BOND_MASK + 1) elements, or
                                 * NULL. */
    int rebalance_interval;     /* Interval between rebalances, in ms. */
    long long int next_rebalance; /* Next rebalancing time. */
    bool send_learning_packets; /* Set when the active slave is removed;
                                 * cleared when the client polls it. */
    uint32_t recirc_id;         /* Nonzero if recirculation can be used. */
    struct hmap pr_rule_ops;    /* Helps to maintain post recirculation
                                 * rules; holds 'struct bond_pr_rule_op's. */

    /* Legacy compatibility. */
    long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
    bool lacp_fallback_ab;      /* Fall back to active-backup on LACP
                                 * failure. */

    struct ovs_refcount ref_cnt; /* Reference count; see bond_ref() and
                                  * bond_unref(). */
};
138
/* What to do with a 'struct bond_pr_rule_op'. */
enum bond_op {
    ADD,                        /* Add the rule to ofproto's flow table. */
    DEL,                        /* Delete the rule from ofproto's flow
                                 * table. */
};
144
/* A rule to add to or delete from ofproto's internal flow table. */
struct bond_pr_rule_op {
    struct hmap_node hmap_node; /* In struct bond's 'pr_rule_ops' hmap, hashed
                                 * by match_hash() of 'match'. */
    struct match match;         /* Match that identifies the rule. */
    ofp_port_t out_ofport;      /* Port used in the rule's output action. */
    enum bond_op op;            /* Pending operation: ADD or DEL. */
    struct rule *pr_rule;       /* Rule installed for this operation, or
                                 * NULL. */
};
153
154 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
155 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
156 OVS_REQ_RDLOCK(rwlock);
157 static void bond_enable_slave(struct bond_slave *, bool enable)
158 OVS_REQ_WRLOCK(rwlock);
159 static void bond_link_status_update(struct bond_slave *)
160 OVS_REQ_WRLOCK(rwlock);
161 static void bond_choose_active_slave(struct bond *)
162 OVS_REQ_WRLOCK(rwlock);;
163 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
164 uint16_t vlan, uint32_t basis);
165 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
166 uint32_t basis);
167 static struct bond_entry *lookup_bond_entry(const struct bond *,
168 const struct flow *,
169 uint16_t vlan)
170 OVS_REQ_RDLOCK(rwlock);
171 static struct bond_slave *get_enabled_slave(struct bond *)
172 OVS_REQ_RDLOCK(rwlock);
173 static struct bond_slave *choose_output_slave(const struct bond *,
174 const struct flow *,
175 struct flow_wildcards *,
176 uint16_t vlan)
177 OVS_REQ_RDLOCK(rwlock);
178 static void bond_update_fake_slave_stats(struct bond *)
179 OVS_REQ_RDLOCK(rwlock);
180
181 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
182 * stores the mode in '*balance' and returns true. Otherwise returns false
183 * without modifying '*balance'. */
184 bool
185 bond_mode_from_string(enum bond_mode *balance, const char *s)
186 {
187 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
188 *balance = BM_TCP;
189 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
190 *balance = BM_SLB;
191 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
192 *balance = BM_AB;
193 } else {
194 return false;
195 }
196 return true;
197 }
198
199 /* Returns a string representing 'balance'. */
200 const char *
201 bond_mode_to_string(enum bond_mode balance) {
202 switch (balance) {
203 case BM_TCP:
204 return "balance-tcp";
205 case BM_SLB:
206 return "balance-slb";
207 case BM_AB:
208 return "active-backup";
209 }
210 OVS_NOT_REACHED();
211 }
212
213 \f
214 /* Creates and returns a new bond whose configuration is initially taken from
215 * 's'.
216 *
217 * The caller should register each slave on the new bond by calling
218 * bond_slave_register(). */
219 struct bond *
220 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
221 {
222 struct bond *bond;
223
224 bond = xzalloc(sizeof *bond);
225 bond->ofproto = ofproto;
226 hmap_init(&bond->slaves);
227 list_init(&bond->enabled_slaves);
228 ovs_mutex_init(&bond->mutex);
229 bond->next_fake_iface_update = LLONG_MAX;
230 ovs_refcount_init(&bond->ref_cnt);
231
232 bond->recirc_id = 0;
233 hmap_init(&bond->pr_rule_ops);
234
235 bond_reconfigure(bond, s);
236 return bond;
237 }
238
239 struct bond *
240 bond_ref(const struct bond *bond_)
241 {
242 struct bond *bond = CONST_CAST(struct bond *, bond_);
243
244 if (bond) {
245 ovs_refcount_ref(&bond->ref_cnt);
246 }
247 return bond;
248 }
249
250 /* Frees 'bond'. */
251 void
252 bond_unref(struct bond *bond)
253 {
254 struct bond_slave *slave, *next_slave;
255 struct bond_pr_rule_op *pr_op, *next_op;
256
257 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
258 return;
259 }
260
261 ovs_rwlock_wrlock(&rwlock);
262 hmap_remove(all_bonds, &bond->hmap_node);
263 ovs_rwlock_unlock(&rwlock);
264
265 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
266 hmap_remove(&bond->slaves, &slave->hmap_node);
267 /* Client owns 'slave->netdev'. */
268 free(slave->name);
269 free(slave);
270 }
271 hmap_destroy(&bond->slaves);
272
273 ovs_mutex_destroy(&bond->mutex);
274 free(bond->hash);
275 free(bond->name);
276
277 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
278 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
279 free(pr_op);
280 }
281 hmap_destroy(&bond->pr_rule_ops);
282
283 if (bond->recirc_id) {
284 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
285 }
286
287 free(bond);
288 }
289
290 static void
291 add_pr_rule(struct bond *bond, const struct match *match,
292 ofp_port_t out_ofport, struct rule *rule)
293 {
294 uint32_t hash = match_hash(match, 0);
295 struct bond_pr_rule_op *pr_op;
296
297 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
298 if (match_equal(&pr_op->match, match)) {
299 pr_op->op = ADD;
300 pr_op->out_ofport = out_ofport;
301 pr_op->pr_rule = rule;
302 return;
303 }
304 }
305
306 pr_op = xmalloc(sizeof *pr_op);
307 pr_op->match = *match;
308 pr_op->op = ADD;
309 pr_op->out_ofport = out_ofport;
310 pr_op->pr_rule = rule;
311 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
312 }
313
314 static void
315 update_recirc_rules(struct bond *bond)
316 {
317 struct match match;
318 struct bond_pr_rule_op *pr_op, *next_op;
319 uint64_t ofpacts_stub[128 / 8];
320 struct ofpbuf ofpacts;
321 int i;
322
323 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
324
325 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
326 pr_op->op = DEL;
327 }
328
329 if ((bond->hash == NULL) || (!bond->recirc_id)) {
330 return;
331 }
332
333 for (i = 0; i < BOND_MASK + 1; i++) {
334 struct bond_slave *slave = bond->hash[i].slave;
335
336 if (slave) {
337 match_init_catchall(&match);
338 match_set_recirc_id(&match, bond->recirc_id);
339 /* recirc_id -> metadata to speed up look ups. */
340 match_set_metadata(&match, htonll(bond->recirc_id));
341 match_set_dp_hash_masked(&match, i, BOND_MASK);
342
343 add_pr_rule(bond, &match, slave->ofp_port,
344 bond->hash[i].pr_rule);
345 }
346 }
347
348 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
349 int error;
350 struct rule *rule;
351 switch (pr_op->op) {
352 case ADD:
353 ofpbuf_clear(&ofpacts);
354 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
355 error = ofproto_dpif_add_internal_flow(bond->ofproto,
356 &pr_op->match,
357 RECIRC_RULE_PRIORITY,
358 &ofpacts, &rule);
359 if (error) {
360 char *err_s = match_to_string(&pr_op->match,
361 RECIRC_RULE_PRIORITY);
362
363 VLOG_ERR("failed to add post recirculation flow %s", err_s);
364 free(err_s);
365 pr_op->pr_rule = NULL;
366 } else {
367 pr_op->pr_rule = rule;
368 }
369 break;
370
371 case DEL:
372 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
373 &pr_op->match,
374 RECIRC_RULE_PRIORITY);
375 if (error) {
376 char *err_s = match_to_string(&pr_op->match,
377 RECIRC_RULE_PRIORITY);
378
379 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
380 free(err_s);
381 }
382
383 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
384 pr_op->pr_rule = NULL;
385 free(pr_op);
386 break;
387 }
388 }
389
390 ofpbuf_uninit(&ofpacts);
391 }
392
393
394 /* Updates 'bond''s overall configuration to 's'.
395 *
396 * The caller should register each slave on 'bond' by calling
397 * bond_slave_register(). This is optional if none of the slaves'
398 * configuration has changed. In any case it can't hurt.
399 *
400 * Returns true if the configuration has changed in such a way that requires
401 * flow revalidation.
402 * */
403 bool
404 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
405 {
406 bool revalidate = false;
407
408 ovs_rwlock_wrlock(&rwlock);
409 if (!bond->name || strcmp(bond->name, s->name)) {
410 if (bond->name) {
411 hmap_remove(all_bonds, &bond->hmap_node);
412 free(bond->name);
413 }
414 bond->name = xstrdup(s->name);
415 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
416 }
417
418 bond->updelay = s->up_delay;
419 bond->downdelay = s->down_delay;
420
421 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
422 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
423 revalidate = true;
424 }
425
426 if (bond->rebalance_interval != s->rebalance_interval) {
427 bond->rebalance_interval = s->rebalance_interval;
428 revalidate = true;
429 }
430
431 if (bond->balance != s->balance) {
432 bond->balance = s->balance;
433 revalidate = true;
434 }
435
436 if (bond->basis != s->basis) {
437 bond->basis = s->basis;
438 revalidate = true;
439 }
440
441 if (s->fake_iface) {
442 if (bond->next_fake_iface_update == LLONG_MAX) {
443 bond->next_fake_iface_update = time_msec();
444 }
445 } else {
446 bond->next_fake_iface_update = LLONG_MAX;
447 }
448
449 if (bond->bond_revalidate) {
450 revalidate = true;
451 bond->bond_revalidate = false;
452 }
453
454 if (bond->balance != BM_AB) {
455 if (!bond->recirc_id) {
456 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
457 }
458 } else if (bond->recirc_id) {
459 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
460 bond->recirc_id = 0;
461 }
462
463 if (bond->balance == BM_AB || !bond->hash || revalidate) {
464 bond_entry_reset(bond);
465 }
466
467 ovs_rwlock_unlock(&rwlock);
468 return revalidate;
469 }
470
471 static void
472 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
473 OVS_REQ_WRLOCK(rwlock)
474 {
475 if (slave->netdev != netdev) {
476 slave->netdev = netdev;
477 slave->change_seq = 0;
478 }
479 }
480
481 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
482 * arbitrary client-provided pointer that uniquely identifies a slave within a
483 * bond. If 'slave_' already exists within 'bond' then this function
484 * reconfigures the existing slave.
485 *
486 * 'netdev' must be the network device that 'slave_' represents. It is owned
487 * by the client, so the client must not close it before either unregistering
488 * 'slave_' or destroying 'bond'.
489 */
490 void
491 bond_slave_register(struct bond *bond, void *slave_,
492 ofp_port_t ofport, struct netdev *netdev)
493 {
494 struct bond_slave *slave;
495
496 ovs_rwlock_wrlock(&rwlock);
497 slave = bond_slave_lookup(bond, slave_);
498 if (!slave) {
499 slave = xzalloc(sizeof *slave);
500
501 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
502 slave->bond = bond;
503 slave->aux = slave_;
504 slave->ofp_port = ofport;
505 slave->delay_expires = LLONG_MAX;
506 slave->name = xstrdup(netdev_get_name(netdev));
507 bond->bond_revalidate = true;
508
509 slave->enabled = false;
510 bond_enable_slave(slave, netdev_get_carrier(netdev));
511 }
512
513 bond_slave_set_netdev__(slave, netdev);
514
515 free(slave->name);
516 slave->name = xstrdup(netdev_get_name(netdev));
517 ovs_rwlock_unlock(&rwlock);
518 }
519
520 /* Updates the network device to be used with 'slave_' to 'netdev'.
521 *
522 * This is useful if the caller closes and re-opens the network device
523 * registered with bond_slave_register() but doesn't need to change anything
524 * else. */
525 void
526 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
527 {
528 struct bond_slave *slave;
529
530 ovs_rwlock_wrlock(&rwlock);
531 slave = bond_slave_lookup(bond, slave_);
532 if (slave) {
533 bond_slave_set_netdev__(slave, netdev);
534 }
535 ovs_rwlock_unlock(&rwlock);
536 }
537
538 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
539 * then this function has no effect.
540 *
541 * Unregistering a slave invalidates all flows. */
542 void
543 bond_slave_unregister(struct bond *bond, const void *slave_)
544 {
545 struct bond_slave *slave;
546 bool del_active;
547
548 ovs_rwlock_wrlock(&rwlock);
549 slave = bond_slave_lookup(bond, slave_);
550 if (!slave) {
551 goto out;
552 }
553
554 bond->bond_revalidate = true;
555 bond_enable_slave(slave, false);
556
557 del_active = bond->active_slave == slave;
558 if (bond->hash) {
559 struct bond_entry *e;
560 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
561 if (e->slave == slave) {
562 e->slave = NULL;
563 }
564 }
565 }
566
567 free(slave->name);
568
569 hmap_remove(&bond->slaves, &slave->hmap_node);
570 /* Client owns 'slave->netdev'. */
571 free(slave);
572
573 if (del_active) {
574 bond_choose_active_slave(bond);
575 bond->send_learning_packets = true;
576 }
577 out:
578 ovs_rwlock_unlock(&rwlock);
579 }
580
581 /* Should be called on each slave in 'bond' before bond_run() to indicate
582 * whether or not 'slave_' may be enabled. This function is intended to allow
583 * other protocols to have some impact on bonding decisions. For example LACP
584 * or high level link monitoring protocols may decide that a given slave should
585 * not be able to send traffic. */
586 void
587 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
588 {
589 ovs_rwlock_wrlock(&rwlock);
590 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
591 ovs_rwlock_unlock(&rwlock);
592 }
593
594 /* Performs periodic maintenance on 'bond'.
595 *
596 * Returns true if the caller should revalidate its flows.
597 *
598 * The caller should check bond_should_send_learning_packets() afterward. */
599 bool
600 bond_run(struct bond *bond, enum lacp_status lacp_status)
601 {
602 struct bond_slave *slave;
603 bool revalidate;
604
605 ovs_rwlock_wrlock(&rwlock);
606 if (bond->lacp_status != lacp_status) {
607 bond->lacp_status = lacp_status;
608 bond->bond_revalidate = true;
609 }
610
611 /* Enable slaves based on link status and LACP feedback. */
612 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
613 bond_link_status_update(slave);
614 slave->change_seq = seq_read(connectivity_seq_get());
615 }
616 if (!bond->active_slave || !bond->active_slave->enabled) {
617 bond_choose_active_slave(bond);
618 }
619
620 /* Update fake bond interface stats. */
621 if (time_msec() >= bond->next_fake_iface_update) {
622 bond_update_fake_slave_stats(bond);
623 bond->next_fake_iface_update = time_msec() + 1000;
624 }
625
626 revalidate = bond->bond_revalidate;
627 bond->bond_revalidate = false;
628 ovs_rwlock_unlock(&rwlock);
629
630 return revalidate;
631 }
632
633 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
634 void
635 bond_wait(struct bond *bond)
636 {
637 struct bond_slave *slave;
638
639 ovs_rwlock_rdlock(&rwlock);
640 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
641 if (slave->delay_expires != LLONG_MAX) {
642 poll_timer_wait_until(slave->delay_expires);
643 }
644
645 seq_wait(connectivity_seq_get(), slave->change_seq);
646 }
647
648 if (bond->next_fake_iface_update != LLONG_MAX) {
649 poll_timer_wait_until(bond->next_fake_iface_update);
650 }
651
652 if (bond->bond_revalidate) {
653 poll_immediate_wake();
654 }
655 ovs_rwlock_unlock(&rwlock);
656
657 /* We don't wait for bond->next_rebalance because rebalancing can only run
658 * at a flow account checkpoint. ofproto does checkpointing on its own
659 * schedule and bond_rebalance() gets called afterward, so we'd just be
660 * waking up for no purpose. */
661 }
662 \f
663 /* MAC learning table interaction. */
664
665 static bool
666 may_send_learning_packets(const struct bond *bond)
667 {
668 return ((bond->lacp_status == LACP_DISABLED
669 && (bond->balance == BM_SLB || bond->balance == BM_AB))
670 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
671 && bond->active_slave;
672 }
673
674 /* Returns true if 'bond' needs the client to send out packets to assist with
675 * MAC learning on 'bond'. If this function returns true, then the client
676 * should iterate through its MAC learning table for the bridge on which 'bond'
677 * is located. For each MAC that has been learned on a port other than 'bond',
678 * it should call bond_compose_learning_packet().
679 *
680 * This function will only return true if 'bond' is in SLB or active-backup
681 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
682 * necessary.
683 *
684 * Calling this function resets the state that it checks. */
685 bool
686 bond_should_send_learning_packets(struct bond *bond)
687 {
688 bool send;
689
690 ovs_rwlock_wrlock(&rwlock);
691 send = bond->send_learning_packets && may_send_learning_packets(bond);
692 bond->send_learning_packets = false;
693 ovs_rwlock_unlock(&rwlock);
694 return send;
695 }
696
697 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
698 *
699 * See bond_should_send_learning_packets() for description of usage. The
700 * caller should send the composed packet on the port associated with
701 * port_aux and takes ownership of the returned ofpbuf. */
702 struct ofpbuf *
703 bond_compose_learning_packet(struct bond *bond,
704 const uint8_t eth_src[ETH_ADDR_LEN],
705 uint16_t vlan, void **port_aux)
706 {
707 struct bond_slave *slave;
708 struct ofpbuf *packet;
709 struct flow flow;
710
711 ovs_rwlock_rdlock(&rwlock);
712 ovs_assert(may_send_learning_packets(bond));
713 memset(&flow, 0, sizeof flow);
714 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
715 slave = choose_output_slave(bond, &flow, NULL, vlan);
716
717 packet = ofpbuf_new(0);
718 compose_rarp(packet, eth_src);
719 if (vlan) {
720 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
721 }
722
723 *port_aux = slave->aux;
724 ovs_rwlock_unlock(&rwlock);
725 return packet;
726 }
727 \f
/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
 * Ethernet destination address of 'eth_dst', should be admitted.
 *
 * The return value is one of the following:
 *
 *    - BV_ACCEPT: Admit the packet.
 *
 *    - BV_DROP: Drop the packet.  This is also the verdict when 'slave_' is
 *      not a slave of 'bond'.
 *
 *    - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
 *      Ethernet source address and VLAN.  If there is none, or if the packet
 *      is on the learned port, then admit the packet.  If a different port has
 *      been learned, however, drop the packet (and do not use it for MAC
 *      learning).
 */
enum bond_verdict
bond_check_admissibility(struct bond *bond, const void *slave_,
                         const uint8_t eth_dst[ETH_ADDR_LEN])
{
    /* Every 'goto out' that does not set 'verdict' first drops the packet. */
    enum bond_verdict verdict = BV_DROP;
    struct bond_slave *slave;

    ovs_rwlock_rdlock(&rwlock);
    slave = bond_slave_lookup(bond, slave_);
    if (!slave) {
        goto out;
    }

    /* LACP bonds have very loose admissibility restrictions because we can
     * assume the remote switch is aware of the bond and will "do the right
     * thing".  However, as a precaution we drop packets on disabled slaves
     * because no correctly implemented partner switch should be sending
     * packets to them.
     *
     * If LACP is configured, but LACP negotiations have been unsuccessful, we
     * drop all incoming traffic except if lacp_fallback_ab is enabled. */
    switch (bond->lacp_status) {
    case LACP_NEGOTIATED:
        verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
        goto out;
    case LACP_CONFIGURED:
        if (!bond->lacp_fallback_ab) {
            goto out;
        }
        /* Fall through: with fallback enabled, continue with the checks
         * that follow the switch, as for LACP_DISABLED. */
    case LACP_DISABLED:
        break;
    }

    /* Drop all multicast packets on inactive slaves. */
    if (eth_addr_is_multicast(eth_dst)) {
        if (bond->active_slave != slave) {
            goto out;
        }
    }

    switch (bond->balance) {
    case BM_TCP:
        /* TCP balanced bonds require successful LACP negotiations.  Based on
         * the above check, LACP is off or lacp_fallback_ab is true on this
         * bond.  If lacp_fallback_ab is true fall through to BM_AB case else,
         * we drop all incoming traffic. */
        if (!bond->lacp_fallback_ab) {
            goto out;
        }
        /* Fall through. */

    case BM_AB:
        /* Drop all packets which arrive on backup slaves.  This is similar to
         * how Linux bonding handles active-backup bonds. */
        if (bond->active_slave != slave) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

            VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
                        " slave (%s) destined for " ETH_ADDR_FMT,
                        slave->name, ETH_ADDR_ARGS(eth_dst));
            goto out;
        }
        verdict = BV_ACCEPT;
        goto out;

    case BM_SLB:
        /* Drop all packets for which we have learned a different input port,
         * because we probably sent the packet on one slave and got it back on
         * the other.  Gratuitous ARP packets are an exception to this rule:
         * the host has moved to another switch.  The exception to the
         * exception is if we locked the learning table to avoid reflections on
         * bond slaves. */
        verdict = BV_DROP_IF_MOVED;
        goto out;
    }

    /* Every balancing mode above leaves through 'out'. */
    OVS_NOT_REACHED();
out:
    ovs_rwlock_unlock(&rwlock);
    return verdict;

}
824
825 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
826 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
827 * NULL if the packet should be dropped because no slaves are enabled.
828 *
829 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
830 * should be a VID only (i.e. excluding the PCP bits). Second,
831 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
832 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
833 * packet belongs to (so for an access port it will be the access port's VLAN).
834 *
835 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
836 * significant in the selection. At some point earlier, 'wc' should
837 * have been initialized (e.g., by flow_wildcards_init_catchall()).
838 */
839 void *
840 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
841 struct flow_wildcards *wc, uint16_t vlan)
842 {
843 struct bond_slave *slave;
844 void *aux;
845
846 ovs_rwlock_rdlock(&rwlock);
847 slave = choose_output_slave(bond, flow, wc, vlan);
848 aux = slave ? slave->aux : NULL;
849 ovs_rwlock_unlock(&rwlock);
850
851 return aux;
852 }
853 \f
854 /* Recirculation. */
855 static void
856 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
857 OVS_REQ_RDLOCK(rwlock)
858 {
859 if (entry->slave) {
860 uint64_t delta;
861
862 delta = rule_tx_bytes - entry->pr_tx_bytes;
863 entry->tx_bytes += delta;
864 entry->pr_tx_bytes = rule_tx_bytes;
865 }
866 }
867
868 /* Maintain bond stats using post recirculation rule byte counters.*/
869 void
870 bond_recirculation_account(struct bond *bond)
871 {
872 int i;
873
874 ovs_rwlock_rdlock(&rwlock);
875 for (i=0; i<=BOND_MASK; i++) {
876 struct bond_entry *entry = &bond->hash[i];
877 struct rule *rule = entry->pr_rule;
878
879 if (rule) {
880 uint64_t n_packets OVS_UNUSED;
881 long long int used OVS_UNUSED;
882 uint64_t n_bytes;
883
884 rule->ofproto->ofproto_class->rule_get_stats(
885 rule, &n_packets, &n_bytes, &used);
886 bond_entry_account(entry, n_bytes);
887 }
888 }
889 ovs_rwlock_unlock(&rwlock);
890 }
891
892 bool
893 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
894 uint32_t *hash_bias)
895 {
896 if (bond->balance == BM_TCP) {
897 if (recirc_id) {
898 *recirc_id = bond->recirc_id;
899 }
900 if (hash_bias) {
901 *hash_bias = bond->basis;
902 }
903 return true;
904 } else {
905 return false;
906 }
907 }
908
909 void
910 bond_update_post_recirc_rules(struct bond* bond, const bool force)
911 {
912 struct bond_entry *e;
913 bool update_rules = force; /* Always update rules if caller forces it. */
914
915 /* Make sure all bond entries are populated */
916 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
917 if (!e->slave || !e->slave->enabled) {
918 update_rules = true;
919 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
920 struct bond_slave, hmap_node);
921 if (!e->slave->enabled) {
922 e->slave = bond->active_slave;
923 }
924 }
925 }
926
927 if (update_rules) {
928 update_recirc_rules(bond);
929 }
930 }
931 \f
932 /* Rebalancing. */
933
934 static bool
935 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
936 {
937 return bond->rebalance_interval
938 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
939 }
940
941 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
942 void
943 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
944 uint64_t n_bytes)
945 {
946 ovs_rwlock_wrlock(&rwlock);
947 if (bond_is_balanced(bond)) {
948 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
949 }
950 ovs_rwlock_unlock(&rwlock);
951 }
952
953 static struct bond_slave *
954 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
955 {
956 return CONTAINER_OF(bal, struct bond_slave, bal_node);
957 }
958
959 static void
960 log_bals(struct bond *bond, const struct list *bals)
961 {
962 if (VLOG_IS_DBG_ENABLED()) {
963 struct ds ds = DS_EMPTY_INITIALIZER;
964 const struct bond_slave *slave;
965
966 LIST_FOR_EACH (slave, bal_node, bals) {
967 if (ds.length) {
968 ds_put_char(&ds, ',');
969 }
970 ds_put_format(&ds, " %s %"PRIu64"kB",
971 slave->name, slave->tx_bytes / 1024);
972
973 if (!slave->enabled) {
974 ds_put_cstr(&ds, " (disabled)");
975 }
976 if (!list_is_empty(&slave->entries)) {
977 struct bond_entry *e;
978
979 ds_put_cstr(&ds, " (");
980 LIST_FOR_EACH (e, list_node, &slave->entries) {
981 if (&e->list_node != list_front(&slave->entries)) {
982 ds_put_cstr(&ds, " + ");
983 }
984 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
985 e - bond->hash, e->tx_bytes / 1024);
986 }
987 ds_put_cstr(&ds, ")");
988 }
989 }
990 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
991 ds_destroy(&ds);
992 }
993 }
994
995 /* Shifts 'hash' from its current slave to 'to'. */
996 static void
997 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
998 {
999 struct bond_slave *from = hash->slave;
1000 struct bond *bond = from->bond;
1001 uint64_t delta = hash->tx_bytes;
1002
1003 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1004 "from %s to %s (now carrying %"PRIu64"kB and "
1005 "%"PRIu64"kB load, respectively)",
1006 bond->name, delta / 1024, hash - bond->hash,
1007 from->name, to->name,
1008 (from->tx_bytes - delta) / 1024,
1009 (to->tx_bytes + delta) / 1024);
1010
1011 /* Shift load away from 'from' to 'to'. */
1012 from->tx_bytes -= delta;
1013 to->tx_bytes += delta;
1014
1015 /* Arrange for flows to be revalidated. */
1016 hash->slave = to;
1017 bond->bond_revalidate = true;
1018 }
1019
1020 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1021 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1022 * given that doing so must decrease the ratio of the load on the two slaves by
1023 * at least 0.1. Returns NULL if there is no appropriate entry.
1024 *
1025 * The list of entries isn't sorted. I don't know of a reason to prefer to
1026 * shift away small hashes or large hashes. */
1027 static struct bond_entry *
1028 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1029 {
1030 struct bond_entry *e;
1031
1032 if (list_is_short(&from->entries)) {
1033 /* 'from' carries no more than one MAC hash, so shifting load away from
1034 * it would be pointless. */
1035 return NULL;
1036 }
1037
1038 LIST_FOR_EACH (e, list_node, &from->entries) {
1039 double old_ratio, new_ratio;
1040 uint64_t delta;
1041
1042 if (to_tx_bytes == 0) {
1043 /* Nothing on the new slave, move it. */
1044 return e;
1045 }
1046
1047 delta = e->tx_bytes;
1048 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1049 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1050 if (old_ratio - new_ratio > 0.1
1051 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1052 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1053 and 'to' slave have the same load. Therefore, we only move an
1054 entry if it decreases the load on 'from', and brings us closer
1055 to equal traffic load. */
1056 return e;
1057 }
1058 }
1059
1060 return NULL;
1061 }
1062
1063 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1064 * maintained. */
1065 static void
1066 insert_bal(struct list *bals, struct bond_slave *slave)
1067 {
1068 struct bond_slave *pos;
1069
1070 LIST_FOR_EACH (pos, bal_node, bals) {
1071 if (slave->tx_bytes > pos->tx_bytes) {
1072 break;
1073 }
1074 }
1075 list_insert(&pos->bal_node, &slave->bal_node);
1076 }
1077
1078 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1079 * that descending order of 'tx_bytes' is maintained. */
1080 static void
1081 reinsert_bal(struct list *bals, struct bond_slave *slave)
1082 {
1083 list_remove(&slave->bal_node);
1084 insert_bal(bals, slave);
1085 }
1086
1087 /* If 'bond' needs rebalancing, does so.
1088 *
1089 * The caller should have called bond_account() for each active flow, or in case
1090 * of recirculation is used, have called bond_recirculation_account(bond),
1091 * to ensure that flow data is consistently accounted at this point.
1092 *
1093 * Return whether rebalancing took place.*/
1094 bool
1095 bond_rebalance(struct bond *bond)
1096 {
1097 struct bond_slave *slave;
1098 struct bond_entry *e;
1099 struct list bals;
1100 bool rebalanced = false;
1101
1102 ovs_rwlock_wrlock(&rwlock);
1103 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1104 goto done;
1105 }
1106 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1107
1108 /* Add each bond_entry to its slave's 'entries' list.
1109 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1110 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1111 slave->tx_bytes = 0;
1112 list_init(&slave->entries);
1113 }
1114 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1115 if (e->slave && e->tx_bytes) {
1116 e->slave->tx_bytes += e->tx_bytes;
1117 list_push_back(&e->slave->entries, &e->list_node);
1118 }
1119 }
1120
1121 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1122 *
1123 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1124 * with a proper list sort algorithm. */
1125 list_init(&bals);
1126 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1127 if (slave->enabled) {
1128 insert_bal(&bals, slave);
1129 }
1130 }
1131 log_bals(bond, &bals);
1132
1133 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1134 while (!list_is_short(&bals)) {
1135 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1136 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1137 uint64_t overload;
1138
1139 overload = from->tx_bytes - to->tx_bytes;
1140 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1141 /* The extra load on 'from' (and all less-loaded slaves), compared
1142 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1143 * it is less than ~1Mbps. No point in rebalancing. */
1144 break;
1145 }
1146
1147 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1148 * to move from 'from' to 'to'. */
1149 e = choose_entry_to_migrate(from, to->tx_bytes);
1150 if (e) {
1151 bond_shift_load(e, to);
1152
1153 /* Delete element from from->entries.
1154 *
1155 * We don't add the element to to->hashes. That would only allow
1156 * 'e' to be migrated to another slave in this rebalancing run, and
1157 * there is no point in doing that. */
1158 list_remove(&e->list_node);
1159
1160 /* Re-sort 'bals'. */
1161 reinsert_bal(&bals, from);
1162 reinsert_bal(&bals, to);
1163 rebalanced = true;
1164 } else {
1165 /* Can't usefully migrate anything away from 'from'.
1166 * Don't reconsider it. */
1167 list_remove(&from->bal_node);
1168 }
1169 }
1170
1171 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1172 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1173 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1174 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1175 e->tx_bytes /= 2;
1176 if (!e->tx_bytes) {
1177 e->slave = NULL;
1178 }
1179 }
1180
1181 done:
1182 ovs_rwlock_unlock(&rwlock);
1183 return rebalanced;
1184 }
1185 \f
1186 /* Bonding unixctl user interface functions. */
1187
1188 static struct bond *
1189 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
1190 {
1191 struct bond *bond;
1192
1193 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1194 all_bonds) {
1195 if (!strcmp(bond->name, name)) {
1196 return bond;
1197 }
1198 }
1199 return NULL;
1200 }
1201
1202 static struct bond_slave *
1203 bond_lookup_slave(struct bond *bond, const char *slave_name)
1204 {
1205 struct bond_slave *slave;
1206
1207 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1208 if (!strcmp(slave->name, slave_name)) {
1209 return slave;
1210 }
1211 }
1212 return NULL;
1213 }
1214
1215 static void
1216 bond_unixctl_list(struct unixctl_conn *conn,
1217 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1218 void *aux OVS_UNUSED)
1219 {
1220 struct ds ds = DS_EMPTY_INITIALIZER;
1221 const struct bond *bond;
1222
1223 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
1224
1225 ovs_rwlock_rdlock(&rwlock);
1226 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1227 const struct bond_slave *slave;
1228 size_t i;
1229
1230 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1231 bond_mode_to_string(bond->balance), bond->recirc_id);
1232
1233 i = 0;
1234 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1235 if (i++ > 0) {
1236 ds_put_cstr(&ds, ", ");
1237 }
1238 ds_put_cstr(&ds, slave->name);
1239 }
1240 ds_put_char(&ds, '\n');
1241 }
1242 ovs_rwlock_unlock(&rwlock);
1243 unixctl_command_reply(conn, ds_cstr(&ds));
1244 ds_destroy(&ds);
1245 }
1246
1247 static void
1248 bond_print_details(struct ds *ds, const struct bond *bond)
1249 OVS_REQ_RDLOCK(rwlock)
1250 {
1251 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1252 const struct shash_node **sorted_slaves = NULL;
1253 const struct bond_slave *slave;
1254 bool may_recirc;
1255 uint32_t recirc_id;
1256 int i;
1257
1258 ds_put_format(ds, "---- %s ----\n", bond->name);
1259 ds_put_format(ds, "bond_mode: %s\n",
1260 bond_mode_to_string(bond->balance));
1261
1262 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1263 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1264 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1265
1266 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1267
1268 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1269 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1270
1271 if (bond_is_balanced(bond)) {
1272 ds_put_format(ds, "next rebalance: %lld ms\n",
1273 bond->next_rebalance - time_msec());
1274 }
1275
1276 ds_put_cstr(ds, "lacp_status: ");
1277 switch (bond->lacp_status) {
1278 case LACP_NEGOTIATED:
1279 ds_put_cstr(ds, "negotiated\n");
1280 break;
1281 case LACP_CONFIGURED:
1282 ds_put_cstr(ds, "configured\n");
1283 break;
1284 case LACP_DISABLED:
1285 ds_put_cstr(ds, "off\n");
1286 break;
1287 default:
1288 ds_put_cstr(ds, "<unknown>\n");
1289 break;
1290 }
1291
1292 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1293 shash_add(&slave_shash, slave->name, slave);
1294 }
1295 sorted_slaves = shash_sort(&slave_shash);
1296
1297 for (i = 0; i < shash_count(&slave_shash); i++) {
1298 struct bond_entry *be;
1299
1300 slave = sorted_slaves[i]->data;
1301
1302 /* Basic info. */
1303 ds_put_format(ds, "\nslave %s: %s\n",
1304 slave->name, slave->enabled ? "enabled" : "disabled");
1305 if (slave == bond->active_slave) {
1306 ds_put_cstr(ds, "\tactive slave\n");
1307 }
1308 if (slave->delay_expires != LLONG_MAX) {
1309 ds_put_format(ds, "\t%s expires in %lld ms\n",
1310 slave->enabled ? "downdelay" : "updelay",
1311 slave->delay_expires - time_msec());
1312 }
1313
1314 ds_put_format(ds, "\tmay_enable: %s\n",
1315 slave->may_enable ? "true" : "false");
1316
1317 if (!bond_is_balanced(bond)) {
1318 continue;
1319 }
1320
1321 /* Hashes. */
1322 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1323 int hash = be - bond->hash;
1324
1325 if (be->slave != slave) {
1326 continue;
1327 }
1328
1329 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1330 hash, be->tx_bytes / 1024);
1331
1332 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1333 }
1334 }
1335 shash_destroy(&slave_shash);
1336 free(sorted_slaves);
1337 ds_put_cstr(ds, "\n");
1338 }
1339
1340 static void
1341 bond_unixctl_show(struct unixctl_conn *conn,
1342 int argc, const char *argv[],
1343 void *aux OVS_UNUSED)
1344 {
1345 struct ds ds = DS_EMPTY_INITIALIZER;
1346
1347 ovs_rwlock_rdlock(&rwlock);
1348 if (argc > 1) {
1349 const struct bond *bond = bond_find(argv[1]);
1350
1351 if (!bond) {
1352 unixctl_command_reply_error(conn, "no such bond");
1353 goto out;
1354 }
1355 bond_print_details(&ds, bond);
1356 } else {
1357 const struct bond *bond;
1358
1359 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1360 bond_print_details(&ds, bond);
1361 }
1362 }
1363
1364 unixctl_command_reply(conn, ds_cstr(&ds));
1365 ds_destroy(&ds);
1366
1367 out:
1368 ovs_rwlock_unlock(&rwlock);
1369 }
1370
1371 static void
1372 bond_unixctl_migrate(struct unixctl_conn *conn,
1373 int argc OVS_UNUSED, const char *argv[],
1374 void *aux OVS_UNUSED)
1375 {
1376 const char *bond_s = argv[1];
1377 const char *hash_s = argv[2];
1378 const char *slave_s = argv[3];
1379 struct bond *bond;
1380 struct bond_slave *slave;
1381 struct bond_entry *entry;
1382 int hash;
1383
1384 ovs_rwlock_wrlock(&rwlock);
1385 bond = bond_find(bond_s);
1386 if (!bond) {
1387 unixctl_command_reply_error(conn, "no such bond");
1388 goto out;
1389 }
1390
1391 if (bond->balance != BM_SLB) {
1392 unixctl_command_reply_error(conn, "not an SLB bond");
1393 goto out;
1394 }
1395
1396 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1397 hash = atoi(hash_s) & BOND_MASK;
1398 } else {
1399 unixctl_command_reply_error(conn, "bad hash");
1400 goto out;
1401 }
1402
1403 slave = bond_lookup_slave(bond, slave_s);
1404 if (!slave) {
1405 unixctl_command_reply_error(conn, "no such slave");
1406 goto out;
1407 }
1408
1409 if (!slave->enabled) {
1410 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1411 goto out;
1412 }
1413
1414 entry = &bond->hash[hash];
1415 bond->bond_revalidate = true;
1416 entry->slave = slave;
1417 unixctl_command_reply(conn, "migrated");
1418
1419 out:
1420 ovs_rwlock_unlock(&rwlock);
1421 }
1422
1423 static void
1424 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1425 int argc OVS_UNUSED, const char *argv[],
1426 void *aux OVS_UNUSED)
1427 {
1428 const char *bond_s = argv[1];
1429 const char *slave_s = argv[2];
1430 struct bond *bond;
1431 struct bond_slave *slave;
1432
1433 ovs_rwlock_wrlock(&rwlock);
1434 bond = bond_find(bond_s);
1435 if (!bond) {
1436 unixctl_command_reply_error(conn, "no such bond");
1437 goto out;
1438 }
1439
1440 slave = bond_lookup_slave(bond, slave_s);
1441 if (!slave) {
1442 unixctl_command_reply_error(conn, "no such slave");
1443 goto out;
1444 }
1445
1446 if (!slave->enabled) {
1447 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1448 goto out;
1449 }
1450
1451 if (bond->active_slave != slave) {
1452 bond->bond_revalidate = true;
1453 bond->active_slave = slave;
1454 VLOG_INFO("bond %s: active interface is now %s",
1455 bond->name, slave->name);
1456 bond->send_learning_packets = true;
1457 unixctl_command_reply(conn, "done");
1458 } else {
1459 unixctl_command_reply(conn, "no change");
1460 }
1461 out:
1462 ovs_rwlock_unlock(&rwlock);
1463 }
1464
1465 static void
1466 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1467 {
1468 const char *bond_s = argv[1];
1469 const char *slave_s = argv[2];
1470 struct bond *bond;
1471 struct bond_slave *slave;
1472
1473 ovs_rwlock_wrlock(&rwlock);
1474 bond = bond_find(bond_s);
1475 if (!bond) {
1476 unixctl_command_reply_error(conn, "no such bond");
1477 goto out;
1478 }
1479
1480 slave = bond_lookup_slave(bond, slave_s);
1481 if (!slave) {
1482 unixctl_command_reply_error(conn, "no such slave");
1483 goto out;
1484 }
1485
1486 bond_enable_slave(slave, enable);
1487 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1488
1489 out:
1490 ovs_rwlock_unlock(&rwlock);
1491 }
1492
1493 static void
1494 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1495 int argc OVS_UNUSED, const char *argv[],
1496 void *aux OVS_UNUSED)
1497 {
1498 enable_slave(conn, argv, true);
1499 }
1500
1501 static void
1502 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1503 int argc OVS_UNUSED, const char *argv[],
1504 void *aux OVS_UNUSED)
1505 {
1506 enable_slave(conn, argv, false);
1507 }
1508
1509 static void
1510 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1511 void *aux OVS_UNUSED)
1512 {
1513 const char *mac_s = argv[1];
1514 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1515 const char *basis_s = argc > 3 ? argv[3] : NULL;
1516 uint8_t mac[ETH_ADDR_LEN];
1517 uint8_t hash;
1518 char *hash_cstr;
1519 unsigned int vlan;
1520 uint32_t basis;
1521
1522 if (vlan_s) {
1523 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1524 unixctl_command_reply_error(conn, "invalid vlan");
1525 return;
1526 }
1527 } else {
1528 vlan = 0;
1529 }
1530
1531 if (basis_s) {
1532 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1533 unixctl_command_reply_error(conn, "invalid basis");
1534 return;
1535 }
1536 } else {
1537 basis = 0;
1538 }
1539
1540 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1541 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1542
1543 hash_cstr = xasprintf("%u", hash);
1544 unixctl_command_reply(conn, hash_cstr);
1545 free(hash_cstr);
1546 } else {
1547 unixctl_command_reply_error(conn, "invalid mac");
1548 }
1549 }
1550
1551 void
1552 bond_init(void)
1553 {
1554 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1555 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1556 NULL);
1557 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1558 bond_unixctl_migrate, NULL);
1559 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1560 bond_unixctl_set_active_slave, NULL);
1561 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1562 bond_unixctl_enable_slave, NULL);
1563 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1564 bond_unixctl_disable_slave, NULL);
1565 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1566 bond_unixctl_hash, NULL);
1567 }
1568 \f
1569 static void
1570 bond_entry_reset(struct bond *bond)
1571 {
1572 if (bond->balance != BM_AB) {
1573 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1574
1575 if (!bond->hash) {
1576 bond->hash = xmalloc(hash_len);
1577 }
1578 memset(bond->hash, 0, hash_len);
1579
1580 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1581 } else {
1582 free(bond->hash);
1583 bond->hash = NULL;
1584 }
1585 }
1586
1587 static struct bond_slave *
1588 bond_slave_lookup(struct bond *bond, const void *slave_)
1589 {
1590 struct bond_slave *slave;
1591
1592 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1593 &bond->slaves) {
1594 if (slave->aux == slave_) {
1595 return slave;
1596 }
1597 }
1598
1599 return NULL;
1600 }
1601
1602 static void
1603 bond_enable_slave(struct bond_slave *slave, bool enable)
1604 {
1605 slave->delay_expires = LLONG_MAX;
1606 if (enable != slave->enabled) {
1607 slave->bond->bond_revalidate = true;
1608 slave->enabled = enable;
1609
1610 ovs_mutex_lock(&slave->bond->mutex);
1611 if (enable) {
1612 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1613 } else {
1614 list_remove(&slave->list_node);
1615 }
1616 ovs_mutex_unlock(&slave->bond->mutex);
1617
1618 VLOG_INFO("interface %s: %s", slave->name,
1619 slave->enabled ? "enabled" : "disabled");
1620 }
1621 }
1622
1623 static void
1624 bond_link_status_update(struct bond_slave *slave)
1625 {
1626 struct bond *bond = slave->bond;
1627 bool up;
1628
1629 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1630 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1631 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1632 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1633 slave->name, up ? "up" : "down");
1634 if (up == slave->enabled) {
1635 slave->delay_expires = LLONG_MAX;
1636 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1637 slave->name, up ? "disabled" : "enabled");
1638 } else {
1639 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1640 : up ? bond->updelay : bond->downdelay);
1641 slave->delay_expires = time_msec() + delay;
1642 if (delay) {
1643 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1644 "for %d ms",
1645 slave->name,
1646 up ? "enabled" : "disabled",
1647 up ? "up" : "down",
1648 delay);
1649 }
1650 }
1651 }
1652
1653 if (time_msec() >= slave->delay_expires) {
1654 bond_enable_slave(slave, up);
1655 }
1656 }
1657
1658 static unsigned int
1659 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1660 {
1661 return hash_mac(mac, vlan, basis);
1662 }
1663
1664 static unsigned int
1665 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1666 {
1667 struct flow hash_flow = *flow;
1668 hash_flow.vlan_tci = htons(vlan);
1669
1670 /* The symmetric quality of this hash function is not required, but
1671 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1672 * purposes, so we use it out of convenience. */
1673 return flow_hash_symmetric_l4(&hash_flow, basis);
1674 }
1675
1676 static unsigned int
1677 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1678 {
1679 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1680
1681 return (bond->balance == BM_TCP
1682 ? bond_hash_tcp(flow, vlan, bond->basis)
1683 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1684 }
1685
1686 static struct bond_entry *
1687 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1688 uint16_t vlan)
1689 {
1690 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1691 }
1692
1693 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1694 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1695 * returns NULL. */
1696 static struct bond_slave *
1697 get_enabled_slave(struct bond *bond)
1698 {
1699 struct list *node;
1700
1701 ovs_mutex_lock(&bond->mutex);
1702 if (list_is_empty(&bond->enabled_slaves)) {
1703 ovs_mutex_unlock(&bond->mutex);
1704 return NULL;
1705 }
1706
1707 node = list_pop_front(&bond->enabled_slaves);
1708 list_push_back(&bond->enabled_slaves, node);
1709 ovs_mutex_unlock(&bond->mutex);
1710
1711 return CONTAINER_OF(node, struct bond_slave, list_node);
1712 }
1713
1714 static struct bond_slave *
1715 choose_output_slave(const struct bond *bond, const struct flow *flow,
1716 struct flow_wildcards *wc, uint16_t vlan)
1717 {
1718 struct bond_entry *e;
1719 int balance;
1720
1721 balance = bond->balance;
1722 if (bond->lacp_status == LACP_CONFIGURED) {
1723 /* LACP has been configured on this bond but negotiations were
1724 * unsuccussful. If lacp_fallback_ab is enabled use active-
1725 * backup mode else drop all traffic. */
1726 if (!bond->lacp_fallback_ab) {
1727 return NULL;
1728 }
1729 balance = BM_AB;
1730 }
1731
1732 switch (balance) {
1733 case BM_AB:
1734 return bond->active_slave;
1735
1736 case BM_TCP:
1737 if (bond->lacp_status != LACP_NEGOTIATED) {
1738 /* Must have LACP negotiations for TCP balanced bonds. */
1739 return NULL;
1740 }
1741 if (wc) {
1742 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1743 }
1744 /* Fall Through. */
1745 case BM_SLB:
1746 if (wc) {
1747 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1748 }
1749 e = lookup_bond_entry(bond, flow, vlan);
1750 if (!e->slave || !e->slave->enabled) {
1751 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
1752 }
1753 return e->slave;
1754
1755 default:
1756 OVS_NOT_REACHED();
1757 }
1758 }
1759
1760 static struct bond_slave *
1761 bond_choose_slave(const struct bond *bond)
1762 {
1763 struct bond_slave *slave, *best;
1764
1765 /* Find an enabled slave. */
1766 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1767 if (slave->enabled) {
1768 return slave;
1769 }
1770 }
1771
1772 /* All interfaces are disabled. Find an interface that will be enabled
1773 * after its updelay expires. */
1774 best = NULL;
1775 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1776 if (slave->delay_expires != LLONG_MAX
1777 && slave->may_enable
1778 && (!best || slave->delay_expires < best->delay_expires)) {
1779 best = slave;
1780 }
1781 }
1782 return best;
1783 }
1784
1785 static void
1786 bond_choose_active_slave(struct bond *bond)
1787 {
1788 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1789 struct bond_slave *old_active_slave = bond->active_slave;
1790
1791 bond->active_slave = bond_choose_slave(bond);
1792 if (bond->active_slave) {
1793 if (bond->active_slave->enabled) {
1794 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1795 bond->name, bond->active_slave->name);
1796 } else {
1797 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1798 "remaining %lld ms updelay (since no interface was "
1799 "enabled)", bond->name, bond->active_slave->name,
1800 bond->active_slave->delay_expires - time_msec());
1801 bond_enable_slave(bond->active_slave, true);
1802 }
1803
1804 bond->send_learning_packets = true;
1805 } else if (old_active_slave) {
1806 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1807 }
1808 }
1809
1810 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1811 * bond interface. */
1812 static void
1813 bond_update_fake_slave_stats(struct bond *bond)
1814 {
1815 struct netdev_stats bond_stats;
1816 struct bond_slave *slave;
1817 struct netdev *bond_dev;
1818
1819 memset(&bond_stats, 0, sizeof bond_stats);
1820
1821 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1822 struct netdev_stats slave_stats;
1823
1824 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1825 /* XXX: We swap the stats here because they are swapped back when
1826 * reported by the internal device. The reason for this is
1827 * internal devices normally represent packets going into the
1828 * system but when used as fake bond device they represent packets
1829 * leaving the system. We really should do this in the internal
1830 * device itself because changing it here reverses the counts from
1831 * the perspective of the switch. However, the internal device
1832 * doesn't know what type of device it represents so we have to do
1833 * it here for now. */
1834 bond_stats.tx_packets += slave_stats.rx_packets;
1835 bond_stats.tx_bytes += slave_stats.rx_bytes;
1836 bond_stats.rx_packets += slave_stats.tx_packets;
1837 bond_stats.rx_bytes += slave_stats.tx_bytes;
1838 }
1839 }
1840
1841 if (!netdev_open(bond->name, "system", &bond_dev)) {
1842 netdev_set_stats(bond_dev, &bond_stats);
1843 netdev_close(bond_dev);
1844 }
1845 }