]> git.proxmox.com Git - mirror_ovs.git/blame - ofproto/bond.c
datapath: Allow each vport to have an array of 'port_id's.
[mirror_ovs.git] / ofproto / bond.c
CommitLineData
f620b43a 1/*
8917f72c 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a 25
adcf00ba
AZ
26#include "ofp-util.h"
27#include "ofp-actions.h"
28#include "ofpbuf.h"
29#include "ofproto/ofproto-provider.h"
30#include "ofproto/ofproto-dpif.h"
da4a6191 31#include "connectivity.h"
f620b43a
BP
32#include "coverage.h"
33#include "dynamic-string.h"
34#include "flow.h"
35#include "hmap.h"
bdebeece 36#include "lacp.h"
f620b43a
BP
37#include "list.h"
38#include "netdev.h"
39#include "odp-util.h"
40#include "ofpbuf.h"
41#include "packets.h"
42#include "poll-loop.h"
da4a6191 43#include "seq.h"
adcf00ba 44#include "match.h"
fc1d4f01 45#include "shash.h"
f620b43a
BP
46#include "timeval.h"
47#include "unixctl.h"
48#include "vlog.h"
49
50VLOG_DEFINE_THIS_MODULE(bond);
51
f1c8a79c
AW
52static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
53static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
55
9e1a6910 56/* Bit-mask for hashing a flow down to a bucket. */
f620b43a 57#define BOND_MASK 0xff
9e1a6910 58#define BOND_BUCKETS (BOND_MASK + 1)
adcf00ba 59#define RECIRC_RULE_PRIORITY 20 /* Priority level for internal rules */
f620b43a
BP
60
61/* A hash bucket for mapping a flow to a slave.
9e1a6910 62 * "struct bond" has an array of BOND_BUCKETS of these. */
f620b43a
BP
63struct bond_entry {
64 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
c6855ec5
JS
65 uint64_t tx_bytes /* Count of bytes recently transmitted. */
66 OVS_GUARDED_BY(rwlock);
f620b43a 67 struct list list_node; /* In bond_slave's 'entries' list. */
adcf00ba 68
c6855ec5
JS
69 /* Recirculation.
70 *
71 * 'pr_rule' is the post-recirculation rule for this entry.
72 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
73 * is used to determine delta (applied to 'tx_bytes' above.) */
74 struct rule *pr_rule;
75 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
f620b43a
BP
76};
77
78/* A bond slave, that is, one of the links comprising a bond. */
79struct bond_slave {
80 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
f1c8a79c 81 struct list list_node; /* In struct bond's enabled_slaves list. */
f620b43a
BP
82 struct bond *bond; /* The bond that contains this slave. */
83 void *aux; /* Client-provided handle for this slave. */
84
85 struct netdev *netdev; /* Network device, owned by the client. */
1ea24138 86 unsigned int change_seq; /* Tracks changes in 'netdev'. */
adcf00ba 87 ofp_port_t ofp_port; /* Open flow port number */
f620b43a
BP
88 char *name; /* Name (a copy of netdev_get_name(netdev)). */
89
90 /* Link status. */
91 long long delay_expires; /* Time after which 'enabled' may change. */
f620b43a 92 bool enabled; /* May be chosen for flows? */
296f6519 93 bool may_enable; /* Client considers this slave bondable. */
f620b43a
BP
94
95 /* Rebalancing info. Used only by bond_rebalance(). */
96 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
97 struct list entries; /* 'struct bond_entry's assigned here. */
98 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
99};
100
101/* A bond, that is, a set of network devices grouped to improve performance or
102 * robustness. */
103struct bond {
104 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
105 char *name; /* Name provided by client. */
adcf00ba 106 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
f620b43a
BP
107
108 /* Slaves. */
109 struct hmap slaves;
110
f1c8a79c
AW
111 /* Enabled slaves.
112 *
113 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
114 * (To prevent the bond_slave from disappearing they must also hold
115 * 'rwlock'.) */
116 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
117 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
118
f620b43a
BP
119 /* Bonding info. */
120 enum bond_mode balance; /* Balancing mode, one of BM_*. */
121 struct bond_slave *active_slave;
f620b43a 122 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
bdebeece 123 enum lacp_status lacp_status; /* Status of LACP negotiations. */
62904702 124 bool bond_revalidate; /* True if flows need revalidation. */
672d18b2 125 uint32_t basis; /* Basis for flow hash function. */
f620b43a
BP
126
127 /* SLB specific bonding info. */
9e1a6910 128 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
f620b43a
BP
129 int rebalance_interval; /* Interval between rebalances, in ms. */
130 long long int next_rebalance; /* Next rebalancing time. */
131 bool send_learning_packets;
adcf00ba
AZ
132 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
133 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
f620b43a 134
f620b43a
BP
135 /* Legacy compatibility. */
136 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
9dd165e0 137 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
f620b43a 138
37bec3d3 139 struct ovs_refcount ref_cnt;
f620b43a
BP
140};
141
adcf00ba
AZ
/* The action pending for a 'struct bond_pr_rule_op'. */
enum bond_op {
    ADD,    /* Add the rule to ofproto's flow table. */
    DEL,    /* Delete the rule from ofproto's flow table. */
};
147
148/* A rule to add to or delete from ofproto's internal flow table. */
149struct bond_pr_rule_op {
150 struct hmap_node hmap_node;
151 struct match match;
152 ofp_port_t out_ofport;
153 enum bond_op op;
154 struct rule *pr_rule;
155};
156
3bfd3972
EJ
157static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
158static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
159 OVS_REQ_RDLOCK(rwlock);
4a1b8f30
EJ
160static void bond_enable_slave(struct bond_slave *, bool enable)
161 OVS_REQ_WRLOCK(rwlock);
162static void bond_link_status_update(struct bond_slave *)
3bfd3972 163 OVS_REQ_WRLOCK(rwlock);
4a1b8f30 164static void bond_choose_active_slave(struct bond *)
9e1a6910 165 OVS_REQ_WRLOCK(rwlock);
f620b43a 166static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
672d18b2
EJ
167 uint16_t vlan, uint32_t basis);
168static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
169 uint32_t basis);
f620b43a
BP
170static struct bond_entry *lookup_bond_entry(const struct bond *,
171 const struct flow *,
3bfd3972
EJ
172 uint16_t vlan)
173 OVS_REQ_RDLOCK(rwlock);
f1c8a79c
AW
174static struct bond_slave *get_enabled_slave(struct bond *)
175 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
176static struct bond_slave *choose_output_slave(const struct bond *,
177 const struct flow *,
bcd2633a 178 struct flow_wildcards *,
4a1b8f30 179 uint16_t vlan)
3bfd3972
EJ
180 OVS_REQ_RDLOCK(rwlock);
181static void bond_update_fake_slave_stats(struct bond *)
182 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
183
184/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
185 * stores the mode in '*balance' and returns true. Otherwise returns false
186 * without modifying '*balance'. */
187bool
188bond_mode_from_string(enum bond_mode *balance, const char *s)
189{
190 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
191 *balance = BM_TCP;
192 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
193 *balance = BM_SLB;
194 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
195 *balance = BM_AB;
196 } else {
197 return false;
198 }
199 return true;
200}
201
202/* Returns a string representing 'balance'. */
203const char *
204bond_mode_to_string(enum bond_mode balance) {
205 switch (balance) {
206 case BM_TCP:
207 return "balance-tcp";
208 case BM_SLB:
209 return "balance-slb";
210 case BM_AB:
211 return "active-backup";
212 }
428b2edd 213 OVS_NOT_REACHED();
f620b43a
BP
214}
215
f620b43a
BP
216\f
217/* Creates and returns a new bond whose configuration is initially taken from
218 * 's'.
219 *
220 * The caller should register each slave on the new bond by calling
221 * bond_slave_register(). */
222struct bond *
adcf00ba 223bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
f620b43a
BP
224{
225 struct bond *bond;
226
227 bond = xzalloc(sizeof *bond);
adcf00ba 228 bond->ofproto = ofproto;
f620b43a 229 hmap_init(&bond->slaves);
f1c8a79c
AW
230 list_init(&bond->enabled_slaves);
231 ovs_mutex_init(&bond->mutex);
f620b43a 232 bond->next_fake_iface_update = LLONG_MAX;
37bec3d3 233 ovs_refcount_init(&bond->ref_cnt);
f620b43a 234
adcf00ba
AZ
235 bond->recirc_id = 0;
236 hmap_init(&bond->pr_rule_ops);
237
f620b43a 238 bond_reconfigure(bond, s);
f620b43a
BP
239 return bond;
240}
241
03366a2d
EJ
242struct bond *
243bond_ref(const struct bond *bond_)
244{
245 struct bond *bond = CONST_CAST(struct bond *, bond_);
246
bca0b3b4 247 if (bond) {
37bec3d3 248 ovs_refcount_ref(&bond->ref_cnt);
bca0b3b4 249 }
03366a2d
EJ
250 return bond;
251}
252
f620b43a
BP
253/* Frees 'bond'. */
254void
03366a2d 255bond_unref(struct bond *bond)
f620b43a
BP
256{
257 struct bond_slave *slave, *next_slave;
adcf00ba 258 struct bond_pr_rule_op *pr_op, *next_op;
f620b43a 259
37bec3d3 260 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
03366a2d
EJ
261 return;
262 }
263
3bfd3972
EJ
264 ovs_rwlock_wrlock(&rwlock);
265 hmap_remove(all_bonds, &bond->hmap_node);
266 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
267
268 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
269 hmap_remove(&bond->slaves, &slave->hmap_node);
270 /* Client owns 'slave->netdev'. */
271 free(slave->name);
272 free(slave);
273 }
274 hmap_destroy(&bond->slaves);
275
f1c8a79c 276 ovs_mutex_destroy(&bond->mutex);
f620b43a 277 free(bond->hash);
f620b43a 278 free(bond->name);
adcf00ba
AZ
279
280 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
281 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
282 free(pr_op);
283 }
284 hmap_destroy(&bond->pr_rule_ops);
285
286 if (bond->recirc_id) {
287 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
288 }
289
f620b43a
BP
290 free(bond);
291}
292
adcf00ba
AZ
293static void
294add_pr_rule(struct bond *bond, const struct match *match,
295 ofp_port_t out_ofport, struct rule *rule)
296{
297 uint32_t hash = match_hash(match, 0);
298 struct bond_pr_rule_op *pr_op;
299
300 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
301 if (match_equal(&pr_op->match, match)) {
302 pr_op->op = ADD;
303 pr_op->out_ofport = out_ofport;
304 pr_op->pr_rule = rule;
305 return;
306 }
307 }
308
309 pr_op = xmalloc(sizeof *pr_op);
310 pr_op->match = *match;
311 pr_op->op = ADD;
312 pr_op->out_ofport = out_ofport;
313 pr_op->pr_rule = rule;
314 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
315}
316
317static void
318update_recirc_rules(struct bond *bond)
319{
320 struct match match;
321 struct bond_pr_rule_op *pr_op, *next_op;
322 uint64_t ofpacts_stub[128 / 8];
323 struct ofpbuf ofpacts;
324 int i;
325
326 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
327
328 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
329 pr_op->op = DEL;
330 }
331
332 if ((bond->hash == NULL) || (!bond->recirc_id)) {
333 return;
334 }
335
9e1a6910 336 for (i = 0; i < BOND_BUCKETS; i++) {
adcf00ba
AZ
337 struct bond_slave *slave = bond->hash[i].slave;
338
339 if (slave) {
340 match_init_catchall(&match);
341 match_set_recirc_id(&match, bond->recirc_id);
342 /* recirc_id -> metadata to speed up look ups. */
343 match_set_metadata(&match, htonll(bond->recirc_id));
344 match_set_dp_hash_masked(&match, i, BOND_MASK);
345
346 add_pr_rule(bond, &match, slave->ofp_port,
347 bond->hash[i].pr_rule);
348 }
349 }
350
351 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
352 int error;
353 struct rule *rule;
354 switch (pr_op->op) {
355 case ADD:
356 ofpbuf_clear(&ofpacts);
357 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
358 error = ofproto_dpif_add_internal_flow(bond->ofproto,
359 &pr_op->match,
360 RECIRC_RULE_PRIORITY,
361 &ofpacts, &rule);
362 if (error) {
363 char *err_s = match_to_string(&pr_op->match,
364 RECIRC_RULE_PRIORITY);
365
366 VLOG_ERR("failed to add post recirculation flow %s", err_s);
367 free(err_s);
368 pr_op->pr_rule = NULL;
369 } else {
370 pr_op->pr_rule = rule;
371 }
372 break;
373
374 case DEL:
375 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
376 &pr_op->match,
377 RECIRC_RULE_PRIORITY);
378 if (error) {
379 char *err_s = match_to_string(&pr_op->match,
380 RECIRC_RULE_PRIORITY);
381
382 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
383 free(err_s);
384 }
385
386 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
387 pr_op->pr_rule = NULL;
388 free(pr_op);
389 break;
390 }
391 }
392
393 ofpbuf_uninit(&ofpacts);
394}
395
396
f620b43a
BP
397/* Updates 'bond''s overall configuration to 's'.
398 *
399 * The caller should register each slave on 'bond' by calling
400 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 401 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
402 *
403 * Returns true if the configuration has changed in such a way that requires
404 * flow revalidation.
405 * */
406bool
f620b43a
BP
407bond_reconfigure(struct bond *bond, const struct bond_settings *s)
408{
59d7b2b6
EJ
409 bool revalidate = false;
410
3bfd3972 411 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
412 if (!bond->name || strcmp(bond->name, s->name)) {
413 if (bond->name) {
3bfd3972 414 hmap_remove(all_bonds, &bond->hmap_node);
f620b43a
BP
415 free(bond->name);
416 }
417 bond->name = xstrdup(s->name);
3bfd3972 418 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
f620b43a
BP
419 }
420
f620b43a
BP
421 bond->updelay = s->up_delay;
422 bond->downdelay = s->down_delay;
bc1b010c 423
9dd165e0
RK
424 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
425 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
426 revalidate = true;
427 }
428
bc1b010c
EJ
429 if (bond->rebalance_interval != s->rebalance_interval) {
430 bond->rebalance_interval = s->rebalance_interval;
431 revalidate = true;
432 }
f620b43a 433
59d7b2b6
EJ
434 if (bond->balance != s->balance) {
435 bond->balance = s->balance;
436 revalidate = true;
437 }
438
672d18b2
EJ
439 if (bond->basis != s->basis) {
440 bond->basis = s->basis;
441 revalidate = true;
442 }
443
f620b43a
BP
444 if (s->fake_iface) {
445 if (bond->next_fake_iface_update == LLONG_MAX) {
446 bond->next_fake_iface_update = time_msec();
447 }
448 } else {
449 bond->next_fake_iface_update = LLONG_MAX;
450 }
59d7b2b6 451
62904702
EJ
452 if (bond->bond_revalidate) {
453 revalidate = true;
454 bond->bond_revalidate = false;
455 }
456
adcf00ba
AZ
457 if (bond->balance != BM_AB) {
458 if (!bond->recirc_id) {
459 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
460 }
461 } else if (bond->recirc_id) {
462 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
463 bond->recirc_id = 0;
464 }
465
95aafb2a
EJ
466 if (bond->balance == BM_AB || !bond->hash || revalidate) {
467 bond_entry_reset(bond);
468 }
469
3bfd3972 470 ovs_rwlock_unlock(&rwlock);
59d7b2b6 471 return revalidate;
f620b43a
BP
472}
473
f8ddccd2 474static void
1ea24138 475bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
3bfd3972 476 OVS_REQ_WRLOCK(rwlock)
f8ddccd2
BP
477{
478 if (slave->netdev != netdev) {
f8ddccd2 479 slave->netdev = netdev;
1ea24138 480 slave->change_seq = 0;
f8ddccd2
BP
481 }
482}
483
f620b43a
BP
484/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
485 * arbitrary client-provided pointer that uniquely identifies a slave within a
486 * bond. If 'slave_' already exists within 'bond' then this function
487 * reconfigures the existing slave.
488 *
489 * 'netdev' must be the network device that 'slave_' represents. It is owned
490 * by the client, so the client must not close it before either unregistering
491 * 'slave_' or destroying 'bond'.
4d6fb5eb 492 */
f620b43a 493void
adcf00ba
AZ
494bond_slave_register(struct bond *bond, void *slave_,
495 ofp_port_t ofport, struct netdev *netdev)
f620b43a 496{
3bfd3972 497 struct bond_slave *slave;
f620b43a 498
3bfd3972
EJ
499 ovs_rwlock_wrlock(&rwlock);
500 slave = bond_slave_lookup(bond, slave_);
f620b43a
BP
501 if (!slave) {
502 slave = xzalloc(sizeof *slave);
503
504 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
505 slave->bond = bond;
506 slave->aux = slave_;
adcf00ba 507 slave->ofp_port = ofport;
f620b43a 508 slave->delay_expires = LLONG_MAX;
244b2160 509 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 510 bond->bond_revalidate = true;
244b2160 511
b3c18f66 512 slave->enabled = false;
4a1b8f30 513 bond_enable_slave(slave, netdev_get_carrier(netdev));
f620b43a
BP
514 }
515
1ea24138 516 bond_slave_set_netdev__(slave, netdev);
a6934aa9 517
f620b43a
BP
518 free(slave->name);
519 slave->name = xstrdup(netdev_get_name(netdev));
3bfd3972 520 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
521}
522
f8ddccd2
BP
523/* Updates the network device to be used with 'slave_' to 'netdev'.
524 *
525 * This is useful if the caller closes and re-opens the network device
526 * registered with bond_slave_register() but doesn't need to change anything
527 * else. */
528void
529bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
530{
3bfd3972
EJ
531 struct bond_slave *slave;
532
533 ovs_rwlock_wrlock(&rwlock);
534 slave = bond_slave_lookup(bond, slave_);
f8ddccd2 535 if (slave) {
1ea24138 536 bond_slave_set_netdev__(slave, netdev);
f8ddccd2 537 }
3bfd3972 538 ovs_rwlock_unlock(&rwlock);
f8ddccd2
BP
539}
540
f620b43a
BP
541/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
542 * then this function has no effect.
543 *
544 * Unregistering a slave invalidates all flows. */
545void
546bond_slave_unregister(struct bond *bond, const void *slave_)
547{
3bfd3972 548 struct bond_slave *slave;
f620b43a
BP
549 bool del_active;
550
3bfd3972
EJ
551 ovs_rwlock_wrlock(&rwlock);
552 slave = bond_slave_lookup(bond, slave_);
f620b43a 553 if (!slave) {
3bfd3972 554 goto out;
f620b43a
BP
555 }
556
4a1b8f30
EJ
557 bond->bond_revalidate = true;
558 bond_enable_slave(slave, false);
b3c18f66 559
f620b43a
BP
560 del_active = bond->active_slave == slave;
561 if (bond->hash) {
562 struct bond_entry *e;
563 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
564 if (e->slave == slave) {
565 e->slave = NULL;
566 }
567 }
568 }
569
570 free(slave->name);
571
572 hmap_remove(&bond->slaves, &slave->hmap_node);
573 /* Client owns 'slave->netdev'. */
574 free(slave);
575
576 if (del_active) {
4a1b8f30 577 bond_choose_active_slave(bond);
f620b43a
BP
578 bond->send_learning_packets = true;
579 }
3bfd3972
EJ
580out:
581 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
582}
583
296f6519
EJ
584/* Should be called on each slave in 'bond' before bond_run() to indicate
585 * whether or not 'slave_' may be enabled. This function is intended to allow
586 * other protocols to have some impact on bonding decisions. For example LACP
587 * or high level link monitoring protocols may decide that a given slave should
588 * not be able to send traffic. */
4d6fb5eb 589void
296f6519 590bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 591{
3bfd3972 592 ovs_rwlock_wrlock(&rwlock);
296f6519 593 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
3bfd3972 594 ovs_rwlock_unlock(&rwlock);
4d6fb5eb
EJ
595}
596
4a1b8f30
EJ
597/* Performs periodic maintenance on 'bond'.
598 *
599 * Returns true if the caller should revalidate its flows.
f620b43a
BP
600 *
601 * The caller should check bond_should_send_learning_packets() afterward. */
4a1b8f30
EJ
602bool
603bond_run(struct bond *bond, enum lacp_status lacp_status)
f620b43a
BP
604{
605 struct bond_slave *slave;
4a1b8f30 606 bool revalidate;
f620b43a 607
3bfd3972 608 ovs_rwlock_wrlock(&rwlock);
bdebeece
EJ
609 if (bond->lacp_status != lacp_status) {
610 bond->lacp_status = lacp_status;
4592d0e2
EJ
611 bond->bond_revalidate = true;
612 }
4d6fb5eb 613
f620b43a
BP
614 /* Enable slaves based on link status and LACP feedback. */
615 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
4a1b8f30 616 bond_link_status_update(slave);
da4a6191 617 slave->change_seq = seq_read(connectivity_seq_get());
f620b43a
BP
618 }
619 if (!bond->active_slave || !bond->active_slave->enabled) {
4a1b8f30 620 bond_choose_active_slave(bond);
f620b43a
BP
621 }
622
623 /* Update fake bond interface stats. */
624 if (time_msec() >= bond->next_fake_iface_update) {
625 bond_update_fake_slave_stats(bond);
626 bond->next_fake_iface_update = time_msec() + 1000;
627 }
628
4a1b8f30
EJ
629 revalidate = bond->bond_revalidate;
630 bond->bond_revalidate = false;
3bfd3972 631 ovs_rwlock_unlock(&rwlock);
4a1b8f30
EJ
632
633 return revalidate;
f620b43a
BP
634}
635
636/* Causes poll_block() to wake up when 'bond' needs something to be done. */
637void
638bond_wait(struct bond *bond)
639{
640 struct bond_slave *slave;
641
3bfd3972 642 ovs_rwlock_rdlock(&rwlock);
f620b43a
BP
643 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
644 if (slave->delay_expires != LLONG_MAX) {
645 poll_timer_wait_until(slave->delay_expires);
646 }
1ea24138 647
da4a6191 648 seq_wait(connectivity_seq_get(), slave->change_seq);
f620b43a
BP
649 }
650
651 if (bond->next_fake_iface_update != LLONG_MAX) {
652 poll_timer_wait_until(bond->next_fake_iface_update);
653 }
654
bbc13389 655 if (bond->bond_revalidate) {
f620b43a
BP
656 poll_immediate_wake();
657 }
3bfd3972 658 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
659
660 /* We don't wait for bond->next_rebalance because rebalancing can only run
661 * at a flow account checkpoint. ofproto does checkpointing on its own
662 * schedule and bond_rebalance() gets called afterward, so we'd just be
663 * waking up for no purpose. */
664}
665\f
666/* MAC learning table interaction. */
667
668static bool
669may_send_learning_packets(const struct bond *bond)
670{
9dd165e0
RK
671 return ((bond->lacp_status == LACP_DISABLED
672 && (bond->balance == BM_SLB || bond->balance == BM_AB))
673 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
bdebeece 674 && bond->active_slave;
f620b43a
BP
675}
676
677/* Returns true if 'bond' needs the client to send out packets to assist with
678 * MAC learning on 'bond'. If this function returns true, then the client
679 * should iterate through its MAC learning table for the bridge on which 'bond'
680 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 681 * it should call bond_compose_learning_packet().
f620b43a 682 *
477879ea
BP
683 * This function will only return true if 'bond' is in SLB or active-backup
684 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
685 * necessary.
f620b43a
BP
686 *
687 * Calling this function resets the state that it checks. */
688bool
689bond_should_send_learning_packets(struct bond *bond)
690{
3bfd3972
EJ
691 bool send;
692
693 ovs_rwlock_wrlock(&rwlock);
694 send = bond->send_learning_packets && may_send_learning_packets(bond);
f620b43a 695 bond->send_learning_packets = false;
3bfd3972 696 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
697 return send;
698}
699
700/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
701 *
ea131871
JG
702 * See bond_should_send_learning_packets() for description of usage. The
703 * caller should send the composed packet on the port associated with
704 * port_aux and takes ownership of the returned ofpbuf. */
705struct ofpbuf *
706bond_compose_learning_packet(struct bond *bond,
707 const uint8_t eth_src[ETH_ADDR_LEN],
708 uint16_t vlan, void **port_aux)
f620b43a
BP
709{
710 struct bond_slave *slave;
ea131871 711 struct ofpbuf *packet;
f620b43a 712 struct flow flow;
f620b43a 713
3bfd3972 714 ovs_rwlock_rdlock(&rwlock);
cb22974d 715 ovs_assert(may_send_learning_packets(bond));
f620b43a
BP
716 memset(&flow, 0, sizeof flow);
717 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
4a1b8f30 718 slave = choose_output_slave(bond, &flow, NULL, vlan);
f620b43a 719
ea131871 720 packet = ofpbuf_new(0);
2ea838ac 721 compose_rarp(packet, eth_src);
f620b43a 722 if (vlan) {
1bf02876 723 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
f620b43a 724 }
f620b43a 725
ea131871 726 *port_aux = slave->aux;
3bfd3972 727 ovs_rwlock_unlock(&rwlock);
ea131871 728 return packet;
f620b43a
BP
729}
730\f
731/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
732 * Ethernet destination address of 'eth_dst', should be admitted.
733 *
734 * The return value is one of the following:
735 *
736 * - BV_ACCEPT: Admit the packet.
737 *
738 * - BV_DROP: Drop the packet.
739 *
740 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
741 * Ethernet source address and VLAN. If there is none, or if the packet
742 * is on the learned port, then admit the packet. If a different port has
743 * been learned, however, drop the packet (and do not use it for MAC
744 * learning).
745 */
746enum bond_verdict
747bond_check_admissibility(struct bond *bond, const void *slave_,
4a1b8f30 748 const uint8_t eth_dst[ETH_ADDR_LEN])
f620b43a 749{
3bfd3972
EJ
750 enum bond_verdict verdict = BV_DROP;
751 struct bond_slave *slave;
9a1c6450 752
3bfd3972
EJ
753 ovs_rwlock_rdlock(&rwlock);
754 slave = bond_slave_lookup(bond, slave_);
4222bbc8 755 if (!slave) {
3bfd3972 756 goto out;
4222bbc8
EJ
757 }
758
9a1c6450
EJ
759 /* LACP bonds have very loose admissibility restrictions because we can
760 * assume the remote switch is aware of the bond and will "do the right
761 * thing". However, as a precaution we drop packets on disabled slaves
762 * because no correctly implemented partner switch should be sending
bdebeece
EJ
763 * packets to them.
764 *
765 * If LACP is configured, but LACP negotiations have been unsuccessful, we
9dd165e0 766 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
bdebeece 767 switch (bond->lacp_status) {
3bfd3972
EJ
768 case LACP_NEGOTIATED:
769 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
770 goto out;
771 case LACP_CONFIGURED:
9dd165e0
RK
772 if (!bond->lacp_fallback_ab) {
773 goto out;
774 }
3bfd3972
EJ
775 case LACP_DISABLED:
776 break;
f620b43a
BP
777 }
778
779 /* Drop all multicast packets on inactive slaves. */
780 if (eth_addr_is_multicast(eth_dst)) {
4222bbc8 781 if (bond->active_slave != slave) {
3bfd3972 782 goto out;
f620b43a
BP
783 }
784 }
785
f931a4c9 786 switch (bond->balance) {
9dd165e0
RK
787 case BM_TCP:
788 /* TCP balanced bonds require successful LACP negotiations. Based on the
789 * above check, LACP is off or lacp_fallback_ab is true on this bond.
790 * If lacp_fallback_ab is true fall through to BM_AB case else, we
791 * drop all incoming traffic. */
792 if (!bond->lacp_fallback_ab) {
793 goto out;
794 }
795
f931a4c9
BP
796 case BM_AB:
797 /* Drop all packets which arrive on backup slaves. This is similar to
798 * how Linux bonding handles active-backup bonds. */
7ba7dcf0
EJ
799 if (bond->active_slave != slave) {
800 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
801
e6b2255c
BP
802 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
803 " slave (%s) destined for " ETH_ADDR_FMT,
804 slave->name, ETH_ADDR_ARGS(eth_dst));
3bfd3972 805 goto out;
7ba7dcf0 806 }
3bfd3972
EJ
807 verdict = BV_ACCEPT;
808 goto out;
f931a4c9 809
f931a4c9
BP
810 case BM_SLB:
811 /* Drop all packets for which we have learned a different input port,
812 * because we probably sent the packet on one slave and got it back on
813 * the other. Gratuitous ARP packets are an exception to this rule:
814 * the host has moved to another switch. The exception to the
815 * exception is if we locked the learning table to avoid reflections on
816 * bond slaves. */
3bfd3972
EJ
817 verdict = BV_DROP_IF_MOVED;
818 goto out;
7ba7dcf0
EJ
819 }
820
428b2edd 821 OVS_NOT_REACHED();
3bfd3972
EJ
822out:
823 ovs_rwlock_unlock(&rwlock);
824 return verdict;
825
f620b43a
BP
826}
827
828/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
829 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
830 * NULL if the packet should be dropped because no slaves are enabled.
831 *
832 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
833 * should be a VID only (i.e. excluding the PCP bits). Second,
834 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
835 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
836 * packet belongs to (so for an access port it will be the access port's VLAN).
837 *
bcd2633a
JP
838 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
839 * significant in the selection. At some point earlier, 'wc' should
840 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
841 */
842void *
843bond_choose_output_slave(struct bond *bond, const struct flow *flow,
4a1b8f30 844 struct flow_wildcards *wc, uint16_t vlan)
f620b43a 845{
3bfd3972 846 struct bond_slave *slave;
b5d5d7d3 847 void *aux;
3bfd3972
EJ
848
849 ovs_rwlock_rdlock(&rwlock);
4a1b8f30 850 slave = choose_output_slave(bond, flow, wc, vlan);
b5d5d7d3 851 aux = slave ? slave->aux : NULL;
3bfd3972 852 ovs_rwlock_unlock(&rwlock);
b5d5d7d3
AW
853
854 return aux;
f620b43a 855}
f620b43a 856\f
adcf00ba
AZ
857/* Recirculation. */
858static void
859bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
c6855ec5 860 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
861{
862 if (entry->slave) {
863 uint64_t delta;
864
865 delta = rule_tx_bytes - entry->pr_tx_bytes;
866 entry->tx_bytes += delta;
867 entry->pr_tx_bytes = rule_tx_bytes;
868 }
869}
870
871/* Maintain bond stats using post recirculation rule byte counters.*/
872void
873bond_recirculation_account(struct bond *bond)
874{
875 int i;
876
c6855ec5 877 ovs_rwlock_wrlock(&rwlock);
adcf00ba
AZ
878 for (i=0; i<=BOND_MASK; i++) {
879 struct bond_entry *entry = &bond->hash[i];
880 struct rule *rule = entry->pr_rule;
881
882 if (rule) {
883 uint64_t n_packets OVS_UNUSED;
884 long long int used OVS_UNUSED;
885 uint64_t n_bytes;
886
887 rule->ofproto->ofproto_class->rule_get_stats(
888 rule, &n_packets, &n_bytes, &used);
889 bond_entry_account(entry, n_bytes);
890 }
891 }
892 ovs_rwlock_unlock(&rwlock);
893}
894
895bool
896bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
897 uint32_t *hash_bias)
898{
899 if (bond->balance == BM_TCP) {
900 if (recirc_id) {
901 *recirc_id = bond->recirc_id;
902 }
903 if (hash_bias) {
904 *hash_bias = bond->basis;
905 }
906 return true;
907 } else {
908 return false;
909 }
910}
911
912void
913bond_update_post_recirc_rules(struct bond* bond, const bool force)
914{
915 struct bond_entry *e;
916 bool update_rules = force; /* Always update rules if caller forces it. */
917
918 /* Make sure all bond entries are populated */
919 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
920 if (!e->slave || !e->slave->enabled) {
921 update_rules = true;
922 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
923 struct bond_slave, hmap_node);
924 if (!e->slave->enabled) {
925 e->slave = bond->active_slave;
926 }
927 }
928 }
929
930 if (update_rules) {
931 update_recirc_rules(bond);
932 }
933}
934\f
f620b43a
BP
935/* Rebalancing. */
936
1b137691 937static bool
3bfd3972 938bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
1b137691 939{
bc1b010c
EJ
940 return bond->rebalance_interval
941 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
942}
943
f620b43a
BP
944/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
945void
946bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
947 uint64_t n_bytes)
948{
3bfd3972 949 ovs_rwlock_wrlock(&rwlock);
1b137691 950 if (bond_is_balanced(bond)) {
f620b43a 951 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a 952 }
3bfd3972 953 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
954}
955
956static struct bond_slave *
3bfd3972 957bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
958{
959 return CONTAINER_OF(bal, struct bond_slave, bal_node);
960}
961
962static void
963log_bals(struct bond *bond, const struct list *bals)
c6855ec5 964 OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
965{
966 if (VLOG_IS_DBG_ENABLED()) {
967 struct ds ds = DS_EMPTY_INITIALIZER;
968 const struct bond_slave *slave;
969
970 LIST_FOR_EACH (slave, bal_node, bals) {
971 if (ds.length) {
972 ds_put_char(&ds, ',');
973 }
974 ds_put_format(&ds, " %s %"PRIu64"kB",
975 slave->name, slave->tx_bytes / 1024);
976
977 if (!slave->enabled) {
978 ds_put_cstr(&ds, " (disabled)");
979 }
980 if (!list_is_empty(&slave->entries)) {
981 struct bond_entry *e;
982
983 ds_put_cstr(&ds, " (");
984 LIST_FOR_EACH (e, list_node, &slave->entries) {
985 if (&e->list_node != list_front(&slave->entries)) {
986 ds_put_cstr(&ds, " + ");
987 }
34582733 988 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
f620b43a
BP
989 e - bond->hash, e->tx_bytes / 1024);
990 }
991 ds_put_cstr(&ds, ")");
992 }
993 }
994 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
995 ds_destroy(&ds);
996 }
997}
998
999/* Shifts 'hash' from its current slave to 'to'. */
1000static void
4a1b8f30 1001bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
c6855ec5 1002 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
1003{
1004 struct bond_slave *from = hash->slave;
1005 struct bond *bond = from->bond;
1006 uint64_t delta = hash->tx_bytes;
1007
34582733 1008 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
f620b43a
BP
1009 "from %s to %s (now carrying %"PRIu64"kB and "
1010 "%"PRIu64"kB load, respectively)",
1011 bond->name, delta / 1024, hash - bond->hash,
1012 from->name, to->name,
1013 (from->tx_bytes - delta) / 1024,
1014 (to->tx_bytes + delta) / 1024);
1015
1016 /* Shift load away from 'from' to 'to'. */
1017 from->tx_bytes -= delta;
1018 to->tx_bytes += delta;
1019
1020 /* Arrange for flows to be revalidated. */
dc30ea2d 1021 hash->slave = to;
4a1b8f30 1022 bond->bond_revalidate = true;
f620b43a
BP
1023}
1024
09a5d390
BP
1025/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1026 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
1027 * given that doing so must decrease the ratio of the load on the two slaves by
1028 * at least 0.1. Returns NULL if there is no appropriate entry.
1029 *
1030 * The list of entries isn't sorted. I don't know of a reason to prefer to
1031 * shift away small hashes or large hashes. */
1032static struct bond_entry *
1033choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
c6855ec5 1034 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
1035{
1036 struct bond_entry *e;
1037
1038 if (list_is_short(&from->entries)) {
1039 /* 'from' carries no more than one MAC hash, so shifting load away from
1040 * it would be pointless. */
1041 return NULL;
1042 }
1043
1044 LIST_FOR_EACH (e, list_node, &from->entries) {
1045 double old_ratio, new_ratio;
1046 uint64_t delta;
1047
1048 if (to_tx_bytes == 0) {
1049 /* Nothing on the new slave, move it. */
1050 return e;
1051 }
1052
1053 delta = e->tx_bytes;
1054 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1055 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
75fad143
ZK
1056 if (old_ratio - new_ratio > 0.1
1057 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1058 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1059 and 'to' slave have the same load. Therefore, we only move an
1060 entry if it decreases the load on 'from', and brings us closer
1061 to equal traffic load. */
f620b43a
BP
1062 return e;
1063 }
1064 }
1065
1066 return NULL;
1067}
1068
1069/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1070 * maintained. */
1071static void
1072insert_bal(struct list *bals, struct bond_slave *slave)
1073{
1074 struct bond_slave *pos;
1075
1076 LIST_FOR_EACH (pos, bal_node, bals) {
1077 if (slave->tx_bytes > pos->tx_bytes) {
1078 break;
1079 }
1080 }
1081 list_insert(&pos->bal_node, &slave->bal_node);
1082}
1083
1084/* Removes 'slave' from its current list and then inserts it into 'bals' so
1085 * that descending order of 'tx_bytes' is maintained. */
1086static void
1087reinsert_bal(struct list *bals, struct bond_slave *slave)
1088{
1089 list_remove(&slave->bal_node);
1090 insert_bal(bals, slave);
1091}
1092
1093/* If 'bond' needs rebalancing, does so.
1094 *
adcf00ba
AZ
1095 * The caller should have called bond_account() for each active flow, or in case
1096 * of recirculation is used, have called bond_recirculation_account(bond),
1097 * to ensure that flow data is consistently accounted at this point.
1098 *
1099 * Return whether rebalancing took place.*/
1100bool
4a1b8f30 1101bond_rebalance(struct bond *bond)
f620b43a
BP
1102{
1103 struct bond_slave *slave;
1104 struct bond_entry *e;
1105 struct list bals;
adcf00ba 1106 bool rebalanced = false;
f620b43a 1107
3bfd3972 1108 ovs_rwlock_wrlock(&rwlock);
1b137691 1109 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
adcf00ba 1110 goto done;
f620b43a
BP
1111 }
1112 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1113
1114 /* Add each bond_entry to its slave's 'entries' list.
1115 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1116 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1117 slave->tx_bytes = 0;
1118 list_init(&slave->entries);
1119 }
1120 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1121 if (e->slave && e->tx_bytes) {
1122 e->slave->tx_bytes += e->tx_bytes;
1123 list_push_back(&e->slave->entries, &e->list_node);
1124 }
1125 }
1126
1127 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1128 *
1129 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1130 * with a proper list sort algorithm. */
1131 list_init(&bals);
1132 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1133 if (slave->enabled) {
1134 insert_bal(&bals, slave);
1135 }
1136 }
1137 log_bals(bond, &bals);
1138
1139 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1140 while (!list_is_short(&bals)) {
1141 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1142 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1143 uint64_t overload;
1144
1145 overload = from->tx_bytes - to->tx_bytes;
1146 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1147 /* The extra load on 'from' (and all less-loaded slaves), compared
1148 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1149 * it is less than ~1Mbps. No point in rebalancing. */
1150 break;
1151 }
1152
09a5d390
BP
1153 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1154 * to move from 'from' to 'to'. */
f620b43a
BP
1155 e = choose_entry_to_migrate(from, to->tx_bytes);
1156 if (e) {
4a1b8f30 1157 bond_shift_load(e, to);
f620b43a
BP
1158
1159 /* Delete element from from->entries.
1160 *
1161 * We don't add the element to to->hashes. That would only allow
1162 * 'e' to be migrated to another slave in this rebalancing run, and
1163 * there is no point in doing that. */
1164 list_remove(&e->list_node);
1165
1166 /* Re-sort 'bals'. */
1167 reinsert_bal(&bals, from);
1168 reinsert_bal(&bals, to);
adcf00ba 1169 rebalanced = true;
f620b43a
BP
1170 } else {
1171 /* Can't usefully migrate anything away from 'from'.
1172 * Don't reconsider it. */
1173 list_remove(&from->bal_node);
1174 }
1175 }
1176
1177 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1178 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1179 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1180 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1181 e->tx_bytes /= 2;
1182 if (!e->tx_bytes) {
1183 e->slave = NULL;
1184 }
1185 }
adcf00ba
AZ
1186
1187done:
3bfd3972 1188 ovs_rwlock_unlock(&rwlock);
adcf00ba 1189 return rebalanced;
f620b43a
BP
1190}
1191\f
1192/* Bonding unixctl user interface functions. */
1193
1194static struct bond *
3bfd3972 1195bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
1196{
1197 struct bond *bond;
1198
1199 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
3bfd3972 1200 all_bonds) {
f620b43a
BP
1201 if (!strcmp(bond->name, name)) {
1202 return bond;
1203 }
1204 }
1205 return NULL;
1206}
1207
1208static struct bond_slave *
1209bond_lookup_slave(struct bond *bond, const char *slave_name)
1210{
1211 struct bond_slave *slave;
1212
1213 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1214 if (!strcmp(slave->name, slave_name)) {
1215 return slave;
1216 }
1217 }
1218 return NULL;
1219}
1220
1221static void
1222bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
1223 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1224 void *aux OVS_UNUSED)
f620b43a
BP
1225{
1226 struct ds ds = DS_EMPTY_INITIALIZER;
1227 const struct bond *bond;
1228
adcf00ba 1229 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
f620b43a 1230
3bfd3972
EJ
1231 ovs_rwlock_rdlock(&rwlock);
1232 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
f620b43a
BP
1233 const struct bond_slave *slave;
1234 size_t i;
1235
adcf00ba
AZ
1236 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1237 bond_mode_to_string(bond->balance), bond->recirc_id);
f620b43a
BP
1238
1239 i = 0;
1240 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1241 if (i++ > 0) {
1242 ds_put_cstr(&ds, ", ");
1243 }
1244 ds_put_cstr(&ds, slave->name);
1245 }
1246 ds_put_char(&ds, '\n');
1247 }
3bfd3972 1248 ovs_rwlock_unlock(&rwlock);
bde9f75d 1249 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
1250 ds_destroy(&ds);
1251}
1252
1253static void
c33a8a25 1254bond_print_details(struct ds *ds, const struct bond *bond)
3bfd3972 1255 OVS_REQ_RDLOCK(rwlock)
f620b43a 1256{
fc1d4f01
EJ
1257 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1258 const struct shash_node **sorted_slaves = NULL;
f620b43a 1259 const struct bond_slave *slave;
adcf00ba
AZ
1260 bool may_recirc;
1261 uint32_t recirc_id;
fc1d4f01 1262 int i;
f620b43a 1263
c33a8a25
EJ
1264 ds_put_format(ds, "---- %s ----\n", bond->name);
1265 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
1266 bond_mode_to_string(bond->balance));
1267
adcf00ba
AZ
1268 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1269 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1270 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1271
c33a8a25 1272 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 1273
c33a8a25
EJ
1274 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1275 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 1276
1b137691 1277 if (bond_is_balanced(bond)) {
c33a8a25 1278 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
1279 bond->next_rebalance - time_msec());
1280 }
1281
bdebeece
EJ
1282 ds_put_cstr(ds, "lacp_status: ");
1283 switch (bond->lacp_status) {
1284 case LACP_NEGOTIATED:
1285 ds_put_cstr(ds, "negotiated\n");
1286 break;
1287 case LACP_CONFIGURED:
1288 ds_put_cstr(ds, "configured\n");
1289 break;
1290 case LACP_DISABLED:
1291 ds_put_cstr(ds, "off\n");
1292 break;
1293 default:
1294 ds_put_cstr(ds, "<unknown>\n");
1295 break;
1296 }
4d6fb5eb 1297
f620b43a 1298 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
1299 shash_add(&slave_shash, slave->name, slave);
1300 }
1301 sorted_slaves = shash_sort(&slave_shash);
1302
1303 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 1304 struct bond_entry *be;
f620b43a 1305
fc1d4f01
EJ
1306 slave = sorted_slaves[i]->data;
1307
f620b43a 1308 /* Basic info. */
c33a8a25 1309 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
1310 slave->name, slave->enabled ? "enabled" : "disabled");
1311 if (slave == bond->active_slave) {
c33a8a25 1312 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
1313 }
1314 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 1315 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
1316 slave->enabled ? "downdelay" : "updelay",
1317 slave->delay_expires - time_msec());
1318 }
1319
c33a8a25 1320 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 1321 slave->may_enable ? "true" : "false");
4d6fb5eb 1322
1b137691 1323 if (!bond_is_balanced(bond)) {
f620b43a
BP
1324 continue;
1325 }
1326
1327 /* Hashes. */
f620b43a
BP
1328 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1329 int hash = be - bond->hash;
1330
1331 if (be->slave != slave) {
1332 continue;
1333 }
1334
c33a8a25 1335 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
f620b43a
BP
1336 hash, be->tx_bytes / 1024);
1337
7b9f1974 1338 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1339 }
1340 }
fc1d4f01
EJ
1341 shash_destroy(&slave_shash);
1342 free(sorted_slaves);
c33a8a25
EJ
1343 ds_put_cstr(ds, "\n");
1344}
1345
1346static void
1347bond_unixctl_show(struct unixctl_conn *conn,
1348 int argc, const char *argv[],
1349 void *aux OVS_UNUSED)
1350{
1351 struct ds ds = DS_EMPTY_INITIALIZER;
1352
3bfd3972 1353 ovs_rwlock_rdlock(&rwlock);
c33a8a25
EJ
1354 if (argc > 1) {
1355 const struct bond *bond = bond_find(argv[1]);
1356
1357 if (!bond) {
bde9f75d 1358 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1359 goto out;
c33a8a25
EJ
1360 }
1361 bond_print_details(&ds, bond);
1362 } else {
1363 const struct bond *bond;
1364
3bfd3972 1365 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
c33a8a25
EJ
1366 bond_print_details(&ds, bond);
1367 }
1368 }
1369
bde9f75d 1370 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a 1371 ds_destroy(&ds);
3bfd3972
EJ
1372
1373out:
1374 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1375}
1376
1377static void
0e15264f
BP
1378bond_unixctl_migrate(struct unixctl_conn *conn,
1379 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1380 void *aux OVS_UNUSED)
1381{
0e15264f
BP
1382 const char *bond_s = argv[1];
1383 const char *hash_s = argv[2];
1384 const char *slave_s = argv[3];
f620b43a
BP
1385 struct bond *bond;
1386 struct bond_slave *slave;
1387 struct bond_entry *entry;
1388 int hash;
1389
3bfd3972 1390 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1391 bond = bond_find(bond_s);
1392 if (!bond) {
bde9f75d 1393 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1394 goto out;
f620b43a
BP
1395 }
1396
1397 if (bond->balance != BM_SLB) {
bde9f75d 1398 unixctl_command_reply_error(conn, "not an SLB bond");
3bfd3972 1399 goto out;
f620b43a
BP
1400 }
1401
1402 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1403 hash = atoi(hash_s) & BOND_MASK;
1404 } else {
bde9f75d 1405 unixctl_command_reply_error(conn, "bad hash");
3bfd3972 1406 goto out;
f620b43a
BP
1407 }
1408
1409 slave = bond_lookup_slave(bond, slave_s);
1410 if (!slave) {
bde9f75d 1411 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1412 goto out;
f620b43a
BP
1413 }
1414
1415 if (!slave->enabled) {
bde9f75d 1416 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
3bfd3972 1417 goto out;
f620b43a
BP
1418 }
1419
1420 entry = &bond->hash[hash];
4a1b8f30 1421 bond->bond_revalidate = true;
f620b43a 1422 entry->slave = slave;
bde9f75d 1423 unixctl_command_reply(conn, "migrated");
3bfd3972
EJ
1424
1425out:
1426 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1427}
1428
1429static void
0e15264f
BP
1430bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1431 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1432 void *aux OVS_UNUSED)
1433{
0e15264f
BP
1434 const char *bond_s = argv[1];
1435 const char *slave_s = argv[2];
f620b43a
BP
1436 struct bond *bond;
1437 struct bond_slave *slave;
1438
3bfd3972 1439 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1440 bond = bond_find(bond_s);
1441 if (!bond) {
bde9f75d 1442 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1443 goto out;
f620b43a
BP
1444 }
1445
1446 slave = bond_lookup_slave(bond, slave_s);
1447 if (!slave) {
bde9f75d 1448 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1449 goto out;
f620b43a
BP
1450 }
1451
1452 if (!slave->enabled) {
bde9f75d 1453 unixctl_command_reply_error(conn, "cannot make disabled slave active");
3bfd3972 1454 goto out;
f620b43a
BP
1455 }
1456
1457 if (bond->active_slave != slave) {
4a1b8f30 1458 bond->bond_revalidate = true;
f620b43a 1459 bond->active_slave = slave;
f620b43a
BP
1460 VLOG_INFO("bond %s: active interface is now %s",
1461 bond->name, slave->name);
1462 bond->send_learning_packets = true;
bde9f75d 1463 unixctl_command_reply(conn, "done");
f620b43a 1464 } else {
bde9f75d 1465 unixctl_command_reply(conn, "no change");
f620b43a 1466 }
3bfd3972
EJ
1467out:
1468 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1469}
1470
1471static void
0e15264f 1472enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1473{
0e15264f
BP
1474 const char *bond_s = argv[1];
1475 const char *slave_s = argv[2];
f620b43a
BP
1476 struct bond *bond;
1477 struct bond_slave *slave;
1478
3bfd3972 1479 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1480 bond = bond_find(bond_s);
1481 if (!bond) {
bde9f75d 1482 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1483 goto out;
f620b43a
BP
1484 }
1485
1486 slave = bond_lookup_slave(bond, slave_s);
1487 if (!slave) {
bde9f75d 1488 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1489 goto out;
f620b43a
BP
1490 }
1491
4a1b8f30 1492 bond_enable_slave(slave, enable);
bde9f75d 1493 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
3bfd3972
EJ
1494
1495out:
1496 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1497}
1498
1499static void
0e15264f
BP
1500bond_unixctl_enable_slave(struct unixctl_conn *conn,
1501 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1502 void *aux OVS_UNUSED)
1503{
0e15264f 1504 enable_slave(conn, argv, true);
f620b43a
BP
1505}
1506
1507static void
0e15264f
BP
1508bond_unixctl_disable_slave(struct unixctl_conn *conn,
1509 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1510 void *aux OVS_UNUSED)
1511{
0e15264f 1512 enable_slave(conn, argv, false);
f620b43a
BP
1513}
1514
1515static void
0e15264f 1516bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1517 void *aux OVS_UNUSED)
1518{
0e15264f
BP
1519 const char *mac_s = argv[1];
1520 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1521 const char *basis_s = argc > 3 ? argv[3] : NULL;
f620b43a
BP
1522 uint8_t mac[ETH_ADDR_LEN];
1523 uint8_t hash;
1524 char *hash_cstr;
1525 unsigned int vlan;
672d18b2 1526 uint32_t basis;
f620b43a
BP
1527
1528 if (vlan_s) {
c2c28dfd 1529 if (!ovs_scan(vlan_s, "%u", &vlan)) {
bde9f75d 1530 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1531 return;
1532 }
1533 } else {
dc155bff 1534 vlan = 0;
f620b43a
BP
1535 }
1536
672d18b2 1537 if (basis_s) {
c2c28dfd 1538 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
bde9f75d 1539 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1540 return;
1541 }
1542 } else {
1543 basis = 0;
1544 }
1545
c2c28dfd 1546 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
672d18b2 1547 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1548
1549 hash_cstr = xasprintf("%u", hash);
bde9f75d 1550 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1551 free(hash_cstr);
1552 } else {
bde9f75d 1553 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1554 }
1555}
1556
1557void
1558bond_init(void)
1559{
0e15264f 1560 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1561 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1562 NULL);
0e15264f 1563 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1564 bond_unixctl_migrate, NULL);
0e15264f 1565 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1566 bond_unixctl_set_active_slave, NULL);
0e15264f 1567 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1568 bond_unixctl_enable_slave, NULL);
0e15264f 1569 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1570 bond_unixctl_disable_slave, NULL);
0e15264f 1571 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1572 bond_unixctl_hash, NULL);
f620b43a
BP
1573}
1574\f
95aafb2a
EJ
1575static void
1576bond_entry_reset(struct bond *bond)
1577{
1578 if (bond->balance != BM_AB) {
9e1a6910 1579 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
95aafb2a
EJ
1580
1581 if (!bond->hash) {
1582 bond->hash = xmalloc(hash_len);
1583 }
1584 memset(bond->hash, 0, hash_len);
1585
1586 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1587 } else {
1588 free(bond->hash);
1589 bond->hash = NULL;
1590 }
1591}
1592
f620b43a
BP
1593static struct bond_slave *
1594bond_slave_lookup(struct bond *bond, const void *slave_)
1595{
1596 struct bond_slave *slave;
1597
1598 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1599 &bond->slaves) {
1600 if (slave->aux == slave_) {
1601 return slave;
1602 }
1603 }
1604
1605 return NULL;
1606}
1607
f620b43a 1608static void
4a1b8f30 1609bond_enable_slave(struct bond_slave *slave, bool enable)
f620b43a
BP
1610{
1611 slave->delay_expires = LLONG_MAX;
1612 if (enable != slave->enabled) {
4a1b8f30 1613 slave->bond->bond_revalidate = true;
f620b43a 1614 slave->enabled = enable;
f1c8a79c
AW
1615
1616 ovs_mutex_lock(&slave->bond->mutex);
1617 if (enable) {
1618 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1619 } else {
1620 list_remove(&slave->list_node);
1621 }
1622 ovs_mutex_unlock(&slave->bond->mutex);
1623
4a1b8f30
EJ
1624 VLOG_INFO("interface %s: %s", slave->name,
1625 slave->enabled ? "enabled" : "disabled");
f620b43a
BP
1626 }
1627}
1628
1629static void
4a1b8f30 1630bond_link_status_update(struct bond_slave *slave)
f620b43a
BP
1631{
1632 struct bond *bond = slave->bond;
1633 bool up;
1634
296f6519 1635 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1636 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1637 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1638 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1639 slave->name, up ? "up" : "down");
1640 if (up == slave->enabled) {
1641 slave->delay_expires = LLONG_MAX;
1642 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1643 slave->name, up ? "disabled" : "enabled");
1644 } else {
bdebeece 1645 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1646 : up ? bond->updelay : bond->downdelay);
1647 slave->delay_expires = time_msec() + delay;
1648 if (delay) {
1649 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1650 "for %d ms",
1651 slave->name,
1652 up ? "enabled" : "disabled",
1653 up ? "up" : "down",
1654 delay);
1655 }
1656 }
1657 }
1658
1659 if (time_msec() >= slave->delay_expires) {
4a1b8f30 1660 bond_enable_slave(slave, up);
f620b43a
BP
1661 }
1662}
1663
f620b43a 1664static unsigned int
672d18b2 1665bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
f620b43a 1666{
7e36ac42 1667 return hash_mac(mac, vlan, basis);
f620b43a
BP
1668}
1669
1670static unsigned int
672d18b2 1671bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
f620b43a
BP
1672{
1673 struct flow hash_flow = *flow;
d84d4b88 1674 hash_flow.vlan_tci = htons(vlan);
f620b43a
BP
1675
1676 /* The symmetric quality of this hash function is not required, but
1677 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1678 * purposes, so we use it out of convenience. */
672d18b2 1679 return flow_hash_symmetric_l4(&hash_flow, basis);
f620b43a
BP
1680}
1681
fb0b29a3
EJ
1682static unsigned int
1683bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1684{
cb22974d 1685 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1686
bdebeece 1687 return (bond->balance == BM_TCP
672d18b2
EJ
1688 ? bond_hash_tcp(flow, vlan, bond->basis)
1689 : bond_hash_src(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1690}
1691
f620b43a
BP
1692static struct bond_entry *
1693lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1694 uint16_t vlan)
1695{
fb0b29a3 1696 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1697}
1698
f1c8a79c
AW
1699/* Selects and returns an enabled slave from the 'enabled_slaves' list
1700 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1701 * returns NULL. */
1702static struct bond_slave *
1703get_enabled_slave(struct bond *bond)
1704{
1705 struct list *node;
1706
1707 ovs_mutex_lock(&bond->mutex);
1708 if (list_is_empty(&bond->enabled_slaves)) {
1709 ovs_mutex_unlock(&bond->mutex);
1710 return NULL;
1711 }
1712
1713 node = list_pop_front(&bond->enabled_slaves);
1714 list_push_back(&bond->enabled_slaves, node);
1715 ovs_mutex_unlock(&bond->mutex);
1716
1717 return CONTAINER_OF(node, struct bond_slave, list_node);
1718}
1719
f620b43a
BP
1720static struct bond_slave *
1721choose_output_slave(const struct bond *bond, const struct flow *flow,
4a1b8f30 1722 struct flow_wildcards *wc, uint16_t vlan)
f620b43a
BP
1723{
1724 struct bond_entry *e;
9dd165e0 1725 int balance;
f620b43a 1726
9dd165e0 1727 balance = bond->balance;
bdebeece
EJ
1728 if (bond->lacp_status == LACP_CONFIGURED) {
1729 /* LACP has been configured on this bond but negotiations were
9dd165e0
RK
1730 * unsuccussful. If lacp_fallback_ab is enabled use active-
1731 * backup mode else drop all traffic. */
1732 if (!bond->lacp_fallback_ab) {
1733 return NULL;
1734 }
1735 balance = BM_AB;
bdebeece
EJ
1736 }
1737
9dd165e0 1738 switch (balance) {
f620b43a
BP
1739 case BM_AB:
1740 return bond->active_slave;
1741
f620b43a 1742 case BM_TCP:
bdebeece
EJ
1743 if (bond->lacp_status != LACP_NEGOTIATED) {
1744 /* Must have LACP negotiations for TCP balanced bonds. */
1745 return NULL;
1746 }
bcd2633a 1747 if (wc) {
6cdd5145 1748 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
bcd2633a 1749 }
bdebeece
EJ
1750 /* Fall Through. */
1751 case BM_SLB:
bcd2633a 1752 if (wc) {
6cdd5145 1753 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
bcd2633a 1754 }
f620b43a
BP
1755 e = lookup_bond_entry(bond, flow, vlan);
1756 if (!e->slave || !e->slave->enabled) {
f1c8a79c 1757 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
f620b43a
BP
1758 }
1759 return e->slave;
1760
1761 default:
428b2edd 1762 OVS_NOT_REACHED();
f620b43a
BP
1763 }
1764}
1765
1766static struct bond_slave *
1767bond_choose_slave(const struct bond *bond)
1768{
1769 struct bond_slave *slave, *best;
1770
1771 /* Find an enabled slave. */
1772 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1773 if (slave->enabled) {
1774 return slave;
1775 }
1776 }
1777
1778 /* All interfaces are disabled. Find an interface that will be enabled
1779 * after its updelay expires. */
1780 best = NULL;
1781 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1782 if (slave->delay_expires != LLONG_MAX
296f6519 1783 && slave->may_enable
f620b43a
BP
1784 && (!best || slave->delay_expires < best->delay_expires)) {
1785 best = slave;
1786 }
1787 }
1788 return best;
1789}
1790
1791static void
4a1b8f30 1792bond_choose_active_slave(struct bond *bond)
f620b43a
BP
1793{
1794 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1795 struct bond_slave *old_active_slave = bond->active_slave;
1796
1797 bond->active_slave = bond_choose_slave(bond);
1798 if (bond->active_slave) {
1799 if (bond->active_slave->enabled) {
1800 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1801 bond->name, bond->active_slave->name);
1802 } else {
1803 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1804 "remaining %lld ms updelay (since no interface was "
1805 "enabled)", bond->name, bond->active_slave->name,
1806 bond->active_slave->delay_expires - time_msec());
4a1b8f30 1807 bond_enable_slave(bond->active_slave, true);
f620b43a
BP
1808 }
1809
1810 bond->send_learning_packets = true;
1811 } else if (old_active_slave) {
d28b9ead 1812 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1813 }
1814}
1815
f620b43a
BP
1816/* Attempts to make the sum of the bond slaves' statistics appear on the fake
1817 * bond interface. */
1818static void
1819bond_update_fake_slave_stats(struct bond *bond)
1820{
1821 struct netdev_stats bond_stats;
1822 struct bond_slave *slave;
1823 struct netdev *bond_dev;
1824
1825 memset(&bond_stats, 0, sizeof bond_stats);
1826
1827 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1828 struct netdev_stats slave_stats;
1829
1830 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1831 /* XXX: We swap the stats here because they are swapped back when
1832 * reported by the internal device. The reason for this is
1833 * internal devices normally represent packets going into the
1834 * system but when used as fake bond device they represent packets
1835 * leaving the system. We really should do this in the internal
1836 * device itself because changing it here reverses the counts from
1837 * the perspective of the switch. However, the internal device
1838 * doesn't know what type of device it represents so we have to do
1839 * it here for now. */
1840 bond_stats.tx_packets += slave_stats.rx_packets;
1841 bond_stats.tx_bytes += slave_stats.rx_bytes;
1842 bond_stats.rx_packets += slave_stats.tx_packets;
1843 bond_stats.rx_bytes += slave_stats.tx_bytes;
1844 }
1845 }
1846
18812dff 1847 if (!netdev_open(bond->name, "system", &bond_dev)) {
f620b43a
BP
1848 netdev_set_stats(bond_dev, &bond_stats);
1849 netdev_close(bond_dev);
1850 }
1851}