]> git.proxmox.com Git - mirror_ovs.git/blame - ofproto/bond.c
lib/classifier: Support variable sized miniflows.
[mirror_ovs.git] / ofproto / bond.c
CommitLineData
f620b43a 1/*
8917f72c 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a 25
adcf00ba
AZ
26#include "ofp-util.h"
27#include "ofp-actions.h"
28#include "ofpbuf.h"
29#include "ofproto/ofproto-provider.h"
30#include "ofproto/ofproto-dpif.h"
da4a6191 31#include "connectivity.h"
f620b43a
BP
32#include "coverage.h"
33#include "dynamic-string.h"
34#include "flow.h"
35#include "hmap.h"
bdebeece 36#include "lacp.h"
f620b43a
BP
37#include "list.h"
38#include "netdev.h"
39#include "odp-util.h"
40#include "ofpbuf.h"
41#include "packets.h"
42#include "poll-loop.h"
da4a6191 43#include "seq.h"
adcf00ba 44#include "match.h"
fc1d4f01 45#include "shash.h"
f620b43a
BP
46#include "timeval.h"
47#include "unixctl.h"
48#include "vlog.h"
49
50VLOG_DEFINE_THIS_MODULE(bond);
51
f1c8a79c
AW
52static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
53static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
55
9e1a6910 56/* Bit-mask for hashing a flow down to a bucket. */
f620b43a 57#define BOND_MASK 0xff
9e1a6910 58#define BOND_BUCKETS (BOND_MASK + 1)
adcf00ba 59#define RECIRC_RULE_PRIORITY 20 /* Priority level for internal rules */
f620b43a
BP
60
61/* A hash bucket for mapping a flow to a slave.
9e1a6910 62 * "struct bond" has an array of BOND_BUCKETS of these. */
f620b43a
BP
63struct bond_entry {
64 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
c6855ec5
JS
65 uint64_t tx_bytes /* Count of bytes recently transmitted. */
66 OVS_GUARDED_BY(rwlock);
f620b43a 67 struct list list_node; /* In bond_slave's 'entries' list. */
adcf00ba 68
c6855ec5
JS
69 /* Recirculation.
70 *
71 * 'pr_rule' is the post-recirculation rule for this entry.
72 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
73 * is used to determine delta (applied to 'tx_bytes' above.) */
74 struct rule *pr_rule;
75 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
f620b43a
BP
76};
77
78/* A bond slave, that is, one of the links comprising a bond. */
79struct bond_slave {
80 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
f1c8a79c 81 struct list list_node; /* In struct bond's enabled_slaves list. */
f620b43a
BP
82 struct bond *bond; /* The bond that contains this slave. */
83 void *aux; /* Client-provided handle for this slave. */
84
85 struct netdev *netdev; /* Network device, owned by the client. */
1ea24138 86 unsigned int change_seq; /* Tracks changes in 'netdev'. */
adcf00ba 87 ofp_port_t ofp_port; /* Open flow port number */
f620b43a
BP
88 char *name; /* Name (a copy of netdev_get_name(netdev)). */
89
90 /* Link status. */
91 long long delay_expires; /* Time after which 'enabled' may change. */
f620b43a 92 bool enabled; /* May be chosen for flows? */
296f6519 93 bool may_enable; /* Client considers this slave bondable. */
f620b43a
BP
94
95 /* Rebalancing info. Used only by bond_rebalance(). */
96 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
97 struct list entries; /* 'struct bond_entry's assigned here. */
98 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
99};
100
101/* A bond, that is, a set of network devices grouped to improve performance or
102 * robustness. */
103struct bond {
104 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
105 char *name; /* Name provided by client. */
adcf00ba 106 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
f620b43a
BP
107
108 /* Slaves. */
109 struct hmap slaves;
110
f1c8a79c
AW
111 /* Enabled slaves.
112 *
113 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
114 * (To prevent the bond_slave from disappearing they must also hold
115 * 'rwlock'.) */
116 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
117 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
118
f620b43a
BP
119 /* Bonding info. */
120 enum bond_mode balance; /* Balancing mode, one of BM_*. */
121 struct bond_slave *active_slave;
f620b43a 122 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
bdebeece 123 enum lacp_status lacp_status; /* Status of LACP negotiations. */
62904702 124 bool bond_revalidate; /* True if flows need revalidation. */
672d18b2 125 uint32_t basis; /* Basis for flow hash function. */
f620b43a
BP
126
127 /* SLB specific bonding info. */
9e1a6910 128 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
f620b43a
BP
129 int rebalance_interval; /* Interval between rebalances, in ms. */
130 long long int next_rebalance; /* Next rebalancing time. */
131 bool send_learning_packets;
adcf00ba
AZ
132 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
133 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
f620b43a 134
f620b43a
BP
135 /* Legacy compatibility. */
136 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
9dd165e0 137 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
f620b43a 138
37bec3d3 139 struct ovs_refcount ref_cnt;
f620b43a
BP
140};
141
adcf00ba
AZ
/* The action pending for a 'struct bond_pr_rule_op'. */
enum bond_op {
    ADD,    /* Install the rule in ofproto's flow table. */
    DEL,    /* Remove the rule from ofproto's flow table. */
};
147
148/* A rule to add to or delete from ofproto's internal flow table. */
149struct bond_pr_rule_op {
150 struct hmap_node hmap_node;
151 struct match match;
152 ofp_port_t out_ofport;
153 enum bond_op op;
6c932bc8 154 struct rule **pr_rule;
adcf00ba
AZ
155};
156
3bfd3972
EJ
157static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
158static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
159 OVS_REQ_RDLOCK(rwlock);
4a1b8f30
EJ
160static void bond_enable_slave(struct bond_slave *, bool enable)
161 OVS_REQ_WRLOCK(rwlock);
162static void bond_link_status_update(struct bond_slave *)
3bfd3972 163 OVS_REQ_WRLOCK(rwlock);
4a1b8f30 164static void bond_choose_active_slave(struct bond *)
9e1a6910 165 OVS_REQ_WRLOCK(rwlock);
f620b43a 166static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
672d18b2
EJ
167 uint16_t vlan, uint32_t basis);
168static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
169 uint32_t basis);
f620b43a
BP
170static struct bond_entry *lookup_bond_entry(const struct bond *,
171 const struct flow *,
3bfd3972
EJ
172 uint16_t vlan)
173 OVS_REQ_RDLOCK(rwlock);
f1c8a79c
AW
174static struct bond_slave *get_enabled_slave(struct bond *)
175 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
176static struct bond_slave *choose_output_slave(const struct bond *,
177 const struct flow *,
bcd2633a 178 struct flow_wildcards *,
4a1b8f30 179 uint16_t vlan)
3bfd3972
EJ
180 OVS_REQ_RDLOCK(rwlock);
181static void bond_update_fake_slave_stats(struct bond *)
182 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
183
184/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
185 * stores the mode in '*balance' and returns true. Otherwise returns false
186 * without modifying '*balance'. */
187bool
188bond_mode_from_string(enum bond_mode *balance, const char *s)
189{
190 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
191 *balance = BM_TCP;
192 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
193 *balance = BM_SLB;
194 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
195 *balance = BM_AB;
196 } else {
197 return false;
198 }
199 return true;
200}
201
202/* Returns a string representing 'balance'. */
203const char *
204bond_mode_to_string(enum bond_mode balance) {
205 switch (balance) {
206 case BM_TCP:
207 return "balance-tcp";
208 case BM_SLB:
209 return "balance-slb";
210 case BM_AB:
211 return "active-backup";
212 }
428b2edd 213 OVS_NOT_REACHED();
f620b43a
BP
214}
215
f620b43a
BP
216\f
217/* Creates and returns a new bond whose configuration is initially taken from
218 * 's'.
219 *
220 * The caller should register each slave on the new bond by calling
221 * bond_slave_register(). */
222struct bond *
adcf00ba 223bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
f620b43a
BP
224{
225 struct bond *bond;
226
227 bond = xzalloc(sizeof *bond);
adcf00ba 228 bond->ofproto = ofproto;
f620b43a 229 hmap_init(&bond->slaves);
f1c8a79c
AW
230 list_init(&bond->enabled_slaves);
231 ovs_mutex_init(&bond->mutex);
f620b43a 232 bond->next_fake_iface_update = LLONG_MAX;
37bec3d3 233 ovs_refcount_init(&bond->ref_cnt);
f620b43a 234
adcf00ba
AZ
235 bond->recirc_id = 0;
236 hmap_init(&bond->pr_rule_ops);
237
f620b43a 238 bond_reconfigure(bond, s);
f620b43a
BP
239 return bond;
240}
241
03366a2d
EJ
242struct bond *
243bond_ref(const struct bond *bond_)
244{
245 struct bond *bond = CONST_CAST(struct bond *, bond_);
246
bca0b3b4 247 if (bond) {
37bec3d3 248 ovs_refcount_ref(&bond->ref_cnt);
bca0b3b4 249 }
03366a2d
EJ
250 return bond;
251}
252
f620b43a
BP
253/* Frees 'bond'. */
254void
03366a2d 255bond_unref(struct bond *bond)
f620b43a
BP
256{
257 struct bond_slave *slave, *next_slave;
adcf00ba 258 struct bond_pr_rule_op *pr_op, *next_op;
f620b43a 259
37bec3d3 260 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
03366a2d
EJ
261 return;
262 }
263
3bfd3972
EJ
264 ovs_rwlock_wrlock(&rwlock);
265 hmap_remove(all_bonds, &bond->hmap_node);
266 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
267
268 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
269 hmap_remove(&bond->slaves, &slave->hmap_node);
270 /* Client owns 'slave->netdev'. */
271 free(slave->name);
272 free(slave);
273 }
274 hmap_destroy(&bond->slaves);
275
f1c8a79c 276 ovs_mutex_destroy(&bond->mutex);
f620b43a 277 free(bond->hash);
f620b43a 278 free(bond->name);
adcf00ba
AZ
279
280 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
281 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
282 free(pr_op);
283 }
284 hmap_destroy(&bond->pr_rule_ops);
285
286 if (bond->recirc_id) {
287 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
288 }
289
f620b43a
BP
290 free(bond);
291}
292
adcf00ba
AZ
293static void
294add_pr_rule(struct bond *bond, const struct match *match,
6c932bc8 295 ofp_port_t out_ofport, struct rule **rule)
adcf00ba
AZ
296{
297 uint32_t hash = match_hash(match, 0);
298 struct bond_pr_rule_op *pr_op;
299
300 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
301 if (match_equal(&pr_op->match, match)) {
302 pr_op->op = ADD;
303 pr_op->out_ofport = out_ofport;
304 pr_op->pr_rule = rule;
305 return;
306 }
307 }
308
309 pr_op = xmalloc(sizeof *pr_op);
310 pr_op->match = *match;
311 pr_op->op = ADD;
312 pr_op->out_ofport = out_ofport;
313 pr_op->pr_rule = rule;
314 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
315}
316
317static void
318update_recirc_rules(struct bond *bond)
319{
320 struct match match;
321 struct bond_pr_rule_op *pr_op, *next_op;
322 uint64_t ofpacts_stub[128 / 8];
323 struct ofpbuf ofpacts;
324 int i;
325
326 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
327
328 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
329 pr_op->op = DEL;
330 }
331
6c932bc8
AZ
332 if (bond->hash && bond->recirc_id) {
333 for (i = 0; i < BOND_BUCKETS; i++) {
334 struct bond_slave *slave = bond->hash[i].slave;
adcf00ba 335
6c932bc8
AZ
336 if (slave) {
337 match_init_catchall(&match);
338 match_set_recirc_id(&match, bond->recirc_id);
6c932bc8 339 match_set_dp_hash_masked(&match, i, BOND_MASK);
adcf00ba 340
6c932bc8
AZ
341 add_pr_rule(bond, &match, slave->ofp_port,
342 &bond->hash[i].pr_rule);
343 }
adcf00ba
AZ
344 }
345 }
346
347 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
348 int error;
adcf00ba
AZ
349 switch (pr_op->op) {
350 case ADD:
351 ofpbuf_clear(&ofpacts);
352 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
353 error = ofproto_dpif_add_internal_flow(bond->ofproto,
354 &pr_op->match,
355 RECIRC_RULE_PRIORITY,
6c932bc8 356 &ofpacts, pr_op->pr_rule);
adcf00ba
AZ
357 if (error) {
358 char *err_s = match_to_string(&pr_op->match,
359 RECIRC_RULE_PRIORITY);
360
361 VLOG_ERR("failed to add post recirculation flow %s", err_s);
362 free(err_s);
adcf00ba
AZ
363 }
364 break;
365
366 case DEL:
367 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
368 &pr_op->match,
369 RECIRC_RULE_PRIORITY);
370 if (error) {
371 char *err_s = match_to_string(&pr_op->match,
372 RECIRC_RULE_PRIORITY);
373
374 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
375 free(err_s);
376 }
377
378 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
6c932bc8 379 *pr_op->pr_rule = NULL;
adcf00ba
AZ
380 free(pr_op);
381 break;
382 }
383 }
384
385 ofpbuf_uninit(&ofpacts);
386}
387
388
f620b43a
BP
389/* Updates 'bond''s overall configuration to 's'.
390 *
391 * The caller should register each slave on 'bond' by calling
392 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 393 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
394 *
395 * Returns true if the configuration has changed in such a way that requires
396 * flow revalidation.
397 * */
398bool
f620b43a
BP
399bond_reconfigure(struct bond *bond, const struct bond_settings *s)
400{
59d7b2b6
EJ
401 bool revalidate = false;
402
3bfd3972 403 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
404 if (!bond->name || strcmp(bond->name, s->name)) {
405 if (bond->name) {
3bfd3972 406 hmap_remove(all_bonds, &bond->hmap_node);
f620b43a
BP
407 free(bond->name);
408 }
409 bond->name = xstrdup(s->name);
3bfd3972 410 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
f620b43a
BP
411 }
412
f620b43a
BP
413 bond->updelay = s->up_delay;
414 bond->downdelay = s->down_delay;
bc1b010c 415
9dd165e0
RK
416 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
417 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
418 revalidate = true;
419 }
420
bc1b010c
EJ
421 if (bond->rebalance_interval != s->rebalance_interval) {
422 bond->rebalance_interval = s->rebalance_interval;
423 revalidate = true;
424 }
f620b43a 425
59d7b2b6
EJ
426 if (bond->balance != s->balance) {
427 bond->balance = s->balance;
428 revalidate = true;
429 }
430
672d18b2
EJ
431 if (bond->basis != s->basis) {
432 bond->basis = s->basis;
433 revalidate = true;
434 }
435
f620b43a
BP
436 if (s->fake_iface) {
437 if (bond->next_fake_iface_update == LLONG_MAX) {
438 bond->next_fake_iface_update = time_msec();
439 }
440 } else {
441 bond->next_fake_iface_update = LLONG_MAX;
442 }
59d7b2b6 443
62904702
EJ
444 if (bond->bond_revalidate) {
445 revalidate = true;
446 bond->bond_revalidate = false;
447 }
448
adcf00ba
AZ
449 if (bond->balance != BM_AB) {
450 if (!bond->recirc_id) {
451 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
452 }
453 } else if (bond->recirc_id) {
454 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
455 bond->recirc_id = 0;
456 }
457
95aafb2a
EJ
458 if (bond->balance == BM_AB || !bond->hash || revalidate) {
459 bond_entry_reset(bond);
460 }
461
3bfd3972 462 ovs_rwlock_unlock(&rwlock);
59d7b2b6 463 return revalidate;
f620b43a
BP
464}
465
f8ddccd2 466static void
1ea24138 467bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
3bfd3972 468 OVS_REQ_WRLOCK(rwlock)
f8ddccd2
BP
469{
470 if (slave->netdev != netdev) {
f8ddccd2 471 slave->netdev = netdev;
1ea24138 472 slave->change_seq = 0;
f8ddccd2
BP
473 }
474}
475
f620b43a
BP
476/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
477 * arbitrary client-provided pointer that uniquely identifies a slave within a
478 * bond. If 'slave_' already exists within 'bond' then this function
479 * reconfigures the existing slave.
480 *
481 * 'netdev' must be the network device that 'slave_' represents. It is owned
482 * by the client, so the client must not close it before either unregistering
483 * 'slave_' or destroying 'bond'.
4d6fb5eb 484 */
f620b43a 485void
adcf00ba
AZ
486bond_slave_register(struct bond *bond, void *slave_,
487 ofp_port_t ofport, struct netdev *netdev)
f620b43a 488{
3bfd3972 489 struct bond_slave *slave;
f620b43a 490
3bfd3972
EJ
491 ovs_rwlock_wrlock(&rwlock);
492 slave = bond_slave_lookup(bond, slave_);
f620b43a
BP
493 if (!slave) {
494 slave = xzalloc(sizeof *slave);
495
496 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
497 slave->bond = bond;
498 slave->aux = slave_;
adcf00ba 499 slave->ofp_port = ofport;
f620b43a 500 slave->delay_expires = LLONG_MAX;
244b2160 501 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 502 bond->bond_revalidate = true;
244b2160 503
b3c18f66 504 slave->enabled = false;
4a1b8f30 505 bond_enable_slave(slave, netdev_get_carrier(netdev));
f620b43a
BP
506 }
507
1ea24138 508 bond_slave_set_netdev__(slave, netdev);
a6934aa9 509
f620b43a
BP
510 free(slave->name);
511 slave->name = xstrdup(netdev_get_name(netdev));
3bfd3972 512 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
513}
514
f8ddccd2
BP
515/* Updates the network device to be used with 'slave_' to 'netdev'.
516 *
517 * This is useful if the caller closes and re-opens the network device
518 * registered with bond_slave_register() but doesn't need to change anything
519 * else. */
520void
521bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
522{
3bfd3972
EJ
523 struct bond_slave *slave;
524
525 ovs_rwlock_wrlock(&rwlock);
526 slave = bond_slave_lookup(bond, slave_);
f8ddccd2 527 if (slave) {
1ea24138 528 bond_slave_set_netdev__(slave, netdev);
f8ddccd2 529 }
3bfd3972 530 ovs_rwlock_unlock(&rwlock);
f8ddccd2
BP
531}
532
f620b43a
BP
533/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
534 * then this function has no effect.
535 *
536 * Unregistering a slave invalidates all flows. */
537void
538bond_slave_unregister(struct bond *bond, const void *slave_)
539{
3bfd3972 540 struct bond_slave *slave;
f620b43a
BP
541 bool del_active;
542
3bfd3972
EJ
543 ovs_rwlock_wrlock(&rwlock);
544 slave = bond_slave_lookup(bond, slave_);
f620b43a 545 if (!slave) {
3bfd3972 546 goto out;
f620b43a
BP
547 }
548
4a1b8f30
EJ
549 bond->bond_revalidate = true;
550 bond_enable_slave(slave, false);
b3c18f66 551
f620b43a
BP
552 del_active = bond->active_slave == slave;
553 if (bond->hash) {
554 struct bond_entry *e;
555 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
556 if (e->slave == slave) {
557 e->slave = NULL;
558 }
559 }
560 }
561
562 free(slave->name);
563
564 hmap_remove(&bond->slaves, &slave->hmap_node);
565 /* Client owns 'slave->netdev'. */
566 free(slave);
567
568 if (del_active) {
4a1b8f30 569 bond_choose_active_slave(bond);
f620b43a
BP
570 bond->send_learning_packets = true;
571 }
3bfd3972
EJ
572out:
573 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
574}
575
296f6519
EJ
576/* Should be called on each slave in 'bond' before bond_run() to indicate
577 * whether or not 'slave_' may be enabled. This function is intended to allow
578 * other protocols to have some impact on bonding decisions. For example LACP
579 * or high level link monitoring protocols may decide that a given slave should
580 * not be able to send traffic. */
4d6fb5eb 581void
296f6519 582bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 583{
3bfd3972 584 ovs_rwlock_wrlock(&rwlock);
296f6519 585 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
3bfd3972 586 ovs_rwlock_unlock(&rwlock);
4d6fb5eb
EJ
587}
588
4a1b8f30
EJ
589/* Performs periodic maintenance on 'bond'.
590 *
591 * Returns true if the caller should revalidate its flows.
f620b43a
BP
592 *
593 * The caller should check bond_should_send_learning_packets() afterward. */
4a1b8f30
EJ
594bool
595bond_run(struct bond *bond, enum lacp_status lacp_status)
f620b43a
BP
596{
597 struct bond_slave *slave;
4a1b8f30 598 bool revalidate;
f620b43a 599
3bfd3972 600 ovs_rwlock_wrlock(&rwlock);
bdebeece
EJ
601 if (bond->lacp_status != lacp_status) {
602 bond->lacp_status = lacp_status;
4592d0e2
EJ
603 bond->bond_revalidate = true;
604 }
4d6fb5eb 605
f620b43a
BP
606 /* Enable slaves based on link status and LACP feedback. */
607 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
4a1b8f30 608 bond_link_status_update(slave);
da4a6191 609 slave->change_seq = seq_read(connectivity_seq_get());
f620b43a
BP
610 }
611 if (!bond->active_slave || !bond->active_slave->enabled) {
4a1b8f30 612 bond_choose_active_slave(bond);
f620b43a
BP
613 }
614
615 /* Update fake bond interface stats. */
616 if (time_msec() >= bond->next_fake_iface_update) {
617 bond_update_fake_slave_stats(bond);
618 bond->next_fake_iface_update = time_msec() + 1000;
619 }
620
4a1b8f30
EJ
621 revalidate = bond->bond_revalidate;
622 bond->bond_revalidate = false;
3bfd3972 623 ovs_rwlock_unlock(&rwlock);
4a1b8f30
EJ
624
625 return revalidate;
f620b43a
BP
626}
627
628/* Causes poll_block() to wake up when 'bond' needs something to be done. */
629void
630bond_wait(struct bond *bond)
631{
632 struct bond_slave *slave;
633
3bfd3972 634 ovs_rwlock_rdlock(&rwlock);
f620b43a
BP
635 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
636 if (slave->delay_expires != LLONG_MAX) {
637 poll_timer_wait_until(slave->delay_expires);
638 }
1ea24138 639
da4a6191 640 seq_wait(connectivity_seq_get(), slave->change_seq);
f620b43a
BP
641 }
642
643 if (bond->next_fake_iface_update != LLONG_MAX) {
644 poll_timer_wait_until(bond->next_fake_iface_update);
645 }
646
bbc13389 647 if (bond->bond_revalidate) {
f620b43a
BP
648 poll_immediate_wake();
649 }
3bfd3972 650 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
651
652 /* We don't wait for bond->next_rebalance because rebalancing can only run
653 * at a flow account checkpoint. ofproto does checkpointing on its own
654 * schedule and bond_rebalance() gets called afterward, so we'd just be
655 * waking up for no purpose. */
656}
657\f
658/* MAC learning table interaction. */
659
660static bool
661may_send_learning_packets(const struct bond *bond)
662{
9dd165e0
RK
663 return ((bond->lacp_status == LACP_DISABLED
664 && (bond->balance == BM_SLB || bond->balance == BM_AB))
665 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
bdebeece 666 && bond->active_slave;
f620b43a
BP
667}
668
669/* Returns true if 'bond' needs the client to send out packets to assist with
670 * MAC learning on 'bond'. If this function returns true, then the client
671 * should iterate through its MAC learning table for the bridge on which 'bond'
672 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 673 * it should call bond_compose_learning_packet().
f620b43a 674 *
477879ea
BP
675 * This function will only return true if 'bond' is in SLB or active-backup
676 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
677 * necessary.
f620b43a
BP
678 *
679 * Calling this function resets the state that it checks. */
680bool
681bond_should_send_learning_packets(struct bond *bond)
682{
3bfd3972
EJ
683 bool send;
684
685 ovs_rwlock_wrlock(&rwlock);
686 send = bond->send_learning_packets && may_send_learning_packets(bond);
f620b43a 687 bond->send_learning_packets = false;
3bfd3972 688 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
689 return send;
690}
691
692/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
693 *
ea131871
JG
694 * See bond_should_send_learning_packets() for description of usage. The
695 * caller should send the composed packet on the port associated with
696 * port_aux and takes ownership of the returned ofpbuf. */
697struct ofpbuf *
698bond_compose_learning_packet(struct bond *bond,
699 const uint8_t eth_src[ETH_ADDR_LEN],
700 uint16_t vlan, void **port_aux)
f620b43a
BP
701{
702 struct bond_slave *slave;
ea131871 703 struct ofpbuf *packet;
f620b43a 704 struct flow flow;
f620b43a 705
3bfd3972 706 ovs_rwlock_rdlock(&rwlock);
cb22974d 707 ovs_assert(may_send_learning_packets(bond));
f620b43a
BP
708 memset(&flow, 0, sizeof flow);
709 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
4a1b8f30 710 slave = choose_output_slave(bond, &flow, NULL, vlan);
f620b43a 711
ea131871 712 packet = ofpbuf_new(0);
2ea838ac 713 compose_rarp(packet, eth_src);
f620b43a 714 if (vlan) {
1bf02876 715 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
f620b43a 716 }
f620b43a 717
ea131871 718 *port_aux = slave->aux;
3bfd3972 719 ovs_rwlock_unlock(&rwlock);
ea131871 720 return packet;
f620b43a
BP
721}
722\f
723/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
724 * Ethernet destination address of 'eth_dst', should be admitted.
725 *
726 * The return value is one of the following:
727 *
728 * - BV_ACCEPT: Admit the packet.
729 *
730 * - BV_DROP: Drop the packet.
731 *
732 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
733 * Ethernet source address and VLAN. If there is none, or if the packet
734 * is on the learned port, then admit the packet. If a different port has
735 * been learned, however, drop the packet (and do not use it for MAC
736 * learning).
737 */
738enum bond_verdict
739bond_check_admissibility(struct bond *bond, const void *slave_,
4a1b8f30 740 const uint8_t eth_dst[ETH_ADDR_LEN])
f620b43a 741{
3bfd3972
EJ
742 enum bond_verdict verdict = BV_DROP;
743 struct bond_slave *slave;
9a1c6450 744
3bfd3972
EJ
745 ovs_rwlock_rdlock(&rwlock);
746 slave = bond_slave_lookup(bond, slave_);
4222bbc8 747 if (!slave) {
3bfd3972 748 goto out;
4222bbc8
EJ
749 }
750
9a1c6450
EJ
751 /* LACP bonds have very loose admissibility restrictions because we can
752 * assume the remote switch is aware of the bond and will "do the right
753 * thing". However, as a precaution we drop packets on disabled slaves
754 * because no correctly implemented partner switch should be sending
bdebeece
EJ
755 * packets to them.
756 *
757 * If LACP is configured, but LACP negotiations have been unsuccessful, we
9dd165e0 758 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
bdebeece 759 switch (bond->lacp_status) {
3bfd3972
EJ
760 case LACP_NEGOTIATED:
761 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
762 goto out;
763 case LACP_CONFIGURED:
9dd165e0
RK
764 if (!bond->lacp_fallback_ab) {
765 goto out;
766 }
3bfd3972
EJ
767 case LACP_DISABLED:
768 break;
f620b43a
BP
769 }
770
771 /* Drop all multicast packets on inactive slaves. */
772 if (eth_addr_is_multicast(eth_dst)) {
4222bbc8 773 if (bond->active_slave != slave) {
3bfd3972 774 goto out;
f620b43a
BP
775 }
776 }
777
f931a4c9 778 switch (bond->balance) {
9dd165e0
RK
779 case BM_TCP:
780 /* TCP balanced bonds require successful LACP negotiations. Based on the
781 * above check, LACP is off or lacp_fallback_ab is true on this bond.
782 * If lacp_fallback_ab is true fall through to BM_AB case else, we
783 * drop all incoming traffic. */
784 if (!bond->lacp_fallback_ab) {
785 goto out;
786 }
787
f931a4c9
BP
788 case BM_AB:
789 /* Drop all packets which arrive on backup slaves. This is similar to
790 * how Linux bonding handles active-backup bonds. */
7ba7dcf0
EJ
791 if (bond->active_slave != slave) {
792 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
793
e6b2255c
BP
794 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
795 " slave (%s) destined for " ETH_ADDR_FMT,
796 slave->name, ETH_ADDR_ARGS(eth_dst));
3bfd3972 797 goto out;
7ba7dcf0 798 }
3bfd3972
EJ
799 verdict = BV_ACCEPT;
800 goto out;
f931a4c9 801
f931a4c9
BP
802 case BM_SLB:
803 /* Drop all packets for which we have learned a different input port,
804 * because we probably sent the packet on one slave and got it back on
805 * the other. Gratuitous ARP packets are an exception to this rule:
806 * the host has moved to another switch. The exception to the
807 * exception is if we locked the learning table to avoid reflections on
808 * bond slaves. */
3bfd3972
EJ
809 verdict = BV_DROP_IF_MOVED;
810 goto out;
7ba7dcf0
EJ
811 }
812
428b2edd 813 OVS_NOT_REACHED();
3bfd3972
EJ
814out:
815 ovs_rwlock_unlock(&rwlock);
816 return verdict;
817
f620b43a
BP
818}
819
820/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
821 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
822 * NULL if the packet should be dropped because no slaves are enabled.
823 *
824 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
825 * should be a VID only (i.e. excluding the PCP bits). Second,
826 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
827 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
828 * packet belongs to (so for an access port it will be the access port's VLAN).
829 *
bcd2633a
JP
830 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
831 * significant in the selection. At some point earlier, 'wc' should
832 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
833 */
834void *
835bond_choose_output_slave(struct bond *bond, const struct flow *flow,
4a1b8f30 836 struct flow_wildcards *wc, uint16_t vlan)
f620b43a 837{
3bfd3972 838 struct bond_slave *slave;
b5d5d7d3 839 void *aux;
3bfd3972
EJ
840
841 ovs_rwlock_rdlock(&rwlock);
4a1b8f30 842 slave = choose_output_slave(bond, flow, wc, vlan);
b5d5d7d3 843 aux = slave ? slave->aux : NULL;
3bfd3972 844 ovs_rwlock_unlock(&rwlock);
b5d5d7d3
AW
845
846 return aux;
f620b43a 847}
f620b43a 848\f
adcf00ba
AZ
849/* Recirculation. */
850static void
851bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
c6855ec5 852 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
853{
854 if (entry->slave) {
855 uint64_t delta;
856
857 delta = rule_tx_bytes - entry->pr_tx_bytes;
858 entry->tx_bytes += delta;
859 entry->pr_tx_bytes = rule_tx_bytes;
860 }
861}
862
863/* Maintain bond stats using post recirculation rule byte counters.*/
60cda7d6 864static void
adcf00ba
AZ
865bond_recirculation_account(struct bond *bond)
866{
867 int i;
868
c6855ec5 869 ovs_rwlock_wrlock(&rwlock);
adcf00ba
AZ
870 for (i=0; i<=BOND_MASK; i++) {
871 struct bond_entry *entry = &bond->hash[i];
872 struct rule *rule = entry->pr_rule;
873
874 if (rule) {
875 uint64_t n_packets OVS_UNUSED;
876 long long int used OVS_UNUSED;
877 uint64_t n_bytes;
878
879 rule->ofproto->ofproto_class->rule_get_stats(
880 rule, &n_packets, &n_bytes, &used);
881 bond_entry_account(entry, n_bytes);
882 }
883 }
884 ovs_rwlock_unlock(&rwlock);
885}
886
887bool
888bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
889 uint32_t *hash_bias)
890{
9e4e33db 891 if (bond->balance == BM_TCP && recirc_id) {
adcf00ba
AZ
892 if (recirc_id) {
893 *recirc_id = bond->recirc_id;
894 }
895 if (hash_bias) {
896 *hash_bias = bond->basis;
897 }
898 return true;
899 } else {
900 return false;
901 }
902}
903
904void
905bond_update_post_recirc_rules(struct bond* bond, const bool force)
906{
907 struct bond_entry *e;
908 bool update_rules = force; /* Always update rules if caller forces it. */
909
910 /* Make sure all bond entries are populated */
911 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
912 if (!e->slave || !e->slave->enabled) {
913 update_rules = true;
914 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
915 struct bond_slave, hmap_node);
916 if (!e->slave->enabled) {
917 e->slave = bond->active_slave;
918 }
919 }
920 }
921
922 if (update_rules) {
923 update_recirc_rules(bond);
924 }
925}
926\f
f620b43a
BP
927/* Rebalancing. */
928
1b137691 929static bool
3bfd3972 930bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
1b137691 931{
bc1b010c
EJ
932 return bond->rebalance_interval
933 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
934}
935
f620b43a
BP
936/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
937void
938bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
939 uint64_t n_bytes)
940{
3bfd3972 941 ovs_rwlock_wrlock(&rwlock);
1b137691 942 if (bond_is_balanced(bond)) {
f620b43a 943 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a 944 }
3bfd3972 945 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
946}
947
948static struct bond_slave *
3bfd3972 949bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
950{
951 return CONTAINER_OF(bal, struct bond_slave, bal_node);
952}
953
954static void
955log_bals(struct bond *bond, const struct list *bals)
c6855ec5 956 OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
957{
958 if (VLOG_IS_DBG_ENABLED()) {
959 struct ds ds = DS_EMPTY_INITIALIZER;
960 const struct bond_slave *slave;
961
962 LIST_FOR_EACH (slave, bal_node, bals) {
963 if (ds.length) {
964 ds_put_char(&ds, ',');
965 }
966 ds_put_format(&ds, " %s %"PRIu64"kB",
967 slave->name, slave->tx_bytes / 1024);
968
969 if (!slave->enabled) {
970 ds_put_cstr(&ds, " (disabled)");
971 }
972 if (!list_is_empty(&slave->entries)) {
973 struct bond_entry *e;
974
975 ds_put_cstr(&ds, " (");
976 LIST_FOR_EACH (e, list_node, &slave->entries) {
977 if (&e->list_node != list_front(&slave->entries)) {
978 ds_put_cstr(&ds, " + ");
979 }
34582733 980 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
f620b43a
BP
981 e - bond->hash, e->tx_bytes / 1024);
982 }
983 ds_put_cstr(&ds, ")");
984 }
985 }
986 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
987 ds_destroy(&ds);
988 }
989}
990
991/* Shifts 'hash' from its current slave to 'to'. */
992static void
4a1b8f30 993bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
c6855ec5 994 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
995{
996 struct bond_slave *from = hash->slave;
997 struct bond *bond = from->bond;
998 uint64_t delta = hash->tx_bytes;
999
34582733 1000 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
f620b43a
BP
1001 "from %s to %s (now carrying %"PRIu64"kB and "
1002 "%"PRIu64"kB load, respectively)",
1003 bond->name, delta / 1024, hash - bond->hash,
1004 from->name, to->name,
1005 (from->tx_bytes - delta) / 1024,
1006 (to->tx_bytes + delta) / 1024);
1007
1008 /* Shift load away from 'from' to 'to'. */
1009 from->tx_bytes -= delta;
1010 to->tx_bytes += delta;
1011
1012 /* Arrange for flows to be revalidated. */
dc30ea2d 1013 hash->slave = to;
4a1b8f30 1014 bond->bond_revalidate = true;
f620b43a
BP
1015}
1016
09a5d390
BP
1017/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1018 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
1019 * given that doing so must decrease the ratio of the load on the two slaves by
1020 * at least 0.1. Returns NULL if there is no appropriate entry.
1021 *
1022 * The list of entries isn't sorted. I don't know of a reason to prefer to
1023 * shift away small hashes or large hashes. */
1024static struct bond_entry *
1025choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
c6855ec5 1026 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
1027{
1028 struct bond_entry *e;
1029
1030 if (list_is_short(&from->entries)) {
1031 /* 'from' carries no more than one MAC hash, so shifting load away from
1032 * it would be pointless. */
1033 return NULL;
1034 }
1035
1036 LIST_FOR_EACH (e, list_node, &from->entries) {
1037 double old_ratio, new_ratio;
1038 uint64_t delta;
1039
1040 if (to_tx_bytes == 0) {
1041 /* Nothing on the new slave, move it. */
1042 return e;
1043 }
1044
1045 delta = e->tx_bytes;
1046 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1047 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
75fad143
ZK
1048 if (old_ratio - new_ratio > 0.1
1049 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1050 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1051 and 'to' slave have the same load. Therefore, we only move an
1052 entry if it decreases the load on 'from', and brings us closer
1053 to equal traffic load. */
f620b43a
BP
1054 return e;
1055 }
1056 }
1057
1058 return NULL;
1059}
1060
1061/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1062 * maintained. */
1063static void
1064insert_bal(struct list *bals, struct bond_slave *slave)
1065{
1066 struct bond_slave *pos;
1067
1068 LIST_FOR_EACH (pos, bal_node, bals) {
1069 if (slave->tx_bytes > pos->tx_bytes) {
1070 break;
1071 }
1072 }
1073 list_insert(&pos->bal_node, &slave->bal_node);
1074}
1075
1076/* Removes 'slave' from its current list and then inserts it into 'bals' so
1077 * that descending order of 'tx_bytes' is maintained. */
1078static void
1079reinsert_bal(struct list *bals, struct bond_slave *slave)
1080{
1081 list_remove(&slave->bal_node);
1082 insert_bal(bals, slave);
1083}
1084
1085/* If 'bond' needs rebalancing, does so.
1086 *
adcf00ba
AZ
1087 * The caller should have called bond_account() for each active flow, or in case
1088 * of recirculation is used, have called bond_recirculation_account(bond),
1089 * to ensure that flow data is consistently accounted at this point.
60cda7d6
AZ
1090 */
1091void
4a1b8f30 1092bond_rebalance(struct bond *bond)
f620b43a
BP
1093{
1094 struct bond_slave *slave;
1095 struct bond_entry *e;
1096 struct list bals;
adcf00ba 1097 bool rebalanced = false;
60cda7d6 1098 bool use_recirc;
f620b43a 1099
3bfd3972 1100 ovs_rwlock_wrlock(&rwlock);
1b137691 1101 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
adcf00ba 1102 goto done;
f620b43a
BP
1103 }
1104 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1105
60cda7d6
AZ
1106 use_recirc = ofproto_dpif_get_enable_recirc(bond->ofproto) &&
1107 bond_may_recirc(bond, NULL, NULL);
1108
1109 if (use_recirc) {
1110 bond_recirculation_account(bond);
1111 }
1112
f620b43a
BP
1113 /* Add each bond_entry to its slave's 'entries' list.
1114 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1115 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1116 slave->tx_bytes = 0;
1117 list_init(&slave->entries);
1118 }
1119 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1120 if (e->slave && e->tx_bytes) {
1121 e->slave->tx_bytes += e->tx_bytes;
1122 list_push_back(&e->slave->entries, &e->list_node);
1123 }
1124 }
1125
1126 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1127 *
1128 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1129 * with a proper list sort algorithm. */
1130 list_init(&bals);
1131 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1132 if (slave->enabled) {
1133 insert_bal(&bals, slave);
1134 }
1135 }
1136 log_bals(bond, &bals);
1137
1138 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1139 while (!list_is_short(&bals)) {
1140 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1141 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1142 uint64_t overload;
1143
1144 overload = from->tx_bytes - to->tx_bytes;
1145 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1146 /* The extra load on 'from' (and all less-loaded slaves), compared
1147 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1148 * it is less than ~1Mbps. No point in rebalancing. */
1149 break;
1150 }
1151
09a5d390
BP
1152 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1153 * to move from 'from' to 'to'. */
f620b43a
BP
1154 e = choose_entry_to_migrate(from, to->tx_bytes);
1155 if (e) {
4a1b8f30 1156 bond_shift_load(e, to);
f620b43a
BP
1157
1158 /* Delete element from from->entries.
1159 *
1160 * We don't add the element to to->hashes. That would only allow
1161 * 'e' to be migrated to another slave in this rebalancing run, and
1162 * there is no point in doing that. */
1163 list_remove(&e->list_node);
1164
1165 /* Re-sort 'bals'. */
1166 reinsert_bal(&bals, from);
1167 reinsert_bal(&bals, to);
60cda7d6 1168 rebalanced = true;
f620b43a
BP
1169 } else {
1170 /* Can't usefully migrate anything away from 'from'.
1171 * Don't reconsider it. */
1172 list_remove(&from->bal_node);
1173 }
1174 }
1175
1176 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1177 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1178 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1179 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1180 e->tx_bytes /= 2;
f620b43a 1181 }
adcf00ba
AZ
1182
1183done:
60cda7d6
AZ
1184 if (use_recirc && rebalanced) {
1185 bond_update_post_recirc_rules(bond,true);
1186 }
3bfd3972 1187 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1188}
1189\f
1190/* Bonding unixctl user interface functions. */
1191
1192static struct bond *
3bfd3972 1193bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
1194{
1195 struct bond *bond;
1196
1197 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
3bfd3972 1198 all_bonds) {
f620b43a
BP
1199 if (!strcmp(bond->name, name)) {
1200 return bond;
1201 }
1202 }
1203 return NULL;
1204}
1205
1206static struct bond_slave *
1207bond_lookup_slave(struct bond *bond, const char *slave_name)
1208{
1209 struct bond_slave *slave;
1210
1211 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1212 if (!strcmp(slave->name, slave_name)) {
1213 return slave;
1214 }
1215 }
1216 return NULL;
1217}
1218
1219static void
1220bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
1221 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1222 void *aux OVS_UNUSED)
f620b43a
BP
1223{
1224 struct ds ds = DS_EMPTY_INITIALIZER;
1225 const struct bond *bond;
1226
adcf00ba 1227 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
f620b43a 1228
3bfd3972
EJ
1229 ovs_rwlock_rdlock(&rwlock);
1230 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
f620b43a
BP
1231 const struct bond_slave *slave;
1232 size_t i;
1233
adcf00ba
AZ
1234 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1235 bond_mode_to_string(bond->balance), bond->recirc_id);
f620b43a
BP
1236
1237 i = 0;
1238 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1239 if (i++ > 0) {
1240 ds_put_cstr(&ds, ", ");
1241 }
1242 ds_put_cstr(&ds, slave->name);
1243 }
1244 ds_put_char(&ds, '\n');
1245 }
3bfd3972 1246 ovs_rwlock_unlock(&rwlock);
bde9f75d 1247 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
1248 ds_destroy(&ds);
1249}
1250
1251static void
c33a8a25 1252bond_print_details(struct ds *ds, const struct bond *bond)
3bfd3972 1253 OVS_REQ_RDLOCK(rwlock)
f620b43a 1254{
fc1d4f01
EJ
1255 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1256 const struct shash_node **sorted_slaves = NULL;
f620b43a 1257 const struct bond_slave *slave;
adcf00ba
AZ
1258 bool may_recirc;
1259 uint32_t recirc_id;
fc1d4f01 1260 int i;
f620b43a 1261
c33a8a25
EJ
1262 ds_put_format(ds, "---- %s ----\n", bond->name);
1263 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
1264 bond_mode_to_string(bond->balance));
1265
adcf00ba
AZ
1266 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1267 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1268 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1269
c33a8a25 1270 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 1271
c33a8a25
EJ
1272 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1273 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 1274
1b137691 1275 if (bond_is_balanced(bond)) {
c33a8a25 1276 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
1277 bond->next_rebalance - time_msec());
1278 }
1279
bdebeece
EJ
1280 ds_put_cstr(ds, "lacp_status: ");
1281 switch (bond->lacp_status) {
1282 case LACP_NEGOTIATED:
1283 ds_put_cstr(ds, "negotiated\n");
1284 break;
1285 case LACP_CONFIGURED:
1286 ds_put_cstr(ds, "configured\n");
1287 break;
1288 case LACP_DISABLED:
1289 ds_put_cstr(ds, "off\n");
1290 break;
1291 default:
1292 ds_put_cstr(ds, "<unknown>\n");
1293 break;
1294 }
4d6fb5eb 1295
f620b43a 1296 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
1297 shash_add(&slave_shash, slave->name, slave);
1298 }
1299 sorted_slaves = shash_sort(&slave_shash);
1300
1301 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 1302 struct bond_entry *be;
f620b43a 1303
fc1d4f01
EJ
1304 slave = sorted_slaves[i]->data;
1305
f620b43a 1306 /* Basic info. */
c33a8a25 1307 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
1308 slave->name, slave->enabled ? "enabled" : "disabled");
1309 if (slave == bond->active_slave) {
c33a8a25 1310 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
1311 }
1312 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 1313 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
1314 slave->enabled ? "downdelay" : "updelay",
1315 slave->delay_expires - time_msec());
1316 }
1317
c33a8a25 1318 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 1319 slave->may_enable ? "true" : "false");
4d6fb5eb 1320
1b137691 1321 if (!bond_is_balanced(bond)) {
f620b43a
BP
1322 continue;
1323 }
1324
1325 /* Hashes. */
f620b43a
BP
1326 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1327 int hash = be - bond->hash;
f6ba1f35 1328 uint64_t be_tx_k;
f620b43a
BP
1329
1330 if (be->slave != slave) {
1331 continue;
1332 }
1333
f6ba1f35
AZ
1334 be_tx_k = be->tx_bytes / 1024;
1335 if (be_tx_k) {
1336 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1337 hash, be_tx_k);
1338 }
f620b43a 1339
7b9f1974 1340 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1341 }
1342 }
fc1d4f01
EJ
1343 shash_destroy(&slave_shash);
1344 free(sorted_slaves);
c33a8a25
EJ
1345 ds_put_cstr(ds, "\n");
1346}
1347
1348static void
1349bond_unixctl_show(struct unixctl_conn *conn,
1350 int argc, const char *argv[],
1351 void *aux OVS_UNUSED)
1352{
1353 struct ds ds = DS_EMPTY_INITIALIZER;
1354
3bfd3972 1355 ovs_rwlock_rdlock(&rwlock);
c33a8a25
EJ
1356 if (argc > 1) {
1357 const struct bond *bond = bond_find(argv[1]);
1358
1359 if (!bond) {
bde9f75d 1360 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1361 goto out;
c33a8a25
EJ
1362 }
1363 bond_print_details(&ds, bond);
1364 } else {
1365 const struct bond *bond;
1366
3bfd3972 1367 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
c33a8a25
EJ
1368 bond_print_details(&ds, bond);
1369 }
1370 }
1371
bde9f75d 1372 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a 1373 ds_destroy(&ds);
3bfd3972
EJ
1374
1375out:
1376 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1377}
1378
1379static void
0e15264f
BP
1380bond_unixctl_migrate(struct unixctl_conn *conn,
1381 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1382 void *aux OVS_UNUSED)
1383{
0e15264f
BP
1384 const char *bond_s = argv[1];
1385 const char *hash_s = argv[2];
1386 const char *slave_s = argv[3];
f620b43a
BP
1387 struct bond *bond;
1388 struct bond_slave *slave;
1389 struct bond_entry *entry;
1390 int hash;
1391
3bfd3972 1392 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1393 bond = bond_find(bond_s);
1394 if (!bond) {
bde9f75d 1395 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1396 goto out;
f620b43a
BP
1397 }
1398
1399 if (bond->balance != BM_SLB) {
bde9f75d 1400 unixctl_command_reply_error(conn, "not an SLB bond");
3bfd3972 1401 goto out;
f620b43a
BP
1402 }
1403
1404 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1405 hash = atoi(hash_s) & BOND_MASK;
1406 } else {
bde9f75d 1407 unixctl_command_reply_error(conn, "bad hash");
3bfd3972 1408 goto out;
f620b43a
BP
1409 }
1410
1411 slave = bond_lookup_slave(bond, slave_s);
1412 if (!slave) {
bde9f75d 1413 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1414 goto out;
f620b43a
BP
1415 }
1416
1417 if (!slave->enabled) {
bde9f75d 1418 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
3bfd3972 1419 goto out;
f620b43a
BP
1420 }
1421
1422 entry = &bond->hash[hash];
4a1b8f30 1423 bond->bond_revalidate = true;
f620b43a 1424 entry->slave = slave;
bde9f75d 1425 unixctl_command_reply(conn, "migrated");
3bfd3972
EJ
1426
1427out:
1428 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1429}
1430
1431static void
0e15264f
BP
1432bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1433 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1434 void *aux OVS_UNUSED)
1435{
0e15264f
BP
1436 const char *bond_s = argv[1];
1437 const char *slave_s = argv[2];
f620b43a
BP
1438 struct bond *bond;
1439 struct bond_slave *slave;
1440
3bfd3972 1441 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1442 bond = bond_find(bond_s);
1443 if (!bond) {
bde9f75d 1444 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1445 goto out;
f620b43a
BP
1446 }
1447
1448 slave = bond_lookup_slave(bond, slave_s);
1449 if (!slave) {
bde9f75d 1450 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1451 goto out;
f620b43a
BP
1452 }
1453
1454 if (!slave->enabled) {
bde9f75d 1455 unixctl_command_reply_error(conn, "cannot make disabled slave active");
3bfd3972 1456 goto out;
f620b43a
BP
1457 }
1458
1459 if (bond->active_slave != slave) {
4a1b8f30 1460 bond->bond_revalidate = true;
f620b43a 1461 bond->active_slave = slave;
f620b43a
BP
1462 VLOG_INFO("bond %s: active interface is now %s",
1463 bond->name, slave->name);
1464 bond->send_learning_packets = true;
bde9f75d 1465 unixctl_command_reply(conn, "done");
f620b43a 1466 } else {
bde9f75d 1467 unixctl_command_reply(conn, "no change");
f620b43a 1468 }
3bfd3972
EJ
1469out:
1470 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1471}
1472
1473static void
0e15264f 1474enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1475{
0e15264f
BP
1476 const char *bond_s = argv[1];
1477 const char *slave_s = argv[2];
f620b43a
BP
1478 struct bond *bond;
1479 struct bond_slave *slave;
1480
3bfd3972 1481 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1482 bond = bond_find(bond_s);
1483 if (!bond) {
bde9f75d 1484 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1485 goto out;
f620b43a
BP
1486 }
1487
1488 slave = bond_lookup_slave(bond, slave_s);
1489 if (!slave) {
bde9f75d 1490 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1491 goto out;
f620b43a
BP
1492 }
1493
4a1b8f30 1494 bond_enable_slave(slave, enable);
bde9f75d 1495 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
3bfd3972
EJ
1496
1497out:
1498 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1499}
1500
1501static void
0e15264f
BP
1502bond_unixctl_enable_slave(struct unixctl_conn *conn,
1503 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1504 void *aux OVS_UNUSED)
1505{
0e15264f 1506 enable_slave(conn, argv, true);
f620b43a
BP
1507}
1508
1509static void
0e15264f
BP
1510bond_unixctl_disable_slave(struct unixctl_conn *conn,
1511 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1512 void *aux OVS_UNUSED)
1513{
0e15264f 1514 enable_slave(conn, argv, false);
f620b43a
BP
1515}
1516
1517static void
0e15264f 1518bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1519 void *aux OVS_UNUSED)
1520{
0e15264f
BP
1521 const char *mac_s = argv[1];
1522 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1523 const char *basis_s = argc > 3 ? argv[3] : NULL;
f620b43a
BP
1524 uint8_t mac[ETH_ADDR_LEN];
1525 uint8_t hash;
1526 char *hash_cstr;
1527 unsigned int vlan;
672d18b2 1528 uint32_t basis;
f620b43a
BP
1529
1530 if (vlan_s) {
c2c28dfd 1531 if (!ovs_scan(vlan_s, "%u", &vlan)) {
bde9f75d 1532 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1533 return;
1534 }
1535 } else {
dc155bff 1536 vlan = 0;
f620b43a
BP
1537 }
1538
672d18b2 1539 if (basis_s) {
c2c28dfd 1540 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
bde9f75d 1541 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1542 return;
1543 }
1544 } else {
1545 basis = 0;
1546 }
1547
c2c28dfd 1548 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
672d18b2 1549 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1550
1551 hash_cstr = xasprintf("%u", hash);
bde9f75d 1552 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1553 free(hash_cstr);
1554 } else {
bde9f75d 1555 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1556 }
1557}
1558
1559void
1560bond_init(void)
1561{
0e15264f 1562 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1563 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1564 NULL);
0e15264f 1565 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1566 bond_unixctl_migrate, NULL);
0e15264f 1567 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1568 bond_unixctl_set_active_slave, NULL);
0e15264f 1569 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1570 bond_unixctl_enable_slave, NULL);
0e15264f 1571 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1572 bond_unixctl_disable_slave, NULL);
0e15264f 1573 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1574 bond_unixctl_hash, NULL);
f620b43a
BP
1575}
1576\f
95aafb2a
EJ
1577static void
1578bond_entry_reset(struct bond *bond)
1579{
1580 if (bond->balance != BM_AB) {
9e1a6910 1581 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
95aafb2a
EJ
1582
1583 if (!bond->hash) {
1584 bond->hash = xmalloc(hash_len);
1585 }
1586 memset(bond->hash, 0, hash_len);
1587
1588 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1589 } else {
1590 free(bond->hash);
1591 bond->hash = NULL;
1592 }
1593}
1594
f620b43a
BP
1595static struct bond_slave *
1596bond_slave_lookup(struct bond *bond, const void *slave_)
1597{
1598 struct bond_slave *slave;
1599
1600 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1601 &bond->slaves) {
1602 if (slave->aux == slave_) {
1603 return slave;
1604 }
1605 }
1606
1607 return NULL;
1608}
1609
f620b43a 1610static void
4a1b8f30 1611bond_enable_slave(struct bond_slave *slave, bool enable)
f620b43a
BP
1612{
1613 slave->delay_expires = LLONG_MAX;
1614 if (enable != slave->enabled) {
4a1b8f30 1615 slave->bond->bond_revalidate = true;
f620b43a 1616 slave->enabled = enable;
f1c8a79c
AW
1617
1618 ovs_mutex_lock(&slave->bond->mutex);
1619 if (enable) {
1620 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1621 } else {
1622 list_remove(&slave->list_node);
1623 }
1624 ovs_mutex_unlock(&slave->bond->mutex);
1625
4a1b8f30
EJ
1626 VLOG_INFO("interface %s: %s", slave->name,
1627 slave->enabled ? "enabled" : "disabled");
f620b43a
BP
1628 }
1629}
1630
1631static void
4a1b8f30 1632bond_link_status_update(struct bond_slave *slave)
f620b43a
BP
1633{
1634 struct bond *bond = slave->bond;
1635 bool up;
1636
296f6519 1637 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1638 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1639 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1640 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1641 slave->name, up ? "up" : "down");
1642 if (up == slave->enabled) {
1643 slave->delay_expires = LLONG_MAX;
1644 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1645 slave->name, up ? "disabled" : "enabled");
1646 } else {
bdebeece 1647 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1648 : up ? bond->updelay : bond->downdelay);
1649 slave->delay_expires = time_msec() + delay;
1650 if (delay) {
1651 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1652 "for %d ms",
1653 slave->name,
1654 up ? "enabled" : "disabled",
1655 up ? "up" : "down",
1656 delay);
1657 }
1658 }
1659 }
1660
1661 if (time_msec() >= slave->delay_expires) {
4a1b8f30 1662 bond_enable_slave(slave, up);
f620b43a
BP
1663 }
1664}
1665
f620b43a 1666static unsigned int
672d18b2 1667bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
f620b43a 1668{
7e36ac42 1669 return hash_mac(mac, vlan, basis);
f620b43a
BP
1670}
1671
1672static unsigned int
672d18b2 1673bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
f620b43a
BP
1674{
1675 struct flow hash_flow = *flow;
d84d4b88 1676 hash_flow.vlan_tci = htons(vlan);
f620b43a
BP
1677
1678 /* The symmetric quality of this hash function is not required, but
1679 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1680 * purposes, so we use it out of convenience. */
672d18b2 1681 return flow_hash_symmetric_l4(&hash_flow, basis);
f620b43a
BP
1682}
1683
fb0b29a3
EJ
1684static unsigned int
1685bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1686{
cb22974d 1687 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1688
bdebeece 1689 return (bond->balance == BM_TCP
672d18b2
EJ
1690 ? bond_hash_tcp(flow, vlan, bond->basis)
1691 : bond_hash_src(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1692}
1693
f620b43a
BP
1694static struct bond_entry *
1695lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1696 uint16_t vlan)
1697{
fb0b29a3 1698 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1699}
1700
f1c8a79c
AW
1701/* Selects and returns an enabled slave from the 'enabled_slaves' list
1702 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1703 * returns NULL. */
1704static struct bond_slave *
1705get_enabled_slave(struct bond *bond)
1706{
1707 struct list *node;
1708
1709 ovs_mutex_lock(&bond->mutex);
1710 if (list_is_empty(&bond->enabled_slaves)) {
1711 ovs_mutex_unlock(&bond->mutex);
1712 return NULL;
1713 }
1714
1715 node = list_pop_front(&bond->enabled_slaves);
1716 list_push_back(&bond->enabled_slaves, node);
1717 ovs_mutex_unlock(&bond->mutex);
1718
1719 return CONTAINER_OF(node, struct bond_slave, list_node);
1720}
1721
f620b43a
BP
1722static struct bond_slave *
1723choose_output_slave(const struct bond *bond, const struct flow *flow,
4a1b8f30 1724 struct flow_wildcards *wc, uint16_t vlan)
f620b43a
BP
1725{
1726 struct bond_entry *e;
9dd165e0 1727 int balance;
f620b43a 1728
9dd165e0 1729 balance = bond->balance;
bdebeece
EJ
1730 if (bond->lacp_status == LACP_CONFIGURED) {
1731 /* LACP has been configured on this bond but negotiations were
9dd165e0
RK
1732 * unsuccussful. If lacp_fallback_ab is enabled use active-
1733 * backup mode else drop all traffic. */
1734 if (!bond->lacp_fallback_ab) {
1735 return NULL;
1736 }
1737 balance = BM_AB;
bdebeece
EJ
1738 }
1739
9dd165e0 1740 switch (balance) {
f620b43a
BP
1741 case BM_AB:
1742 return bond->active_slave;
1743
f620b43a 1744 case BM_TCP:
bdebeece
EJ
1745 if (bond->lacp_status != LACP_NEGOTIATED) {
1746 /* Must have LACP negotiations for TCP balanced bonds. */
1747 return NULL;
1748 }
bcd2633a 1749 if (wc) {
6cdd5145 1750 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
bcd2633a 1751 }
bdebeece
EJ
1752 /* Fall Through. */
1753 case BM_SLB:
bcd2633a 1754 if (wc) {
6cdd5145 1755 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
bcd2633a 1756 }
f620b43a
BP
1757 e = lookup_bond_entry(bond, flow, vlan);
1758 if (!e->slave || !e->slave->enabled) {
f1c8a79c 1759 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
f620b43a
BP
1760 }
1761 return e->slave;
1762
1763 default:
428b2edd 1764 OVS_NOT_REACHED();
f620b43a
BP
1765 }
1766}
1767
1768static struct bond_slave *
1769bond_choose_slave(const struct bond *bond)
1770{
1771 struct bond_slave *slave, *best;
1772
1773 /* Find an enabled slave. */
1774 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1775 if (slave->enabled) {
1776 return slave;
1777 }
1778 }
1779
1780 /* All interfaces are disabled. Find an interface that will be enabled
1781 * after its updelay expires. */
1782 best = NULL;
1783 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1784 if (slave->delay_expires != LLONG_MAX
296f6519 1785 && slave->may_enable
f620b43a
BP
1786 && (!best || slave->delay_expires < best->delay_expires)) {
1787 best = slave;
1788 }
1789 }
1790 return best;
1791}
1792
1793static void
4a1b8f30 1794bond_choose_active_slave(struct bond *bond)
f620b43a
BP
1795{
1796 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1797 struct bond_slave *old_active_slave = bond->active_slave;
1798
1799 bond->active_slave = bond_choose_slave(bond);
1800 if (bond->active_slave) {
1801 if (bond->active_slave->enabled) {
1802 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1803 bond->name, bond->active_slave->name);
1804 } else {
1805 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1806 "remaining %lld ms updelay (since no interface was "
1807 "enabled)", bond->name, bond->active_slave->name,
1808 bond->active_slave->delay_expires - time_msec());
4a1b8f30 1809 bond_enable_slave(bond->active_slave, true);
f620b43a
BP
1810 }
1811
1812 bond->send_learning_packets = true;
1813 } else if (old_active_slave) {
d28b9ead 1814 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1815 }
1816}
1817
f620b43a
BP
1818/* Attempts to make the sum of the bond slaves' statistics appear on the fake
1819 * bond interface. */
1820static void
1821bond_update_fake_slave_stats(struct bond *bond)
1822{
1823 struct netdev_stats bond_stats;
1824 struct bond_slave *slave;
1825 struct netdev *bond_dev;
1826
1827 memset(&bond_stats, 0, sizeof bond_stats);
1828
1829 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1830 struct netdev_stats slave_stats;
1831
1832 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1833 /* XXX: We swap the stats here because they are swapped back when
1834 * reported by the internal device. The reason for this is
1835 * internal devices normally represent packets going into the
1836 * system but when used as fake bond device they represent packets
1837 * leaving the system. We really should do this in the internal
1838 * device itself because changing it here reverses the counts from
1839 * the perspective of the switch. However, the internal device
1840 * doesn't know what type of device it represents so we have to do
1841 * it here for now. */
1842 bond_stats.tx_packets += slave_stats.rx_packets;
1843 bond_stats.tx_bytes += slave_stats.rx_bytes;
1844 bond_stats.rx_packets += slave_stats.tx_packets;
1845 bond_stats.rx_bytes += slave_stats.tx_bytes;
1846 }
1847 }
1848
18812dff 1849 if (!netdev_open(bond->name, "system", &bond_dev)) {
f620b43a
BP
1850 netdev_set_stats(bond_dev, &bond_stats);
1851 netdev_close(bond_dev);
1852 }
1853}