]> git.proxmox.com Git - mirror_ovs.git/blame - ofproto/bond.c
ofproto-dpif: Add recirc_id field to struct rule_dpif
[mirror_ovs.git] / ofproto / bond.c
CommitLineData
f620b43a 1/*
8917f72c 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a 25
adcf00ba
AZ
26#include "ofp-util.h"
27#include "ofp-actions.h"
28#include "ofpbuf.h"
29#include "ofproto/ofproto-provider.h"
30#include "ofproto/ofproto-dpif.h"
da4a6191 31#include "connectivity.h"
f620b43a
BP
32#include "coverage.h"
33#include "dynamic-string.h"
34#include "flow.h"
35#include "hmap.h"
bdebeece 36#include "lacp.h"
f620b43a
BP
37#include "list.h"
38#include "netdev.h"
39#include "odp-util.h"
40#include "ofpbuf.h"
41#include "packets.h"
42#include "poll-loop.h"
da4a6191 43#include "seq.h"
adcf00ba 44#include "match.h"
fc1d4f01 45#include "shash.h"
f620b43a
BP
46#include "timeval.h"
47#include "unixctl.h"
48#include "vlog.h"
49
50VLOG_DEFINE_THIS_MODULE(bond);
51
f1c8a79c
AW
52static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
53static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
55
9e1a6910 56/* Bit-mask for hashing a flow down to a bucket. */
f620b43a 57#define BOND_MASK 0xff
9e1a6910 58#define BOND_BUCKETS (BOND_MASK + 1)
f620b43a
BP
59
60/* A hash bucket for mapping a flow to a slave.
9e1a6910 61 * "struct bond" has an array of BOND_BUCKETS of these. */
f620b43a
BP
62struct bond_entry {
63 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
c6855ec5
JS
64 uint64_t tx_bytes /* Count of bytes recently transmitted. */
65 OVS_GUARDED_BY(rwlock);
f620b43a 66 struct list list_node; /* In bond_slave's 'entries' list. */
adcf00ba 67
c6855ec5
JS
68 /* Recirculation.
69 *
70 * 'pr_rule' is the post-recirculation rule for this entry.
71 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
72 * is used to determine delta (applied to 'tx_bytes' above.) */
73 struct rule *pr_rule;
74 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
f620b43a
BP
75};
76
77/* A bond slave, that is, one of the links comprising a bond. */
78struct bond_slave {
79 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
f1c8a79c 80 struct list list_node; /* In struct bond's enabled_slaves list. */
f620b43a
BP
81 struct bond *bond; /* The bond that contains this slave. */
82 void *aux; /* Client-provided handle for this slave. */
83
84 struct netdev *netdev; /* Network device, owned by the client. */
1ea24138 85 unsigned int change_seq; /* Tracks changes in 'netdev'. */
adcf00ba 86 ofp_port_t ofp_port; /* Open flow port number */
f620b43a
BP
87 char *name; /* Name (a copy of netdev_get_name(netdev)). */
88
89 /* Link status. */
90 long long delay_expires; /* Time after which 'enabled' may change. */
f620b43a 91 bool enabled; /* May be chosen for flows? */
296f6519 92 bool may_enable; /* Client considers this slave bondable. */
f620b43a
BP
93
94 /* Rebalancing info. Used only by bond_rebalance(). */
95 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
96 struct list entries; /* 'struct bond_entry's assigned here. */
97 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
98};
99
100/* A bond, that is, a set of network devices grouped to improve performance or
101 * robustness. */
102struct bond {
103 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
104 char *name; /* Name provided by client. */
adcf00ba 105 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
f620b43a
BP
106
107 /* Slaves. */
108 struct hmap slaves;
109
f1c8a79c
AW
110 /* Enabled slaves.
111 *
112 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
113 * (To prevent the bond_slave from disappearing they must also hold
114 * 'rwlock'.) */
115 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
116 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
117
f620b43a
BP
118 /* Bonding info. */
119 enum bond_mode balance; /* Balancing mode, one of BM_*. */
120 struct bond_slave *active_slave;
f620b43a 121 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
bdebeece 122 enum lacp_status lacp_status; /* Status of LACP negotiations. */
62904702 123 bool bond_revalidate; /* True if flows need revalidation. */
672d18b2 124 uint32_t basis; /* Basis for flow hash function. */
f620b43a
BP
125
126 /* SLB specific bonding info. */
9e1a6910 127 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
f620b43a
BP
128 int rebalance_interval; /* Interval between rebalances, in ms. */
129 long long int next_rebalance; /* Next rebalancing time. */
130 bool send_learning_packets;
adcf00ba
AZ
131 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
132 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
f620b43a 133
f620b43a
BP
134 /* Legacy compatibility. */
135 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
9dd165e0 136 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
f620b43a 137
37bec3d3 138 struct ovs_refcount ref_cnt;
f620b43a
BP
139};
140
adcf00ba
AZ
/* The action pending for a struct bond_pr_rule_op. */
enum bond_op {
    ADD,    /* Install the rule in ofproto's flow table. */
    DEL,    /* Remove the rule from ofproto's flow table. */
};
146
147/* A rule to add to or delete from ofproto's internal flow table. */
148struct bond_pr_rule_op {
149 struct hmap_node hmap_node;
150 struct match match;
151 ofp_port_t out_ofport;
152 enum bond_op op;
6c932bc8 153 struct rule **pr_rule;
adcf00ba
AZ
154};
155
3bfd3972
EJ
156static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
157static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
158 OVS_REQ_RDLOCK(rwlock);
4a1b8f30
EJ
159static void bond_enable_slave(struct bond_slave *, bool enable)
160 OVS_REQ_WRLOCK(rwlock);
161static void bond_link_status_update(struct bond_slave *)
3bfd3972 162 OVS_REQ_WRLOCK(rwlock);
4a1b8f30 163static void bond_choose_active_slave(struct bond *)
9e1a6910 164 OVS_REQ_WRLOCK(rwlock);
f620b43a 165static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
672d18b2
EJ
166 uint16_t vlan, uint32_t basis);
167static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
168 uint32_t basis);
f620b43a
BP
169static struct bond_entry *lookup_bond_entry(const struct bond *,
170 const struct flow *,
3bfd3972
EJ
171 uint16_t vlan)
172 OVS_REQ_RDLOCK(rwlock);
f1c8a79c
AW
173static struct bond_slave *get_enabled_slave(struct bond *)
174 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
175static struct bond_slave *choose_output_slave(const struct bond *,
176 const struct flow *,
bcd2633a 177 struct flow_wildcards *,
4a1b8f30 178 uint16_t vlan)
3bfd3972
EJ
179 OVS_REQ_RDLOCK(rwlock);
180static void bond_update_fake_slave_stats(struct bond *)
181 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
182
183/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
184 * stores the mode in '*balance' and returns true. Otherwise returns false
185 * without modifying '*balance'. */
186bool
187bond_mode_from_string(enum bond_mode *balance, const char *s)
188{
189 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
190 *balance = BM_TCP;
191 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
192 *balance = BM_SLB;
193 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
194 *balance = BM_AB;
195 } else {
196 return false;
197 }
198 return true;
199}
200
201/* Returns a string representing 'balance'. */
202const char *
203bond_mode_to_string(enum bond_mode balance) {
204 switch (balance) {
205 case BM_TCP:
206 return "balance-tcp";
207 case BM_SLB:
208 return "balance-slb";
209 case BM_AB:
210 return "active-backup";
211 }
428b2edd 212 OVS_NOT_REACHED();
f620b43a
BP
213}
214
f620b43a
BP
215\f
216/* Creates and returns a new bond whose configuration is initially taken from
217 * 's'.
218 *
219 * The caller should register each slave on the new bond by calling
220 * bond_slave_register(). */
221struct bond *
adcf00ba 222bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
f620b43a
BP
223{
224 struct bond *bond;
225
226 bond = xzalloc(sizeof *bond);
adcf00ba 227 bond->ofproto = ofproto;
f620b43a 228 hmap_init(&bond->slaves);
f1c8a79c
AW
229 list_init(&bond->enabled_slaves);
230 ovs_mutex_init(&bond->mutex);
f620b43a 231 bond->next_fake_iface_update = LLONG_MAX;
37bec3d3 232 ovs_refcount_init(&bond->ref_cnt);
f620b43a 233
adcf00ba
AZ
234 bond->recirc_id = 0;
235 hmap_init(&bond->pr_rule_ops);
236
f620b43a 237 bond_reconfigure(bond, s);
f620b43a
BP
238 return bond;
239}
240
03366a2d
EJ
241struct bond *
242bond_ref(const struct bond *bond_)
243{
244 struct bond *bond = CONST_CAST(struct bond *, bond_);
245
bca0b3b4 246 if (bond) {
37bec3d3 247 ovs_refcount_ref(&bond->ref_cnt);
bca0b3b4 248 }
03366a2d
EJ
249 return bond;
250}
251
f620b43a
BP
252/* Frees 'bond'. */
253void
03366a2d 254bond_unref(struct bond *bond)
f620b43a
BP
255{
256 struct bond_slave *slave, *next_slave;
adcf00ba 257 struct bond_pr_rule_op *pr_op, *next_op;
f620b43a 258
37bec3d3 259 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
03366a2d
EJ
260 return;
261 }
262
3bfd3972
EJ
263 ovs_rwlock_wrlock(&rwlock);
264 hmap_remove(all_bonds, &bond->hmap_node);
265 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
266
267 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
268 hmap_remove(&bond->slaves, &slave->hmap_node);
269 /* Client owns 'slave->netdev'. */
270 free(slave->name);
271 free(slave);
272 }
273 hmap_destroy(&bond->slaves);
274
f1c8a79c 275 ovs_mutex_destroy(&bond->mutex);
f620b43a 276 free(bond->hash);
f620b43a 277 free(bond->name);
adcf00ba
AZ
278
279 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
280 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
281 free(pr_op);
282 }
283 hmap_destroy(&bond->pr_rule_ops);
284
285 if (bond->recirc_id) {
286 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
287 }
288
f620b43a
BP
289 free(bond);
290}
291
adcf00ba
AZ
292static void
293add_pr_rule(struct bond *bond, const struct match *match,
6c932bc8 294 ofp_port_t out_ofport, struct rule **rule)
adcf00ba
AZ
295{
296 uint32_t hash = match_hash(match, 0);
297 struct bond_pr_rule_op *pr_op;
298
299 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
300 if (match_equal(&pr_op->match, match)) {
301 pr_op->op = ADD;
302 pr_op->out_ofport = out_ofport;
303 pr_op->pr_rule = rule;
304 return;
305 }
306 }
307
308 pr_op = xmalloc(sizeof *pr_op);
309 pr_op->match = *match;
310 pr_op->op = ADD;
311 pr_op->out_ofport = out_ofport;
312 pr_op->pr_rule = rule;
313 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
314}
315
316static void
317update_recirc_rules(struct bond *bond)
318{
319 struct match match;
320 struct bond_pr_rule_op *pr_op, *next_op;
321 uint64_t ofpacts_stub[128 / 8];
322 struct ofpbuf ofpacts;
323 int i;
324
325 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
326
327 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
328 pr_op->op = DEL;
329 }
330
6c932bc8
AZ
331 if (bond->hash && bond->recirc_id) {
332 for (i = 0; i < BOND_BUCKETS; i++) {
333 struct bond_slave *slave = bond->hash[i].slave;
adcf00ba 334
6c932bc8
AZ
335 if (slave) {
336 match_init_catchall(&match);
337 match_set_recirc_id(&match, bond->recirc_id);
6c932bc8 338 match_set_dp_hash_masked(&match, i, BOND_MASK);
adcf00ba 339
6c932bc8
AZ
340 add_pr_rule(bond, &match, slave->ofp_port,
341 &bond->hash[i].pr_rule);
342 }
adcf00ba
AZ
343 }
344 }
345
346 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
347 int error;
adcf00ba
AZ
348 switch (pr_op->op) {
349 case ADD:
350 ofpbuf_clear(&ofpacts);
351 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
352 error = ofproto_dpif_add_internal_flow(bond->ofproto,
353 &pr_op->match,
354 RECIRC_RULE_PRIORITY,
6c932bc8 355 &ofpacts, pr_op->pr_rule);
adcf00ba
AZ
356 if (error) {
357 char *err_s = match_to_string(&pr_op->match,
358 RECIRC_RULE_PRIORITY);
359
360 VLOG_ERR("failed to add post recirculation flow %s", err_s);
361 free(err_s);
adcf00ba
AZ
362 }
363 break;
364
365 case DEL:
366 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
367 &pr_op->match,
368 RECIRC_RULE_PRIORITY);
369 if (error) {
370 char *err_s = match_to_string(&pr_op->match,
371 RECIRC_RULE_PRIORITY);
372
373 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
374 free(err_s);
375 }
376
377 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
6c932bc8 378 *pr_op->pr_rule = NULL;
adcf00ba
AZ
379 free(pr_op);
380 break;
381 }
382 }
383
384 ofpbuf_uninit(&ofpacts);
385}
386
387
f620b43a
BP
388/* Updates 'bond''s overall configuration to 's'.
389 *
390 * The caller should register each slave on 'bond' by calling
391 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 392 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
393 *
394 * Returns true if the configuration has changed in such a way that requires
395 * flow revalidation.
396 * */
397bool
f620b43a
BP
398bond_reconfigure(struct bond *bond, const struct bond_settings *s)
399{
59d7b2b6
EJ
400 bool revalidate = false;
401
3bfd3972 402 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
403 if (!bond->name || strcmp(bond->name, s->name)) {
404 if (bond->name) {
3bfd3972 405 hmap_remove(all_bonds, &bond->hmap_node);
f620b43a
BP
406 free(bond->name);
407 }
408 bond->name = xstrdup(s->name);
3bfd3972 409 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
f620b43a
BP
410 }
411
f620b43a
BP
412 bond->updelay = s->up_delay;
413 bond->downdelay = s->down_delay;
bc1b010c 414
9dd165e0
RK
415 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
416 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
417 revalidate = true;
418 }
419
bc1b010c
EJ
420 if (bond->rebalance_interval != s->rebalance_interval) {
421 bond->rebalance_interval = s->rebalance_interval;
422 revalidate = true;
423 }
f620b43a 424
59d7b2b6
EJ
425 if (bond->balance != s->balance) {
426 bond->balance = s->balance;
427 revalidate = true;
428 }
429
672d18b2
EJ
430 if (bond->basis != s->basis) {
431 bond->basis = s->basis;
432 revalidate = true;
433 }
434
f620b43a
BP
435 if (s->fake_iface) {
436 if (bond->next_fake_iface_update == LLONG_MAX) {
437 bond->next_fake_iface_update = time_msec();
438 }
439 } else {
440 bond->next_fake_iface_update = LLONG_MAX;
441 }
59d7b2b6 442
62904702
EJ
443 if (bond->bond_revalidate) {
444 revalidate = true;
445 bond->bond_revalidate = false;
446 }
447
adcf00ba
AZ
448 if (bond->balance != BM_AB) {
449 if (!bond->recirc_id) {
450 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
451 }
452 } else if (bond->recirc_id) {
453 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
454 bond->recirc_id = 0;
455 }
456
95aafb2a
EJ
457 if (bond->balance == BM_AB || !bond->hash || revalidate) {
458 bond_entry_reset(bond);
459 }
460
3bfd3972 461 ovs_rwlock_unlock(&rwlock);
59d7b2b6 462 return revalidate;
f620b43a
BP
463}
464
f8ddccd2 465static void
1ea24138 466bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
3bfd3972 467 OVS_REQ_WRLOCK(rwlock)
f8ddccd2
BP
468{
469 if (slave->netdev != netdev) {
f8ddccd2 470 slave->netdev = netdev;
1ea24138 471 slave->change_seq = 0;
f8ddccd2
BP
472 }
473}
474
f620b43a
BP
475/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
476 * arbitrary client-provided pointer that uniquely identifies a slave within a
477 * bond. If 'slave_' already exists within 'bond' then this function
478 * reconfigures the existing slave.
479 *
480 * 'netdev' must be the network device that 'slave_' represents. It is owned
481 * by the client, so the client must not close it before either unregistering
482 * 'slave_' or destroying 'bond'.
4d6fb5eb 483 */
f620b43a 484void
adcf00ba
AZ
485bond_slave_register(struct bond *bond, void *slave_,
486 ofp_port_t ofport, struct netdev *netdev)
f620b43a 487{
3bfd3972 488 struct bond_slave *slave;
f620b43a 489
3bfd3972
EJ
490 ovs_rwlock_wrlock(&rwlock);
491 slave = bond_slave_lookup(bond, slave_);
f620b43a
BP
492 if (!slave) {
493 slave = xzalloc(sizeof *slave);
494
495 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
496 slave->bond = bond;
497 slave->aux = slave_;
adcf00ba 498 slave->ofp_port = ofport;
f620b43a 499 slave->delay_expires = LLONG_MAX;
244b2160 500 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 501 bond->bond_revalidate = true;
244b2160 502
b3c18f66 503 slave->enabled = false;
4a1b8f30 504 bond_enable_slave(slave, netdev_get_carrier(netdev));
f620b43a
BP
505 }
506
1ea24138 507 bond_slave_set_netdev__(slave, netdev);
a6934aa9 508
f620b43a
BP
509 free(slave->name);
510 slave->name = xstrdup(netdev_get_name(netdev));
3bfd3972 511 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
512}
513
f8ddccd2
BP
514/* Updates the network device to be used with 'slave_' to 'netdev'.
515 *
516 * This is useful if the caller closes and re-opens the network device
517 * registered with bond_slave_register() but doesn't need to change anything
518 * else. */
519void
520bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
521{
3bfd3972
EJ
522 struct bond_slave *slave;
523
524 ovs_rwlock_wrlock(&rwlock);
525 slave = bond_slave_lookup(bond, slave_);
f8ddccd2 526 if (slave) {
1ea24138 527 bond_slave_set_netdev__(slave, netdev);
f8ddccd2 528 }
3bfd3972 529 ovs_rwlock_unlock(&rwlock);
f8ddccd2
BP
530}
531
f620b43a
BP
532/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
533 * then this function has no effect.
534 *
535 * Unregistering a slave invalidates all flows. */
536void
537bond_slave_unregister(struct bond *bond, const void *slave_)
538{
3bfd3972 539 struct bond_slave *slave;
f620b43a
BP
540 bool del_active;
541
3bfd3972
EJ
542 ovs_rwlock_wrlock(&rwlock);
543 slave = bond_slave_lookup(bond, slave_);
f620b43a 544 if (!slave) {
3bfd3972 545 goto out;
f620b43a
BP
546 }
547
4a1b8f30
EJ
548 bond->bond_revalidate = true;
549 bond_enable_slave(slave, false);
b3c18f66 550
f620b43a
BP
551 del_active = bond->active_slave == slave;
552 if (bond->hash) {
553 struct bond_entry *e;
554 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
555 if (e->slave == slave) {
556 e->slave = NULL;
557 }
558 }
559 }
560
561 free(slave->name);
562
563 hmap_remove(&bond->slaves, &slave->hmap_node);
564 /* Client owns 'slave->netdev'. */
565 free(slave);
566
567 if (del_active) {
4a1b8f30 568 bond_choose_active_slave(bond);
f620b43a
BP
569 bond->send_learning_packets = true;
570 }
3bfd3972
EJ
571out:
572 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
573}
574
296f6519
EJ
575/* Should be called on each slave in 'bond' before bond_run() to indicate
576 * whether or not 'slave_' may be enabled. This function is intended to allow
577 * other protocols to have some impact on bonding decisions. For example LACP
578 * or high level link monitoring protocols may decide that a given slave should
579 * not be able to send traffic. */
4d6fb5eb 580void
296f6519 581bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 582{
3bfd3972 583 ovs_rwlock_wrlock(&rwlock);
296f6519 584 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
3bfd3972 585 ovs_rwlock_unlock(&rwlock);
4d6fb5eb
EJ
586}
587
4a1b8f30
EJ
588/* Performs periodic maintenance on 'bond'.
589 *
590 * Returns true if the caller should revalidate its flows.
f620b43a
BP
591 *
592 * The caller should check bond_should_send_learning_packets() afterward. */
4a1b8f30
EJ
593bool
594bond_run(struct bond *bond, enum lacp_status lacp_status)
f620b43a
BP
595{
596 struct bond_slave *slave;
4a1b8f30 597 bool revalidate;
f620b43a 598
3bfd3972 599 ovs_rwlock_wrlock(&rwlock);
bdebeece
EJ
600 if (bond->lacp_status != lacp_status) {
601 bond->lacp_status = lacp_status;
4592d0e2
EJ
602 bond->bond_revalidate = true;
603 }
4d6fb5eb 604
f620b43a
BP
605 /* Enable slaves based on link status and LACP feedback. */
606 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
4a1b8f30 607 bond_link_status_update(slave);
da4a6191 608 slave->change_seq = seq_read(connectivity_seq_get());
f620b43a
BP
609 }
610 if (!bond->active_slave || !bond->active_slave->enabled) {
4a1b8f30 611 bond_choose_active_slave(bond);
f620b43a
BP
612 }
613
614 /* Update fake bond interface stats. */
615 if (time_msec() >= bond->next_fake_iface_update) {
616 bond_update_fake_slave_stats(bond);
617 bond->next_fake_iface_update = time_msec() + 1000;
618 }
619
4a1b8f30
EJ
620 revalidate = bond->bond_revalidate;
621 bond->bond_revalidate = false;
3bfd3972 622 ovs_rwlock_unlock(&rwlock);
4a1b8f30
EJ
623
624 return revalidate;
f620b43a
BP
625}
626
627/* Causes poll_block() to wake up when 'bond' needs something to be done. */
628void
629bond_wait(struct bond *bond)
630{
631 struct bond_slave *slave;
632
3bfd3972 633 ovs_rwlock_rdlock(&rwlock);
f620b43a
BP
634 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
635 if (slave->delay_expires != LLONG_MAX) {
636 poll_timer_wait_until(slave->delay_expires);
637 }
1ea24138 638
da4a6191 639 seq_wait(connectivity_seq_get(), slave->change_seq);
f620b43a
BP
640 }
641
642 if (bond->next_fake_iface_update != LLONG_MAX) {
643 poll_timer_wait_until(bond->next_fake_iface_update);
644 }
645
bbc13389 646 if (bond->bond_revalidate) {
f620b43a
BP
647 poll_immediate_wake();
648 }
3bfd3972 649 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
650
651 /* We don't wait for bond->next_rebalance because rebalancing can only run
652 * at a flow account checkpoint. ofproto does checkpointing on its own
653 * schedule and bond_rebalance() gets called afterward, so we'd just be
654 * waking up for no purpose. */
655}
656\f
657/* MAC learning table interaction. */
658
659static bool
660may_send_learning_packets(const struct bond *bond)
661{
9dd165e0
RK
662 return ((bond->lacp_status == LACP_DISABLED
663 && (bond->balance == BM_SLB || bond->balance == BM_AB))
664 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
bdebeece 665 && bond->active_slave;
f620b43a
BP
666}
667
668/* Returns true if 'bond' needs the client to send out packets to assist with
669 * MAC learning on 'bond'. If this function returns true, then the client
670 * should iterate through its MAC learning table for the bridge on which 'bond'
671 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 672 * it should call bond_compose_learning_packet().
f620b43a 673 *
477879ea
BP
674 * This function will only return true if 'bond' is in SLB or active-backup
675 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
676 * necessary.
f620b43a
BP
677 *
678 * Calling this function resets the state that it checks. */
679bool
680bond_should_send_learning_packets(struct bond *bond)
681{
3bfd3972
EJ
682 bool send;
683
684 ovs_rwlock_wrlock(&rwlock);
685 send = bond->send_learning_packets && may_send_learning_packets(bond);
f620b43a 686 bond->send_learning_packets = false;
3bfd3972 687 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
688 return send;
689}
690
691/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
692 *
ea131871
JG
693 * See bond_should_send_learning_packets() for description of usage. The
694 * caller should send the composed packet on the port associated with
695 * port_aux and takes ownership of the returned ofpbuf. */
696struct ofpbuf *
697bond_compose_learning_packet(struct bond *bond,
698 const uint8_t eth_src[ETH_ADDR_LEN],
699 uint16_t vlan, void **port_aux)
f620b43a
BP
700{
701 struct bond_slave *slave;
ea131871 702 struct ofpbuf *packet;
f620b43a 703 struct flow flow;
f620b43a 704
3bfd3972 705 ovs_rwlock_rdlock(&rwlock);
cb22974d 706 ovs_assert(may_send_learning_packets(bond));
f620b43a
BP
707 memset(&flow, 0, sizeof flow);
708 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
4a1b8f30 709 slave = choose_output_slave(bond, &flow, NULL, vlan);
f620b43a 710
ea131871 711 packet = ofpbuf_new(0);
2ea838ac 712 compose_rarp(packet, eth_src);
f620b43a 713 if (vlan) {
1bf02876 714 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
f620b43a 715 }
f620b43a 716
ea131871 717 *port_aux = slave->aux;
3bfd3972 718 ovs_rwlock_unlock(&rwlock);
ea131871 719 return packet;
f620b43a
BP
720}
721\f
722/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
723 * Ethernet destination address of 'eth_dst', should be admitted.
724 *
725 * The return value is one of the following:
726 *
727 * - BV_ACCEPT: Admit the packet.
728 *
729 * - BV_DROP: Drop the packet.
730 *
731 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
732 * Ethernet source address and VLAN. If there is none, or if the packet
733 * is on the learned port, then admit the packet. If a different port has
734 * been learned, however, drop the packet (and do not use it for MAC
735 * learning).
736 */
737enum bond_verdict
738bond_check_admissibility(struct bond *bond, const void *slave_,
4a1b8f30 739 const uint8_t eth_dst[ETH_ADDR_LEN])
f620b43a 740{
3bfd3972
EJ
741 enum bond_verdict verdict = BV_DROP;
742 struct bond_slave *slave;
9a1c6450 743
3bfd3972
EJ
744 ovs_rwlock_rdlock(&rwlock);
745 slave = bond_slave_lookup(bond, slave_);
4222bbc8 746 if (!slave) {
3bfd3972 747 goto out;
4222bbc8
EJ
748 }
749
9a1c6450
EJ
750 /* LACP bonds have very loose admissibility restrictions because we can
751 * assume the remote switch is aware of the bond and will "do the right
752 * thing". However, as a precaution we drop packets on disabled slaves
753 * because no correctly implemented partner switch should be sending
bdebeece
EJ
754 * packets to them.
755 *
756 * If LACP is configured, but LACP negotiations have been unsuccessful, we
9dd165e0 757 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
bdebeece 758 switch (bond->lacp_status) {
3bfd3972
EJ
759 case LACP_NEGOTIATED:
760 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
761 goto out;
762 case LACP_CONFIGURED:
9dd165e0
RK
763 if (!bond->lacp_fallback_ab) {
764 goto out;
765 }
3bfd3972
EJ
766 case LACP_DISABLED:
767 break;
f620b43a
BP
768 }
769
770 /* Drop all multicast packets on inactive slaves. */
771 if (eth_addr_is_multicast(eth_dst)) {
4222bbc8 772 if (bond->active_slave != slave) {
3bfd3972 773 goto out;
f620b43a
BP
774 }
775 }
776
f931a4c9 777 switch (bond->balance) {
9dd165e0
RK
778 case BM_TCP:
779 /* TCP balanced bonds require successful LACP negotiations. Based on the
780 * above check, LACP is off or lacp_fallback_ab is true on this bond.
781 * If lacp_fallback_ab is true fall through to BM_AB case else, we
782 * drop all incoming traffic. */
783 if (!bond->lacp_fallback_ab) {
784 goto out;
785 }
786
f931a4c9
BP
787 case BM_AB:
788 /* Drop all packets which arrive on backup slaves. This is similar to
789 * how Linux bonding handles active-backup bonds. */
7ba7dcf0
EJ
790 if (bond->active_slave != slave) {
791 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
792
e6b2255c
BP
793 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
794 " slave (%s) destined for " ETH_ADDR_FMT,
795 slave->name, ETH_ADDR_ARGS(eth_dst));
3bfd3972 796 goto out;
7ba7dcf0 797 }
3bfd3972
EJ
798 verdict = BV_ACCEPT;
799 goto out;
f931a4c9 800
f931a4c9
BP
801 case BM_SLB:
802 /* Drop all packets for which we have learned a different input port,
803 * because we probably sent the packet on one slave and got it back on
804 * the other. Gratuitous ARP packets are an exception to this rule:
805 * the host has moved to another switch. The exception to the
806 * exception is if we locked the learning table to avoid reflections on
807 * bond slaves. */
3bfd3972
EJ
808 verdict = BV_DROP_IF_MOVED;
809 goto out;
7ba7dcf0
EJ
810 }
811
428b2edd 812 OVS_NOT_REACHED();
3bfd3972
EJ
813out:
814 ovs_rwlock_unlock(&rwlock);
815 return verdict;
816
f620b43a
BP
817}
818
819/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
820 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
821 * NULL if the packet should be dropped because no slaves are enabled.
822 *
823 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
824 * should be a VID only (i.e. excluding the PCP bits). Second,
825 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
826 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
827 * packet belongs to (so for an access port it will be the access port's VLAN).
828 *
bcd2633a
JP
829 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
830 * significant in the selection. At some point earlier, 'wc' should
831 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
832 */
833void *
834bond_choose_output_slave(struct bond *bond, const struct flow *flow,
4a1b8f30 835 struct flow_wildcards *wc, uint16_t vlan)
f620b43a 836{
3bfd3972 837 struct bond_slave *slave;
b5d5d7d3 838 void *aux;
3bfd3972
EJ
839
840 ovs_rwlock_rdlock(&rwlock);
4a1b8f30 841 slave = choose_output_slave(bond, flow, wc, vlan);
b5d5d7d3 842 aux = slave ? slave->aux : NULL;
3bfd3972 843 ovs_rwlock_unlock(&rwlock);
b5d5d7d3
AW
844
845 return aux;
f620b43a 846}
f620b43a 847\f
adcf00ba
AZ
848/* Recirculation. */
849static void
850bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
c6855ec5 851 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
852{
853 if (entry->slave) {
854 uint64_t delta;
855
856 delta = rule_tx_bytes - entry->pr_tx_bytes;
857 entry->tx_bytes += delta;
858 entry->pr_tx_bytes = rule_tx_bytes;
859 }
860}
861
862/* Maintain bond stats using post recirculation rule byte counters.*/
60cda7d6 863static void
adcf00ba 864bond_recirculation_account(struct bond *bond)
80316557 865 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
866{
867 int i;
868
adcf00ba
AZ
869 for (i=0; i<=BOND_MASK; i++) {
870 struct bond_entry *entry = &bond->hash[i];
871 struct rule *rule = entry->pr_rule;
872
873 if (rule) {
874 uint64_t n_packets OVS_UNUSED;
875 long long int used OVS_UNUSED;
876 uint64_t n_bytes;
877
878 rule->ofproto->ofproto_class->rule_get_stats(
879 rule, &n_packets, &n_bytes, &used);
880 bond_entry_account(entry, n_bytes);
881 }
882 }
adcf00ba
AZ
883}
884
885bool
886bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
887 uint32_t *hash_bias)
888{
80316557 889 if (bond->balance == BM_TCP && bond->recirc_id) {
adcf00ba
AZ
890 if (recirc_id) {
891 *recirc_id = bond->recirc_id;
892 }
893 if (hash_bias) {
894 *hash_bias = bond->basis;
895 }
896 return true;
897 } else {
898 return false;
899 }
900}
901
902void
903bond_update_post_recirc_rules(struct bond* bond, const bool force)
904{
905 struct bond_entry *e;
906 bool update_rules = force; /* Always update rules if caller forces it. */
907
908 /* Make sure all bond entries are populated */
909 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
910 if (!e->slave || !e->slave->enabled) {
911 update_rules = true;
912 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
913 struct bond_slave, hmap_node);
914 if (!e->slave->enabled) {
915 e->slave = bond->active_slave;
916 }
917 }
918 }
919
920 if (update_rules) {
921 update_recirc_rules(bond);
922 }
923}
924\f
f620b43a
BP
925/* Rebalancing. */
926
1b137691 927static bool
3bfd3972 928bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
1b137691 929{
bc1b010c
EJ
930 return bond->rebalance_interval
931 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
932}
933
f620b43a
BP
934/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
935void
936bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
937 uint64_t n_bytes)
938{
3bfd3972 939 ovs_rwlock_wrlock(&rwlock);
1b137691 940 if (bond_is_balanced(bond)) {
f620b43a 941 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a 942 }
3bfd3972 943 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
944}
945
946static struct bond_slave *
3bfd3972 947bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
948{
949 return CONTAINER_OF(bal, struct bond_slave, bal_node);
950}
951
952static void
953log_bals(struct bond *bond, const struct list *bals)
c6855ec5 954 OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
955{
956 if (VLOG_IS_DBG_ENABLED()) {
957 struct ds ds = DS_EMPTY_INITIALIZER;
958 const struct bond_slave *slave;
959
960 LIST_FOR_EACH (slave, bal_node, bals) {
961 if (ds.length) {
962 ds_put_char(&ds, ',');
963 }
964 ds_put_format(&ds, " %s %"PRIu64"kB",
965 slave->name, slave->tx_bytes / 1024);
966
967 if (!slave->enabled) {
968 ds_put_cstr(&ds, " (disabled)");
969 }
970 if (!list_is_empty(&slave->entries)) {
971 struct bond_entry *e;
972
973 ds_put_cstr(&ds, " (");
974 LIST_FOR_EACH (e, list_node, &slave->entries) {
975 if (&e->list_node != list_front(&slave->entries)) {
976 ds_put_cstr(&ds, " + ");
977 }
34582733 978 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
f620b43a
BP
979 e - bond->hash, e->tx_bytes / 1024);
980 }
981 ds_put_cstr(&ds, ")");
982 }
983 }
984 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
985 ds_destroy(&ds);
986 }
987}
988
989/* Shifts 'hash' from its current slave to 'to'. */
990static void
4a1b8f30 991bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
c6855ec5 992 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
993{
994 struct bond_slave *from = hash->slave;
995 struct bond *bond = from->bond;
996 uint64_t delta = hash->tx_bytes;
997
34582733 998 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
f620b43a
BP
999 "from %s to %s (now carrying %"PRIu64"kB and "
1000 "%"PRIu64"kB load, respectively)",
1001 bond->name, delta / 1024, hash - bond->hash,
1002 from->name, to->name,
1003 (from->tx_bytes - delta) / 1024,
1004 (to->tx_bytes + delta) / 1024);
1005
1006 /* Shift load away from 'from' to 'to'. */
1007 from->tx_bytes -= delta;
1008 to->tx_bytes += delta;
1009
1010 /* Arrange for flows to be revalidated. */
dc30ea2d 1011 hash->slave = to;
4a1b8f30 1012 bond->bond_revalidate = true;
f620b43a
BP
1013}
1014
09a5d390
BP
1015/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1016 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
1017 * given that doing so must decrease the ratio of the load on the two slaves by
1018 * at least 0.1. Returns NULL if there is no appropriate entry.
1019 *
1020 * The list of entries isn't sorted. I don't know of a reason to prefer to
1021 * shift away small hashes or large hashes. */
1022static struct bond_entry *
1023choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
c6855ec5 1024 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
1025{
1026 struct bond_entry *e;
1027
1028 if (list_is_short(&from->entries)) {
1029 /* 'from' carries no more than one MAC hash, so shifting load away from
1030 * it would be pointless. */
1031 return NULL;
1032 }
1033
1034 LIST_FOR_EACH (e, list_node, &from->entries) {
1035 double old_ratio, new_ratio;
1036 uint64_t delta;
1037
1038 if (to_tx_bytes == 0) {
1039 /* Nothing on the new slave, move it. */
1040 return e;
1041 }
1042
1043 delta = e->tx_bytes;
1044 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1045 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
75fad143
ZK
1046 if (old_ratio - new_ratio > 0.1
1047 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1048 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1049 and 'to' slave have the same load. Therefore, we only move an
1050 entry if it decreases the load on 'from', and brings us closer
1051 to equal traffic load. */
f620b43a
BP
1052 return e;
1053 }
1054 }
1055
1056 return NULL;
1057}
1058
1059/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1060 * maintained. */
1061static void
1062insert_bal(struct list *bals, struct bond_slave *slave)
1063{
1064 struct bond_slave *pos;
1065
1066 LIST_FOR_EACH (pos, bal_node, bals) {
1067 if (slave->tx_bytes > pos->tx_bytes) {
1068 break;
1069 }
1070 }
1071 list_insert(&pos->bal_node, &slave->bal_node);
1072}
1073
1074/* Removes 'slave' from its current list and then inserts it into 'bals' so
1075 * that descending order of 'tx_bytes' is maintained. */
1076static void
1077reinsert_bal(struct list *bals, struct bond_slave *slave)
1078{
1079 list_remove(&slave->bal_node);
1080 insert_bal(bals, slave);
1081}
1082
1083/* If 'bond' needs rebalancing, does so.
1084 *
adcf00ba
AZ
1085 * The caller should have called bond_account() for each active flow, or in case
1086 * of recirculation is used, have called bond_recirculation_account(bond),
1087 * to ensure that flow data is consistently accounted at this point.
60cda7d6
AZ
1088 */
1089void
4a1b8f30 1090bond_rebalance(struct bond *bond)
f620b43a
BP
1091{
1092 struct bond_slave *slave;
1093 struct bond_entry *e;
1094 struct list bals;
adcf00ba 1095 bool rebalanced = false;
60cda7d6 1096 bool use_recirc;
f620b43a 1097
3bfd3972 1098 ovs_rwlock_wrlock(&rwlock);
1b137691 1099 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
adcf00ba 1100 goto done;
f620b43a
BP
1101 }
1102 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1103
60cda7d6
AZ
1104 use_recirc = ofproto_dpif_get_enable_recirc(bond->ofproto) &&
1105 bond_may_recirc(bond, NULL, NULL);
1106
1107 if (use_recirc) {
1108 bond_recirculation_account(bond);
1109 }
1110
f620b43a
BP
1111 /* Add each bond_entry to its slave's 'entries' list.
1112 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1113 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1114 slave->tx_bytes = 0;
1115 list_init(&slave->entries);
1116 }
1117 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1118 if (e->slave && e->tx_bytes) {
1119 e->slave->tx_bytes += e->tx_bytes;
1120 list_push_back(&e->slave->entries, &e->list_node);
1121 }
1122 }
1123
1124 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1125 *
1126 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1127 * with a proper list sort algorithm. */
1128 list_init(&bals);
1129 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1130 if (slave->enabled) {
1131 insert_bal(&bals, slave);
1132 }
1133 }
1134 log_bals(bond, &bals);
1135
1136 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1137 while (!list_is_short(&bals)) {
1138 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1139 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1140 uint64_t overload;
1141
1142 overload = from->tx_bytes - to->tx_bytes;
1143 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1144 /* The extra load on 'from' (and all less-loaded slaves), compared
1145 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1146 * it is less than ~1Mbps. No point in rebalancing. */
1147 break;
1148 }
1149
09a5d390
BP
1150 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1151 * to move from 'from' to 'to'. */
f620b43a
BP
1152 e = choose_entry_to_migrate(from, to->tx_bytes);
1153 if (e) {
4a1b8f30 1154 bond_shift_load(e, to);
f620b43a
BP
1155
1156 /* Delete element from from->entries.
1157 *
1158 * We don't add the element to to->hashes. That would only allow
1159 * 'e' to be migrated to another slave in this rebalancing run, and
1160 * there is no point in doing that. */
1161 list_remove(&e->list_node);
1162
1163 /* Re-sort 'bals'. */
1164 reinsert_bal(&bals, from);
1165 reinsert_bal(&bals, to);
60cda7d6 1166 rebalanced = true;
f620b43a
BP
1167 } else {
1168 /* Can't usefully migrate anything away from 'from'.
1169 * Don't reconsider it. */
1170 list_remove(&from->bal_node);
1171 }
1172 }
1173
1174 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1175 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1176 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1177 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1178 e->tx_bytes /= 2;
f620b43a 1179 }
adcf00ba 1180
60cda7d6
AZ
1181 if (use_recirc && rebalanced) {
1182 bond_update_post_recirc_rules(bond,true);
1183 }
2f486d4c
AZ
1184
1185done:
3bfd3972 1186 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1187}
1188\f
1189/* Bonding unixctl user interface functions. */
1190
1191static struct bond *
3bfd3972 1192bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
1193{
1194 struct bond *bond;
1195
1196 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
3bfd3972 1197 all_bonds) {
f620b43a
BP
1198 if (!strcmp(bond->name, name)) {
1199 return bond;
1200 }
1201 }
1202 return NULL;
1203}
1204
1205static struct bond_slave *
1206bond_lookup_slave(struct bond *bond, const char *slave_name)
1207{
1208 struct bond_slave *slave;
1209
1210 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1211 if (!strcmp(slave->name, slave_name)) {
1212 return slave;
1213 }
1214 }
1215 return NULL;
1216}
1217
1218static void
1219bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
1220 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1221 void *aux OVS_UNUSED)
f620b43a
BP
1222{
1223 struct ds ds = DS_EMPTY_INITIALIZER;
1224 const struct bond *bond;
1225
adcf00ba 1226 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
f620b43a 1227
3bfd3972
EJ
1228 ovs_rwlock_rdlock(&rwlock);
1229 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
f620b43a
BP
1230 const struct bond_slave *slave;
1231 size_t i;
1232
adcf00ba
AZ
1233 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1234 bond_mode_to_string(bond->balance), bond->recirc_id);
f620b43a
BP
1235
1236 i = 0;
1237 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1238 if (i++ > 0) {
1239 ds_put_cstr(&ds, ", ");
1240 }
1241 ds_put_cstr(&ds, slave->name);
1242 }
1243 ds_put_char(&ds, '\n');
1244 }
3bfd3972 1245 ovs_rwlock_unlock(&rwlock);
bde9f75d 1246 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
1247 ds_destroy(&ds);
1248}
1249
1250static void
c33a8a25 1251bond_print_details(struct ds *ds, const struct bond *bond)
3bfd3972 1252 OVS_REQ_RDLOCK(rwlock)
f620b43a 1253{
fc1d4f01
EJ
1254 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1255 const struct shash_node **sorted_slaves = NULL;
f620b43a 1256 const struct bond_slave *slave;
adcf00ba
AZ
1257 bool may_recirc;
1258 uint32_t recirc_id;
fc1d4f01 1259 int i;
f620b43a 1260
c33a8a25
EJ
1261 ds_put_format(ds, "---- %s ----\n", bond->name);
1262 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
1263 bond_mode_to_string(bond->balance));
1264
adcf00ba
AZ
1265 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1266 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1267 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1268
c33a8a25 1269 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 1270
c33a8a25
EJ
1271 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1272 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 1273
1b137691 1274 if (bond_is_balanced(bond)) {
c33a8a25 1275 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
1276 bond->next_rebalance - time_msec());
1277 }
1278
bdebeece
EJ
1279 ds_put_cstr(ds, "lacp_status: ");
1280 switch (bond->lacp_status) {
1281 case LACP_NEGOTIATED:
1282 ds_put_cstr(ds, "negotiated\n");
1283 break;
1284 case LACP_CONFIGURED:
1285 ds_put_cstr(ds, "configured\n");
1286 break;
1287 case LACP_DISABLED:
1288 ds_put_cstr(ds, "off\n");
1289 break;
1290 default:
1291 ds_put_cstr(ds, "<unknown>\n");
1292 break;
1293 }
4d6fb5eb 1294
f620b43a 1295 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
1296 shash_add(&slave_shash, slave->name, slave);
1297 }
1298 sorted_slaves = shash_sort(&slave_shash);
1299
1300 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 1301 struct bond_entry *be;
f620b43a 1302
fc1d4f01
EJ
1303 slave = sorted_slaves[i]->data;
1304
f620b43a 1305 /* Basic info. */
c33a8a25 1306 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
1307 slave->name, slave->enabled ? "enabled" : "disabled");
1308 if (slave == bond->active_slave) {
c33a8a25 1309 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
1310 }
1311 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 1312 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
1313 slave->enabled ? "downdelay" : "updelay",
1314 slave->delay_expires - time_msec());
1315 }
1316
c33a8a25 1317 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 1318 slave->may_enable ? "true" : "false");
4d6fb5eb 1319
1b137691 1320 if (!bond_is_balanced(bond)) {
f620b43a
BP
1321 continue;
1322 }
1323
1324 /* Hashes. */
f620b43a
BP
1325 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1326 int hash = be - bond->hash;
f6ba1f35 1327 uint64_t be_tx_k;
f620b43a
BP
1328
1329 if (be->slave != slave) {
1330 continue;
1331 }
1332
f6ba1f35
AZ
1333 be_tx_k = be->tx_bytes / 1024;
1334 if (be_tx_k) {
1335 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1336 hash, be_tx_k);
1337 }
f620b43a 1338
7b9f1974 1339 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1340 }
1341 }
fc1d4f01
EJ
1342 shash_destroy(&slave_shash);
1343 free(sorted_slaves);
c33a8a25
EJ
1344 ds_put_cstr(ds, "\n");
1345}
1346
1347static void
1348bond_unixctl_show(struct unixctl_conn *conn,
1349 int argc, const char *argv[],
1350 void *aux OVS_UNUSED)
1351{
1352 struct ds ds = DS_EMPTY_INITIALIZER;
1353
3bfd3972 1354 ovs_rwlock_rdlock(&rwlock);
c33a8a25
EJ
1355 if (argc > 1) {
1356 const struct bond *bond = bond_find(argv[1]);
1357
1358 if (!bond) {
bde9f75d 1359 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1360 goto out;
c33a8a25
EJ
1361 }
1362 bond_print_details(&ds, bond);
1363 } else {
1364 const struct bond *bond;
1365
3bfd3972 1366 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
c33a8a25
EJ
1367 bond_print_details(&ds, bond);
1368 }
1369 }
1370
bde9f75d 1371 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a 1372 ds_destroy(&ds);
3bfd3972
EJ
1373
1374out:
1375 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1376}
1377
1378static void
0e15264f
BP
1379bond_unixctl_migrate(struct unixctl_conn *conn,
1380 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1381 void *aux OVS_UNUSED)
1382{
0e15264f
BP
1383 const char *bond_s = argv[1];
1384 const char *hash_s = argv[2];
1385 const char *slave_s = argv[3];
f620b43a
BP
1386 struct bond *bond;
1387 struct bond_slave *slave;
1388 struct bond_entry *entry;
1389 int hash;
1390
3bfd3972 1391 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1392 bond = bond_find(bond_s);
1393 if (!bond) {
bde9f75d 1394 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1395 goto out;
f620b43a
BP
1396 }
1397
1398 if (bond->balance != BM_SLB) {
bde9f75d 1399 unixctl_command_reply_error(conn, "not an SLB bond");
3bfd3972 1400 goto out;
f620b43a
BP
1401 }
1402
1403 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1404 hash = atoi(hash_s) & BOND_MASK;
1405 } else {
bde9f75d 1406 unixctl_command_reply_error(conn, "bad hash");
3bfd3972 1407 goto out;
f620b43a
BP
1408 }
1409
1410 slave = bond_lookup_slave(bond, slave_s);
1411 if (!slave) {
bde9f75d 1412 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1413 goto out;
f620b43a
BP
1414 }
1415
1416 if (!slave->enabled) {
bde9f75d 1417 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
3bfd3972 1418 goto out;
f620b43a
BP
1419 }
1420
1421 entry = &bond->hash[hash];
4a1b8f30 1422 bond->bond_revalidate = true;
f620b43a 1423 entry->slave = slave;
bde9f75d 1424 unixctl_command_reply(conn, "migrated");
3bfd3972
EJ
1425
1426out:
1427 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1428}
1429
1430static void
0e15264f
BP
1431bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1432 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1433 void *aux OVS_UNUSED)
1434{
0e15264f
BP
1435 const char *bond_s = argv[1];
1436 const char *slave_s = argv[2];
f620b43a
BP
1437 struct bond *bond;
1438 struct bond_slave *slave;
1439
3bfd3972 1440 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1441 bond = bond_find(bond_s);
1442 if (!bond) {
bde9f75d 1443 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1444 goto out;
f620b43a
BP
1445 }
1446
1447 slave = bond_lookup_slave(bond, slave_s);
1448 if (!slave) {
bde9f75d 1449 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1450 goto out;
f620b43a
BP
1451 }
1452
1453 if (!slave->enabled) {
bde9f75d 1454 unixctl_command_reply_error(conn, "cannot make disabled slave active");
3bfd3972 1455 goto out;
f620b43a
BP
1456 }
1457
1458 if (bond->active_slave != slave) {
4a1b8f30 1459 bond->bond_revalidate = true;
f620b43a 1460 bond->active_slave = slave;
f620b43a
BP
1461 VLOG_INFO("bond %s: active interface is now %s",
1462 bond->name, slave->name);
1463 bond->send_learning_packets = true;
bde9f75d 1464 unixctl_command_reply(conn, "done");
f620b43a 1465 } else {
bde9f75d 1466 unixctl_command_reply(conn, "no change");
f620b43a 1467 }
3bfd3972
EJ
1468out:
1469 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1470}
1471
1472static void
0e15264f 1473enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1474{
0e15264f
BP
1475 const char *bond_s = argv[1];
1476 const char *slave_s = argv[2];
f620b43a
BP
1477 struct bond *bond;
1478 struct bond_slave *slave;
1479
3bfd3972 1480 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1481 bond = bond_find(bond_s);
1482 if (!bond) {
bde9f75d 1483 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1484 goto out;
f620b43a
BP
1485 }
1486
1487 slave = bond_lookup_slave(bond, slave_s);
1488 if (!slave) {
bde9f75d 1489 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1490 goto out;
f620b43a
BP
1491 }
1492
4a1b8f30 1493 bond_enable_slave(slave, enable);
bde9f75d 1494 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
3bfd3972
EJ
1495
1496out:
1497 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1498}
1499
1500static void
0e15264f
BP
1501bond_unixctl_enable_slave(struct unixctl_conn *conn,
1502 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1503 void *aux OVS_UNUSED)
1504{
0e15264f 1505 enable_slave(conn, argv, true);
f620b43a
BP
1506}
1507
1508static void
0e15264f
BP
1509bond_unixctl_disable_slave(struct unixctl_conn *conn,
1510 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1511 void *aux OVS_UNUSED)
1512{
0e15264f 1513 enable_slave(conn, argv, false);
f620b43a
BP
1514}
1515
1516static void
0e15264f 1517bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1518 void *aux OVS_UNUSED)
1519{
0e15264f
BP
1520 const char *mac_s = argv[1];
1521 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1522 const char *basis_s = argc > 3 ? argv[3] : NULL;
f620b43a
BP
1523 uint8_t mac[ETH_ADDR_LEN];
1524 uint8_t hash;
1525 char *hash_cstr;
1526 unsigned int vlan;
672d18b2 1527 uint32_t basis;
f620b43a
BP
1528
1529 if (vlan_s) {
c2c28dfd 1530 if (!ovs_scan(vlan_s, "%u", &vlan)) {
bde9f75d 1531 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1532 return;
1533 }
1534 } else {
dc155bff 1535 vlan = 0;
f620b43a
BP
1536 }
1537
672d18b2 1538 if (basis_s) {
c2c28dfd 1539 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
bde9f75d 1540 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1541 return;
1542 }
1543 } else {
1544 basis = 0;
1545 }
1546
c2c28dfd 1547 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
672d18b2 1548 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1549
1550 hash_cstr = xasprintf("%u", hash);
bde9f75d 1551 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1552 free(hash_cstr);
1553 } else {
bde9f75d 1554 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1555 }
1556}
1557
1558void
1559bond_init(void)
1560{
0e15264f 1561 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1562 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1563 NULL);
0e15264f 1564 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1565 bond_unixctl_migrate, NULL);
0e15264f 1566 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1567 bond_unixctl_set_active_slave, NULL);
0e15264f 1568 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1569 bond_unixctl_enable_slave, NULL);
0e15264f 1570 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1571 bond_unixctl_disable_slave, NULL);
0e15264f 1572 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1573 bond_unixctl_hash, NULL);
f620b43a
BP
1574}
1575\f
95aafb2a
EJ
1576static void
1577bond_entry_reset(struct bond *bond)
1578{
1579 if (bond->balance != BM_AB) {
9e1a6910 1580 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
95aafb2a
EJ
1581
1582 if (!bond->hash) {
1583 bond->hash = xmalloc(hash_len);
1584 }
1585 memset(bond->hash, 0, hash_len);
1586
1587 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1588 } else {
1589 free(bond->hash);
1590 bond->hash = NULL;
1591 }
1592}
1593
f620b43a
BP
1594static struct bond_slave *
1595bond_slave_lookup(struct bond *bond, const void *slave_)
1596{
1597 struct bond_slave *slave;
1598
1599 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1600 &bond->slaves) {
1601 if (slave->aux == slave_) {
1602 return slave;
1603 }
1604 }
1605
1606 return NULL;
1607}
1608
f620b43a 1609static void
4a1b8f30 1610bond_enable_slave(struct bond_slave *slave, bool enable)
f620b43a
BP
1611{
1612 slave->delay_expires = LLONG_MAX;
1613 if (enable != slave->enabled) {
4a1b8f30 1614 slave->bond->bond_revalidate = true;
f620b43a 1615 slave->enabled = enable;
f1c8a79c
AW
1616
1617 ovs_mutex_lock(&slave->bond->mutex);
1618 if (enable) {
1619 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1620 } else {
1621 list_remove(&slave->list_node);
1622 }
1623 ovs_mutex_unlock(&slave->bond->mutex);
1624
4a1b8f30
EJ
1625 VLOG_INFO("interface %s: %s", slave->name,
1626 slave->enabled ? "enabled" : "disabled");
f620b43a
BP
1627 }
1628}
1629
1630static void
4a1b8f30 1631bond_link_status_update(struct bond_slave *slave)
f620b43a
BP
1632{
1633 struct bond *bond = slave->bond;
1634 bool up;
1635
296f6519 1636 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1637 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1638 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1639 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1640 slave->name, up ? "up" : "down");
1641 if (up == slave->enabled) {
1642 slave->delay_expires = LLONG_MAX;
1643 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1644 slave->name, up ? "disabled" : "enabled");
1645 } else {
bdebeece 1646 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1647 : up ? bond->updelay : bond->downdelay);
1648 slave->delay_expires = time_msec() + delay;
1649 if (delay) {
1650 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1651 "for %d ms",
1652 slave->name,
1653 up ? "enabled" : "disabled",
1654 up ? "up" : "down",
1655 delay);
1656 }
1657 }
1658 }
1659
1660 if (time_msec() >= slave->delay_expires) {
4a1b8f30 1661 bond_enable_slave(slave, up);
f620b43a
BP
1662 }
1663}
1664
f620b43a 1665static unsigned int
672d18b2 1666bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
f620b43a 1667{
7e36ac42 1668 return hash_mac(mac, vlan, basis);
f620b43a
BP
1669}
1670
1671static unsigned int
672d18b2 1672bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
f620b43a
BP
1673{
1674 struct flow hash_flow = *flow;
d84d4b88 1675 hash_flow.vlan_tci = htons(vlan);
f620b43a
BP
1676
1677 /* The symmetric quality of this hash function is not required, but
1678 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1679 * purposes, so we use it out of convenience. */
672d18b2 1680 return flow_hash_symmetric_l4(&hash_flow, basis);
f620b43a
BP
1681}
1682
fb0b29a3
EJ
1683static unsigned int
1684bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1685{
cb22974d 1686 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1687
bdebeece 1688 return (bond->balance == BM_TCP
672d18b2
EJ
1689 ? bond_hash_tcp(flow, vlan, bond->basis)
1690 : bond_hash_src(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1691}
1692
f620b43a
BP
1693static struct bond_entry *
1694lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1695 uint16_t vlan)
1696{
fb0b29a3 1697 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1698}
1699
f1c8a79c
AW
1700/* Selects and returns an enabled slave from the 'enabled_slaves' list
1701 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1702 * returns NULL. */
1703static struct bond_slave *
1704get_enabled_slave(struct bond *bond)
1705{
1706 struct list *node;
1707
1708 ovs_mutex_lock(&bond->mutex);
1709 if (list_is_empty(&bond->enabled_slaves)) {
1710 ovs_mutex_unlock(&bond->mutex);
1711 return NULL;
1712 }
1713
1714 node = list_pop_front(&bond->enabled_slaves);
1715 list_push_back(&bond->enabled_slaves, node);
1716 ovs_mutex_unlock(&bond->mutex);
1717
1718 return CONTAINER_OF(node, struct bond_slave, list_node);
1719}
1720
f620b43a
BP
1721static struct bond_slave *
1722choose_output_slave(const struct bond *bond, const struct flow *flow,
4a1b8f30 1723 struct flow_wildcards *wc, uint16_t vlan)
f620b43a
BP
1724{
1725 struct bond_entry *e;
9dd165e0 1726 int balance;
f620b43a 1727
9dd165e0 1728 balance = bond->balance;
bdebeece
EJ
1729 if (bond->lacp_status == LACP_CONFIGURED) {
1730 /* LACP has been configured on this bond but negotiations were
9dd165e0
RK
1731 * unsuccussful. If lacp_fallback_ab is enabled use active-
1732 * backup mode else drop all traffic. */
1733 if (!bond->lacp_fallback_ab) {
1734 return NULL;
1735 }
1736 balance = BM_AB;
bdebeece
EJ
1737 }
1738
9dd165e0 1739 switch (balance) {
f620b43a
BP
1740 case BM_AB:
1741 return bond->active_slave;
1742
f620b43a 1743 case BM_TCP:
bdebeece
EJ
1744 if (bond->lacp_status != LACP_NEGOTIATED) {
1745 /* Must have LACP negotiations for TCP balanced bonds. */
1746 return NULL;
1747 }
bcd2633a 1748 if (wc) {
6cdd5145 1749 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
bcd2633a 1750 }
bdebeece
EJ
1751 /* Fall Through. */
1752 case BM_SLB:
bcd2633a 1753 if (wc) {
6cdd5145 1754 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
bcd2633a 1755 }
f620b43a
BP
1756 e = lookup_bond_entry(bond, flow, vlan);
1757 if (!e->slave || !e->slave->enabled) {
f1c8a79c 1758 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
f620b43a
BP
1759 }
1760 return e->slave;
1761
1762 default:
428b2edd 1763 OVS_NOT_REACHED();
f620b43a
BP
1764 }
1765}
1766
1767static struct bond_slave *
1768bond_choose_slave(const struct bond *bond)
1769{
1770 struct bond_slave *slave, *best;
1771
1772 /* Find an enabled slave. */
1773 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1774 if (slave->enabled) {
1775 return slave;
1776 }
1777 }
1778
1779 /* All interfaces are disabled. Find an interface that will be enabled
1780 * after its updelay expires. */
1781 best = NULL;
1782 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1783 if (slave->delay_expires != LLONG_MAX
296f6519 1784 && slave->may_enable
f620b43a
BP
1785 && (!best || slave->delay_expires < best->delay_expires)) {
1786 best = slave;
1787 }
1788 }
1789 return best;
1790}
1791
1792static void
4a1b8f30 1793bond_choose_active_slave(struct bond *bond)
f620b43a
BP
1794{
1795 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1796 struct bond_slave *old_active_slave = bond->active_slave;
1797
1798 bond->active_slave = bond_choose_slave(bond);
1799 if (bond->active_slave) {
1800 if (bond->active_slave->enabled) {
1801 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1802 bond->name, bond->active_slave->name);
1803 } else {
1804 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1805 "remaining %lld ms updelay (since no interface was "
1806 "enabled)", bond->name, bond->active_slave->name,
1807 bond->active_slave->delay_expires - time_msec());
4a1b8f30 1808 bond_enable_slave(bond->active_slave, true);
f620b43a
BP
1809 }
1810
1811 bond->send_learning_packets = true;
1812 } else if (old_active_slave) {
d28b9ead 1813 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1814 }
1815}
1816
f620b43a
BP
1817/* Attempts to make the sum of the bond slaves' statistics appear on the fake
1818 * bond interface. */
1819static void
1820bond_update_fake_slave_stats(struct bond *bond)
1821{
1822 struct netdev_stats bond_stats;
1823 struct bond_slave *slave;
1824 struct netdev *bond_dev;
1825
1826 memset(&bond_stats, 0, sizeof bond_stats);
1827
1828 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1829 struct netdev_stats slave_stats;
1830
1831 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1832 /* XXX: We swap the stats here because they are swapped back when
1833 * reported by the internal device. The reason for this is
1834 * internal devices normally represent packets going into the
1835 * system but when used as fake bond device they represent packets
1836 * leaving the system. We really should do this in the internal
1837 * device itself because changing it here reverses the counts from
1838 * the perspective of the switch. However, the internal device
1839 * doesn't know what type of device it represents so we have to do
1840 * it here for now. */
1841 bond_stats.tx_packets += slave_stats.rx_packets;
1842 bond_stats.tx_bytes += slave_stats.rx_bytes;
1843 bond_stats.rx_packets += slave_stats.tx_packets;
1844 bond_stats.rx_bytes += slave_stats.tx_bytes;
1845 }
1846 }
1847
18812dff 1848 if (!netdev_open(bond->name, "system", &bond_dev)) {
f620b43a
BP
1849 netdev_set_stats(bond_dev, &bond_stats);
1850 netdev_close(bond_dev);
1851 }
1852}