]> git.proxmox.com Git - mirror_ovs.git/blame - ofproto/bond.c
stopwatch: Remove tabs from output.
[mirror_ovs.git] / ofproto / bond.c
CommitLineData
f620b43a 1/*
50f96b10 2 * Copyright (c) 2008-2017 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a 25
da4a6191 26#include "connectivity.h"
f620b43a 27#include "coverage.h"
b598f214 28#include "dp-packet.h"
f620b43a 29#include "flow.h"
ee89ea7b 30#include "openvswitch/hmap.h"
bdebeece 31#include "lacp.h"
f620b43a
BP
32#include "netdev.h"
33#include "odp-util.h"
b598f214
BW
34#include "ofproto/ofproto-dpif.h"
35#include "ofproto/ofproto-dpif-rid.h"
36#include "ofproto/ofproto-provider.h"
37#include "openvswitch/dynamic-string.h"
38#include "openvswitch/list.h"
39#include "openvswitch/match.h"
40#include "openvswitch/ofp-actions.h"
64c96779 41#include "openvswitch/ofpbuf.h"
b598f214 42#include "openvswitch/vlog.h"
f620b43a 43#include "packets.h"
fd016ae3 44#include "openvswitch/poll-loop.h"
da4a6191 45#include "seq.h"
ee89ea7b 46#include "openvswitch/shash.h"
f620b43a
BP
47#include "timeval.h"
48#include "unixctl.h"
ee89ea7b 49#include "util.h"
f620b43a
BP
50
51VLOG_DEFINE_THIS_MODULE(bond);
52
f1c8a79c
AW
53static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
54static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
55static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
56
9e1a6910 57/* Bit-mask for hashing a flow down to a bucket. */
f620b43a 58#define BOND_MASK 0xff
9e1a6910 59#define BOND_BUCKETS (BOND_MASK + 1)
f620b43a 60
07a3cd5c
BP
61/* Priority for internal rules created to handle recirculation */
62#define RECIRC_RULE_PRIORITY 20
63
f620b43a 64/* A hash bucket for mapping a flow to a slave.
9e1a6910 65 * "struct bond" has an array of BOND_BUCKETS of these. */
f620b43a
BP
66struct bond_entry {
67 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
c6855ec5
JS
68 uint64_t tx_bytes /* Count of bytes recently transmitted. */
69 OVS_GUARDED_BY(rwlock);
ca6ba700 70 struct ovs_list list_node; /* In bond_slave's 'entries' list. */
adcf00ba 71
c6855ec5
JS
72 /* Recirculation.
73 *
74 * 'pr_rule' is the post-recirculation rule for this entry.
75 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
76 * is used to determine delta (applied to 'tx_bytes' above.) */
77 struct rule *pr_rule;
78 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
f620b43a
BP
79};
80
81/* A bond slave, that is, one of the links comprising a bond. */
82struct bond_slave {
83 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
ca6ba700 84 struct ovs_list list_node; /* In struct bond's enabled_slaves list. */
f620b43a
BP
85 struct bond *bond; /* The bond that contains this slave. */
86 void *aux; /* Client-provided handle for this slave. */
87
88 struct netdev *netdev; /* Network device, owned by the client. */
6422372c 89 uint64_t change_seq; /* Tracks changes in 'netdev'. */
f620b43a 90 char *name; /* Name (a copy of netdev_get_name(netdev)). */
abec9228 91 ofp_port_t ofp_port; /* OpenFlow port number. */
f620b43a
BP
92
93 /* Link status. */
f620b43a 94 bool enabled; /* May be chosen for flows? */
296f6519 95 bool may_enable; /* Client considers this slave bondable. */
abec9228 96 long long delay_expires; /* Time after which 'enabled' may change. */
f620b43a
BP
97
98 /* Rebalancing info. Used only by bond_rebalance(). */
ca6ba700
TG
99 struct ovs_list bal_node; /* In bond_rebalance()'s 'bals' list. */
100 struct ovs_list entries; /* 'struct bond_entry's assigned here. */
f620b43a
BP
101 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
102};
103
104/* A bond, that is, a set of network devices grouped to improve performance or
105 * robustness. */
106struct bond {
107 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
108 char *name; /* Name provided by client. */
adcf00ba 109 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
f620b43a
BP
110
111 /* Slaves. */
112 struct hmap slaves;
113
f1c8a79c
AW
114 /* Enabled slaves.
115 *
116 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
117 * (To prevent the bond_slave from disappearing they must also hold
118 * 'rwlock'.) */
119 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
ca6ba700 120 struct ovs_list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
f1c8a79c 121
f620b43a
BP
122 /* Bonding info. */
123 enum bond_mode balance; /* Balancing mode, one of BM_*. */
124 struct bond_slave *active_slave;
f620b43a 125 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
bdebeece 126 enum lacp_status lacp_status; /* Status of LACP negotiations. */
62904702 127 bool bond_revalidate; /* True if flows need revalidation. */
672d18b2 128 uint32_t basis; /* Basis for flow hash function. */
f620b43a
BP
129
130 /* SLB specific bonding info. */
9e1a6910 131 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
f620b43a
BP
132 int rebalance_interval; /* Interval between rebalances, in ms. */
133 long long int next_rebalance; /* Next rebalancing time. */
134 bool send_learning_packets;
adcf00ba
AZ
135 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
136 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
f620b43a 137
3e5aeeb5
AZ
138 /* Store active slave to OVSDB. */
139 bool active_slave_changed; /* Set to true whenever the bond changes
140 active slave. It will be reset to false
141 after it is stored into OVSDB */
142
143 /* Interface name may not be persistent across an OS reboot, use
144 * MAC address for identifing the active slave */
74ff3298 145 struct eth_addr active_slave_mac;
3e5aeeb5 146 /* The MAC address of the active interface. */
f620b43a 147 /* Legacy compatibility. */
9dd165e0 148 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
f620b43a 149
37bec3d3 150 struct ovs_refcount ref_cnt;
f620b43a
BP
151};
152
adcf00ba
AZ
/* What to do with a 'struct bond_pr_rule_op'. */
enum bond_op {
    ADD,    /* Add the rule to ofproto's flow table. */
    DEL,    /* Delete the rule from ofproto's flow table. */
};
158
159/* A rule to add to or delete from ofproto's internal flow table. */
160struct bond_pr_rule_op {
161 struct hmap_node hmap_node;
162 struct match match;
163 ofp_port_t out_ofport;
164 enum bond_op op;
6c932bc8 165 struct rule **pr_rule;
adcf00ba
AZ
166};
167
3bfd3972
EJ
168static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
169static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
170 OVS_REQ_RDLOCK(rwlock);
4a1b8f30
EJ
171static void bond_enable_slave(struct bond_slave *, bool enable)
172 OVS_REQ_WRLOCK(rwlock);
173static void bond_link_status_update(struct bond_slave *)
3bfd3972 174 OVS_REQ_WRLOCK(rwlock);
4a1b8f30 175static void bond_choose_active_slave(struct bond *)
9e1a6910 176 OVS_REQ_WRLOCK(rwlock);
f620b43a
BP
177static struct bond_entry *lookup_bond_entry(const struct bond *,
178 const struct flow *,
3bfd3972
EJ
179 uint16_t vlan)
180 OVS_REQ_RDLOCK(rwlock);
f1c8a79c
AW
181static struct bond_slave *get_enabled_slave(struct bond *)
182 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
183static struct bond_slave *choose_output_slave(const struct bond *,
184 const struct flow *,
bcd2633a 185 struct flow_wildcards *,
4a1b8f30 186 uint16_t vlan)
3bfd3972 187 OVS_REQ_RDLOCK(rwlock);
05df1623 188static void update_recirc_rules__(struct bond *bond);
f620b43a
BP
189
190/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
191 * stores the mode in '*balance' and returns true. Otherwise returns false
192 * without modifying '*balance'. */
193bool
194bond_mode_from_string(enum bond_mode *balance, const char *s)
195{
196 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
197 *balance = BM_TCP;
198 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
199 *balance = BM_SLB;
200 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
201 *balance = BM_AB;
202 } else {
203 return false;
204 }
205 return true;
206}
207
208/* Returns a string representing 'balance'. */
209const char *
210bond_mode_to_string(enum bond_mode balance) {
211 switch (balance) {
212 case BM_TCP:
213 return "balance-tcp";
214 case BM_SLB:
215 return "balance-slb";
216 case BM_AB:
217 return "active-backup";
218 }
428b2edd 219 OVS_NOT_REACHED();
f620b43a
BP
220}
221
f620b43a
BP
222\f
223/* Creates and returns a new bond whose configuration is initially taken from
224 * 's'.
225 *
226 * The caller should register each slave on the new bond by calling
227 * bond_slave_register(). */
228struct bond *
adcf00ba 229bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
f620b43a
BP
230{
231 struct bond *bond;
232
233 bond = xzalloc(sizeof *bond);
adcf00ba 234 bond->ofproto = ofproto;
f620b43a 235 hmap_init(&bond->slaves);
417e7e66 236 ovs_list_init(&bond->enabled_slaves);
f1c8a79c 237 ovs_mutex_init(&bond->mutex);
37bec3d3 238 ovs_refcount_init(&bond->ref_cnt);
adcf00ba
AZ
239 hmap_init(&bond->pr_rule_ops);
240
30353934 241 bond->active_slave_mac = eth_addr_zero;
242 bond->active_slave_changed = false;
243
f620b43a 244 bond_reconfigure(bond, s);
f620b43a
BP
245 return bond;
246}
247
03366a2d
EJ
248struct bond *
249bond_ref(const struct bond *bond_)
250{
251 struct bond *bond = CONST_CAST(struct bond *, bond_);
252
bca0b3b4 253 if (bond) {
37bec3d3 254 ovs_refcount_ref(&bond->ref_cnt);
bca0b3b4 255 }
03366a2d
EJ
256 return bond;
257}
258
f620b43a
BP
259/* Frees 'bond'. */
260void
03366a2d 261bond_unref(struct bond *bond)
f620b43a 262{
4ec3d7c7 263 struct bond_slave *slave;
f620b43a 264
24f83812 265 if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) {
03366a2d
EJ
266 return;
267 }
268
3bfd3972
EJ
269 ovs_rwlock_wrlock(&rwlock);
270 hmap_remove(all_bonds, &bond->hmap_node);
271 ovs_rwlock_unlock(&rwlock);
f620b43a 272
4ec3d7c7 273 HMAP_FOR_EACH_POP (slave, hmap_node, &bond->slaves) {
f620b43a
BP
274 /* Client owns 'slave->netdev'. */
275 free(slave->name);
276 free(slave);
277 }
278 hmap_destroy(&bond->slaves);
279
f1c8a79c 280 ovs_mutex_destroy(&bond->mutex);
adcf00ba 281
05df1623 282 /* Free bond resources. Remove existing post recirc rules. */
adcf00ba 283 if (bond->recirc_id) {
e672ff9b 284 recirc_free_id(bond->recirc_id);
05df1623 285 bond->recirc_id = 0;
adcf00ba 286 }
05df1623
AZ
287 free(bond->hash);
288 bond->hash = NULL;
289 update_recirc_rules__(bond);
adcf00ba 290
05df1623
AZ
291 hmap_destroy(&bond->pr_rule_ops);
292 free(bond->name);
f620b43a
BP
293 free(bond);
294}
295
adcf00ba
AZ
296static void
297add_pr_rule(struct bond *bond, const struct match *match,
6c932bc8 298 ofp_port_t out_ofport, struct rule **rule)
adcf00ba
AZ
299{
300 uint32_t hash = match_hash(match, 0);
301 struct bond_pr_rule_op *pr_op;
302
303 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
304 if (match_equal(&pr_op->match, match)) {
305 pr_op->op = ADD;
306 pr_op->out_ofport = out_ofport;
307 pr_op->pr_rule = rule;
308 return;
309 }
310 }
311
312 pr_op = xmalloc(sizeof *pr_op);
313 pr_op->match = *match;
314 pr_op->op = ADD;
315 pr_op->out_ofport = out_ofport;
316 pr_op->pr_rule = rule;
317 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
318}
319
05df1623
AZ
320/* This function should almost never be called directly.
321 * 'update_recirc_rules()' should be called instead. Since
322 * this function modifies 'bond->pr_rule_ops', it is only
323 * safe when 'rwlock' is held.
324 *
325 * However, when the 'bond' is the only reference in the system,
326 * calling this function avoid acquiring lock only to satisfy
327 * lock annotation. Currently, only 'bond_unref()' calls
328 * this function directly. */
adcf00ba 329static void
05df1623 330update_recirc_rules__(struct bond *bond)
adcf00ba
AZ
331{
332 struct match match;
333 struct bond_pr_rule_op *pr_op, *next_op;
334 uint64_t ofpacts_stub[128 / 8];
335 struct ofpbuf ofpacts;
336 int i;
337
338 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
339
340 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
341 pr_op->op = DEL;
342 }
343
6c932bc8
AZ
344 if (bond->hash && bond->recirc_id) {
345 for (i = 0; i < BOND_BUCKETS; i++) {
346 struct bond_slave *slave = bond->hash[i].slave;
adcf00ba 347
6c932bc8
AZ
348 if (slave) {
349 match_init_catchall(&match);
350 match_set_recirc_id(&match, bond->recirc_id);
6c932bc8 351 match_set_dp_hash_masked(&match, i, BOND_MASK);
adcf00ba 352
6c932bc8
AZ
353 add_pr_rule(bond, &match, slave->ofp_port,
354 &bond->hash[i].pr_rule);
355 }
adcf00ba
AZ
356 }
357 }
358
359 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
360 int error;
adcf00ba
AZ
361 switch (pr_op->op) {
362 case ADD:
363 ofpbuf_clear(&ofpacts);
364 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
365 error = ofproto_dpif_add_internal_flow(bond->ofproto,
366 &pr_op->match,
290ad78a 367 RECIRC_RULE_PRIORITY, 0,
6c932bc8 368 &ofpacts, pr_op->pr_rule);
adcf00ba 369 if (error) {
50f96b10 370 char *err_s = match_to_string(&pr_op->match, NULL,
adcf00ba
AZ
371 RECIRC_RULE_PRIORITY);
372
373 VLOG_ERR("failed to add post recirculation flow %s", err_s);
374 free(err_s);
adcf00ba
AZ
375 }
376 break;
377
378 case DEL:
379 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
380 &pr_op->match,
381 RECIRC_RULE_PRIORITY);
382 if (error) {
50f96b10 383 char *err_s = match_to_string(&pr_op->match, NULL,
adcf00ba
AZ
384 RECIRC_RULE_PRIORITY);
385
386 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
387 free(err_s);
388 }
389
390 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
fa233667
YS
391 if (bond->hash) {
392 *pr_op->pr_rule = NULL;
393 }
adcf00ba
AZ
394 free(pr_op);
395 break;
396 }
397 }
398
399 ofpbuf_uninit(&ofpacts);
400}
401
05df1623
AZ
402static void
403update_recirc_rules(struct bond *bond)
404 OVS_REQ_RDLOCK(rwlock)
405{
406 update_recirc_rules__(bond);
407}
adcf00ba 408
f620b43a
BP
409/* Updates 'bond''s overall configuration to 's'.
410 *
411 * The caller should register each slave on 'bond' by calling
412 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 413 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
414 *
415 * Returns true if the configuration has changed in such a way that requires
416 * flow revalidation.
417 * */
418bool
f620b43a
BP
419bond_reconfigure(struct bond *bond, const struct bond_settings *s)
420{
59d7b2b6
EJ
421 bool revalidate = false;
422
3bfd3972 423 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
424 if (!bond->name || strcmp(bond->name, s->name)) {
425 if (bond->name) {
3bfd3972 426 hmap_remove(all_bonds, &bond->hmap_node);
f620b43a
BP
427 free(bond->name);
428 }
429 bond->name = xstrdup(s->name);
3bfd3972 430 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
f620b43a
BP
431 }
432
f620b43a
BP
433 bond->updelay = s->up_delay;
434 bond->downdelay = s->down_delay;
bc1b010c 435
9dd165e0
RK
436 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
437 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
438 revalidate = true;
439 }
440
bc1b010c
EJ
441 if (bond->rebalance_interval != s->rebalance_interval) {
442 bond->rebalance_interval = s->rebalance_interval;
443 revalidate = true;
444 }
f620b43a 445
59d7b2b6
EJ
446 if (bond->balance != s->balance) {
447 bond->balance = s->balance;
448 revalidate = true;
449 }
450
672d18b2
EJ
451 if (bond->basis != s->basis) {
452 bond->basis = s->basis;
453 revalidate = true;
454 }
455
62904702
EJ
456 if (bond->bond_revalidate) {
457 revalidate = true;
458 bond->bond_revalidate = false;
459 }
460
adcf00ba
AZ
461 if (bond->balance != BM_AB) {
462 if (!bond->recirc_id) {
e672ff9b 463 bond->recirc_id = recirc_alloc_id(bond->ofproto);
adcf00ba
AZ
464 }
465 } else if (bond->recirc_id) {
e672ff9b 466 recirc_free_id(bond->recirc_id);
adcf00ba
AZ
467 bond->recirc_id = 0;
468 }
469
95aafb2a
EJ
470 if (bond->balance == BM_AB || !bond->hash || revalidate) {
471 bond_entry_reset(bond);
472 }
473
3bfd3972 474 ovs_rwlock_unlock(&rwlock);
59d7b2b6 475 return revalidate;
f620b43a
BP
476}
477
3e5aeeb5 478static struct bond_slave *
74ff3298 479bond_find_slave_by_mac(const struct bond *bond, const struct eth_addr mac)
3e5aeeb5
AZ
480{
481 struct bond_slave *slave;
482
483 /* Find the last active slave */
484 HMAP_FOR_EACH(slave, hmap_node, &bond->slaves) {
74ff3298 485 struct eth_addr slave_mac;
3e5aeeb5 486
74ff3298 487 if (netdev_get_etheraddr(slave->netdev, &slave_mac)) {
3e5aeeb5
AZ
488 continue;
489 }
490
74ff3298 491 if (eth_addr_equals(slave_mac, mac)) {
3e5aeeb5
AZ
492 return slave;
493 }
494 }
495
496 return NULL;
497}
498
499static void
500bond_active_slave_changed(struct bond *bond)
501{
f626af7a
AZ
502 if (bond->active_slave) {
503 struct eth_addr mac;
504 netdev_get_etheraddr(bond->active_slave->netdev, &mac);
505 bond->active_slave_mac = mac;
506 } else {
507 bond->active_slave_mac = eth_addr_zero;
508 }
3e5aeeb5
AZ
509 bond->active_slave_changed = true;
510 seq_change(connectivity_seq_get());
511}
512
f8ddccd2 513static void
1ea24138 514bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
3bfd3972 515 OVS_REQ_WRLOCK(rwlock)
f8ddccd2
BP
516{
517 if (slave->netdev != netdev) {
f8ddccd2 518 slave->netdev = netdev;
1ea24138 519 slave->change_seq = 0;
f8ddccd2
BP
520 }
521}
522
f620b43a
BP
523/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
524 * arbitrary client-provided pointer that uniquely identifies a slave within a
525 * bond. If 'slave_' already exists within 'bond' then this function
526 * reconfigures the existing slave.
527 *
528 * 'netdev' must be the network device that 'slave_' represents. It is owned
529 * by the client, so the client must not close it before either unregistering
530 * 'slave_' or destroying 'bond'.
4d6fb5eb 531 */
f620b43a 532void
adcf00ba
AZ
533bond_slave_register(struct bond *bond, void *slave_,
534 ofp_port_t ofport, struct netdev *netdev)
f620b43a 535{
3bfd3972 536 struct bond_slave *slave;
f620b43a 537
3bfd3972
EJ
538 ovs_rwlock_wrlock(&rwlock);
539 slave = bond_slave_lookup(bond, slave_);
f620b43a
BP
540 if (!slave) {
541 slave = xzalloc(sizeof *slave);
542
543 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
544 slave->bond = bond;
545 slave->aux = slave_;
adcf00ba 546 slave->ofp_port = ofport;
f620b43a 547 slave->delay_expires = LLONG_MAX;
244b2160 548 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 549 bond->bond_revalidate = true;
244b2160 550
b3c18f66 551 slave->enabled = false;
4a1b8f30 552 bond_enable_slave(slave, netdev_get_carrier(netdev));
f620b43a
BP
553 }
554
1ea24138 555 bond_slave_set_netdev__(slave, netdev);
a6934aa9 556
f620b43a
BP
557 free(slave->name);
558 slave->name = xstrdup(netdev_get_name(netdev));
3bfd3972 559 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
560}
561
f8ddccd2
BP
562/* Updates the network device to be used with 'slave_' to 'netdev'.
563 *
564 * This is useful if the caller closes and re-opens the network device
565 * registered with bond_slave_register() but doesn't need to change anything
566 * else. */
567void
568bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
569{
3bfd3972
EJ
570 struct bond_slave *slave;
571
572 ovs_rwlock_wrlock(&rwlock);
573 slave = bond_slave_lookup(bond, slave_);
f8ddccd2 574 if (slave) {
1ea24138 575 bond_slave_set_netdev__(slave, netdev);
f8ddccd2 576 }
3bfd3972 577 ovs_rwlock_unlock(&rwlock);
f8ddccd2
BP
578}
579
f620b43a
BP
580/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
581 * then this function has no effect.
582 *
583 * Unregistering a slave invalidates all flows. */
584void
585bond_slave_unregister(struct bond *bond, const void *slave_)
586{
3bfd3972 587 struct bond_slave *slave;
f620b43a
BP
588 bool del_active;
589
3bfd3972
EJ
590 ovs_rwlock_wrlock(&rwlock);
591 slave = bond_slave_lookup(bond, slave_);
f620b43a 592 if (!slave) {
3bfd3972 593 goto out;
f620b43a
BP
594 }
595
4a1b8f30
EJ
596 bond->bond_revalidate = true;
597 bond_enable_slave(slave, false);
b3c18f66 598
f620b43a
BP
599 del_active = bond->active_slave == slave;
600 if (bond->hash) {
601 struct bond_entry *e;
602 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
603 if (e->slave == slave) {
604 e->slave = NULL;
605 }
606 }
607 }
608
609 free(slave->name);
610
611 hmap_remove(&bond->slaves, &slave->hmap_node);
612 /* Client owns 'slave->netdev'. */
613 free(slave);
614
615 if (del_active) {
4a1b8f30 616 bond_choose_active_slave(bond);
f620b43a
BP
617 bond->send_learning_packets = true;
618 }
3bfd3972
EJ
619out:
620 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
621}
622
296f6519
EJ
623/* Should be called on each slave in 'bond' before bond_run() to indicate
624 * whether or not 'slave_' may be enabled. This function is intended to allow
625 * other protocols to have some impact on bonding decisions. For example LACP
626 * or high level link monitoring protocols may decide that a given slave should
627 * not be able to send traffic. */
4d6fb5eb 628void
296f6519 629bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 630{
3bfd3972 631 ovs_rwlock_wrlock(&rwlock);
296f6519 632 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
3bfd3972 633 ovs_rwlock_unlock(&rwlock);
4d6fb5eb
EJ
634}
635
4a1b8f30
EJ
636/* Performs periodic maintenance on 'bond'.
637 *
638 * Returns true if the caller should revalidate its flows.
f620b43a
BP
639 *
640 * The caller should check bond_should_send_learning_packets() afterward. */
4a1b8f30
EJ
641bool
642bond_run(struct bond *bond, enum lacp_status lacp_status)
f620b43a
BP
643{
644 struct bond_slave *slave;
4a1b8f30 645 bool revalidate;
f620b43a 646
3bfd3972 647 ovs_rwlock_wrlock(&rwlock);
bdebeece
EJ
648 if (bond->lacp_status != lacp_status) {
649 bond->lacp_status = lacp_status;
4592d0e2
EJ
650 bond->bond_revalidate = true;
651 }
4d6fb5eb 652
f620b43a
BP
653 /* Enable slaves based on link status and LACP feedback. */
654 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
4a1b8f30 655 bond_link_status_update(slave);
da4a6191 656 slave->change_seq = seq_read(connectivity_seq_get());
f620b43a
BP
657 }
658 if (!bond->active_slave || !bond->active_slave->enabled) {
4a1b8f30 659 bond_choose_active_slave(bond);
f620b43a
BP
660 }
661
4a1b8f30
EJ
662 revalidate = bond->bond_revalidate;
663 bond->bond_revalidate = false;
3bfd3972 664 ovs_rwlock_unlock(&rwlock);
4a1b8f30
EJ
665
666 return revalidate;
f620b43a
BP
667}
668
669/* Causes poll_block() to wake up when 'bond' needs something to be done. */
670void
671bond_wait(struct bond *bond)
672{
673 struct bond_slave *slave;
674
3bfd3972 675 ovs_rwlock_rdlock(&rwlock);
f620b43a
BP
676 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
677 if (slave->delay_expires != LLONG_MAX) {
678 poll_timer_wait_until(slave->delay_expires);
679 }
1ea24138 680
da4a6191 681 seq_wait(connectivity_seq_get(), slave->change_seq);
f620b43a
BP
682 }
683
bbc13389 684 if (bond->bond_revalidate) {
f620b43a
BP
685 poll_immediate_wake();
686 }
3bfd3972 687 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
688
689 /* We don't wait for bond->next_rebalance because rebalancing can only run
690 * at a flow account checkpoint. ofproto does checkpointing on its own
691 * schedule and bond_rebalance() gets called afterward, so we'd just be
692 * waking up for no purpose. */
693}
694\f
695/* MAC learning table interaction. */
696
697static bool
698may_send_learning_packets(const struct bond *bond)
699{
9dd165e0
RK
700 return ((bond->lacp_status == LACP_DISABLED
701 && (bond->balance == BM_SLB || bond->balance == BM_AB))
702 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
bdebeece 703 && bond->active_slave;
f620b43a
BP
704}
705
706/* Returns true if 'bond' needs the client to send out packets to assist with
707 * MAC learning on 'bond'. If this function returns true, then the client
708 * should iterate through its MAC learning table for the bridge on which 'bond'
709 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 710 * it should call bond_compose_learning_packet().
f620b43a 711 *
477879ea
BP
712 * This function will only return true if 'bond' is in SLB or active-backup
713 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
714 * necessary.
f620b43a
BP
715 *
716 * Calling this function resets the state that it checks. */
717bool
718bond_should_send_learning_packets(struct bond *bond)
719{
3bfd3972
EJ
720 bool send;
721
722 ovs_rwlock_wrlock(&rwlock);
723 send = bond->send_learning_packets && may_send_learning_packets(bond);
f620b43a 724 bond->send_learning_packets = false;
3bfd3972 725 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
726 return send;
727}
728
729/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
730 *
ea131871
JG
731 * See bond_should_send_learning_packets() for description of usage. The
732 * caller should send the composed packet on the port associated with
733 * port_aux and takes ownership of the returned ofpbuf. */
cf62fa4c 734struct dp_packet *
74ff3298 735bond_compose_learning_packet(struct bond *bond, const struct eth_addr eth_src,
ea131871 736 uint16_t vlan, void **port_aux)
f620b43a
BP
737{
738 struct bond_slave *slave;
cf62fa4c 739 struct dp_packet *packet;
f620b43a 740 struct flow flow;
f620b43a 741
3bfd3972 742 ovs_rwlock_rdlock(&rwlock);
cb22974d 743 ovs_assert(may_send_learning_packets(bond));
f620b43a 744 memset(&flow, 0, sizeof flow);
74ff3298 745 flow.dl_src = eth_src;
4a1b8f30 746 slave = choose_output_slave(bond, &flow, NULL, vlan);
f620b43a 747
cf62fa4c 748 packet = dp_packet_new(0);
2ea838ac 749 compose_rarp(packet, eth_src);
f620b43a 750 if (vlan) {
1bf02876 751 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
f620b43a 752 }
f620b43a 753
ea131871 754 *port_aux = slave->aux;
3bfd3972 755 ovs_rwlock_unlock(&rwlock);
ea131871 756 return packet;
f620b43a
BP
757}
758\f
759/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
760 * Ethernet destination address of 'eth_dst', should be admitted.
761 *
762 * The return value is one of the following:
763 *
764 * - BV_ACCEPT: Admit the packet.
765 *
766 * - BV_DROP: Drop the packet.
767 *
768 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
769 * Ethernet source address and VLAN. If there is none, or if the packet
770 * is on the learned port, then admit the packet. If a different port has
771 * been learned, however, drop the packet (and do not use it for MAC
772 * learning).
773 */
774enum bond_verdict
775bond_check_admissibility(struct bond *bond, const void *slave_,
74ff3298 776 const struct eth_addr eth_dst)
f620b43a 777{
3bfd3972
EJ
778 enum bond_verdict verdict = BV_DROP;
779 struct bond_slave *slave;
9a1c6450 780
3bfd3972
EJ
781 ovs_rwlock_rdlock(&rwlock);
782 slave = bond_slave_lookup(bond, slave_);
4222bbc8 783 if (!slave) {
3bfd3972 784 goto out;
4222bbc8
EJ
785 }
786
9a1c6450
EJ
787 /* LACP bonds have very loose admissibility restrictions because we can
788 * assume the remote switch is aware of the bond and will "do the right
789 * thing". However, as a precaution we drop packets on disabled slaves
790 * because no correctly implemented partner switch should be sending
bdebeece
EJ
791 * packets to them.
792 *
793 * If LACP is configured, but LACP negotiations have been unsuccessful, we
9dd165e0 794 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
bdebeece 795 switch (bond->lacp_status) {
3bfd3972
EJ
796 case LACP_NEGOTIATED:
797 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
798 goto out;
799 case LACP_CONFIGURED:
9dd165e0
RK
800 if (!bond->lacp_fallback_ab) {
801 goto out;
802 }
e5c4f827 803 break;
3bfd3972 804 case LACP_DISABLED:
e5c4f827 805 if (bond->balance == BM_TCP) {
806 goto out;
807 }
3bfd3972 808 break;
f620b43a
BP
809 }
810
811 /* Drop all multicast packets on inactive slaves. */
812 if (eth_addr_is_multicast(eth_dst)) {
4222bbc8 813 if (bond->active_slave != slave) {
3bfd3972 814 goto out;
f620b43a
BP
815 }
816 }
817
f931a4c9 818 switch (bond->balance) {
9dd165e0
RK
819 case BM_TCP:
820 /* TCP balanced bonds require successful LACP negotiations. Based on the
821 * above check, LACP is off or lacp_fallback_ab is true on this bond.
822 * If lacp_fallback_ab is true fall through to BM_AB case else, we
823 * drop all incoming traffic. */
824 if (!bond->lacp_fallback_ab) {
825 goto out;
826 }
73c7216a 827 /* fall through */
9dd165e0 828
f931a4c9
BP
829 case BM_AB:
830 /* Drop all packets which arrive on backup slaves. This is similar to
831 * how Linux bonding handles active-backup bonds. */
7ba7dcf0
EJ
832 if (bond->active_slave != slave) {
833 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
834
e6b2255c
BP
835 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
836 " slave (%s) destined for " ETH_ADDR_FMT,
837 slave->name, ETH_ADDR_ARGS(eth_dst));
3bfd3972 838 goto out;
7ba7dcf0 839 }
3bfd3972
EJ
840 verdict = BV_ACCEPT;
841 goto out;
f931a4c9 842
f931a4c9
BP
843 case BM_SLB:
844 /* Drop all packets for which we have learned a different input port,
845 * because we probably sent the packet on one slave and got it back on
846 * the other. Gratuitous ARP packets are an exception to this rule:
847 * the host has moved to another switch. The exception to the
848 * exception is if we locked the learning table to avoid reflections on
849 * bond slaves. */
3bfd3972
EJ
850 verdict = BV_DROP_IF_MOVED;
851 goto out;
7ba7dcf0
EJ
852 }
853
428b2edd 854 OVS_NOT_REACHED();
3bfd3972
EJ
855out:
856 ovs_rwlock_unlock(&rwlock);
857 return verdict;
858
f620b43a
BP
859}
860
861/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
862 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
863 * NULL if the packet should be dropped because no slaves are enabled.
864 *
865 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
866 * should be a VID only (i.e. excluding the PCP bits). Second,
867 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
868 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
869 * packet belongs to (so for an access port it will be the access port's VLAN).
870 *
bcd2633a
JP
871 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
872 * significant in the selection. At some point earlier, 'wc' should
873 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
874 */
875void *
876bond_choose_output_slave(struct bond *bond, const struct flow *flow,
4a1b8f30 877 struct flow_wildcards *wc, uint16_t vlan)
f620b43a 878{
3bfd3972 879 struct bond_slave *slave;
b5d5d7d3 880 void *aux;
3bfd3972
EJ
881
882 ovs_rwlock_rdlock(&rwlock);
4a1b8f30 883 slave = choose_output_slave(bond, flow, wc, vlan);
b5d5d7d3 884 aux = slave ? slave->aux : NULL;
3bfd3972 885 ovs_rwlock_unlock(&rwlock);
b5d5d7d3
AW
886
887 return aux;
f620b43a 888}
f620b43a 889\f
adcf00ba
AZ
890/* Recirculation. */
891static void
892bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
c6855ec5 893 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
894{
895 if (entry->slave) {
896 uint64_t delta;
897
898 delta = rule_tx_bytes - entry->pr_tx_bytes;
899 entry->tx_bytes += delta;
900 entry->pr_tx_bytes = rule_tx_bytes;
901 }
902}
903
904/* Maintain bond stats using post recirculation rule byte counters.*/
60cda7d6 905static void
adcf00ba 906bond_recirculation_account(struct bond *bond)
80316557 907 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
908{
909 int i;
910
adcf00ba
AZ
911 for (i=0; i<=BOND_MASK; i++) {
912 struct bond_entry *entry = &bond->hash[i];
913 struct rule *rule = entry->pr_rule;
914
915 if (rule) {
916 uint64_t n_packets OVS_UNUSED;
917 long long int used OVS_UNUSED;
918 uint64_t n_bytes;
919
920 rule->ofproto->ofproto_class->rule_get_stats(
921 rule, &n_packets, &n_bytes, &used);
922 bond_entry_account(entry, n_bytes);
923 }
924 }
adcf00ba
AZ
925}
926
a80aba3a 927static bool
6b95d23c 928bond_may_recirc(const struct bond *bond)
adcf00ba 929{
6b95d23c 930 return bond->balance == BM_TCP && bond->recirc_id;
adcf00ba
AZ
931}
932
ca8127fd
AZ
933static void
934bond_update_post_recirc_rules__(struct bond* bond, const bool force)
935 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
936{
937 struct bond_entry *e;
938 bool update_rules = force; /* Always update rules if caller forces it. */
939
940 /* Make sure all bond entries are populated */
941 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
942 if (!e->slave || !e->slave->enabled) {
943 update_rules = true;
944 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
945 struct bond_slave, hmap_node);
946 if (!e->slave->enabled) {
947 e->slave = bond->active_slave;
948 }
949 }
950 }
951
952 if (update_rules) {
953 update_recirc_rules(bond);
954 }
955}
ca8127fd
AZ
956
957void
82f9f1f5
AZ
958bond_update_post_recirc_rules(struct bond *bond, uint32_t *recirc_id,
959 uint32_t *hash_basis)
ca8127fd 960{
a80aba3a
AZ
961 bool may_recirc = bond_may_recirc(bond);
962
963 if (may_recirc) {
964 /* To avoid unnecessary locking, bond_may_recirc() is first
965 * called outside of the 'rwlock'. After acquiring the lock,
966 * check again to make sure bond configuration has not been changed. */
967 ovs_rwlock_wrlock(&rwlock);
968 may_recirc = bond_may_recirc(bond);
969 if (may_recirc) {
970 *recirc_id = bond->recirc_id;
971 *hash_basis = bond->basis;
972 bond_update_post_recirc_rules__(bond, false);
973 }
974 ovs_rwlock_unlock(&rwlock);
975 }
976
977 if (!may_recirc) {
6b95d23c 978 *recirc_id = *hash_basis = 0;
82f9f1f5 979 }
ca8127fd 980}
82f9f1f5 981
adcf00ba 982\f
f620b43a
BP
983/* Rebalancing. */
984
1b137691 985static bool
3bfd3972 986bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
1b137691 987{
bc1b010c
EJ
988 return bond->rebalance_interval
989 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
990}
991
f620b43a
BP
992/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
993void
994bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
995 uint64_t n_bytes)
996{
3bfd3972 997 ovs_rwlock_wrlock(&rwlock);
1b137691 998 if (bond_is_balanced(bond)) {
f620b43a 999 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a 1000 }
3bfd3972 1001 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1002}
1003
1004static struct bond_slave *
ca6ba700 1005bond_slave_from_bal_node(struct ovs_list *bal) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
1006{
1007 return CONTAINER_OF(bal, struct bond_slave, bal_node);
1008}
1009
1010static void
ca6ba700 1011log_bals(struct bond *bond, const struct ovs_list *bals)
c6855ec5 1012 OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
1013{
1014 if (VLOG_IS_DBG_ENABLED()) {
1015 struct ds ds = DS_EMPTY_INITIALIZER;
1016 const struct bond_slave *slave;
1017
1018 LIST_FOR_EACH (slave, bal_node, bals) {
1019 if (ds.length) {
1020 ds_put_char(&ds, ',');
1021 }
1022 ds_put_format(&ds, " %s %"PRIu64"kB",
1023 slave->name, slave->tx_bytes / 1024);
1024
1025 if (!slave->enabled) {
1026 ds_put_cstr(&ds, " (disabled)");
1027 }
417e7e66 1028 if (!ovs_list_is_empty(&slave->entries)) {
f620b43a
BP
1029 struct bond_entry *e;
1030
1031 ds_put_cstr(&ds, " (");
1032 LIST_FOR_EACH (e, list_node, &slave->entries) {
417e7e66 1033 if (&e->list_node != ovs_list_front(&slave->entries)) {
f620b43a
BP
1034 ds_put_cstr(&ds, " + ");
1035 }
34582733 1036 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
f620b43a
BP
1037 e - bond->hash, e->tx_bytes / 1024);
1038 }
1039 ds_put_cstr(&ds, ")");
1040 }
1041 }
1042 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
1043 ds_destroy(&ds);
1044 }
1045}
1046
1047/* Shifts 'hash' from its current slave to 'to'. */
1048static void
4a1b8f30 1049bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
c6855ec5 1050 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
1051{
1052 struct bond_slave *from = hash->slave;
1053 struct bond *bond = from->bond;
1054 uint64_t delta = hash->tx_bytes;
1055
34582733 1056 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
f620b43a
BP
1057 "from %s to %s (now carrying %"PRIu64"kB and "
1058 "%"PRIu64"kB load, respectively)",
1059 bond->name, delta / 1024, hash - bond->hash,
1060 from->name, to->name,
1061 (from->tx_bytes - delta) / 1024,
1062 (to->tx_bytes + delta) / 1024);
1063
1064 /* Shift load away from 'from' to 'to'. */
1065 from->tx_bytes -= delta;
1066 to->tx_bytes += delta;
1067
1068 /* Arrange for flows to be revalidated. */
dc30ea2d 1069 hash->slave = to;
4a1b8f30 1070 bond->bond_revalidate = true;
f620b43a
BP
1071}
1072
09a5d390
BP
1073/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1074 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
1075 * given that doing so must decrease the ratio of the load on the two slaves by
1076 * at least 0.1. Returns NULL if there is no appropriate entry.
1077 *
1078 * The list of entries isn't sorted. I don't know of a reason to prefer to
1079 * shift away small hashes or large hashes. */
1080static struct bond_entry *
1081choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
c6855ec5 1082 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
1083{
1084 struct bond_entry *e;
1085
417e7e66 1086 if (ovs_list_is_short(&from->entries)) {
f620b43a
BP
1087 /* 'from' carries no more than one MAC hash, so shifting load away from
1088 * it would be pointless. */
1089 return NULL;
1090 }
1091
1092 LIST_FOR_EACH (e, list_node, &from->entries) {
c460a6a7
AZ
1093 uint64_t delta = e->tx_bytes; /* The amount to rebalance. */
1094 uint64_t ideal_tx_bytes = (from->tx_bytes + to_tx_bytes)/2;
1095 /* Note, the ideal traffic is the mid point
1096 * between 'from' and 'to'. This value does
1097 * not change by rebalancing. */
1098 uint64_t new_low; /* The lower bandwidth between 'to' and 'from'
1099 after rebalancing. */
1100
1101 new_low = MIN(from->tx_bytes - delta, to_tx_bytes + delta);
1102
1103 if ((new_low > to_tx_bytes) &&
1104 (new_low - to_tx_bytes >= (ideal_tx_bytes - to_tx_bytes) / 10)) {
1105 /* Only rebalance if the new 'low' is closer to to the mid point,
1106 * and the improvement exceeds 10% of current traffic
1107 * deviation from the ideal split.
1108 *
1109 * The improvement on the 'high' side is always the same as the
1110 * 'low' side. Thus consider 'low' side is sufficient. */
f620b43a
BP
1111 return e;
1112 }
1113 }
1114
1115 return NULL;
1116}
1117
1118/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1119 * maintained. */
1120static void
ca6ba700 1121insert_bal(struct ovs_list *bals, struct bond_slave *slave)
f620b43a
BP
1122{
1123 struct bond_slave *pos;
1124
1125 LIST_FOR_EACH (pos, bal_node, bals) {
1126 if (slave->tx_bytes > pos->tx_bytes) {
1127 break;
1128 }
1129 }
417e7e66 1130 ovs_list_insert(&pos->bal_node, &slave->bal_node);
f620b43a
BP
1131}
1132
1133/* Removes 'slave' from its current list and then inserts it into 'bals' so
1134 * that descending order of 'tx_bytes' is maintained. */
1135static void
ca6ba700 1136reinsert_bal(struct ovs_list *bals, struct bond_slave *slave)
f620b43a 1137{
417e7e66 1138 ovs_list_remove(&slave->bal_node);
f620b43a
BP
1139 insert_bal(bals, slave);
1140}
1141
1142/* If 'bond' needs rebalancing, does so.
1143 *
adcf00ba
AZ
1144 * The caller should have called bond_account() for each active flow, or in case
1145 * of recirculation is used, have called bond_recirculation_account(bond),
1146 * to ensure that flow data is consistently accounted at this point.
60cda7d6
AZ
1147 */
1148void
4a1b8f30 1149bond_rebalance(struct bond *bond)
f620b43a
BP
1150{
1151 struct bond_slave *slave;
1152 struct bond_entry *e;
ca6ba700 1153 struct ovs_list bals;
adcf00ba 1154 bool rebalanced = false;
60cda7d6 1155 bool use_recirc;
f620b43a 1156
3bfd3972 1157 ovs_rwlock_wrlock(&rwlock);
1b137691 1158 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
adcf00ba 1159 goto done;
f620b43a
BP
1160 }
1161 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1162
88186383 1163 use_recirc = bond->ofproto->backer->rt_support.odp.recirc &&
6b95d23c 1164 bond_may_recirc(bond);
60cda7d6
AZ
1165
1166 if (use_recirc) {
1167 bond_recirculation_account(bond);
1168 }
1169
f620b43a
BP
1170 /* Add each bond_entry to its slave's 'entries' list.
1171 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1172 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1173 slave->tx_bytes = 0;
417e7e66 1174 ovs_list_init(&slave->entries);
f620b43a
BP
1175 }
1176 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1177 if (e->slave && e->tx_bytes) {
1178 e->slave->tx_bytes += e->tx_bytes;
417e7e66 1179 ovs_list_push_back(&e->slave->entries, &e->list_node);
f620b43a
BP
1180 }
1181 }
1182
1183 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1184 *
1185 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1186 * with a proper list sort algorithm. */
417e7e66 1187 ovs_list_init(&bals);
f620b43a
BP
1188 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1189 if (slave->enabled) {
1190 insert_bal(&bals, slave);
1191 }
1192 }
1193 log_bals(bond, &bals);
1194
1195 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
417e7e66
BW
1196 while (!ovs_list_is_short(&bals)) {
1197 struct bond_slave *from = bond_slave_from_bal_node(ovs_list_front(&bals));
1198 struct bond_slave *to = bond_slave_from_bal_node(ovs_list_back(&bals));
f620b43a
BP
1199 uint64_t overload;
1200
1201 overload = from->tx_bytes - to->tx_bytes;
1202 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1203 /* The extra load on 'from' (and all less-loaded slaves), compared
1204 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1205 * it is less than ~1Mbps. No point in rebalancing. */
1206 break;
1207 }
1208
09a5d390
BP
1209 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1210 * to move from 'from' to 'to'. */
f620b43a
BP
1211 e = choose_entry_to_migrate(from, to->tx_bytes);
1212 if (e) {
4a1b8f30 1213 bond_shift_load(e, to);
f620b43a
BP
1214
1215 /* Delete element from from->entries.
1216 *
1217 * We don't add the element to to->hashes. That would only allow
1218 * 'e' to be migrated to another slave in this rebalancing run, and
1219 * there is no point in doing that. */
417e7e66 1220 ovs_list_remove(&e->list_node);
f620b43a
BP
1221
1222 /* Re-sort 'bals'. */
1223 reinsert_bal(&bals, from);
1224 reinsert_bal(&bals, to);
60cda7d6 1225 rebalanced = true;
f620b43a
BP
1226 } else {
1227 /* Can't usefully migrate anything away from 'from'.
1228 * Don't reconsider it. */
417e7e66 1229 ovs_list_remove(&from->bal_node);
f620b43a
BP
1230 }
1231 }
1232
1233 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1234 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1235 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1236 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1237 e->tx_bytes /= 2;
f620b43a 1238 }
adcf00ba 1239
60cda7d6 1240 if (use_recirc && rebalanced) {
ca8127fd 1241 bond_update_post_recirc_rules__(bond,true);
60cda7d6 1242 }
2f486d4c
AZ
1243
1244done:
3bfd3972 1245 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1246}
1247\f
1248/* Bonding unixctl user interface functions. */
1249
1250static struct bond *
3bfd3972 1251bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
1252{
1253 struct bond *bond;
1254
1255 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
3bfd3972 1256 all_bonds) {
f620b43a
BP
1257 if (!strcmp(bond->name, name)) {
1258 return bond;
1259 }
1260 }
1261 return NULL;
1262}
1263
1264static struct bond_slave *
1265bond_lookup_slave(struct bond *bond, const char *slave_name)
1266{
1267 struct bond_slave *slave;
1268
1269 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1270 if (!strcmp(slave->name, slave_name)) {
1271 return slave;
1272 }
1273 }
1274 return NULL;
1275}
1276
1277static void
1278bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
1279 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1280 void *aux OVS_UNUSED)
f620b43a
BP
1281{
1282 struct ds ds = DS_EMPTY_INITIALIZER;
1283 const struct bond *bond;
1284
adcf00ba 1285 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
f620b43a 1286
3bfd3972
EJ
1287 ovs_rwlock_rdlock(&rwlock);
1288 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
f620b43a
BP
1289 const struct bond_slave *slave;
1290 size_t i;
1291
adcf00ba
AZ
1292 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1293 bond_mode_to_string(bond->balance), bond->recirc_id);
f620b43a
BP
1294
1295 i = 0;
1296 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1297 if (i++ > 0) {
1298 ds_put_cstr(&ds, ", ");
1299 }
1300 ds_put_cstr(&ds, slave->name);
1301 }
1302 ds_put_char(&ds, '\n');
1303 }
3bfd3972 1304 ovs_rwlock_unlock(&rwlock);
bde9f75d 1305 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
1306 ds_destroy(&ds);
1307}
1308
1309static void
c33a8a25 1310bond_print_details(struct ds *ds, const struct bond *bond)
3bfd3972 1311 OVS_REQ_RDLOCK(rwlock)
f620b43a 1312{
fc1d4f01
EJ
1313 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1314 const struct shash_node **sorted_slaves = NULL;
f620b43a 1315 const struct bond_slave *slave;
adcf00ba
AZ
1316 bool may_recirc;
1317 uint32_t recirc_id;
fc1d4f01 1318 int i;
f620b43a 1319
c33a8a25
EJ
1320 ds_put_format(ds, "---- %s ----\n", bond->name);
1321 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
1322 bond_mode_to_string(bond->balance));
1323
6b95d23c
AZ
1324 may_recirc = bond_may_recirc(bond);
1325 recirc_id = bond->recirc_id;
adcf00ba
AZ
1326 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1327 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1328
c33a8a25 1329 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 1330
c33a8a25
EJ
1331 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1332 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 1333
1b137691 1334 if (bond_is_balanced(bond)) {
c33a8a25 1335 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
1336 bond->next_rebalance - time_msec());
1337 }
1338
bdebeece
EJ
1339 ds_put_cstr(ds, "lacp_status: ");
1340 switch (bond->lacp_status) {
1341 case LACP_NEGOTIATED:
1342 ds_put_cstr(ds, "negotiated\n");
1343 break;
1344 case LACP_CONFIGURED:
1345 ds_put_cstr(ds, "configured\n");
1346 break;
1347 case LACP_DISABLED:
1348 ds_put_cstr(ds, "off\n");
1349 break;
1350 default:
1351 ds_put_cstr(ds, "<unknown>\n");
1352 break;
1353 }
4d6fb5eb 1354
57fc4fd0 1355 ds_put_format(ds, "lacp_fallback_ab: %s\n",
1356 bond->lacp_fallback_ab ? "true" : "false");
1357
3e5aeeb5
AZ
1358 ds_put_cstr(ds, "active slave mac: ");
1359 ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_slave_mac));
1360 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1361 ds_put_format(ds,"(%s)\n", slave ? slave->name : "none");
1362
f620b43a 1363 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
1364 shash_add(&slave_shash, slave->name, slave);
1365 }
1366 sorted_slaves = shash_sort(&slave_shash);
1367
1368 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 1369 struct bond_entry *be;
f620b43a 1370
fc1d4f01
EJ
1371 slave = sorted_slaves[i]->data;
1372
f620b43a 1373 /* Basic info. */
c33a8a25 1374 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
1375 slave->name, slave->enabled ? "enabled" : "disabled");
1376 if (slave == bond->active_slave) {
c33a8a25 1377 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
1378 }
1379 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 1380 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
1381 slave->enabled ? "downdelay" : "updelay",
1382 slave->delay_expires - time_msec());
1383 }
1384
c33a8a25 1385 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 1386 slave->may_enable ? "true" : "false");
4d6fb5eb 1387
1b137691 1388 if (!bond_is_balanced(bond)) {
f620b43a
BP
1389 continue;
1390 }
1391
1392 /* Hashes. */
f620b43a
BP
1393 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1394 int hash = be - bond->hash;
f6ba1f35 1395 uint64_t be_tx_k;
f620b43a
BP
1396
1397 if (be->slave != slave) {
1398 continue;
1399 }
1400
f6ba1f35
AZ
1401 be_tx_k = be->tx_bytes / 1024;
1402 if (be_tx_k) {
1403 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1404 hash, be_tx_k);
1405 }
f620b43a 1406
7b9f1974 1407 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1408 }
1409 }
fc1d4f01
EJ
1410 shash_destroy(&slave_shash);
1411 free(sorted_slaves);
c33a8a25
EJ
1412 ds_put_cstr(ds, "\n");
1413}
1414
1415static void
1416bond_unixctl_show(struct unixctl_conn *conn,
1417 int argc, const char *argv[],
1418 void *aux OVS_UNUSED)
1419{
1420 struct ds ds = DS_EMPTY_INITIALIZER;
1421
3bfd3972 1422 ovs_rwlock_rdlock(&rwlock);
c33a8a25
EJ
1423 if (argc > 1) {
1424 const struct bond *bond = bond_find(argv[1]);
1425
1426 if (!bond) {
bde9f75d 1427 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1428 goto out;
c33a8a25
EJ
1429 }
1430 bond_print_details(&ds, bond);
1431 } else {
1432 const struct bond *bond;
1433
3bfd3972 1434 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
c33a8a25
EJ
1435 bond_print_details(&ds, bond);
1436 }
1437 }
1438
bde9f75d 1439 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a 1440 ds_destroy(&ds);
3bfd3972
EJ
1441
1442out:
1443 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1444}
1445
1446static void
0e15264f
BP
1447bond_unixctl_migrate(struct unixctl_conn *conn,
1448 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1449 void *aux OVS_UNUSED)
1450{
0e15264f
BP
1451 const char *bond_s = argv[1];
1452 const char *hash_s = argv[2];
1453 const char *slave_s = argv[3];
f620b43a
BP
1454 struct bond *bond;
1455 struct bond_slave *slave;
1456 struct bond_entry *entry;
1457 int hash;
1458
3bfd3972 1459 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1460 bond = bond_find(bond_s);
1461 if (!bond) {
bde9f75d 1462 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1463 goto out;
f620b43a
BP
1464 }
1465
1466 if (bond->balance != BM_SLB) {
bde9f75d 1467 unixctl_command_reply_error(conn, "not an SLB bond");
3bfd3972 1468 goto out;
f620b43a
BP
1469 }
1470
1471 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1472 hash = atoi(hash_s) & BOND_MASK;
1473 } else {
bde9f75d 1474 unixctl_command_reply_error(conn, "bad hash");
3bfd3972 1475 goto out;
f620b43a
BP
1476 }
1477
1478 slave = bond_lookup_slave(bond, slave_s);
1479 if (!slave) {
bde9f75d 1480 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1481 goto out;
f620b43a
BP
1482 }
1483
1484 if (!slave->enabled) {
bde9f75d 1485 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
3bfd3972 1486 goto out;
f620b43a
BP
1487 }
1488
1489 entry = &bond->hash[hash];
4a1b8f30 1490 bond->bond_revalidate = true;
f620b43a 1491 entry->slave = slave;
bde9f75d 1492 unixctl_command_reply(conn, "migrated");
3bfd3972
EJ
1493
1494out:
1495 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1496}
1497
1498static void
0e15264f
BP
1499bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1500 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1501 void *aux OVS_UNUSED)
1502{
0e15264f
BP
1503 const char *bond_s = argv[1];
1504 const char *slave_s = argv[2];
f620b43a
BP
1505 struct bond *bond;
1506 struct bond_slave *slave;
1507
3bfd3972 1508 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1509 bond = bond_find(bond_s);
1510 if (!bond) {
bde9f75d 1511 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1512 goto out;
f620b43a
BP
1513 }
1514
1515 slave = bond_lookup_slave(bond, slave_s);
1516 if (!slave) {
bde9f75d 1517 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1518 goto out;
f620b43a
BP
1519 }
1520
1521 if (!slave->enabled) {
bde9f75d 1522 unixctl_command_reply_error(conn, "cannot make disabled slave active");
3bfd3972 1523 goto out;
f620b43a
BP
1524 }
1525
1526 if (bond->active_slave != slave) {
4a1b8f30 1527 bond->bond_revalidate = true;
f620b43a 1528 bond->active_slave = slave;
f620b43a
BP
1529 VLOG_INFO("bond %s: active interface is now %s",
1530 bond->name, slave->name);
1531 bond->send_learning_packets = true;
bde9f75d 1532 unixctl_command_reply(conn, "done");
3e5aeeb5 1533 bond_active_slave_changed(bond);
f620b43a 1534 } else {
bde9f75d 1535 unixctl_command_reply(conn, "no change");
f620b43a 1536 }
3bfd3972
EJ
1537out:
1538 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1539}
1540
1541static void
0e15264f 1542enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1543{
0e15264f
BP
1544 const char *bond_s = argv[1];
1545 const char *slave_s = argv[2];
f620b43a
BP
1546 struct bond *bond;
1547 struct bond_slave *slave;
1548
3bfd3972 1549 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1550 bond = bond_find(bond_s);
1551 if (!bond) {
bde9f75d 1552 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1553 goto out;
f620b43a
BP
1554 }
1555
1556 slave = bond_lookup_slave(bond, slave_s);
1557 if (!slave) {
bde9f75d 1558 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1559 goto out;
f620b43a
BP
1560 }
1561
4a1b8f30 1562 bond_enable_slave(slave, enable);
bde9f75d 1563 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
3bfd3972
EJ
1564
1565out:
1566 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1567}
1568
1569static void
0e15264f
BP
1570bond_unixctl_enable_slave(struct unixctl_conn *conn,
1571 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1572 void *aux OVS_UNUSED)
1573{
0e15264f 1574 enable_slave(conn, argv, true);
f620b43a
BP
1575}
1576
1577static void
0e15264f
BP
1578bond_unixctl_disable_slave(struct unixctl_conn *conn,
1579 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1580 void *aux OVS_UNUSED)
1581{
0e15264f 1582 enable_slave(conn, argv, false);
f620b43a
BP
1583}
1584
1585static void
0e15264f 1586bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1587 void *aux OVS_UNUSED)
1588{
0e15264f
BP
1589 const char *mac_s = argv[1];
1590 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1591 const char *basis_s = argc > 3 ? argv[3] : NULL;
74ff3298 1592 struct eth_addr mac;
f620b43a
BP
1593 uint8_t hash;
1594 char *hash_cstr;
1595 unsigned int vlan;
672d18b2 1596 uint32_t basis;
f620b43a
BP
1597
1598 if (vlan_s) {
c2c28dfd 1599 if (!ovs_scan(vlan_s, "%u", &vlan)) {
bde9f75d 1600 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1601 return;
1602 }
1603 } else {
dc155bff 1604 vlan = 0;
f620b43a
BP
1605 }
1606
672d18b2 1607 if (basis_s) {
c2c28dfd 1608 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
bde9f75d 1609 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1610 return;
1611 }
1612 } else {
1613 basis = 0;
1614 }
1615
c2c28dfd 1616 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
e9013d6a 1617 hash = hash_mac(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1618
1619 hash_cstr = xasprintf("%u", hash);
bde9f75d 1620 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1621 free(hash_cstr);
1622 } else {
bde9f75d 1623 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1624 }
1625}
1626
1627void
1628bond_init(void)
1629{
0e15264f 1630 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1631 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1632 NULL);
0e15264f 1633 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1634 bond_unixctl_migrate, NULL);
0e15264f 1635 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1636 bond_unixctl_set_active_slave, NULL);
0e15264f 1637 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1638 bond_unixctl_enable_slave, NULL);
0e15264f 1639 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1640 bond_unixctl_disable_slave, NULL);
0e15264f 1641 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1642 bond_unixctl_hash, NULL);
f620b43a
BP
1643}
1644\f
95aafb2a
EJ
1645static void
1646bond_entry_reset(struct bond *bond)
1647{
1648 if (bond->balance != BM_AB) {
9e1a6910 1649 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
95aafb2a
EJ
1650
1651 if (!bond->hash) {
1652 bond->hash = xmalloc(hash_len);
1653 }
1654 memset(bond->hash, 0, hash_len);
1655
1656 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1657 } else {
1658 free(bond->hash);
1659 bond->hash = NULL;
05df1623
AZ
1660 /* Remove existing post recirc rules. */
1661 update_recirc_rules(bond);
95aafb2a
EJ
1662 }
1663}
1664
f620b43a
BP
1665static struct bond_slave *
1666bond_slave_lookup(struct bond *bond, const void *slave_)
1667{
1668 struct bond_slave *slave;
1669
1670 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1671 &bond->slaves) {
1672 if (slave->aux == slave_) {
1673 return slave;
1674 }
1675 }
1676
1677 return NULL;
1678}
1679
f620b43a 1680static void
4a1b8f30 1681bond_enable_slave(struct bond_slave *slave, bool enable)
f620b43a 1682{
5fef88ea 1683 struct bond *bond = slave->bond;
1684
f620b43a
BP
1685 slave->delay_expires = LLONG_MAX;
1686 if (enable != slave->enabled) {
4a1b8f30 1687 slave->bond->bond_revalidate = true;
f620b43a 1688 slave->enabled = enable;
f1c8a79c
AW
1689
1690 ovs_mutex_lock(&slave->bond->mutex);
1691 if (enable) {
417e7e66 1692 ovs_list_insert(&slave->bond->enabled_slaves, &slave->list_node);
f1c8a79c 1693 } else {
5fef88ea 1694 bond->send_learning_packets = true;
417e7e66 1695 ovs_list_remove(&slave->list_node);
f1c8a79c
AW
1696 }
1697 ovs_mutex_unlock(&slave->bond->mutex);
1698
4a1b8f30
EJ
1699 VLOG_INFO("interface %s: %s", slave->name,
1700 slave->enabled ? "enabled" : "disabled");
f620b43a
BP
1701 }
1702}
1703
1704static void
4a1b8f30 1705bond_link_status_update(struct bond_slave *slave)
f620b43a
BP
1706{
1707 struct bond *bond = slave->bond;
1708 bool up;
1709
296f6519 1710 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1711 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1712 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1713 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1714 slave->name, up ? "up" : "down");
1715 if (up == slave->enabled) {
1716 slave->delay_expires = LLONG_MAX;
1717 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1718 slave->name, up ? "disabled" : "enabled");
1719 } else {
bdebeece 1720 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1721 : up ? bond->updelay : bond->downdelay);
1722 slave->delay_expires = time_msec() + delay;
1723 if (delay) {
1724 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1725 "for %d ms",
1726 slave->name,
1727 up ? "enabled" : "disabled",
1728 up ? "up" : "down",
1729 delay);
1730 }
1731 }
1732 }
1733
1734 if (time_msec() >= slave->delay_expires) {
4a1b8f30 1735 bond_enable_slave(slave, up);
f620b43a
BP
1736 }
1737}
1738
fb0b29a3
EJ
1739static unsigned int
1740bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1741{
cb22974d 1742 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1743
bdebeece 1744 return (bond->balance == BM_TCP
42781e77 1745 ? flow_hash_5tuple(flow, bond->basis)
e9013d6a 1746 : hash_mac(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1747}
1748
f620b43a
BP
1749static struct bond_entry *
1750lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1751 uint16_t vlan)
1752{
fb0b29a3 1753 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1754}
1755
f1c8a79c
AW
1756/* Selects and returns an enabled slave from the 'enabled_slaves' list
1757 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1758 * returns NULL. */
1759static struct bond_slave *
1760get_enabled_slave(struct bond *bond)
1761{
ca6ba700 1762 struct ovs_list *node;
f1c8a79c
AW
1763
1764 ovs_mutex_lock(&bond->mutex);
417e7e66 1765 if (ovs_list_is_empty(&bond->enabled_slaves)) {
f1c8a79c
AW
1766 ovs_mutex_unlock(&bond->mutex);
1767 return NULL;
1768 }
1769
417e7e66
BW
1770 node = ovs_list_pop_front(&bond->enabled_slaves);
1771 ovs_list_push_back(&bond->enabled_slaves, node);
f1c8a79c
AW
1772 ovs_mutex_unlock(&bond->mutex);
1773
1774 return CONTAINER_OF(node, struct bond_slave, list_node);
1775}
1776
f620b43a
BP
1777static struct bond_slave *
1778choose_output_slave(const struct bond *bond, const struct flow *flow,
4a1b8f30 1779 struct flow_wildcards *wc, uint16_t vlan)
f620b43a
BP
1780{
1781 struct bond_entry *e;
9dd165e0 1782 int balance;
f620b43a 1783
9dd165e0 1784 balance = bond->balance;
bdebeece
EJ
1785 if (bond->lacp_status == LACP_CONFIGURED) {
1786 /* LACP has been configured on this bond but negotiations were
9dd165e0
RK
1787 * unsuccussful. If lacp_fallback_ab is enabled use active-
1788 * backup mode else drop all traffic. */
1789 if (!bond->lacp_fallback_ab) {
1790 return NULL;
1791 }
1792 balance = BM_AB;
bdebeece
EJ
1793 }
1794
9dd165e0 1795 switch (balance) {
f620b43a
BP
1796 case BM_AB:
1797 return bond->active_slave;
1798
f620b43a 1799 case BM_TCP:
bdebeece
EJ
1800 if (bond->lacp_status != LACP_NEGOTIATED) {
1801 /* Must have LACP negotiations for TCP balanced bonds. */
1802 return NULL;
1803 }
bcd2633a 1804 if (wc) {
deb67947 1805 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L3L4_UDP);
bcd2633a 1806 }
bdebeece
EJ
1807 /* Fall Through. */
1808 case BM_SLB:
deb67947 1809 if (wc && balance == BM_SLB) {
6cdd5145 1810 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
bcd2633a 1811 }
f620b43a
BP
1812 e = lookup_bond_entry(bond, flow, vlan);
1813 if (!e->slave || !e->slave->enabled) {
f1c8a79c 1814 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
f620b43a
BP
1815 }
1816 return e->slave;
1817
1818 default:
428b2edd 1819 OVS_NOT_REACHED();
f620b43a
BP
1820 }
1821}
1822
1823static struct bond_slave *
1824bond_choose_slave(const struct bond *bond)
1825{
1826 struct bond_slave *slave, *best;
1827
3e5aeeb5
AZ
1828 /* Find the last active slave. */
1829 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1830 if (slave && slave->enabled) {
1831 return slave;
1832 }
1833
f620b43a
BP
1834 /* Find an enabled slave. */
1835 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1836 if (slave->enabled) {
1837 return slave;
1838 }
1839 }
1840
1841 /* All interfaces are disabled. Find an interface that will be enabled
1842 * after its updelay expires. */
1843 best = NULL;
1844 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1845 if (slave->delay_expires != LLONG_MAX
296f6519 1846 && slave->may_enable
f620b43a
BP
1847 && (!best || slave->delay_expires < best->delay_expires)) {
1848 best = slave;
1849 }
1850 }
1851 return best;
1852}
1853
1854static void
4a1b8f30 1855bond_choose_active_slave(struct bond *bond)
f620b43a
BP
1856{
1857 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1858 struct bond_slave *old_active_slave = bond->active_slave;
1859
1860 bond->active_slave = bond_choose_slave(bond);
1861 if (bond->active_slave) {
1862 if (bond->active_slave->enabled) {
1863 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1864 bond->name, bond->active_slave->name);
1865 } else {
1866 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1867 "remaining %lld ms updelay (since no interface was "
1868 "enabled)", bond->name, bond->active_slave->name,
1869 bond->active_slave->delay_expires - time_msec());
4a1b8f30 1870 bond_enable_slave(bond->active_slave, true);
f620b43a
BP
1871 }
1872
1873 bond->send_learning_packets = true;
3e5aeeb5
AZ
1874
1875 if (bond->active_slave != old_active_slave) {
1876 bond_active_slave_changed(bond);
1877 }
f620b43a 1878 } else if (old_active_slave) {
f626af7a 1879 bond_active_slave_changed(bond);
d28b9ead 1880 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1881 }
1882}
3e5aeeb5
AZ
1883
1884/*
1885 * Return true if bond has unstored active slave change.
1886 * If return true, 'mac' will store the bond's current active slave's
1887 * MAC address. */
1888bool
74ff3298
JR
1889bond_get_changed_active_slave(const char *name, struct eth_addr *mac,
1890 bool force)
3e5aeeb5
AZ
1891{
1892 struct bond *bond;
1893
1894 ovs_rwlock_wrlock(&rwlock);
1895 bond = bond_find(name);
1896 if (bond) {
1897 if (bond->active_slave_changed || force) {
74ff3298 1898 *mac = bond->active_slave_mac;
3e5aeeb5
AZ
1899 bond->active_slave_changed = false;
1900 ovs_rwlock_unlock(&rwlock);
1901 return true;
1902 }
1903 }
1904 ovs_rwlock_unlock(&rwlock);
1905
1906 return false;
1907}