]> git.proxmox.com Git - mirror_ovs.git/blame - ofproto/bond.c
AUTHORS: Add Mark Kavanagh.
[mirror_ovs.git] / ofproto / bond.c
CommitLineData
f620b43a 1/*
8917f72c 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a 25
adcf00ba
AZ
26#include "ofp-util.h"
27#include "ofp-actions.h"
28#include "ofpbuf.h"
29#include "ofproto/ofproto-provider.h"
30#include "ofproto/ofproto-dpif.h"
da4a6191 31#include "connectivity.h"
f620b43a
BP
32#include "coverage.h"
33#include "dynamic-string.h"
34#include "flow.h"
35#include "hmap.h"
bdebeece 36#include "lacp.h"
f620b43a
BP
37#include "list.h"
38#include "netdev.h"
39#include "odp-util.h"
40#include "ofpbuf.h"
41#include "packets.h"
42#include "poll-loop.h"
da4a6191 43#include "seq.h"
adcf00ba 44#include "match.h"
fc1d4f01 45#include "shash.h"
f620b43a
BP
46#include "timeval.h"
47#include "unixctl.h"
e6211adc 48#include "openvswitch/vlog.h"
f620b43a
BP
49
50VLOG_DEFINE_THIS_MODULE(bond);
51
f1c8a79c
AW
52static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
53static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
55
9e1a6910 56/* Bit-mask for hashing a flow down to a bucket. */
f620b43a 57#define BOND_MASK 0xff
9e1a6910 58#define BOND_BUCKETS (BOND_MASK + 1)
f620b43a
BP
59
60/* A hash bucket for mapping a flow to a slave.
9e1a6910 61 * "struct bond" has an array of BOND_BUCKETS of these. */
f620b43a
BP
62struct bond_entry {
63 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
c6855ec5
JS
64 uint64_t tx_bytes /* Count of bytes recently transmitted. */
65 OVS_GUARDED_BY(rwlock);
ca6ba700 66 struct ovs_list list_node; /* In bond_slave's 'entries' list. */
adcf00ba 67
c6855ec5
JS
68 /* Recirculation.
69 *
70 * 'pr_rule' is the post-recirculation rule for this entry.
71 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
72 * is used to determine delta (applied to 'tx_bytes' above.) */
73 struct rule *pr_rule;
74 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
f620b43a
BP
75};
76
77/* A bond slave, that is, one of the links comprising a bond. */
78struct bond_slave {
79 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
ca6ba700 80 struct ovs_list list_node; /* In struct bond's enabled_slaves list. */
f620b43a
BP
81 struct bond *bond; /* The bond that contains this slave. */
82 void *aux; /* Client-provided handle for this slave. */
83
84 struct netdev *netdev; /* Network device, owned by the client. */
1ea24138 85 unsigned int change_seq; /* Tracks changes in 'netdev'. */
0746a84f 86 ofp_port_t ofp_port; /* OpenFlow port number. */
f620b43a
BP
87 char *name; /* Name (a copy of netdev_get_name(netdev)). */
88
89 /* Link status. */
90 long long delay_expires; /* Time after which 'enabled' may change. */
f620b43a 91 bool enabled; /* May be chosen for flows? */
296f6519 92 bool may_enable; /* Client considers this slave bondable. */
f620b43a
BP
93
94 /* Rebalancing info. Used only by bond_rebalance(). */
ca6ba700
TG
95 struct ovs_list bal_node; /* In bond_rebalance()'s 'bals' list. */
96 struct ovs_list entries; /* 'struct bond_entry's assigned here. */
f620b43a
BP
97 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
98};
99
100/* A bond, that is, a set of network devices grouped to improve performance or
101 * robustness. */
102struct bond {
103 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
104 char *name; /* Name provided by client. */
adcf00ba 105 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
f620b43a
BP
106
107 /* Slaves. */
108 struct hmap slaves;
109
f1c8a79c
AW
110 /* Enabled slaves.
111 *
112 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
113 * (To prevent the bond_slave from disappearing they must also hold
114 * 'rwlock'.) */
115 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
ca6ba700 116 struct ovs_list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
f1c8a79c 117
f620b43a
BP
118 /* Bonding info. */
119 enum bond_mode balance; /* Balancing mode, one of BM_*. */
120 struct bond_slave *active_slave;
f620b43a 121 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
bdebeece 122 enum lacp_status lacp_status; /* Status of LACP negotiations. */
62904702 123 bool bond_revalidate; /* True if flows need revalidation. */
672d18b2 124 uint32_t basis; /* Basis for flow hash function. */
f620b43a
BP
125
126 /* SLB specific bonding info. */
9e1a6910 127 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
f620b43a
BP
128 int rebalance_interval; /* Interval between rebalances, in ms. */
129 long long int next_rebalance; /* Next rebalancing time. */
130 bool send_learning_packets;
adcf00ba
AZ
131 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
132 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
f620b43a 133
3e5aeeb5
AZ
134 /* Store active slave to OVSDB. */
135 bool active_slave_changed; /* Set to true whenever the bond changes
136 active slave. It will be reset to false
137 after it is stored into OVSDB */
138
139 /* Interface name may not be persistent across an OS reboot, use
140 * MAC address for identifing the active slave */
141 uint8_t active_slave_mac[ETH_ADDR_LEN];
142 /* The MAC address of the active interface. */
f620b43a 143 /* Legacy compatibility. */
9dd165e0 144 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
f620b43a 145
37bec3d3 146 struct ovs_refcount ref_cnt;
f620b43a
BP
147};
148
adcf00ba
AZ
/* The action pending for a 'struct bond_pr_rule_op'. */
enum bond_op {
    ADD,       /* Install the rule in ofproto's flow table. */
    DEL,       /* Remove the rule from ofproto's flow table. */
};
154
155/* A rule to add to or delete from ofproto's internal flow table. */
156struct bond_pr_rule_op {
157 struct hmap_node hmap_node;
158 struct match match;
159 ofp_port_t out_ofport;
160 enum bond_op op;
6c932bc8 161 struct rule **pr_rule;
adcf00ba
AZ
162};
163
3bfd3972
EJ
164static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
165static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
166 OVS_REQ_RDLOCK(rwlock);
4a1b8f30
EJ
167static void bond_enable_slave(struct bond_slave *, bool enable)
168 OVS_REQ_WRLOCK(rwlock);
169static void bond_link_status_update(struct bond_slave *)
3bfd3972 170 OVS_REQ_WRLOCK(rwlock);
4a1b8f30 171static void bond_choose_active_slave(struct bond *)
9e1a6910 172 OVS_REQ_WRLOCK(rwlock);
f620b43a 173static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
672d18b2
EJ
174 uint16_t vlan, uint32_t basis);
175static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
176 uint32_t basis);
f620b43a
BP
177static struct bond_entry *lookup_bond_entry(const struct bond *,
178 const struct flow *,
3bfd3972
EJ
179 uint16_t vlan)
180 OVS_REQ_RDLOCK(rwlock);
f1c8a79c
AW
181static struct bond_slave *get_enabled_slave(struct bond *)
182 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
183static struct bond_slave *choose_output_slave(const struct bond *,
184 const struct flow *,
bcd2633a 185 struct flow_wildcards *,
4a1b8f30 186 uint16_t vlan)
3bfd3972 187 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
188
189/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
190 * stores the mode in '*balance' and returns true. Otherwise returns false
191 * without modifying '*balance'. */
192bool
193bond_mode_from_string(enum bond_mode *balance, const char *s)
194{
195 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
196 *balance = BM_TCP;
197 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
198 *balance = BM_SLB;
199 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
200 *balance = BM_AB;
201 } else {
202 return false;
203 }
204 return true;
205}
206
207/* Returns a string representing 'balance'. */
208const char *
209bond_mode_to_string(enum bond_mode balance) {
210 switch (balance) {
211 case BM_TCP:
212 return "balance-tcp";
213 case BM_SLB:
214 return "balance-slb";
215 case BM_AB:
216 return "active-backup";
217 }
428b2edd 218 OVS_NOT_REACHED();
f620b43a
BP
219}
220
f620b43a
BP
221\f
222/* Creates and returns a new bond whose configuration is initially taken from
223 * 's'.
224 *
225 * The caller should register each slave on the new bond by calling
226 * bond_slave_register(). */
227struct bond *
adcf00ba 228bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
f620b43a
BP
229{
230 struct bond *bond;
231
232 bond = xzalloc(sizeof *bond);
adcf00ba 233 bond->ofproto = ofproto;
f620b43a 234 hmap_init(&bond->slaves);
f1c8a79c
AW
235 list_init(&bond->enabled_slaves);
236 ovs_mutex_init(&bond->mutex);
37bec3d3 237 ovs_refcount_init(&bond->ref_cnt);
f620b43a 238
adcf00ba
AZ
239 bond->recirc_id = 0;
240 hmap_init(&bond->pr_rule_ops);
241
f620b43a 242 bond_reconfigure(bond, s);
f620b43a
BP
243 return bond;
244}
245
03366a2d
EJ
246struct bond *
247bond_ref(const struct bond *bond_)
248{
249 struct bond *bond = CONST_CAST(struct bond *, bond_);
250
bca0b3b4 251 if (bond) {
37bec3d3 252 ovs_refcount_ref(&bond->ref_cnt);
bca0b3b4 253 }
03366a2d
EJ
254 return bond;
255}
256
f620b43a
BP
257/* Frees 'bond'. */
258void
03366a2d 259bond_unref(struct bond *bond)
f620b43a
BP
260{
261 struct bond_slave *slave, *next_slave;
adcf00ba 262 struct bond_pr_rule_op *pr_op, *next_op;
f620b43a 263
24f83812 264 if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) {
03366a2d
EJ
265 return;
266 }
267
3bfd3972
EJ
268 ovs_rwlock_wrlock(&rwlock);
269 hmap_remove(all_bonds, &bond->hmap_node);
270 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
271
272 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
273 hmap_remove(&bond->slaves, &slave->hmap_node);
274 /* Client owns 'slave->netdev'. */
275 free(slave->name);
276 free(slave);
277 }
278 hmap_destroy(&bond->slaves);
279
f1c8a79c 280 ovs_mutex_destroy(&bond->mutex);
f620b43a 281 free(bond->hash);
f620b43a 282 free(bond->name);
adcf00ba
AZ
283
284 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
285 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
286 free(pr_op);
287 }
288 hmap_destroy(&bond->pr_rule_ops);
289
290 if (bond->recirc_id) {
291 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
292 }
293
f620b43a
BP
294 free(bond);
295}
296
adcf00ba
AZ
297static void
298add_pr_rule(struct bond *bond, const struct match *match,
6c932bc8 299 ofp_port_t out_ofport, struct rule **rule)
adcf00ba
AZ
300{
301 uint32_t hash = match_hash(match, 0);
302 struct bond_pr_rule_op *pr_op;
303
304 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
305 if (match_equal(&pr_op->match, match)) {
306 pr_op->op = ADD;
307 pr_op->out_ofport = out_ofport;
308 pr_op->pr_rule = rule;
309 return;
310 }
311 }
312
313 pr_op = xmalloc(sizeof *pr_op);
314 pr_op->match = *match;
315 pr_op->op = ADD;
316 pr_op->out_ofport = out_ofport;
317 pr_op->pr_rule = rule;
318 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
319}
320
321static void
322update_recirc_rules(struct bond *bond)
323{
324 struct match match;
325 struct bond_pr_rule_op *pr_op, *next_op;
326 uint64_t ofpacts_stub[128 / 8];
327 struct ofpbuf ofpacts;
328 int i;
329
330 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
331
332 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
333 pr_op->op = DEL;
334 }
335
6c932bc8
AZ
336 if (bond->hash && bond->recirc_id) {
337 for (i = 0; i < BOND_BUCKETS; i++) {
338 struct bond_slave *slave = bond->hash[i].slave;
adcf00ba 339
6c932bc8
AZ
340 if (slave) {
341 match_init_catchall(&match);
342 match_set_recirc_id(&match, bond->recirc_id);
6c932bc8 343 match_set_dp_hash_masked(&match, i, BOND_MASK);
adcf00ba 344
6c932bc8
AZ
345 add_pr_rule(bond, &match, slave->ofp_port,
346 &bond->hash[i].pr_rule);
347 }
adcf00ba
AZ
348 }
349 }
350
351 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
352 int error;
adcf00ba
AZ
353 switch (pr_op->op) {
354 case ADD:
355 ofpbuf_clear(&ofpacts);
356 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
357 error = ofproto_dpif_add_internal_flow(bond->ofproto,
358 &pr_op->match,
290ad78a 359 RECIRC_RULE_PRIORITY, 0,
6c932bc8 360 &ofpacts, pr_op->pr_rule);
adcf00ba
AZ
361 if (error) {
362 char *err_s = match_to_string(&pr_op->match,
363 RECIRC_RULE_PRIORITY);
364
365 VLOG_ERR("failed to add post recirculation flow %s", err_s);
366 free(err_s);
adcf00ba
AZ
367 }
368 break;
369
370 case DEL:
371 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
372 &pr_op->match,
373 RECIRC_RULE_PRIORITY);
374 if (error) {
375 char *err_s = match_to_string(&pr_op->match,
376 RECIRC_RULE_PRIORITY);
377
378 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
379 free(err_s);
380 }
381
382 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
6c932bc8 383 *pr_op->pr_rule = NULL;
adcf00ba
AZ
384 free(pr_op);
385 break;
386 }
387 }
388
389 ofpbuf_uninit(&ofpacts);
390}
391
392
f620b43a
BP
393/* Updates 'bond''s overall configuration to 's'.
394 *
395 * The caller should register each slave on 'bond' by calling
396 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 397 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
398 *
399 * Returns true if the configuration has changed in such a way that requires
400 * flow revalidation.
401 * */
402bool
f620b43a
BP
403bond_reconfigure(struct bond *bond, const struct bond_settings *s)
404{
59d7b2b6
EJ
405 bool revalidate = false;
406
3bfd3972 407 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
408 if (!bond->name || strcmp(bond->name, s->name)) {
409 if (bond->name) {
3bfd3972 410 hmap_remove(all_bonds, &bond->hmap_node);
f620b43a
BP
411 free(bond->name);
412 }
413 bond->name = xstrdup(s->name);
3bfd3972 414 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
f620b43a
BP
415 }
416
f620b43a
BP
417 bond->updelay = s->up_delay;
418 bond->downdelay = s->down_delay;
bc1b010c 419
9dd165e0
RK
420 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
421 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
422 revalidate = true;
423 }
424
bc1b010c
EJ
425 if (bond->rebalance_interval != s->rebalance_interval) {
426 bond->rebalance_interval = s->rebalance_interval;
427 revalidate = true;
428 }
f620b43a 429
59d7b2b6
EJ
430 if (bond->balance != s->balance) {
431 bond->balance = s->balance;
432 revalidate = true;
433 }
434
672d18b2
EJ
435 if (bond->basis != s->basis) {
436 bond->basis = s->basis;
437 revalidate = true;
438 }
439
62904702
EJ
440 if (bond->bond_revalidate) {
441 revalidate = true;
442 bond->bond_revalidate = false;
443 }
444
adcf00ba
AZ
445 if (bond->balance != BM_AB) {
446 if (!bond->recirc_id) {
447 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
448 }
449 } else if (bond->recirc_id) {
450 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
451 bond->recirc_id = 0;
452 }
453
95aafb2a
EJ
454 if (bond->balance == BM_AB || !bond->hash || revalidate) {
455 bond_entry_reset(bond);
456 }
457
3e5aeeb5
AZ
458 memcpy(bond->active_slave_mac, s->active_slave_mac,
459 sizeof s->active_slave_mac);
460
461 bond->active_slave_changed = false;
462
3bfd3972 463 ovs_rwlock_unlock(&rwlock);
59d7b2b6 464 return revalidate;
f620b43a
BP
465}
466
3e5aeeb5 467static struct bond_slave *
3bd0fd39 468bond_find_slave_by_mac(const struct bond *bond, const uint8_t mac[ETH_ADDR_LEN])
3e5aeeb5
AZ
469{
470 struct bond_slave *slave;
471
472 /* Find the last active slave */
473 HMAP_FOR_EACH(slave, hmap_node, &bond->slaves) {
3bd0fd39 474 uint8_t slave_mac[ETH_ADDR_LEN];
3e5aeeb5
AZ
475
476 if (netdev_get_etheraddr(slave->netdev, slave_mac)) {
477 continue;
478 }
479
480 if (!memcmp(slave_mac, mac, sizeof(slave_mac))) {
481 return slave;
482 }
483 }
484
485 return NULL;
486}
487
488static void
489bond_active_slave_changed(struct bond *bond)
490{
3bd0fd39 491 uint8_t mac[ETH_ADDR_LEN];
3e5aeeb5
AZ
492
493 netdev_get_etheraddr(bond->active_slave->netdev, mac);
494 memcpy(bond->active_slave_mac, mac, sizeof bond->active_slave_mac);
495 bond->active_slave_changed = true;
496 seq_change(connectivity_seq_get());
497}
498
f8ddccd2 499static void
1ea24138 500bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
3bfd3972 501 OVS_REQ_WRLOCK(rwlock)
f8ddccd2
BP
502{
503 if (slave->netdev != netdev) {
f8ddccd2 504 slave->netdev = netdev;
1ea24138 505 slave->change_seq = 0;
f8ddccd2
BP
506 }
507}
508
f620b43a
BP
509/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
510 * arbitrary client-provided pointer that uniquely identifies a slave within a
511 * bond. If 'slave_' already exists within 'bond' then this function
512 * reconfigures the existing slave.
513 *
514 * 'netdev' must be the network device that 'slave_' represents. It is owned
515 * by the client, so the client must not close it before either unregistering
516 * 'slave_' or destroying 'bond'.
4d6fb5eb 517 */
f620b43a 518void
adcf00ba
AZ
519bond_slave_register(struct bond *bond, void *slave_,
520 ofp_port_t ofport, struct netdev *netdev)
f620b43a 521{
3bfd3972 522 struct bond_slave *slave;
f620b43a 523
3bfd3972
EJ
524 ovs_rwlock_wrlock(&rwlock);
525 slave = bond_slave_lookup(bond, slave_);
f620b43a
BP
526 if (!slave) {
527 slave = xzalloc(sizeof *slave);
528
529 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
530 slave->bond = bond;
531 slave->aux = slave_;
adcf00ba 532 slave->ofp_port = ofport;
f620b43a 533 slave->delay_expires = LLONG_MAX;
244b2160 534 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 535 bond->bond_revalidate = true;
244b2160 536
b3c18f66 537 slave->enabled = false;
4a1b8f30 538 bond_enable_slave(slave, netdev_get_carrier(netdev));
f620b43a
BP
539 }
540
1ea24138 541 bond_slave_set_netdev__(slave, netdev);
a6934aa9 542
f620b43a
BP
543 free(slave->name);
544 slave->name = xstrdup(netdev_get_name(netdev));
3bfd3972 545 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
546}
547
f8ddccd2
BP
548/* Updates the network device to be used with 'slave_' to 'netdev'.
549 *
550 * This is useful if the caller closes and re-opens the network device
551 * registered with bond_slave_register() but doesn't need to change anything
552 * else. */
553void
554bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
555{
3bfd3972
EJ
556 struct bond_slave *slave;
557
558 ovs_rwlock_wrlock(&rwlock);
559 slave = bond_slave_lookup(bond, slave_);
f8ddccd2 560 if (slave) {
1ea24138 561 bond_slave_set_netdev__(slave, netdev);
f8ddccd2 562 }
3bfd3972 563 ovs_rwlock_unlock(&rwlock);
f8ddccd2
BP
564}
565
f620b43a
BP
566/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
567 * then this function has no effect.
568 *
569 * Unregistering a slave invalidates all flows. */
570void
571bond_slave_unregister(struct bond *bond, const void *slave_)
572{
3bfd3972 573 struct bond_slave *slave;
f620b43a
BP
574 bool del_active;
575
3bfd3972
EJ
576 ovs_rwlock_wrlock(&rwlock);
577 slave = bond_slave_lookup(bond, slave_);
f620b43a 578 if (!slave) {
3bfd3972 579 goto out;
f620b43a
BP
580 }
581
4a1b8f30
EJ
582 bond->bond_revalidate = true;
583 bond_enable_slave(slave, false);
b3c18f66 584
f620b43a
BP
585 del_active = bond->active_slave == slave;
586 if (bond->hash) {
587 struct bond_entry *e;
588 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
589 if (e->slave == slave) {
590 e->slave = NULL;
591 }
592 }
593 }
594
595 free(slave->name);
596
597 hmap_remove(&bond->slaves, &slave->hmap_node);
598 /* Client owns 'slave->netdev'. */
599 free(slave);
600
601 if (del_active) {
4a1b8f30 602 bond_choose_active_slave(bond);
f620b43a
BP
603 bond->send_learning_packets = true;
604 }
3bfd3972
EJ
605out:
606 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
607}
608
296f6519
EJ
609/* Should be called on each slave in 'bond' before bond_run() to indicate
610 * whether or not 'slave_' may be enabled. This function is intended to allow
611 * other protocols to have some impact on bonding decisions. For example LACP
612 * or high level link monitoring protocols may decide that a given slave should
613 * not be able to send traffic. */
4d6fb5eb 614void
296f6519 615bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 616{
3bfd3972 617 ovs_rwlock_wrlock(&rwlock);
296f6519 618 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
3bfd3972 619 ovs_rwlock_unlock(&rwlock);
4d6fb5eb
EJ
620}
621
4a1b8f30
EJ
622/* Performs periodic maintenance on 'bond'.
623 *
624 * Returns true if the caller should revalidate its flows.
f620b43a
BP
625 *
626 * The caller should check bond_should_send_learning_packets() afterward. */
4a1b8f30
EJ
627bool
628bond_run(struct bond *bond, enum lacp_status lacp_status)
f620b43a
BP
629{
630 struct bond_slave *slave;
4a1b8f30 631 bool revalidate;
f620b43a 632
3bfd3972 633 ovs_rwlock_wrlock(&rwlock);
bdebeece
EJ
634 if (bond->lacp_status != lacp_status) {
635 bond->lacp_status = lacp_status;
4592d0e2
EJ
636 bond->bond_revalidate = true;
637 }
4d6fb5eb 638
f620b43a
BP
639 /* Enable slaves based on link status and LACP feedback. */
640 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
4a1b8f30 641 bond_link_status_update(slave);
da4a6191 642 slave->change_seq = seq_read(connectivity_seq_get());
f620b43a
BP
643 }
644 if (!bond->active_slave || !bond->active_slave->enabled) {
4a1b8f30 645 bond_choose_active_slave(bond);
f620b43a
BP
646 }
647
4a1b8f30
EJ
648 revalidate = bond->bond_revalidate;
649 bond->bond_revalidate = false;
3bfd3972 650 ovs_rwlock_unlock(&rwlock);
4a1b8f30
EJ
651
652 return revalidate;
f620b43a
BP
653}
654
655/* Causes poll_block() to wake up when 'bond' needs something to be done. */
656void
657bond_wait(struct bond *bond)
658{
659 struct bond_slave *slave;
660
3bfd3972 661 ovs_rwlock_rdlock(&rwlock);
f620b43a
BP
662 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
663 if (slave->delay_expires != LLONG_MAX) {
664 poll_timer_wait_until(slave->delay_expires);
665 }
1ea24138 666
da4a6191 667 seq_wait(connectivity_seq_get(), slave->change_seq);
f620b43a
BP
668 }
669
bbc13389 670 if (bond->bond_revalidate) {
f620b43a
BP
671 poll_immediate_wake();
672 }
3bfd3972 673 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
674
675 /* We don't wait for bond->next_rebalance because rebalancing can only run
676 * at a flow account checkpoint. ofproto does checkpointing on its own
677 * schedule and bond_rebalance() gets called afterward, so we'd just be
678 * waking up for no purpose. */
679}
680\f
681/* MAC learning table interaction. */
682
683static bool
684may_send_learning_packets(const struct bond *bond)
685{
9dd165e0
RK
686 return ((bond->lacp_status == LACP_DISABLED
687 && (bond->balance == BM_SLB || bond->balance == BM_AB))
688 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
bdebeece 689 && bond->active_slave;
f620b43a
BP
690}
691
692/* Returns true if 'bond' needs the client to send out packets to assist with
693 * MAC learning on 'bond'. If this function returns true, then the client
694 * should iterate through its MAC learning table for the bridge on which 'bond'
695 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 696 * it should call bond_compose_learning_packet().
f620b43a 697 *
477879ea
BP
698 * This function will only return true if 'bond' is in SLB or active-backup
699 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
700 * necessary.
f620b43a
BP
701 *
702 * Calling this function resets the state that it checks. */
703bool
704bond_should_send_learning_packets(struct bond *bond)
705{
3bfd3972
EJ
706 bool send;
707
708 ovs_rwlock_wrlock(&rwlock);
709 send = bond->send_learning_packets && may_send_learning_packets(bond);
f620b43a 710 bond->send_learning_packets = false;
3bfd3972 711 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
712 return send;
713}
714
715/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
716 *
ea131871
JG
717 * See bond_should_send_learning_packets() for description of usage. The
718 * caller should send the composed packet on the port associated with
719 * port_aux and takes ownership of the returned ofpbuf. */
720struct ofpbuf *
721bond_compose_learning_packet(struct bond *bond,
722 const uint8_t eth_src[ETH_ADDR_LEN],
723 uint16_t vlan, void **port_aux)
f620b43a
BP
724{
725 struct bond_slave *slave;
ea131871 726 struct ofpbuf *packet;
f620b43a 727 struct flow flow;
f620b43a 728
3bfd3972 729 ovs_rwlock_rdlock(&rwlock);
cb22974d 730 ovs_assert(may_send_learning_packets(bond));
f620b43a
BP
731 memset(&flow, 0, sizeof flow);
732 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
4a1b8f30 733 slave = choose_output_slave(bond, &flow, NULL, vlan);
f620b43a 734
ea131871 735 packet = ofpbuf_new(0);
2ea838ac 736 compose_rarp(packet, eth_src);
f620b43a 737 if (vlan) {
1bf02876 738 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
f620b43a 739 }
f620b43a 740
ea131871 741 *port_aux = slave->aux;
3bfd3972 742 ovs_rwlock_unlock(&rwlock);
ea131871 743 return packet;
f620b43a
BP
744}
745\f
746/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
747 * Ethernet destination address of 'eth_dst', should be admitted.
748 *
749 * The return value is one of the following:
750 *
751 * - BV_ACCEPT: Admit the packet.
752 *
753 * - BV_DROP: Drop the packet.
754 *
755 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
756 * Ethernet source address and VLAN. If there is none, or if the packet
757 * is on the learned port, then admit the packet. If a different port has
758 * been learned, however, drop the packet (and do not use it for MAC
759 * learning).
760 */
761enum bond_verdict
762bond_check_admissibility(struct bond *bond, const void *slave_,
4a1b8f30 763 const uint8_t eth_dst[ETH_ADDR_LEN])
f620b43a 764{
3bfd3972
EJ
765 enum bond_verdict verdict = BV_DROP;
766 struct bond_slave *slave;
9a1c6450 767
3bfd3972
EJ
768 ovs_rwlock_rdlock(&rwlock);
769 slave = bond_slave_lookup(bond, slave_);
4222bbc8 770 if (!slave) {
3bfd3972 771 goto out;
4222bbc8
EJ
772 }
773
9a1c6450
EJ
774 /* LACP bonds have very loose admissibility restrictions because we can
775 * assume the remote switch is aware of the bond and will "do the right
776 * thing". However, as a precaution we drop packets on disabled slaves
777 * because no correctly implemented partner switch should be sending
bdebeece
EJ
778 * packets to them.
779 *
780 * If LACP is configured, but LACP negotiations have been unsuccessful, we
9dd165e0 781 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
bdebeece 782 switch (bond->lacp_status) {
3bfd3972
EJ
783 case LACP_NEGOTIATED:
784 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
785 goto out;
786 case LACP_CONFIGURED:
9dd165e0
RK
787 if (!bond->lacp_fallback_ab) {
788 goto out;
789 }
3bfd3972
EJ
790 case LACP_DISABLED:
791 break;
f620b43a
BP
792 }
793
794 /* Drop all multicast packets on inactive slaves. */
795 if (eth_addr_is_multicast(eth_dst)) {
4222bbc8 796 if (bond->active_slave != slave) {
3bfd3972 797 goto out;
f620b43a
BP
798 }
799 }
800
f931a4c9 801 switch (bond->balance) {
9dd165e0
RK
802 case BM_TCP:
803 /* TCP balanced bonds require successful LACP negotiations. Based on the
804 * above check, LACP is off or lacp_fallback_ab is true on this bond.
805 * If lacp_fallback_ab is true fall through to BM_AB case else, we
806 * drop all incoming traffic. */
807 if (!bond->lacp_fallback_ab) {
808 goto out;
809 }
810
f931a4c9
BP
811 case BM_AB:
812 /* Drop all packets which arrive on backup slaves. This is similar to
813 * how Linux bonding handles active-backup bonds. */
7ba7dcf0
EJ
814 if (bond->active_slave != slave) {
815 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
816
e6b2255c
BP
817 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
818 " slave (%s) destined for " ETH_ADDR_FMT,
819 slave->name, ETH_ADDR_ARGS(eth_dst));
3bfd3972 820 goto out;
7ba7dcf0 821 }
3bfd3972
EJ
822 verdict = BV_ACCEPT;
823 goto out;
f931a4c9 824
f931a4c9
BP
825 case BM_SLB:
826 /* Drop all packets for which we have learned a different input port,
827 * because we probably sent the packet on one slave and got it back on
828 * the other. Gratuitous ARP packets are an exception to this rule:
829 * the host has moved to another switch. The exception to the
830 * exception is if we locked the learning table to avoid reflections on
831 * bond slaves. */
3bfd3972
EJ
832 verdict = BV_DROP_IF_MOVED;
833 goto out;
7ba7dcf0
EJ
834 }
835
428b2edd 836 OVS_NOT_REACHED();
3bfd3972
EJ
837out:
838 ovs_rwlock_unlock(&rwlock);
839 return verdict;
840
f620b43a
BP
841}
842
843/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
844 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
845 * NULL if the packet should be dropped because no slaves are enabled.
846 *
847 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
848 * should be a VID only (i.e. excluding the PCP bits). Second,
849 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
850 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
851 * packet belongs to (so for an access port it will be the access port's VLAN).
852 *
bcd2633a
JP
853 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
854 * significant in the selection. At some point earlier, 'wc' should
855 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
856 */
857void *
858bond_choose_output_slave(struct bond *bond, const struct flow *flow,
4a1b8f30 859 struct flow_wildcards *wc, uint16_t vlan)
f620b43a 860{
3bfd3972 861 struct bond_slave *slave;
b5d5d7d3 862 void *aux;
3bfd3972
EJ
863
864 ovs_rwlock_rdlock(&rwlock);
4a1b8f30 865 slave = choose_output_slave(bond, flow, wc, vlan);
b5d5d7d3 866 aux = slave ? slave->aux : NULL;
3bfd3972 867 ovs_rwlock_unlock(&rwlock);
b5d5d7d3
AW
868
869 return aux;
f620b43a 870}
f620b43a 871\f
adcf00ba
AZ
872/* Recirculation. */
873static void
874bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
c6855ec5 875 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
876{
877 if (entry->slave) {
878 uint64_t delta;
879
880 delta = rule_tx_bytes - entry->pr_tx_bytes;
881 entry->tx_bytes += delta;
882 entry->pr_tx_bytes = rule_tx_bytes;
883 }
884}
885
886/* Maintain bond stats using post recirculation rule byte counters.*/
60cda7d6 887static void
adcf00ba 888bond_recirculation_account(struct bond *bond)
80316557 889 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
890{
891 int i;
892
adcf00ba
AZ
893 for (i=0; i<=BOND_MASK; i++) {
894 struct bond_entry *entry = &bond->hash[i];
895 struct rule *rule = entry->pr_rule;
896
897 if (rule) {
898 uint64_t n_packets OVS_UNUSED;
899 long long int used OVS_UNUSED;
900 uint64_t n_bytes;
901
902 rule->ofproto->ofproto_class->rule_get_stats(
903 rule, &n_packets, &n_bytes, &used);
904 bond_entry_account(entry, n_bytes);
905 }
906 }
adcf00ba
AZ
907}
908
909bool
910bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
911 uint32_t *hash_bias)
912{
80316557 913 if (bond->balance == BM_TCP && bond->recirc_id) {
adcf00ba
AZ
914 if (recirc_id) {
915 *recirc_id = bond->recirc_id;
916 }
917 if (hash_bias) {
918 *hash_bias = bond->basis;
919 }
920 return true;
921 } else {
922 return false;
923 }
924}
925
926void
927bond_update_post_recirc_rules(struct bond* bond, const bool force)
928{
929 struct bond_entry *e;
930 bool update_rules = force; /* Always update rules if caller forces it. */
931
932 /* Make sure all bond entries are populated */
933 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
934 if (!e->slave || !e->slave->enabled) {
935 update_rules = true;
936 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
937 struct bond_slave, hmap_node);
938 if (!e->slave->enabled) {
939 e->slave = bond->active_slave;
940 }
941 }
942 }
943
944 if (update_rules) {
945 update_recirc_rules(bond);
946 }
947}
948\f
f620b43a
BP
949/* Rebalancing. */
950
1b137691 951static bool
3bfd3972 952bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
1b137691 953{
bc1b010c
EJ
954 return bond->rebalance_interval
955 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
956}
957
f620b43a
BP
958/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
959void
960bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
961 uint64_t n_bytes)
962{
3bfd3972 963 ovs_rwlock_wrlock(&rwlock);
1b137691 964 if (bond_is_balanced(bond)) {
f620b43a 965 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a 966 }
3bfd3972 967 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
968}
969
970static struct bond_slave *
ca6ba700 971bond_slave_from_bal_node(struct ovs_list *bal) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
972{
973 return CONTAINER_OF(bal, struct bond_slave, bal_node);
974}
975
976static void
ca6ba700 977log_bals(struct bond *bond, const struct ovs_list *bals)
c6855ec5 978 OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
979{
980 if (VLOG_IS_DBG_ENABLED()) {
981 struct ds ds = DS_EMPTY_INITIALIZER;
982 const struct bond_slave *slave;
983
984 LIST_FOR_EACH (slave, bal_node, bals) {
985 if (ds.length) {
986 ds_put_char(&ds, ',');
987 }
988 ds_put_format(&ds, " %s %"PRIu64"kB",
989 slave->name, slave->tx_bytes / 1024);
990
991 if (!slave->enabled) {
992 ds_put_cstr(&ds, " (disabled)");
993 }
994 if (!list_is_empty(&slave->entries)) {
995 struct bond_entry *e;
996
997 ds_put_cstr(&ds, " (");
998 LIST_FOR_EACH (e, list_node, &slave->entries) {
999 if (&e->list_node != list_front(&slave->entries)) {
1000 ds_put_cstr(&ds, " + ");
1001 }
34582733 1002 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
f620b43a
BP
1003 e - bond->hash, e->tx_bytes / 1024);
1004 }
1005 ds_put_cstr(&ds, ")");
1006 }
1007 }
1008 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
1009 ds_destroy(&ds);
1010 }
1011}
1012
1013/* Shifts 'hash' from its current slave to 'to'. */
1014static void
4a1b8f30 1015bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
c6855ec5 1016 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
1017{
1018 struct bond_slave *from = hash->slave;
1019 struct bond *bond = from->bond;
1020 uint64_t delta = hash->tx_bytes;
1021
34582733 1022 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
f620b43a
BP
1023 "from %s to %s (now carrying %"PRIu64"kB and "
1024 "%"PRIu64"kB load, respectively)",
1025 bond->name, delta / 1024, hash - bond->hash,
1026 from->name, to->name,
1027 (from->tx_bytes - delta) / 1024,
1028 (to->tx_bytes + delta) / 1024);
1029
1030 /* Shift load away from 'from' to 'to'. */
1031 from->tx_bytes -= delta;
1032 to->tx_bytes += delta;
1033
1034 /* Arrange for flows to be revalidated. */
dc30ea2d 1035 hash->slave = to;
4a1b8f30 1036 bond->bond_revalidate = true;
f620b43a
BP
1037}
1038
09a5d390
BP
1039/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1040 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
1041 * given that doing so must decrease the ratio of the load on the two slaves by
1042 * at least 0.1. Returns NULL if there is no appropriate entry.
1043 *
1044 * The list of entries isn't sorted. I don't know of a reason to prefer to
1045 * shift away small hashes or large hashes. */
1046static struct bond_entry *
1047choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
c6855ec5 1048 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
1049{
1050 struct bond_entry *e;
1051
1052 if (list_is_short(&from->entries)) {
1053 /* 'from' carries no more than one MAC hash, so shifting load away from
1054 * it would be pointless. */
1055 return NULL;
1056 }
1057
1058 LIST_FOR_EACH (e, list_node, &from->entries) {
1059 double old_ratio, new_ratio;
1060 uint64_t delta;
1061
1062 if (to_tx_bytes == 0) {
1063 /* Nothing on the new slave, move it. */
1064 return e;
1065 }
1066
1067 delta = e->tx_bytes;
1068 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1069 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
75fad143
ZK
1070 if (old_ratio - new_ratio > 0.1
1071 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1072 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1073 and 'to' slave have the same load. Therefore, we only move an
1074 entry if it decreases the load on 'from', and brings us closer
1075 to equal traffic load. */
f620b43a
BP
1076 return e;
1077 }
1078 }
1079
1080 return NULL;
1081}
1082
1083/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1084 * maintained. */
1085static void
ca6ba700 1086insert_bal(struct ovs_list *bals, struct bond_slave *slave)
f620b43a
BP
1087{
1088 struct bond_slave *pos;
1089
1090 LIST_FOR_EACH (pos, bal_node, bals) {
1091 if (slave->tx_bytes > pos->tx_bytes) {
1092 break;
1093 }
1094 }
1095 list_insert(&pos->bal_node, &slave->bal_node);
1096}
1097
1098/* Removes 'slave' from its current list and then inserts it into 'bals' so
1099 * that descending order of 'tx_bytes' is maintained. */
1100static void
ca6ba700 1101reinsert_bal(struct ovs_list *bals, struct bond_slave *slave)
f620b43a
BP
1102{
1103 list_remove(&slave->bal_node);
1104 insert_bal(bals, slave);
1105}
1106
1107/* If 'bond' needs rebalancing, does so.
1108 *
adcf00ba
AZ
1109 * The caller should have called bond_account() for each active flow, or in case
1110 * of recirculation is used, have called bond_recirculation_account(bond),
1111 * to ensure that flow data is consistently accounted at this point.
60cda7d6
AZ
1112 */
1113void
4a1b8f30 1114bond_rebalance(struct bond *bond)
f620b43a
BP
1115{
1116 struct bond_slave *slave;
1117 struct bond_entry *e;
ca6ba700 1118 struct ovs_list bals;
adcf00ba 1119 bool rebalanced = false;
60cda7d6 1120 bool use_recirc;
f620b43a 1121
3bfd3972 1122 ovs_rwlock_wrlock(&rwlock);
1b137691 1123 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
adcf00ba 1124 goto done;
f620b43a
BP
1125 }
1126 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1127
60cda7d6
AZ
1128 use_recirc = ofproto_dpif_get_enable_recirc(bond->ofproto) &&
1129 bond_may_recirc(bond, NULL, NULL);
1130
1131 if (use_recirc) {
1132 bond_recirculation_account(bond);
1133 }
1134
f620b43a
BP
1135 /* Add each bond_entry to its slave's 'entries' list.
1136 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1137 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1138 slave->tx_bytes = 0;
1139 list_init(&slave->entries);
1140 }
1141 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1142 if (e->slave && e->tx_bytes) {
1143 e->slave->tx_bytes += e->tx_bytes;
1144 list_push_back(&e->slave->entries, &e->list_node);
1145 }
1146 }
1147
1148 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1149 *
1150 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1151 * with a proper list sort algorithm. */
1152 list_init(&bals);
1153 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1154 if (slave->enabled) {
1155 insert_bal(&bals, slave);
1156 }
1157 }
1158 log_bals(bond, &bals);
1159
1160 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1161 while (!list_is_short(&bals)) {
1162 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1163 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1164 uint64_t overload;
1165
1166 overload = from->tx_bytes - to->tx_bytes;
1167 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1168 /* The extra load on 'from' (and all less-loaded slaves), compared
1169 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1170 * it is less than ~1Mbps. No point in rebalancing. */
1171 break;
1172 }
1173
09a5d390
BP
1174 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1175 * to move from 'from' to 'to'. */
f620b43a
BP
1176 e = choose_entry_to_migrate(from, to->tx_bytes);
1177 if (e) {
4a1b8f30 1178 bond_shift_load(e, to);
f620b43a
BP
1179
1180 /* Delete element from from->entries.
1181 *
1182 * We don't add the element to to->hashes. That would only allow
1183 * 'e' to be migrated to another slave in this rebalancing run, and
1184 * there is no point in doing that. */
1185 list_remove(&e->list_node);
1186
1187 /* Re-sort 'bals'. */
1188 reinsert_bal(&bals, from);
1189 reinsert_bal(&bals, to);
60cda7d6 1190 rebalanced = true;
f620b43a
BP
1191 } else {
1192 /* Can't usefully migrate anything away from 'from'.
1193 * Don't reconsider it. */
1194 list_remove(&from->bal_node);
1195 }
1196 }
1197
1198 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1199 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1200 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1201 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1202 e->tx_bytes /= 2;
f620b43a 1203 }
adcf00ba 1204
60cda7d6
AZ
1205 if (use_recirc && rebalanced) {
1206 bond_update_post_recirc_rules(bond,true);
1207 }
2f486d4c
AZ
1208
1209done:
3bfd3972 1210 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1211}
1212\f
1213/* Bonding unixctl user interface functions. */
1214
1215static struct bond *
3bfd3972 1216bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
1217{
1218 struct bond *bond;
1219
1220 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
3bfd3972 1221 all_bonds) {
f620b43a
BP
1222 if (!strcmp(bond->name, name)) {
1223 return bond;
1224 }
1225 }
1226 return NULL;
1227}
1228
1229static struct bond_slave *
1230bond_lookup_slave(struct bond *bond, const char *slave_name)
1231{
1232 struct bond_slave *slave;
1233
1234 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1235 if (!strcmp(slave->name, slave_name)) {
1236 return slave;
1237 }
1238 }
1239 return NULL;
1240}
1241
1242static void
1243bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
1244 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1245 void *aux OVS_UNUSED)
f620b43a
BP
1246{
1247 struct ds ds = DS_EMPTY_INITIALIZER;
1248 const struct bond *bond;
1249
adcf00ba 1250 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
f620b43a 1251
3bfd3972
EJ
1252 ovs_rwlock_rdlock(&rwlock);
1253 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
f620b43a
BP
1254 const struct bond_slave *slave;
1255 size_t i;
1256
adcf00ba
AZ
1257 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1258 bond_mode_to_string(bond->balance), bond->recirc_id);
f620b43a
BP
1259
1260 i = 0;
1261 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1262 if (i++ > 0) {
1263 ds_put_cstr(&ds, ", ");
1264 }
1265 ds_put_cstr(&ds, slave->name);
1266 }
1267 ds_put_char(&ds, '\n');
1268 }
3bfd3972 1269 ovs_rwlock_unlock(&rwlock);
bde9f75d 1270 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
1271 ds_destroy(&ds);
1272}
1273
1274static void
c33a8a25 1275bond_print_details(struct ds *ds, const struct bond *bond)
3bfd3972 1276 OVS_REQ_RDLOCK(rwlock)
f620b43a 1277{
fc1d4f01
EJ
1278 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1279 const struct shash_node **sorted_slaves = NULL;
f620b43a 1280 const struct bond_slave *slave;
adcf00ba
AZ
1281 bool may_recirc;
1282 uint32_t recirc_id;
fc1d4f01 1283 int i;
f620b43a 1284
c33a8a25
EJ
1285 ds_put_format(ds, "---- %s ----\n", bond->name);
1286 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
1287 bond_mode_to_string(bond->balance));
1288
adcf00ba
AZ
1289 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1290 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1291 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1292
c33a8a25 1293 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 1294
c33a8a25
EJ
1295 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1296 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 1297
1b137691 1298 if (bond_is_balanced(bond)) {
c33a8a25 1299 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
1300 bond->next_rebalance - time_msec());
1301 }
1302
bdebeece
EJ
1303 ds_put_cstr(ds, "lacp_status: ");
1304 switch (bond->lacp_status) {
1305 case LACP_NEGOTIATED:
1306 ds_put_cstr(ds, "negotiated\n");
1307 break;
1308 case LACP_CONFIGURED:
1309 ds_put_cstr(ds, "configured\n");
1310 break;
1311 case LACP_DISABLED:
1312 ds_put_cstr(ds, "off\n");
1313 break;
1314 default:
1315 ds_put_cstr(ds, "<unknown>\n");
1316 break;
1317 }
4d6fb5eb 1318
3e5aeeb5
AZ
1319 ds_put_cstr(ds, "active slave mac: ");
1320 ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_slave_mac));
1321 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1322 ds_put_format(ds,"(%s)\n", slave ? slave->name : "none");
1323
f620b43a 1324 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
1325 shash_add(&slave_shash, slave->name, slave);
1326 }
1327 sorted_slaves = shash_sort(&slave_shash);
1328
1329 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 1330 struct bond_entry *be;
f620b43a 1331
fc1d4f01
EJ
1332 slave = sorted_slaves[i]->data;
1333
f620b43a 1334 /* Basic info. */
c33a8a25 1335 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
1336 slave->name, slave->enabled ? "enabled" : "disabled");
1337 if (slave == bond->active_slave) {
c33a8a25 1338 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
1339 }
1340 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 1341 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
1342 slave->enabled ? "downdelay" : "updelay",
1343 slave->delay_expires - time_msec());
1344 }
1345
c33a8a25 1346 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 1347 slave->may_enable ? "true" : "false");
4d6fb5eb 1348
1b137691 1349 if (!bond_is_balanced(bond)) {
f620b43a
BP
1350 continue;
1351 }
1352
1353 /* Hashes. */
f620b43a
BP
1354 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1355 int hash = be - bond->hash;
f6ba1f35 1356 uint64_t be_tx_k;
f620b43a
BP
1357
1358 if (be->slave != slave) {
1359 continue;
1360 }
1361
f6ba1f35
AZ
1362 be_tx_k = be->tx_bytes / 1024;
1363 if (be_tx_k) {
1364 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1365 hash, be_tx_k);
1366 }
f620b43a 1367
7b9f1974 1368 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1369 }
1370 }
fc1d4f01
EJ
1371 shash_destroy(&slave_shash);
1372 free(sorted_slaves);
c33a8a25
EJ
1373 ds_put_cstr(ds, "\n");
1374}
1375
1376static void
1377bond_unixctl_show(struct unixctl_conn *conn,
1378 int argc, const char *argv[],
1379 void *aux OVS_UNUSED)
1380{
1381 struct ds ds = DS_EMPTY_INITIALIZER;
1382
3bfd3972 1383 ovs_rwlock_rdlock(&rwlock);
c33a8a25
EJ
1384 if (argc > 1) {
1385 const struct bond *bond = bond_find(argv[1]);
1386
1387 if (!bond) {
bde9f75d 1388 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1389 goto out;
c33a8a25
EJ
1390 }
1391 bond_print_details(&ds, bond);
1392 } else {
1393 const struct bond *bond;
1394
3bfd3972 1395 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
c33a8a25
EJ
1396 bond_print_details(&ds, bond);
1397 }
1398 }
1399
bde9f75d 1400 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a 1401 ds_destroy(&ds);
3bfd3972
EJ
1402
1403out:
1404 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1405}
1406
1407static void
0e15264f
BP
1408bond_unixctl_migrate(struct unixctl_conn *conn,
1409 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1410 void *aux OVS_UNUSED)
1411{
0e15264f
BP
1412 const char *bond_s = argv[1];
1413 const char *hash_s = argv[2];
1414 const char *slave_s = argv[3];
f620b43a
BP
1415 struct bond *bond;
1416 struct bond_slave *slave;
1417 struct bond_entry *entry;
1418 int hash;
1419
3bfd3972 1420 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1421 bond = bond_find(bond_s);
1422 if (!bond) {
bde9f75d 1423 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1424 goto out;
f620b43a
BP
1425 }
1426
1427 if (bond->balance != BM_SLB) {
bde9f75d 1428 unixctl_command_reply_error(conn, "not an SLB bond");
3bfd3972 1429 goto out;
f620b43a
BP
1430 }
1431
1432 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1433 hash = atoi(hash_s) & BOND_MASK;
1434 } else {
bde9f75d 1435 unixctl_command_reply_error(conn, "bad hash");
3bfd3972 1436 goto out;
f620b43a
BP
1437 }
1438
1439 slave = bond_lookup_slave(bond, slave_s);
1440 if (!slave) {
bde9f75d 1441 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1442 goto out;
f620b43a
BP
1443 }
1444
1445 if (!slave->enabled) {
bde9f75d 1446 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
3bfd3972 1447 goto out;
f620b43a
BP
1448 }
1449
1450 entry = &bond->hash[hash];
4a1b8f30 1451 bond->bond_revalidate = true;
f620b43a 1452 entry->slave = slave;
bde9f75d 1453 unixctl_command_reply(conn, "migrated");
3bfd3972
EJ
1454
1455out:
1456 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1457}
1458
1459static void
0e15264f
BP
1460bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1461 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1462 void *aux OVS_UNUSED)
1463{
0e15264f
BP
1464 const char *bond_s = argv[1];
1465 const char *slave_s = argv[2];
f620b43a
BP
1466 struct bond *bond;
1467 struct bond_slave *slave;
1468
3bfd3972 1469 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1470 bond = bond_find(bond_s);
1471 if (!bond) {
bde9f75d 1472 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1473 goto out;
f620b43a
BP
1474 }
1475
1476 slave = bond_lookup_slave(bond, slave_s);
1477 if (!slave) {
bde9f75d 1478 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1479 goto out;
f620b43a
BP
1480 }
1481
1482 if (!slave->enabled) {
bde9f75d 1483 unixctl_command_reply_error(conn, "cannot make disabled slave active");
3bfd3972 1484 goto out;
f620b43a
BP
1485 }
1486
1487 if (bond->active_slave != slave) {
4a1b8f30 1488 bond->bond_revalidate = true;
f620b43a 1489 bond->active_slave = slave;
f620b43a
BP
1490 VLOG_INFO("bond %s: active interface is now %s",
1491 bond->name, slave->name);
1492 bond->send_learning_packets = true;
bde9f75d 1493 unixctl_command_reply(conn, "done");
3e5aeeb5 1494 bond_active_slave_changed(bond);
f620b43a 1495 } else {
bde9f75d 1496 unixctl_command_reply(conn, "no change");
f620b43a 1497 }
3bfd3972
EJ
1498out:
1499 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1500}
1501
1502static void
0e15264f 1503enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1504{
0e15264f
BP
1505 const char *bond_s = argv[1];
1506 const char *slave_s = argv[2];
f620b43a
BP
1507 struct bond *bond;
1508 struct bond_slave *slave;
1509
3bfd3972 1510 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1511 bond = bond_find(bond_s);
1512 if (!bond) {
bde9f75d 1513 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1514 goto out;
f620b43a
BP
1515 }
1516
1517 slave = bond_lookup_slave(bond, slave_s);
1518 if (!slave) {
bde9f75d 1519 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1520 goto out;
f620b43a
BP
1521 }
1522
4a1b8f30 1523 bond_enable_slave(slave, enable);
bde9f75d 1524 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
3bfd3972
EJ
1525
1526out:
1527 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1528}
1529
1530static void
0e15264f
BP
1531bond_unixctl_enable_slave(struct unixctl_conn *conn,
1532 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1533 void *aux OVS_UNUSED)
1534{
0e15264f 1535 enable_slave(conn, argv, true);
f620b43a
BP
1536}
1537
1538static void
0e15264f
BP
1539bond_unixctl_disable_slave(struct unixctl_conn *conn,
1540 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1541 void *aux OVS_UNUSED)
1542{
0e15264f 1543 enable_slave(conn, argv, false);
f620b43a
BP
1544}
1545
1546static void
0e15264f 1547bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1548 void *aux OVS_UNUSED)
1549{
0e15264f
BP
1550 const char *mac_s = argv[1];
1551 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1552 const char *basis_s = argc > 3 ? argv[3] : NULL;
f620b43a
BP
1553 uint8_t mac[ETH_ADDR_LEN];
1554 uint8_t hash;
1555 char *hash_cstr;
1556 unsigned int vlan;
672d18b2 1557 uint32_t basis;
f620b43a
BP
1558
1559 if (vlan_s) {
c2c28dfd 1560 if (!ovs_scan(vlan_s, "%u", &vlan)) {
bde9f75d 1561 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1562 return;
1563 }
1564 } else {
dc155bff 1565 vlan = 0;
f620b43a
BP
1566 }
1567
672d18b2 1568 if (basis_s) {
c2c28dfd 1569 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
bde9f75d 1570 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1571 return;
1572 }
1573 } else {
1574 basis = 0;
1575 }
1576
c2c28dfd 1577 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
672d18b2 1578 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1579
1580 hash_cstr = xasprintf("%u", hash);
bde9f75d 1581 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1582 free(hash_cstr);
1583 } else {
bde9f75d 1584 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1585 }
1586}
1587
1588void
1589bond_init(void)
1590{
0e15264f 1591 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1592 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1593 NULL);
0e15264f 1594 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1595 bond_unixctl_migrate, NULL);
0e15264f 1596 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1597 bond_unixctl_set_active_slave, NULL);
0e15264f 1598 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1599 bond_unixctl_enable_slave, NULL);
0e15264f 1600 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1601 bond_unixctl_disable_slave, NULL);
0e15264f 1602 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1603 bond_unixctl_hash, NULL);
f620b43a
BP
1604}
1605\f
95aafb2a
EJ
1606static void
1607bond_entry_reset(struct bond *bond)
1608{
1609 if (bond->balance != BM_AB) {
9e1a6910 1610 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
95aafb2a
EJ
1611
1612 if (!bond->hash) {
1613 bond->hash = xmalloc(hash_len);
1614 }
1615 memset(bond->hash, 0, hash_len);
1616
1617 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1618 } else {
1619 free(bond->hash);
1620 bond->hash = NULL;
1621 }
1622}
1623
f620b43a
BP
1624static struct bond_slave *
1625bond_slave_lookup(struct bond *bond, const void *slave_)
1626{
1627 struct bond_slave *slave;
1628
1629 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1630 &bond->slaves) {
1631 if (slave->aux == slave_) {
1632 return slave;
1633 }
1634 }
1635
1636 return NULL;
1637}
1638
f620b43a 1639static void
4a1b8f30 1640bond_enable_slave(struct bond_slave *slave, bool enable)
f620b43a
BP
1641{
1642 slave->delay_expires = LLONG_MAX;
1643 if (enable != slave->enabled) {
4a1b8f30 1644 slave->bond->bond_revalidate = true;
f620b43a 1645 slave->enabled = enable;
f1c8a79c
AW
1646
1647 ovs_mutex_lock(&slave->bond->mutex);
1648 if (enable) {
1649 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1650 } else {
1651 list_remove(&slave->list_node);
1652 }
1653 ovs_mutex_unlock(&slave->bond->mutex);
1654
4a1b8f30
EJ
1655 VLOG_INFO("interface %s: %s", slave->name,
1656 slave->enabled ? "enabled" : "disabled");
f620b43a
BP
1657 }
1658}
1659
1660static void
4a1b8f30 1661bond_link_status_update(struct bond_slave *slave)
f620b43a
BP
1662{
1663 struct bond *bond = slave->bond;
1664 bool up;
1665
296f6519 1666 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1667 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1668 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1669 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1670 slave->name, up ? "up" : "down");
1671 if (up == slave->enabled) {
1672 slave->delay_expires = LLONG_MAX;
1673 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1674 slave->name, up ? "disabled" : "enabled");
1675 } else {
bdebeece 1676 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1677 : up ? bond->updelay : bond->downdelay);
1678 slave->delay_expires = time_msec() + delay;
1679 if (delay) {
1680 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1681 "for %d ms",
1682 slave->name,
1683 up ? "enabled" : "disabled",
1684 up ? "up" : "down",
1685 delay);
1686 }
1687 }
1688 }
1689
1690 if (time_msec() >= slave->delay_expires) {
4a1b8f30 1691 bond_enable_slave(slave, up);
f620b43a
BP
1692 }
1693}
1694
f620b43a 1695static unsigned int
672d18b2 1696bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
f620b43a 1697{
7e36ac42 1698 return hash_mac(mac, vlan, basis);
f620b43a
BP
1699}
1700
1701static unsigned int
672d18b2 1702bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
f620b43a
BP
1703{
1704 struct flow hash_flow = *flow;
d84d4b88 1705 hash_flow.vlan_tci = htons(vlan);
f620b43a
BP
1706
1707 /* The symmetric quality of this hash function is not required, but
1708 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1709 * purposes, so we use it out of convenience. */
672d18b2 1710 return flow_hash_symmetric_l4(&hash_flow, basis);
f620b43a
BP
1711}
1712
fb0b29a3
EJ
1713static unsigned int
1714bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1715{
cb22974d 1716 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1717
bdebeece 1718 return (bond->balance == BM_TCP
672d18b2
EJ
1719 ? bond_hash_tcp(flow, vlan, bond->basis)
1720 : bond_hash_src(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1721}
1722
f620b43a
BP
1723static struct bond_entry *
1724lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1725 uint16_t vlan)
1726{
fb0b29a3 1727 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1728}
1729
f1c8a79c
AW
1730/* Selects and returns an enabled slave from the 'enabled_slaves' list
1731 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1732 * returns NULL. */
1733static struct bond_slave *
1734get_enabled_slave(struct bond *bond)
1735{
ca6ba700 1736 struct ovs_list *node;
f1c8a79c
AW
1737
1738 ovs_mutex_lock(&bond->mutex);
1739 if (list_is_empty(&bond->enabled_slaves)) {
1740 ovs_mutex_unlock(&bond->mutex);
1741 return NULL;
1742 }
1743
1744 node = list_pop_front(&bond->enabled_slaves);
1745 list_push_back(&bond->enabled_slaves, node);
1746 ovs_mutex_unlock(&bond->mutex);
1747
1748 return CONTAINER_OF(node, struct bond_slave, list_node);
1749}
1750
f620b43a
BP
1751static struct bond_slave *
1752choose_output_slave(const struct bond *bond, const struct flow *flow,
4a1b8f30 1753 struct flow_wildcards *wc, uint16_t vlan)
f620b43a
BP
1754{
1755 struct bond_entry *e;
9dd165e0 1756 int balance;
f620b43a 1757
9dd165e0 1758 balance = bond->balance;
bdebeece
EJ
1759 if (bond->lacp_status == LACP_CONFIGURED) {
1760 /* LACP has been configured on this bond but negotiations were
9dd165e0
RK
1761 * unsuccussful. If lacp_fallback_ab is enabled use active-
1762 * backup mode else drop all traffic. */
1763 if (!bond->lacp_fallback_ab) {
1764 return NULL;
1765 }
1766 balance = BM_AB;
bdebeece
EJ
1767 }
1768
9dd165e0 1769 switch (balance) {
f620b43a
BP
1770 case BM_AB:
1771 return bond->active_slave;
1772
f620b43a 1773 case BM_TCP:
bdebeece
EJ
1774 if (bond->lacp_status != LACP_NEGOTIATED) {
1775 /* Must have LACP negotiations for TCP balanced bonds. */
1776 return NULL;
1777 }
bcd2633a 1778 if (wc) {
6cdd5145 1779 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
bcd2633a 1780 }
bdebeece
EJ
1781 /* Fall Through. */
1782 case BM_SLB:
bcd2633a 1783 if (wc) {
6cdd5145 1784 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
bcd2633a 1785 }
f620b43a
BP
1786 e = lookup_bond_entry(bond, flow, vlan);
1787 if (!e->slave || !e->slave->enabled) {
f1c8a79c 1788 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
f620b43a
BP
1789 }
1790 return e->slave;
1791
1792 default:
428b2edd 1793 OVS_NOT_REACHED();
f620b43a
BP
1794 }
1795}
1796
1797static struct bond_slave *
1798bond_choose_slave(const struct bond *bond)
1799{
1800 struct bond_slave *slave, *best;
1801
3e5aeeb5
AZ
1802 /* Find the last active slave. */
1803 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1804 if (slave && slave->enabled) {
1805 return slave;
1806 }
1807
f620b43a
BP
1808 /* Find an enabled slave. */
1809 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1810 if (slave->enabled) {
1811 return slave;
1812 }
1813 }
1814
1815 /* All interfaces are disabled. Find an interface that will be enabled
1816 * after its updelay expires. */
1817 best = NULL;
1818 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1819 if (slave->delay_expires != LLONG_MAX
296f6519 1820 && slave->may_enable
f620b43a
BP
1821 && (!best || slave->delay_expires < best->delay_expires)) {
1822 best = slave;
1823 }
1824 }
1825 return best;
1826}
1827
1828static void
4a1b8f30 1829bond_choose_active_slave(struct bond *bond)
f620b43a
BP
1830{
1831 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1832 struct bond_slave *old_active_slave = bond->active_slave;
1833
1834 bond->active_slave = bond_choose_slave(bond);
1835 if (bond->active_slave) {
1836 if (bond->active_slave->enabled) {
1837 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1838 bond->name, bond->active_slave->name);
1839 } else {
1840 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1841 "remaining %lld ms updelay (since no interface was "
1842 "enabled)", bond->name, bond->active_slave->name,
1843 bond->active_slave->delay_expires - time_msec());
4a1b8f30 1844 bond_enable_slave(bond->active_slave, true);
f620b43a
BP
1845 }
1846
1847 bond->send_learning_packets = true;
3e5aeeb5
AZ
1848
1849 if (bond->active_slave != old_active_slave) {
1850 bond_active_slave_changed(bond);
1851 }
f620b43a 1852 } else if (old_active_slave) {
d28b9ead 1853 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1854 }
1855}
3e5aeeb5
AZ
1856
1857/*
1858 * Return true if bond has unstored active slave change.
1859 * If return true, 'mac' will store the bond's current active slave's
1860 * MAC address. */
1861bool
1862bond_get_changed_active_slave(const char *name, uint8_t* mac, bool force)
1863{
1864 struct bond *bond;
1865
1866 ovs_rwlock_wrlock(&rwlock);
1867 bond = bond_find(name);
1868 if (bond) {
1869 if (bond->active_slave_changed || force) {
1870 memcpy(mac, bond->active_slave_mac, ETH_ADDR_LEN);
1871 bond->active_slave_changed = false;
1872 ovs_rwlock_unlock(&rwlock);
1873 return true;
1874 }
1875 }
1876 ovs_rwlock_unlock(&rwlock);
1877
1878 return false;
1879}