]> git.proxmox.com Git - mirror_ovs.git/blame - ofproto/bond.c
nicira-ext: Fix typo in comment.
[mirror_ovs.git] / ofproto / bond.c
CommitLineData
f620b43a 1/*
8917f72c 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a 25
adcf00ba
AZ
26#include "ofp-util.h"
27#include "ofp-actions.h"
28#include "ofpbuf.h"
29#include "ofproto/ofproto-provider.h"
30#include "ofproto/ofproto-dpif.h"
e672ff9b 31#include "ofproto/ofproto-dpif-rid.h"
da4a6191 32#include "connectivity.h"
f620b43a
BP
33#include "coverage.h"
34#include "dynamic-string.h"
35#include "flow.h"
36#include "hmap.h"
bdebeece 37#include "lacp.h"
f620b43a
BP
38#include "list.h"
39#include "netdev.h"
40#include "odp-util.h"
41#include "ofpbuf.h"
42#include "packets.h"
cf62fa4c 43#include "dp-packet.h"
f620b43a 44#include "poll-loop.h"
da4a6191 45#include "seq.h"
adcf00ba 46#include "match.h"
fc1d4f01 47#include "shash.h"
f620b43a
BP
48#include "timeval.h"
49#include "unixctl.h"
e6211adc 50#include "openvswitch/vlog.h"
f620b43a
BP
51
52VLOG_DEFINE_THIS_MODULE(bond);
53
f1c8a79c
AW
54static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
55static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
56static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
57
9e1a6910 58/* Bit-mask for hashing a flow down to a bucket. */
f620b43a 59#define BOND_MASK 0xff
9e1a6910 60#define BOND_BUCKETS (BOND_MASK + 1)
f620b43a
BP
61
62/* A hash bucket for mapping a flow to a slave.
9e1a6910 63 * "struct bond" has an array of BOND_BUCKETS of these. */
f620b43a
BP
64struct bond_entry {
65 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
c6855ec5
JS
66 uint64_t tx_bytes /* Count of bytes recently transmitted. */
67 OVS_GUARDED_BY(rwlock);
ca6ba700 68 struct ovs_list list_node; /* In bond_slave's 'entries' list. */
adcf00ba 69
c6855ec5
JS
70 /* Recirculation.
71 *
72 * 'pr_rule' is the post-recirculation rule for this entry.
73 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
74 * is used to determine delta (applied to 'tx_bytes' above.) */
75 struct rule *pr_rule;
76 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
f620b43a
BP
77};
78
79/* A bond slave, that is, one of the links comprising a bond. */
80struct bond_slave {
81 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
ca6ba700 82 struct ovs_list list_node; /* In struct bond's enabled_slaves list. */
f620b43a
BP
83 struct bond *bond; /* The bond that contains this slave. */
84 void *aux; /* Client-provided handle for this slave. */
85
86 struct netdev *netdev; /* Network device, owned by the client. */
1ea24138 87 unsigned int change_seq; /* Tracks changes in 'netdev'. */
0746a84f 88 ofp_port_t ofp_port; /* OpenFlow port number. */
f620b43a
BP
89 char *name; /* Name (a copy of netdev_get_name(netdev)). */
90
91 /* Link status. */
92 long long delay_expires; /* Time after which 'enabled' may change. */
f620b43a 93 bool enabled; /* May be chosen for flows? */
296f6519 94 bool may_enable; /* Client considers this slave bondable. */
f620b43a
BP
95
96 /* Rebalancing info. Used only by bond_rebalance(). */
ca6ba700
TG
97 struct ovs_list bal_node; /* In bond_rebalance()'s 'bals' list. */
98 struct ovs_list entries; /* 'struct bond_entry's assigned here. */
f620b43a
BP
99 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
100};
101
102/* A bond, that is, a set of network devices grouped to improve performance or
103 * robustness. */
104struct bond {
105 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
106 char *name; /* Name provided by client. */
adcf00ba 107 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
f620b43a
BP
108
109 /* Slaves. */
110 struct hmap slaves;
111
f1c8a79c
AW
112 /* Enabled slaves.
113 *
114 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
115 * (To prevent the bond_slave from disappearing they must also hold
116 * 'rwlock'.) */
117 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
ca6ba700 118 struct ovs_list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
f1c8a79c 119
f620b43a
BP
120 /* Bonding info. */
121 enum bond_mode balance; /* Balancing mode, one of BM_*. */
122 struct bond_slave *active_slave;
f620b43a 123 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
bdebeece 124 enum lacp_status lacp_status; /* Status of LACP negotiations. */
62904702 125 bool bond_revalidate; /* True if flows need revalidation. */
672d18b2 126 uint32_t basis; /* Basis for flow hash function. */
f620b43a
BP
127
128 /* SLB specific bonding info. */
9e1a6910 129 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
f620b43a
BP
130 int rebalance_interval; /* Interval between rebalances, in ms. */
131 long long int next_rebalance; /* Next rebalancing time. */
132 bool send_learning_packets;
adcf00ba
AZ
133 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
134 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
f620b43a 135
3e5aeeb5
AZ
136 /* Store active slave to OVSDB. */
137 bool active_slave_changed; /* Set to true whenever the bond changes
138 active slave. It will be reset to false
139 after it is stored into OVSDB */
140
141 /* Interface name may not be persistent across an OS reboot, use
142 * MAC address for identifing the active slave */
143 uint8_t active_slave_mac[ETH_ADDR_LEN];
144 /* The MAC address of the active interface. */
f620b43a 145 /* Legacy compatibility. */
9dd165e0 146 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
f620b43a 147
37bec3d3 148 struct ovs_refcount ref_cnt;
f620b43a
BP
149};
150
adcf00ba
AZ
/* The pending action recorded in a 'struct bond_pr_rule_op'. */
enum bond_op {
    ADD,    /* Install the rule in ofproto's flow table. */
    DEL,    /* Remove the rule from ofproto's flow table. */
};
156
157/* A rule to add to or delete from ofproto's internal flow table. */
158struct bond_pr_rule_op {
159 struct hmap_node hmap_node;
160 struct match match;
161 ofp_port_t out_ofport;
162 enum bond_op op;
6c932bc8 163 struct rule **pr_rule;
adcf00ba
AZ
164};
165
3bfd3972
EJ
166static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
167static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
168 OVS_REQ_RDLOCK(rwlock);
4a1b8f30
EJ
169static void bond_enable_slave(struct bond_slave *, bool enable)
170 OVS_REQ_WRLOCK(rwlock);
171static void bond_link_status_update(struct bond_slave *)
3bfd3972 172 OVS_REQ_WRLOCK(rwlock);
4a1b8f30 173static void bond_choose_active_slave(struct bond *)
9e1a6910 174 OVS_REQ_WRLOCK(rwlock);
f620b43a 175static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
672d18b2
EJ
176 uint16_t vlan, uint32_t basis);
177static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
178 uint32_t basis);
f620b43a
BP
179static struct bond_entry *lookup_bond_entry(const struct bond *,
180 const struct flow *,
3bfd3972
EJ
181 uint16_t vlan)
182 OVS_REQ_RDLOCK(rwlock);
f1c8a79c
AW
183static struct bond_slave *get_enabled_slave(struct bond *)
184 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
185static struct bond_slave *choose_output_slave(const struct bond *,
186 const struct flow *,
bcd2633a 187 struct flow_wildcards *,
4a1b8f30 188 uint16_t vlan)
3bfd3972 189 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
190
191/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
192 * stores the mode in '*balance' and returns true. Otherwise returns false
193 * without modifying '*balance'. */
194bool
195bond_mode_from_string(enum bond_mode *balance, const char *s)
196{
197 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
198 *balance = BM_TCP;
199 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
200 *balance = BM_SLB;
201 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
202 *balance = BM_AB;
203 } else {
204 return false;
205 }
206 return true;
207}
208
209/* Returns a string representing 'balance'. */
210const char *
211bond_mode_to_string(enum bond_mode balance) {
212 switch (balance) {
213 case BM_TCP:
214 return "balance-tcp";
215 case BM_SLB:
216 return "balance-slb";
217 case BM_AB:
218 return "active-backup";
219 }
428b2edd 220 OVS_NOT_REACHED();
f620b43a
BP
221}
222
f620b43a
BP
223\f
224/* Creates and returns a new bond whose configuration is initially taken from
225 * 's'.
226 *
227 * The caller should register each slave on the new bond by calling
228 * bond_slave_register(). */
229struct bond *
adcf00ba 230bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
f620b43a
BP
231{
232 struct bond *bond;
233
234 bond = xzalloc(sizeof *bond);
adcf00ba 235 bond->ofproto = ofproto;
f620b43a 236 hmap_init(&bond->slaves);
f1c8a79c
AW
237 list_init(&bond->enabled_slaves);
238 ovs_mutex_init(&bond->mutex);
37bec3d3 239 ovs_refcount_init(&bond->ref_cnt);
f620b43a 240
adcf00ba
AZ
241 bond->recirc_id = 0;
242 hmap_init(&bond->pr_rule_ops);
243
f620b43a 244 bond_reconfigure(bond, s);
f620b43a
BP
245 return bond;
246}
247
03366a2d
EJ
248struct bond *
249bond_ref(const struct bond *bond_)
250{
251 struct bond *bond = CONST_CAST(struct bond *, bond_);
252
bca0b3b4 253 if (bond) {
37bec3d3 254 ovs_refcount_ref(&bond->ref_cnt);
bca0b3b4 255 }
03366a2d
EJ
256 return bond;
257}
258
f620b43a
BP
259/* Frees 'bond'. */
260void
03366a2d 261bond_unref(struct bond *bond)
f620b43a
BP
262{
263 struct bond_slave *slave, *next_slave;
adcf00ba 264 struct bond_pr_rule_op *pr_op, *next_op;
f620b43a 265
24f83812 266 if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) {
03366a2d
EJ
267 return;
268 }
269
3bfd3972
EJ
270 ovs_rwlock_wrlock(&rwlock);
271 hmap_remove(all_bonds, &bond->hmap_node);
272 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
273
274 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
275 hmap_remove(&bond->slaves, &slave->hmap_node);
276 /* Client owns 'slave->netdev'. */
277 free(slave->name);
278 free(slave);
279 }
280 hmap_destroy(&bond->slaves);
281
f1c8a79c 282 ovs_mutex_destroy(&bond->mutex);
f620b43a 283 free(bond->hash);
f620b43a 284 free(bond->name);
adcf00ba
AZ
285
286 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
287 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
288 free(pr_op);
289 }
290 hmap_destroy(&bond->pr_rule_ops);
291
292 if (bond->recirc_id) {
e672ff9b 293 recirc_free_id(bond->recirc_id);
adcf00ba
AZ
294 }
295
f620b43a
BP
296 free(bond);
297}
298
adcf00ba
AZ
299static void
300add_pr_rule(struct bond *bond, const struct match *match,
6c932bc8 301 ofp_port_t out_ofport, struct rule **rule)
adcf00ba
AZ
302{
303 uint32_t hash = match_hash(match, 0);
304 struct bond_pr_rule_op *pr_op;
305
306 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
307 if (match_equal(&pr_op->match, match)) {
308 pr_op->op = ADD;
309 pr_op->out_ofport = out_ofport;
310 pr_op->pr_rule = rule;
311 return;
312 }
313 }
314
315 pr_op = xmalloc(sizeof *pr_op);
316 pr_op->match = *match;
317 pr_op->op = ADD;
318 pr_op->out_ofport = out_ofport;
319 pr_op->pr_rule = rule;
320 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
321}
322
323static void
324update_recirc_rules(struct bond *bond)
ca8127fd 325 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
326{
327 struct match match;
328 struct bond_pr_rule_op *pr_op, *next_op;
329 uint64_t ofpacts_stub[128 / 8];
330 struct ofpbuf ofpacts;
331 int i;
332
333 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
334
335 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
336 pr_op->op = DEL;
337 }
338
6c932bc8
AZ
339 if (bond->hash && bond->recirc_id) {
340 for (i = 0; i < BOND_BUCKETS; i++) {
341 struct bond_slave *slave = bond->hash[i].slave;
adcf00ba 342
6c932bc8
AZ
343 if (slave) {
344 match_init_catchall(&match);
345 match_set_recirc_id(&match, bond->recirc_id);
6c932bc8 346 match_set_dp_hash_masked(&match, i, BOND_MASK);
adcf00ba 347
6c932bc8
AZ
348 add_pr_rule(bond, &match, slave->ofp_port,
349 &bond->hash[i].pr_rule);
350 }
adcf00ba
AZ
351 }
352 }
353
354 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
355 int error;
adcf00ba
AZ
356 switch (pr_op->op) {
357 case ADD:
358 ofpbuf_clear(&ofpacts);
359 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
360 error = ofproto_dpif_add_internal_flow(bond->ofproto,
361 &pr_op->match,
290ad78a 362 RECIRC_RULE_PRIORITY, 0,
6c932bc8 363 &ofpacts, pr_op->pr_rule);
adcf00ba
AZ
364 if (error) {
365 char *err_s = match_to_string(&pr_op->match,
366 RECIRC_RULE_PRIORITY);
367
368 VLOG_ERR("failed to add post recirculation flow %s", err_s);
369 free(err_s);
adcf00ba
AZ
370 }
371 break;
372
373 case DEL:
374 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
375 &pr_op->match,
376 RECIRC_RULE_PRIORITY);
377 if (error) {
378 char *err_s = match_to_string(&pr_op->match,
379 RECIRC_RULE_PRIORITY);
380
381 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
382 free(err_s);
383 }
384
385 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
6c932bc8 386 *pr_op->pr_rule = NULL;
adcf00ba
AZ
387 free(pr_op);
388 break;
389 }
390 }
391
392 ofpbuf_uninit(&ofpacts);
393}
394
395
f620b43a
BP
396/* Updates 'bond''s overall configuration to 's'.
397 *
398 * The caller should register each slave on 'bond' by calling
399 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 400 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
401 *
402 * Returns true if the configuration has changed in such a way that requires
403 * flow revalidation.
404 * */
405bool
f620b43a
BP
406bond_reconfigure(struct bond *bond, const struct bond_settings *s)
407{
59d7b2b6
EJ
408 bool revalidate = false;
409
3bfd3972 410 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
411 if (!bond->name || strcmp(bond->name, s->name)) {
412 if (bond->name) {
3bfd3972 413 hmap_remove(all_bonds, &bond->hmap_node);
f620b43a
BP
414 free(bond->name);
415 }
416 bond->name = xstrdup(s->name);
3bfd3972 417 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
f620b43a
BP
418 }
419
f620b43a
BP
420 bond->updelay = s->up_delay;
421 bond->downdelay = s->down_delay;
bc1b010c 422
9dd165e0
RK
423 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
424 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
425 revalidate = true;
426 }
427
bc1b010c
EJ
428 if (bond->rebalance_interval != s->rebalance_interval) {
429 bond->rebalance_interval = s->rebalance_interval;
430 revalidate = true;
431 }
f620b43a 432
59d7b2b6
EJ
433 if (bond->balance != s->balance) {
434 bond->balance = s->balance;
435 revalidate = true;
436 }
437
672d18b2
EJ
438 if (bond->basis != s->basis) {
439 bond->basis = s->basis;
440 revalidate = true;
441 }
442
62904702
EJ
443 if (bond->bond_revalidate) {
444 revalidate = true;
445 bond->bond_revalidate = false;
446 }
447
adcf00ba
AZ
448 if (bond->balance != BM_AB) {
449 if (!bond->recirc_id) {
e672ff9b 450 bond->recirc_id = recirc_alloc_id(bond->ofproto);
adcf00ba
AZ
451 }
452 } else if (bond->recirc_id) {
e672ff9b 453 recirc_free_id(bond->recirc_id);
adcf00ba
AZ
454 bond->recirc_id = 0;
455 }
456
95aafb2a
EJ
457 if (bond->balance == BM_AB || !bond->hash || revalidate) {
458 bond_entry_reset(bond);
459 }
460
3e5aeeb5
AZ
461 memcpy(bond->active_slave_mac, s->active_slave_mac,
462 sizeof s->active_slave_mac);
463
464 bond->active_slave_changed = false;
465
3bfd3972 466 ovs_rwlock_unlock(&rwlock);
59d7b2b6 467 return revalidate;
f620b43a
BP
468}
469
3e5aeeb5 470static struct bond_slave *
3bd0fd39 471bond_find_slave_by_mac(const struct bond *bond, const uint8_t mac[ETH_ADDR_LEN])
3e5aeeb5
AZ
472{
473 struct bond_slave *slave;
474
475 /* Find the last active slave */
476 HMAP_FOR_EACH(slave, hmap_node, &bond->slaves) {
3bd0fd39 477 uint8_t slave_mac[ETH_ADDR_LEN];
3e5aeeb5
AZ
478
479 if (netdev_get_etheraddr(slave->netdev, slave_mac)) {
480 continue;
481 }
482
483 if (!memcmp(slave_mac, mac, sizeof(slave_mac))) {
484 return slave;
485 }
486 }
487
488 return NULL;
489}
490
491static void
492bond_active_slave_changed(struct bond *bond)
493{
3bd0fd39 494 uint8_t mac[ETH_ADDR_LEN];
3e5aeeb5
AZ
495
496 netdev_get_etheraddr(bond->active_slave->netdev, mac);
497 memcpy(bond->active_slave_mac, mac, sizeof bond->active_slave_mac);
498 bond->active_slave_changed = true;
499 seq_change(connectivity_seq_get());
500}
501
f8ddccd2 502static void
1ea24138 503bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
3bfd3972 504 OVS_REQ_WRLOCK(rwlock)
f8ddccd2
BP
505{
506 if (slave->netdev != netdev) {
f8ddccd2 507 slave->netdev = netdev;
1ea24138 508 slave->change_seq = 0;
f8ddccd2
BP
509 }
510}
511
f620b43a
BP
512/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
513 * arbitrary client-provided pointer that uniquely identifies a slave within a
514 * bond. If 'slave_' already exists within 'bond' then this function
515 * reconfigures the existing slave.
516 *
517 * 'netdev' must be the network device that 'slave_' represents. It is owned
518 * by the client, so the client must not close it before either unregistering
519 * 'slave_' or destroying 'bond'.
4d6fb5eb 520 */
f620b43a 521void
adcf00ba
AZ
522bond_slave_register(struct bond *bond, void *slave_,
523 ofp_port_t ofport, struct netdev *netdev)
f620b43a 524{
3bfd3972 525 struct bond_slave *slave;
f620b43a 526
3bfd3972
EJ
527 ovs_rwlock_wrlock(&rwlock);
528 slave = bond_slave_lookup(bond, slave_);
f620b43a
BP
529 if (!slave) {
530 slave = xzalloc(sizeof *slave);
531
532 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
533 slave->bond = bond;
534 slave->aux = slave_;
adcf00ba 535 slave->ofp_port = ofport;
f620b43a 536 slave->delay_expires = LLONG_MAX;
244b2160 537 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 538 bond->bond_revalidate = true;
244b2160 539
b3c18f66 540 slave->enabled = false;
4a1b8f30 541 bond_enable_slave(slave, netdev_get_carrier(netdev));
f620b43a
BP
542 }
543
1ea24138 544 bond_slave_set_netdev__(slave, netdev);
a6934aa9 545
f620b43a
BP
546 free(slave->name);
547 slave->name = xstrdup(netdev_get_name(netdev));
3bfd3972 548 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
549}
550
f8ddccd2
BP
551/* Updates the network device to be used with 'slave_' to 'netdev'.
552 *
553 * This is useful if the caller closes and re-opens the network device
554 * registered with bond_slave_register() but doesn't need to change anything
555 * else. */
556void
557bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
558{
3bfd3972
EJ
559 struct bond_slave *slave;
560
561 ovs_rwlock_wrlock(&rwlock);
562 slave = bond_slave_lookup(bond, slave_);
f8ddccd2 563 if (slave) {
1ea24138 564 bond_slave_set_netdev__(slave, netdev);
f8ddccd2 565 }
3bfd3972 566 ovs_rwlock_unlock(&rwlock);
f8ddccd2
BP
567}
568
f620b43a
BP
569/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
570 * then this function has no effect.
571 *
572 * Unregistering a slave invalidates all flows. */
573void
574bond_slave_unregister(struct bond *bond, const void *slave_)
575{
3bfd3972 576 struct bond_slave *slave;
f620b43a
BP
577 bool del_active;
578
3bfd3972
EJ
579 ovs_rwlock_wrlock(&rwlock);
580 slave = bond_slave_lookup(bond, slave_);
f620b43a 581 if (!slave) {
3bfd3972 582 goto out;
f620b43a
BP
583 }
584
4a1b8f30
EJ
585 bond->bond_revalidate = true;
586 bond_enable_slave(slave, false);
b3c18f66 587
f620b43a
BP
588 del_active = bond->active_slave == slave;
589 if (bond->hash) {
590 struct bond_entry *e;
591 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
592 if (e->slave == slave) {
593 e->slave = NULL;
594 }
595 }
596 }
597
598 free(slave->name);
599
600 hmap_remove(&bond->slaves, &slave->hmap_node);
601 /* Client owns 'slave->netdev'. */
602 free(slave);
603
604 if (del_active) {
4a1b8f30 605 bond_choose_active_slave(bond);
f620b43a
BP
606 bond->send_learning_packets = true;
607 }
3bfd3972
EJ
608out:
609 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
610}
611
296f6519
EJ
612/* Should be called on each slave in 'bond' before bond_run() to indicate
613 * whether or not 'slave_' may be enabled. This function is intended to allow
614 * other protocols to have some impact on bonding decisions. For example LACP
615 * or high level link monitoring protocols may decide that a given slave should
616 * not be able to send traffic. */
4d6fb5eb 617void
296f6519 618bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 619{
3bfd3972 620 ovs_rwlock_wrlock(&rwlock);
296f6519 621 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
3bfd3972 622 ovs_rwlock_unlock(&rwlock);
4d6fb5eb
EJ
623}
624
4a1b8f30
EJ
625/* Performs periodic maintenance on 'bond'.
626 *
627 * Returns true if the caller should revalidate its flows.
f620b43a
BP
628 *
629 * The caller should check bond_should_send_learning_packets() afterward. */
4a1b8f30
EJ
630bool
631bond_run(struct bond *bond, enum lacp_status lacp_status)
f620b43a
BP
632{
633 struct bond_slave *slave;
4a1b8f30 634 bool revalidate;
f620b43a 635
3bfd3972 636 ovs_rwlock_wrlock(&rwlock);
bdebeece
EJ
637 if (bond->lacp_status != lacp_status) {
638 bond->lacp_status = lacp_status;
4592d0e2
EJ
639 bond->bond_revalidate = true;
640 }
4d6fb5eb 641
f620b43a
BP
642 /* Enable slaves based on link status and LACP feedback. */
643 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
4a1b8f30 644 bond_link_status_update(slave);
da4a6191 645 slave->change_seq = seq_read(connectivity_seq_get());
f620b43a
BP
646 }
647 if (!bond->active_slave || !bond->active_slave->enabled) {
4a1b8f30 648 bond_choose_active_slave(bond);
f620b43a
BP
649 }
650
4a1b8f30
EJ
651 revalidate = bond->bond_revalidate;
652 bond->bond_revalidate = false;
3bfd3972 653 ovs_rwlock_unlock(&rwlock);
4a1b8f30
EJ
654
655 return revalidate;
f620b43a
BP
656}
657
658/* Causes poll_block() to wake up when 'bond' needs something to be done. */
659void
660bond_wait(struct bond *bond)
661{
662 struct bond_slave *slave;
663
3bfd3972 664 ovs_rwlock_rdlock(&rwlock);
f620b43a
BP
665 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
666 if (slave->delay_expires != LLONG_MAX) {
667 poll_timer_wait_until(slave->delay_expires);
668 }
1ea24138 669
da4a6191 670 seq_wait(connectivity_seq_get(), slave->change_seq);
f620b43a
BP
671 }
672
bbc13389 673 if (bond->bond_revalidate) {
f620b43a
BP
674 poll_immediate_wake();
675 }
3bfd3972 676 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
677
678 /* We don't wait for bond->next_rebalance because rebalancing can only run
679 * at a flow account checkpoint. ofproto does checkpointing on its own
680 * schedule and bond_rebalance() gets called afterward, so we'd just be
681 * waking up for no purpose. */
682}
683\f
684/* MAC learning table interaction. */
685
686static bool
687may_send_learning_packets(const struct bond *bond)
688{
9dd165e0
RK
689 return ((bond->lacp_status == LACP_DISABLED
690 && (bond->balance == BM_SLB || bond->balance == BM_AB))
691 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
bdebeece 692 && bond->active_slave;
f620b43a
BP
693}
694
695/* Returns true if 'bond' needs the client to send out packets to assist with
696 * MAC learning on 'bond'. If this function returns true, then the client
697 * should iterate through its MAC learning table for the bridge on which 'bond'
698 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 699 * it should call bond_compose_learning_packet().
f620b43a 700 *
477879ea
BP
701 * This function will only return true if 'bond' is in SLB or active-backup
702 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
703 * necessary.
f620b43a
BP
704 *
705 * Calling this function resets the state that it checks. */
706bool
707bond_should_send_learning_packets(struct bond *bond)
708{
3bfd3972
EJ
709 bool send;
710
711 ovs_rwlock_wrlock(&rwlock);
712 send = bond->send_learning_packets && may_send_learning_packets(bond);
f620b43a 713 bond->send_learning_packets = false;
3bfd3972 714 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
715 return send;
716}
717
718/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
719 *
ea131871
JG
720 * See bond_should_send_learning_packets() for description of usage. The
721 * caller should send the composed packet on the port associated with
722 * port_aux and takes ownership of the returned ofpbuf. */
cf62fa4c 723struct dp_packet *
ea131871
JG
724bond_compose_learning_packet(struct bond *bond,
725 const uint8_t eth_src[ETH_ADDR_LEN],
726 uint16_t vlan, void **port_aux)
f620b43a
BP
727{
728 struct bond_slave *slave;
cf62fa4c 729 struct dp_packet *packet;
f620b43a 730 struct flow flow;
f620b43a 731
3bfd3972 732 ovs_rwlock_rdlock(&rwlock);
cb22974d 733 ovs_assert(may_send_learning_packets(bond));
f620b43a
BP
734 memset(&flow, 0, sizeof flow);
735 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
4a1b8f30 736 slave = choose_output_slave(bond, &flow, NULL, vlan);
f620b43a 737
cf62fa4c 738 packet = dp_packet_new(0);
2ea838ac 739 compose_rarp(packet, eth_src);
f620b43a 740 if (vlan) {
1bf02876 741 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
f620b43a 742 }
f620b43a 743
ea131871 744 *port_aux = slave->aux;
3bfd3972 745 ovs_rwlock_unlock(&rwlock);
ea131871 746 return packet;
f620b43a
BP
747}
748\f
749/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
750 * Ethernet destination address of 'eth_dst', should be admitted.
751 *
752 * The return value is one of the following:
753 *
754 * - BV_ACCEPT: Admit the packet.
755 *
756 * - BV_DROP: Drop the packet.
757 *
758 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
759 * Ethernet source address and VLAN. If there is none, or if the packet
760 * is on the learned port, then admit the packet. If a different port has
761 * been learned, however, drop the packet (and do not use it for MAC
762 * learning).
763 */
764enum bond_verdict
765bond_check_admissibility(struct bond *bond, const void *slave_,
4a1b8f30 766 const uint8_t eth_dst[ETH_ADDR_LEN])
f620b43a 767{
3bfd3972
EJ
768 enum bond_verdict verdict = BV_DROP;
769 struct bond_slave *slave;
9a1c6450 770
3bfd3972
EJ
771 ovs_rwlock_rdlock(&rwlock);
772 slave = bond_slave_lookup(bond, slave_);
4222bbc8 773 if (!slave) {
3bfd3972 774 goto out;
4222bbc8
EJ
775 }
776
9a1c6450
EJ
777 /* LACP bonds have very loose admissibility restrictions because we can
778 * assume the remote switch is aware of the bond and will "do the right
779 * thing". However, as a precaution we drop packets on disabled slaves
780 * because no correctly implemented partner switch should be sending
bdebeece
EJ
781 * packets to them.
782 *
783 * If LACP is configured, but LACP negotiations have been unsuccessful, we
9dd165e0 784 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
bdebeece 785 switch (bond->lacp_status) {
3bfd3972
EJ
786 case LACP_NEGOTIATED:
787 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
788 goto out;
789 case LACP_CONFIGURED:
9dd165e0
RK
790 if (!bond->lacp_fallback_ab) {
791 goto out;
792 }
3bfd3972
EJ
793 case LACP_DISABLED:
794 break;
f620b43a
BP
795 }
796
797 /* Drop all multicast packets on inactive slaves. */
798 if (eth_addr_is_multicast(eth_dst)) {
4222bbc8 799 if (bond->active_slave != slave) {
3bfd3972 800 goto out;
f620b43a
BP
801 }
802 }
803
f931a4c9 804 switch (bond->balance) {
9dd165e0
RK
805 case BM_TCP:
806 /* TCP balanced bonds require successful LACP negotiations. Based on the
807 * above check, LACP is off or lacp_fallback_ab is true on this bond.
808 * If lacp_fallback_ab is true fall through to BM_AB case else, we
809 * drop all incoming traffic. */
810 if (!bond->lacp_fallback_ab) {
811 goto out;
812 }
813
f931a4c9
BP
814 case BM_AB:
815 /* Drop all packets which arrive on backup slaves. This is similar to
816 * how Linux bonding handles active-backup bonds. */
7ba7dcf0
EJ
817 if (bond->active_slave != slave) {
818 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
819
e6b2255c
BP
820 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
821 " slave (%s) destined for " ETH_ADDR_FMT,
822 slave->name, ETH_ADDR_ARGS(eth_dst));
3bfd3972 823 goto out;
7ba7dcf0 824 }
3bfd3972
EJ
825 verdict = BV_ACCEPT;
826 goto out;
f931a4c9 827
f931a4c9
BP
828 case BM_SLB:
829 /* Drop all packets for which we have learned a different input port,
830 * because we probably sent the packet on one slave and got it back on
831 * the other. Gratuitous ARP packets are an exception to this rule:
832 * the host has moved to another switch. The exception to the
833 * exception is if we locked the learning table to avoid reflections on
834 * bond slaves. */
3bfd3972
EJ
835 verdict = BV_DROP_IF_MOVED;
836 goto out;
7ba7dcf0
EJ
837 }
838
428b2edd 839 OVS_NOT_REACHED();
3bfd3972
EJ
840out:
841 ovs_rwlock_unlock(&rwlock);
842 return verdict;
843
f620b43a
BP
844}
845
846/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
847 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
848 * NULL if the packet should be dropped because no slaves are enabled.
849 *
850 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
851 * should be a VID only (i.e. excluding the PCP bits). Second,
852 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
853 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
854 * packet belongs to (so for an access port it will be the access port's VLAN).
855 *
bcd2633a
JP
856 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
857 * significant in the selection. At some point earlier, 'wc' should
858 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
859 */
860void *
861bond_choose_output_slave(struct bond *bond, const struct flow *flow,
4a1b8f30 862 struct flow_wildcards *wc, uint16_t vlan)
f620b43a 863{
3bfd3972 864 struct bond_slave *slave;
b5d5d7d3 865 void *aux;
3bfd3972
EJ
866
867 ovs_rwlock_rdlock(&rwlock);
4a1b8f30 868 slave = choose_output_slave(bond, flow, wc, vlan);
b5d5d7d3 869 aux = slave ? slave->aux : NULL;
3bfd3972 870 ovs_rwlock_unlock(&rwlock);
b5d5d7d3
AW
871
872 return aux;
f620b43a 873}
f620b43a 874\f
adcf00ba
AZ
875/* Recirculation. */
876static void
877bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
c6855ec5 878 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
879{
880 if (entry->slave) {
881 uint64_t delta;
882
883 delta = rule_tx_bytes - entry->pr_tx_bytes;
884 entry->tx_bytes += delta;
885 entry->pr_tx_bytes = rule_tx_bytes;
886 }
887}
888
889/* Maintain bond stats using post recirculation rule byte counters.*/
60cda7d6 890static void
adcf00ba 891bond_recirculation_account(struct bond *bond)
80316557 892 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
893{
894 int i;
895
adcf00ba
AZ
896 for (i=0; i<=BOND_MASK; i++) {
897 struct bond_entry *entry = &bond->hash[i];
898 struct rule *rule = entry->pr_rule;
899
900 if (rule) {
901 uint64_t n_packets OVS_UNUSED;
902 long long int used OVS_UNUSED;
903 uint64_t n_bytes;
904
905 rule->ofproto->ofproto_class->rule_get_stats(
906 rule, &n_packets, &n_bytes, &used);
907 bond_entry_account(entry, n_bytes);
908 }
909 }
adcf00ba
AZ
910}
911
912bool
913bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
914 uint32_t *hash_bias)
915{
80316557 916 if (bond->balance == BM_TCP && bond->recirc_id) {
adcf00ba
AZ
917 if (recirc_id) {
918 *recirc_id = bond->recirc_id;
919 }
920 if (hash_bias) {
921 *hash_bias = bond->basis;
922 }
923 return true;
924 } else {
925 return false;
926 }
927}
928
ca8127fd
AZ
929static void
930bond_update_post_recirc_rules__(struct bond* bond, const bool force)
931 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
932{
933 struct bond_entry *e;
934 bool update_rules = force; /* Always update rules if caller forces it. */
935
936 /* Make sure all bond entries are populated */
937 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
938 if (!e->slave || !e->slave->enabled) {
939 update_rules = true;
940 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
941 struct bond_slave, hmap_node);
942 if (!e->slave->enabled) {
943 e->slave = bond->active_slave;
944 }
945 }
946 }
947
948 if (update_rules) {
949 update_recirc_rules(bond);
950 }
951}
ca8127fd
AZ
952
953void
954bond_update_post_recirc_rules(struct bond* bond, const bool force)
955{
956 ovs_rwlock_wrlock(&rwlock);
957 bond_update_post_recirc_rules__(bond, force);
958 ovs_rwlock_unlock(&rwlock);
959}
adcf00ba 960\f
f620b43a
BP
961/* Rebalancing. */
962
1b137691 963static bool
3bfd3972 964bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
1b137691 965{
bc1b010c
EJ
966 return bond->rebalance_interval
967 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
968}
969
f620b43a
BP
970/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
971void
972bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
973 uint64_t n_bytes)
974{
3bfd3972 975 ovs_rwlock_wrlock(&rwlock);
1b137691 976 if (bond_is_balanced(bond)) {
f620b43a 977 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a 978 }
3bfd3972 979 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
980}
981
982static struct bond_slave *
ca6ba700 983bond_slave_from_bal_node(struct ovs_list *bal) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
984{
985 return CONTAINER_OF(bal, struct bond_slave, bal_node);
986}
987
988static void
ca6ba700 989log_bals(struct bond *bond, const struct ovs_list *bals)
c6855ec5 990 OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
991{
992 if (VLOG_IS_DBG_ENABLED()) {
993 struct ds ds = DS_EMPTY_INITIALIZER;
994 const struct bond_slave *slave;
995
996 LIST_FOR_EACH (slave, bal_node, bals) {
997 if (ds.length) {
998 ds_put_char(&ds, ',');
999 }
1000 ds_put_format(&ds, " %s %"PRIu64"kB",
1001 slave->name, slave->tx_bytes / 1024);
1002
1003 if (!slave->enabled) {
1004 ds_put_cstr(&ds, " (disabled)");
1005 }
1006 if (!list_is_empty(&slave->entries)) {
1007 struct bond_entry *e;
1008
1009 ds_put_cstr(&ds, " (");
1010 LIST_FOR_EACH (e, list_node, &slave->entries) {
1011 if (&e->list_node != list_front(&slave->entries)) {
1012 ds_put_cstr(&ds, " + ");
1013 }
34582733 1014 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
f620b43a
BP
1015 e - bond->hash, e->tx_bytes / 1024);
1016 }
1017 ds_put_cstr(&ds, ")");
1018 }
1019 }
1020 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
1021 ds_destroy(&ds);
1022 }
1023}
1024
1025/* Shifts 'hash' from its current slave to 'to'. */
1026static void
4a1b8f30 1027bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
c6855ec5 1028 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
1029{
1030 struct bond_slave *from = hash->slave;
1031 struct bond *bond = from->bond;
1032 uint64_t delta = hash->tx_bytes;
1033
34582733 1034 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
f620b43a
BP
1035 "from %s to %s (now carrying %"PRIu64"kB and "
1036 "%"PRIu64"kB load, respectively)",
1037 bond->name, delta / 1024, hash - bond->hash,
1038 from->name, to->name,
1039 (from->tx_bytes - delta) / 1024,
1040 (to->tx_bytes + delta) / 1024);
1041
1042 /* Shift load away from 'from' to 'to'. */
1043 from->tx_bytes -= delta;
1044 to->tx_bytes += delta;
1045
1046 /* Arrange for flows to be revalidated. */
dc30ea2d 1047 hash->slave = to;
4a1b8f30 1048 bond->bond_revalidate = true;
f620b43a
BP
1049}
1050
09a5d390
BP
1051/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1052 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
1053 * given that doing so must decrease the ratio of the load on the two slaves by
1054 * at least 0.1. Returns NULL if there is no appropriate entry.
1055 *
1056 * The list of entries isn't sorted. I don't know of a reason to prefer to
1057 * shift away small hashes or large hashes. */
1058static struct bond_entry *
1059choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
c6855ec5 1060 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
1061{
1062 struct bond_entry *e;
1063
1064 if (list_is_short(&from->entries)) {
1065 /* 'from' carries no more than one MAC hash, so shifting load away from
1066 * it would be pointless. */
1067 return NULL;
1068 }
1069
1070 LIST_FOR_EACH (e, list_node, &from->entries) {
1071 double old_ratio, new_ratio;
1072 uint64_t delta;
1073
1074 if (to_tx_bytes == 0) {
1075 /* Nothing on the new slave, move it. */
1076 return e;
1077 }
1078
1079 delta = e->tx_bytes;
1080 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1081 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
75fad143
ZK
1082 if (old_ratio - new_ratio > 0.1
1083 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1084 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1085 and 'to' slave have the same load. Therefore, we only move an
1086 entry if it decreases the load on 'from', and brings us closer
1087 to equal traffic load. */
f620b43a
BP
1088 return e;
1089 }
1090 }
1091
1092 return NULL;
1093}
1094
1095/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1096 * maintained. */
1097static void
ca6ba700 1098insert_bal(struct ovs_list *bals, struct bond_slave *slave)
f620b43a
BP
1099{
1100 struct bond_slave *pos;
1101
1102 LIST_FOR_EACH (pos, bal_node, bals) {
1103 if (slave->tx_bytes > pos->tx_bytes) {
1104 break;
1105 }
1106 }
1107 list_insert(&pos->bal_node, &slave->bal_node);
1108}
1109
1110/* Removes 'slave' from its current list and then inserts it into 'bals' so
1111 * that descending order of 'tx_bytes' is maintained. */
1112static void
ca6ba700 1113reinsert_bal(struct ovs_list *bals, struct bond_slave *slave)
f620b43a
BP
1114{
1115 list_remove(&slave->bal_node);
1116 insert_bal(bals, slave);
1117}
1118
1119/* If 'bond' needs rebalancing, does so.
1120 *
adcf00ba
AZ
1121 * The caller should have called bond_account() for each active flow, or in case
1122 * of recirculation is used, have called bond_recirculation_account(bond),
1123 * to ensure that flow data is consistently accounted at this point.
60cda7d6
AZ
1124 */
1125void
4a1b8f30 1126bond_rebalance(struct bond *bond)
f620b43a
BP
1127{
1128 struct bond_slave *slave;
1129 struct bond_entry *e;
ca6ba700 1130 struct ovs_list bals;
adcf00ba 1131 bool rebalanced = false;
60cda7d6 1132 bool use_recirc;
f620b43a 1133
3bfd3972 1134 ovs_rwlock_wrlock(&rwlock);
1b137691 1135 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
adcf00ba 1136 goto done;
f620b43a
BP
1137 }
1138 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1139
2494ccd7 1140 use_recirc = ofproto_dpif_get_support(bond->ofproto)->odp.recirc &&
60cda7d6
AZ
1141 bond_may_recirc(bond, NULL, NULL);
1142
1143 if (use_recirc) {
1144 bond_recirculation_account(bond);
1145 }
1146
f620b43a
BP
1147 /* Add each bond_entry to its slave's 'entries' list.
1148 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1149 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1150 slave->tx_bytes = 0;
1151 list_init(&slave->entries);
1152 }
1153 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1154 if (e->slave && e->tx_bytes) {
1155 e->slave->tx_bytes += e->tx_bytes;
1156 list_push_back(&e->slave->entries, &e->list_node);
1157 }
1158 }
1159
1160 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1161 *
1162 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1163 * with a proper list sort algorithm. */
1164 list_init(&bals);
1165 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1166 if (slave->enabled) {
1167 insert_bal(&bals, slave);
1168 }
1169 }
1170 log_bals(bond, &bals);
1171
1172 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1173 while (!list_is_short(&bals)) {
1174 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1175 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1176 uint64_t overload;
1177
1178 overload = from->tx_bytes - to->tx_bytes;
1179 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1180 /* The extra load on 'from' (and all less-loaded slaves), compared
1181 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1182 * it is less than ~1Mbps. No point in rebalancing. */
1183 break;
1184 }
1185
09a5d390
BP
1186 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1187 * to move from 'from' to 'to'. */
f620b43a
BP
1188 e = choose_entry_to_migrate(from, to->tx_bytes);
1189 if (e) {
4a1b8f30 1190 bond_shift_load(e, to);
f620b43a
BP
1191
1192 /* Delete element from from->entries.
1193 *
1194 * We don't add the element to to->hashes. That would only allow
1195 * 'e' to be migrated to another slave in this rebalancing run, and
1196 * there is no point in doing that. */
1197 list_remove(&e->list_node);
1198
1199 /* Re-sort 'bals'. */
1200 reinsert_bal(&bals, from);
1201 reinsert_bal(&bals, to);
60cda7d6 1202 rebalanced = true;
f620b43a
BP
1203 } else {
1204 /* Can't usefully migrate anything away from 'from'.
1205 * Don't reconsider it. */
1206 list_remove(&from->bal_node);
1207 }
1208 }
1209
1210 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1211 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1212 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1213 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1214 e->tx_bytes /= 2;
f620b43a 1215 }
adcf00ba 1216
60cda7d6 1217 if (use_recirc && rebalanced) {
ca8127fd 1218 bond_update_post_recirc_rules__(bond,true);
60cda7d6 1219 }
2f486d4c
AZ
1220
1221done:
3bfd3972 1222 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1223}
1224\f
1225/* Bonding unixctl user interface functions. */
1226
1227static struct bond *
3bfd3972 1228bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
1229{
1230 struct bond *bond;
1231
1232 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
3bfd3972 1233 all_bonds) {
f620b43a
BP
1234 if (!strcmp(bond->name, name)) {
1235 return bond;
1236 }
1237 }
1238 return NULL;
1239}
1240
1241static struct bond_slave *
1242bond_lookup_slave(struct bond *bond, const char *slave_name)
1243{
1244 struct bond_slave *slave;
1245
1246 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1247 if (!strcmp(slave->name, slave_name)) {
1248 return slave;
1249 }
1250 }
1251 return NULL;
1252}
1253
1254static void
1255bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
1256 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1257 void *aux OVS_UNUSED)
f620b43a
BP
1258{
1259 struct ds ds = DS_EMPTY_INITIALIZER;
1260 const struct bond *bond;
1261
adcf00ba 1262 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
f620b43a 1263
3bfd3972
EJ
1264 ovs_rwlock_rdlock(&rwlock);
1265 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
f620b43a
BP
1266 const struct bond_slave *slave;
1267 size_t i;
1268
adcf00ba
AZ
1269 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1270 bond_mode_to_string(bond->balance), bond->recirc_id);
f620b43a
BP
1271
1272 i = 0;
1273 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1274 if (i++ > 0) {
1275 ds_put_cstr(&ds, ", ");
1276 }
1277 ds_put_cstr(&ds, slave->name);
1278 }
1279 ds_put_char(&ds, '\n');
1280 }
3bfd3972 1281 ovs_rwlock_unlock(&rwlock);
bde9f75d 1282 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
1283 ds_destroy(&ds);
1284}
1285
1286static void
c33a8a25 1287bond_print_details(struct ds *ds, const struct bond *bond)
3bfd3972 1288 OVS_REQ_RDLOCK(rwlock)
f620b43a 1289{
fc1d4f01
EJ
1290 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1291 const struct shash_node **sorted_slaves = NULL;
f620b43a 1292 const struct bond_slave *slave;
adcf00ba
AZ
1293 bool may_recirc;
1294 uint32_t recirc_id;
fc1d4f01 1295 int i;
f620b43a 1296
c33a8a25
EJ
1297 ds_put_format(ds, "---- %s ----\n", bond->name);
1298 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
1299 bond_mode_to_string(bond->balance));
1300
adcf00ba
AZ
1301 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1302 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1303 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1304
c33a8a25 1305 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 1306
c33a8a25
EJ
1307 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1308 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 1309
1b137691 1310 if (bond_is_balanced(bond)) {
c33a8a25 1311 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
1312 bond->next_rebalance - time_msec());
1313 }
1314
bdebeece
EJ
1315 ds_put_cstr(ds, "lacp_status: ");
1316 switch (bond->lacp_status) {
1317 case LACP_NEGOTIATED:
1318 ds_put_cstr(ds, "negotiated\n");
1319 break;
1320 case LACP_CONFIGURED:
1321 ds_put_cstr(ds, "configured\n");
1322 break;
1323 case LACP_DISABLED:
1324 ds_put_cstr(ds, "off\n");
1325 break;
1326 default:
1327 ds_put_cstr(ds, "<unknown>\n");
1328 break;
1329 }
4d6fb5eb 1330
3e5aeeb5
AZ
1331 ds_put_cstr(ds, "active slave mac: ");
1332 ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_slave_mac));
1333 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1334 ds_put_format(ds,"(%s)\n", slave ? slave->name : "none");
1335
f620b43a 1336 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
1337 shash_add(&slave_shash, slave->name, slave);
1338 }
1339 sorted_slaves = shash_sort(&slave_shash);
1340
1341 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 1342 struct bond_entry *be;
f620b43a 1343
fc1d4f01
EJ
1344 slave = sorted_slaves[i]->data;
1345
f620b43a 1346 /* Basic info. */
c33a8a25 1347 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
1348 slave->name, slave->enabled ? "enabled" : "disabled");
1349 if (slave == bond->active_slave) {
c33a8a25 1350 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
1351 }
1352 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 1353 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
1354 slave->enabled ? "downdelay" : "updelay",
1355 slave->delay_expires - time_msec());
1356 }
1357
c33a8a25 1358 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 1359 slave->may_enable ? "true" : "false");
4d6fb5eb 1360
1b137691 1361 if (!bond_is_balanced(bond)) {
f620b43a
BP
1362 continue;
1363 }
1364
1365 /* Hashes. */
f620b43a
BP
1366 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1367 int hash = be - bond->hash;
f6ba1f35 1368 uint64_t be_tx_k;
f620b43a
BP
1369
1370 if (be->slave != slave) {
1371 continue;
1372 }
1373
f6ba1f35
AZ
1374 be_tx_k = be->tx_bytes / 1024;
1375 if (be_tx_k) {
1376 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1377 hash, be_tx_k);
1378 }
f620b43a 1379
7b9f1974 1380 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1381 }
1382 }
fc1d4f01
EJ
1383 shash_destroy(&slave_shash);
1384 free(sorted_slaves);
c33a8a25
EJ
1385 ds_put_cstr(ds, "\n");
1386}
1387
1388static void
1389bond_unixctl_show(struct unixctl_conn *conn,
1390 int argc, const char *argv[],
1391 void *aux OVS_UNUSED)
1392{
1393 struct ds ds = DS_EMPTY_INITIALIZER;
1394
3bfd3972 1395 ovs_rwlock_rdlock(&rwlock);
c33a8a25
EJ
1396 if (argc > 1) {
1397 const struct bond *bond = bond_find(argv[1]);
1398
1399 if (!bond) {
bde9f75d 1400 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1401 goto out;
c33a8a25
EJ
1402 }
1403 bond_print_details(&ds, bond);
1404 } else {
1405 const struct bond *bond;
1406
3bfd3972 1407 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
c33a8a25
EJ
1408 bond_print_details(&ds, bond);
1409 }
1410 }
1411
bde9f75d 1412 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a 1413 ds_destroy(&ds);
3bfd3972
EJ
1414
1415out:
1416 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1417}
1418
1419static void
0e15264f
BP
1420bond_unixctl_migrate(struct unixctl_conn *conn,
1421 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1422 void *aux OVS_UNUSED)
1423{
0e15264f
BP
1424 const char *bond_s = argv[1];
1425 const char *hash_s = argv[2];
1426 const char *slave_s = argv[3];
f620b43a
BP
1427 struct bond *bond;
1428 struct bond_slave *slave;
1429 struct bond_entry *entry;
1430 int hash;
1431
3bfd3972 1432 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1433 bond = bond_find(bond_s);
1434 if (!bond) {
bde9f75d 1435 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1436 goto out;
f620b43a
BP
1437 }
1438
1439 if (bond->balance != BM_SLB) {
bde9f75d 1440 unixctl_command_reply_error(conn, "not an SLB bond");
3bfd3972 1441 goto out;
f620b43a
BP
1442 }
1443
1444 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1445 hash = atoi(hash_s) & BOND_MASK;
1446 } else {
bde9f75d 1447 unixctl_command_reply_error(conn, "bad hash");
3bfd3972 1448 goto out;
f620b43a
BP
1449 }
1450
1451 slave = bond_lookup_slave(bond, slave_s);
1452 if (!slave) {
bde9f75d 1453 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1454 goto out;
f620b43a
BP
1455 }
1456
1457 if (!slave->enabled) {
bde9f75d 1458 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
3bfd3972 1459 goto out;
f620b43a
BP
1460 }
1461
1462 entry = &bond->hash[hash];
4a1b8f30 1463 bond->bond_revalidate = true;
f620b43a 1464 entry->slave = slave;
bde9f75d 1465 unixctl_command_reply(conn, "migrated");
3bfd3972
EJ
1466
1467out:
1468 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1469}
1470
1471static void
0e15264f
BP
1472bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1473 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1474 void *aux OVS_UNUSED)
1475{
0e15264f
BP
1476 const char *bond_s = argv[1];
1477 const char *slave_s = argv[2];
f620b43a
BP
1478 struct bond *bond;
1479 struct bond_slave *slave;
1480
3bfd3972 1481 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1482 bond = bond_find(bond_s);
1483 if (!bond) {
bde9f75d 1484 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1485 goto out;
f620b43a
BP
1486 }
1487
1488 slave = bond_lookup_slave(bond, slave_s);
1489 if (!slave) {
bde9f75d 1490 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1491 goto out;
f620b43a
BP
1492 }
1493
1494 if (!slave->enabled) {
bde9f75d 1495 unixctl_command_reply_error(conn, "cannot make disabled slave active");
3bfd3972 1496 goto out;
f620b43a
BP
1497 }
1498
1499 if (bond->active_slave != slave) {
4a1b8f30 1500 bond->bond_revalidate = true;
f620b43a 1501 bond->active_slave = slave;
f620b43a
BP
1502 VLOG_INFO("bond %s: active interface is now %s",
1503 bond->name, slave->name);
1504 bond->send_learning_packets = true;
bde9f75d 1505 unixctl_command_reply(conn, "done");
3e5aeeb5 1506 bond_active_slave_changed(bond);
f620b43a 1507 } else {
bde9f75d 1508 unixctl_command_reply(conn, "no change");
f620b43a 1509 }
3bfd3972
EJ
1510out:
1511 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1512}
1513
1514static void
0e15264f 1515enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1516{
0e15264f
BP
1517 const char *bond_s = argv[1];
1518 const char *slave_s = argv[2];
f620b43a
BP
1519 struct bond *bond;
1520 struct bond_slave *slave;
1521
3bfd3972 1522 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1523 bond = bond_find(bond_s);
1524 if (!bond) {
bde9f75d 1525 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1526 goto out;
f620b43a
BP
1527 }
1528
1529 slave = bond_lookup_slave(bond, slave_s);
1530 if (!slave) {
bde9f75d 1531 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1532 goto out;
f620b43a
BP
1533 }
1534
4a1b8f30 1535 bond_enable_slave(slave, enable);
bde9f75d 1536 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
3bfd3972
EJ
1537
1538out:
1539 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1540}
1541
1542static void
0e15264f
BP
1543bond_unixctl_enable_slave(struct unixctl_conn *conn,
1544 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1545 void *aux OVS_UNUSED)
1546{
0e15264f 1547 enable_slave(conn, argv, true);
f620b43a
BP
1548}
1549
1550static void
0e15264f
BP
1551bond_unixctl_disable_slave(struct unixctl_conn *conn,
1552 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1553 void *aux OVS_UNUSED)
1554{
0e15264f 1555 enable_slave(conn, argv, false);
f620b43a
BP
1556}
1557
1558static void
0e15264f 1559bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1560 void *aux OVS_UNUSED)
1561{
0e15264f
BP
1562 const char *mac_s = argv[1];
1563 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1564 const char *basis_s = argc > 3 ? argv[3] : NULL;
f620b43a
BP
1565 uint8_t mac[ETH_ADDR_LEN];
1566 uint8_t hash;
1567 char *hash_cstr;
1568 unsigned int vlan;
672d18b2 1569 uint32_t basis;
f620b43a
BP
1570
1571 if (vlan_s) {
c2c28dfd 1572 if (!ovs_scan(vlan_s, "%u", &vlan)) {
bde9f75d 1573 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1574 return;
1575 }
1576 } else {
dc155bff 1577 vlan = 0;
f620b43a
BP
1578 }
1579
672d18b2 1580 if (basis_s) {
c2c28dfd 1581 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
bde9f75d 1582 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1583 return;
1584 }
1585 } else {
1586 basis = 0;
1587 }
1588
c2c28dfd 1589 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
672d18b2 1590 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1591
1592 hash_cstr = xasprintf("%u", hash);
bde9f75d 1593 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1594 free(hash_cstr);
1595 } else {
bde9f75d 1596 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1597 }
1598}
1599
1600void
1601bond_init(void)
1602{
0e15264f 1603 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1604 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1605 NULL);
0e15264f 1606 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1607 bond_unixctl_migrate, NULL);
0e15264f 1608 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1609 bond_unixctl_set_active_slave, NULL);
0e15264f 1610 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1611 bond_unixctl_enable_slave, NULL);
0e15264f 1612 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1613 bond_unixctl_disable_slave, NULL);
0e15264f 1614 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1615 bond_unixctl_hash, NULL);
f620b43a
BP
1616}
1617\f
95aafb2a
EJ
1618static void
1619bond_entry_reset(struct bond *bond)
1620{
1621 if (bond->balance != BM_AB) {
9e1a6910 1622 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
95aafb2a
EJ
1623
1624 if (!bond->hash) {
1625 bond->hash = xmalloc(hash_len);
1626 }
1627 memset(bond->hash, 0, hash_len);
1628
1629 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1630 } else {
1631 free(bond->hash);
1632 bond->hash = NULL;
1633 }
1634}
1635
f620b43a
BP
1636static struct bond_slave *
1637bond_slave_lookup(struct bond *bond, const void *slave_)
1638{
1639 struct bond_slave *slave;
1640
1641 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1642 &bond->slaves) {
1643 if (slave->aux == slave_) {
1644 return slave;
1645 }
1646 }
1647
1648 return NULL;
1649}
1650
f620b43a 1651static void
4a1b8f30 1652bond_enable_slave(struct bond_slave *slave, bool enable)
f620b43a
BP
1653{
1654 slave->delay_expires = LLONG_MAX;
1655 if (enable != slave->enabled) {
4a1b8f30 1656 slave->bond->bond_revalidate = true;
f620b43a 1657 slave->enabled = enable;
f1c8a79c
AW
1658
1659 ovs_mutex_lock(&slave->bond->mutex);
1660 if (enable) {
1661 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1662 } else {
1663 list_remove(&slave->list_node);
1664 }
1665 ovs_mutex_unlock(&slave->bond->mutex);
1666
4a1b8f30
EJ
1667 VLOG_INFO("interface %s: %s", slave->name,
1668 slave->enabled ? "enabled" : "disabled");
f620b43a
BP
1669 }
1670}
1671
1672static void
4a1b8f30 1673bond_link_status_update(struct bond_slave *slave)
f620b43a
BP
1674{
1675 struct bond *bond = slave->bond;
1676 bool up;
1677
296f6519 1678 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1679 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1680 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1681 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1682 slave->name, up ? "up" : "down");
1683 if (up == slave->enabled) {
1684 slave->delay_expires = LLONG_MAX;
1685 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1686 slave->name, up ? "disabled" : "enabled");
1687 } else {
bdebeece 1688 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1689 : up ? bond->updelay : bond->downdelay);
1690 slave->delay_expires = time_msec() + delay;
1691 if (delay) {
1692 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1693 "for %d ms",
1694 slave->name,
1695 up ? "enabled" : "disabled",
1696 up ? "up" : "down",
1697 delay);
1698 }
1699 }
1700 }
1701
1702 if (time_msec() >= slave->delay_expires) {
4a1b8f30 1703 bond_enable_slave(slave, up);
f620b43a
BP
1704 }
1705}
1706
f620b43a 1707static unsigned int
672d18b2 1708bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
f620b43a 1709{
7e36ac42 1710 return hash_mac(mac, vlan, basis);
f620b43a
BP
1711}
1712
1713static unsigned int
672d18b2 1714bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
f620b43a
BP
1715{
1716 struct flow hash_flow = *flow;
d84d4b88 1717 hash_flow.vlan_tci = htons(vlan);
f620b43a
BP
1718
1719 /* The symmetric quality of this hash function is not required, but
1720 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1721 * purposes, so we use it out of convenience. */
672d18b2 1722 return flow_hash_symmetric_l4(&hash_flow, basis);
f620b43a
BP
1723}
1724
fb0b29a3
EJ
1725static unsigned int
1726bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1727{
cb22974d 1728 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1729
bdebeece 1730 return (bond->balance == BM_TCP
672d18b2
EJ
1731 ? bond_hash_tcp(flow, vlan, bond->basis)
1732 : bond_hash_src(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1733}
1734
f620b43a
BP
1735static struct bond_entry *
1736lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1737 uint16_t vlan)
1738{
fb0b29a3 1739 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1740}
1741
f1c8a79c
AW
1742/* Selects and returns an enabled slave from the 'enabled_slaves' list
1743 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1744 * returns NULL. */
1745static struct bond_slave *
1746get_enabled_slave(struct bond *bond)
1747{
ca6ba700 1748 struct ovs_list *node;
f1c8a79c
AW
1749
1750 ovs_mutex_lock(&bond->mutex);
1751 if (list_is_empty(&bond->enabled_slaves)) {
1752 ovs_mutex_unlock(&bond->mutex);
1753 return NULL;
1754 }
1755
1756 node = list_pop_front(&bond->enabled_slaves);
1757 list_push_back(&bond->enabled_slaves, node);
1758 ovs_mutex_unlock(&bond->mutex);
1759
1760 return CONTAINER_OF(node, struct bond_slave, list_node);
1761}
1762
f620b43a
BP
1763static struct bond_slave *
1764choose_output_slave(const struct bond *bond, const struct flow *flow,
4a1b8f30 1765 struct flow_wildcards *wc, uint16_t vlan)
f620b43a
BP
1766{
1767 struct bond_entry *e;
9dd165e0 1768 int balance;
f620b43a 1769
9dd165e0 1770 balance = bond->balance;
bdebeece
EJ
1771 if (bond->lacp_status == LACP_CONFIGURED) {
1772 /* LACP has been configured on this bond but negotiations were
9dd165e0
RK
1773 * unsuccussful. If lacp_fallback_ab is enabled use active-
1774 * backup mode else drop all traffic. */
1775 if (!bond->lacp_fallback_ab) {
1776 return NULL;
1777 }
1778 balance = BM_AB;
bdebeece
EJ
1779 }
1780
9dd165e0 1781 switch (balance) {
f620b43a
BP
1782 case BM_AB:
1783 return bond->active_slave;
1784
f620b43a 1785 case BM_TCP:
bdebeece
EJ
1786 if (bond->lacp_status != LACP_NEGOTIATED) {
1787 /* Must have LACP negotiations for TCP balanced bonds. */
1788 return NULL;
1789 }
bcd2633a 1790 if (wc) {
6cdd5145 1791 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
bcd2633a 1792 }
bdebeece
EJ
1793 /* Fall Through. */
1794 case BM_SLB:
bcd2633a 1795 if (wc) {
6cdd5145 1796 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
bcd2633a 1797 }
f620b43a
BP
1798 e = lookup_bond_entry(bond, flow, vlan);
1799 if (!e->slave || !e->slave->enabled) {
f1c8a79c 1800 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
f620b43a
BP
1801 }
1802 return e->slave;
1803
1804 default:
428b2edd 1805 OVS_NOT_REACHED();
f620b43a
BP
1806 }
1807}
1808
1809static struct bond_slave *
1810bond_choose_slave(const struct bond *bond)
1811{
1812 struct bond_slave *slave, *best;
1813
3e5aeeb5
AZ
1814 /* Find the last active slave. */
1815 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1816 if (slave && slave->enabled) {
1817 return slave;
1818 }
1819
f620b43a
BP
1820 /* Find an enabled slave. */
1821 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1822 if (slave->enabled) {
1823 return slave;
1824 }
1825 }
1826
1827 /* All interfaces are disabled. Find an interface that will be enabled
1828 * after its updelay expires. */
1829 best = NULL;
1830 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1831 if (slave->delay_expires != LLONG_MAX
296f6519 1832 && slave->may_enable
f620b43a
BP
1833 && (!best || slave->delay_expires < best->delay_expires)) {
1834 best = slave;
1835 }
1836 }
1837 return best;
1838}
1839
1840static void
4a1b8f30 1841bond_choose_active_slave(struct bond *bond)
f620b43a
BP
1842{
1843 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1844 struct bond_slave *old_active_slave = bond->active_slave;
1845
1846 bond->active_slave = bond_choose_slave(bond);
1847 if (bond->active_slave) {
1848 if (bond->active_slave->enabled) {
1849 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1850 bond->name, bond->active_slave->name);
1851 } else {
1852 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1853 "remaining %lld ms updelay (since no interface was "
1854 "enabled)", bond->name, bond->active_slave->name,
1855 bond->active_slave->delay_expires - time_msec());
4a1b8f30 1856 bond_enable_slave(bond->active_slave, true);
f620b43a
BP
1857 }
1858
1859 bond->send_learning_packets = true;
3e5aeeb5
AZ
1860
1861 if (bond->active_slave != old_active_slave) {
1862 bond_active_slave_changed(bond);
1863 }
f620b43a 1864 } else if (old_active_slave) {
d28b9ead 1865 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1866 }
1867}
3e5aeeb5
AZ
1868
1869/*
1870 * Return true if bond has unstored active slave change.
1871 * If return true, 'mac' will store the bond's current active slave's
1872 * MAC address. */
1873bool
1874bond_get_changed_active_slave(const char *name, uint8_t* mac, bool force)
1875{
1876 struct bond *bond;
1877
1878 ovs_rwlock_wrlock(&rwlock);
1879 bond = bond_find(name);
1880 if (bond) {
1881 if (bond->active_slave_changed || force) {
1882 memcpy(mac, bond->active_slave_mac, ETH_ADDR_LEN);
1883 bond->active_slave_changed = false;
1884 ovs_rwlock_unlock(&rwlock);
1885 return true;
1886 }
1887 }
1888 ovs_rwlock_unlock(&rwlock);
1889
1890 return false;
1891}