]> git.proxmox.com Git - mirror_ovs.git/blame - ofproto/bond.c
cirrus: Use FreeBSD 12.2.
[mirror_ovs.git] / ofproto / bond.c
CommitLineData
f620b43a 1/*
50f96b10 2 * Copyright (c) 2008-2017 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a 25
da4a6191 26#include "connectivity.h"
f620b43a 27#include "coverage.h"
b598f214 28#include "dp-packet.h"
f620b43a 29#include "flow.h"
ee89ea7b 30#include "openvswitch/hmap.h"
bdebeece 31#include "lacp.h"
f620b43a
BP
32#include "netdev.h"
33#include "odp-util.h"
b598f214
BW
34#include "ofproto/ofproto-dpif.h"
35#include "ofproto/ofproto-dpif-rid.h"
36#include "ofproto/ofproto-provider.h"
37#include "openvswitch/dynamic-string.h"
38#include "openvswitch/list.h"
39#include "openvswitch/match.h"
40#include "openvswitch/ofp-actions.h"
64c96779 41#include "openvswitch/ofpbuf.h"
b598f214 42#include "openvswitch/vlog.h"
f620b43a 43#include "packets.h"
fd016ae3 44#include "openvswitch/poll-loop.h"
da4a6191 45#include "seq.h"
ee89ea7b 46#include "openvswitch/shash.h"
f620b43a
BP
47#include "timeval.h"
48#include "unixctl.h"
ee89ea7b 49#include "util.h"
f620b43a
BP
50
51VLOG_DEFINE_THIS_MODULE(bond);
52
f1c8a79c
AW
53static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
54static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
55static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
56
07a3cd5c
BP
57/* Priority for internal rules created to handle recirculation */
58#define RECIRC_RULE_PRIORITY 20
59
91fc374a 60/* A hash bucket for mapping a flow to a member interface.
9e1a6910 61 * "struct bond" has an array of BOND_BUCKETS of these. */
f620b43a 62struct bond_entry {
91fc374a 63 struct bond_member *member; /* Assigned member, NULL if unassigned. */
c6855ec5
JS
64 uint64_t tx_bytes /* Count of bytes recently transmitted. */
65 OVS_GUARDED_BY(rwlock);
91fc374a 66 struct ovs_list list_node; /* In bond_member's 'entries' list. */
adcf00ba 67
c6855ec5
JS
68 /* Recirculation.
69 *
70 * 'pr_rule' is the post-recirculation rule for this entry.
71 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
72 * is used to determine delta (applied to 'tx_bytes' above.) */
73 struct rule *pr_rule;
74 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
f620b43a
BP
75};
76
91fc374a
BP
77/* A bond member interface, that is, one of the links comprising a bond. */
78struct bond_member {
79 struct hmap_node hmap_node; /* In struct bond's members hmap. */
80 struct ovs_list list_node; /* In struct bond's enabled_members list. */
81 struct bond *bond; /* The bond that contains this member. */
82 void *aux; /* Client-provided handle for this member. */
f620b43a
BP
83
84 struct netdev *netdev; /* Network device, owned by the client. */
6422372c 85 uint64_t change_seq; /* Tracks changes in 'netdev'. */
f620b43a 86 char *name; /* Name (a copy of netdev_get_name(netdev)). */
abec9228 87 ofp_port_t ofp_port; /* OpenFlow port number. */
f620b43a
BP
88
89 /* Link status. */
f620b43a 90 bool enabled; /* May be chosen for flows? */
91fc374a
BP
91 bool may_enable; /* Client considers this member bondable. */
92 bool is_primary; /* This member is preferred over others. */
abec9228 93 long long delay_expires; /* Time after which 'enabled' may change. */
f620b43a
BP
94
95 /* Rebalancing info. Used only by bond_rebalance(). */
ca6ba700
TG
96 struct ovs_list bal_node; /* In bond_rebalance()'s 'bals' list. */
97 struct ovs_list entries; /* 'struct bond_entry's assigned here. */
f620b43a
BP
98 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
99};
100
101/* A bond, that is, a set of network devices grouped to improve performance or
102 * robustness. */
103struct bond {
104 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
105 char *name; /* Name provided by client. */
adcf00ba 106 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
f620b43a 107
91fc374a
BP
108 /* Members. */
109 struct hmap members;
f620b43a 110
91fc374a 111 /* Enabled members.
f1c8a79c 112 *
91fc374a
BP
113 * Any reader or writer of 'enabled_members' must hold 'mutex'.
114 * (To prevent the bond_member from disappearing they must also hold
f1c8a79c
AW
115 * 'rwlock'.) */
116 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
91fc374a 117 struct ovs_list enabled_members OVS_GUARDED; /* Of struct bond_members. */
f1c8a79c 118
f620b43a
BP
119 /* Bonding info. */
120 enum bond_mode balance; /* Balancing mode, one of BM_*. */
91fc374a
BP
121 struct bond_member *active_member;
122 int updelay, downdelay; /* Delay before member goes up/down, in ms. */
bdebeece 123 enum lacp_status lacp_status; /* Status of LACP negotiations. */
62904702 124 bool bond_revalidate; /* True if flows need revalidation. */
672d18b2 125 uint32_t basis; /* Basis for flow hash function. */
9df65060
VDA
126 bool use_lb_output_action; /* Use lb_output action to avoid recirculation.
127 Applicable only for Balance TCP mode. */
91fc374a 128 char *primary; /* Name of the primary member. */
f620b43a
BP
129
130 /* SLB specific bonding info. */
9e1a6910 131 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
f620b43a
BP
132 int rebalance_interval; /* Interval between rebalances, in ms. */
133 long long int next_rebalance; /* Next rebalancing time. */
134 bool send_learning_packets;
adcf00ba
AZ
135 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
136 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
f620b43a 137
91fc374a
BP
138 /* Store active member to OVSDB. */
139 bool active_member_changed; /* Set to true whenever the bond changes active
140 * member. It will be reset to false after
141 * it is stored into OVSDB */
3e5aeeb5
AZ
142
143 /* Interface name may not be persistent across an OS reboot, use
91fc374a
BP
144 * MAC address for identifing the active member. */
145 struct eth_addr active_member_mac; /* MAC address of the active member. */
f620b43a 146 /* Legacy compatibility. */
9dd165e0 147 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
f620b43a 148
37bec3d3 149 struct ovs_refcount ref_cnt;
f620b43a
BP
150};
151
adcf00ba
AZ
/* The action to take for a 'struct bond_pr_rule_op'. */
enum bond_op {
    ADD,                        /* Install the rule in ofproto's flow table. */
    DEL,                        /* Remove the rule from ofproto's flow
                                 * table. */
};
157
158/* A rule to add to or delete from ofproto's internal flow table. */
159struct bond_pr_rule_op {
160 struct hmap_node hmap_node;
161 struct match match;
162 ofp_port_t out_ofport;
163 enum bond_op op;
6c932bc8 164 struct rule **pr_rule;
adcf00ba
AZ
165};
166
3bfd3972 167static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
91fc374a 168static struct bond_member *bond_member_lookup(struct bond *, const void *member_)
3bfd3972 169 OVS_REQ_RDLOCK(rwlock);
91fc374a 170static void bond_enable_member(struct bond_member *, bool enable)
4a1b8f30 171 OVS_REQ_WRLOCK(rwlock);
91fc374a 172static void bond_link_status_update(struct bond_member *)
3bfd3972 173 OVS_REQ_WRLOCK(rwlock);
91fc374a 174static void bond_choose_active_member(struct bond *)
9e1a6910 175 OVS_REQ_WRLOCK(rwlock);
f620b43a
BP
176static struct bond_entry *lookup_bond_entry(const struct bond *,
177 const struct flow *,
3bfd3972
EJ
178 uint16_t vlan)
179 OVS_REQ_RDLOCK(rwlock);
91fc374a 180static struct bond_member *get_enabled_member(struct bond *)
f1c8a79c 181 OVS_REQ_RDLOCK(rwlock);
91fc374a
BP
182static struct bond_member *choose_output_member(const struct bond *,
183 const struct flow *,
184 struct flow_wildcards *,
185 uint16_t vlan)
3bfd3972 186 OVS_REQ_RDLOCK(rwlock);
9df65060 187static void update_recirc_rules__(struct bond *);
90061ea7 188static bool bond_is_falling_back_to_ab(const struct bond *);
9df65060
VDA
189static void bond_add_lb_output_buckets(const struct bond *);
190static void bond_del_lb_output_buckets(const struct bond *);
f620b43a
BP
191
192/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
193 * stores the mode in '*balance' and returns true. Otherwise returns false
194 * without modifying '*balance'. */
195bool
196bond_mode_from_string(enum bond_mode *balance, const char *s)
197{
198 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
199 *balance = BM_TCP;
200 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
201 *balance = BM_SLB;
202 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
203 *balance = BM_AB;
204 } else {
205 return false;
206 }
207 return true;
208}
209
210/* Returns a string representing 'balance'. */
211const char *
212bond_mode_to_string(enum bond_mode balance) {
213 switch (balance) {
214 case BM_TCP:
215 return "balance-tcp";
216 case BM_SLB:
217 return "balance-slb";
218 case BM_AB:
219 return "active-backup";
220 }
428b2edd 221 OVS_NOT_REACHED();
f620b43a
BP
222}
223
f620b43a
BP
224\f
225/* Creates and returns a new bond whose configuration is initially taken from
226 * 's'.
227 *
91fc374a
BP
228 * The caller should register each member on the new bond by calling
229 * bond_member_register(). */
f620b43a 230struct bond *
adcf00ba 231bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
f620b43a
BP
232{
233 struct bond *bond;
234
235 bond = xzalloc(sizeof *bond);
adcf00ba 236 bond->ofproto = ofproto;
91fc374a
BP
237 hmap_init(&bond->members);
238 ovs_list_init(&bond->enabled_members);
f1c8a79c 239 ovs_mutex_init(&bond->mutex);
37bec3d3 240 ovs_refcount_init(&bond->ref_cnt);
adcf00ba
AZ
241 hmap_init(&bond->pr_rule_ops);
242
91fc374a
BP
243 bond->active_member_mac = eth_addr_zero;
244 bond->active_member_changed = false;
b4e50218 245 bond->primary = NULL;
30353934 246
f620b43a 247 bond_reconfigure(bond, s);
f620b43a
BP
248 return bond;
249}
250
03366a2d
EJ
251struct bond *
252bond_ref(const struct bond *bond_)
253{
254 struct bond *bond = CONST_CAST(struct bond *, bond_);
255
bca0b3b4 256 if (bond) {
37bec3d3 257 ovs_refcount_ref(&bond->ref_cnt);
bca0b3b4 258 }
03366a2d
EJ
259 return bond;
260}
261
f620b43a
BP
262/* Frees 'bond'. */
263void
03366a2d 264bond_unref(struct bond *bond)
f620b43a 265{
91fc374a 266 struct bond_member *member;
f620b43a 267
24f83812 268 if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) {
03366a2d
EJ
269 return;
270 }
271
3bfd3972
EJ
272 ovs_rwlock_wrlock(&rwlock);
273 hmap_remove(all_bonds, &bond->hmap_node);
274 ovs_rwlock_unlock(&rwlock);
f620b43a 275
91fc374a
BP
276 HMAP_FOR_EACH_POP (member, hmap_node, &bond->members) {
277 /* Client owns 'member->netdev'. */
278 free(member->name);
279 free(member);
f620b43a 280 }
91fc374a 281 hmap_destroy(&bond->members);
f620b43a 282
f1c8a79c 283 ovs_mutex_destroy(&bond->mutex);
adcf00ba 284
05df1623 285 /* Free bond resources. Remove existing post recirc rules. */
adcf00ba 286 if (bond->recirc_id) {
9df65060
VDA
287 if (bond_use_lb_output_action(bond)) {
288 /* Delete bond buckets from datapath if installed. */
289 bond_del_lb_output_buckets(bond);
290 }
e672ff9b 291 recirc_free_id(bond->recirc_id);
05df1623 292 bond->recirc_id = 0;
adcf00ba 293 }
05df1623
AZ
294 free(bond->hash);
295 bond->hash = NULL;
296 update_recirc_rules__(bond);
adcf00ba 297
05df1623 298 hmap_destroy(&bond->pr_rule_ops);
b4e50218 299 free(bond->primary);
05df1623 300 free(bond->name);
f620b43a
BP
301 free(bond);
302}
303
adcf00ba
AZ
304static void
305add_pr_rule(struct bond *bond, const struct match *match,
6c932bc8 306 ofp_port_t out_ofport, struct rule **rule)
adcf00ba
AZ
307{
308 uint32_t hash = match_hash(match, 0);
309 struct bond_pr_rule_op *pr_op;
310
311 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
312 if (match_equal(&pr_op->match, match)) {
313 pr_op->op = ADD;
314 pr_op->out_ofport = out_ofport;
315 pr_op->pr_rule = rule;
316 return;
317 }
318 }
319
320 pr_op = xmalloc(sizeof *pr_op);
321 pr_op->match = *match;
322 pr_op->op = ADD;
323 pr_op->out_ofport = out_ofport;
324 pr_op->pr_rule = rule;
325 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
326}
327
05df1623
AZ
328/* This function should almost never be called directly.
329 * 'update_recirc_rules()' should be called instead. Since
330 * this function modifies 'bond->pr_rule_ops', it is only
331 * safe when 'rwlock' is held.
332 *
333 * However, when the 'bond' is the only reference in the system,
334 * calling this function avoid acquiring lock only to satisfy
335 * lock annotation. Currently, only 'bond_unref()' calls
336 * this function directly. */
adcf00ba 337static void
05df1623 338update_recirc_rules__(struct bond *bond)
adcf00ba
AZ
339{
340 struct match match;
341 struct bond_pr_rule_op *pr_op, *next_op;
342 uint64_t ofpacts_stub[128 / 8];
343 struct ofpbuf ofpacts;
344 int i;
345
adcf00ba
AZ
346 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
347 pr_op->op = DEL;
348 }
349
6c932bc8 350 if (bond->hash && bond->recirc_id) {
9df65060
VDA
351 if (bond_use_lb_output_action(bond)) {
352 bond_add_lb_output_buckets(bond);
353 /* No need to install post recirculation rules as we are using
354 * lb_output action with bond buckets.
355 */
356 return;
357 } else {
358 for (i = 0; i < BOND_BUCKETS; i++) {
91fc374a 359 struct bond_member *member = bond->hash[i].member;
adcf00ba 360
91fc374a 361 if (member) {
9df65060
VDA
362 match_init_catchall(&match);
363 match_set_recirc_id(&match, bond->recirc_id);
364 match_set_dp_hash_masked(&match, i, BOND_MASK);
adcf00ba 365
91fc374a 366 add_pr_rule(bond, &match, member->ofp_port,
9df65060
VDA
367 &bond->hash[i].pr_rule);
368 }
6c932bc8 369 }
adcf00ba
AZ
370 }
371 }
372
9df65060
VDA
373 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
374
adcf00ba
AZ
375 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
376 int error;
adcf00ba
AZ
377 switch (pr_op->op) {
378 case ADD:
379 ofpbuf_clear(&ofpacts);
380 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
381 error = ofproto_dpif_add_internal_flow(bond->ofproto,
382 &pr_op->match,
290ad78a 383 RECIRC_RULE_PRIORITY, 0,
6c932bc8 384 &ofpacts, pr_op->pr_rule);
adcf00ba 385 if (error) {
50f96b10 386 char *err_s = match_to_string(&pr_op->match, NULL,
adcf00ba
AZ
387 RECIRC_RULE_PRIORITY);
388
389 VLOG_ERR("failed to add post recirculation flow %s", err_s);
390 free(err_s);
adcf00ba
AZ
391 }
392 break;
393
394 case DEL:
395 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
396 &pr_op->match,
397 RECIRC_RULE_PRIORITY);
398 if (error) {
50f96b10 399 char *err_s = match_to_string(&pr_op->match, NULL,
adcf00ba
AZ
400 RECIRC_RULE_PRIORITY);
401
402 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
403 free(err_s);
404 }
405
406 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
fa233667
YS
407 if (bond->hash) {
408 *pr_op->pr_rule = NULL;
409 }
adcf00ba
AZ
410 free(pr_op);
411 break;
412 }
413 }
414
415 ofpbuf_uninit(&ofpacts);
416}
417
05df1623
AZ
418static void
419update_recirc_rules(struct bond *bond)
420 OVS_REQ_RDLOCK(rwlock)
421{
422 update_recirc_rules__(bond);
423}
adcf00ba 424
f620b43a
BP
425/* Updates 'bond''s overall configuration to 's'.
426 *
91fc374a
BP
427 * The caller should register each member on 'bond' by calling
428 * bond_member_register(). This is optional if none of the members'
4d6fb5eb 429 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
430 *
431 * Returns true if the configuration has changed in such a way that requires
432 * flow revalidation.
433 * */
434bool
f620b43a
BP
435bond_reconfigure(struct bond *bond, const struct bond_settings *s)
436{
59d7b2b6
EJ
437 bool revalidate = false;
438
3bfd3972 439 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
440 if (!bond->name || strcmp(bond->name, s->name)) {
441 if (bond->name) {
3bfd3972 442 hmap_remove(all_bonds, &bond->hmap_node);
f620b43a
BP
443 free(bond->name);
444 }
445 bond->name = xstrdup(s->name);
3bfd3972 446 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
f620b43a
BP
447 }
448
f620b43a
BP
449 bond->updelay = s->up_delay;
450 bond->downdelay = s->down_delay;
bc1b010c 451
9dd165e0
RK
452 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
453 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
454 revalidate = true;
455 }
456
bc1b010c
EJ
457 if (bond->rebalance_interval != s->rebalance_interval) {
458 bond->rebalance_interval = s->rebalance_interval;
459 revalidate = true;
460 }
f620b43a 461
59d7b2b6
EJ
462 if (bond->balance != s->balance) {
463 bond->balance = s->balance;
464 revalidate = true;
465 }
466
672d18b2
EJ
467 if (bond->basis != s->basis) {
468 bond->basis = s->basis;
469 revalidate = true;
470 }
471
62904702
EJ
472 if (bond->bond_revalidate) {
473 revalidate = true;
474 bond->bond_revalidate = false;
475 }
476
b4e50218
JS
477 if (!nullable_string_is_equal(bond->primary, s->primary)) {
478 free(bond->primary);
479 bond->primary = nullable_xstrdup(s->primary);
480 revalidate = true;
481 }
482
adcf00ba
AZ
483 if (bond->balance != BM_AB) {
484 if (!bond->recirc_id) {
e672ff9b 485 bond->recirc_id = recirc_alloc_id(bond->ofproto);
adcf00ba
AZ
486 }
487 } else if (bond->recirc_id) {
9df65060
VDA
488 if (bond_use_lb_output_action(bond)) {
489 /* Delete bond buckets from datapath if installed. */
490 bond_del_lb_output_buckets(bond);
491 }
e672ff9b 492 recirc_free_id(bond->recirc_id);
adcf00ba
AZ
493 bond->recirc_id = 0;
494 }
9df65060
VDA
495 if (bond->use_lb_output_action != s->use_lb_output_action) {
496 if (s->use_lb_output_action &&
497 !ovs_lb_output_action_supported(bond->ofproto)) {
498 VLOG_WARN("%s: Datapath does not support 'lb_output' action, "
499 "disabled.", bond->name);
500 } else {
501 bond->use_lb_output_action = s->use_lb_output_action;
82a106eb
AM
502 if (!bond->use_lb_output_action) {
503 bond_del_lb_output_buckets(bond);
504 }
9df65060
VDA
505 revalidate = true;
506 }
507 }
adcf00ba 508
95aafb2a
EJ
509 if (bond->balance == BM_AB || !bond->hash || revalidate) {
510 bond_entry_reset(bond);
511 }
512
3bfd3972 513 ovs_rwlock_unlock(&rwlock);
59d7b2b6 514 return revalidate;
f620b43a
BP
515}
516
91fc374a
BP
517static struct bond_member *
518bond_find_member_by_mac(const struct bond *bond, const struct eth_addr mac)
3e5aeeb5 519{
91fc374a 520 struct bond_member *member;
3e5aeeb5 521
91fc374a
BP
522 /* Find the last active member */
523 HMAP_FOR_EACH (member, hmap_node, &bond->members) {
524 struct eth_addr member_mac;
3e5aeeb5 525
91fc374a 526 if (netdev_get_etheraddr(member->netdev, &member_mac)) {
3e5aeeb5
AZ
527 continue;
528 }
529
91fc374a
BP
530 if (eth_addr_equals(member_mac, mac)) {
531 return member;
3e5aeeb5
AZ
532 }
533 }
534
535 return NULL;
536}
537
538static void
91fc374a 539bond_active_member_changed(struct bond *bond)
3e5aeeb5 540{
91fc374a 541 if (bond->active_member) {
f626af7a 542 struct eth_addr mac;
91fc374a
BP
543 netdev_get_etheraddr(bond->active_member->netdev, &mac);
544 bond->active_member_mac = mac;
f626af7a 545 } else {
91fc374a 546 bond->active_member_mac = eth_addr_zero;
f626af7a 547 }
91fc374a 548 bond->active_member_changed = true;
3e5aeeb5
AZ
549 seq_change(connectivity_seq_get());
550}
551
f8ddccd2 552static void
91fc374a 553bond_member_set_netdev__(struct bond_member *member, struct netdev *netdev)
3bfd3972 554 OVS_REQ_WRLOCK(rwlock)
f8ddccd2 555{
91fc374a
BP
556 if (member->netdev != netdev) {
557 member->netdev = netdev;
558 member->change_seq = 0;
f8ddccd2
BP
559 }
560}
561
91fc374a
BP
562/* Registers 'member_' as a member interface of 'bond'. The 'member_' pointer
563 * is an arbitrary client-provided pointer that uniquely identifies a member
564 * within a bond. If 'member_' already exists within 'bond' then this function
565 * reconfigures the existing member.
f620b43a 566 *
91fc374a 567 * 'netdev' must be the network device that 'member_' represents. It is owned
f620b43a 568 * by the client, so the client must not close it before either unregistering
91fc374a 569 * 'member_' or destroying 'bond'.
4d6fb5eb 570 */
f620b43a 571void
91fc374a
BP
572bond_member_register(struct bond *bond, void *member_,
573 ofp_port_t ofport, struct netdev *netdev)
f620b43a 574{
91fc374a 575 struct bond_member *member;
f620b43a 576
3bfd3972 577 ovs_rwlock_wrlock(&rwlock);
91fc374a
BP
578 member = bond_member_lookup(bond, member_);
579 if (!member) {
580 member = xzalloc(sizeof *member);
581
582 hmap_insert(&bond->members, &member->hmap_node, hash_pointer(member_, 0));
583 member->bond = bond;
584 member->aux = member_;
585 member->ofp_port = ofport;
586 member->delay_expires = LLONG_MAX;
587 member->name = xstrdup(netdev_get_name(netdev));
7321e30e 588 bond->bond_revalidate = true;
244b2160 589
91fc374a
BP
590 member->enabled = false;
591 bond_enable_member(member, netdev_get_carrier(netdev));
f620b43a
BP
592 }
593
91fc374a 594 bond_member_set_netdev__(member, netdev);
a6934aa9 595
91fc374a
BP
596 free(member->name);
597 member->name = xstrdup(netdev_get_name(netdev));
598 if (bond->primary && !strcmp(bond->primary, member->name)) {
599 member->is_primary = true;
b4e50218 600 } else {
91fc374a 601 member->is_primary = false;
b4e50218 602 }
3bfd3972 603 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
604}
605
91fc374a 606/* Updates the network device to be used with 'member_' to 'netdev'.
f8ddccd2
BP
607 *
608 * This is useful if the caller closes and re-opens the network device
91fc374a 609 * registered with bond_member_register() but doesn't need to change anything
f8ddccd2
BP
610 * else. */
611void
91fc374a 612bond_member_set_netdev(struct bond *bond, void *member_, struct netdev *netdev)
f8ddccd2 613{
91fc374a 614 struct bond_member *member;
3bfd3972
EJ
615
616 ovs_rwlock_wrlock(&rwlock);
91fc374a
BP
617 member = bond_member_lookup(bond, member_);
618 if (member) {
619 bond_member_set_netdev__(member, netdev);
f8ddccd2 620 }
3bfd3972 621 ovs_rwlock_unlock(&rwlock);
f8ddccd2
BP
622}
623
91fc374a
BP
624/* Unregisters 'member_' from 'bond'. If 'bond' does not contain such a
625 * member then this function has no effect.
f620b43a 626 *
91fc374a 627 * Unregistering a member invalidates all flows. */
f620b43a 628void
91fc374a 629bond_member_unregister(struct bond *bond, const void *member_)
f620b43a 630{
91fc374a 631 struct bond_member *member;
f620b43a
BP
632 bool del_active;
633
3bfd3972 634 ovs_rwlock_wrlock(&rwlock);
91fc374a
BP
635 member = bond_member_lookup(bond, member_);
636 if (!member) {
3bfd3972 637 goto out;
f620b43a
BP
638 }
639
4a1b8f30 640 bond->bond_revalidate = true;
91fc374a 641 bond_enable_member(member, false);
b3c18f66 642
91fc374a 643 del_active = bond->active_member == member;
f620b43a
BP
644 if (bond->hash) {
645 struct bond_entry *e;
646 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
91fc374a
BP
647 if (e->member == member) {
648 e->member = NULL;
f620b43a
BP
649 }
650 }
651 }
652
91fc374a 653 free(member->name);
f620b43a 654
91fc374a
BP
655 hmap_remove(&bond->members, &member->hmap_node);
656 /* Client owns 'member->netdev'. */
657 free(member);
f620b43a
BP
658
659 if (del_active) {
91fc374a 660 bond_choose_active_member(bond);
f620b43a
BP
661 bond->send_learning_packets = true;
662 }
3bfd3972
EJ
663out:
664 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
665}
666
91fc374a
BP
667/* Should be called on each member in 'bond' before bond_run() to indicate
668 * whether or not 'member_' may be enabled. This function is intended to allow
296f6519 669 * other protocols to have some impact on bonding decisions. For example LACP
91fc374a
BP
670 * or high level link monitoring protocols may decide that a given member
671 * should not be able to send traffic. */
4d6fb5eb 672void
91fc374a 673bond_member_set_may_enable(struct bond *bond, void *member_, bool may_enable)
4d6fb5eb 674{
3bfd3972 675 ovs_rwlock_wrlock(&rwlock);
91fc374a 676 bond_member_lookup(bond, member_)->may_enable = may_enable;
3bfd3972 677 ovs_rwlock_unlock(&rwlock);
4d6fb5eb
EJ
678}
679
4a1b8f30
EJ
680/* Performs periodic maintenance on 'bond'.
681 *
682 * Returns true if the caller should revalidate its flows.
f620b43a
BP
683 *
684 * The caller should check bond_should_send_learning_packets() afterward. */
4a1b8f30
EJ
685bool
686bond_run(struct bond *bond, enum lacp_status lacp_status)
f620b43a 687{
91fc374a 688 struct bond_member *member, *primary;
4a1b8f30 689 bool revalidate;
f620b43a 690
3bfd3972 691 ovs_rwlock_wrlock(&rwlock);
bdebeece
EJ
692 if (bond->lacp_status != lacp_status) {
693 bond->lacp_status = lacp_status;
4592d0e2 694 bond->bond_revalidate = true;
90061ea7
BP
695
696 /* Change in LACP status can affect whether the bond is falling back to
697 * active-backup. Make sure to create or destroy buckets if
698 * necessary. */
699 if (bond_is_falling_back_to_ab(bond) || !bond->hash) {
700 bond_entry_reset(bond);
701 }
4592d0e2 702 }
4d6fb5eb 703
91fc374a 704 /* Enable members based on link status and LACP feedback. */
b4e50218 705 primary = NULL;
91fc374a
BP
706 HMAP_FOR_EACH (member, hmap_node, &bond->members) {
707 bond_link_status_update(member);
708 member->change_seq = seq_read(connectivity_seq_get());
b4e50218 709
91fc374a
BP
710 /* Discover if there is an active member marked 'primary'. */
711 if (bond->balance == BM_AB && member->is_primary && member->enabled) {
712 primary = member;
b4e50218 713 }
f620b43a 714 }
b4e50218 715
91fc374a
BP
716 if (!bond->active_member || !bond->active_member->enabled ||
717 (primary && bond->active_member != primary)) {
718 bond_choose_active_member(bond);
f620b43a
BP
719 }
720
4a1b8f30
EJ
721 revalidate = bond->bond_revalidate;
722 bond->bond_revalidate = false;
3bfd3972 723 ovs_rwlock_unlock(&rwlock);
4a1b8f30
EJ
724
725 return revalidate;
f620b43a
BP
726}
727
728/* Causes poll_block() to wake up when 'bond' needs something to be done. */
729void
730bond_wait(struct bond *bond)
731{
91fc374a 732 struct bond_member *member;
f620b43a 733
3bfd3972 734 ovs_rwlock_rdlock(&rwlock);
91fc374a
BP
735 HMAP_FOR_EACH (member, hmap_node, &bond->members) {
736 if (member->delay_expires != LLONG_MAX) {
737 poll_timer_wait_until(member->delay_expires);
f620b43a 738 }
1ea24138 739
91fc374a 740 seq_wait(connectivity_seq_get(), member->change_seq);
f620b43a
BP
741 }
742
bbc13389 743 if (bond->bond_revalidate) {
f620b43a
BP
744 poll_immediate_wake();
745 }
3bfd3972 746 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
747
748 /* We don't wait for bond->next_rebalance because rebalancing can only run
749 * at a flow account checkpoint. ofproto does checkpointing on its own
750 * schedule and bond_rebalance() gets called afterward, so we'd just be
751 * waking up for no purpose. */
752}
753\f
754/* MAC learning table interaction. */
755
756static bool
757may_send_learning_packets(const struct bond *bond)
758{
9dd165e0
RK
759 return ((bond->lacp_status == LACP_DISABLED
760 && (bond->balance == BM_SLB || bond->balance == BM_AB))
761 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
91fc374a 762 && bond->active_member;
f620b43a
BP
763}
764
765/* Returns true if 'bond' needs the client to send out packets to assist with
766 * MAC learning on 'bond'. If this function returns true, then the client
767 * should iterate through its MAC learning table for the bridge on which 'bond'
768 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 769 * it should call bond_compose_learning_packet().
f620b43a 770 *
477879ea
BP
771 * This function will only return true if 'bond' is in SLB or active-backup
772 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
773 * necessary.
f620b43a
BP
774 *
775 * Calling this function resets the state that it checks. */
776bool
777bond_should_send_learning_packets(struct bond *bond)
778{
3bfd3972
EJ
779 bool send;
780
781 ovs_rwlock_wrlock(&rwlock);
782 send = bond->send_learning_packets && may_send_learning_packets(bond);
f620b43a 783 bond->send_learning_packets = false;
3bfd3972 784 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
785 return send;
786}
787
788/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
789 *
ea131871
JG
790 * See bond_should_send_learning_packets() for description of usage. The
791 * caller should send the composed packet on the port associated with
792 * port_aux and takes ownership of the returned ofpbuf. */
cf62fa4c 793struct dp_packet *
74ff3298 794bond_compose_learning_packet(struct bond *bond, const struct eth_addr eth_src,
ea131871 795 uint16_t vlan, void **port_aux)
f620b43a 796{
91fc374a 797 struct bond_member *member;
cf62fa4c 798 struct dp_packet *packet;
f620b43a 799 struct flow flow;
f620b43a 800
3bfd3972 801 ovs_rwlock_rdlock(&rwlock);
cb22974d 802 ovs_assert(may_send_learning_packets(bond));
f620b43a 803 memset(&flow, 0, sizeof flow);
74ff3298 804 flow.dl_src = eth_src;
91fc374a 805 member = choose_output_member(bond, &flow, NULL, vlan);
f620b43a 806
cf62fa4c 807 packet = dp_packet_new(0);
2ea838ac 808 compose_rarp(packet, eth_src);
f620b43a 809 if (vlan) {
1bf02876 810 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
f620b43a 811 }
f620b43a 812
91fc374a 813 *port_aux = member->aux;
3bfd3972 814 ovs_rwlock_unlock(&rwlock);
ea131871 815 return packet;
f620b43a
BP
816}
817\f
90061ea7
BP
818
819static bool
820bond_is_falling_back_to_ab(const struct bond *bond)
821{
822 return (bond->lacp_fallback_ab
823 && (bond->balance == BM_SLB || bond->balance == BM_TCP)
824 && bond->lacp_status == LACP_CONFIGURED);
825}
826
91fc374a 827/* Checks whether a packet that arrived on 'member_' within 'bond', with an
f620b43a
BP
828 * Ethernet destination address of 'eth_dst', should be admitted.
829 *
830 * The return value is one of the following:
831 *
832 * - BV_ACCEPT: Admit the packet.
833 *
834 * - BV_DROP: Drop the packet.
835 *
836 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
837 * Ethernet source address and VLAN. If there is none, or if the packet
838 * is on the learned port, then admit the packet. If a different port has
839 * been learned, however, drop the packet (and do not use it for MAC
840 * learning).
841 */
842enum bond_verdict
91fc374a 843bond_check_admissibility(struct bond *bond, const void *member_,
74ff3298 844 const struct eth_addr eth_dst)
f620b43a 845{
3bfd3972 846 enum bond_verdict verdict = BV_DROP;
91fc374a 847 struct bond_member *member;
a8448cb1 848 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
9a1c6450 849
3bfd3972 850 ovs_rwlock_rdlock(&rwlock);
91fc374a
BP
851 member = bond_member_lookup(bond, member_);
852 if (!member) {
3bfd3972 853 goto out;
4222bbc8
EJ
854 }
855
9a1c6450
EJ
856 /* LACP bonds have very loose admissibility restrictions because we can
857 * assume the remote switch is aware of the bond and will "do the right
91fc374a 858 * thing". However, as a precaution we drop packets on disabled members
9a1c6450 859 * because no correctly implemented partner switch should be sending
bdebeece
EJ
860 * packets to them.
861 *
862 * If LACP is configured, but LACP negotiations have been unsuccessful, we
9dd165e0 863 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
bdebeece 864 switch (bond->lacp_status) {
3bfd3972 865 case LACP_NEGOTIATED:
91fc374a 866 /* To reduce packet-drops due to delay in enabling of member (post
a8448cb1
NK
867 * LACP-SYNC), from main thread, check for may_enable as well.
868 * When may_enable is TRUE, it means LACP is UP and waiting for the
91fc374a
BP
869 * main thread to run LACP state machine and enable the member. */
870 verdict = (member->enabled || member->may_enable) ? BV_ACCEPT : BV_DROP;
871 if (!member->enabled && member->may_enable) {
872 VLOG_DBG_RL(&rl, "bond %s: member %s: "
873 "main thread has not yet enabled member",
874 bond->name, bond->active_member->name);
423416f5 875 }
3bfd3972
EJ
876 goto out;
877 case LACP_CONFIGURED:
9dd165e0
RK
878 if (!bond->lacp_fallback_ab) {
879 goto out;
880 }
e5c4f827 881 break;
3bfd3972 882 case LACP_DISABLED:
e5c4f827 883 if (bond->balance == BM_TCP) {
884 goto out;
885 }
3bfd3972 886 break;
f620b43a
BP
887 }
888
91fc374a 889 /* Drop all multicast packets on inactive members. */
f620b43a 890 if (eth_addr_is_multicast(eth_dst)) {
91fc374a 891 if (bond->active_member != member) {
3bfd3972 892 goto out;
f620b43a
BP
893 }
894 }
895
f931a4c9 896 switch (bond->balance) {
9dd165e0
RK
897 case BM_TCP:
898 /* TCP balanced bonds require successful LACP negotiations. Based on the
899 * above check, LACP is off or lacp_fallback_ab is true on this bond.
900 * If lacp_fallback_ab is true fall through to BM_AB case else, we
901 * drop all incoming traffic. */
902 if (!bond->lacp_fallback_ab) {
903 goto out;
904 }
73c7216a 905 /* fall through */
9dd165e0 906
f931a4c9 907 case BM_AB:
91fc374a 908 /* Drop all packets which arrive on backup members. This is similar to
f931a4c9 909 * how Linux bonding handles active-backup bonds. */
91fc374a 910 if (bond->active_member != member) {
e6b2255c 911 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
91fc374a
BP
912 " member (%s) destined for " ETH_ADDR_FMT,
913 member->name, ETH_ADDR_ARGS(eth_dst));
3bfd3972 914 goto out;
7ba7dcf0 915 }
3bfd3972
EJ
916 verdict = BV_ACCEPT;
917 goto out;
f931a4c9 918
f931a4c9
BP
919 case BM_SLB:
920 /* Drop all packets for which we have learned a different input port,
91fc374a 921 * because we probably sent the packet on one member and got it back on
f931a4c9
BP
922 * the other. Gratuitous ARP packets are an exception to this rule:
923 * the host has moved to another switch. The exception to the
924 * exception is if we locked the learning table to avoid reflections on
91fc374a 925 * bond members. */
3bfd3972
EJ
926 verdict = BV_DROP_IF_MOVED;
927 goto out;
7ba7dcf0
EJ
928 }
929
428b2edd 930 OVS_NOT_REACHED();
3bfd3972 931out:
91fc374a
BP
932 if (member && (verdict != BV_ACCEPT)) {
933 VLOG_DBG_RL(&rl, "member (%s): "
934 "Admissibility verdict is to drop pkt %s."
935 "active member: %s, may_enable: %s enable: %s "
a8448cb1 936 "LACP status:%d",
91fc374a 937 member->name,
a8448cb1
NK
938 (verdict == BV_DROP_IF_MOVED) ?
939 "as different port is learned" : "",
91fc374a
BP
940 (bond->active_member == member) ? "true" : "false",
941 member->may_enable ? "true" : "false",
942 member->enabled ? "true" : "false",
a8448cb1
NK
943 bond->lacp_status);
944 }
945
3bfd3972
EJ
946 ovs_rwlock_unlock(&rwlock);
947 return verdict;
948
f620b43a
BP
949}
950
91fc374a
BP
951/* Returns the member (registered on 'bond' by bond_member_register()) to which
952 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns NULL
953 * if the packet should be dropped because no members are enabled.
f620b43a
BP
954 *
955 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
956 * should be a VID only (i.e. excluding the PCP bits). Second,
957 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
958 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
959 * packet belongs to (so for an access port it will be the access port's VLAN).
960 *
bcd2633a
JP
961 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
962 * significant in the selection. At some point earlier, 'wc' should
963 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
964 */
965void *
91fc374a
BP
966bond_choose_output_member(struct bond *bond, const struct flow *flow,
967 struct flow_wildcards *wc, uint16_t vlan)
f620b43a 968{
91fc374a 969 struct bond_member *member;
b5d5d7d3 970 void *aux;
3bfd3972
EJ
971
972 ovs_rwlock_rdlock(&rwlock);
91fc374a
BP
973 member = choose_output_member(bond, flow, wc, vlan);
974 aux = member ? member->aux : NULL;
3bfd3972 975 ovs_rwlock_unlock(&rwlock);
b5d5d7d3
AW
976
977 return aux;
f620b43a 978}
f620b43a 979\f
adcf00ba
AZ
980/* Recirculation. */
981static void
982bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
c6855ec5 983 OVS_REQ_WRLOCK(rwlock)
adcf00ba 984{
91fc374a 985 if (entry->member) {
adcf00ba
AZ
986 uint64_t delta;
987
988 delta = rule_tx_bytes - entry->pr_tx_bytes;
989 entry->tx_bytes += delta;
990 entry->pr_tx_bytes = rule_tx_bytes;
991 }
992}
993
994/* Maintain bond stats using post recirculation rule byte counters.*/
60cda7d6 995static void
adcf00ba 996bond_recirculation_account(struct bond *bond)
80316557 997 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
998{
999 int i;
9df65060
VDA
1000 uint64_t n_bytes[BOND_BUCKETS];
1001 bool use_lb_output_action = bond_use_lb_output_action(bond);
1002
1003 if (use_lb_output_action) {
1004 /* Retrieve bond stats from datapath. */
1005 dpif_bond_stats_get(bond->ofproto->backer->dpif,
1006 bond->recirc_id, n_bytes);
1007 }
adcf00ba 1008
adcf00ba
AZ
1009 for (i=0; i<=BOND_MASK; i++) {
1010 struct bond_entry *entry = &bond->hash[i];
1011 struct rule *rule = entry->pr_rule;
9df65060 1012 struct pkt_stats stats;
adcf00ba 1013
9df65060
VDA
1014 if (use_lb_output_action) {
1015 stats.n_bytes = n_bytes[i];
1016 } else if (rule) {
adcf00ba 1017 long long int used OVS_UNUSED;
adcf00ba
AZ
1018
1019 rule->ofproto->ofproto_class->rule_get_stats(
16441315 1020 rule, &stats, &used);
9df65060
VDA
1021 } else {
1022 continue;
adcf00ba 1023 }
9df65060 1024 bond_entry_account(entry, stats.n_bytes);
adcf00ba 1025 }
adcf00ba
AZ
1026}
1027
a80aba3a 1028static bool
6b95d23c 1029bond_may_recirc(const struct bond *bond)
adcf00ba 1030{
90061ea7
BP
1031 return (bond->balance == BM_TCP && bond->recirc_id
1032 && !bond_is_falling_back_to_ab(bond));
adcf00ba
AZ
1033}
1034
ca8127fd
AZ
1035static void
1036bond_update_post_recirc_rules__(struct bond* bond, const bool force)
1037 OVS_REQ_WRLOCK(rwlock)
adcf00ba
AZ
1038{
1039 struct bond_entry *e;
1040 bool update_rules = force; /* Always update rules if caller forces it. */
1041
1042 /* Make sure all bond entries are populated */
1043 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
91fc374a 1044 if (!e->member || !e->member->enabled) {
adcf00ba 1045 update_rules = true;
91fc374a
BP
1046 e->member = CONTAINER_OF(hmap_random_node(&bond->members),
1047 struct bond_member, hmap_node);
1048 if (!e->member->enabled) {
1049 e->member = bond->active_member;
adcf00ba
AZ
1050 }
1051 }
1052 }
1053
1054 if (update_rules) {
1055 update_recirc_rules(bond);
1056 }
1057}
ca8127fd
AZ
1058
1059void
82f9f1f5
AZ
1060bond_update_post_recirc_rules(struct bond *bond, uint32_t *recirc_id,
1061 uint32_t *hash_basis)
ca8127fd 1062{
a80aba3a
AZ
1063 bool may_recirc = bond_may_recirc(bond);
1064
1065 if (may_recirc) {
1066 /* To avoid unnecessary locking, bond_may_recirc() is first
1067 * called outside of the 'rwlock'. After acquiring the lock,
1068 * check again to make sure bond configuration has not been changed. */
1069 ovs_rwlock_wrlock(&rwlock);
1070 may_recirc = bond_may_recirc(bond);
1071 if (may_recirc) {
1072 *recirc_id = bond->recirc_id;
1073 *hash_basis = bond->basis;
1074 bond_update_post_recirc_rules__(bond, false);
1075 }
1076 ovs_rwlock_unlock(&rwlock);
1077 }
1078
1079 if (!may_recirc) {
6b95d23c 1080 *recirc_id = *hash_basis = 0;
82f9f1f5 1081 }
ca8127fd 1082}
82f9f1f5 1083
adcf00ba 1084\f
f620b43a
BP
1085/* Rebalancing. */
1086
1b137691 1087static bool
3bfd3972 1088bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
1b137691 1089{
bc1b010c 1090 return bond->rebalance_interval
90061ea7
BP
1091 && (bond->balance == BM_SLB || bond->balance == BM_TCP)
1092 && !(bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED);
1b137691
EJ
1093}
1094
f620b43a
BP
1095/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
1096void
1097bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
1098 uint64_t n_bytes)
1099{
3bfd3972 1100 ovs_rwlock_wrlock(&rwlock);
1b137691 1101 if (bond_is_balanced(bond)) {
f620b43a 1102 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a 1103 }
3bfd3972 1104 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1105}
1106
91fc374a
BP
1107static struct bond_member *
1108bond_member_from_bal_node(struct ovs_list *bal) OVS_REQ_RDLOCK(rwlock)
f620b43a 1109{
91fc374a 1110 return CONTAINER_OF(bal, struct bond_member, bal_node);
f620b43a
BP
1111}
1112
1113static void
ca6ba700 1114log_bals(struct bond *bond, const struct ovs_list *bals)
c6855ec5 1115 OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
1116{
1117 if (VLOG_IS_DBG_ENABLED()) {
1118 struct ds ds = DS_EMPTY_INITIALIZER;
91fc374a 1119 const struct bond_member *member;
f620b43a 1120
91fc374a 1121 LIST_FOR_EACH (member, bal_node, bals) {
f620b43a
BP
1122 if (ds.length) {
1123 ds_put_char(&ds, ',');
1124 }
1125 ds_put_format(&ds, " %s %"PRIu64"kB",
91fc374a 1126 member->name, member->tx_bytes / 1024);
f620b43a 1127
91fc374a 1128 if (!member->enabled) {
f620b43a
BP
1129 ds_put_cstr(&ds, " (disabled)");
1130 }
91fc374a 1131 if (!ovs_list_is_empty(&member->entries)) {
f620b43a
BP
1132 struct bond_entry *e;
1133
1134 ds_put_cstr(&ds, " (");
91fc374a
BP
1135 LIST_FOR_EACH (e, list_node, &member->entries) {
1136 if (&e->list_node != ovs_list_front(&member->entries)) {
f620b43a
BP
1137 ds_put_cstr(&ds, " + ");
1138 }
34582733 1139 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
f620b43a
BP
1140 e - bond->hash, e->tx_bytes / 1024);
1141 }
1142 ds_put_cstr(&ds, ")");
1143 }
1144 }
1145 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
1146 ds_destroy(&ds);
1147 }
1148}
1149
91fc374a 1150/* Shifts 'hash' from its current member to 'to'. */
f620b43a 1151static void
91fc374a 1152bond_shift_load(struct bond_entry *hash, struct bond_member *to)
c6855ec5 1153 OVS_REQ_WRLOCK(rwlock)
f620b43a 1154{
91fc374a 1155 struct bond_member *from = hash->member;
f620b43a
BP
1156 struct bond *bond = from->bond;
1157 uint64_t delta = hash->tx_bytes;
1158
34582733 1159 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
f620b43a
BP
1160 "from %s to %s (now carrying %"PRIu64"kB and "
1161 "%"PRIu64"kB load, respectively)",
1162 bond->name, delta / 1024, hash - bond->hash,
1163 from->name, to->name,
1164 (from->tx_bytes - delta) / 1024,
1165 (to->tx_bytes + delta) / 1024);
1166
1167 /* Shift load away from 'from' to 'to'. */
1168 from->tx_bytes -= delta;
1169 to->tx_bytes += delta;
1170
1171 /* Arrange for flows to be revalidated. */
91fc374a 1172 hash->member = to;
4a1b8f30 1173 bond->bond_revalidate = true;
f620b43a
BP
1174}
1175
09a5d390 1176/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
91fc374a
BP
1177 * loaded bond member) to a bond member that has 'to_tx_bytes' bytes of load,
1178 * given that doing so must decrease the ratio of the load on the two members
1179 * by at least 0.1. Returns NULL if there is no appropriate entry.
f620b43a
BP
1180 *
1181 * The list of entries isn't sorted. I don't know of a reason to prefer to
1182 * shift away small hashes or large hashes. */
1183static struct bond_entry *
91fc374a 1184choose_entry_to_migrate(const struct bond_member *from, uint64_t to_tx_bytes)
c6855ec5 1185 OVS_REQ_WRLOCK(rwlock)
f620b43a
BP
1186{
1187 struct bond_entry *e;
1188
417e7e66 1189 if (ovs_list_is_short(&from->entries)) {
f620b43a
BP
1190 /* 'from' carries no more than one MAC hash, so shifting load away from
1191 * it would be pointless. */
1192 return NULL;
1193 }
1194
1195 LIST_FOR_EACH (e, list_node, &from->entries) {
c460a6a7
AZ
1196 uint64_t delta = e->tx_bytes; /* The amount to rebalance. */
1197 uint64_t ideal_tx_bytes = (from->tx_bytes + to_tx_bytes)/2;
1198 /* Note, the ideal traffic is the mid point
1199 * between 'from' and 'to'. This value does
1200 * not change by rebalancing. */
1201 uint64_t new_low; /* The lower bandwidth between 'to' and 'from'
1202 after rebalancing. */
1203
1204 new_low = MIN(from->tx_bytes - delta, to_tx_bytes + delta);
1205
1206 if ((new_low > to_tx_bytes) &&
1207 (new_low - to_tx_bytes >= (ideal_tx_bytes - to_tx_bytes) / 10)) {
1208 /* Only rebalance if the new 'low' is closer to to the mid point,
1209 * and the improvement exceeds 10% of current traffic
1210 * deviation from the ideal split.
1211 *
1212 * The improvement on the 'high' side is always the same as the
1213 * 'low' side. Thus consider 'low' side is sufficient. */
f620b43a
BP
1214 return e;
1215 }
1216 }
1217
1218 return NULL;
1219}
1220
91fc374a 1221/* Inserts 'member' into 'bals' so that descending order of 'tx_bytes' is
f620b43a
BP
1222 * maintained. */
1223static void
91fc374a 1224insert_bal(struct ovs_list *bals, struct bond_member *member)
f620b43a 1225{
91fc374a 1226 struct bond_member *pos;
f620b43a
BP
1227
1228 LIST_FOR_EACH (pos, bal_node, bals) {
91fc374a 1229 if (member->tx_bytes > pos->tx_bytes) {
f620b43a
BP
1230 break;
1231 }
1232 }
91fc374a 1233 ovs_list_insert(&pos->bal_node, &member->bal_node);
f620b43a
BP
1234}
1235
91fc374a 1236/* Removes 'member' from its current list and then inserts it into 'bals' so
f620b43a
BP
1237 * that descending order of 'tx_bytes' is maintained. */
1238static void
91fc374a 1239reinsert_bal(struct ovs_list *bals, struct bond_member *member)
f620b43a 1240{
91fc374a
BP
1241 ovs_list_remove(&member->bal_node);
1242 insert_bal(bals, member);
f620b43a
BP
1243}
1244
1245/* If 'bond' needs rebalancing, does so.
1246 *
adcf00ba
AZ
1247 * The caller should have called bond_account() for each active flow, or in case
1248 * of recirculation is used, have called bond_recirculation_account(bond),
1249 * to ensure that flow data is consistently accounted at this point.
60cda7d6
AZ
1250 */
1251void
4a1b8f30 1252bond_rebalance(struct bond *bond)
f620b43a 1253{
91fc374a 1254 struct bond_member *member;
f620b43a 1255 struct bond_entry *e;
ca6ba700 1256 struct ovs_list bals;
adcf00ba 1257 bool rebalanced = false;
60cda7d6 1258 bool use_recirc;
f620b43a 1259
3bfd3972 1260 ovs_rwlock_wrlock(&rwlock);
1b137691 1261 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
adcf00ba 1262 goto done;
f620b43a
BP
1263 }
1264 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1265
88186383 1266 use_recirc = bond->ofproto->backer->rt_support.odp.recirc &&
6b95d23c 1267 bond_may_recirc(bond);
60cda7d6
AZ
1268
1269 if (use_recirc) {
1270 bond_recirculation_account(bond);
1271 }
1272
91fc374a
BP
1273 /* Add each bond_entry to its member's 'entries' list.
1274 * Compute each member's tx_bytes as the sum of its entries' tx_bytes. */
1275 HMAP_FOR_EACH (member, hmap_node, &bond->members) {
1276 member->tx_bytes = 0;
1277 ovs_list_init(&member->entries);
f620b43a
BP
1278 }
1279 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
91fc374a
BP
1280 if (e->member && e->tx_bytes) {
1281 e->member->tx_bytes += e->tx_bytes;
1282 ovs_list_push_back(&e->member->entries, &e->list_node);
f620b43a
BP
1283 }
1284 }
1285
91fc374a 1286 /* Add enabled members to 'bals' in descending order of tx_bytes.
f620b43a 1287 *
91fc374a 1288 * XXX This is O(n**2) in the number of members but it could be O(n lg n)
f620b43a 1289 * with a proper list sort algorithm. */
417e7e66 1290 ovs_list_init(&bals);
91fc374a
BP
1291 HMAP_FOR_EACH (member, hmap_node, &bond->members) {
1292 if (member->enabled) {
1293 insert_bal(&bals, member);
f620b43a
BP
1294 }
1295 }
1296 log_bals(bond, &bals);
1297
91fc374a 1298 /* Shift load from the most-loaded members to the least-loaded members. */
417e7e66 1299 while (!ovs_list_is_short(&bals)) {
91fc374a
BP
1300 struct bond_member *from
1301 = bond_member_from_bal_node(ovs_list_front(&bals));
1302 struct bond_member *to
1303 = bond_member_from_bal_node(ovs_list_back(&bals));
f620b43a
BP
1304 uint64_t overload;
1305
1306 overload = from->tx_bytes - to->tx_bytes;
1307 if (overload < to->tx_bytes >> 5 || overload < 100000) {
91fc374a
BP
1308 /* The extra load on 'from' (and all less-loaded members), compared
1309 * to that of 'to' (the least-loaded member), is less than ~3%, or
f620b43a
BP
1310 * it is less than ~1Mbps. No point in rebalancing. */
1311 break;
1312 }
1313
09a5d390
BP
1314 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1315 * to move from 'from' to 'to'. */
f620b43a
BP
1316 e = choose_entry_to_migrate(from, to->tx_bytes);
1317 if (e) {
4a1b8f30 1318 bond_shift_load(e, to);
f620b43a
BP
1319
1320 /* Delete element from from->entries.
1321 *
1322 * We don't add the element to to->hashes. That would only allow
91fc374a 1323 * 'e' to be migrated to another member in this rebalancing run, and
f620b43a 1324 * there is no point in doing that. */
417e7e66 1325 ovs_list_remove(&e->list_node);
f620b43a
BP
1326
1327 /* Re-sort 'bals'. */
1328 reinsert_bal(&bals, from);
1329 reinsert_bal(&bals, to);
60cda7d6 1330 rebalanced = true;
f620b43a
BP
1331 } else {
1332 /* Can't usefully migrate anything away from 'from'.
1333 * Don't reconsider it. */
417e7e66 1334 ovs_list_remove(&from->bal_node);
f620b43a
BP
1335 }
1336 }
1337
1338 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1339 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1340 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1341 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1342 e->tx_bytes /= 2;
f620b43a 1343 }
adcf00ba 1344
60cda7d6 1345 if (use_recirc && rebalanced) {
ca8127fd 1346 bond_update_post_recirc_rules__(bond,true);
60cda7d6 1347 }
2f486d4c
AZ
1348
1349done:
3bfd3972 1350 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1351}
1352\f
1353/* Bonding unixctl user interface functions. */
1354
1355static struct bond *
3bfd3972 1356bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
1357{
1358 struct bond *bond;
1359
1360 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
3bfd3972 1361 all_bonds) {
f620b43a
BP
1362 if (!strcmp(bond->name, name)) {
1363 return bond;
1364 }
1365 }
1366 return NULL;
1367}
1368
91fc374a
BP
1369static struct bond_member *
1370bond_lookup_member(struct bond *bond, const char *member_name)
f620b43a 1371{
91fc374a 1372 struct bond_member *member;
f620b43a 1373
91fc374a
BP
1374 HMAP_FOR_EACH (member, hmap_node, &bond->members) {
1375 if (!strcmp(member->name, member_name)) {
1376 return member;
f620b43a
BP
1377 }
1378 }
1379 return NULL;
1380}
1381
1382static void
1383bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
1384 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1385 void *aux OVS_UNUSED)
f620b43a
BP
1386{
1387 struct ds ds = DS_EMPTY_INITIALIZER;
1388 const struct bond *bond;
1389
91fc374a 1390 ds_put_cstr(&ds, "bond\ttype\trecircID\tmembers\n");
f620b43a 1391
3bfd3972
EJ
1392 ovs_rwlock_rdlock(&rwlock);
1393 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
91fc374a 1394 const struct bond_member *member;
f620b43a
BP
1395 size_t i;
1396
adcf00ba
AZ
1397 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1398 bond_mode_to_string(bond->balance), bond->recirc_id);
f620b43a
BP
1399
1400 i = 0;
91fc374a 1401 HMAP_FOR_EACH (member, hmap_node, &bond->members) {
f620b43a
BP
1402 if (i++ > 0) {
1403 ds_put_cstr(&ds, ", ");
1404 }
91fc374a 1405 ds_put_cstr(&ds, member->name);
f620b43a
BP
1406 }
1407 ds_put_char(&ds, '\n');
1408 }
3bfd3972 1409 ovs_rwlock_unlock(&rwlock);
bde9f75d 1410 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
1411 ds_destroy(&ds);
1412}
1413
1414static void
c33a8a25 1415bond_print_details(struct ds *ds, const struct bond *bond)
3bfd3972 1416 OVS_REQ_RDLOCK(rwlock)
f620b43a 1417{
91fc374a
BP
1418 struct shash member_shash = SHASH_INITIALIZER(&member_shash);
1419 const struct shash_node **sorted_members = NULL;
1420 const struct bond_member *member;
9df65060 1421 bool use_lb_output_action;
adcf00ba
AZ
1422 bool may_recirc;
1423 uint32_t recirc_id;
fc1d4f01 1424 int i;
f620b43a 1425
c33a8a25
EJ
1426 ds_put_format(ds, "---- %s ----\n", bond->name);
1427 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
1428 bond_mode_to_string(bond->balance));
1429
6b95d23c
AZ
1430 may_recirc = bond_may_recirc(bond);
1431 recirc_id = bond->recirc_id;
adcf00ba
AZ
1432 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1433 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1434
c33a8a25 1435 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 1436
9df65060
VDA
1437 use_lb_output_action = bond_use_lb_output_action(bond);
1438 ds_put_format(ds, "lb_output action: %s, bond-id: %d\n",
1439 use_lb_output_action ? "enabled" : "disabled",
1440 use_lb_output_action ? recirc_id : -1);
1441
c33a8a25
EJ
1442 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1443 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 1444
1b137691 1445 if (bond_is_balanced(bond)) {
c33a8a25 1446 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
1447 bond->next_rebalance - time_msec());
1448 }
1449
bdebeece
EJ
1450 ds_put_cstr(ds, "lacp_status: ");
1451 switch (bond->lacp_status) {
1452 case LACP_NEGOTIATED:
1453 ds_put_cstr(ds, "negotiated\n");
1454 break;
1455 case LACP_CONFIGURED:
1456 ds_put_cstr(ds, "configured\n");
1457 break;
1458 case LACP_DISABLED:
1459 ds_put_cstr(ds, "off\n");
1460 break;
1461 default:
1462 ds_put_cstr(ds, "<unknown>\n");
1463 break;
1464 }
4d6fb5eb 1465
57fc4fd0 1466 ds_put_format(ds, "lacp_fallback_ab: %s\n",
1467 bond->lacp_fallback_ab ? "true" : "false");
1468
b4e50218 1469 bool found_primary = false;
91fc374a
BP
1470 HMAP_FOR_EACH (member, hmap_node, &bond->members) {
1471 if (member->is_primary) {
b4e50218
JS
1472 found_primary = true;
1473 }
91fc374a 1474 shash_add(&member_shash, member->name, member);
b4e50218
JS
1475 }
1476
1477 ds_put_format(ds, "active-backup primary: %s%s\n",
1478 bond->primary ? bond->primary : "<none>",
1479 (!found_primary && bond->primary)
91fc374a 1480 ? " (no such member)" : "");
b4e50218 1481
91fc374a
BP
1482 member = bond_find_member_by_mac(bond, bond->active_member_mac);
1483 ds_put_cstr(ds, "active member mac: ");
1484 ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_member_mac));
1485 ds_put_format(ds, "(%s)\n", member ? member->name : "none");
3e5aeeb5 1486
91fc374a
BP
1487 sorted_members = shash_sort(&member_shash);
1488 for (i = 0; i < shash_count(&member_shash); i++) {
f620b43a 1489 struct bond_entry *be;
f620b43a 1490
91fc374a 1491 member = sorted_members[i]->data;
fc1d4f01 1492
f620b43a 1493 /* Basic info. */
91fc374a
BP
1494 ds_put_format(ds, "\nmember %s: %s\n",
1495 member->name, member->enabled ? "enabled" : "disabled");
1496 if (member == bond->active_member) {
1497 ds_put_cstr(ds, " active member\n");
f620b43a 1498 }
91fc374a 1499 if (member->delay_expires != LLONG_MAX) {
828519ca 1500 ds_put_format(ds, " %s expires in %lld ms\n",
91fc374a
BP
1501 member->enabled ? "downdelay" : "updelay",
1502 member->delay_expires - time_msec());
f620b43a
BP
1503 }
1504
828519ca 1505 ds_put_format(ds, " may_enable: %s\n",
91fc374a 1506 member->may_enable ? "true" : "false");
4d6fb5eb 1507
1b137691 1508 if (!bond_is_balanced(bond)) {
f620b43a
BP
1509 continue;
1510 }
1511
1512 /* Hashes. */
f620b43a
BP
1513 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1514 int hash = be - bond->hash;
f6ba1f35 1515 uint64_t be_tx_k;
f620b43a 1516
91fc374a 1517 if (be->member != member) {
f620b43a
BP
1518 continue;
1519 }
1520
f6ba1f35
AZ
1521 be_tx_k = be->tx_bytes / 1024;
1522 if (be_tx_k) {
828519ca 1523 ds_put_format(ds, " hash %d: %"PRIu64" kB load\n",
f6ba1f35
AZ
1524 hash, be_tx_k);
1525 }
f620b43a 1526
7b9f1974 1527 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1528 }
1529 }
91fc374a
BP
1530 shash_destroy(&member_shash);
1531 free(sorted_members);
c33a8a25
EJ
1532 ds_put_cstr(ds, "\n");
1533}
1534
1535static void
1536bond_unixctl_show(struct unixctl_conn *conn,
1537 int argc, const char *argv[],
1538 void *aux OVS_UNUSED)
1539{
1540 struct ds ds = DS_EMPTY_INITIALIZER;
1541
3bfd3972 1542 ovs_rwlock_rdlock(&rwlock);
c33a8a25
EJ
1543 if (argc > 1) {
1544 const struct bond *bond = bond_find(argv[1]);
1545
1546 if (!bond) {
bde9f75d 1547 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1548 goto out;
c33a8a25
EJ
1549 }
1550 bond_print_details(&ds, bond);
1551 } else {
1552 const struct bond *bond;
1553
3bfd3972 1554 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
c33a8a25
EJ
1555 bond_print_details(&ds, bond);
1556 }
1557 }
1558
bde9f75d 1559 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a 1560 ds_destroy(&ds);
3bfd3972
EJ
1561
1562out:
1563 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1564}
1565
1566static void
0e15264f
BP
1567bond_unixctl_migrate(struct unixctl_conn *conn,
1568 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1569 void *aux OVS_UNUSED)
1570{
0e15264f
BP
1571 const char *bond_s = argv[1];
1572 const char *hash_s = argv[2];
91fc374a 1573 const char *member_s = argv[3];
f620b43a 1574 struct bond *bond;
91fc374a 1575 struct bond_member *member;
f620b43a
BP
1576 struct bond_entry *entry;
1577 int hash;
1578
3bfd3972 1579 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1580 bond = bond_find(bond_s);
1581 if (!bond) {
bde9f75d 1582 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1583 goto out;
f620b43a
BP
1584 }
1585
1586 if (bond->balance != BM_SLB) {
bde9f75d 1587 unixctl_command_reply_error(conn, "not an SLB bond");
3bfd3972 1588 goto out;
f620b43a
BP
1589 }
1590
1591 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1592 hash = atoi(hash_s) & BOND_MASK;
1593 } else {
bde9f75d 1594 unixctl_command_reply_error(conn, "bad hash");
3bfd3972 1595 goto out;
f620b43a
BP
1596 }
1597
91fc374a
BP
1598 member = bond_lookup_member(bond, member_s);
1599 if (!member) {
1600 unixctl_command_reply_error(conn, "no such member");
3bfd3972 1601 goto out;
f620b43a
BP
1602 }
1603
91fc374a
BP
1604 if (!member->enabled) {
1605 unixctl_command_reply_error(conn,
1606 "cannot migrate to disabled member");
3bfd3972 1607 goto out;
f620b43a
BP
1608 }
1609
1610 entry = &bond->hash[hash];
4a1b8f30 1611 bond->bond_revalidate = true;
91fc374a 1612 entry->member = member;
bde9f75d 1613 unixctl_command_reply(conn, "migrated");
3bfd3972
EJ
1614
1615out:
1616 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1617}
1618
1619static void
91fc374a
BP
1620bond_unixctl_set_active_member(struct unixctl_conn *conn,
1621 int argc OVS_UNUSED, const char *argv[],
1622 void *aux OVS_UNUSED)
f620b43a 1623{
0e15264f 1624 const char *bond_s = argv[1];
91fc374a 1625 const char *member_s = argv[2];
f620b43a 1626 struct bond *bond;
91fc374a 1627 struct bond_member *member;
f620b43a 1628
3bfd3972 1629 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1630 bond = bond_find(bond_s);
1631 if (!bond) {
bde9f75d 1632 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1633 goto out;
f620b43a
BP
1634 }
1635
91fc374a
BP
1636 member = bond_lookup_member(bond, member_s);
1637 if (!member) {
1638 unixctl_command_reply_error(conn, "no such member");
3bfd3972 1639 goto out;
f620b43a
BP
1640 }
1641
91fc374a
BP
1642 if (!member->enabled) {
1643 unixctl_command_reply_error(conn,
1644 "cannot make disabled member active");
3bfd3972 1645 goto out;
f620b43a
BP
1646 }
1647
91fc374a 1648 if (bond->active_member != member) {
4a1b8f30 1649 bond->bond_revalidate = true;
91fc374a
BP
1650 bond->active_member = member;
1651 VLOG_INFO("bond %s: active member is now %s",
1652 bond->name, member->name);
f620b43a 1653 bond->send_learning_packets = true;
bde9f75d 1654 unixctl_command_reply(conn, "done");
91fc374a 1655 bond_active_member_changed(bond);
f620b43a 1656 } else {
bde9f75d 1657 unixctl_command_reply(conn, "no change");
f620b43a 1658 }
3bfd3972
EJ
1659out:
1660 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1661}
1662
1663static void
91fc374a 1664enable_member(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1665{
0e15264f 1666 const char *bond_s = argv[1];
91fc374a 1667 const char *member_s = argv[2];
f620b43a 1668 struct bond *bond;
91fc374a 1669 struct bond_member *member;
f620b43a 1670
3bfd3972 1671 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1672 bond = bond_find(bond_s);
1673 if (!bond) {
bde9f75d 1674 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1675 goto out;
f620b43a
BP
1676 }
1677
91fc374a
BP
1678 member = bond_lookup_member(bond, member_s);
1679 if (!member) {
1680 unixctl_command_reply_error(conn, "no such member");
3bfd3972 1681 goto out;
f620b43a
BP
1682 }
1683
91fc374a 1684 bond_enable_member(member, enable);
bde9f75d 1685 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
3bfd3972
EJ
1686
1687out:
1688 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1689}
1690
1691static void
91fc374a
BP
1692bond_unixctl_enable_member(struct unixctl_conn *conn,
1693 int argc OVS_UNUSED, const char *argv[],
1694 void *aux OVS_UNUSED)
f620b43a 1695{
91fc374a 1696 enable_member(conn, argv, true);
f620b43a
BP
1697}
1698
1699static void
91fc374a
BP
1700bond_unixctl_disable_member(struct unixctl_conn *conn,
1701 int argc OVS_UNUSED, const char *argv[],
1702 void *aux OVS_UNUSED)
f620b43a 1703{
91fc374a 1704 enable_member(conn, argv, false);
f620b43a
BP
1705}
1706
1707static void
0e15264f 1708bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1709 void *aux OVS_UNUSED)
1710{
0e15264f
BP
1711 const char *mac_s = argv[1];
1712 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1713 const char *basis_s = argc > 3 ? argv[3] : NULL;
74ff3298 1714 struct eth_addr mac;
f620b43a
BP
1715 uint8_t hash;
1716 char *hash_cstr;
1717 unsigned int vlan;
672d18b2 1718 uint32_t basis;
f620b43a
BP
1719
1720 if (vlan_s) {
c2c28dfd 1721 if (!ovs_scan(vlan_s, "%u", &vlan)) {
bde9f75d 1722 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1723 return;
1724 }
1725 } else {
dc155bff 1726 vlan = 0;
f620b43a
BP
1727 }
1728
672d18b2 1729 if (basis_s) {
c2c28dfd 1730 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
bde9f75d 1731 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1732 return;
1733 }
1734 } else {
1735 basis = 0;
1736 }
1737
c2c28dfd 1738 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
e9013d6a 1739 hash = hash_mac(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1740
1741 hash_cstr = xasprintf("%u", hash);
bde9f75d 1742 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1743 free(hash_cstr);
1744 } else {
bde9f75d 1745 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1746 }
1747}
1748
1749void
1750bond_init(void)
1751{
0e15264f 1752 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1753 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1754 NULL);
91fc374a 1755 unixctl_command_register("bond/migrate", "port hash member", 3, 3,
7ff2009a 1756 bond_unixctl_migrate, NULL);
91fc374a
BP
1757 unixctl_command_register("bond/set-active-member", "port member", 2, 2,
1758 bond_unixctl_set_active_member, NULL);
1759 unixctl_command_register("bond/enable-member", "port member", 2, 2,
1760 bond_unixctl_enable_member, NULL);
1761 unixctl_command_register("bond/disable-member", "port member", 2, 2,
1762 bond_unixctl_disable_member, NULL);
0e15264f 1763 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1764 bond_unixctl_hash, NULL);
91fc374a
BP
1765
1766 /* Backward-compatibility command names. */
1767 unixctl_command_register("bond/set-active-slave", NULL, 2, 2,
1768 bond_unixctl_set_active_member, NULL);
1769 unixctl_command_register("bond/enable-slave", NULL, 2, 2,
1770 bond_unixctl_enable_member, NULL);
1771 unixctl_command_register("bond/disable-slave", NULL, 2, 2,
1772 bond_unixctl_disable_member, NULL);
f620b43a
BP
1773}
1774\f
95aafb2a
EJ
1775static void
1776bond_entry_reset(struct bond *bond)
1777{
90061ea7 1778 if (bond->balance != BM_AB && !bond_is_falling_back_to_ab(bond)) {
9e1a6910 1779 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
95aafb2a
EJ
1780
1781 if (!bond->hash) {
1782 bond->hash = xmalloc(hash_len);
1783 }
1784 memset(bond->hash, 0, hash_len);
1785
1786 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1787 } else {
1788 free(bond->hash);
1789 bond->hash = NULL;
05df1623
AZ
1790 /* Remove existing post recirc rules. */
1791 update_recirc_rules(bond);
95aafb2a
EJ
1792 }
1793}
1794
91fc374a
BP
1795static struct bond_member *
1796bond_member_lookup(struct bond *bond, const void *member_)
f620b43a 1797{
91fc374a 1798 struct bond_member *member;
f620b43a 1799
91fc374a
BP
1800 HMAP_FOR_EACH_IN_BUCKET (member, hmap_node, hash_pointer(member_, 0),
1801 &bond->members) {
1802 if (member->aux == member_) {
1803 return member;
f620b43a
BP
1804 }
1805 }
1806
1807 return NULL;
1808}
1809
f620b43a 1810static void
91fc374a 1811bond_enable_member(struct bond_member *member, bool enable)
f620b43a 1812{
91fc374a 1813 struct bond *bond = member->bond;
5fef88ea 1814
91fc374a
BP
1815 member->delay_expires = LLONG_MAX;
1816 if (enable != member->enabled) {
1817 member->bond->bond_revalidate = true;
1818 member->enabled = enable;
f1c8a79c 1819
91fc374a 1820 ovs_mutex_lock(&member->bond->mutex);
f1c8a79c 1821 if (enable) {
91fc374a 1822 ovs_list_insert(&member->bond->enabled_members, &member->list_node);
f1c8a79c 1823 } else {
5fef88ea 1824 bond->send_learning_packets = true;
91fc374a 1825 ovs_list_remove(&member->list_node);
f1c8a79c 1826 }
91fc374a 1827 ovs_mutex_unlock(&member->bond->mutex);
f1c8a79c 1828
91fc374a
BP
1829 VLOG_INFO("member %s: %s", member->name,
1830 member->enabled ? "enabled" : "disabled");
f620b43a
BP
1831 }
1832}
1833
1834static void
91fc374a 1835bond_link_status_update(struct bond_member *member)
f620b43a 1836{
91fc374a 1837 struct bond *bond = member->bond;
f620b43a
BP
1838 bool up;
1839
91fc374a
BP
1840 up = netdev_get_carrier(member->netdev) && member->may_enable;
1841 if ((up == member->enabled) != (member->delay_expires == LLONG_MAX)) {
f620b43a 1842 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
91fc374a
BP
1843 VLOG_INFO_RL(&rl, "member %s: link state %s",
1844 member->name, up ? "up" : "down");
1845 if (up == member->enabled) {
1846 member->delay_expires = LLONG_MAX;
1847 VLOG_INFO_RL(&rl, "member %s: will not be %s",
1848 member->name, up ? "disabled" : "enabled");
f620b43a 1849 } else {
f9417807 1850 int delay = up ? bond->updelay : bond->downdelay;
91fc374a 1851 member->delay_expires = time_msec() + delay;
f620b43a 1852 if (delay) {
91fc374a 1853 VLOG_INFO_RL(&rl, "member %s: will be %s if it stays %s "
f620b43a 1854 "for %d ms",
91fc374a 1855 member->name,
f620b43a
BP
1856 up ? "enabled" : "disabled",
1857 up ? "up" : "down",
1858 delay);
1859 }
1860 }
1861 }
1862
91fc374a
BP
1863 if (time_msec() >= member->delay_expires) {
1864 bond_enable_member(member, up);
f620b43a
BP
1865 }
1866}
1867
fb0b29a3
EJ
1868static unsigned int
1869bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1870{
cb22974d 1871 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1872
bdebeece 1873 return (bond->balance == BM_TCP
42781e77 1874 ? flow_hash_5tuple(flow, bond->basis)
e9013d6a 1875 : hash_mac(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1876}
1877
f620b43a
BP
1878static struct bond_entry *
1879lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1880 uint16_t vlan)
1881{
fb0b29a3 1882 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1883}
1884
91fc374a
BP
1885/* Selects and returns an enabled member from the 'enabled_members' list
1886 * in a round-robin fashion. If the 'enabled_members' list is empty,
f1c8a79c 1887 * returns NULL. */
91fc374a
BP
1888static struct bond_member *
1889get_enabled_member(struct bond *bond)
f1c8a79c 1890{
ca6ba700 1891 struct ovs_list *node;
f1c8a79c
AW
1892
1893 ovs_mutex_lock(&bond->mutex);
91fc374a 1894 if (ovs_list_is_empty(&bond->enabled_members)) {
f1c8a79c
AW
1895 ovs_mutex_unlock(&bond->mutex);
1896 return NULL;
1897 }
1898
91fc374a
BP
1899 node = ovs_list_pop_front(&bond->enabled_members);
1900 ovs_list_push_back(&bond->enabled_members, node);
f1c8a79c
AW
1901 ovs_mutex_unlock(&bond->mutex);
1902
91fc374a 1903 return CONTAINER_OF(node, struct bond_member, list_node);
f1c8a79c
AW
1904}
1905
91fc374a
BP
1906static struct bond_member *
1907choose_output_member(const struct bond *bond, const struct flow *flow,
4a1b8f30 1908 struct flow_wildcards *wc, uint16_t vlan)
f620b43a
BP
1909{
1910 struct bond_entry *e;
9dd165e0 1911 int balance;
f620b43a 1912
9dd165e0 1913 balance = bond->balance;
bdebeece
EJ
1914 if (bond->lacp_status == LACP_CONFIGURED) {
1915 /* LACP has been configured on this bond but negotiations were
9dd165e0
RK
1916 * unsuccussful. If lacp_fallback_ab is enabled use active-
1917 * backup mode else drop all traffic. */
1918 if (!bond->lacp_fallback_ab) {
1919 return NULL;
1920 }
1921 balance = BM_AB;
bdebeece
EJ
1922 }
1923
9dd165e0 1924 switch (balance) {
f620b43a 1925 case BM_AB:
91fc374a 1926 return bond->active_member;
f620b43a 1927
f620b43a 1928 case BM_TCP:
bdebeece
EJ
1929 if (bond->lacp_status != LACP_NEGOTIATED) {
1930 /* Must have LACP negotiations for TCP balanced bonds. */
1931 return NULL;
1932 }
bcd2633a 1933 if (wc) {
deb67947 1934 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L3L4_UDP);
bcd2633a 1935 }
bdebeece
EJ
1936 /* Fall Through. */
1937 case BM_SLB:
deb67947 1938 if (wc && balance == BM_SLB) {
6cdd5145 1939 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
bcd2633a 1940 }
f620b43a 1941 e = lookup_bond_entry(bond, flow, vlan);
91fc374a
BP
1942 if (!e->member || !e->member->enabled) {
1943 e->member = get_enabled_member(CONST_CAST(struct bond *, bond));
f620b43a 1944 }
91fc374a 1945 return e->member;
f620b43a
BP
1946
1947 default:
428b2edd 1948 OVS_NOT_REACHED();
f620b43a
BP
1949 }
1950}
1951
91fc374a
BP
1952static struct bond_member *
1953bond_choose_member(const struct bond *bond)
f620b43a 1954{
91fc374a 1955 struct bond_member *member, *best;
f620b43a 1956
b4e50218 1957 /* If there's a primary and it's active, return that. */
91fc374a
BP
1958 HMAP_FOR_EACH (member, hmap_node, &bond->members) {
1959 if (member->is_primary && member->enabled) {
1960 return member;
b4e50218
JS
1961 }
1962 }
1963
91fc374a
BP
1964 /* Find the last active member. */
1965 member = bond_find_member_by_mac(bond, bond->active_member_mac);
1966 if (member && member->enabled) {
1967 return member;
3e5aeeb5
AZ
1968 }
1969
91fc374a
BP
1970 /* Find an enabled member. */
1971 HMAP_FOR_EACH (member, hmap_node, &bond->members) {
1972 if (member->enabled) {
1973 return member;
f620b43a
BP
1974 }
1975 }
1976
91fc374a 1977 /* All members are disabled. Find an member that will be enabled
f620b43a
BP
1978 * after its updelay expires. */
1979 best = NULL;
91fc374a
BP
1980 HMAP_FOR_EACH (member, hmap_node, &bond->members) {
1981 if (member->delay_expires != LLONG_MAX
1982 && member->may_enable
1983 && (!best || member->delay_expires < best->delay_expires)) {
1984 best = member;
f620b43a
BP
1985 }
1986 }
1987 return best;
1988}
1989
1990static void
91fc374a 1991bond_choose_active_member(struct bond *bond)
f620b43a
BP
1992{
1993 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
91fc374a 1994 struct bond_member *old_active_member = bond->active_member;
f620b43a 1995
91fc374a
BP
1996 bond->active_member = bond_choose_member(bond);
1997 if (bond->active_member) {
1998 if (bond->active_member->enabled) {
1999 VLOG_INFO_RL(&rl, "bond %s: active member is now %s",
2000 bond->name, bond->active_member->name);
f620b43a 2001 } else {
91fc374a
BP
2002 VLOG_INFO_RL(&rl, "bond %s: active member is now %s, skipping "
2003 "remaining %lld ms updelay (since no member was "
2004 "enabled)", bond->name, bond->active_member->name,
2005 bond->active_member->delay_expires - time_msec());
2006 bond_enable_member(bond->active_member, true);
f620b43a
BP
2007 }
2008
2009 bond->send_learning_packets = true;
3e5aeeb5 2010
91fc374a
BP
2011 if (bond->active_member != old_active_member) {
2012 bond_active_member_changed(bond);
3e5aeeb5 2013 }
91fc374a
BP
2014 } else if (old_active_member) {
2015 bond_active_member_changed(bond);
2016 VLOG_INFO_RL(&rl, "bond %s: all members disabled", bond->name);
f620b43a
BP
2017 }
2018}
3e5aeeb5
AZ
2019
2020/*
91fc374a
BP
2021 * Return true if bond has unstored active member change.
2022 * If return true, 'mac' will store the bond's current active member's
3e5aeeb5
AZ
2023 * MAC address. */
2024bool
91fc374a 2025bond_get_changed_active_member(const char *name, struct eth_addr *mac,
74ff3298 2026 bool force)
3e5aeeb5
AZ
2027{
2028 struct bond *bond;
2029
2030 ovs_rwlock_wrlock(&rwlock);
2031 bond = bond_find(name);
2032 if (bond) {
91fc374a
BP
2033 if (bond->active_member_changed || force) {
2034 *mac = bond->active_member_mac;
2035 bond->active_member_changed = false;
3e5aeeb5
AZ
2036 ovs_rwlock_unlock(&rwlock);
2037 return true;
2038 }
2039 }
2040 ovs_rwlock_unlock(&rwlock);
2041
2042 return false;
2043}
9df65060
VDA
2044
2045bool
2046bond_use_lb_output_action(const struct bond *bond)
2047{
2048 return bond_may_recirc(bond) && bond->use_lb_output_action;
2049}
2050
2051static void
2052bond_add_lb_output_buckets(const struct bond *bond)
2053{
91fc374a 2054 ofp_port_t member_map[BOND_BUCKETS];
9df65060
VDA
2055
2056 for (int i = 0; i < BOND_BUCKETS; i++) {
91fc374a 2057 struct bond_member *member = bond->hash[i].member;
9df65060 2058
91fc374a
BP
2059 if (member) {
2060 member_map[i] = member->ofp_port;
9df65060 2061 } else {
91fc374a 2062 member_map[i] = OFPP_NONE;
9df65060
VDA
2063 }
2064 }
2065 ofproto_dpif_add_lb_output_buckets(bond->ofproto, bond->recirc_id,
91fc374a 2066 member_map);
9df65060
VDA
2067}
2068
2069static void
2070bond_del_lb_output_buckets(const struct bond *bond)
2071{
2072 ofproto_dpif_delete_lb_output_buckets(bond->ofproto,
2073 bond->recirc_id);
2074}