]> git.proxmox.com Git - mirror_ovs.git/blame - lib/bond.c
seq: Add some comments.
[mirror_ovs.git] / lib / bond.c
CommitLineData
f620b43a 1/*
09a5d390 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a
BP
25
26#include "coverage.h"
27#include "dynamic-string.h"
28#include "flow.h"
29#include "hmap.h"
bdebeece 30#include "lacp.h"
f620b43a
BP
31#include "list.h"
32#include "netdev.h"
33#include "odp-util.h"
34#include "ofpbuf.h"
35#include "packets.h"
36#include "poll-loop.h"
fc1d4f01 37#include "shash.h"
f620b43a
BP
38#include "timeval.h"
39#include "unixctl.h"
40#include "vlog.h"
41
42VLOG_DEFINE_THIS_MODULE(bond);
43
f620b43a
BP
44/* Bit-mask for hashing a flow down to a bucket.
45 * There are (BOND_MASK + 1) buckets. */
46#define BOND_MASK 0xff
47
/* A hash bucket for mapping a flow to a slave.
 * "struct bond" has an array of (BOND_MASK + 1) of these. */
struct bond_entry {
    struct bond_slave *slave;   /* Assigned slave, NULL if unassigned. */
    uint64_t tx_bytes;          /* Count of bytes recently transmitted.
                                 * Halved on every bond_rebalance() run. */
    struct list list_node;      /* In bond_slave's 'entries' list.  Linked
                                 * only while bond_rebalance() is running. */
};
55
/* A bond slave, that is, one of the links comprising a bond. */
struct bond_slave {
    struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
    struct bond *bond;          /* The bond that contains this slave. */
    void *aux;                  /* Client-provided handle for this slave. */

    struct netdev *netdev;      /* Network device, owned by the client. */
    unsigned int change_seq;    /* Tracks changes in 'netdev': the value of
                                 * netdev_change_seq() last seen by bond_run(),
                                 * reset to 0 when 'netdev' is replaced. */
    char *name;                 /* Name (a copy of netdev_get_name(netdev)). */

    /* Link status. */
    long long delay_expires;    /* Time after which 'enabled' may change.
                                 * LLONG_MAX if no change is pending. */
    bool enabled;               /* May be chosen for flows? */
    bool may_enable;            /* Client considers this slave bondable. */

    /* Rebalancing info.  Used only by bond_rebalance(). */
    struct list bal_node;       /* In bond_rebalance()'s 'bals' list. */
    struct list entries;        /* 'struct bond_entry's assigned here. */
    uint64_t tx_bytes;          /* Sum across 'tx_bytes' of entries. */
};
76
/* A bond, that is, a set of network devices grouped to improve performance or
 * robustness. */
struct bond {
    struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
    char *name;                 /* Name provided by client. */

    /* Slaves. */
    struct hmap slaves;         /* Contains "struct bond_slave"s, hashed on
                                 * the client-provided handle ('aux'). */

    /* Bonding info. */
    enum bond_mode balance;     /* Balancing mode, one of BM_*. */
    struct bond_slave *active_slave; /* Currently active slave, or NULL. */
    int updelay, downdelay;     /* Delay before slave goes up/down, in ms. */
    enum lacp_status lacp_status; /* Status of LACP negotiations. */
    bool bond_revalidate;       /* True if flows need revalidation. */
    uint32_t basis;             /* Basis for flow hash function. */

    /* SLB specific bonding info. */
    struct bond_entry *hash;    /* An array of (BOND_MASK + 1) elements. */
    int rebalance_interval;     /* Interval between rebalances, in ms. */
    long long int next_rebalance; /* Next rebalancing time. */
    bool send_learning_packets; /* Learning packets pending?  Cleared by
                                 * bond_should_send_learning_packets(). */

    /* Legacy compatibility. */
    long long int next_fake_iface_update; /* LLONG_MAX if disabled. */

    atomic_int ref_cnt;         /* Reference count; see bond_ref() and
                                 * bond_unref(). */
};
105
3bfd3972
EJ
106static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
107static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
108static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
f620b43a 109
3bfd3972
EJ
110static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
111static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
112 OVS_REQ_RDLOCK(rwlock);
4a1b8f30
EJ
113static void bond_enable_slave(struct bond_slave *, bool enable)
114 OVS_REQ_WRLOCK(rwlock);
115static void bond_link_status_update(struct bond_slave *)
3bfd3972 116 OVS_REQ_WRLOCK(rwlock);
4a1b8f30 117static void bond_choose_active_slave(struct bond *)
3bfd3972 118 OVS_REQ_WRLOCK(rwlock);;
f620b43a 119static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
672d18b2
EJ
120 uint16_t vlan, uint32_t basis);
121static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
122 uint32_t basis);
f620b43a
BP
123static struct bond_entry *lookup_bond_entry(const struct bond *,
124 const struct flow *,
3bfd3972
EJ
125 uint16_t vlan)
126 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
127static struct bond_slave *choose_output_slave(const struct bond *,
128 const struct flow *,
bcd2633a 129 struct flow_wildcards *,
4a1b8f30 130 uint16_t vlan)
3bfd3972
EJ
131 OVS_REQ_RDLOCK(rwlock);
132static void bond_update_fake_slave_stats(struct bond *)
133 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
134
135/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
136 * stores the mode in '*balance' and returns true. Otherwise returns false
137 * without modifying '*balance'. */
138bool
139bond_mode_from_string(enum bond_mode *balance, const char *s)
140{
141 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
142 *balance = BM_TCP;
143 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
144 *balance = BM_SLB;
145 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
146 *balance = BM_AB;
147 } else {
148 return false;
149 }
150 return true;
151}
152
153/* Returns a string representing 'balance'. */
154const char *
155bond_mode_to_string(enum bond_mode balance) {
156 switch (balance) {
157 case BM_TCP:
158 return "balance-tcp";
159 case BM_SLB:
160 return "balance-slb";
161 case BM_AB:
162 return "active-backup";
163 }
164 NOT_REACHED();
165}
166
f620b43a
BP
167\f
168/* Creates and returns a new bond whose configuration is initially taken from
169 * 's'.
170 *
171 * The caller should register each slave on the new bond by calling
172 * bond_slave_register(). */
173struct bond *
174bond_create(const struct bond_settings *s)
175{
176 struct bond *bond;
177
178 bond = xzalloc(sizeof *bond);
179 hmap_init(&bond->slaves);
f620b43a 180 bond->next_fake_iface_update = LLONG_MAX;
3bfd3972 181 atomic_init(&bond->ref_cnt, 1);
f620b43a
BP
182
183 bond_reconfigure(bond, s);
f620b43a
BP
184 return bond;
185}
186
03366a2d
EJ
187struct bond *
188bond_ref(const struct bond *bond_)
189{
190 struct bond *bond = CONST_CAST(struct bond *, bond_);
191
bca0b3b4 192 if (bond) {
3bfd3972
EJ
193 int orig;
194 atomic_add(&bond->ref_cnt, 1, &orig);
195 ovs_assert(orig > 0);
bca0b3b4 196 }
03366a2d
EJ
197 return bond;
198}
199
f620b43a
BP
200/* Frees 'bond'. */
201void
03366a2d 202bond_unref(struct bond *bond)
f620b43a
BP
203{
204 struct bond_slave *slave, *next_slave;
3bfd3972 205 int orig;
f620b43a
BP
206
207 if (!bond) {
208 return;
209 }
210
3bfd3972
EJ
211 atomic_sub(&bond->ref_cnt, 1, &orig);
212 ovs_assert(orig > 0);
213 if (orig != 1) {
03366a2d
EJ
214 return;
215 }
216
3bfd3972
EJ
217 ovs_rwlock_wrlock(&rwlock);
218 hmap_remove(all_bonds, &bond->hmap_node);
219 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
220
221 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
222 hmap_remove(&bond->slaves, &slave->hmap_node);
223 /* Client owns 'slave->netdev'. */
224 free(slave->name);
225 free(slave);
226 }
227 hmap_destroy(&bond->slaves);
228
229 free(bond->hash);
f620b43a
BP
230 free(bond->name);
231 free(bond);
232}
233
234/* Updates 'bond''s overall configuration to 's'.
235 *
236 * The caller should register each slave on 'bond' by calling
237 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 238 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
239 *
240 * Returns true if the configuration has changed in such a way that requires
241 * flow revalidation.
242 * */
243bool
f620b43a
BP
244bond_reconfigure(struct bond *bond, const struct bond_settings *s)
245{
59d7b2b6
EJ
246 bool revalidate = false;
247
3bfd3972 248 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
249 if (!bond->name || strcmp(bond->name, s->name)) {
250 if (bond->name) {
3bfd3972 251 hmap_remove(all_bonds, &bond->hmap_node);
f620b43a
BP
252 free(bond->name);
253 }
254 bond->name = xstrdup(s->name);
3bfd3972 255 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
f620b43a
BP
256 }
257
f620b43a
BP
258 bond->updelay = s->up_delay;
259 bond->downdelay = s->down_delay;
bc1b010c
EJ
260
261 if (bond->rebalance_interval != s->rebalance_interval) {
262 bond->rebalance_interval = s->rebalance_interval;
263 revalidate = true;
264 }
f620b43a 265
59d7b2b6
EJ
266 if (bond->balance != s->balance) {
267 bond->balance = s->balance;
268 revalidate = true;
269 }
270
672d18b2
EJ
271 if (bond->basis != s->basis) {
272 bond->basis = s->basis;
273 revalidate = true;
274 }
275
f620b43a
BP
276 if (s->fake_iface) {
277 if (bond->next_fake_iface_update == LLONG_MAX) {
278 bond->next_fake_iface_update = time_msec();
279 }
280 } else {
281 bond->next_fake_iface_update = LLONG_MAX;
282 }
59d7b2b6 283
62904702
EJ
284 if (bond->bond_revalidate) {
285 revalidate = true;
286 bond->bond_revalidate = false;
287 }
288
95aafb2a
EJ
289 if (bond->balance == BM_AB || !bond->hash || revalidate) {
290 bond_entry_reset(bond);
291 }
292
3bfd3972 293 ovs_rwlock_unlock(&rwlock);
59d7b2b6 294 return revalidate;
f620b43a
BP
295}
296
f8ddccd2 297static void
1ea24138 298bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
3bfd3972 299 OVS_REQ_WRLOCK(rwlock)
f8ddccd2
BP
300{
301 if (slave->netdev != netdev) {
f8ddccd2 302 slave->netdev = netdev;
1ea24138 303 slave->change_seq = 0;
f8ddccd2
BP
304 }
305}
306
f620b43a
BP
307/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
308 * arbitrary client-provided pointer that uniquely identifies a slave within a
309 * bond. If 'slave_' already exists within 'bond' then this function
310 * reconfigures the existing slave.
311 *
312 * 'netdev' must be the network device that 'slave_' represents. It is owned
313 * by the client, so the client must not close it before either unregistering
314 * 'slave_' or destroying 'bond'.
4d6fb5eb 315 */
f620b43a 316void
df53d41c 317bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
f620b43a 318{
3bfd3972 319 struct bond_slave *slave;
f620b43a 320
3bfd3972
EJ
321 ovs_rwlock_wrlock(&rwlock);
322 slave = bond_slave_lookup(bond, slave_);
f620b43a
BP
323 if (!slave) {
324 slave = xzalloc(sizeof *slave);
325
326 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
327 slave->bond = bond;
328 slave->aux = slave_;
329 slave->delay_expires = LLONG_MAX;
244b2160 330 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 331 bond->bond_revalidate = true;
244b2160 332
b3c18f66 333 slave->enabled = false;
4a1b8f30 334 bond_enable_slave(slave, netdev_get_carrier(netdev));
f620b43a
BP
335 }
336
1ea24138 337 bond_slave_set_netdev__(slave, netdev);
a6934aa9 338
f620b43a
BP
339 free(slave->name);
340 slave->name = xstrdup(netdev_get_name(netdev));
3bfd3972 341 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
342}
343
f8ddccd2
BP
344/* Updates the network device to be used with 'slave_' to 'netdev'.
345 *
346 * This is useful if the caller closes and re-opens the network device
347 * registered with bond_slave_register() but doesn't need to change anything
348 * else. */
349void
350bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
351{
3bfd3972
EJ
352 struct bond_slave *slave;
353
354 ovs_rwlock_wrlock(&rwlock);
355 slave = bond_slave_lookup(bond, slave_);
f8ddccd2 356 if (slave) {
1ea24138 357 bond_slave_set_netdev__(slave, netdev);
f8ddccd2 358 }
3bfd3972 359 ovs_rwlock_unlock(&rwlock);
f8ddccd2
BP
360}
361
f620b43a
BP
362/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
363 * then this function has no effect.
364 *
365 * Unregistering a slave invalidates all flows. */
366void
367bond_slave_unregister(struct bond *bond, const void *slave_)
368{
3bfd3972 369 struct bond_slave *slave;
f620b43a
BP
370 bool del_active;
371
3bfd3972
EJ
372 ovs_rwlock_wrlock(&rwlock);
373 slave = bond_slave_lookup(bond, slave_);
f620b43a 374 if (!slave) {
3bfd3972 375 goto out;
f620b43a
BP
376 }
377
4a1b8f30
EJ
378 bond->bond_revalidate = true;
379 bond_enable_slave(slave, false);
b3c18f66 380
f620b43a
BP
381 del_active = bond->active_slave == slave;
382 if (bond->hash) {
383 struct bond_entry *e;
384 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
385 if (e->slave == slave) {
386 e->slave = NULL;
387 }
388 }
389 }
390
391 free(slave->name);
392
393 hmap_remove(&bond->slaves, &slave->hmap_node);
394 /* Client owns 'slave->netdev'. */
395 free(slave);
396
397 if (del_active) {
4a1b8f30 398 bond_choose_active_slave(bond);
f620b43a
BP
399 bond->send_learning_packets = true;
400 }
3bfd3972
EJ
401out:
402 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
403}
404
296f6519
EJ
405/* Should be called on each slave in 'bond' before bond_run() to indicate
406 * whether or not 'slave_' may be enabled. This function is intended to allow
407 * other protocols to have some impact on bonding decisions. For example LACP
408 * or high level link monitoring protocols may decide that a given slave should
409 * not be able to send traffic. */
4d6fb5eb 410void
296f6519 411bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 412{
3bfd3972 413 ovs_rwlock_wrlock(&rwlock);
296f6519 414 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
3bfd3972 415 ovs_rwlock_unlock(&rwlock);
4d6fb5eb
EJ
416}
417
4a1b8f30
EJ
418/* Performs periodic maintenance on 'bond'.
419 *
420 * Returns true if the caller should revalidate its flows.
f620b43a
BP
421 *
422 * The caller should check bond_should_send_learning_packets() afterward. */
4a1b8f30
EJ
423bool
424bond_run(struct bond *bond, enum lacp_status lacp_status)
f620b43a
BP
425{
426 struct bond_slave *slave;
4a1b8f30 427 bool revalidate;
f620b43a 428
3bfd3972 429 ovs_rwlock_wrlock(&rwlock);
bdebeece
EJ
430 if (bond->lacp_status != lacp_status) {
431 bond->lacp_status = lacp_status;
4592d0e2
EJ
432 bond->bond_revalidate = true;
433 }
4d6fb5eb 434
f620b43a
BP
435 /* Enable slaves based on link status and LACP feedback. */
436 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
4a1b8f30 437 bond_link_status_update(slave);
1ea24138 438 slave->change_seq = netdev_change_seq(slave->netdev);
f620b43a
BP
439 }
440 if (!bond->active_slave || !bond->active_slave->enabled) {
4a1b8f30 441 bond_choose_active_slave(bond);
f620b43a
BP
442 }
443
444 /* Update fake bond interface stats. */
445 if (time_msec() >= bond->next_fake_iface_update) {
446 bond_update_fake_slave_stats(bond);
447 bond->next_fake_iface_update = time_msec() + 1000;
448 }
449
4a1b8f30
EJ
450 revalidate = bond->bond_revalidate;
451 bond->bond_revalidate = false;
3bfd3972 452 ovs_rwlock_unlock(&rwlock);
4a1b8f30
EJ
453
454 return revalidate;
f620b43a
BP
455}
456
457/* Causes poll_block() to wake up when 'bond' needs something to be done. */
458void
459bond_wait(struct bond *bond)
460{
461 struct bond_slave *slave;
462
3bfd3972 463 ovs_rwlock_rdlock(&rwlock);
f620b43a
BP
464 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
465 if (slave->delay_expires != LLONG_MAX) {
466 poll_timer_wait_until(slave->delay_expires);
467 }
1ea24138
EJ
468
469 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
470 poll_immediate_wake();
471 }
f620b43a
BP
472 }
473
474 if (bond->next_fake_iface_update != LLONG_MAX) {
475 poll_timer_wait_until(bond->next_fake_iface_update);
476 }
477
bbc13389 478 if (bond->bond_revalidate) {
f620b43a
BP
479 poll_immediate_wake();
480 }
3bfd3972 481 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
482
483 /* We don't wait for bond->next_rebalance because rebalancing can only run
484 * at a flow account checkpoint. ofproto does checkpointing on its own
485 * schedule and bond_rebalance() gets called afterward, so we'd just be
486 * waking up for no purpose. */
487}
488\f
489/* MAC learning table interaction. */
490
491static bool
492may_send_learning_packets(const struct bond *bond)
493{
bdebeece 494 return bond->lacp_status == LACP_DISABLED
64e2748d 495 && (bond->balance == BM_SLB || bond->balance == BM_AB)
bdebeece 496 && bond->active_slave;
f620b43a
BP
497}
498
499/* Returns true if 'bond' needs the client to send out packets to assist with
500 * MAC learning on 'bond'. If this function returns true, then the client
501 * should iterate through its MAC learning table for the bridge on which 'bond'
502 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 503 * it should call bond_compose_learning_packet().
f620b43a 504 *
477879ea
BP
505 * This function will only return true if 'bond' is in SLB or active-backup
506 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
507 * necessary.
f620b43a
BP
508 *
509 * Calling this function resets the state that it checks. */
510bool
511bond_should_send_learning_packets(struct bond *bond)
512{
3bfd3972
EJ
513 bool send;
514
515 ovs_rwlock_wrlock(&rwlock);
516 send = bond->send_learning_packets && may_send_learning_packets(bond);
f620b43a 517 bond->send_learning_packets = false;
3bfd3972 518 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
519 return send;
520}
521
522/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
523 *
ea131871
JG
524 * See bond_should_send_learning_packets() for description of usage. The
525 * caller should send the composed packet on the port associated with
526 * port_aux and takes ownership of the returned ofpbuf. */
527struct ofpbuf *
528bond_compose_learning_packet(struct bond *bond,
529 const uint8_t eth_src[ETH_ADDR_LEN],
530 uint16_t vlan, void **port_aux)
f620b43a
BP
531{
532 struct bond_slave *slave;
ea131871 533 struct ofpbuf *packet;
f620b43a 534 struct flow flow;
f620b43a 535
3bfd3972 536 ovs_rwlock_rdlock(&rwlock);
cb22974d 537 ovs_assert(may_send_learning_packets(bond));
f620b43a
BP
538 memset(&flow, 0, sizeof flow);
539 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
4a1b8f30 540 slave = choose_output_slave(bond, &flow, NULL, vlan);
f620b43a 541
ea131871 542 packet = ofpbuf_new(0);
2ea838ac 543 compose_rarp(packet, eth_src);
f620b43a 544 if (vlan) {
ea131871 545 eth_push_vlan(packet, htons(vlan));
f620b43a 546 }
f620b43a 547
ea131871 548 *port_aux = slave->aux;
3bfd3972 549 ovs_rwlock_unlock(&rwlock);
ea131871 550 return packet;
f620b43a
BP
551}
552\f
553/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
554 * Ethernet destination address of 'eth_dst', should be admitted.
555 *
556 * The return value is one of the following:
557 *
558 * - BV_ACCEPT: Admit the packet.
559 *
560 * - BV_DROP: Drop the packet.
561 *
562 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
563 * Ethernet source address and VLAN. If there is none, or if the packet
564 * is on the learned port, then admit the packet. If a different port has
565 * been learned, however, drop the packet (and do not use it for MAC
566 * learning).
567 */
568enum bond_verdict
569bond_check_admissibility(struct bond *bond, const void *slave_,
4a1b8f30 570 const uint8_t eth_dst[ETH_ADDR_LEN])
f620b43a 571{
3bfd3972
EJ
572 enum bond_verdict verdict = BV_DROP;
573 struct bond_slave *slave;
9a1c6450 574
3bfd3972
EJ
575 ovs_rwlock_rdlock(&rwlock);
576 slave = bond_slave_lookup(bond, slave_);
4222bbc8 577 if (!slave) {
3bfd3972 578 goto out;
4222bbc8
EJ
579 }
580
9a1c6450
EJ
581 /* LACP bonds have very loose admissibility restrictions because we can
582 * assume the remote switch is aware of the bond and will "do the right
583 * thing". However, as a precaution we drop packets on disabled slaves
584 * because no correctly implemented partner switch should be sending
bdebeece
EJ
585 * packets to them.
586 *
587 * If LACP is configured, but LACP negotiations have been unsuccessful, we
588 * drop all incoming traffic. */
589 switch (bond->lacp_status) {
3bfd3972
EJ
590 case LACP_NEGOTIATED:
591 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
592 goto out;
593 case LACP_CONFIGURED:
594 goto out;
595 case LACP_DISABLED:
596 break;
f620b43a
BP
597 }
598
599 /* Drop all multicast packets on inactive slaves. */
600 if (eth_addr_is_multicast(eth_dst)) {
4222bbc8 601 if (bond->active_slave != slave) {
3bfd3972 602 goto out;
f620b43a
BP
603 }
604 }
605
f931a4c9
BP
606 switch (bond->balance) {
607 case BM_AB:
608 /* Drop all packets which arrive on backup slaves. This is similar to
609 * how Linux bonding handles active-backup bonds. */
7ba7dcf0
EJ
610 if (bond->active_slave != slave) {
611 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
612
e6b2255c
BP
613 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
614 " slave (%s) destined for " ETH_ADDR_FMT,
615 slave->name, ETH_ADDR_ARGS(eth_dst));
3bfd3972 616 goto out;
7ba7dcf0 617 }
3bfd3972
EJ
618 verdict = BV_ACCEPT;
619 goto out;
f931a4c9
BP
620
621 case BM_TCP:
bdebeece
EJ
622 /* TCP balanced bonds require successful LACP negotiated. Based on the
623 * above check, LACP is off on this bond. Therfore, we drop all
624 * incoming traffic. */
3bfd3972 625 goto out;
bdebeece 626
f931a4c9
BP
627 case BM_SLB:
628 /* Drop all packets for which we have learned a different input port,
629 * because we probably sent the packet on one slave and got it back on
630 * the other. Gratuitous ARP packets are an exception to this rule:
631 * the host has moved to another switch. The exception to the
632 * exception is if we locked the learning table to avoid reflections on
633 * bond slaves. */
3bfd3972
EJ
634 verdict = BV_DROP_IF_MOVED;
635 goto out;
7ba7dcf0
EJ
636 }
637
f931a4c9 638 NOT_REACHED();
3bfd3972
EJ
639out:
640 ovs_rwlock_unlock(&rwlock);
641 return verdict;
642
f620b43a
BP
643}
644
645/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
646 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
647 * NULL if the packet should be dropped because no slaves are enabled.
648 *
649 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
650 * should be a VID only (i.e. excluding the PCP bits). Second,
651 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
652 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
653 * packet belongs to (so for an access port it will be the access port's VLAN).
654 *
bcd2633a
JP
655 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
656 * significant in the selection. At some point earlier, 'wc' should
657 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
658 */
659void *
660bond_choose_output_slave(struct bond *bond, const struct flow *flow,
4a1b8f30 661 struct flow_wildcards *wc, uint16_t vlan)
f620b43a 662{
3bfd3972 663 struct bond_slave *slave;
3bfd3972
EJ
664
665 ovs_rwlock_rdlock(&rwlock);
4a1b8f30 666 slave = choose_output_slave(bond, flow, wc, vlan);
3bfd3972 667 ovs_rwlock_unlock(&rwlock);
4a1b8f30 668 return slave;
f620b43a 669}
f620b43a
BP
670\f
671/* Rebalancing. */
672
1b137691 673static bool
3bfd3972 674bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
1b137691 675{
bc1b010c
EJ
676 return bond->rebalance_interval
677 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
678}
679
f620b43a
BP
680/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
681void
682bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
683 uint64_t n_bytes)
684{
3bfd3972 685 ovs_rwlock_wrlock(&rwlock);
1b137691 686 if (bond_is_balanced(bond)) {
f620b43a 687 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a 688 }
3bfd3972 689 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
690}
691
692static struct bond_slave *
3bfd3972 693bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
694{
695 return CONTAINER_OF(bal, struct bond_slave, bal_node);
696}
697
698static void
699log_bals(struct bond *bond, const struct list *bals)
700{
701 if (VLOG_IS_DBG_ENABLED()) {
702 struct ds ds = DS_EMPTY_INITIALIZER;
703 const struct bond_slave *slave;
704
705 LIST_FOR_EACH (slave, bal_node, bals) {
706 if (ds.length) {
707 ds_put_char(&ds, ',');
708 }
709 ds_put_format(&ds, " %s %"PRIu64"kB",
710 slave->name, slave->tx_bytes / 1024);
711
712 if (!slave->enabled) {
713 ds_put_cstr(&ds, " (disabled)");
714 }
715 if (!list_is_empty(&slave->entries)) {
716 struct bond_entry *e;
717
718 ds_put_cstr(&ds, " (");
719 LIST_FOR_EACH (e, list_node, &slave->entries) {
720 if (&e->list_node != list_front(&slave->entries)) {
721 ds_put_cstr(&ds, " + ");
722 }
723 ds_put_format(&ds, "h%td: %"PRIu64"kB",
724 e - bond->hash, e->tx_bytes / 1024);
725 }
726 ds_put_cstr(&ds, ")");
727 }
728 }
729 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
730 ds_destroy(&ds);
731 }
732}
733
734/* Shifts 'hash' from its current slave to 'to'. */
735static void
4a1b8f30 736bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
f620b43a
BP
737{
738 struct bond_slave *from = hash->slave;
739 struct bond *bond = from->bond;
740 uint64_t delta = hash->tx_bytes;
741
742 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
743 "from %s to %s (now carrying %"PRIu64"kB and "
744 "%"PRIu64"kB load, respectively)",
745 bond->name, delta / 1024, hash - bond->hash,
746 from->name, to->name,
747 (from->tx_bytes - delta) / 1024,
748 (to->tx_bytes + delta) / 1024);
749
750 /* Shift load away from 'from' to 'to'. */
751 from->tx_bytes -= delta;
752 to->tx_bytes += delta;
753
754 /* Arrange for flows to be revalidated. */
4a1b8f30 755 bond->bond_revalidate = true;
f620b43a
BP
756}
757
09a5d390
BP
758/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
759 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
760 * given that doing so must decrease the ratio of the load on the two slaves by
761 * at least 0.1. Returns NULL if there is no appropriate entry.
762 *
763 * The list of entries isn't sorted. I don't know of a reason to prefer to
764 * shift away small hashes or large hashes. */
765static struct bond_entry *
766choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
767{
768 struct bond_entry *e;
769
770 if (list_is_short(&from->entries)) {
771 /* 'from' carries no more than one MAC hash, so shifting load away from
772 * it would be pointless. */
773 return NULL;
774 }
775
776 LIST_FOR_EACH (e, list_node, &from->entries) {
777 double old_ratio, new_ratio;
778 uint64_t delta;
779
780 if (to_tx_bytes == 0) {
781 /* Nothing on the new slave, move it. */
782 return e;
783 }
784
785 delta = e->tx_bytes;
786 old_ratio = (double)from->tx_bytes / to_tx_bytes;
787 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
75fad143
ZK
788 if (old_ratio - new_ratio > 0.1
789 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
790 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
791 and 'to' slave have the same load. Therefore, we only move an
792 entry if it decreases the load on 'from', and brings us closer
793 to equal traffic load. */
f620b43a
BP
794 return e;
795 }
796 }
797
798 return NULL;
799}
800
801/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
802 * maintained. */
803static void
804insert_bal(struct list *bals, struct bond_slave *slave)
805{
806 struct bond_slave *pos;
807
808 LIST_FOR_EACH (pos, bal_node, bals) {
809 if (slave->tx_bytes > pos->tx_bytes) {
810 break;
811 }
812 }
813 list_insert(&pos->bal_node, &slave->bal_node);
814}
815
816/* Removes 'slave' from its current list and then inserts it into 'bals' so
817 * that descending order of 'tx_bytes' is maintained. */
818static void
819reinsert_bal(struct list *bals, struct bond_slave *slave)
820{
821 list_remove(&slave->bal_node);
822 insert_bal(bals, slave);
823}
824
825/* If 'bond' needs rebalancing, does so.
826 *
827 * The caller should have called bond_account() for each active flow, to ensure
828 * that flow data is consistently accounted at this point. */
829void
4a1b8f30 830bond_rebalance(struct bond *bond)
f620b43a
BP
831{
832 struct bond_slave *slave;
833 struct bond_entry *e;
834 struct list bals;
835
3bfd3972 836 ovs_rwlock_wrlock(&rwlock);
1b137691 837 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
3bfd3972 838 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
839 return;
840 }
841 bond->next_rebalance = time_msec() + bond->rebalance_interval;
842
843 /* Add each bond_entry to its slave's 'entries' list.
844 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
845 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
846 slave->tx_bytes = 0;
847 list_init(&slave->entries);
848 }
849 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
850 if (e->slave && e->tx_bytes) {
851 e->slave->tx_bytes += e->tx_bytes;
852 list_push_back(&e->slave->entries, &e->list_node);
853 }
854 }
855
856 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
857 *
858 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
859 * with a proper list sort algorithm. */
860 list_init(&bals);
861 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
862 if (slave->enabled) {
863 insert_bal(&bals, slave);
864 }
865 }
866 log_bals(bond, &bals);
867
868 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
869 while (!list_is_short(&bals)) {
870 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
871 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
872 uint64_t overload;
873
874 overload = from->tx_bytes - to->tx_bytes;
875 if (overload < to->tx_bytes >> 5 || overload < 100000) {
876 /* The extra load on 'from' (and all less-loaded slaves), compared
877 * to that of 'to' (the least-loaded slave), is less than ~3%, or
878 * it is less than ~1Mbps. No point in rebalancing. */
879 break;
880 }
881
09a5d390
BP
882 /* 'from' is carrying significantly more load than 'to'. Pick a hash
883 * to move from 'from' to 'to'. */
f620b43a
BP
884 e = choose_entry_to_migrate(from, to->tx_bytes);
885 if (e) {
4a1b8f30 886 bond_shift_load(e, to);
f620b43a
BP
887
888 /* Delete element from from->entries.
889 *
890 * We don't add the element to to->hashes. That would only allow
891 * 'e' to be migrated to another slave in this rebalancing run, and
892 * there is no point in doing that. */
893 list_remove(&e->list_node);
894
895 /* Re-sort 'bals'. */
896 reinsert_bal(&bals, from);
897 reinsert_bal(&bals, to);
898 } else {
899 /* Can't usefully migrate anything away from 'from'.
900 * Don't reconsider it. */
901 list_remove(&from->bal_node);
902 }
903 }
904
905 /* Implement exponentially weighted moving average. A weight of 1/2 causes
906 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
907 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
908 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
909 e->tx_bytes /= 2;
910 if (!e->tx_bytes) {
911 e->slave = NULL;
912 }
913 }
3bfd3972 914 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
915}
916\f
917/* Bonding unixctl user interface functions. */
918
919static struct bond *
3bfd3972 920bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
921{
922 struct bond *bond;
923
924 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
3bfd3972 925 all_bonds) {
f620b43a
BP
926 if (!strcmp(bond->name, name)) {
927 return bond;
928 }
929 }
930 return NULL;
931}
932
933static struct bond_slave *
934bond_lookup_slave(struct bond *bond, const char *slave_name)
935{
936 struct bond_slave *slave;
937
938 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
939 if (!strcmp(slave->name, slave_name)) {
940 return slave;
941 }
942 }
943 return NULL;
944}
945
946static void
947bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
948 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
949 void *aux OVS_UNUSED)
f620b43a
BP
950{
951 struct ds ds = DS_EMPTY_INITIALIZER;
952 const struct bond *bond;
953
954 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
955
3bfd3972
EJ
956 ovs_rwlock_rdlock(&rwlock);
957 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
f620b43a
BP
958 const struct bond_slave *slave;
959 size_t i;
960
961 ds_put_format(&ds, "%s\t%s\t",
962 bond->name, bond_mode_to_string(bond->balance));
963
964 i = 0;
965 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
966 if (i++ > 0) {
967 ds_put_cstr(&ds, ", ");
968 }
969 ds_put_cstr(&ds, slave->name);
970 }
971 ds_put_char(&ds, '\n');
972 }
3bfd3972 973 ovs_rwlock_unlock(&rwlock);
bde9f75d 974 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
975 ds_destroy(&ds);
976}
977
978static void
c33a8a25 979bond_print_details(struct ds *ds, const struct bond *bond)
3bfd3972 980 OVS_REQ_RDLOCK(rwlock)
f620b43a 981{
fc1d4f01
EJ
982 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
983 const struct shash_node **sorted_slaves = NULL;
f620b43a 984 const struct bond_slave *slave;
fc1d4f01 985 int i;
f620b43a 986
c33a8a25
EJ
987 ds_put_format(ds, "---- %s ----\n", bond->name);
988 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
989 bond_mode_to_string(bond->balance));
990
c33a8a25 991 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 992
c33a8a25
EJ
993 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
994 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 995
1b137691 996 if (bond_is_balanced(bond)) {
c33a8a25 997 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
998 bond->next_rebalance - time_msec());
999 }
1000
bdebeece
EJ
1001 ds_put_cstr(ds, "lacp_status: ");
1002 switch (bond->lacp_status) {
1003 case LACP_NEGOTIATED:
1004 ds_put_cstr(ds, "negotiated\n");
1005 break;
1006 case LACP_CONFIGURED:
1007 ds_put_cstr(ds, "configured\n");
1008 break;
1009 case LACP_DISABLED:
1010 ds_put_cstr(ds, "off\n");
1011 break;
1012 default:
1013 ds_put_cstr(ds, "<unknown>\n");
1014 break;
1015 }
4d6fb5eb 1016
f620b43a 1017 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
1018 shash_add(&slave_shash, slave->name, slave);
1019 }
1020 sorted_slaves = shash_sort(&slave_shash);
1021
1022 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 1023 struct bond_entry *be;
f620b43a 1024
fc1d4f01
EJ
1025 slave = sorted_slaves[i]->data;
1026
f620b43a 1027 /* Basic info. */
c33a8a25 1028 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
1029 slave->name, slave->enabled ? "enabled" : "disabled");
1030 if (slave == bond->active_slave) {
c33a8a25 1031 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
1032 }
1033 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 1034 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
1035 slave->enabled ? "downdelay" : "updelay",
1036 slave->delay_expires - time_msec());
1037 }
1038
c33a8a25 1039 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 1040 slave->may_enable ? "true" : "false");
4d6fb5eb 1041
1b137691 1042 if (!bond_is_balanced(bond)) {
f620b43a
BP
1043 continue;
1044 }
1045
1046 /* Hashes. */
f620b43a
BP
1047 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1048 int hash = be - bond->hash;
1049
1050 if (be->slave != slave) {
1051 continue;
1052 }
1053
c33a8a25 1054 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
f620b43a
BP
1055 hash, be->tx_bytes / 1024);
1056
7b9f1974 1057 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1058 }
1059 }
fc1d4f01
EJ
1060 shash_destroy(&slave_shash);
1061 free(sorted_slaves);
c33a8a25
EJ
1062 ds_put_cstr(ds, "\n");
1063}
1064
1065static void
1066bond_unixctl_show(struct unixctl_conn *conn,
1067 int argc, const char *argv[],
1068 void *aux OVS_UNUSED)
1069{
1070 struct ds ds = DS_EMPTY_INITIALIZER;
1071
3bfd3972 1072 ovs_rwlock_rdlock(&rwlock);
c33a8a25
EJ
1073 if (argc > 1) {
1074 const struct bond *bond = bond_find(argv[1]);
1075
1076 if (!bond) {
bde9f75d 1077 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1078 goto out;
c33a8a25
EJ
1079 }
1080 bond_print_details(&ds, bond);
1081 } else {
1082 const struct bond *bond;
1083
3bfd3972 1084 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
c33a8a25
EJ
1085 bond_print_details(&ds, bond);
1086 }
1087 }
1088
bde9f75d 1089 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a 1090 ds_destroy(&ds);
3bfd3972
EJ
1091
1092out:
1093 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1094}
1095
1096static void
0e15264f
BP
1097bond_unixctl_migrate(struct unixctl_conn *conn,
1098 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1099 void *aux OVS_UNUSED)
1100{
0e15264f
BP
1101 const char *bond_s = argv[1];
1102 const char *hash_s = argv[2];
1103 const char *slave_s = argv[3];
f620b43a
BP
1104 struct bond *bond;
1105 struct bond_slave *slave;
1106 struct bond_entry *entry;
1107 int hash;
1108
3bfd3972 1109 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1110 bond = bond_find(bond_s);
1111 if (!bond) {
bde9f75d 1112 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1113 goto out;
f620b43a
BP
1114 }
1115
1116 if (bond->balance != BM_SLB) {
bde9f75d 1117 unixctl_command_reply_error(conn, "not an SLB bond");
3bfd3972 1118 goto out;
f620b43a
BP
1119 }
1120
1121 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1122 hash = atoi(hash_s) & BOND_MASK;
1123 } else {
bde9f75d 1124 unixctl_command_reply_error(conn, "bad hash");
3bfd3972 1125 goto out;
f620b43a
BP
1126 }
1127
1128 slave = bond_lookup_slave(bond, slave_s);
1129 if (!slave) {
bde9f75d 1130 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1131 goto out;
f620b43a
BP
1132 }
1133
1134 if (!slave->enabled) {
bde9f75d 1135 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
3bfd3972 1136 goto out;
f620b43a
BP
1137 }
1138
1139 entry = &bond->hash[hash];
4a1b8f30 1140 bond->bond_revalidate = true;
f620b43a 1141 entry->slave = slave;
bde9f75d 1142 unixctl_command_reply(conn, "migrated");
3bfd3972
EJ
1143
1144out:
1145 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1146}
1147
1148static void
0e15264f
BP
1149bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1150 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1151 void *aux OVS_UNUSED)
1152{
0e15264f
BP
1153 const char *bond_s = argv[1];
1154 const char *slave_s = argv[2];
f620b43a
BP
1155 struct bond *bond;
1156 struct bond_slave *slave;
1157
3bfd3972 1158 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1159 bond = bond_find(bond_s);
1160 if (!bond) {
bde9f75d 1161 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1162 goto out;
f620b43a
BP
1163 }
1164
1165 slave = bond_lookup_slave(bond, slave_s);
1166 if (!slave) {
bde9f75d 1167 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1168 goto out;
f620b43a
BP
1169 }
1170
1171 if (!slave->enabled) {
bde9f75d 1172 unixctl_command_reply_error(conn, "cannot make disabled slave active");
3bfd3972 1173 goto out;
f620b43a
BP
1174 }
1175
1176 if (bond->active_slave != slave) {
4a1b8f30 1177 bond->bond_revalidate = true;
f620b43a 1178 bond->active_slave = slave;
f620b43a
BP
1179 VLOG_INFO("bond %s: active interface is now %s",
1180 bond->name, slave->name);
1181 bond->send_learning_packets = true;
bde9f75d 1182 unixctl_command_reply(conn, "done");
f620b43a 1183 } else {
bde9f75d 1184 unixctl_command_reply(conn, "no change");
f620b43a 1185 }
3bfd3972
EJ
1186out:
1187 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1188}
1189
1190static void
0e15264f 1191enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1192{
0e15264f
BP
1193 const char *bond_s = argv[1];
1194 const char *slave_s = argv[2];
f620b43a
BP
1195 struct bond *bond;
1196 struct bond_slave *slave;
1197
3bfd3972 1198 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1199 bond = bond_find(bond_s);
1200 if (!bond) {
bde9f75d 1201 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1202 goto out;
f620b43a
BP
1203 }
1204
1205 slave = bond_lookup_slave(bond, slave_s);
1206 if (!slave) {
bde9f75d 1207 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1208 goto out;
f620b43a
BP
1209 }
1210
4a1b8f30 1211 bond_enable_slave(slave, enable);
bde9f75d 1212 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
3bfd3972
EJ
1213
1214out:
1215 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1216}
1217
1218static void
0e15264f
BP
1219bond_unixctl_enable_slave(struct unixctl_conn *conn,
1220 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1221 void *aux OVS_UNUSED)
1222{
0e15264f 1223 enable_slave(conn, argv, true);
f620b43a
BP
1224}
1225
1226static void
0e15264f
BP
1227bond_unixctl_disable_slave(struct unixctl_conn *conn,
1228 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1229 void *aux OVS_UNUSED)
1230{
0e15264f 1231 enable_slave(conn, argv, false);
f620b43a
BP
1232}
1233
1234static void
0e15264f 1235bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1236 void *aux OVS_UNUSED)
1237{
0e15264f
BP
1238 const char *mac_s = argv[1];
1239 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1240 const char *basis_s = argc > 3 ? argv[3] : NULL;
f620b43a
BP
1241 uint8_t mac[ETH_ADDR_LEN];
1242 uint8_t hash;
1243 char *hash_cstr;
1244 unsigned int vlan;
672d18b2 1245 uint32_t basis;
f620b43a
BP
1246
1247 if (vlan_s) {
1248 if (sscanf(vlan_s, "%u", &vlan) != 1) {
bde9f75d 1249 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1250 return;
1251 }
1252 } else {
dc155bff 1253 vlan = 0;
f620b43a
BP
1254 }
1255
672d18b2
EJ
1256 if (basis_s) {
1257 if (sscanf(basis_s, "%"PRIu32, &basis) != 1) {
bde9f75d 1258 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1259 return;
1260 }
1261 } else {
1262 basis = 0;
1263 }
1264
f620b43a
BP
1265 if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
1266 == ETH_ADDR_SCAN_COUNT) {
672d18b2 1267 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1268
1269 hash_cstr = xasprintf("%u", hash);
bde9f75d 1270 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1271 free(hash_cstr);
1272 } else {
bde9f75d 1273 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1274 }
1275}
1276
1277void
1278bond_init(void)
1279{
0e15264f 1280 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1281 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1282 NULL);
0e15264f 1283 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1284 bond_unixctl_migrate, NULL);
0e15264f 1285 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1286 bond_unixctl_set_active_slave, NULL);
0e15264f 1287 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1288 bond_unixctl_enable_slave, NULL);
0e15264f 1289 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1290 bond_unixctl_disable_slave, NULL);
0e15264f 1291 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1292 bond_unixctl_hash, NULL);
f620b43a
BP
1293}
1294\f
95aafb2a
EJ
1295static void
1296bond_entry_reset(struct bond *bond)
1297{
1298 if (bond->balance != BM_AB) {
1299 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1300
1301 if (!bond->hash) {
1302 bond->hash = xmalloc(hash_len);
1303 }
1304 memset(bond->hash, 0, hash_len);
1305
1306 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1307 } else {
1308 free(bond->hash);
1309 bond->hash = NULL;
1310 }
1311}
1312
f620b43a
BP
1313static struct bond_slave *
1314bond_slave_lookup(struct bond *bond, const void *slave_)
1315{
1316 struct bond_slave *slave;
1317
1318 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1319 &bond->slaves) {
1320 if (slave->aux == slave_) {
1321 return slave;
1322 }
1323 }
1324
1325 return NULL;
1326}
1327
f620b43a 1328static void
4a1b8f30 1329bond_enable_slave(struct bond_slave *slave, bool enable)
f620b43a
BP
1330{
1331 slave->delay_expires = LLONG_MAX;
1332 if (enable != slave->enabled) {
4a1b8f30 1333 slave->bond->bond_revalidate = true;
f620b43a 1334 slave->enabled = enable;
4a1b8f30
EJ
1335 VLOG_INFO("interface %s: %s", slave->name,
1336 slave->enabled ? "enabled" : "disabled");
f620b43a
BP
1337 }
1338}
1339
1340static void
4a1b8f30 1341bond_link_status_update(struct bond_slave *slave)
f620b43a
BP
1342{
1343 struct bond *bond = slave->bond;
1344 bool up;
1345
296f6519 1346 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1347 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1348 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1349 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1350 slave->name, up ? "up" : "down");
1351 if (up == slave->enabled) {
1352 slave->delay_expires = LLONG_MAX;
1353 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1354 slave->name, up ? "disabled" : "enabled");
1355 } else {
bdebeece 1356 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1357 : up ? bond->updelay : bond->downdelay);
1358 slave->delay_expires = time_msec() + delay;
1359 if (delay) {
1360 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1361 "for %d ms",
1362 slave->name,
1363 up ? "enabled" : "disabled",
1364 up ? "up" : "down",
1365 delay);
1366 }
1367 }
1368 }
1369
1370 if (time_msec() >= slave->delay_expires) {
4a1b8f30 1371 bond_enable_slave(slave, up);
f620b43a
BP
1372 }
1373}
1374
f620b43a 1375static unsigned int
672d18b2 1376bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
f620b43a 1377{
672d18b2 1378 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
f620b43a
BP
1379}
1380
1381static unsigned int
672d18b2 1382bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
f620b43a
BP
1383{
1384 struct flow hash_flow = *flow;
d84d4b88 1385 hash_flow.vlan_tci = htons(vlan);
f620b43a
BP
1386
1387 /* The symmetric quality of this hash function is not required, but
1388 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1389 * purposes, so we use it out of convenience. */
672d18b2 1390 return flow_hash_symmetric_l4(&hash_flow, basis);
f620b43a
BP
1391}
1392
fb0b29a3
EJ
1393static unsigned int
1394bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1395{
cb22974d 1396 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1397
bdebeece 1398 return (bond->balance == BM_TCP
672d18b2
EJ
1399 ? bond_hash_tcp(flow, vlan, bond->basis)
1400 : bond_hash_src(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1401}
1402
f620b43a
BP
1403static struct bond_entry *
1404lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1405 uint16_t vlan)
1406{
fb0b29a3 1407 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1408}
1409
1410static struct bond_slave *
1411choose_output_slave(const struct bond *bond, const struct flow *flow,
4a1b8f30 1412 struct flow_wildcards *wc, uint16_t vlan)
f620b43a
BP
1413{
1414 struct bond_entry *e;
1415
bdebeece
EJ
1416 if (bond->lacp_status == LACP_CONFIGURED) {
1417 /* LACP has been configured on this bond but negotiations were
1418 * unsuccussful. Drop all traffic. */
1419 return NULL;
1420 }
1421
f620b43a
BP
1422 switch (bond->balance) {
1423 case BM_AB:
1424 return bond->active_slave;
1425
f620b43a 1426 case BM_TCP:
bdebeece
EJ
1427 if (bond->lacp_status != LACP_NEGOTIATED) {
1428 /* Must have LACP negotiations for TCP balanced bonds. */
1429 return NULL;
1430 }
bcd2633a 1431 if (wc) {
6cdd5145 1432 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
bcd2633a 1433 }
bdebeece
EJ
1434 /* Fall Through. */
1435 case BM_SLB:
bcd2633a 1436 if (wc) {
6cdd5145 1437 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
bcd2633a 1438 }
f620b43a
BP
1439 e = lookup_bond_entry(bond, flow, vlan);
1440 if (!e->slave || !e->slave->enabled) {
c804cadf
EJ
1441 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1442 struct bond_slave, hmap_node);
1443 if (!e->slave->enabled) {
1444 e->slave = bond->active_slave;
1445 }
f620b43a
BP
1446 }
1447 return e->slave;
1448
1449 default:
1450 NOT_REACHED();
1451 }
1452}
1453
1454static struct bond_slave *
1455bond_choose_slave(const struct bond *bond)
1456{
1457 struct bond_slave *slave, *best;
1458
1459 /* Find an enabled slave. */
1460 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1461 if (slave->enabled) {
1462 return slave;
1463 }
1464 }
1465
1466 /* All interfaces are disabled. Find an interface that will be enabled
1467 * after its updelay expires. */
1468 best = NULL;
1469 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1470 if (slave->delay_expires != LLONG_MAX
296f6519 1471 && slave->may_enable
f620b43a
BP
1472 && (!best || slave->delay_expires < best->delay_expires)) {
1473 best = slave;
1474 }
1475 }
1476 return best;
1477}
1478
1479static void
4a1b8f30 1480bond_choose_active_slave(struct bond *bond)
f620b43a
BP
1481{
1482 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1483 struct bond_slave *old_active_slave = bond->active_slave;
1484
1485 bond->active_slave = bond_choose_slave(bond);
1486 if (bond->active_slave) {
1487 if (bond->active_slave->enabled) {
1488 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1489 bond->name, bond->active_slave->name);
1490 } else {
1491 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1492 "remaining %lld ms updelay (since no interface was "
1493 "enabled)", bond->name, bond->active_slave->name,
1494 bond->active_slave->delay_expires - time_msec());
4a1b8f30 1495 bond_enable_slave(bond->active_slave, true);
f620b43a
BP
1496 }
1497
1498 bond->send_learning_packets = true;
1499 } else if (old_active_slave) {
d28b9ead 1500 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1501 }
1502}
1503
f620b43a
BP
1504/* Attempts to make the sum of the bond slaves' statistics appear on the fake
1505 * bond interface. */
1506static void
1507bond_update_fake_slave_stats(struct bond *bond)
1508{
1509 struct netdev_stats bond_stats;
1510 struct bond_slave *slave;
1511 struct netdev *bond_dev;
1512
1513 memset(&bond_stats, 0, sizeof bond_stats);
1514
1515 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1516 struct netdev_stats slave_stats;
1517
1518 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1519 /* XXX: We swap the stats here because they are swapped back when
1520 * reported by the internal device. The reason for this is
1521 * internal devices normally represent packets going into the
1522 * system but when used as fake bond device they represent packets
1523 * leaving the system. We really should do this in the internal
1524 * device itself because changing it here reverses the counts from
1525 * the perspective of the switch. However, the internal device
1526 * doesn't know what type of device it represents so we have to do
1527 * it here for now. */
1528 bond_stats.tx_packets += slave_stats.rx_packets;
1529 bond_stats.tx_bytes += slave_stats.rx_bytes;
1530 bond_stats.rx_packets += slave_stats.tx_packets;
1531 bond_stats.rx_bytes += slave_stats.tx_bytes;
1532 }
1533 }
1534
18812dff 1535 if (!netdev_open(bond->name, "system", &bond_dev)) {
f620b43a
BP
1536 netdev_set_stats(bond_dev, &bond_stats);
1537 netdev_close(bond_dev);
1538 }
1539}