]> git.proxmox.com Git - mirror_ovs.git/blame - ofproto/bond.c
ofproto: Remove arbitrary handler thread limit.
[mirror_ovs.git] / ofproto / bond.c
CommitLineData
f620b43a 1/*
09a5d390 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a
BP
25
26#include "coverage.h"
27#include "dynamic-string.h"
28#include "flow.h"
29#include "hmap.h"
bdebeece 30#include "lacp.h"
f620b43a
BP
31#include "list.h"
32#include "netdev.h"
33#include "odp-util.h"
34#include "ofpbuf.h"
35#include "packets.h"
36#include "poll-loop.h"
fc1d4f01 37#include "shash.h"
f620b43a
BP
38#include "timeval.h"
39#include "unixctl.h"
40#include "vlog.h"
41
42VLOG_DEFINE_THIS_MODULE(bond);
43
f620b43a
BP
/* Bit-mask for hashing a flow down to a bucket.
 * There are (BOND_MASK + 1) buckets, that is, 256 with the mask below. */
#define BOND_MASK 0xff
47
/* A hash bucket for mapping a flow to a slave.
 * "struct bond" has an array of (BOND_MASK + 1) of these, reached through its
 * 'hash' member. */
struct bond_entry {
    struct bond_slave *slave;   /* Assigned slave, NULL if unassigned. */
    uint64_t tx_bytes;          /* Count of bytes recently transmitted. */
    struct list list_node;      /* In bond_slave's 'entries' list. */
};
55
/* A bond slave, that is, one of the links comprising a bond. */
struct bond_slave {
    struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
    struct bond *bond;          /* The bond that contains this slave. */
    void *aux;                  /* Client-provided handle for this slave. */

    struct netdev *netdev;      /* Network device, owned by the client. */
    unsigned int change_seq;    /* Tracks changes in 'netdev'. */
    char *name;                 /* Name (a copy of netdev_get_name(netdev)). */

    /* Link status. */
    long long delay_expires;    /* Time after which 'enabled' may change;
                                 * LLONG_MAX when no change is pending. */
    bool enabled;               /* May be chosen for flows? */
    bool may_enable;            /* Client considers this slave bondable. */

    /* Rebalancing info.  Used only by bond_rebalance(). */
    struct list bal_node;       /* In bond_rebalance()'s 'bals' list. */
    struct list entries;        /* 'struct bond_entry's assigned here. */
    uint64_t tx_bytes;          /* Sum across 'tx_bytes' of entries. */
};
76
/* A bond, that is, a set of network devices grouped to improve performance or
 * robustness. */
struct bond {
    struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
    char *name;                 /* Name provided by client. */

    /* Slaves. */
    struct hmap slaves;         /* Contains "struct bond_slave"s. */

    /* Bonding info. */
    enum bond_mode balance;     /* Balancing mode, one of BM_*. */
    struct bond_slave *active_slave; /* Currently active slave, or NULL. */
    int updelay, downdelay;     /* Delay before slave goes up/down, in ms. */
    enum lacp_status lacp_status; /* Status of LACP negotiations. */
    bool bond_revalidate;       /* True if flows need revalidation. */
    uint32_t basis;             /* Basis for flow hash function. */

    /* SLB specific bonding info. */
    struct bond_entry *hash;     /* An array of (BOND_MASK + 1) elements. */
    int rebalance_interval;      /* Interval between rebalances, in ms. */
    long long int next_rebalance; /* Next rebalancing time. */
    bool send_learning_packets;  /* Read and then cleared by
                                  * bond_should_send_learning_packets(). */

    /* Legacy compatibility. */
    long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
    bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */

    atomic_int ref_cnt;         /* Reference count, starts at 1; see
                                 * bond_ref() and bond_unref(). */
};
106
3bfd3972
EJ
/* 'rwlock' guards 'all_bonds' (see the OVS_GUARDED_BY annotation) and is the
 * lock named by the OVS_REQ_RDLOCK/OVS_REQ_WRLOCK annotations on the helpers
 * in this file.  'all_bonds' is a constant pointer to the one hmap that holds
 * every bond. */
static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
f620b43a 110
3bfd3972
EJ
111static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
112static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
113 OVS_REQ_RDLOCK(rwlock);
4a1b8f30
EJ
114static void bond_enable_slave(struct bond_slave *, bool enable)
115 OVS_REQ_WRLOCK(rwlock);
116static void bond_link_status_update(struct bond_slave *)
3bfd3972 117 OVS_REQ_WRLOCK(rwlock);
4a1b8f30 118static void bond_choose_active_slave(struct bond *)
3bfd3972 119 OVS_REQ_WRLOCK(rwlock);;
f620b43a 120static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
672d18b2
EJ
121 uint16_t vlan, uint32_t basis);
122static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
123 uint32_t basis);
f620b43a
BP
124static struct bond_entry *lookup_bond_entry(const struct bond *,
125 const struct flow *,
3bfd3972
EJ
126 uint16_t vlan)
127 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
128static struct bond_slave *choose_output_slave(const struct bond *,
129 const struct flow *,
bcd2633a 130 struct flow_wildcards *,
4a1b8f30 131 uint16_t vlan)
3bfd3972
EJ
132 OVS_REQ_RDLOCK(rwlock);
133static void bond_update_fake_slave_stats(struct bond *)
134 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
135
136/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
137 * stores the mode in '*balance' and returns true. Otherwise returns false
138 * without modifying '*balance'. */
139bool
140bond_mode_from_string(enum bond_mode *balance, const char *s)
141{
142 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
143 *balance = BM_TCP;
144 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
145 *balance = BM_SLB;
146 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
147 *balance = BM_AB;
148 } else {
149 return false;
150 }
151 return true;
152}
153
154/* Returns a string representing 'balance'. */
155const char *
156bond_mode_to_string(enum bond_mode balance) {
157 switch (balance) {
158 case BM_TCP:
159 return "balance-tcp";
160 case BM_SLB:
161 return "balance-slb";
162 case BM_AB:
163 return "active-backup";
164 }
165 NOT_REACHED();
166}
167
f620b43a
BP
168\f
169/* Creates and returns a new bond whose configuration is initially taken from
170 * 's'.
171 *
172 * The caller should register each slave on the new bond by calling
173 * bond_slave_register(). */
174struct bond *
175bond_create(const struct bond_settings *s)
176{
177 struct bond *bond;
178
179 bond = xzalloc(sizeof *bond);
180 hmap_init(&bond->slaves);
f620b43a 181 bond->next_fake_iface_update = LLONG_MAX;
3bfd3972 182 atomic_init(&bond->ref_cnt, 1);
f620b43a
BP
183
184 bond_reconfigure(bond, s);
f620b43a
BP
185 return bond;
186}
187
03366a2d
EJ
188struct bond *
189bond_ref(const struct bond *bond_)
190{
191 struct bond *bond = CONST_CAST(struct bond *, bond_);
192
bca0b3b4 193 if (bond) {
3bfd3972
EJ
194 int orig;
195 atomic_add(&bond->ref_cnt, 1, &orig);
196 ovs_assert(orig > 0);
bca0b3b4 197 }
03366a2d
EJ
198 return bond;
199}
200
f620b43a
BP
201/* Frees 'bond'. */
202void
03366a2d 203bond_unref(struct bond *bond)
f620b43a
BP
204{
205 struct bond_slave *slave, *next_slave;
3bfd3972 206 int orig;
f620b43a
BP
207
208 if (!bond) {
209 return;
210 }
211
3bfd3972
EJ
212 atomic_sub(&bond->ref_cnt, 1, &orig);
213 ovs_assert(orig > 0);
214 if (orig != 1) {
03366a2d
EJ
215 return;
216 }
217
3bfd3972
EJ
218 ovs_rwlock_wrlock(&rwlock);
219 hmap_remove(all_bonds, &bond->hmap_node);
220 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
221
222 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
223 hmap_remove(&bond->slaves, &slave->hmap_node);
224 /* Client owns 'slave->netdev'. */
225 free(slave->name);
226 free(slave);
227 }
228 hmap_destroy(&bond->slaves);
229
230 free(bond->hash);
f620b43a
BP
231 free(bond->name);
232 free(bond);
233}
234
235/* Updates 'bond''s overall configuration to 's'.
236 *
237 * The caller should register each slave on 'bond' by calling
238 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 239 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
240 *
241 * Returns true if the configuration has changed in such a way that requires
242 * flow revalidation.
243 * */
244bool
f620b43a
BP
245bond_reconfigure(struct bond *bond, const struct bond_settings *s)
246{
59d7b2b6
EJ
247 bool revalidate = false;
248
3bfd3972 249 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
250 if (!bond->name || strcmp(bond->name, s->name)) {
251 if (bond->name) {
3bfd3972 252 hmap_remove(all_bonds, &bond->hmap_node);
f620b43a
BP
253 free(bond->name);
254 }
255 bond->name = xstrdup(s->name);
3bfd3972 256 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
f620b43a
BP
257 }
258
f620b43a
BP
259 bond->updelay = s->up_delay;
260 bond->downdelay = s->down_delay;
bc1b010c 261
9dd165e0
RK
262 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
263 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
264 revalidate = true;
265 }
266
bc1b010c
EJ
267 if (bond->rebalance_interval != s->rebalance_interval) {
268 bond->rebalance_interval = s->rebalance_interval;
269 revalidate = true;
270 }
f620b43a 271
59d7b2b6
EJ
272 if (bond->balance != s->balance) {
273 bond->balance = s->balance;
274 revalidate = true;
275 }
276
672d18b2
EJ
277 if (bond->basis != s->basis) {
278 bond->basis = s->basis;
279 revalidate = true;
280 }
281
f620b43a
BP
282 if (s->fake_iface) {
283 if (bond->next_fake_iface_update == LLONG_MAX) {
284 bond->next_fake_iface_update = time_msec();
285 }
286 } else {
287 bond->next_fake_iface_update = LLONG_MAX;
288 }
59d7b2b6 289
62904702
EJ
290 if (bond->bond_revalidate) {
291 revalidate = true;
292 bond->bond_revalidate = false;
293 }
294
95aafb2a
EJ
295 if (bond->balance == BM_AB || !bond->hash || revalidate) {
296 bond_entry_reset(bond);
297 }
298
3bfd3972 299 ovs_rwlock_unlock(&rwlock);
59d7b2b6 300 return revalidate;
f620b43a
BP
301}
302
f8ddccd2 303static void
1ea24138 304bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
3bfd3972 305 OVS_REQ_WRLOCK(rwlock)
f8ddccd2
BP
306{
307 if (slave->netdev != netdev) {
f8ddccd2 308 slave->netdev = netdev;
1ea24138 309 slave->change_seq = 0;
f8ddccd2
BP
310 }
311}
312
f620b43a
BP
313/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
314 * arbitrary client-provided pointer that uniquely identifies a slave within a
315 * bond. If 'slave_' already exists within 'bond' then this function
316 * reconfigures the existing slave.
317 *
318 * 'netdev' must be the network device that 'slave_' represents. It is owned
319 * by the client, so the client must not close it before either unregistering
320 * 'slave_' or destroying 'bond'.
4d6fb5eb 321 */
f620b43a 322void
df53d41c 323bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
f620b43a 324{
3bfd3972 325 struct bond_slave *slave;
f620b43a 326
3bfd3972
EJ
327 ovs_rwlock_wrlock(&rwlock);
328 slave = bond_slave_lookup(bond, slave_);
f620b43a
BP
329 if (!slave) {
330 slave = xzalloc(sizeof *slave);
331
332 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
333 slave->bond = bond;
334 slave->aux = slave_;
335 slave->delay_expires = LLONG_MAX;
244b2160 336 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 337 bond->bond_revalidate = true;
244b2160 338
b3c18f66 339 slave->enabled = false;
4a1b8f30 340 bond_enable_slave(slave, netdev_get_carrier(netdev));
f620b43a
BP
341 }
342
1ea24138 343 bond_slave_set_netdev__(slave, netdev);
a6934aa9 344
f620b43a
BP
345 free(slave->name);
346 slave->name = xstrdup(netdev_get_name(netdev));
3bfd3972 347 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
348}
349
f8ddccd2
BP
350/* Updates the network device to be used with 'slave_' to 'netdev'.
351 *
352 * This is useful if the caller closes and re-opens the network device
353 * registered with bond_slave_register() but doesn't need to change anything
354 * else. */
355void
356bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
357{
3bfd3972
EJ
358 struct bond_slave *slave;
359
360 ovs_rwlock_wrlock(&rwlock);
361 slave = bond_slave_lookup(bond, slave_);
f8ddccd2 362 if (slave) {
1ea24138 363 bond_slave_set_netdev__(slave, netdev);
f8ddccd2 364 }
3bfd3972 365 ovs_rwlock_unlock(&rwlock);
f8ddccd2
BP
366}
367
f620b43a
BP
368/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
369 * then this function has no effect.
370 *
371 * Unregistering a slave invalidates all flows. */
372void
373bond_slave_unregister(struct bond *bond, const void *slave_)
374{
3bfd3972 375 struct bond_slave *slave;
f620b43a
BP
376 bool del_active;
377
3bfd3972
EJ
378 ovs_rwlock_wrlock(&rwlock);
379 slave = bond_slave_lookup(bond, slave_);
f620b43a 380 if (!slave) {
3bfd3972 381 goto out;
f620b43a
BP
382 }
383
4a1b8f30
EJ
384 bond->bond_revalidate = true;
385 bond_enable_slave(slave, false);
b3c18f66 386
f620b43a
BP
387 del_active = bond->active_slave == slave;
388 if (bond->hash) {
389 struct bond_entry *e;
390 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
391 if (e->slave == slave) {
392 e->slave = NULL;
393 }
394 }
395 }
396
397 free(slave->name);
398
399 hmap_remove(&bond->slaves, &slave->hmap_node);
400 /* Client owns 'slave->netdev'. */
401 free(slave);
402
403 if (del_active) {
4a1b8f30 404 bond_choose_active_slave(bond);
f620b43a
BP
405 bond->send_learning_packets = true;
406 }
3bfd3972
EJ
407out:
408 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
409}
410
296f6519
EJ
411/* Should be called on each slave in 'bond' before bond_run() to indicate
412 * whether or not 'slave_' may be enabled. This function is intended to allow
413 * other protocols to have some impact on bonding decisions. For example LACP
414 * or high level link monitoring protocols may decide that a given slave should
415 * not be able to send traffic. */
4d6fb5eb 416void
296f6519 417bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 418{
3bfd3972 419 ovs_rwlock_wrlock(&rwlock);
296f6519 420 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
3bfd3972 421 ovs_rwlock_unlock(&rwlock);
4d6fb5eb
EJ
422}
423
4a1b8f30
EJ
424/* Performs periodic maintenance on 'bond'.
425 *
426 * Returns true if the caller should revalidate its flows.
f620b43a
BP
427 *
428 * The caller should check bond_should_send_learning_packets() afterward. */
4a1b8f30
EJ
429bool
430bond_run(struct bond *bond, enum lacp_status lacp_status)
f620b43a
BP
431{
432 struct bond_slave *slave;
4a1b8f30 433 bool revalidate;
f620b43a 434
3bfd3972 435 ovs_rwlock_wrlock(&rwlock);
bdebeece
EJ
436 if (bond->lacp_status != lacp_status) {
437 bond->lacp_status = lacp_status;
4592d0e2
EJ
438 bond->bond_revalidate = true;
439 }
4d6fb5eb 440
f620b43a
BP
441 /* Enable slaves based on link status and LACP feedback. */
442 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
4a1b8f30 443 bond_link_status_update(slave);
1ea24138 444 slave->change_seq = netdev_change_seq(slave->netdev);
f620b43a
BP
445 }
446 if (!bond->active_slave || !bond->active_slave->enabled) {
4a1b8f30 447 bond_choose_active_slave(bond);
f620b43a
BP
448 }
449
450 /* Update fake bond interface stats. */
451 if (time_msec() >= bond->next_fake_iface_update) {
452 bond_update_fake_slave_stats(bond);
453 bond->next_fake_iface_update = time_msec() + 1000;
454 }
455
4a1b8f30
EJ
456 revalidate = bond->bond_revalidate;
457 bond->bond_revalidate = false;
3bfd3972 458 ovs_rwlock_unlock(&rwlock);
4a1b8f30
EJ
459
460 return revalidate;
f620b43a
BP
461}
462
463/* Causes poll_block() to wake up when 'bond' needs something to be done. */
464void
465bond_wait(struct bond *bond)
466{
467 struct bond_slave *slave;
468
3bfd3972 469 ovs_rwlock_rdlock(&rwlock);
f620b43a
BP
470 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
471 if (slave->delay_expires != LLONG_MAX) {
472 poll_timer_wait_until(slave->delay_expires);
473 }
1ea24138
EJ
474
475 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
476 poll_immediate_wake();
477 }
f620b43a
BP
478 }
479
480 if (bond->next_fake_iface_update != LLONG_MAX) {
481 poll_timer_wait_until(bond->next_fake_iface_update);
482 }
483
bbc13389 484 if (bond->bond_revalidate) {
f620b43a
BP
485 poll_immediate_wake();
486 }
3bfd3972 487 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
488
489 /* We don't wait for bond->next_rebalance because rebalancing can only run
490 * at a flow account checkpoint. ofproto does checkpointing on its own
491 * schedule and bond_rebalance() gets called afterward, so we'd just be
492 * waking up for no purpose. */
493}
494\f
495/* MAC learning table interaction. */
496
497static bool
498may_send_learning_packets(const struct bond *bond)
499{
9dd165e0
RK
500 return ((bond->lacp_status == LACP_DISABLED
501 && (bond->balance == BM_SLB || bond->balance == BM_AB))
502 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
bdebeece 503 && bond->active_slave;
f620b43a
BP
504}
505
506/* Returns true if 'bond' needs the client to send out packets to assist with
507 * MAC learning on 'bond'. If this function returns true, then the client
508 * should iterate through its MAC learning table for the bridge on which 'bond'
509 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 510 * it should call bond_compose_learning_packet().
f620b43a 511 *
477879ea
BP
512 * This function will only return true if 'bond' is in SLB or active-backup
513 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
514 * necessary.
f620b43a
BP
515 *
516 * Calling this function resets the state that it checks. */
517bool
518bond_should_send_learning_packets(struct bond *bond)
519{
3bfd3972
EJ
520 bool send;
521
522 ovs_rwlock_wrlock(&rwlock);
523 send = bond->send_learning_packets && may_send_learning_packets(bond);
f620b43a 524 bond->send_learning_packets = false;
3bfd3972 525 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
526 return send;
527}
528
529/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
530 *
ea131871
JG
531 * See bond_should_send_learning_packets() for description of usage. The
532 * caller should send the composed packet on the port associated with
533 * port_aux and takes ownership of the returned ofpbuf. */
534struct ofpbuf *
535bond_compose_learning_packet(struct bond *bond,
536 const uint8_t eth_src[ETH_ADDR_LEN],
537 uint16_t vlan, void **port_aux)
f620b43a
BP
538{
539 struct bond_slave *slave;
ea131871 540 struct ofpbuf *packet;
f620b43a 541 struct flow flow;
f620b43a 542
3bfd3972 543 ovs_rwlock_rdlock(&rwlock);
cb22974d 544 ovs_assert(may_send_learning_packets(bond));
f620b43a
BP
545 memset(&flow, 0, sizeof flow);
546 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
4a1b8f30 547 slave = choose_output_slave(bond, &flow, NULL, vlan);
f620b43a 548
ea131871 549 packet = ofpbuf_new(0);
2ea838ac 550 compose_rarp(packet, eth_src);
f620b43a 551 if (vlan) {
ea131871 552 eth_push_vlan(packet, htons(vlan));
f620b43a 553 }
f620b43a 554
ea131871 555 *port_aux = slave->aux;
3bfd3972 556 ovs_rwlock_unlock(&rwlock);
ea131871 557 return packet;
f620b43a
BP
558}
559\f
/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
 * Ethernet destination address of 'eth_dst', should be admitted.
 *
 * The return value is one of the following:
 *
 *    - BV_ACCEPT: Admit the packet.
 *
 *    - BV_DROP: Drop the packet.  This is also the verdict when 'slave_' is
 *      not registered on 'bond'.
 *
 *    - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
 *      Ethernet source address and VLAN.  If there is none, or if the packet
 *      is on the learned port, then admit the packet.  If a different port has
 *      been learned, however, drop the packet (and do not use it for MAC
 *      learning).
 */
enum bond_verdict
bond_check_admissibility(struct bond *bond, const void *slave_,
                         const uint8_t eth_dst[ETH_ADDR_LEN])
{
    /* Every early "goto out" below that does not set 'verdict' drops. */
    enum bond_verdict verdict = BV_DROP;
    struct bond_slave *slave;

    ovs_rwlock_rdlock(&rwlock);
    slave = bond_slave_lookup(bond, slave_);
    if (!slave) {
        goto out;
    }

    /* LACP bonds have very loose admissibility restrictions because we can
     * assume the remote switch is aware of the bond and will "do the right
     * thing".  However, as a precaution we drop packets on disabled slaves
     * because no correctly implemented partner switch should be sending
     * packets to them.
     *
     * If LACP is configured, but LACP negotiations have been unsuccessful, we
     * drop all incoming traffic except if lacp_fallback_ab is enabled. */
    switch (bond->lacp_status) {
    case LACP_NEGOTIATED:
        verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
        goto out;
    case LACP_CONFIGURED:
        if (!bond->lacp_fallback_ab) {
            goto out;
        }
        /* Fall through: with fallback enabled, treat like LACP_DISABLED. */
    case LACP_DISABLED:
        break;
    }

    /* Drop all multicast packets on inactive slaves. */
    if (eth_addr_is_multicast(eth_dst)) {
        if (bond->active_slave != slave) {
            goto out;
        }
    }

    switch (bond->balance) {
    case BM_TCP:
        /* TCP balanced bonds require successful LACP negotiations.  Based on
         * the above check, LACP is off or lacp_fallback_ab is true on this
         * bond.  If lacp_fallback_ab is true fall through to the BM_AB case;
         * otherwise, we drop all incoming traffic. */
        if (!bond->lacp_fallback_ab) {
            goto out;
        }
        /* Fall through. */

    case BM_AB:
        /* Drop all packets which arrive on backup slaves.  This is similar to
         * how Linux bonding handles active-backup bonds. */
        if (bond->active_slave != slave) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

            VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
                        " slave (%s) destined for " ETH_ADDR_FMT,
                        slave->name, ETH_ADDR_ARGS(eth_dst));
            goto out;
        }
        verdict = BV_ACCEPT;
        goto out;

    case BM_SLB:
        /* Drop all packets for which we have learned a different input port,
         * because we probably sent the packet on one slave and got it back on
         * the other.  Gratuitous ARP packets are an exception to this rule:
         * the host has moved to another switch.  The exception to the
         * exception is if we locked the learning table to avoid reflections on
         * bond slaves. */
        verdict = BV_DROP_IF_MOVED;
        goto out;
    }

    /* Only reached if 'bond->balance' matches none of the cases above. */
    NOT_REACHED();
out:
    ovs_rwlock_unlock(&rwlock);
    return verdict;
}
656
657/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
658 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
659 * NULL if the packet should be dropped because no slaves are enabled.
660 *
661 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
662 * should be a VID only (i.e. excluding the PCP bits). Second,
663 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
664 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
665 * packet belongs to (so for an access port it will be the access port's VLAN).
666 *
bcd2633a
JP
667 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
668 * significant in the selection. At some point earlier, 'wc' should
669 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
670 */
671void *
672bond_choose_output_slave(struct bond *bond, const struct flow *flow,
4a1b8f30 673 struct flow_wildcards *wc, uint16_t vlan)
f620b43a 674{
3bfd3972 675 struct bond_slave *slave;
b5d5d7d3 676 void *aux;
3bfd3972
EJ
677
678 ovs_rwlock_rdlock(&rwlock);
4a1b8f30 679 slave = choose_output_slave(bond, flow, wc, vlan);
b5d5d7d3 680 aux = slave ? slave->aux : NULL;
3bfd3972 681 ovs_rwlock_unlock(&rwlock);
b5d5d7d3
AW
682
683 return aux;
f620b43a 684}
f620b43a
BP
685\f
686/* Rebalancing. */
687
1b137691 688static bool
3bfd3972 689bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
1b137691 690{
bc1b010c
EJ
691 return bond->rebalance_interval
692 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
693}
694
f620b43a
BP
695/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
696void
697bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
698 uint64_t n_bytes)
699{
3bfd3972 700 ovs_rwlock_wrlock(&rwlock);
1b137691 701 if (bond_is_balanced(bond)) {
f620b43a 702 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a 703 }
3bfd3972 704 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
705}
706
707static struct bond_slave *
3bfd3972 708bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
709{
710 return CONTAINER_OF(bal, struct bond_slave, bal_node);
711}
712
713static void
714log_bals(struct bond *bond, const struct list *bals)
715{
716 if (VLOG_IS_DBG_ENABLED()) {
717 struct ds ds = DS_EMPTY_INITIALIZER;
718 const struct bond_slave *slave;
719
720 LIST_FOR_EACH (slave, bal_node, bals) {
721 if (ds.length) {
722 ds_put_char(&ds, ',');
723 }
724 ds_put_format(&ds, " %s %"PRIu64"kB",
725 slave->name, slave->tx_bytes / 1024);
726
727 if (!slave->enabled) {
728 ds_put_cstr(&ds, " (disabled)");
729 }
730 if (!list_is_empty(&slave->entries)) {
731 struct bond_entry *e;
732
733 ds_put_cstr(&ds, " (");
734 LIST_FOR_EACH (e, list_node, &slave->entries) {
735 if (&e->list_node != list_front(&slave->entries)) {
736 ds_put_cstr(&ds, " + ");
737 }
34582733 738 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
f620b43a
BP
739 e - bond->hash, e->tx_bytes / 1024);
740 }
741 ds_put_cstr(&ds, ")");
742 }
743 }
744 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
745 ds_destroy(&ds);
746 }
747}
748
749/* Shifts 'hash' from its current slave to 'to'. */
750static void
4a1b8f30 751bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
f620b43a
BP
752{
753 struct bond_slave *from = hash->slave;
754 struct bond *bond = from->bond;
755 uint64_t delta = hash->tx_bytes;
756
34582733 757 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
f620b43a
BP
758 "from %s to %s (now carrying %"PRIu64"kB and "
759 "%"PRIu64"kB load, respectively)",
760 bond->name, delta / 1024, hash - bond->hash,
761 from->name, to->name,
762 (from->tx_bytes - delta) / 1024,
763 (to->tx_bytes + delta) / 1024);
764
765 /* Shift load away from 'from' to 'to'. */
766 from->tx_bytes -= delta;
767 to->tx_bytes += delta;
768
769 /* Arrange for flows to be revalidated. */
dc30ea2d 770 hash->slave = to;
4a1b8f30 771 bond->bond_revalidate = true;
f620b43a
BP
772}
773
09a5d390
BP
774/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
775 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
776 * given that doing so must decrease the ratio of the load on the two slaves by
777 * at least 0.1. Returns NULL if there is no appropriate entry.
778 *
779 * The list of entries isn't sorted. I don't know of a reason to prefer to
780 * shift away small hashes or large hashes. */
781static struct bond_entry *
782choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
783{
784 struct bond_entry *e;
785
786 if (list_is_short(&from->entries)) {
787 /* 'from' carries no more than one MAC hash, so shifting load away from
788 * it would be pointless. */
789 return NULL;
790 }
791
792 LIST_FOR_EACH (e, list_node, &from->entries) {
793 double old_ratio, new_ratio;
794 uint64_t delta;
795
796 if (to_tx_bytes == 0) {
797 /* Nothing on the new slave, move it. */
798 return e;
799 }
800
801 delta = e->tx_bytes;
802 old_ratio = (double)from->tx_bytes / to_tx_bytes;
803 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
75fad143
ZK
804 if (old_ratio - new_ratio > 0.1
805 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
806 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
807 and 'to' slave have the same load. Therefore, we only move an
808 entry if it decreases the load on 'from', and brings us closer
809 to equal traffic load. */
f620b43a
BP
810 return e;
811 }
812 }
813
814 return NULL;
815}
816
817/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
818 * maintained. */
819static void
820insert_bal(struct list *bals, struct bond_slave *slave)
821{
822 struct bond_slave *pos;
823
824 LIST_FOR_EACH (pos, bal_node, bals) {
825 if (slave->tx_bytes > pos->tx_bytes) {
826 break;
827 }
828 }
829 list_insert(&pos->bal_node, &slave->bal_node);
830}
831
832/* Removes 'slave' from its current list and then inserts it into 'bals' so
833 * that descending order of 'tx_bytes' is maintained. */
834static void
835reinsert_bal(struct list *bals, struct bond_slave *slave)
836{
837 list_remove(&slave->bal_node);
838 insert_bal(bals, slave);
839}
840
/* If 'bond' needs rebalancing, does so.
 *
 * Does nothing unless bond_is_balanced() holds for 'bond' and the bond's next
 * rebalance time has been reached.
 *
 * The caller should have called bond_account() for each active flow, to ensure
 * that flow data is consistently accounted at this point. */
void
bond_rebalance(struct bond *bond)
{
    struct bond_slave *slave;
    struct bond_entry *e;
    struct list bals;

    ovs_rwlock_wrlock(&rwlock);
    if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
        ovs_rwlock_unlock(&rwlock);
        return;
    }
    bond->next_rebalance = time_msec() + bond->rebalance_interval;

    /* Add each bond_entry to its slave's 'entries' list.
     * Compute each slave's tx_bytes as the sum of its entries' tx_bytes.
     * Entries with no slave or with no recorded traffic are left out. */
    HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
        slave->tx_bytes = 0;
        list_init(&slave->entries);
    }
    for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
        if (e->slave && e->tx_bytes) {
            e->slave->tx_bytes += e->tx_bytes;
            list_push_back(&e->slave->entries, &e->list_node);
        }
    }

    /* Add enabled slaves to 'bals' in descending order of tx_bytes.
     *
     * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
     * with a proper list sort algorithm. */
    list_init(&bals);
    HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
        if (slave->enabled) {
            insert_bal(&bals, slave);
        }
    }
    log_bals(bond, &bals);

    /* Shift load from the most-loaded slaves to the least-loaded slaves. */
    while (!list_is_short(&bals)) {
        struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
        struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
        uint64_t overload;

        overload = from->tx_bytes - to->tx_bytes;
        /* 'to->tx_bytes >> 5' is 1/32 of the lightest slave's load. */
        if (overload < to->tx_bytes >> 5 || overload < 100000) {
            /* The extra load on 'from' (and all less-loaded slaves), compared
             * to that of 'to' (the least-loaded slave), is less than ~3%, or
             * it is less than ~1Mbps.  No point in rebalancing. */
            break;
        }

        /* 'from' is carrying significantly more load than 'to'.  Pick a hash
         * to move from 'from' to 'to'. */
        e = choose_entry_to_migrate(from, to->tx_bytes);
        if (e) {
            bond_shift_load(e, to);

            /* Delete element from from->entries.
             *
             * We don't add the element to to->entries.  That would only allow
             * 'e' to be migrated to another slave in this rebalancing run, and
             * there is no point in doing that. */
            list_remove(&e->list_node);

            /* Re-sort 'bals'. */
            reinsert_bal(&bals, from);
            reinsert_bal(&bals, to);
        } else {
            /* Can't usefully migrate anything away from 'from'.
             * Don't reconsider it. */
            list_remove(&from->bal_node);
        }
    }

    /* Implement exponentially weighted moving average.  A weight of 1/2 causes
     * historical data to decay to <1% in 7 rebalancing runs.  1,000,000 bytes
     * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
    for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
        e->tx_bytes /= 2;
        if (!e->tx_bytes) {
            /* A bucket whose count has decayed to zero loses its slave. */
            e->slave = NULL;
        }
    }
    ovs_rwlock_unlock(&rwlock);
}
932\f
933/* Bonding unixctl user interface functions. */
934
935static struct bond *
3bfd3972 936bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
937{
938 struct bond *bond;
939
940 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
3bfd3972 941 all_bonds) {
f620b43a
BP
942 if (!strcmp(bond->name, name)) {
943 return bond;
944 }
945 }
946 return NULL;
947}
948
949static struct bond_slave *
950bond_lookup_slave(struct bond *bond, const char *slave_name)
951{
952 struct bond_slave *slave;
953
954 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
955 if (!strcmp(slave->name, slave_name)) {
956 return slave;
957 }
958 }
959 return NULL;
960}
961
962static void
963bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
964 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
965 void *aux OVS_UNUSED)
f620b43a
BP
966{
967 struct ds ds = DS_EMPTY_INITIALIZER;
968 const struct bond *bond;
969
970 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
971
3bfd3972
EJ
972 ovs_rwlock_rdlock(&rwlock);
973 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
f620b43a
BP
974 const struct bond_slave *slave;
975 size_t i;
976
977 ds_put_format(&ds, "%s\t%s\t",
978 bond->name, bond_mode_to_string(bond->balance));
979
980 i = 0;
981 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
982 if (i++ > 0) {
983 ds_put_cstr(&ds, ", ");
984 }
985 ds_put_cstr(&ds, slave->name);
986 }
987 ds_put_char(&ds, '\n');
988 }
3bfd3972 989 ovs_rwlock_unlock(&rwlock);
bde9f75d 990 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
991 ds_destroy(&ds);
992}
993
994static void
c33a8a25 995bond_print_details(struct ds *ds, const struct bond *bond)
3bfd3972 996 OVS_REQ_RDLOCK(rwlock)
f620b43a 997{
fc1d4f01
EJ
998 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
999 const struct shash_node **sorted_slaves = NULL;
f620b43a 1000 const struct bond_slave *slave;
fc1d4f01 1001 int i;
f620b43a 1002
c33a8a25
EJ
1003 ds_put_format(ds, "---- %s ----\n", bond->name);
1004 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
1005 bond_mode_to_string(bond->balance));
1006
c33a8a25 1007 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 1008
c33a8a25
EJ
1009 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1010 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 1011
1b137691 1012 if (bond_is_balanced(bond)) {
c33a8a25 1013 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
1014 bond->next_rebalance - time_msec());
1015 }
1016
bdebeece
EJ
1017 ds_put_cstr(ds, "lacp_status: ");
1018 switch (bond->lacp_status) {
1019 case LACP_NEGOTIATED:
1020 ds_put_cstr(ds, "negotiated\n");
1021 break;
1022 case LACP_CONFIGURED:
1023 ds_put_cstr(ds, "configured\n");
1024 break;
1025 case LACP_DISABLED:
1026 ds_put_cstr(ds, "off\n");
1027 break;
1028 default:
1029 ds_put_cstr(ds, "<unknown>\n");
1030 break;
1031 }
4d6fb5eb 1032
f620b43a 1033 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
1034 shash_add(&slave_shash, slave->name, slave);
1035 }
1036 sorted_slaves = shash_sort(&slave_shash);
1037
1038 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 1039 struct bond_entry *be;
f620b43a 1040
fc1d4f01
EJ
1041 slave = sorted_slaves[i]->data;
1042
f620b43a 1043 /* Basic info. */
c33a8a25 1044 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
1045 slave->name, slave->enabled ? "enabled" : "disabled");
1046 if (slave == bond->active_slave) {
c33a8a25 1047 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
1048 }
1049 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 1050 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
1051 slave->enabled ? "downdelay" : "updelay",
1052 slave->delay_expires - time_msec());
1053 }
1054
c33a8a25 1055 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 1056 slave->may_enable ? "true" : "false");
4d6fb5eb 1057
1b137691 1058 if (!bond_is_balanced(bond)) {
f620b43a
BP
1059 continue;
1060 }
1061
1062 /* Hashes. */
f620b43a
BP
1063 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1064 int hash = be - bond->hash;
1065
1066 if (be->slave != slave) {
1067 continue;
1068 }
1069
c33a8a25 1070 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
f620b43a
BP
1071 hash, be->tx_bytes / 1024);
1072
7b9f1974 1073 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1074 }
1075 }
fc1d4f01
EJ
1076 shash_destroy(&slave_shash);
1077 free(sorted_slaves);
c33a8a25
EJ
1078 ds_put_cstr(ds, "\n");
1079}
1080
1081static void
1082bond_unixctl_show(struct unixctl_conn *conn,
1083 int argc, const char *argv[],
1084 void *aux OVS_UNUSED)
1085{
1086 struct ds ds = DS_EMPTY_INITIALIZER;
1087
3bfd3972 1088 ovs_rwlock_rdlock(&rwlock);
c33a8a25
EJ
1089 if (argc > 1) {
1090 const struct bond *bond = bond_find(argv[1]);
1091
1092 if (!bond) {
bde9f75d 1093 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1094 goto out;
c33a8a25
EJ
1095 }
1096 bond_print_details(&ds, bond);
1097 } else {
1098 const struct bond *bond;
1099
3bfd3972 1100 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
c33a8a25
EJ
1101 bond_print_details(&ds, bond);
1102 }
1103 }
1104
bde9f75d 1105 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a 1106 ds_destroy(&ds);
3bfd3972
EJ
1107
1108out:
1109 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1110}
1111
1112static void
0e15264f
BP
1113bond_unixctl_migrate(struct unixctl_conn *conn,
1114 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1115 void *aux OVS_UNUSED)
1116{
0e15264f
BP
1117 const char *bond_s = argv[1];
1118 const char *hash_s = argv[2];
1119 const char *slave_s = argv[3];
f620b43a
BP
1120 struct bond *bond;
1121 struct bond_slave *slave;
1122 struct bond_entry *entry;
1123 int hash;
1124
3bfd3972 1125 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1126 bond = bond_find(bond_s);
1127 if (!bond) {
bde9f75d 1128 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1129 goto out;
f620b43a
BP
1130 }
1131
1132 if (bond->balance != BM_SLB) {
bde9f75d 1133 unixctl_command_reply_error(conn, "not an SLB bond");
3bfd3972 1134 goto out;
f620b43a
BP
1135 }
1136
1137 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1138 hash = atoi(hash_s) & BOND_MASK;
1139 } else {
bde9f75d 1140 unixctl_command_reply_error(conn, "bad hash");
3bfd3972 1141 goto out;
f620b43a
BP
1142 }
1143
1144 slave = bond_lookup_slave(bond, slave_s);
1145 if (!slave) {
bde9f75d 1146 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1147 goto out;
f620b43a
BP
1148 }
1149
1150 if (!slave->enabled) {
bde9f75d 1151 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
3bfd3972 1152 goto out;
f620b43a
BP
1153 }
1154
1155 entry = &bond->hash[hash];
4a1b8f30 1156 bond->bond_revalidate = true;
f620b43a 1157 entry->slave = slave;
bde9f75d 1158 unixctl_command_reply(conn, "migrated");
3bfd3972
EJ
1159
1160out:
1161 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1162}
1163
1164static void
0e15264f
BP
1165bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1166 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1167 void *aux OVS_UNUSED)
1168{
0e15264f
BP
1169 const char *bond_s = argv[1];
1170 const char *slave_s = argv[2];
f620b43a
BP
1171 struct bond *bond;
1172 struct bond_slave *slave;
1173
3bfd3972 1174 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1175 bond = bond_find(bond_s);
1176 if (!bond) {
bde9f75d 1177 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1178 goto out;
f620b43a
BP
1179 }
1180
1181 slave = bond_lookup_slave(bond, slave_s);
1182 if (!slave) {
bde9f75d 1183 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1184 goto out;
f620b43a
BP
1185 }
1186
1187 if (!slave->enabled) {
bde9f75d 1188 unixctl_command_reply_error(conn, "cannot make disabled slave active");
3bfd3972 1189 goto out;
f620b43a
BP
1190 }
1191
1192 if (bond->active_slave != slave) {
4a1b8f30 1193 bond->bond_revalidate = true;
f620b43a 1194 bond->active_slave = slave;
f620b43a
BP
1195 VLOG_INFO("bond %s: active interface is now %s",
1196 bond->name, slave->name);
1197 bond->send_learning_packets = true;
bde9f75d 1198 unixctl_command_reply(conn, "done");
f620b43a 1199 } else {
bde9f75d 1200 unixctl_command_reply(conn, "no change");
f620b43a 1201 }
3bfd3972
EJ
1202out:
1203 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1204}
1205
1206static void
0e15264f 1207enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1208{
0e15264f
BP
1209 const char *bond_s = argv[1];
1210 const char *slave_s = argv[2];
f620b43a
BP
1211 struct bond *bond;
1212 struct bond_slave *slave;
1213
3bfd3972 1214 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1215 bond = bond_find(bond_s);
1216 if (!bond) {
bde9f75d 1217 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1218 goto out;
f620b43a
BP
1219 }
1220
1221 slave = bond_lookup_slave(bond, slave_s);
1222 if (!slave) {
bde9f75d 1223 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1224 goto out;
f620b43a
BP
1225 }
1226
4a1b8f30 1227 bond_enable_slave(slave, enable);
bde9f75d 1228 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
3bfd3972
EJ
1229
1230out:
1231 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1232}
1233
1234static void
0e15264f
BP
1235bond_unixctl_enable_slave(struct unixctl_conn *conn,
1236 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1237 void *aux OVS_UNUSED)
1238{
0e15264f 1239 enable_slave(conn, argv, true);
f620b43a
BP
1240}
1241
1242static void
0e15264f
BP
1243bond_unixctl_disable_slave(struct unixctl_conn *conn,
1244 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1245 void *aux OVS_UNUSED)
1246{
0e15264f 1247 enable_slave(conn, argv, false);
f620b43a
BP
1248}
1249
1250static void
0e15264f 1251bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1252 void *aux OVS_UNUSED)
1253{
0e15264f
BP
1254 const char *mac_s = argv[1];
1255 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1256 const char *basis_s = argc > 3 ? argv[3] : NULL;
f620b43a
BP
1257 uint8_t mac[ETH_ADDR_LEN];
1258 uint8_t hash;
1259 char *hash_cstr;
1260 unsigned int vlan;
672d18b2 1261 uint32_t basis;
f620b43a
BP
1262
1263 if (vlan_s) {
c2c28dfd 1264 if (!ovs_scan(vlan_s, "%u", &vlan)) {
bde9f75d 1265 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1266 return;
1267 }
1268 } else {
dc155bff 1269 vlan = 0;
f620b43a
BP
1270 }
1271
672d18b2 1272 if (basis_s) {
c2c28dfd 1273 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
bde9f75d 1274 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1275 return;
1276 }
1277 } else {
1278 basis = 0;
1279 }
1280
c2c28dfd 1281 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
672d18b2 1282 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1283
1284 hash_cstr = xasprintf("%u", hash);
bde9f75d 1285 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1286 free(hash_cstr);
1287 } else {
bde9f75d 1288 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1289 }
1290}
1291
1292void
1293bond_init(void)
1294{
0e15264f 1295 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1296 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1297 NULL);
0e15264f 1298 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1299 bond_unixctl_migrate, NULL);
0e15264f 1300 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1301 bond_unixctl_set_active_slave, NULL);
0e15264f 1302 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1303 bond_unixctl_enable_slave, NULL);
0e15264f 1304 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1305 bond_unixctl_disable_slave, NULL);
0e15264f 1306 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1307 bond_unixctl_hash, NULL);
f620b43a
BP
1308}
1309\f
95aafb2a
EJ
1310static void
1311bond_entry_reset(struct bond *bond)
1312{
1313 if (bond->balance != BM_AB) {
1314 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1315
1316 if (!bond->hash) {
1317 bond->hash = xmalloc(hash_len);
1318 }
1319 memset(bond->hash, 0, hash_len);
1320
1321 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1322 } else {
1323 free(bond->hash);
1324 bond->hash = NULL;
1325 }
1326}
1327
f620b43a
BP
1328static struct bond_slave *
1329bond_slave_lookup(struct bond *bond, const void *slave_)
1330{
1331 struct bond_slave *slave;
1332
1333 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1334 &bond->slaves) {
1335 if (slave->aux == slave_) {
1336 return slave;
1337 }
1338 }
1339
1340 return NULL;
1341}
1342
f620b43a 1343static void
4a1b8f30 1344bond_enable_slave(struct bond_slave *slave, bool enable)
f620b43a
BP
1345{
1346 slave->delay_expires = LLONG_MAX;
1347 if (enable != slave->enabled) {
4a1b8f30 1348 slave->bond->bond_revalidate = true;
f620b43a 1349 slave->enabled = enable;
4a1b8f30
EJ
1350 VLOG_INFO("interface %s: %s", slave->name,
1351 slave->enabled ? "enabled" : "disabled");
f620b43a
BP
1352 }
1353}
1354
1355static void
4a1b8f30 1356bond_link_status_update(struct bond_slave *slave)
f620b43a
BP
1357{
1358 struct bond *bond = slave->bond;
1359 bool up;
1360
296f6519 1361 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1362 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1363 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1364 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1365 slave->name, up ? "up" : "down");
1366 if (up == slave->enabled) {
1367 slave->delay_expires = LLONG_MAX;
1368 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1369 slave->name, up ? "disabled" : "enabled");
1370 } else {
bdebeece 1371 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1372 : up ? bond->updelay : bond->downdelay);
1373 slave->delay_expires = time_msec() + delay;
1374 if (delay) {
1375 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1376 "for %d ms",
1377 slave->name,
1378 up ? "enabled" : "disabled",
1379 up ? "up" : "down",
1380 delay);
1381 }
1382 }
1383 }
1384
1385 if (time_msec() >= slave->delay_expires) {
4a1b8f30 1386 bond_enable_slave(slave, up);
f620b43a
BP
1387 }
1388}
1389
f620b43a 1390static unsigned int
672d18b2 1391bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
f620b43a 1392{
672d18b2 1393 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
f620b43a
BP
1394}
1395
1396static unsigned int
672d18b2 1397bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
f620b43a
BP
1398{
1399 struct flow hash_flow = *flow;
d84d4b88 1400 hash_flow.vlan_tci = htons(vlan);
f620b43a
BP
1401
1402 /* The symmetric quality of this hash function is not required, but
1403 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1404 * purposes, so we use it out of convenience. */
672d18b2 1405 return flow_hash_symmetric_l4(&hash_flow, basis);
f620b43a
BP
1406}
1407
fb0b29a3
EJ
1408static unsigned int
1409bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1410{
cb22974d 1411 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1412
bdebeece 1413 return (bond->balance == BM_TCP
672d18b2
EJ
1414 ? bond_hash_tcp(flow, vlan, bond->basis)
1415 : bond_hash_src(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1416}
1417
f620b43a
BP
1418static struct bond_entry *
1419lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1420 uint16_t vlan)
1421{
fb0b29a3 1422 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1423}
1424
1425static struct bond_slave *
1426choose_output_slave(const struct bond *bond, const struct flow *flow,
4a1b8f30 1427 struct flow_wildcards *wc, uint16_t vlan)
f620b43a
BP
1428{
1429 struct bond_entry *e;
9dd165e0 1430 int balance;
f620b43a 1431
9dd165e0 1432 balance = bond->balance;
bdebeece
EJ
1433 if (bond->lacp_status == LACP_CONFIGURED) {
1434 /* LACP has been configured on this bond but negotiations were
9dd165e0
RK
1435 * unsuccussful. If lacp_fallback_ab is enabled use active-
1436 * backup mode else drop all traffic. */
1437 if (!bond->lacp_fallback_ab) {
1438 return NULL;
1439 }
1440 balance = BM_AB;
bdebeece
EJ
1441 }
1442
9dd165e0 1443 switch (balance) {
f620b43a
BP
1444 case BM_AB:
1445 return bond->active_slave;
1446
f620b43a 1447 case BM_TCP:
bdebeece
EJ
1448 if (bond->lacp_status != LACP_NEGOTIATED) {
1449 /* Must have LACP negotiations for TCP balanced bonds. */
1450 return NULL;
1451 }
bcd2633a 1452 if (wc) {
6cdd5145 1453 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
bcd2633a 1454 }
bdebeece
EJ
1455 /* Fall Through. */
1456 case BM_SLB:
bcd2633a 1457 if (wc) {
6cdd5145 1458 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
bcd2633a 1459 }
f620b43a
BP
1460 e = lookup_bond_entry(bond, flow, vlan);
1461 if (!e->slave || !e->slave->enabled) {
c804cadf
EJ
1462 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1463 struct bond_slave, hmap_node);
1464 if (!e->slave->enabled) {
1465 e->slave = bond->active_slave;
1466 }
f620b43a
BP
1467 }
1468 return e->slave;
1469
1470 default:
1471 NOT_REACHED();
1472 }
1473}
1474
1475static struct bond_slave *
1476bond_choose_slave(const struct bond *bond)
1477{
1478 struct bond_slave *slave, *best;
1479
1480 /* Find an enabled slave. */
1481 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1482 if (slave->enabled) {
1483 return slave;
1484 }
1485 }
1486
1487 /* All interfaces are disabled. Find an interface that will be enabled
1488 * after its updelay expires. */
1489 best = NULL;
1490 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1491 if (slave->delay_expires != LLONG_MAX
296f6519 1492 && slave->may_enable
f620b43a
BP
1493 && (!best || slave->delay_expires < best->delay_expires)) {
1494 best = slave;
1495 }
1496 }
1497 return best;
1498}
1499
1500static void
4a1b8f30 1501bond_choose_active_slave(struct bond *bond)
f620b43a
BP
1502{
1503 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1504 struct bond_slave *old_active_slave = bond->active_slave;
1505
1506 bond->active_slave = bond_choose_slave(bond);
1507 if (bond->active_slave) {
1508 if (bond->active_slave->enabled) {
1509 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1510 bond->name, bond->active_slave->name);
1511 } else {
1512 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1513 "remaining %lld ms updelay (since no interface was "
1514 "enabled)", bond->name, bond->active_slave->name,
1515 bond->active_slave->delay_expires - time_msec());
4a1b8f30 1516 bond_enable_slave(bond->active_slave, true);
f620b43a
BP
1517 }
1518
1519 bond->send_learning_packets = true;
1520 } else if (old_active_slave) {
d28b9ead 1521 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1522 }
1523}
1524
f620b43a
BP
1525/* Attempts to make the sum of the bond slaves' statistics appear on the fake
1526 * bond interface. */
1527static void
1528bond_update_fake_slave_stats(struct bond *bond)
1529{
1530 struct netdev_stats bond_stats;
1531 struct bond_slave *slave;
1532 struct netdev *bond_dev;
1533
1534 memset(&bond_stats, 0, sizeof bond_stats);
1535
1536 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1537 struct netdev_stats slave_stats;
1538
1539 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1540 /* XXX: We swap the stats here because they are swapped back when
1541 * reported by the internal device. The reason for this is
1542 * internal devices normally represent packets going into the
1543 * system but when used as fake bond device they represent packets
1544 * leaving the system. We really should do this in the internal
1545 * device itself because changing it here reverses the counts from
1546 * the perspective of the switch. However, the internal device
1547 * doesn't know what type of device it represents so we have to do
1548 * it here for now. */
1549 bond_stats.tx_packets += slave_stats.rx_packets;
1550 bond_stats.tx_bytes += slave_stats.rx_bytes;
1551 bond_stats.rx_packets += slave_stats.tx_packets;
1552 bond_stats.rx_bytes += slave_stats.tx_bytes;
1553 }
1554 }
1555
18812dff 1556 if (!netdev_open(bond->name, "system", &bond_dev)) {
f620b43a
BP
1557 netdev_set_stats(bond_dev, &bond_stats);
1558 netdev_close(bond_dev);
1559 }
1560}