]> git.proxmox.com Git - mirror_ovs.git/blame - ofproto/bond.c
netdev_class: Pass a struct ofpbuf * to rx_recv()
[mirror_ovs.git] / ofproto / bond.c
CommitLineData
f620b43a 1/*
09a5d390 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a 25
da4a6191 26#include "connectivity.h"
f620b43a
BP
27#include "coverage.h"
28#include "dynamic-string.h"
29#include "flow.h"
30#include "hmap.h"
bdebeece 31#include "lacp.h"
f620b43a
BP
32#include "list.h"
33#include "netdev.h"
34#include "odp-util.h"
35#include "ofpbuf.h"
36#include "packets.h"
37#include "poll-loop.h"
da4a6191 38#include "seq.h"
fc1d4f01 39#include "shash.h"
f620b43a
BP
40#include "timeval.h"
41#include "unixctl.h"
42#include "vlog.h"
43
44VLOG_DEFINE_THIS_MODULE(bond);
45
f620b43a
BP
/* Mask applied to a flow hash to select a bucket.  A bond's hash table
 * therefore has (BOND_MASK + 1) buckets. */
#define BOND_MASK 0xff
49
50/* A hash bucket for mapping a flow to a slave.
51 * "struct bond" has an array of (BOND_MASK + 1) of these. */
52struct bond_entry {
53 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
54 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
f620b43a
BP
55 struct list list_node; /* In bond_slave's 'entries' list. */
56};
57
58/* A bond slave, that is, one of the links comprising a bond. */
59struct bond_slave {
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave. */
63
64 struct netdev *netdev; /* Network device, owned by the client. */
1ea24138 65 unsigned int change_seq; /* Tracks changes in 'netdev'. */
f620b43a
BP
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
67
68 /* Link status. */
69 long long delay_expires; /* Time after which 'enabled' may change. */
f620b43a 70 bool enabled; /* May be chosen for flows? */
296f6519 71 bool may_enable; /* Client considers this slave bondable. */
f620b43a
BP
72
73 /* Rebalancing info. Used only by bond_rebalance(). */
74 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
75 struct list entries; /* 'struct bond_entry's assigned here. */
76 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
77};
78
79/* A bond, that is, a set of network devices grouped to improve performance or
80 * robustness. */
81struct bond {
82 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
83 char *name; /* Name provided by client. */
84
85 /* Slaves. */
86 struct hmap slaves;
87
88 /* Bonding info. */
89 enum bond_mode balance; /* Balancing mode, one of BM_*. */
90 struct bond_slave *active_slave;
f620b43a 91 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
bdebeece 92 enum lacp_status lacp_status; /* Status of LACP negotiations. */
62904702 93 bool bond_revalidate; /* True if flows need revalidation. */
672d18b2 94 uint32_t basis; /* Basis for flow hash function. */
f620b43a
BP
95
96 /* SLB specific bonding info. */
97 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
98 int rebalance_interval; /* Interval between rebalances, in ms. */
99 long long int next_rebalance; /* Next rebalancing time. */
100 bool send_learning_packets;
101
f620b43a
BP
102 /* Legacy compatibility. */
103 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
9dd165e0 104 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
f620b43a 105
37bec3d3 106 struct ovs_refcount ref_cnt;
f620b43a
BP
107};
108
3bfd3972
EJ
109static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
110static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
111static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
f620b43a 112
3bfd3972
EJ
113static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
114static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
115 OVS_REQ_RDLOCK(rwlock);
4a1b8f30
EJ
116static void bond_enable_slave(struct bond_slave *, bool enable)
117 OVS_REQ_WRLOCK(rwlock);
118static void bond_link_status_update(struct bond_slave *)
3bfd3972 119 OVS_REQ_WRLOCK(rwlock);
4a1b8f30 120static void bond_choose_active_slave(struct bond *)
3bfd3972 121 OVS_REQ_WRLOCK(rwlock);;
f620b43a 122static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
672d18b2
EJ
123 uint16_t vlan, uint32_t basis);
124static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
125 uint32_t basis);
f620b43a
BP
126static struct bond_entry *lookup_bond_entry(const struct bond *,
127 const struct flow *,
3bfd3972
EJ
128 uint16_t vlan)
129 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
130static struct bond_slave *choose_output_slave(const struct bond *,
131 const struct flow *,
bcd2633a 132 struct flow_wildcards *,
4a1b8f30 133 uint16_t vlan)
3bfd3972
EJ
134 OVS_REQ_RDLOCK(rwlock);
135static void bond_update_fake_slave_stats(struct bond *)
136 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
137
138/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
139 * stores the mode in '*balance' and returns true. Otherwise returns false
140 * without modifying '*balance'. */
141bool
142bond_mode_from_string(enum bond_mode *balance, const char *s)
143{
144 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
145 *balance = BM_TCP;
146 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
147 *balance = BM_SLB;
148 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
149 *balance = BM_AB;
150 } else {
151 return false;
152 }
153 return true;
154}
155
156/* Returns a string representing 'balance'. */
157const char *
158bond_mode_to_string(enum bond_mode balance) {
159 switch (balance) {
160 case BM_TCP:
161 return "balance-tcp";
162 case BM_SLB:
163 return "balance-slb";
164 case BM_AB:
165 return "active-backup";
166 }
428b2edd 167 OVS_NOT_REACHED();
f620b43a
BP
168}
169
f620b43a
BP
170\f
171/* Creates and returns a new bond whose configuration is initially taken from
172 * 's'.
173 *
174 * The caller should register each slave on the new bond by calling
175 * bond_slave_register(). */
176struct bond *
177bond_create(const struct bond_settings *s)
178{
179 struct bond *bond;
180
181 bond = xzalloc(sizeof *bond);
182 hmap_init(&bond->slaves);
f620b43a 183 bond->next_fake_iface_update = LLONG_MAX;
37bec3d3 184 ovs_refcount_init(&bond->ref_cnt);
f620b43a
BP
185
186 bond_reconfigure(bond, s);
f620b43a
BP
187 return bond;
188}
189
03366a2d
EJ
190struct bond *
191bond_ref(const struct bond *bond_)
192{
193 struct bond *bond = CONST_CAST(struct bond *, bond_);
194
bca0b3b4 195 if (bond) {
37bec3d3 196 ovs_refcount_ref(&bond->ref_cnt);
bca0b3b4 197 }
03366a2d
EJ
198 return bond;
199}
200
f620b43a
BP
201/* Frees 'bond'. */
202void
03366a2d 203bond_unref(struct bond *bond)
f620b43a
BP
204{
205 struct bond_slave *slave, *next_slave;
206
37bec3d3 207 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
03366a2d
EJ
208 return;
209 }
210
3bfd3972
EJ
211 ovs_rwlock_wrlock(&rwlock);
212 hmap_remove(all_bonds, &bond->hmap_node);
213 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
214
215 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
216 hmap_remove(&bond->slaves, &slave->hmap_node);
217 /* Client owns 'slave->netdev'. */
218 free(slave->name);
219 free(slave);
220 }
221 hmap_destroy(&bond->slaves);
222
223 free(bond->hash);
f620b43a 224 free(bond->name);
37bec3d3 225 ovs_refcount_destroy(&bond->ref_cnt);
f620b43a
BP
226 free(bond);
227}
228
229/* Updates 'bond''s overall configuration to 's'.
230 *
231 * The caller should register each slave on 'bond' by calling
232 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 233 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
234 *
235 * Returns true if the configuration has changed in such a way that requires
236 * flow revalidation.
237 * */
238bool
f620b43a
BP
239bond_reconfigure(struct bond *bond, const struct bond_settings *s)
240{
59d7b2b6
EJ
241 bool revalidate = false;
242
3bfd3972 243 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
244 if (!bond->name || strcmp(bond->name, s->name)) {
245 if (bond->name) {
3bfd3972 246 hmap_remove(all_bonds, &bond->hmap_node);
f620b43a
BP
247 free(bond->name);
248 }
249 bond->name = xstrdup(s->name);
3bfd3972 250 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
f620b43a
BP
251 }
252
f620b43a
BP
253 bond->updelay = s->up_delay;
254 bond->downdelay = s->down_delay;
bc1b010c 255
9dd165e0
RK
256 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
257 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
258 revalidate = true;
259 }
260
bc1b010c
EJ
261 if (bond->rebalance_interval != s->rebalance_interval) {
262 bond->rebalance_interval = s->rebalance_interval;
263 revalidate = true;
264 }
f620b43a 265
59d7b2b6
EJ
266 if (bond->balance != s->balance) {
267 bond->balance = s->balance;
268 revalidate = true;
269 }
270
672d18b2
EJ
271 if (bond->basis != s->basis) {
272 bond->basis = s->basis;
273 revalidate = true;
274 }
275
f620b43a
BP
276 if (s->fake_iface) {
277 if (bond->next_fake_iface_update == LLONG_MAX) {
278 bond->next_fake_iface_update = time_msec();
279 }
280 } else {
281 bond->next_fake_iface_update = LLONG_MAX;
282 }
59d7b2b6 283
62904702
EJ
284 if (bond->bond_revalidate) {
285 revalidate = true;
286 bond->bond_revalidate = false;
287 }
288
95aafb2a
EJ
289 if (bond->balance == BM_AB || !bond->hash || revalidate) {
290 bond_entry_reset(bond);
291 }
292
3bfd3972 293 ovs_rwlock_unlock(&rwlock);
59d7b2b6 294 return revalidate;
f620b43a
BP
295}
296
f8ddccd2 297static void
1ea24138 298bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
3bfd3972 299 OVS_REQ_WRLOCK(rwlock)
f8ddccd2
BP
300{
301 if (slave->netdev != netdev) {
f8ddccd2 302 slave->netdev = netdev;
1ea24138 303 slave->change_seq = 0;
f8ddccd2
BP
304 }
305}
306
f620b43a
BP
307/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
308 * arbitrary client-provided pointer that uniquely identifies a slave within a
309 * bond. If 'slave_' already exists within 'bond' then this function
310 * reconfigures the existing slave.
311 *
312 * 'netdev' must be the network device that 'slave_' represents. It is owned
313 * by the client, so the client must not close it before either unregistering
314 * 'slave_' or destroying 'bond'.
4d6fb5eb 315 */
f620b43a 316void
df53d41c 317bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
f620b43a 318{
3bfd3972 319 struct bond_slave *slave;
f620b43a 320
3bfd3972
EJ
321 ovs_rwlock_wrlock(&rwlock);
322 slave = bond_slave_lookup(bond, slave_);
f620b43a
BP
323 if (!slave) {
324 slave = xzalloc(sizeof *slave);
325
326 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
327 slave->bond = bond;
328 slave->aux = slave_;
329 slave->delay_expires = LLONG_MAX;
244b2160 330 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 331 bond->bond_revalidate = true;
244b2160 332
b3c18f66 333 slave->enabled = false;
4a1b8f30 334 bond_enable_slave(slave, netdev_get_carrier(netdev));
f620b43a
BP
335 }
336
1ea24138 337 bond_slave_set_netdev__(slave, netdev);
a6934aa9 338
f620b43a
BP
339 free(slave->name);
340 slave->name = xstrdup(netdev_get_name(netdev));
3bfd3972 341 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
342}
343
f8ddccd2
BP
344/* Updates the network device to be used with 'slave_' to 'netdev'.
345 *
346 * This is useful if the caller closes and re-opens the network device
347 * registered with bond_slave_register() but doesn't need to change anything
348 * else. */
349void
350bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
351{
3bfd3972
EJ
352 struct bond_slave *slave;
353
354 ovs_rwlock_wrlock(&rwlock);
355 slave = bond_slave_lookup(bond, slave_);
f8ddccd2 356 if (slave) {
1ea24138 357 bond_slave_set_netdev__(slave, netdev);
f8ddccd2 358 }
3bfd3972 359 ovs_rwlock_unlock(&rwlock);
f8ddccd2
BP
360}
361
f620b43a
BP
362/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
363 * then this function has no effect.
364 *
365 * Unregistering a slave invalidates all flows. */
366void
367bond_slave_unregister(struct bond *bond, const void *slave_)
368{
3bfd3972 369 struct bond_slave *slave;
f620b43a
BP
370 bool del_active;
371
3bfd3972
EJ
372 ovs_rwlock_wrlock(&rwlock);
373 slave = bond_slave_lookup(bond, slave_);
f620b43a 374 if (!slave) {
3bfd3972 375 goto out;
f620b43a
BP
376 }
377
4a1b8f30
EJ
378 bond->bond_revalidate = true;
379 bond_enable_slave(slave, false);
b3c18f66 380
f620b43a
BP
381 del_active = bond->active_slave == slave;
382 if (bond->hash) {
383 struct bond_entry *e;
384 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
385 if (e->slave == slave) {
386 e->slave = NULL;
387 }
388 }
389 }
390
391 free(slave->name);
392
393 hmap_remove(&bond->slaves, &slave->hmap_node);
394 /* Client owns 'slave->netdev'. */
395 free(slave);
396
397 if (del_active) {
4a1b8f30 398 bond_choose_active_slave(bond);
f620b43a
BP
399 bond->send_learning_packets = true;
400 }
3bfd3972
EJ
401out:
402 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
403}
404
296f6519
EJ
405/* Should be called on each slave in 'bond' before bond_run() to indicate
406 * whether or not 'slave_' may be enabled. This function is intended to allow
407 * other protocols to have some impact on bonding decisions. For example LACP
408 * or high level link monitoring protocols may decide that a given slave should
409 * not be able to send traffic. */
4d6fb5eb 410void
296f6519 411bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 412{
3bfd3972 413 ovs_rwlock_wrlock(&rwlock);
296f6519 414 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
3bfd3972 415 ovs_rwlock_unlock(&rwlock);
4d6fb5eb
EJ
416}
417
4a1b8f30
EJ
418/* Performs periodic maintenance on 'bond'.
419 *
420 * Returns true if the caller should revalidate its flows.
f620b43a
BP
421 *
422 * The caller should check bond_should_send_learning_packets() afterward. */
4a1b8f30
EJ
423bool
424bond_run(struct bond *bond, enum lacp_status lacp_status)
f620b43a
BP
425{
426 struct bond_slave *slave;
4a1b8f30 427 bool revalidate;
f620b43a 428
3bfd3972 429 ovs_rwlock_wrlock(&rwlock);
bdebeece
EJ
430 if (bond->lacp_status != lacp_status) {
431 bond->lacp_status = lacp_status;
4592d0e2
EJ
432 bond->bond_revalidate = true;
433 }
4d6fb5eb 434
f620b43a
BP
435 /* Enable slaves based on link status and LACP feedback. */
436 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
4a1b8f30 437 bond_link_status_update(slave);
da4a6191 438 slave->change_seq = seq_read(connectivity_seq_get());
f620b43a
BP
439 }
440 if (!bond->active_slave || !bond->active_slave->enabled) {
4a1b8f30 441 bond_choose_active_slave(bond);
f620b43a
BP
442 }
443
444 /* Update fake bond interface stats. */
445 if (time_msec() >= bond->next_fake_iface_update) {
446 bond_update_fake_slave_stats(bond);
447 bond->next_fake_iface_update = time_msec() + 1000;
448 }
449
4a1b8f30
EJ
450 revalidate = bond->bond_revalidate;
451 bond->bond_revalidate = false;
3bfd3972 452 ovs_rwlock_unlock(&rwlock);
4a1b8f30
EJ
453
454 return revalidate;
f620b43a
BP
455}
456
457/* Causes poll_block() to wake up when 'bond' needs something to be done. */
458void
459bond_wait(struct bond *bond)
460{
461 struct bond_slave *slave;
462
3bfd3972 463 ovs_rwlock_rdlock(&rwlock);
f620b43a
BP
464 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
465 if (slave->delay_expires != LLONG_MAX) {
466 poll_timer_wait_until(slave->delay_expires);
467 }
1ea24138 468
da4a6191 469 seq_wait(connectivity_seq_get(), slave->change_seq);
f620b43a
BP
470 }
471
472 if (bond->next_fake_iface_update != LLONG_MAX) {
473 poll_timer_wait_until(bond->next_fake_iface_update);
474 }
475
bbc13389 476 if (bond->bond_revalidate) {
f620b43a
BP
477 poll_immediate_wake();
478 }
3bfd3972 479 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
480
481 /* We don't wait for bond->next_rebalance because rebalancing can only run
482 * at a flow account checkpoint. ofproto does checkpointing on its own
483 * schedule and bond_rebalance() gets called afterward, so we'd just be
484 * waking up for no purpose. */
485}
486\f
487/* MAC learning table interaction. */
488
489static bool
490may_send_learning_packets(const struct bond *bond)
491{
9dd165e0
RK
492 return ((bond->lacp_status == LACP_DISABLED
493 && (bond->balance == BM_SLB || bond->balance == BM_AB))
494 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
bdebeece 495 && bond->active_slave;
f620b43a
BP
496}
497
498/* Returns true if 'bond' needs the client to send out packets to assist with
499 * MAC learning on 'bond'. If this function returns true, then the client
500 * should iterate through its MAC learning table for the bridge on which 'bond'
501 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 502 * it should call bond_compose_learning_packet().
f620b43a 503 *
477879ea
BP
504 * This function will only return true if 'bond' is in SLB or active-backup
505 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
506 * necessary.
f620b43a
BP
507 *
508 * Calling this function resets the state that it checks. */
509bool
510bond_should_send_learning_packets(struct bond *bond)
511{
3bfd3972
EJ
512 bool send;
513
514 ovs_rwlock_wrlock(&rwlock);
515 send = bond->send_learning_packets && may_send_learning_packets(bond);
f620b43a 516 bond->send_learning_packets = false;
3bfd3972 517 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
518 return send;
519}
520
521/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
522 *
ea131871
JG
523 * See bond_should_send_learning_packets() for description of usage. The
524 * caller should send the composed packet on the port associated with
525 * port_aux and takes ownership of the returned ofpbuf. */
526struct ofpbuf *
527bond_compose_learning_packet(struct bond *bond,
528 const uint8_t eth_src[ETH_ADDR_LEN],
529 uint16_t vlan, void **port_aux)
f620b43a
BP
530{
531 struct bond_slave *slave;
ea131871 532 struct ofpbuf *packet;
f620b43a 533 struct flow flow;
f620b43a 534
3bfd3972 535 ovs_rwlock_rdlock(&rwlock);
cb22974d 536 ovs_assert(may_send_learning_packets(bond));
f620b43a
BP
537 memset(&flow, 0, sizeof flow);
538 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
4a1b8f30 539 slave = choose_output_slave(bond, &flow, NULL, vlan);
f620b43a 540
ea131871 541 packet = ofpbuf_new(0);
2ea838ac 542 compose_rarp(packet, eth_src);
f620b43a 543 if (vlan) {
ea131871 544 eth_push_vlan(packet, htons(vlan));
f620b43a 545 }
f620b43a 546
ea131871 547 *port_aux = slave->aux;
3bfd3972 548 ovs_rwlock_unlock(&rwlock);
ea131871 549 return packet;
f620b43a
BP
550}
551\f
552/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
553 * Ethernet destination address of 'eth_dst', should be admitted.
554 *
555 * The return value is one of the following:
556 *
557 * - BV_ACCEPT: Admit the packet.
558 *
559 * - BV_DROP: Drop the packet.
560 *
561 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
562 * Ethernet source address and VLAN. If there is none, or if the packet
563 * is on the learned port, then admit the packet. If a different port has
564 * been learned, however, drop the packet (and do not use it for MAC
565 * learning).
566 */
567enum bond_verdict
568bond_check_admissibility(struct bond *bond, const void *slave_,
4a1b8f30 569 const uint8_t eth_dst[ETH_ADDR_LEN])
f620b43a 570{
3bfd3972
EJ
571 enum bond_verdict verdict = BV_DROP;
572 struct bond_slave *slave;
9a1c6450 573
3bfd3972
EJ
574 ovs_rwlock_rdlock(&rwlock);
575 slave = bond_slave_lookup(bond, slave_);
4222bbc8 576 if (!slave) {
3bfd3972 577 goto out;
4222bbc8
EJ
578 }
579
9a1c6450
EJ
580 /* LACP bonds have very loose admissibility restrictions because we can
581 * assume the remote switch is aware of the bond and will "do the right
582 * thing". However, as a precaution we drop packets on disabled slaves
583 * because no correctly implemented partner switch should be sending
bdebeece
EJ
584 * packets to them.
585 *
586 * If LACP is configured, but LACP negotiations have been unsuccessful, we
9dd165e0 587 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
bdebeece 588 switch (bond->lacp_status) {
3bfd3972
EJ
589 case LACP_NEGOTIATED:
590 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
591 goto out;
592 case LACP_CONFIGURED:
9dd165e0
RK
593 if (!bond->lacp_fallback_ab) {
594 goto out;
595 }
3bfd3972
EJ
596 case LACP_DISABLED:
597 break;
f620b43a
BP
598 }
599
600 /* Drop all multicast packets on inactive slaves. */
601 if (eth_addr_is_multicast(eth_dst)) {
4222bbc8 602 if (bond->active_slave != slave) {
3bfd3972 603 goto out;
f620b43a
BP
604 }
605 }
606
f931a4c9 607 switch (bond->balance) {
9dd165e0
RK
608 case BM_TCP:
609 /* TCP balanced bonds require successful LACP negotiations. Based on the
610 * above check, LACP is off or lacp_fallback_ab is true on this bond.
611 * If lacp_fallback_ab is true fall through to BM_AB case else, we
612 * drop all incoming traffic. */
613 if (!bond->lacp_fallback_ab) {
614 goto out;
615 }
616
f931a4c9
BP
617 case BM_AB:
618 /* Drop all packets which arrive on backup slaves. This is similar to
619 * how Linux bonding handles active-backup bonds. */
7ba7dcf0
EJ
620 if (bond->active_slave != slave) {
621 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
622
e6b2255c
BP
623 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
624 " slave (%s) destined for " ETH_ADDR_FMT,
625 slave->name, ETH_ADDR_ARGS(eth_dst));
3bfd3972 626 goto out;
7ba7dcf0 627 }
3bfd3972
EJ
628 verdict = BV_ACCEPT;
629 goto out;
f931a4c9 630
f931a4c9
BP
631 case BM_SLB:
632 /* Drop all packets for which we have learned a different input port,
633 * because we probably sent the packet on one slave and got it back on
634 * the other. Gratuitous ARP packets are an exception to this rule:
635 * the host has moved to another switch. The exception to the
636 * exception is if we locked the learning table to avoid reflections on
637 * bond slaves. */
3bfd3972
EJ
638 verdict = BV_DROP_IF_MOVED;
639 goto out;
7ba7dcf0
EJ
640 }
641
428b2edd 642 OVS_NOT_REACHED();
3bfd3972
EJ
643out:
644 ovs_rwlock_unlock(&rwlock);
645 return verdict;
646
f620b43a
BP
647}
648
649/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
650 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
651 * NULL if the packet should be dropped because no slaves are enabled.
652 *
653 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
654 * should be a VID only (i.e. excluding the PCP bits). Second,
655 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
656 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
657 * packet belongs to (so for an access port it will be the access port's VLAN).
658 *
bcd2633a
JP
659 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
660 * significant in the selection. At some point earlier, 'wc' should
661 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
662 */
663void *
664bond_choose_output_slave(struct bond *bond, const struct flow *flow,
4a1b8f30 665 struct flow_wildcards *wc, uint16_t vlan)
f620b43a 666{
3bfd3972 667 struct bond_slave *slave;
b5d5d7d3 668 void *aux;
3bfd3972
EJ
669
670 ovs_rwlock_rdlock(&rwlock);
4a1b8f30 671 slave = choose_output_slave(bond, flow, wc, vlan);
b5d5d7d3 672 aux = slave ? slave->aux : NULL;
3bfd3972 673 ovs_rwlock_unlock(&rwlock);
b5d5d7d3
AW
674
675 return aux;
f620b43a 676}
f620b43a
BP
677\f
678/* Rebalancing. */
679
1b137691 680static bool
3bfd3972 681bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
1b137691 682{
bc1b010c
EJ
683 return bond->rebalance_interval
684 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
685}
686
f620b43a
BP
687/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
688void
689bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
690 uint64_t n_bytes)
691{
3bfd3972 692 ovs_rwlock_wrlock(&rwlock);
1b137691 693 if (bond_is_balanced(bond)) {
f620b43a 694 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a 695 }
3bfd3972 696 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
697}
698
699static struct bond_slave *
3bfd3972 700bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
701{
702 return CONTAINER_OF(bal, struct bond_slave, bal_node);
703}
704
705static void
706log_bals(struct bond *bond, const struct list *bals)
707{
708 if (VLOG_IS_DBG_ENABLED()) {
709 struct ds ds = DS_EMPTY_INITIALIZER;
710 const struct bond_slave *slave;
711
712 LIST_FOR_EACH (slave, bal_node, bals) {
713 if (ds.length) {
714 ds_put_char(&ds, ',');
715 }
716 ds_put_format(&ds, " %s %"PRIu64"kB",
717 slave->name, slave->tx_bytes / 1024);
718
719 if (!slave->enabled) {
720 ds_put_cstr(&ds, " (disabled)");
721 }
722 if (!list_is_empty(&slave->entries)) {
723 struct bond_entry *e;
724
725 ds_put_cstr(&ds, " (");
726 LIST_FOR_EACH (e, list_node, &slave->entries) {
727 if (&e->list_node != list_front(&slave->entries)) {
728 ds_put_cstr(&ds, " + ");
729 }
34582733 730 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
f620b43a
BP
731 e - bond->hash, e->tx_bytes / 1024);
732 }
733 ds_put_cstr(&ds, ")");
734 }
735 }
736 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
737 ds_destroy(&ds);
738 }
739}
740
741/* Shifts 'hash' from its current slave to 'to'. */
742static void
4a1b8f30 743bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
f620b43a
BP
744{
745 struct bond_slave *from = hash->slave;
746 struct bond *bond = from->bond;
747 uint64_t delta = hash->tx_bytes;
748
34582733 749 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
f620b43a
BP
750 "from %s to %s (now carrying %"PRIu64"kB and "
751 "%"PRIu64"kB load, respectively)",
752 bond->name, delta / 1024, hash - bond->hash,
753 from->name, to->name,
754 (from->tx_bytes - delta) / 1024,
755 (to->tx_bytes + delta) / 1024);
756
757 /* Shift load away from 'from' to 'to'. */
758 from->tx_bytes -= delta;
759 to->tx_bytes += delta;
760
761 /* Arrange for flows to be revalidated. */
dc30ea2d 762 hash->slave = to;
4a1b8f30 763 bond->bond_revalidate = true;
f620b43a
BP
764}
765
09a5d390
BP
766/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
767 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
768 * given that doing so must decrease the ratio of the load on the two slaves by
769 * at least 0.1. Returns NULL if there is no appropriate entry.
770 *
771 * The list of entries isn't sorted. I don't know of a reason to prefer to
772 * shift away small hashes or large hashes. */
773static struct bond_entry *
774choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
775{
776 struct bond_entry *e;
777
778 if (list_is_short(&from->entries)) {
779 /* 'from' carries no more than one MAC hash, so shifting load away from
780 * it would be pointless. */
781 return NULL;
782 }
783
784 LIST_FOR_EACH (e, list_node, &from->entries) {
785 double old_ratio, new_ratio;
786 uint64_t delta;
787
788 if (to_tx_bytes == 0) {
789 /* Nothing on the new slave, move it. */
790 return e;
791 }
792
793 delta = e->tx_bytes;
794 old_ratio = (double)from->tx_bytes / to_tx_bytes;
795 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
75fad143
ZK
796 if (old_ratio - new_ratio > 0.1
797 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
798 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
799 and 'to' slave have the same load. Therefore, we only move an
800 entry if it decreases the load on 'from', and brings us closer
801 to equal traffic load. */
f620b43a
BP
802 return e;
803 }
804 }
805
806 return NULL;
807}
808
809/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
810 * maintained. */
811static void
812insert_bal(struct list *bals, struct bond_slave *slave)
813{
814 struct bond_slave *pos;
815
816 LIST_FOR_EACH (pos, bal_node, bals) {
817 if (slave->tx_bytes > pos->tx_bytes) {
818 break;
819 }
820 }
821 list_insert(&pos->bal_node, &slave->bal_node);
822}
823
824/* Removes 'slave' from its current list and then inserts it into 'bals' so
825 * that descending order of 'tx_bytes' is maintained. */
826static void
827reinsert_bal(struct list *bals, struct bond_slave *slave)
828{
829 list_remove(&slave->bal_node);
830 insert_bal(bals, slave);
831}
832
833/* If 'bond' needs rebalancing, does so.
834 *
835 * The caller should have called bond_account() for each active flow, to ensure
836 * that flow data is consistently accounted at this point. */
837void
4a1b8f30 838bond_rebalance(struct bond *bond)
f620b43a
BP
839{
840 struct bond_slave *slave;
841 struct bond_entry *e;
842 struct list bals;
843
3bfd3972 844 ovs_rwlock_wrlock(&rwlock);
1b137691 845 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
3bfd3972 846 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
847 return;
848 }
849 bond->next_rebalance = time_msec() + bond->rebalance_interval;
850
851 /* Add each bond_entry to its slave's 'entries' list.
852 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
853 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
854 slave->tx_bytes = 0;
855 list_init(&slave->entries);
856 }
857 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
858 if (e->slave && e->tx_bytes) {
859 e->slave->tx_bytes += e->tx_bytes;
860 list_push_back(&e->slave->entries, &e->list_node);
861 }
862 }
863
864 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
865 *
866 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
867 * with a proper list sort algorithm. */
868 list_init(&bals);
869 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
870 if (slave->enabled) {
871 insert_bal(&bals, slave);
872 }
873 }
874 log_bals(bond, &bals);
875
876 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
877 while (!list_is_short(&bals)) {
878 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
879 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
880 uint64_t overload;
881
882 overload = from->tx_bytes - to->tx_bytes;
883 if (overload < to->tx_bytes >> 5 || overload < 100000) {
884 /* The extra load on 'from' (and all less-loaded slaves), compared
885 * to that of 'to' (the least-loaded slave), is less than ~3%, or
886 * it is less than ~1Mbps. No point in rebalancing. */
887 break;
888 }
889
09a5d390
BP
890 /* 'from' is carrying significantly more load than 'to'. Pick a hash
891 * to move from 'from' to 'to'. */
f620b43a
BP
892 e = choose_entry_to_migrate(from, to->tx_bytes);
893 if (e) {
4a1b8f30 894 bond_shift_load(e, to);
f620b43a
BP
895
896 /* Delete element from from->entries.
897 *
898 * We don't add the element to to->hashes. That would only allow
899 * 'e' to be migrated to another slave in this rebalancing run, and
900 * there is no point in doing that. */
901 list_remove(&e->list_node);
902
903 /* Re-sort 'bals'. */
904 reinsert_bal(&bals, from);
905 reinsert_bal(&bals, to);
906 } else {
907 /* Can't usefully migrate anything away from 'from'.
908 * Don't reconsider it. */
909 list_remove(&from->bal_node);
910 }
911 }
912
913 /* Implement exponentially weighted moving average. A weight of 1/2 causes
914 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
915 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
916 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
917 e->tx_bytes /= 2;
918 if (!e->tx_bytes) {
919 e->slave = NULL;
920 }
921 }
3bfd3972 922 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
923}
924\f
925/* Bonding unixctl user interface functions. */
926
927static struct bond *
3bfd3972 928bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
929{
930 struct bond *bond;
931
932 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
3bfd3972 933 all_bonds) {
f620b43a
BP
934 if (!strcmp(bond->name, name)) {
935 return bond;
936 }
937 }
938 return NULL;
939}
940
941static struct bond_slave *
942bond_lookup_slave(struct bond *bond, const char *slave_name)
943{
944 struct bond_slave *slave;
945
946 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
947 if (!strcmp(slave->name, slave_name)) {
948 return slave;
949 }
950 }
951 return NULL;
952}
953
954static void
955bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
956 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
957 void *aux OVS_UNUSED)
f620b43a
BP
958{
959 struct ds ds = DS_EMPTY_INITIALIZER;
960 const struct bond *bond;
961
962 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
963
3bfd3972
EJ
964 ovs_rwlock_rdlock(&rwlock);
965 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
f620b43a
BP
966 const struct bond_slave *slave;
967 size_t i;
968
969 ds_put_format(&ds, "%s\t%s\t",
970 bond->name, bond_mode_to_string(bond->balance));
971
972 i = 0;
973 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
974 if (i++ > 0) {
975 ds_put_cstr(&ds, ", ");
976 }
977 ds_put_cstr(&ds, slave->name);
978 }
979 ds_put_char(&ds, '\n');
980 }
3bfd3972 981 ovs_rwlock_unlock(&rwlock);
bde9f75d 982 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
983 ds_destroy(&ds);
984}
985
986static void
c33a8a25 987bond_print_details(struct ds *ds, const struct bond *bond)
3bfd3972 988 OVS_REQ_RDLOCK(rwlock)
f620b43a 989{
fc1d4f01
EJ
990 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
991 const struct shash_node **sorted_slaves = NULL;
f620b43a 992 const struct bond_slave *slave;
fc1d4f01 993 int i;
f620b43a 994
c33a8a25
EJ
995 ds_put_format(ds, "---- %s ----\n", bond->name);
996 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
997 bond_mode_to_string(bond->balance));
998
c33a8a25 999 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 1000
c33a8a25
EJ
1001 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1002 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 1003
1b137691 1004 if (bond_is_balanced(bond)) {
c33a8a25 1005 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
1006 bond->next_rebalance - time_msec());
1007 }
1008
bdebeece
EJ
1009 ds_put_cstr(ds, "lacp_status: ");
1010 switch (bond->lacp_status) {
1011 case LACP_NEGOTIATED:
1012 ds_put_cstr(ds, "negotiated\n");
1013 break;
1014 case LACP_CONFIGURED:
1015 ds_put_cstr(ds, "configured\n");
1016 break;
1017 case LACP_DISABLED:
1018 ds_put_cstr(ds, "off\n");
1019 break;
1020 default:
1021 ds_put_cstr(ds, "<unknown>\n");
1022 break;
1023 }
4d6fb5eb 1024
f620b43a 1025 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
1026 shash_add(&slave_shash, slave->name, slave);
1027 }
1028 sorted_slaves = shash_sort(&slave_shash);
1029
1030 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 1031 struct bond_entry *be;
f620b43a 1032
fc1d4f01
EJ
1033 slave = sorted_slaves[i]->data;
1034
f620b43a 1035 /* Basic info. */
c33a8a25 1036 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
1037 slave->name, slave->enabled ? "enabled" : "disabled");
1038 if (slave == bond->active_slave) {
c33a8a25 1039 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
1040 }
1041 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 1042 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
1043 slave->enabled ? "downdelay" : "updelay",
1044 slave->delay_expires - time_msec());
1045 }
1046
c33a8a25 1047 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 1048 slave->may_enable ? "true" : "false");
4d6fb5eb 1049
1b137691 1050 if (!bond_is_balanced(bond)) {
f620b43a
BP
1051 continue;
1052 }
1053
1054 /* Hashes. */
f620b43a
BP
1055 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1056 int hash = be - bond->hash;
1057
1058 if (be->slave != slave) {
1059 continue;
1060 }
1061
c33a8a25 1062 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
f620b43a
BP
1063 hash, be->tx_bytes / 1024);
1064
7b9f1974 1065 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1066 }
1067 }
fc1d4f01
EJ
1068 shash_destroy(&slave_shash);
1069 free(sorted_slaves);
c33a8a25
EJ
1070 ds_put_cstr(ds, "\n");
1071}
1072
1073static void
1074bond_unixctl_show(struct unixctl_conn *conn,
1075 int argc, const char *argv[],
1076 void *aux OVS_UNUSED)
1077{
1078 struct ds ds = DS_EMPTY_INITIALIZER;
1079
3bfd3972 1080 ovs_rwlock_rdlock(&rwlock);
c33a8a25
EJ
1081 if (argc > 1) {
1082 const struct bond *bond = bond_find(argv[1]);
1083
1084 if (!bond) {
bde9f75d 1085 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1086 goto out;
c33a8a25
EJ
1087 }
1088 bond_print_details(&ds, bond);
1089 } else {
1090 const struct bond *bond;
1091
3bfd3972 1092 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
c33a8a25
EJ
1093 bond_print_details(&ds, bond);
1094 }
1095 }
1096
bde9f75d 1097 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a 1098 ds_destroy(&ds);
3bfd3972
EJ
1099
1100out:
1101 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1102}
1103
1104static void
0e15264f
BP
1105bond_unixctl_migrate(struct unixctl_conn *conn,
1106 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1107 void *aux OVS_UNUSED)
1108{
0e15264f
BP
1109 const char *bond_s = argv[1];
1110 const char *hash_s = argv[2];
1111 const char *slave_s = argv[3];
f620b43a
BP
1112 struct bond *bond;
1113 struct bond_slave *slave;
1114 struct bond_entry *entry;
1115 int hash;
1116
3bfd3972 1117 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1118 bond = bond_find(bond_s);
1119 if (!bond) {
bde9f75d 1120 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1121 goto out;
f620b43a
BP
1122 }
1123
1124 if (bond->balance != BM_SLB) {
bde9f75d 1125 unixctl_command_reply_error(conn, "not an SLB bond");
3bfd3972 1126 goto out;
f620b43a
BP
1127 }
1128
1129 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1130 hash = atoi(hash_s) & BOND_MASK;
1131 } else {
bde9f75d 1132 unixctl_command_reply_error(conn, "bad hash");
3bfd3972 1133 goto out;
f620b43a
BP
1134 }
1135
1136 slave = bond_lookup_slave(bond, slave_s);
1137 if (!slave) {
bde9f75d 1138 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1139 goto out;
f620b43a
BP
1140 }
1141
1142 if (!slave->enabled) {
bde9f75d 1143 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
3bfd3972 1144 goto out;
f620b43a
BP
1145 }
1146
1147 entry = &bond->hash[hash];
4a1b8f30 1148 bond->bond_revalidate = true;
f620b43a 1149 entry->slave = slave;
bde9f75d 1150 unixctl_command_reply(conn, "migrated");
3bfd3972
EJ
1151
1152out:
1153 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1154}
1155
1156static void
0e15264f
BP
1157bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1158 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1159 void *aux OVS_UNUSED)
1160{
0e15264f
BP
1161 const char *bond_s = argv[1];
1162 const char *slave_s = argv[2];
f620b43a
BP
1163 struct bond *bond;
1164 struct bond_slave *slave;
1165
3bfd3972 1166 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1167 bond = bond_find(bond_s);
1168 if (!bond) {
bde9f75d 1169 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1170 goto out;
f620b43a
BP
1171 }
1172
1173 slave = bond_lookup_slave(bond, slave_s);
1174 if (!slave) {
bde9f75d 1175 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1176 goto out;
f620b43a
BP
1177 }
1178
1179 if (!slave->enabled) {
bde9f75d 1180 unixctl_command_reply_error(conn, "cannot make disabled slave active");
3bfd3972 1181 goto out;
f620b43a
BP
1182 }
1183
1184 if (bond->active_slave != slave) {
4a1b8f30 1185 bond->bond_revalidate = true;
f620b43a 1186 bond->active_slave = slave;
f620b43a
BP
1187 VLOG_INFO("bond %s: active interface is now %s",
1188 bond->name, slave->name);
1189 bond->send_learning_packets = true;
bde9f75d 1190 unixctl_command_reply(conn, "done");
f620b43a 1191 } else {
bde9f75d 1192 unixctl_command_reply(conn, "no change");
f620b43a 1193 }
3bfd3972
EJ
1194out:
1195 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1196}
1197
1198static void
0e15264f 1199enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1200{
0e15264f
BP
1201 const char *bond_s = argv[1];
1202 const char *slave_s = argv[2];
f620b43a
BP
1203 struct bond *bond;
1204 struct bond_slave *slave;
1205
3bfd3972 1206 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1207 bond = bond_find(bond_s);
1208 if (!bond) {
bde9f75d 1209 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1210 goto out;
f620b43a
BP
1211 }
1212
1213 slave = bond_lookup_slave(bond, slave_s);
1214 if (!slave) {
bde9f75d 1215 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1216 goto out;
f620b43a
BP
1217 }
1218
4a1b8f30 1219 bond_enable_slave(slave, enable);
bde9f75d 1220 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
3bfd3972
EJ
1221
1222out:
1223 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1224}
1225
1226static void
0e15264f
BP
1227bond_unixctl_enable_slave(struct unixctl_conn *conn,
1228 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1229 void *aux OVS_UNUSED)
1230{
0e15264f 1231 enable_slave(conn, argv, true);
f620b43a
BP
1232}
1233
1234static void
0e15264f
BP
1235bond_unixctl_disable_slave(struct unixctl_conn *conn,
1236 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1237 void *aux OVS_UNUSED)
1238{
0e15264f 1239 enable_slave(conn, argv, false);
f620b43a
BP
1240}
1241
1242static void
0e15264f 1243bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1244 void *aux OVS_UNUSED)
1245{
0e15264f
BP
1246 const char *mac_s = argv[1];
1247 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1248 const char *basis_s = argc > 3 ? argv[3] : NULL;
f620b43a
BP
1249 uint8_t mac[ETH_ADDR_LEN];
1250 uint8_t hash;
1251 char *hash_cstr;
1252 unsigned int vlan;
672d18b2 1253 uint32_t basis;
f620b43a
BP
1254
1255 if (vlan_s) {
c2c28dfd 1256 if (!ovs_scan(vlan_s, "%u", &vlan)) {
bde9f75d 1257 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1258 return;
1259 }
1260 } else {
dc155bff 1261 vlan = 0;
f620b43a
BP
1262 }
1263
672d18b2 1264 if (basis_s) {
c2c28dfd 1265 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
bde9f75d 1266 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1267 return;
1268 }
1269 } else {
1270 basis = 0;
1271 }
1272
c2c28dfd 1273 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
672d18b2 1274 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1275
1276 hash_cstr = xasprintf("%u", hash);
bde9f75d 1277 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1278 free(hash_cstr);
1279 } else {
bde9f75d 1280 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1281 }
1282}
1283
1284void
1285bond_init(void)
1286{
0e15264f 1287 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1288 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1289 NULL);
0e15264f 1290 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1291 bond_unixctl_migrate, NULL);
0e15264f 1292 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1293 bond_unixctl_set_active_slave, NULL);
0e15264f 1294 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1295 bond_unixctl_enable_slave, NULL);
0e15264f 1296 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1297 bond_unixctl_disable_slave, NULL);
0e15264f 1298 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1299 bond_unixctl_hash, NULL);
f620b43a
BP
1300}
1301\f
95aafb2a
EJ
1302static void
1303bond_entry_reset(struct bond *bond)
1304{
1305 if (bond->balance != BM_AB) {
1306 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1307
1308 if (!bond->hash) {
1309 bond->hash = xmalloc(hash_len);
1310 }
1311 memset(bond->hash, 0, hash_len);
1312
1313 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1314 } else {
1315 free(bond->hash);
1316 bond->hash = NULL;
1317 }
1318}
1319
f620b43a
BP
1320static struct bond_slave *
1321bond_slave_lookup(struct bond *bond, const void *slave_)
1322{
1323 struct bond_slave *slave;
1324
1325 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1326 &bond->slaves) {
1327 if (slave->aux == slave_) {
1328 return slave;
1329 }
1330 }
1331
1332 return NULL;
1333}
1334
f620b43a 1335static void
4a1b8f30 1336bond_enable_slave(struct bond_slave *slave, bool enable)
f620b43a
BP
1337{
1338 slave->delay_expires = LLONG_MAX;
1339 if (enable != slave->enabled) {
4a1b8f30 1340 slave->bond->bond_revalidate = true;
f620b43a 1341 slave->enabled = enable;
4a1b8f30
EJ
1342 VLOG_INFO("interface %s: %s", slave->name,
1343 slave->enabled ? "enabled" : "disabled");
f620b43a
BP
1344 }
1345}
1346
1347static void
4a1b8f30 1348bond_link_status_update(struct bond_slave *slave)
f620b43a
BP
1349{
1350 struct bond *bond = slave->bond;
1351 bool up;
1352
296f6519 1353 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1354 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1355 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1356 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1357 slave->name, up ? "up" : "down");
1358 if (up == slave->enabled) {
1359 slave->delay_expires = LLONG_MAX;
1360 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1361 slave->name, up ? "disabled" : "enabled");
1362 } else {
bdebeece 1363 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1364 : up ? bond->updelay : bond->downdelay);
1365 slave->delay_expires = time_msec() + delay;
1366 if (delay) {
1367 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1368 "for %d ms",
1369 slave->name,
1370 up ? "enabled" : "disabled",
1371 up ? "up" : "down",
1372 delay);
1373 }
1374 }
1375 }
1376
1377 if (time_msec() >= slave->delay_expires) {
4a1b8f30 1378 bond_enable_slave(slave, up);
f620b43a
BP
1379 }
1380}
1381
f620b43a 1382static unsigned int
672d18b2 1383bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
f620b43a 1384{
672d18b2 1385 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
f620b43a
BP
1386}
1387
1388static unsigned int
672d18b2 1389bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
f620b43a
BP
1390{
1391 struct flow hash_flow = *flow;
d84d4b88 1392 hash_flow.vlan_tci = htons(vlan);
f620b43a
BP
1393
1394 /* The symmetric quality of this hash function is not required, but
1395 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1396 * purposes, so we use it out of convenience. */
672d18b2 1397 return flow_hash_symmetric_l4(&hash_flow, basis);
f620b43a
BP
1398}
1399
fb0b29a3
EJ
1400static unsigned int
1401bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1402{
cb22974d 1403 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1404
bdebeece 1405 return (bond->balance == BM_TCP
672d18b2
EJ
1406 ? bond_hash_tcp(flow, vlan, bond->basis)
1407 : bond_hash_src(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1408}
1409
f620b43a
BP
1410static struct bond_entry *
1411lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1412 uint16_t vlan)
1413{
fb0b29a3 1414 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1415}
1416
1417static struct bond_slave *
1418choose_output_slave(const struct bond *bond, const struct flow *flow,
4a1b8f30 1419 struct flow_wildcards *wc, uint16_t vlan)
f620b43a
BP
1420{
1421 struct bond_entry *e;
9dd165e0 1422 int balance;
f620b43a 1423
9dd165e0 1424 balance = bond->balance;
bdebeece
EJ
1425 if (bond->lacp_status == LACP_CONFIGURED) {
1426 /* LACP has been configured on this bond but negotiations were
9dd165e0
RK
1427 * unsuccussful. If lacp_fallback_ab is enabled use active-
1428 * backup mode else drop all traffic. */
1429 if (!bond->lacp_fallback_ab) {
1430 return NULL;
1431 }
1432 balance = BM_AB;
bdebeece
EJ
1433 }
1434
9dd165e0 1435 switch (balance) {
f620b43a
BP
1436 case BM_AB:
1437 return bond->active_slave;
1438
f620b43a 1439 case BM_TCP:
bdebeece
EJ
1440 if (bond->lacp_status != LACP_NEGOTIATED) {
1441 /* Must have LACP negotiations for TCP balanced bonds. */
1442 return NULL;
1443 }
bcd2633a 1444 if (wc) {
6cdd5145 1445 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
bcd2633a 1446 }
bdebeece
EJ
1447 /* Fall Through. */
1448 case BM_SLB:
bcd2633a 1449 if (wc) {
6cdd5145 1450 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
bcd2633a 1451 }
f620b43a
BP
1452 e = lookup_bond_entry(bond, flow, vlan);
1453 if (!e->slave || !e->slave->enabled) {
c804cadf
EJ
1454 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1455 struct bond_slave, hmap_node);
1456 if (!e->slave->enabled) {
1457 e->slave = bond->active_slave;
1458 }
f620b43a
BP
1459 }
1460 return e->slave;
1461
1462 default:
428b2edd 1463 OVS_NOT_REACHED();
f620b43a
BP
1464 }
1465}
1466
1467static struct bond_slave *
1468bond_choose_slave(const struct bond *bond)
1469{
1470 struct bond_slave *slave, *best;
1471
1472 /* Find an enabled slave. */
1473 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1474 if (slave->enabled) {
1475 return slave;
1476 }
1477 }
1478
1479 /* All interfaces are disabled. Find an interface that will be enabled
1480 * after its updelay expires. */
1481 best = NULL;
1482 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1483 if (slave->delay_expires != LLONG_MAX
296f6519 1484 && slave->may_enable
f620b43a
BP
1485 && (!best || slave->delay_expires < best->delay_expires)) {
1486 best = slave;
1487 }
1488 }
1489 return best;
1490}
1491
1492static void
4a1b8f30 1493bond_choose_active_slave(struct bond *bond)
f620b43a
BP
1494{
1495 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1496 struct bond_slave *old_active_slave = bond->active_slave;
1497
1498 bond->active_slave = bond_choose_slave(bond);
1499 if (bond->active_slave) {
1500 if (bond->active_slave->enabled) {
1501 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1502 bond->name, bond->active_slave->name);
1503 } else {
1504 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1505 "remaining %lld ms updelay (since no interface was "
1506 "enabled)", bond->name, bond->active_slave->name,
1507 bond->active_slave->delay_expires - time_msec());
4a1b8f30 1508 bond_enable_slave(bond->active_slave, true);
f620b43a
BP
1509 }
1510
1511 bond->send_learning_packets = true;
1512 } else if (old_active_slave) {
d28b9ead 1513 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1514 }
1515}
1516
f620b43a
BP
1517/* Attempts to make the sum of the bond slaves' statistics appear on the fake
1518 * bond interface. */
1519static void
1520bond_update_fake_slave_stats(struct bond *bond)
1521{
1522 struct netdev_stats bond_stats;
1523 struct bond_slave *slave;
1524 struct netdev *bond_dev;
1525
1526 memset(&bond_stats, 0, sizeof bond_stats);
1527
1528 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1529 struct netdev_stats slave_stats;
1530
1531 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1532 /* XXX: We swap the stats here because they are swapped back when
1533 * reported by the internal device. The reason for this is
1534 * internal devices normally represent packets going into the
1535 * system but when used as fake bond device they represent packets
1536 * leaving the system. We really should do this in the internal
1537 * device itself because changing it here reverses the counts from
1538 * the perspective of the switch. However, the internal device
1539 * doesn't know what type of device it represents so we have to do
1540 * it here for now. */
1541 bond_stats.tx_packets += slave_stats.rx_packets;
1542 bond_stats.tx_bytes += slave_stats.rx_bytes;
1543 bond_stats.rx_packets += slave_stats.tx_packets;
1544 bond_stats.rx_bytes += slave_stats.tx_bytes;
1545 }
1546 }
1547
18812dff 1548 if (!netdev_open(bond->name, "system", &bond_dev)) {
f620b43a
BP
1549 netdev_set_stats(bond_dev, &bond_stats);
1550 netdev_close(bond_dev);
1551 }
1552}