]> git.proxmox.com Git - mirror_ovs.git/blame - lib/bond.c
mac-learning: Stop using tags.
[mirror_ovs.git] / lib / bond.c
CommitLineData
f620b43a 1/*
09a5d390 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a
BP
25
26#include "coverage.h"
27#include "dynamic-string.h"
28#include "flow.h"
29#include "hmap.h"
bdebeece 30#include "lacp.h"
f620b43a
BP
31#include "list.h"
32#include "netdev.h"
33#include "odp-util.h"
34#include "ofpbuf.h"
35#include "packets.h"
36#include "poll-loop.h"
fc1d4f01 37#include "shash.h"
f620b43a
BP
38#include "tag.h"
39#include "timeval.h"
40#include "unixctl.h"
41#include "vlog.h"
42
43VLOG_DEFINE_THIS_MODULE(bond);
44
f620b43a
BP
45/* Bit-mask for hashing a flow down to a bucket.
46 * There are (BOND_MASK + 1) buckets. */
47#define BOND_MASK 0xff
48
49/* A hash bucket for mapping a flow to a slave.
50 * "struct bond" has an array of (BOND_MASK + 1) of these. */
51struct bond_entry {
52 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
53 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
5dd165d3 54 tag_type tag; /* Tag for entry<->facet association. */
f620b43a
BP
55 struct list list_node; /* In bond_slave's 'entries' list. */
56};
57
58/* A bond slave, that is, one of the links comprising a bond. */
59struct bond_slave {
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave. */
63
64 struct netdev *netdev; /* Network device, owned by the client. */
1ea24138 65 unsigned int change_seq; /* Tracks changes in 'netdev'. */
f620b43a
BP
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
67
68 /* Link status. */
69 long long delay_expires; /* Time after which 'enabled' may change. */
f620b43a 70 bool enabled; /* May be chosen for flows? */
296f6519 71 bool may_enable; /* Client considers this slave bondable. */
f620b43a
BP
72 tag_type tag; /* Tag associated with this slave. */
73
74 /* Rebalancing info. Used only by bond_rebalance(). */
75 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
76 struct list entries; /* 'struct bond_entry's assigned here. */
77 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
78};
79
80/* A bond, that is, a set of network devices grouped to improve performance or
81 * robustness. */
82struct bond {
83 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
84 char *name; /* Name provided by client. */
85
86 /* Slaves. */
87 struct hmap slaves;
88
89 /* Bonding info. */
90 enum bond_mode balance; /* Balancing mode, one of BM_*. */
91 struct bond_slave *active_slave;
92 tag_type no_slaves_tag; /* Tag for flows when all slaves disabled. */
93 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
bdebeece 94 enum lacp_status lacp_status; /* Status of LACP negotiations. */
62904702 95 bool bond_revalidate; /* True if flows need revalidation. */
672d18b2 96 uint32_t basis; /* Basis for flow hash function. */
f620b43a
BP
97
98 /* SLB specific bonding info. */
99 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
100 int rebalance_interval; /* Interval between rebalances, in ms. */
101 long long int next_rebalance; /* Next rebalancing time. */
102 bool send_learning_packets;
103
f620b43a
BP
104 /* Legacy compatibility. */
105 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
106
107 /* Tag set saved for next bond_run(). This tag set is a kluge for cases
108 * where we can't otherwise provide revalidation feedback to the client.
109 * That's only unixctl commands now; I hope no other cases will arise. */
110 struct tag_set unixctl_tags;
03366a2d 111
3bfd3972 112 atomic_int ref_cnt;
f620b43a
BP
113};
114
3bfd3972
EJ
115static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
116static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
117static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
f620b43a 118
3bfd3972
EJ
119static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
120static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
121 OVS_REQ_RDLOCK(rwlock);
f620b43a 122static void bond_enable_slave(struct bond_slave *, bool enable,
3bfd3972
EJ
123 struct tag_set *) OVS_REQ_WRLOCK(rwlock);
124static void bond_link_status_update(struct bond_slave *, struct tag_set *)
125 OVS_REQ_WRLOCK(rwlock);
126static void bond_choose_active_slave(struct bond *, struct tag_set *)
127 OVS_REQ_WRLOCK(rwlock);;
f620b43a 128static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
672d18b2
EJ
129 uint16_t vlan, uint32_t basis);
130static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
131 uint32_t basis);
f620b43a
BP
132static struct bond_entry *lookup_bond_entry(const struct bond *,
133 const struct flow *,
3bfd3972
EJ
134 uint16_t vlan)
135 OVS_REQ_RDLOCK(rwlock);
136static tag_type bond_get_active_slave_tag(const struct bond *)
137 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
138static struct bond_slave *choose_output_slave(const struct bond *,
139 const struct flow *,
bcd2633a 140 struct flow_wildcards *,
3bfd3972
EJ
141 uint16_t vlan, tag_type *tags)
142 OVS_REQ_RDLOCK(rwlock);
143static void bond_update_fake_slave_stats(struct bond *)
144 OVS_REQ_RDLOCK(rwlock);
f620b43a
BP
145
146/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
147 * stores the mode in '*balance' and returns true. Otherwise returns false
148 * without modifying '*balance'. */
149bool
150bond_mode_from_string(enum bond_mode *balance, const char *s)
151{
152 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
153 *balance = BM_TCP;
154 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
155 *balance = BM_SLB;
156 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
157 *balance = BM_AB;
158 } else {
159 return false;
160 }
161 return true;
162}
163
164/* Returns a string representing 'balance'. */
165const char *
166bond_mode_to_string(enum bond_mode balance) {
167 switch (balance) {
168 case BM_TCP:
169 return "balance-tcp";
170 case BM_SLB:
171 return "balance-slb";
172 case BM_AB:
173 return "active-backup";
174 }
175 NOT_REACHED();
176}
177
f620b43a
BP
178\f
179/* Creates and returns a new bond whose configuration is initially taken from
180 * 's'.
181 *
182 * The caller should register each slave on the new bond by calling
183 * bond_slave_register(). */
184struct bond *
185bond_create(const struct bond_settings *s)
186{
187 struct bond *bond;
188
189 bond = xzalloc(sizeof *bond);
190 hmap_init(&bond->slaves);
191 bond->no_slaves_tag = tag_create_random();
f620b43a 192 bond->next_fake_iface_update = LLONG_MAX;
3bfd3972 193 atomic_init(&bond->ref_cnt, 1);
f620b43a
BP
194
195 bond_reconfigure(bond, s);
196
197 tag_set_init(&bond->unixctl_tags);
198
199 return bond;
200}
201
03366a2d
EJ
202struct bond *
203bond_ref(const struct bond *bond_)
204{
205 struct bond *bond = CONST_CAST(struct bond *, bond_);
206
bca0b3b4 207 if (bond) {
3bfd3972
EJ
208 int orig;
209 atomic_add(&bond->ref_cnt, 1, &orig);
210 ovs_assert(orig > 0);
bca0b3b4 211 }
03366a2d
EJ
212 return bond;
213}
214
f620b43a
BP
215/* Frees 'bond'. */
216void
03366a2d 217bond_unref(struct bond *bond)
f620b43a
BP
218{
219 struct bond_slave *slave, *next_slave;
3bfd3972 220 int orig;
f620b43a
BP
221
222 if (!bond) {
223 return;
224 }
225
3bfd3972
EJ
226 atomic_sub(&bond->ref_cnt, 1, &orig);
227 ovs_assert(orig > 0);
228 if (orig != 1) {
03366a2d
EJ
229 return;
230 }
231
3bfd3972
EJ
232 ovs_rwlock_wrlock(&rwlock);
233 hmap_remove(all_bonds, &bond->hmap_node);
234 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
235
236 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
237 hmap_remove(&bond->slaves, &slave->hmap_node);
238 /* Client owns 'slave->netdev'. */
239 free(slave->name);
240 free(slave);
241 }
242 hmap_destroy(&bond->slaves);
243
244 free(bond->hash);
f620b43a
BP
245 free(bond->name);
246 free(bond);
247}
248
249/* Updates 'bond''s overall configuration to 's'.
250 *
251 * The caller should register each slave on 'bond' by calling
252 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 253 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
254 *
255 * Returns true if the configuration has changed in such a way that requires
256 * flow revalidation.
257 * */
258bool
f620b43a
BP
259bond_reconfigure(struct bond *bond, const struct bond_settings *s)
260{
59d7b2b6
EJ
261 bool revalidate = false;
262
3bfd3972 263 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
264 if (!bond->name || strcmp(bond->name, s->name)) {
265 if (bond->name) {
3bfd3972 266 hmap_remove(all_bonds, &bond->hmap_node);
f620b43a
BP
267 free(bond->name);
268 }
269 bond->name = xstrdup(s->name);
3bfd3972 270 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
f620b43a
BP
271 }
272
f620b43a
BP
273 bond->updelay = s->up_delay;
274 bond->downdelay = s->down_delay;
bc1b010c
EJ
275
276 if (bond->rebalance_interval != s->rebalance_interval) {
277 bond->rebalance_interval = s->rebalance_interval;
278 revalidate = true;
279 }
f620b43a 280
59d7b2b6
EJ
281 if (bond->balance != s->balance) {
282 bond->balance = s->balance;
283 revalidate = true;
284 }
285
672d18b2
EJ
286 if (bond->basis != s->basis) {
287 bond->basis = s->basis;
288 revalidate = true;
289 }
290
f620b43a
BP
291 if (s->fake_iface) {
292 if (bond->next_fake_iface_update == LLONG_MAX) {
293 bond->next_fake_iface_update = time_msec();
294 }
295 } else {
296 bond->next_fake_iface_update = LLONG_MAX;
297 }
59d7b2b6 298
62904702
EJ
299 if (bond->bond_revalidate) {
300 revalidate = true;
301 bond->bond_revalidate = false;
302 }
303
95aafb2a
EJ
304 if (bond->balance == BM_AB || !bond->hash || revalidate) {
305 bond_entry_reset(bond);
306 }
307
3bfd3972 308 ovs_rwlock_unlock(&rwlock);
59d7b2b6 309 return revalidate;
f620b43a
BP
310}
311
f8ddccd2 312static void
1ea24138 313bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
3bfd3972 314 OVS_REQ_WRLOCK(rwlock)
f8ddccd2
BP
315{
316 if (slave->netdev != netdev) {
f8ddccd2 317 slave->netdev = netdev;
1ea24138 318 slave->change_seq = 0;
f8ddccd2
BP
319 }
320}
321
f620b43a
BP
322/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
323 * arbitrary client-provided pointer that uniquely identifies a slave within a
324 * bond. If 'slave_' already exists within 'bond' then this function
325 * reconfigures the existing slave.
326 *
327 * 'netdev' must be the network device that 'slave_' represents. It is owned
328 * by the client, so the client must not close it before either unregistering
329 * 'slave_' or destroying 'bond'.
4d6fb5eb 330 */
f620b43a 331void
df53d41c 332bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
f620b43a 333{
3bfd3972 334 struct bond_slave *slave;
f620b43a 335
3bfd3972
EJ
336 ovs_rwlock_wrlock(&rwlock);
337 slave = bond_slave_lookup(bond, slave_);
f620b43a
BP
338 if (!slave) {
339 slave = xzalloc(sizeof *slave);
340
341 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
342 slave->bond = bond;
343 slave->aux = slave_;
344 slave->delay_expires = LLONG_MAX;
244b2160 345 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 346 bond->bond_revalidate = true;
244b2160 347
b3c18f66 348 slave->enabled = false;
c8544aa1 349 bond_enable_slave(slave, netdev_get_carrier(netdev), NULL);
f620b43a
BP
350 }
351
1ea24138 352 bond_slave_set_netdev__(slave, netdev);
a6934aa9 353
f620b43a
BP
354 free(slave->name);
355 slave->name = xstrdup(netdev_get_name(netdev));
3bfd3972 356 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
357}
358
f8ddccd2
BP
359/* Updates the network device to be used with 'slave_' to 'netdev'.
360 *
361 * This is useful if the caller closes and re-opens the network device
362 * registered with bond_slave_register() but doesn't need to change anything
363 * else. */
364void
365bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
366{
3bfd3972
EJ
367 struct bond_slave *slave;
368
369 ovs_rwlock_wrlock(&rwlock);
370 slave = bond_slave_lookup(bond, slave_);
f8ddccd2 371 if (slave) {
1ea24138 372 bond_slave_set_netdev__(slave, netdev);
f8ddccd2 373 }
3bfd3972 374 ovs_rwlock_unlock(&rwlock);
f8ddccd2
BP
375}
376
f620b43a
BP
377/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
378 * then this function has no effect.
379 *
380 * Unregistering a slave invalidates all flows. */
381void
382bond_slave_unregister(struct bond *bond, const void *slave_)
383{
3bfd3972 384 struct bond_slave *slave;
f620b43a
BP
385 bool del_active;
386
3bfd3972
EJ
387 ovs_rwlock_wrlock(&rwlock);
388 slave = bond_slave_lookup(bond, slave_);
f620b43a 389 if (!slave) {
3bfd3972 390 goto out;
f620b43a
BP
391 }
392
b3c18f66
EJ
393 bond_enable_slave(slave, false, NULL);
394
f620b43a
BP
395 del_active = bond->active_slave == slave;
396 if (bond->hash) {
397 struct bond_entry *e;
398 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
399 if (e->slave == slave) {
400 e->slave = NULL;
401 }
402 }
403 }
404
405 free(slave->name);
406
407 hmap_remove(&bond->slaves, &slave->hmap_node);
408 /* Client owns 'slave->netdev'. */
409 free(slave);
410
411 if (del_active) {
412 struct tag_set tags;
413
414 tag_set_init(&tags);
415 bond_choose_active_slave(bond, &tags);
416 bond->send_learning_packets = true;
417 }
3bfd3972
EJ
418out:
419 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
420}
421
296f6519
EJ
422/* Should be called on each slave in 'bond' before bond_run() to indicate
423 * whether or not 'slave_' may be enabled. This function is intended to allow
424 * other protocols to have some impact on bonding decisions. For example LACP
425 * or high level link monitoring protocols may decide that a given slave should
426 * not be able to send traffic. */
4d6fb5eb 427void
296f6519 428bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 429{
3bfd3972 430 ovs_rwlock_wrlock(&rwlock);
296f6519 431 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
3bfd3972 432 ovs_rwlock_unlock(&rwlock);
4d6fb5eb
EJ
433}
434
f620b43a
BP
435/* Performs periodic maintenance on 'bond'. The caller must provide 'tags' to
436 * allow tagged flows to be invalidated.
437 *
438 * The caller should check bond_should_send_learning_packets() afterward. */
439void
bdebeece 440bond_run(struct bond *bond, struct tag_set *tags, enum lacp_status lacp_status)
f620b43a
BP
441{
442 struct bond_slave *slave;
443
3bfd3972 444 ovs_rwlock_wrlock(&rwlock);
bdebeece
EJ
445 if (bond->lacp_status != lacp_status) {
446 bond->lacp_status = lacp_status;
4592d0e2
EJ
447 bond->bond_revalidate = true;
448 }
4d6fb5eb 449
f620b43a
BP
450 /* Enable slaves based on link status and LACP feedback. */
451 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
452 bond_link_status_update(slave, tags);
1ea24138 453 slave->change_seq = netdev_change_seq(slave->netdev);
f620b43a
BP
454 }
455 if (!bond->active_slave || !bond->active_slave->enabled) {
456 bond_choose_active_slave(bond, tags);
457 }
458
459 /* Update fake bond interface stats. */
460 if (time_msec() >= bond->next_fake_iface_update) {
461 bond_update_fake_slave_stats(bond);
462 bond->next_fake_iface_update = time_msec() + 1000;
463 }
464
62904702 465 if (bond->bond_revalidate) {
df53d41c 466 struct bond_slave *slave;
dc9908b3 467
df53d41c 468 bond->bond_revalidate = false;
95aafb2a 469 bond_entry_reset(bond);
df53d41c
EJ
470 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
471 tag_set_add(tags, slave->tag);
dc9908b3 472 }
0008fbcb 473 tag_set_add(tags, bond->no_slaves_tag);
dc9908b3
EJ
474 }
475
f620b43a
BP
476 /* Invalidate any tags required by */
477 tag_set_union(tags, &bond->unixctl_tags);
478 tag_set_init(&bond->unixctl_tags);
3bfd3972 479 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
480}
481
482/* Causes poll_block() to wake up when 'bond' needs something to be done. */
483void
484bond_wait(struct bond *bond)
485{
486 struct bond_slave *slave;
487
3bfd3972 488 ovs_rwlock_rdlock(&rwlock);
f620b43a
BP
489 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
490 if (slave->delay_expires != LLONG_MAX) {
491 poll_timer_wait_until(slave->delay_expires);
492 }
1ea24138
EJ
493
494 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
495 poll_immediate_wake();
496 }
f620b43a
BP
497 }
498
499 if (bond->next_fake_iface_update != LLONG_MAX) {
500 poll_timer_wait_until(bond->next_fake_iface_update);
501 }
502
503 /* Ensure that any saved tags get revalidated right away. */
504 if (!tag_set_is_empty(&bond->unixctl_tags)) {
505 poll_immediate_wake();
506 }
3bfd3972 507 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
508
509 /* We don't wait for bond->next_rebalance because rebalancing can only run
510 * at a flow account checkpoint. ofproto does checkpointing on its own
511 * schedule and bond_rebalance() gets called afterward, so we'd just be
512 * waking up for no purpose. */
513}
514\f
515/* MAC learning table interaction. */
516
517static bool
518may_send_learning_packets(const struct bond *bond)
519{
bdebeece 520 return bond->lacp_status == LACP_DISABLED
64e2748d 521 && (bond->balance == BM_SLB || bond->balance == BM_AB)
bdebeece 522 && bond->active_slave;
f620b43a
BP
523}
524
525/* Returns true if 'bond' needs the client to send out packets to assist with
526 * MAC learning on 'bond'. If this function returns true, then the client
527 * should iterate through its MAC learning table for the bridge on which 'bond'
528 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 529 * it should call bond_compose_learning_packet().
f620b43a 530 *
477879ea
BP
531 * This function will only return true if 'bond' is in SLB or active-backup
532 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
533 * necessary.
f620b43a
BP
534 *
535 * Calling this function resets the state that it checks. */
536bool
537bond_should_send_learning_packets(struct bond *bond)
538{
3bfd3972
EJ
539 bool send;
540
541 ovs_rwlock_wrlock(&rwlock);
542 send = bond->send_learning_packets && may_send_learning_packets(bond);
f620b43a 543 bond->send_learning_packets = false;
3bfd3972 544 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
545 return send;
546}
547
548/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
549 *
ea131871
JG
550 * See bond_should_send_learning_packets() for description of usage. The
551 * caller should send the composed packet on the port associated with
552 * port_aux and takes ownership of the returned ofpbuf. */
553struct ofpbuf *
554bond_compose_learning_packet(struct bond *bond,
555 const uint8_t eth_src[ETH_ADDR_LEN],
556 uint16_t vlan, void **port_aux)
f620b43a
BP
557{
558 struct bond_slave *slave;
ea131871 559 struct ofpbuf *packet;
00ed8314 560 tag_type tags = 0;
f620b43a 561 struct flow flow;
f620b43a 562
3bfd3972 563 ovs_rwlock_rdlock(&rwlock);
cb22974d 564 ovs_assert(may_send_learning_packets(bond));
f620b43a
BP
565 memset(&flow, 0, sizeof flow);
566 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
bcd2633a 567 slave = choose_output_slave(bond, &flow, NULL, vlan, &tags);
f620b43a 568
ea131871 569 packet = ofpbuf_new(0);
2ea838ac 570 compose_rarp(packet, eth_src);
f620b43a 571 if (vlan) {
ea131871 572 eth_push_vlan(packet, htons(vlan));
f620b43a 573 }
f620b43a 574
ea131871 575 *port_aux = slave->aux;
3bfd3972 576 ovs_rwlock_unlock(&rwlock);
ea131871 577 return packet;
f620b43a
BP
578}
579\f
580/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
581 * Ethernet destination address of 'eth_dst', should be admitted.
582 *
583 * The return value is one of the following:
584 *
585 * - BV_ACCEPT: Admit the packet.
586 *
587 * - BV_DROP: Drop the packet.
588 *
589 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
590 * Ethernet source address and VLAN. If there is none, or if the packet
591 * is on the learned port, then admit the packet. If a different port has
592 * been learned, however, drop the packet (and do not use it for MAC
593 * learning).
594 */
595enum bond_verdict
596bond_check_admissibility(struct bond *bond, const void *slave_,
597 const uint8_t eth_dst[ETH_ADDR_LEN], tag_type *tags)
598{
3bfd3972
EJ
599 enum bond_verdict verdict = BV_DROP;
600 struct bond_slave *slave;
9a1c6450 601
3bfd3972
EJ
602 ovs_rwlock_rdlock(&rwlock);
603 slave = bond_slave_lookup(bond, slave_);
4222bbc8 604 if (!slave) {
3bfd3972 605 goto out;
4222bbc8
EJ
606 }
607
9a1c6450
EJ
608 /* LACP bonds have very loose admissibility restrictions because we can
609 * assume the remote switch is aware of the bond and will "do the right
610 * thing". However, as a precaution we drop packets on disabled slaves
611 * because no correctly implemented partner switch should be sending
bdebeece
EJ
612 * packets to them.
613 *
614 * If LACP is configured, but LACP negotiations have been unsuccessful, we
615 * drop all incoming traffic. */
616 switch (bond->lacp_status) {
3bfd3972
EJ
617 case LACP_NEGOTIATED:
618 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
619 goto out;
620 case LACP_CONFIGURED:
621 goto out;
622 case LACP_DISABLED:
623 break;
f620b43a
BP
624 }
625
626 /* Drop all multicast packets on inactive slaves. */
627 if (eth_addr_is_multicast(eth_dst)) {
628 *tags |= bond_get_active_slave_tag(bond);
4222bbc8 629 if (bond->active_slave != slave) {
3bfd3972 630 goto out;
f620b43a
BP
631 }
632 }
633
f931a4c9
BP
634 switch (bond->balance) {
635 case BM_AB:
636 /* Drop all packets which arrive on backup slaves. This is similar to
637 * how Linux bonding handles active-backup bonds. */
7ba7dcf0
EJ
638 *tags |= bond_get_active_slave_tag(bond);
639 if (bond->active_slave != slave) {
640 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
641
e6b2255c
BP
642 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
643 " slave (%s) destined for " ETH_ADDR_FMT,
644 slave->name, ETH_ADDR_ARGS(eth_dst));
3bfd3972 645 goto out;
7ba7dcf0 646 }
3bfd3972
EJ
647 verdict = BV_ACCEPT;
648 goto out;
f931a4c9
BP
649
650 case BM_TCP:
bdebeece
EJ
651 /* TCP balanced bonds require successful LACP negotiated. Based on the
652 * above check, LACP is off on this bond. Therfore, we drop all
653 * incoming traffic. */
3bfd3972 654 goto out;
bdebeece 655
f931a4c9
BP
656 case BM_SLB:
657 /* Drop all packets for which we have learned a different input port,
658 * because we probably sent the packet on one slave and got it back on
659 * the other. Gratuitous ARP packets are an exception to this rule:
660 * the host has moved to another switch. The exception to the
661 * exception is if we locked the learning table to avoid reflections on
662 * bond slaves. */
3bfd3972
EJ
663 verdict = BV_DROP_IF_MOVED;
664 goto out;
7ba7dcf0
EJ
665 }
666
f931a4c9 667 NOT_REACHED();
3bfd3972
EJ
668out:
669 ovs_rwlock_unlock(&rwlock);
670 return verdict;
671
f620b43a
BP
672}
673
674/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
675 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
676 * NULL if the packet should be dropped because no slaves are enabled.
677 *
678 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
679 * should be a VID only (i.e. excluding the PCP bits). Second,
680 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
681 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
682 * packet belongs to (so for an access port it will be the access port's VLAN).
683 *
684 * Adds a tag to '*tags' that associates the flow with the returned slave.
bcd2633a
JP
685 *
686 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
687 * significant in the selection. At some point earlier, 'wc' should
688 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
689 */
690void *
691bond_choose_output_slave(struct bond *bond, const struct flow *flow,
bcd2633a
JP
692 struct flow_wildcards *wc, uint16_t vlan,
693 tag_type *tags)
f620b43a 694{
3bfd3972
EJ
695 struct bond_slave *slave;
696 void *result = NULL;
697
698 ovs_rwlock_rdlock(&rwlock);
699 slave = choose_output_slave(bond, flow, wc, vlan, tags);
f620b43a 700 if (slave) {
df53d41c 701 *tags |= slave->tag;
3bfd3972 702 result = slave->aux;
f620b43a
BP
703 } else {
704 *tags |= bond->no_slaves_tag;
f620b43a 705 }
3bfd3972
EJ
706 ovs_rwlock_unlock(&rwlock);
707 return result;
f620b43a 708}
f620b43a
BP
709\f
710/* Rebalancing. */
711
1b137691 712static bool
3bfd3972 713bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
1b137691 714{
bc1b010c
EJ
715 return bond->rebalance_interval
716 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
717}
718
f620b43a
BP
719/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
720void
721bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
722 uint64_t n_bytes)
723{
3bfd3972 724 ovs_rwlock_wrlock(&rwlock);
1b137691 725 if (bond_is_balanced(bond)) {
f620b43a 726 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a 727 }
3bfd3972 728 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
729}
730
731static struct bond_slave *
3bfd3972 732bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
733{
734 return CONTAINER_OF(bal, struct bond_slave, bal_node);
735}
736
737static void
738log_bals(struct bond *bond, const struct list *bals)
739{
740 if (VLOG_IS_DBG_ENABLED()) {
741 struct ds ds = DS_EMPTY_INITIALIZER;
742 const struct bond_slave *slave;
743
744 LIST_FOR_EACH (slave, bal_node, bals) {
745 if (ds.length) {
746 ds_put_char(&ds, ',');
747 }
748 ds_put_format(&ds, " %s %"PRIu64"kB",
749 slave->name, slave->tx_bytes / 1024);
750
751 if (!slave->enabled) {
752 ds_put_cstr(&ds, " (disabled)");
753 }
754 if (!list_is_empty(&slave->entries)) {
755 struct bond_entry *e;
756
757 ds_put_cstr(&ds, " (");
758 LIST_FOR_EACH (e, list_node, &slave->entries) {
759 if (&e->list_node != list_front(&slave->entries)) {
760 ds_put_cstr(&ds, " + ");
761 }
762 ds_put_format(&ds, "h%td: %"PRIu64"kB",
763 e - bond->hash, e->tx_bytes / 1024);
764 }
765 ds_put_cstr(&ds, ")");
766 }
767 }
768 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
769 ds_destroy(&ds);
770 }
771}
772
773/* Shifts 'hash' from its current slave to 'to'. */
774static void
775bond_shift_load(struct bond_entry *hash, struct bond_slave *to,
776 struct tag_set *set)
777{
778 struct bond_slave *from = hash->slave;
779 struct bond *bond = from->bond;
780 uint64_t delta = hash->tx_bytes;
781
782 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
783 "from %s to %s (now carrying %"PRIu64"kB and "
784 "%"PRIu64"kB load, respectively)",
785 bond->name, delta / 1024, hash - bond->hash,
786 from->name, to->name,
787 (from->tx_bytes - delta) / 1024,
788 (to->tx_bytes + delta) / 1024);
789
790 /* Shift load away from 'from' to 'to'. */
791 from->tx_bytes -= delta;
792 to->tx_bytes += delta;
793
794 /* Arrange for flows to be revalidated. */
795 tag_set_add(set, hash->tag);
796 hash->slave = to;
797 hash->tag = tag_create_random();
798}
799
09a5d390
BP
800/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
801 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
802 * given that doing so must decrease the ratio of the load on the two slaves by
803 * at least 0.1. Returns NULL if there is no appropriate entry.
804 *
805 * The list of entries isn't sorted. I don't know of a reason to prefer to
806 * shift away small hashes or large hashes. */
807static struct bond_entry *
808choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
809{
810 struct bond_entry *e;
811
812 if (list_is_short(&from->entries)) {
813 /* 'from' carries no more than one MAC hash, so shifting load away from
814 * it would be pointless. */
815 return NULL;
816 }
817
818 LIST_FOR_EACH (e, list_node, &from->entries) {
819 double old_ratio, new_ratio;
820 uint64_t delta;
821
822 if (to_tx_bytes == 0) {
823 /* Nothing on the new slave, move it. */
824 return e;
825 }
826
827 delta = e->tx_bytes;
828 old_ratio = (double)from->tx_bytes / to_tx_bytes;
829 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
75fad143
ZK
830 if (old_ratio - new_ratio > 0.1
831 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
832 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
833 and 'to' slave have the same load. Therefore, we only move an
834 entry if it decreases the load on 'from', and brings us closer
835 to equal traffic load. */
f620b43a
BP
836 return e;
837 }
838 }
839
840 return NULL;
841}
842
843/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
844 * maintained. */
845static void
846insert_bal(struct list *bals, struct bond_slave *slave)
847{
848 struct bond_slave *pos;
849
850 LIST_FOR_EACH (pos, bal_node, bals) {
851 if (slave->tx_bytes > pos->tx_bytes) {
852 break;
853 }
854 }
855 list_insert(&pos->bal_node, &slave->bal_node);
856}
857
858/* Removes 'slave' from its current list and then inserts it into 'bals' so
859 * that descending order of 'tx_bytes' is maintained. */
860static void
861reinsert_bal(struct list *bals, struct bond_slave *slave)
862{
863 list_remove(&slave->bal_node);
864 insert_bal(bals, slave);
865}
866
867/* If 'bond' needs rebalancing, does so.
868 *
869 * The caller should have called bond_account() for each active flow, to ensure
870 * that flow data is consistently accounted at this point. */
871void
872bond_rebalance(struct bond *bond, struct tag_set *tags)
873{
874 struct bond_slave *slave;
875 struct bond_entry *e;
876 struct list bals;
877
3bfd3972 878 ovs_rwlock_wrlock(&rwlock);
1b137691 879 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
3bfd3972 880 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
881 return;
882 }
883 bond->next_rebalance = time_msec() + bond->rebalance_interval;
884
885 /* Add each bond_entry to its slave's 'entries' list.
886 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
887 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
888 slave->tx_bytes = 0;
889 list_init(&slave->entries);
890 }
891 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
892 if (e->slave && e->tx_bytes) {
893 e->slave->tx_bytes += e->tx_bytes;
894 list_push_back(&e->slave->entries, &e->list_node);
895 }
896 }
897
898 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
899 *
900 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
901 * with a proper list sort algorithm. */
902 list_init(&bals);
903 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
904 if (slave->enabled) {
905 insert_bal(&bals, slave);
906 }
907 }
908 log_bals(bond, &bals);
909
910 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
911 while (!list_is_short(&bals)) {
912 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
913 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
914 uint64_t overload;
915
916 overload = from->tx_bytes - to->tx_bytes;
917 if (overload < to->tx_bytes >> 5 || overload < 100000) {
918 /* The extra load on 'from' (and all less-loaded slaves), compared
919 * to that of 'to' (the least-loaded slave), is less than ~3%, or
920 * it is less than ~1Mbps. No point in rebalancing. */
921 break;
922 }
923
09a5d390
BP
924 /* 'from' is carrying significantly more load than 'to'. Pick a hash
925 * to move from 'from' to 'to'. */
f620b43a
BP
926 e = choose_entry_to_migrate(from, to->tx_bytes);
927 if (e) {
928 bond_shift_load(e, to, tags);
929
930 /* Delete element from from->entries.
931 *
932 * We don't add the element to to->hashes. That would only allow
933 * 'e' to be migrated to another slave in this rebalancing run, and
934 * there is no point in doing that. */
935 list_remove(&e->list_node);
936
937 /* Re-sort 'bals'. */
938 reinsert_bal(&bals, from);
939 reinsert_bal(&bals, to);
940 } else {
941 /* Can't usefully migrate anything away from 'from'.
942 * Don't reconsider it. */
943 list_remove(&from->bal_node);
944 }
945 }
946
947 /* Implement exponentially weighted moving average. A weight of 1/2 causes
948 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
949 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
950 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
951 e->tx_bytes /= 2;
952 if (!e->tx_bytes) {
953 e->slave = NULL;
954 }
955 }
3bfd3972 956 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
957}
958\f
959/* Bonding unixctl user interface functions. */
960
961static struct bond *
3bfd3972 962bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
f620b43a
BP
963{
964 struct bond *bond;
965
966 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
3bfd3972 967 all_bonds) {
f620b43a
BP
968 if (!strcmp(bond->name, name)) {
969 return bond;
970 }
971 }
972 return NULL;
973}
974
975static struct bond_slave *
976bond_lookup_slave(struct bond *bond, const char *slave_name)
977{
978 struct bond_slave *slave;
979
980 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
981 if (!strcmp(slave->name, slave_name)) {
982 return slave;
983 }
984 }
985 return NULL;
986}
987
988static void
989bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
990 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
991 void *aux OVS_UNUSED)
f620b43a
BP
992{
993 struct ds ds = DS_EMPTY_INITIALIZER;
994 const struct bond *bond;
995
996 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
997
3bfd3972
EJ
998 ovs_rwlock_rdlock(&rwlock);
999 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
f620b43a
BP
1000 const struct bond_slave *slave;
1001 size_t i;
1002
1003 ds_put_format(&ds, "%s\t%s\t",
1004 bond->name, bond_mode_to_string(bond->balance));
1005
1006 i = 0;
1007 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1008 if (i++ > 0) {
1009 ds_put_cstr(&ds, ", ");
1010 }
1011 ds_put_cstr(&ds, slave->name);
1012 }
1013 ds_put_char(&ds, '\n');
1014 }
3bfd3972 1015 ovs_rwlock_unlock(&rwlock);
bde9f75d 1016 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
1017 ds_destroy(&ds);
1018}
1019
1020static void
c33a8a25 1021bond_print_details(struct ds *ds, const struct bond *bond)
3bfd3972 1022 OVS_REQ_RDLOCK(rwlock)
f620b43a 1023{
fc1d4f01
EJ
1024 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1025 const struct shash_node **sorted_slaves = NULL;
f620b43a 1026 const struct bond_slave *slave;
fc1d4f01 1027 int i;
f620b43a 1028
c33a8a25
EJ
1029 ds_put_format(ds, "---- %s ----\n", bond->name);
1030 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
1031 bond_mode_to_string(bond->balance));
1032
c33a8a25 1033 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 1034
c33a8a25
EJ
1035 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1036 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 1037
1b137691 1038 if (bond_is_balanced(bond)) {
c33a8a25 1039 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
1040 bond->next_rebalance - time_msec());
1041 }
1042
bdebeece
EJ
1043 ds_put_cstr(ds, "lacp_status: ");
1044 switch (bond->lacp_status) {
1045 case LACP_NEGOTIATED:
1046 ds_put_cstr(ds, "negotiated\n");
1047 break;
1048 case LACP_CONFIGURED:
1049 ds_put_cstr(ds, "configured\n");
1050 break;
1051 case LACP_DISABLED:
1052 ds_put_cstr(ds, "off\n");
1053 break;
1054 default:
1055 ds_put_cstr(ds, "<unknown>\n");
1056 break;
1057 }
4d6fb5eb 1058
f620b43a 1059 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
1060 shash_add(&slave_shash, slave->name, slave);
1061 }
1062 sorted_slaves = shash_sort(&slave_shash);
1063
1064 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 1065 struct bond_entry *be;
f620b43a 1066
fc1d4f01
EJ
1067 slave = sorted_slaves[i]->data;
1068
f620b43a 1069 /* Basic info. */
c33a8a25 1070 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
1071 slave->name, slave->enabled ? "enabled" : "disabled");
1072 if (slave == bond->active_slave) {
c33a8a25 1073 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
1074 }
1075 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 1076 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
1077 slave->enabled ? "downdelay" : "updelay",
1078 slave->delay_expires - time_msec());
1079 }
1080
c33a8a25 1081 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 1082 slave->may_enable ? "true" : "false");
4d6fb5eb 1083
1b137691 1084 if (!bond_is_balanced(bond)) {
f620b43a
BP
1085 continue;
1086 }
1087
1088 /* Hashes. */
f620b43a
BP
1089 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1090 int hash = be - bond->hash;
1091
1092 if (be->slave != slave) {
1093 continue;
1094 }
1095
c33a8a25 1096 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
f620b43a
BP
1097 hash, be->tx_bytes / 1024);
1098
7b9f1974 1099 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1100 }
1101 }
fc1d4f01
EJ
1102 shash_destroy(&slave_shash);
1103 free(sorted_slaves);
c33a8a25
EJ
1104 ds_put_cstr(ds, "\n");
1105}
1106
1107static void
1108bond_unixctl_show(struct unixctl_conn *conn,
1109 int argc, const char *argv[],
1110 void *aux OVS_UNUSED)
1111{
1112 struct ds ds = DS_EMPTY_INITIALIZER;
1113
3bfd3972 1114 ovs_rwlock_rdlock(&rwlock);
c33a8a25
EJ
1115 if (argc > 1) {
1116 const struct bond *bond = bond_find(argv[1]);
1117
1118 if (!bond) {
bde9f75d 1119 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1120 goto out;
c33a8a25
EJ
1121 }
1122 bond_print_details(&ds, bond);
1123 } else {
1124 const struct bond *bond;
1125
3bfd3972 1126 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
c33a8a25
EJ
1127 bond_print_details(&ds, bond);
1128 }
1129 }
1130
bde9f75d 1131 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a 1132 ds_destroy(&ds);
3bfd3972
EJ
1133
1134out:
1135 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1136}
1137
1138static void
0e15264f
BP
1139bond_unixctl_migrate(struct unixctl_conn *conn,
1140 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1141 void *aux OVS_UNUSED)
1142{
0e15264f
BP
1143 const char *bond_s = argv[1];
1144 const char *hash_s = argv[2];
1145 const char *slave_s = argv[3];
f620b43a
BP
1146 struct bond *bond;
1147 struct bond_slave *slave;
1148 struct bond_entry *entry;
1149 int hash;
1150
3bfd3972 1151 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1152 bond = bond_find(bond_s);
1153 if (!bond) {
bde9f75d 1154 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1155 goto out;
f620b43a
BP
1156 }
1157
1158 if (bond->balance != BM_SLB) {
bde9f75d 1159 unixctl_command_reply_error(conn, "not an SLB bond");
3bfd3972 1160 goto out;
f620b43a
BP
1161 }
1162
1163 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1164 hash = atoi(hash_s) & BOND_MASK;
1165 } else {
bde9f75d 1166 unixctl_command_reply_error(conn, "bad hash");
3bfd3972 1167 goto out;
f620b43a
BP
1168 }
1169
1170 slave = bond_lookup_slave(bond, slave_s);
1171 if (!slave) {
bde9f75d 1172 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1173 goto out;
f620b43a
BP
1174 }
1175
1176 if (!slave->enabled) {
bde9f75d 1177 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
3bfd3972 1178 goto out;
f620b43a
BP
1179 }
1180
1181 entry = &bond->hash[hash];
1182 tag_set_add(&bond->unixctl_tags, entry->tag);
1183 entry->slave = slave;
1184 entry->tag = tag_create_random();
bde9f75d 1185 unixctl_command_reply(conn, "migrated");
3bfd3972
EJ
1186
1187out:
1188 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1189}
1190
1191static void
0e15264f
BP
1192bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1193 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1194 void *aux OVS_UNUSED)
1195{
0e15264f
BP
1196 const char *bond_s = argv[1];
1197 const char *slave_s = argv[2];
f620b43a
BP
1198 struct bond *bond;
1199 struct bond_slave *slave;
1200
3bfd3972 1201 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1202 bond = bond_find(bond_s);
1203 if (!bond) {
bde9f75d 1204 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1205 goto out;
f620b43a
BP
1206 }
1207
1208 slave = bond_lookup_slave(bond, slave_s);
1209 if (!slave) {
bde9f75d 1210 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1211 goto out;
f620b43a
BP
1212 }
1213
1214 if (!slave->enabled) {
bde9f75d 1215 unixctl_command_reply_error(conn, "cannot make disabled slave active");
3bfd3972 1216 goto out;
f620b43a
BP
1217 }
1218
1219 if (bond->active_slave != slave) {
1220 tag_set_add(&bond->unixctl_tags, bond_get_active_slave_tag(bond));
1221 bond->active_slave = slave;
1222 bond->active_slave->tag = tag_create_random();
1223 VLOG_INFO("bond %s: active interface is now %s",
1224 bond->name, slave->name);
1225 bond->send_learning_packets = true;
bde9f75d 1226 unixctl_command_reply(conn, "done");
f620b43a 1227 } else {
bde9f75d 1228 unixctl_command_reply(conn, "no change");
f620b43a 1229 }
3bfd3972
EJ
1230out:
1231 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1232}
1233
1234static void
0e15264f 1235enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1236{
0e15264f
BP
1237 const char *bond_s = argv[1];
1238 const char *slave_s = argv[2];
f620b43a
BP
1239 struct bond *bond;
1240 struct bond_slave *slave;
1241
3bfd3972 1242 ovs_rwlock_wrlock(&rwlock);
f620b43a
BP
1243 bond = bond_find(bond_s);
1244 if (!bond) {
bde9f75d 1245 unixctl_command_reply_error(conn, "no such bond");
3bfd3972 1246 goto out;
f620b43a
BP
1247 }
1248
1249 slave = bond_lookup_slave(bond, slave_s);
1250 if (!slave) {
bde9f75d 1251 unixctl_command_reply_error(conn, "no such slave");
3bfd3972 1252 goto out;
f620b43a
BP
1253 }
1254
1255 bond_enable_slave(slave, enable, &bond->unixctl_tags);
bde9f75d 1256 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
3bfd3972
EJ
1257
1258out:
1259 ovs_rwlock_unlock(&rwlock);
f620b43a
BP
1260}
1261
1262static void
0e15264f
BP
1263bond_unixctl_enable_slave(struct unixctl_conn *conn,
1264 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1265 void *aux OVS_UNUSED)
1266{
0e15264f 1267 enable_slave(conn, argv, true);
f620b43a
BP
1268}
1269
1270static void
0e15264f
BP
1271bond_unixctl_disable_slave(struct unixctl_conn *conn,
1272 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1273 void *aux OVS_UNUSED)
1274{
0e15264f 1275 enable_slave(conn, argv, false);
f620b43a
BP
1276}
1277
1278static void
0e15264f 1279bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1280 void *aux OVS_UNUSED)
1281{
0e15264f
BP
1282 const char *mac_s = argv[1];
1283 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1284 const char *basis_s = argc > 3 ? argv[3] : NULL;
f620b43a
BP
1285 uint8_t mac[ETH_ADDR_LEN];
1286 uint8_t hash;
1287 char *hash_cstr;
1288 unsigned int vlan;
672d18b2 1289 uint32_t basis;
f620b43a
BP
1290
1291 if (vlan_s) {
1292 if (sscanf(vlan_s, "%u", &vlan) != 1) {
bde9f75d 1293 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1294 return;
1295 }
1296 } else {
dc155bff 1297 vlan = 0;
f620b43a
BP
1298 }
1299
672d18b2
EJ
1300 if (basis_s) {
1301 if (sscanf(basis_s, "%"PRIu32, &basis) != 1) {
bde9f75d 1302 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1303 return;
1304 }
1305 } else {
1306 basis = 0;
1307 }
1308
f620b43a
BP
1309 if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
1310 == ETH_ADDR_SCAN_COUNT) {
672d18b2 1311 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1312
1313 hash_cstr = xasprintf("%u", hash);
bde9f75d 1314 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1315 free(hash_cstr);
1316 } else {
bde9f75d 1317 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1318 }
1319}
1320
1321void
1322bond_init(void)
1323{
0e15264f 1324 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1325 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1326 NULL);
0e15264f 1327 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1328 bond_unixctl_migrate, NULL);
0e15264f 1329 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1330 bond_unixctl_set_active_slave, NULL);
0e15264f 1331 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1332 bond_unixctl_enable_slave, NULL);
0e15264f 1333 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1334 bond_unixctl_disable_slave, NULL);
0e15264f 1335 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1336 bond_unixctl_hash, NULL);
f620b43a
BP
1337}
1338\f
95aafb2a
EJ
1339static void
1340bond_entry_reset(struct bond *bond)
1341{
1342 if (bond->balance != BM_AB) {
1343 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1344
1345 if (!bond->hash) {
1346 bond->hash = xmalloc(hash_len);
1347 }
1348 memset(bond->hash, 0, hash_len);
1349
1350 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1351 } else {
1352 free(bond->hash);
1353 bond->hash = NULL;
1354 }
1355}
1356
f620b43a
BP
1357static struct bond_slave *
1358bond_slave_lookup(struct bond *bond, const void *slave_)
1359{
1360 struct bond_slave *slave;
1361
1362 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1363 &bond->slaves) {
1364 if (slave->aux == slave_) {
1365 return slave;
1366 }
1367 }
1368
1369 return NULL;
1370}
1371
f620b43a
BP
1372static void
1373bond_enable_slave(struct bond_slave *slave, bool enable, struct tag_set *tags)
1374{
1375 slave->delay_expires = LLONG_MAX;
1376 if (enable != slave->enabled) {
1377 slave->enabled = enable;
1378 if (!slave->enabled) {
d28b9ead 1379 VLOG_INFO("interface %s: disabled", slave->name);
b3c18f66
EJ
1380 if (tags) {
1381 tag_set_add(tags, slave->tag);
1382 }
f620b43a 1383 } else {
d28b9ead 1384 VLOG_INFO("interface %s: enabled", slave->name);
f620b43a
BP
1385 slave->tag = tag_create_random();
1386 }
1387 }
1388}
1389
1390static void
1391bond_link_status_update(struct bond_slave *slave, struct tag_set *tags)
1392{
1393 struct bond *bond = slave->bond;
1394 bool up;
1395
296f6519 1396 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1397 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1398 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1399 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1400 slave->name, up ? "up" : "down");
1401 if (up == slave->enabled) {
1402 slave->delay_expires = LLONG_MAX;
1403 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1404 slave->name, up ? "disabled" : "enabled");
1405 } else {
bdebeece 1406 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1407 : up ? bond->updelay : bond->downdelay);
1408 slave->delay_expires = time_msec() + delay;
1409 if (delay) {
1410 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1411 "for %d ms",
1412 slave->name,
1413 up ? "enabled" : "disabled",
1414 up ? "up" : "down",
1415 delay);
1416 }
1417 }
1418 }
1419
1420 if (time_msec() >= slave->delay_expires) {
1421 bond_enable_slave(slave, up, tags);
1422 }
1423}
1424
f620b43a 1425static unsigned int
672d18b2 1426bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
f620b43a 1427{
672d18b2 1428 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
f620b43a
BP
1429}
1430
1431static unsigned int
672d18b2 1432bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
f620b43a
BP
1433{
1434 struct flow hash_flow = *flow;
d84d4b88 1435 hash_flow.vlan_tci = htons(vlan);
f620b43a
BP
1436
1437 /* The symmetric quality of this hash function is not required, but
1438 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1439 * purposes, so we use it out of convenience. */
672d18b2 1440 return flow_hash_symmetric_l4(&hash_flow, basis);
f620b43a
BP
1441}
1442
fb0b29a3
EJ
1443static unsigned int
1444bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1445{
cb22974d 1446 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1447
bdebeece 1448 return (bond->balance == BM_TCP
672d18b2
EJ
1449 ? bond_hash_tcp(flow, vlan, bond->basis)
1450 : bond_hash_src(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1451}
1452
f620b43a
BP
1453static struct bond_entry *
1454lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1455 uint16_t vlan)
1456{
fb0b29a3 1457 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1458}
1459
1460static struct bond_slave *
1461choose_output_slave(const struct bond *bond, const struct flow *flow,
bcd2633a 1462 struct flow_wildcards *wc, uint16_t vlan, tag_type *tags)
f620b43a
BP
1463{
1464 struct bond_entry *e;
1465
bdebeece
EJ
1466 if (bond->lacp_status == LACP_CONFIGURED) {
1467 /* LACP has been configured on this bond but negotiations were
1468 * unsuccussful. Drop all traffic. */
1469 return NULL;
1470 }
1471
f620b43a
BP
1472 switch (bond->balance) {
1473 case BM_AB:
1474 return bond->active_slave;
1475
f620b43a 1476 case BM_TCP:
bdebeece
EJ
1477 if (bond->lacp_status != LACP_NEGOTIATED) {
1478 /* Must have LACP negotiations for TCP balanced bonds. */
1479 return NULL;
1480 }
bcd2633a 1481 if (wc) {
6cdd5145 1482 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
bcd2633a 1483 }
bdebeece
EJ
1484 /* Fall Through. */
1485 case BM_SLB:
bcd2633a 1486 if (wc) {
6cdd5145 1487 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
bcd2633a 1488 }
f620b43a
BP
1489 e = lookup_bond_entry(bond, flow, vlan);
1490 if (!e->slave || !e->slave->enabled) {
c804cadf
EJ
1491 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1492 struct bond_slave, hmap_node);
1493 if (!e->slave->enabled) {
1494 e->slave = bond->active_slave;
1495 }
f620b43a
BP
1496 e->tag = tag_create_random();
1497 }
00ed8314 1498 *tags |= e->tag;
f620b43a
BP
1499 return e->slave;
1500
1501 default:
1502 NOT_REACHED();
1503 }
1504}
1505
1506static struct bond_slave *
1507bond_choose_slave(const struct bond *bond)
1508{
1509 struct bond_slave *slave, *best;
1510
1511 /* Find an enabled slave. */
1512 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1513 if (slave->enabled) {
1514 return slave;
1515 }
1516 }
1517
1518 /* All interfaces are disabled. Find an interface that will be enabled
1519 * after its updelay expires. */
1520 best = NULL;
1521 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1522 if (slave->delay_expires != LLONG_MAX
296f6519 1523 && slave->may_enable
f620b43a
BP
1524 && (!best || slave->delay_expires < best->delay_expires)) {
1525 best = slave;
1526 }
1527 }
1528 return best;
1529}
1530
1531static void
1532bond_choose_active_slave(struct bond *bond, struct tag_set *tags)
1533{
1534 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1535 struct bond_slave *old_active_slave = bond->active_slave;
1536
1537 bond->active_slave = bond_choose_slave(bond);
1538 if (bond->active_slave) {
1539 if (bond->active_slave->enabled) {
1540 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1541 bond->name, bond->active_slave->name);
1542 } else {
1543 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1544 "remaining %lld ms updelay (since no interface was "
1545 "enabled)", bond->name, bond->active_slave->name,
1546 bond->active_slave->delay_expires - time_msec());
1547 bond_enable_slave(bond->active_slave, true, tags);
1548 }
1549
1550 if (!old_active_slave) {
1551 tag_set_add(tags, bond->no_slaves_tag);
1552 }
1553
1554 bond->send_learning_packets = true;
1555 } else if (old_active_slave) {
d28b9ead 1556 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1557 }
1558}
1559
1560/* Returns the tag for 'bond''s active slave, or 'bond''s no_slaves_tag if
1561 * there is no active slave. */
1562static tag_type
1563bond_get_active_slave_tag(const struct bond *bond)
1564{
1565 return (bond->active_slave
1566 ? bond->active_slave->tag
1567 : bond->no_slaves_tag);
1568}
1569
1570/* Attempts to make the sum of the bond slaves' statistics appear on the fake
1571 * bond interface. */
1572static void
1573bond_update_fake_slave_stats(struct bond *bond)
1574{
1575 struct netdev_stats bond_stats;
1576 struct bond_slave *slave;
1577 struct netdev *bond_dev;
1578
1579 memset(&bond_stats, 0, sizeof bond_stats);
1580
1581 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1582 struct netdev_stats slave_stats;
1583
1584 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1585 /* XXX: We swap the stats here because they are swapped back when
1586 * reported by the internal device. The reason for this is
1587 * internal devices normally represent packets going into the
1588 * system but when used as fake bond device they represent packets
1589 * leaving the system. We really should do this in the internal
1590 * device itself because changing it here reverses the counts from
1591 * the perspective of the switch. However, the internal device
1592 * doesn't know what type of device it represents so we have to do
1593 * it here for now. */
1594 bond_stats.tx_packets += slave_stats.rx_packets;
1595 bond_stats.tx_bytes += slave_stats.rx_bytes;
1596 bond_stats.rx_packets += slave_stats.tx_packets;
1597 bond_stats.rx_bytes += slave_stats.tx_bytes;
1598 }
1599 }
1600
18812dff 1601 if (!netdev_open(bond->name, "system", &bond_dev)) {
f620b43a
BP
1602 netdev_set_stats(bond_dev, &bond_stats);
1603 netdev_close(bond_dev);
1604 }
1605}