]> git.proxmox.com Git - mirror_ovs.git/blame - lib/bond.c
ofproto-dpif: Store relevant fields for wildcarding in facet.
[mirror_ovs.git] / lib / bond.c
CommitLineData
f620b43a 1/*
09a5d390 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a
BP
25
26#include "coverage.h"
27#include "dynamic-string.h"
28#include "flow.h"
29#include "hmap.h"
bdebeece 30#include "lacp.h"
f620b43a
BP
31#include "list.h"
32#include "netdev.h"
33#include "odp-util.h"
34#include "ofpbuf.h"
35#include "packets.h"
36#include "poll-loop.h"
fc1d4f01 37#include "shash.h"
f620b43a
BP
38#include "tag.h"
39#include "timeval.h"
40#include "unixctl.h"
41#include "vlog.h"
42
43VLOG_DEFINE_THIS_MODULE(bond);
44
f620b43a
BP
45/* Bit-mask for hashing a flow down to a bucket.
46 * There are (BOND_MASK + 1) buckets. */
47#define BOND_MASK 0xff
48
49/* A hash bucket for mapping a flow to a slave.
50 * "struct bond" has an array of (BOND_MASK + 1) of these. */
51struct bond_entry {
52 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
53 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
5dd165d3 54 tag_type tag; /* Tag for entry<->facet association. */
f620b43a
BP
55 struct list list_node; /* In bond_slave's 'entries' list. */
56};
57
58/* A bond slave, that is, one of the links comprising a bond. */
59struct bond_slave {
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave. */
63
64 struct netdev *netdev; /* Network device, owned by the client. */
1ea24138 65 unsigned int change_seq; /* Tracks changes in 'netdev'. */
f620b43a
BP
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
67
68 /* Link status. */
69 long long delay_expires; /* Time after which 'enabled' may change. */
f620b43a 70 bool enabled; /* May be chosen for flows? */
296f6519 71 bool may_enable; /* Client considers this slave bondable. */
f620b43a
BP
72 tag_type tag; /* Tag associated with this slave. */
73
74 /* Rebalancing info. Used only by bond_rebalance(). */
75 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
76 struct list entries; /* 'struct bond_entry's assigned here. */
77 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
78};
79
80/* A bond, that is, a set of network devices grouped to improve performance or
81 * robustness. */
82struct bond {
83 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
84 char *name; /* Name provided by client. */
85
86 /* Slaves. */
87 struct hmap slaves;
88
89 /* Bonding info. */
90 enum bond_mode balance; /* Balancing mode, one of BM_*. */
91 struct bond_slave *active_slave;
92 tag_type no_slaves_tag; /* Tag for flows when all slaves disabled. */
93 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
bdebeece 94 enum lacp_status lacp_status; /* Status of LACP negotiations. */
62904702 95 bool bond_revalidate; /* True if flows need revalidation. */
672d18b2 96 uint32_t basis; /* Basis for flow hash function. */
f620b43a
BP
97
98 /* SLB specific bonding info. */
99 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
100 int rebalance_interval; /* Interval between rebalances, in ms. */
101 long long int next_rebalance; /* Next rebalancing time. */
102 bool send_learning_packets;
103
f620b43a
BP
104 /* Legacy compatibility. */
105 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
106
107 /* Tag set saved for next bond_run(). This tag set is a kluge for cases
108 * where we can't otherwise provide revalidation feedback to the client.
109 * That's only unixctl commands now; I hope no other cases will arise. */
110 struct tag_set unixctl_tags;
111};
112
113static struct hmap all_bonds = HMAP_INITIALIZER(&all_bonds);
114
95aafb2a 115static void bond_entry_reset(struct bond *);
f620b43a 116static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_);
f620b43a
BP
117static void bond_enable_slave(struct bond_slave *, bool enable,
118 struct tag_set *);
119static void bond_link_status_update(struct bond_slave *, struct tag_set *);
120static void bond_choose_active_slave(struct bond *, struct tag_set *);
f620b43a 121static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
672d18b2
EJ
122 uint16_t vlan, uint32_t basis);
123static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
124 uint32_t basis);
f620b43a
BP
125static struct bond_entry *lookup_bond_entry(const struct bond *,
126 const struct flow *,
127 uint16_t vlan);
128static tag_type bond_get_active_slave_tag(const struct bond *);
129static struct bond_slave *choose_output_slave(const struct bond *,
130 const struct flow *,
bcd2633a 131 struct flow_wildcards *,
00ed8314 132 uint16_t vlan, tag_type *tags);
f620b43a
BP
133static void bond_update_fake_slave_stats(struct bond *);
134
135/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
136 * stores the mode in '*balance' and returns true. Otherwise returns false
137 * without modifying '*balance'. */
138bool
139bond_mode_from_string(enum bond_mode *balance, const char *s)
140{
141 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
142 *balance = BM_TCP;
143 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
144 *balance = BM_SLB;
145 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
146 *balance = BM_AB;
147 } else {
148 return false;
149 }
150 return true;
151}
152
153/* Returns a string representing 'balance'. */
154const char *
155bond_mode_to_string(enum bond_mode balance) {
156 switch (balance) {
157 case BM_TCP:
158 return "balance-tcp";
159 case BM_SLB:
160 return "balance-slb";
161 case BM_AB:
162 return "active-backup";
163 }
164 NOT_REACHED();
165}
166
f620b43a
BP
167\f
168/* Creates and returns a new bond whose configuration is initially taken from
169 * 's'.
170 *
171 * The caller should register each slave on the new bond by calling
172 * bond_slave_register(). */
173struct bond *
174bond_create(const struct bond_settings *s)
175{
176 struct bond *bond;
177
178 bond = xzalloc(sizeof *bond);
179 hmap_init(&bond->slaves);
180 bond->no_slaves_tag = tag_create_random();
f620b43a
BP
181 bond->next_fake_iface_update = LLONG_MAX;
182
183 bond_reconfigure(bond, s);
184
185 tag_set_init(&bond->unixctl_tags);
186
187 return bond;
188}
189
190/* Frees 'bond'. */
191void
192bond_destroy(struct bond *bond)
193{
194 struct bond_slave *slave, *next_slave;
195
196 if (!bond) {
197 return;
198 }
199
200 hmap_remove(&all_bonds, &bond->hmap_node);
201
202 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
203 hmap_remove(&bond->slaves, &slave->hmap_node);
204 /* Client owns 'slave->netdev'. */
205 free(slave->name);
206 free(slave);
207 }
208 hmap_destroy(&bond->slaves);
209
210 free(bond->hash);
f620b43a
BP
211 free(bond->name);
212 free(bond);
213}
214
215/* Updates 'bond''s overall configuration to 's'.
216 *
217 * The caller should register each slave on 'bond' by calling
218 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 219 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
220 *
221 * Returns true if the configuration has changed in such a way that requires
222 * flow revalidation.
223 * */
224bool
f620b43a
BP
225bond_reconfigure(struct bond *bond, const struct bond_settings *s)
226{
59d7b2b6
EJ
227 bool revalidate = false;
228
f620b43a
BP
229 if (!bond->name || strcmp(bond->name, s->name)) {
230 if (bond->name) {
231 hmap_remove(&all_bonds, &bond->hmap_node);
232 free(bond->name);
233 }
234 bond->name = xstrdup(s->name);
235 hmap_insert(&all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
236 }
237
f620b43a
BP
238 bond->updelay = s->up_delay;
239 bond->downdelay = s->down_delay;
bc1b010c
EJ
240
241 if (bond->rebalance_interval != s->rebalance_interval) {
242 bond->rebalance_interval = s->rebalance_interval;
243 revalidate = true;
244 }
f620b43a 245
59d7b2b6
EJ
246 if (bond->balance != s->balance) {
247 bond->balance = s->balance;
248 revalidate = true;
249 }
250
672d18b2
EJ
251 if (bond->basis != s->basis) {
252 bond->basis = s->basis;
253 revalidate = true;
254 }
255
f620b43a
BP
256 if (s->fake_iface) {
257 if (bond->next_fake_iface_update == LLONG_MAX) {
258 bond->next_fake_iface_update = time_msec();
259 }
260 } else {
261 bond->next_fake_iface_update = LLONG_MAX;
262 }
59d7b2b6 263
62904702
EJ
264 if (bond->bond_revalidate) {
265 revalidate = true;
266 bond->bond_revalidate = false;
267 }
268
95aafb2a
EJ
269 if (bond->balance == BM_AB || !bond->hash || revalidate) {
270 bond_entry_reset(bond);
271 }
272
59d7b2b6 273 return revalidate;
f620b43a
BP
274}
275
f8ddccd2 276static void
1ea24138 277bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
f8ddccd2
BP
278{
279 if (slave->netdev != netdev) {
f8ddccd2 280 slave->netdev = netdev;
1ea24138 281 slave->change_seq = 0;
f8ddccd2
BP
282 }
283}
284
f620b43a
BP
285/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
286 * arbitrary client-provided pointer that uniquely identifies a slave within a
287 * bond. If 'slave_' already exists within 'bond' then this function
288 * reconfigures the existing slave.
289 *
290 * 'netdev' must be the network device that 'slave_' represents. It is owned
291 * by the client, so the client must not close it before either unregistering
292 * 'slave_' or destroying 'bond'.
4d6fb5eb 293 */
f620b43a 294void
df53d41c 295bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
f620b43a
BP
296{
297 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
298
299 if (!slave) {
300 slave = xzalloc(sizeof *slave);
301
302 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
303 slave->bond = bond;
304 slave->aux = slave_;
305 slave->delay_expires = LLONG_MAX;
244b2160 306 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 307 bond->bond_revalidate = true;
244b2160 308
b3c18f66 309 slave->enabled = false;
c8544aa1 310 bond_enable_slave(slave, netdev_get_carrier(netdev), NULL);
f620b43a
BP
311 }
312
1ea24138 313 bond_slave_set_netdev__(slave, netdev);
a6934aa9 314
f620b43a
BP
315 free(slave->name);
316 slave->name = xstrdup(netdev_get_name(netdev));
f620b43a
BP
317}
318
f8ddccd2
BP
/* Replaces the network device used by 'slave_' with 'netdev'.  Does nothing
 * if 'bond' has no slave identified by 'slave_'.
 *
 * Useful when the caller closes and re-opens the network device that it
 * registered with bond_slave_register() but has nothing else to change. */
void
bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
{
    struct bond_slave *member = bond_slave_lookup(bond, slave_);

    if (member == NULL) {
        return;
    }
    bond_slave_set_netdev__(member, netdev);
}
332
f620b43a
BP
333/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
334 * then this function has no effect.
335 *
336 * Unregistering a slave invalidates all flows. */
337void
338bond_slave_unregister(struct bond *bond, const void *slave_)
339{
340 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
341 bool del_active;
342
343 if (!slave) {
344 return;
345 }
346
b3c18f66
EJ
347 bond_enable_slave(slave, false, NULL);
348
f620b43a
BP
349 del_active = bond->active_slave == slave;
350 if (bond->hash) {
351 struct bond_entry *e;
352 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
353 if (e->slave == slave) {
354 e->slave = NULL;
355 }
356 }
357 }
358
359 free(slave->name);
360
361 hmap_remove(&bond->slaves, &slave->hmap_node);
362 /* Client owns 'slave->netdev'. */
363 free(slave);
364
365 if (del_active) {
366 struct tag_set tags;
367
368 tag_set_init(&tags);
369 bond_choose_active_slave(bond, &tags);
370 bond->send_learning_packets = true;
371 }
372}
373
296f6519
EJ
374/* Should be called on each slave in 'bond' before bond_run() to indicate
375 * whether or not 'slave_' may be enabled. This function is intended to allow
376 * other protocols to have some impact on bonding decisions. For example LACP
377 * or high level link monitoring protocols may decide that a given slave should
378 * not be able to send traffic. */
4d6fb5eb 379void
296f6519 380bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 381{
296f6519 382 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
4d6fb5eb
EJ
383}
384
f620b43a
BP
385/* Performs periodic maintenance on 'bond'. The caller must provide 'tags' to
386 * allow tagged flows to be invalidated.
387 *
388 * The caller should check bond_should_send_learning_packets() afterward. */
389void
bdebeece 390bond_run(struct bond *bond, struct tag_set *tags, enum lacp_status lacp_status)
f620b43a
BP
391{
392 struct bond_slave *slave;
393
bdebeece
EJ
394 if (bond->lacp_status != lacp_status) {
395 bond->lacp_status = lacp_status;
4592d0e2
EJ
396 bond->bond_revalidate = true;
397 }
4d6fb5eb 398
f620b43a
BP
399 /* Enable slaves based on link status and LACP feedback. */
400 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
401 bond_link_status_update(slave, tags);
1ea24138 402 slave->change_seq = netdev_change_seq(slave->netdev);
f620b43a
BP
403 }
404 if (!bond->active_slave || !bond->active_slave->enabled) {
405 bond_choose_active_slave(bond, tags);
406 }
407
408 /* Update fake bond interface stats. */
409 if (time_msec() >= bond->next_fake_iface_update) {
410 bond_update_fake_slave_stats(bond);
411 bond->next_fake_iface_update = time_msec() + 1000;
412 }
413
62904702 414 if (bond->bond_revalidate) {
df53d41c 415 struct bond_slave *slave;
dc9908b3 416
df53d41c 417 bond->bond_revalidate = false;
95aafb2a 418 bond_entry_reset(bond);
df53d41c
EJ
419 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
420 tag_set_add(tags, slave->tag);
dc9908b3 421 }
0008fbcb 422 tag_set_add(tags, bond->no_slaves_tag);
dc9908b3
EJ
423 }
424
f620b43a
BP
425 /* Invalidate any tags required by */
426 tag_set_union(tags, &bond->unixctl_tags);
427 tag_set_init(&bond->unixctl_tags);
428}
429
430/* Causes poll_block() to wake up when 'bond' needs something to be done. */
431void
432bond_wait(struct bond *bond)
433{
434 struct bond_slave *slave;
435
f620b43a
BP
436 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
437 if (slave->delay_expires != LLONG_MAX) {
438 poll_timer_wait_until(slave->delay_expires);
439 }
1ea24138
EJ
440
441 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
442 poll_immediate_wake();
443 }
f620b43a
BP
444 }
445
446 if (bond->next_fake_iface_update != LLONG_MAX) {
447 poll_timer_wait_until(bond->next_fake_iface_update);
448 }
449
450 /* Ensure that any saved tags get revalidated right away. */
451 if (!tag_set_is_empty(&bond->unixctl_tags)) {
452 poll_immediate_wake();
453 }
454
455 /* We don't wait for bond->next_rebalance because rebalancing can only run
456 * at a flow account checkpoint. ofproto does checkpointing on its own
457 * schedule and bond_rebalance() gets called afterward, so we'd just be
458 * waking up for no purpose. */
459}
460\f
461/* MAC learning table interaction. */
462
463static bool
464may_send_learning_packets(const struct bond *bond)
465{
bdebeece 466 return bond->lacp_status == LACP_DISABLED
64e2748d 467 && (bond->balance == BM_SLB || bond->balance == BM_AB)
bdebeece 468 && bond->active_slave;
f620b43a
BP
469}
470
471/* Returns true if 'bond' needs the client to send out packets to assist with
472 * MAC learning on 'bond'. If this function returns true, then the client
473 * should iterate through its MAC learning table for the bridge on which 'bond'
474 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 475 * it should call bond_compose_learning_packet().
f620b43a 476 *
477879ea
BP
477 * This function will only return true if 'bond' is in SLB or active-backup
478 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
479 * necessary.
f620b43a
BP
480 *
481 * Calling this function resets the state that it checks. */
482bool
483bond_should_send_learning_packets(struct bond *bond)
484{
485 bool send = bond->send_learning_packets && may_send_learning_packets(bond);
486 bond->send_learning_packets = false;
487 return send;
488}
489
490/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
491 *
ea131871
JG
492 * See bond_should_send_learning_packets() for description of usage. The
493 * caller should send the composed packet on the port associated with
494 * port_aux and takes ownership of the returned ofpbuf. */
495struct ofpbuf *
496bond_compose_learning_packet(struct bond *bond,
497 const uint8_t eth_src[ETH_ADDR_LEN],
498 uint16_t vlan, void **port_aux)
f620b43a
BP
499{
500 struct bond_slave *slave;
ea131871 501 struct ofpbuf *packet;
00ed8314 502 tag_type tags = 0;
f620b43a 503 struct flow flow;
f620b43a 504
cb22974d 505 ovs_assert(may_send_learning_packets(bond));
f620b43a
BP
506
507 memset(&flow, 0, sizeof flow);
508 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
bcd2633a 509 slave = choose_output_slave(bond, &flow, NULL, vlan, &tags);
f620b43a 510
ea131871 511 packet = ofpbuf_new(0);
2ea838ac 512 compose_rarp(packet, eth_src);
f620b43a 513 if (vlan) {
ea131871 514 eth_push_vlan(packet, htons(vlan));
f620b43a 515 }
f620b43a 516
ea131871
JG
517 *port_aux = slave->aux;
518 return packet;
f620b43a
BP
519}
520\f
521/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
522 * Ethernet destination address of 'eth_dst', should be admitted.
523 *
524 * The return value is one of the following:
525 *
526 * - BV_ACCEPT: Admit the packet.
527 *
528 * - BV_DROP: Drop the packet.
529 *
530 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
531 * Ethernet source address and VLAN. If there is none, or if the packet
532 * is on the learned port, then admit the packet. If a different port has
533 * been learned, however, drop the packet (and do not use it for MAC
534 * learning).
535 */
536enum bond_verdict
537bond_check_admissibility(struct bond *bond, const void *slave_,
538 const uint8_t eth_dst[ETH_ADDR_LEN], tag_type *tags)
539{
9a1c6450
EJ
540 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
541
542 /* LACP bonds have very loose admissibility restrictions because we can
543 * assume the remote switch is aware of the bond and will "do the right
544 * thing". However, as a precaution we drop packets on disabled slaves
545 * because no correctly implemented partner switch should be sending
bdebeece
EJ
546 * packets to them.
547 *
548 * If LACP is configured, but LACP negotiations have been unsuccessful, we
549 * drop all incoming traffic. */
550 switch (bond->lacp_status) {
551 case LACP_NEGOTIATED: return slave->enabled ? BV_ACCEPT : BV_DROP;
552 case LACP_CONFIGURED: return BV_DROP;
553 case LACP_DISABLED: break;
f620b43a
BP
554 }
555
556 /* Drop all multicast packets on inactive slaves. */
557 if (eth_addr_is_multicast(eth_dst)) {
558 *tags |= bond_get_active_slave_tag(bond);
559 if (bond->active_slave != bond_slave_lookup(bond, slave_)) {
560 return BV_DROP;
561 }
562 }
563
f931a4c9
BP
564 switch (bond->balance) {
565 case BM_AB:
566 /* Drop all packets which arrive on backup slaves. This is similar to
567 * how Linux bonding handles active-backup bonds. */
7ba7dcf0
EJ
568 *tags |= bond_get_active_slave_tag(bond);
569 if (bond->active_slave != slave) {
570 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
571
e6b2255c
BP
572 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
573 " slave (%s) destined for " ETH_ADDR_FMT,
574 slave->name, ETH_ADDR_ARGS(eth_dst));
7ba7dcf0
EJ
575 return BV_DROP;
576 }
f931a4c9
BP
577 return BV_ACCEPT;
578
579 case BM_TCP:
bdebeece
EJ
580 /* TCP balanced bonds require successful LACP negotiated. Based on the
581 * above check, LACP is off on this bond. Therfore, we drop all
582 * incoming traffic. */
583 return BV_DROP;
584
f931a4c9
BP
585 case BM_SLB:
586 /* Drop all packets for which we have learned a different input port,
587 * because we probably sent the packet on one slave and got it back on
588 * the other. Gratuitous ARP packets are an exception to this rule:
589 * the host has moved to another switch. The exception to the
590 * exception is if we locked the learning table to avoid reflections on
591 * bond slaves. */
592 return BV_DROP_IF_MOVED;
7ba7dcf0
EJ
593 }
594
f931a4c9 595 NOT_REACHED();
f620b43a
BP
596}
597
598/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
599 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
600 * NULL if the packet should be dropped because no slaves are enabled.
601 *
602 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
603 * should be a VID only (i.e. excluding the PCP bits). Second,
604 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
605 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
606 * packet belongs to (so for an access port it will be the access port's VLAN).
607 *
608 * Adds a tag to '*tags' that associates the flow with the returned slave.
bcd2633a
JP
609 *
610 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
611 * significant in the selection. At some point earlier, 'wc' should
612 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
613 */
614void *
615bond_choose_output_slave(struct bond *bond, const struct flow *flow,
bcd2633a
JP
616 struct flow_wildcards *wc, uint16_t vlan,
617 tag_type *tags)
f620b43a 618{
bcd2633a 619 struct bond_slave *slave = choose_output_slave(bond, flow, wc, vlan, tags);
f620b43a 620 if (slave) {
df53d41c 621 *tags |= slave->tag;
f620b43a
BP
622 return slave->aux;
623 } else {
624 *tags |= bond->no_slaves_tag;
625 return NULL;
626 }
627}
f620b43a
BP
628\f
629/* Rebalancing. */
630
1b137691
EJ
631static bool
632bond_is_balanced(const struct bond *bond)
633{
bc1b010c
EJ
634 return bond->rebalance_interval
635 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
636}
637
f620b43a
BP
638/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
639void
640bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
641 uint64_t n_bytes)
642{
1b137691 643 if (bond_is_balanced(bond)) {
f620b43a 644 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a
BP
645 }
646}
647
648static struct bond_slave *
649bond_slave_from_bal_node(struct list *bal)
650{
651 return CONTAINER_OF(bal, struct bond_slave, bal_node);
652}
653
654static void
655log_bals(struct bond *bond, const struct list *bals)
656{
657 if (VLOG_IS_DBG_ENABLED()) {
658 struct ds ds = DS_EMPTY_INITIALIZER;
659 const struct bond_slave *slave;
660
661 LIST_FOR_EACH (slave, bal_node, bals) {
662 if (ds.length) {
663 ds_put_char(&ds, ',');
664 }
665 ds_put_format(&ds, " %s %"PRIu64"kB",
666 slave->name, slave->tx_bytes / 1024);
667
668 if (!slave->enabled) {
669 ds_put_cstr(&ds, " (disabled)");
670 }
671 if (!list_is_empty(&slave->entries)) {
672 struct bond_entry *e;
673
674 ds_put_cstr(&ds, " (");
675 LIST_FOR_EACH (e, list_node, &slave->entries) {
676 if (&e->list_node != list_front(&slave->entries)) {
677 ds_put_cstr(&ds, " + ");
678 }
679 ds_put_format(&ds, "h%td: %"PRIu64"kB",
680 e - bond->hash, e->tx_bytes / 1024);
681 }
682 ds_put_cstr(&ds, ")");
683 }
684 }
685 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
686 ds_destroy(&ds);
687 }
688}
689
690/* Shifts 'hash' from its current slave to 'to'. */
691static void
692bond_shift_load(struct bond_entry *hash, struct bond_slave *to,
693 struct tag_set *set)
694{
695 struct bond_slave *from = hash->slave;
696 struct bond *bond = from->bond;
697 uint64_t delta = hash->tx_bytes;
698
699 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
700 "from %s to %s (now carrying %"PRIu64"kB and "
701 "%"PRIu64"kB load, respectively)",
702 bond->name, delta / 1024, hash - bond->hash,
703 from->name, to->name,
704 (from->tx_bytes - delta) / 1024,
705 (to->tx_bytes + delta) / 1024);
706
707 /* Shift load away from 'from' to 'to'. */
708 from->tx_bytes -= delta;
709 to->tx_bytes += delta;
710
711 /* Arrange for flows to be revalidated. */
712 tag_set_add(set, hash->tag);
713 hash->slave = to;
714 hash->tag = tag_create_random();
715}
716
09a5d390
BP
717/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
718 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
719 * given that doing so must decrease the ratio of the load on the two slaves by
720 * at least 0.1. Returns NULL if there is no appropriate entry.
721 *
722 * The list of entries isn't sorted. I don't know of a reason to prefer to
723 * shift away small hashes or large hashes. */
724static struct bond_entry *
725choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
726{
727 struct bond_entry *e;
728
729 if (list_is_short(&from->entries)) {
730 /* 'from' carries no more than one MAC hash, so shifting load away from
731 * it would be pointless. */
732 return NULL;
733 }
734
735 LIST_FOR_EACH (e, list_node, &from->entries) {
736 double old_ratio, new_ratio;
737 uint64_t delta;
738
739 if (to_tx_bytes == 0) {
740 /* Nothing on the new slave, move it. */
741 return e;
742 }
743
744 delta = e->tx_bytes;
745 old_ratio = (double)from->tx_bytes / to_tx_bytes;
746 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
75fad143
ZK
747 if (old_ratio - new_ratio > 0.1
748 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
749 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
750 and 'to' slave have the same load. Therefore, we only move an
751 entry if it decreases the load on 'from', and brings us closer
752 to equal traffic load. */
f620b43a
BP
753 return e;
754 }
755 }
756
757 return NULL;
758}
759
760/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
761 * maintained. */
762static void
763insert_bal(struct list *bals, struct bond_slave *slave)
764{
765 struct bond_slave *pos;
766
767 LIST_FOR_EACH (pos, bal_node, bals) {
768 if (slave->tx_bytes > pos->tx_bytes) {
769 break;
770 }
771 }
772 list_insert(&pos->bal_node, &slave->bal_node);
773}
774
775/* Removes 'slave' from its current list and then inserts it into 'bals' so
776 * that descending order of 'tx_bytes' is maintained. */
777static void
778reinsert_bal(struct list *bals, struct bond_slave *slave)
779{
780 list_remove(&slave->bal_node);
781 insert_bal(bals, slave);
782}
783
784/* If 'bond' needs rebalancing, does so.
785 *
786 * The caller should have called bond_account() for each active flow, to ensure
787 * that flow data is consistently accounted at this point. */
788void
789bond_rebalance(struct bond *bond, struct tag_set *tags)
790{
791 struct bond_slave *slave;
792 struct bond_entry *e;
793 struct list bals;
794
1b137691 795 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
f620b43a
BP
796 return;
797 }
798 bond->next_rebalance = time_msec() + bond->rebalance_interval;
799
800 /* Add each bond_entry to its slave's 'entries' list.
801 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
802 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
803 slave->tx_bytes = 0;
804 list_init(&slave->entries);
805 }
806 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
807 if (e->slave && e->tx_bytes) {
808 e->slave->tx_bytes += e->tx_bytes;
809 list_push_back(&e->slave->entries, &e->list_node);
810 }
811 }
812
813 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
814 *
815 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
816 * with a proper list sort algorithm. */
817 list_init(&bals);
818 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
819 if (slave->enabled) {
820 insert_bal(&bals, slave);
821 }
822 }
823 log_bals(bond, &bals);
824
825 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
826 while (!list_is_short(&bals)) {
827 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
828 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
829 uint64_t overload;
830
831 overload = from->tx_bytes - to->tx_bytes;
832 if (overload < to->tx_bytes >> 5 || overload < 100000) {
833 /* The extra load on 'from' (and all less-loaded slaves), compared
834 * to that of 'to' (the least-loaded slave), is less than ~3%, or
835 * it is less than ~1Mbps. No point in rebalancing. */
836 break;
837 }
838
09a5d390
BP
839 /* 'from' is carrying significantly more load than 'to'. Pick a hash
840 * to move from 'from' to 'to'. */
f620b43a
BP
841 e = choose_entry_to_migrate(from, to->tx_bytes);
842 if (e) {
843 bond_shift_load(e, to, tags);
844
845 /* Delete element from from->entries.
846 *
847 * We don't add the element to to->hashes. That would only allow
848 * 'e' to be migrated to another slave in this rebalancing run, and
849 * there is no point in doing that. */
850 list_remove(&e->list_node);
851
852 /* Re-sort 'bals'. */
853 reinsert_bal(&bals, from);
854 reinsert_bal(&bals, to);
855 } else {
856 /* Can't usefully migrate anything away from 'from'.
857 * Don't reconsider it. */
858 list_remove(&from->bal_node);
859 }
860 }
861
862 /* Implement exponentially weighted moving average. A weight of 1/2 causes
863 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
864 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
865 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
866 e->tx_bytes /= 2;
867 if (!e->tx_bytes) {
868 e->slave = NULL;
869 }
870 }
871}
872\f
873/* Bonding unixctl user interface functions. */
874
875static struct bond *
876bond_find(const char *name)
877{
878 struct bond *bond;
879
880 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
881 &all_bonds) {
882 if (!strcmp(bond->name, name)) {
883 return bond;
884 }
885 }
886 return NULL;
887}
888
889static struct bond_slave *
890bond_lookup_slave(struct bond *bond, const char *slave_name)
891{
892 struct bond_slave *slave;
893
894 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
895 if (!strcmp(slave->name, slave_name)) {
896 return slave;
897 }
898 }
899 return NULL;
900}
901
902static void
903bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
904 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
905 void *aux OVS_UNUSED)
f620b43a
BP
906{
907 struct ds ds = DS_EMPTY_INITIALIZER;
908 const struct bond *bond;
909
910 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
911
912 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
913 const struct bond_slave *slave;
914 size_t i;
915
916 ds_put_format(&ds, "%s\t%s\t",
917 bond->name, bond_mode_to_string(bond->balance));
918
919 i = 0;
920 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
921 if (i++ > 0) {
922 ds_put_cstr(&ds, ", ");
923 }
924 ds_put_cstr(&ds, slave->name);
925 }
926 ds_put_char(&ds, '\n');
927 }
bde9f75d 928 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
929 ds_destroy(&ds);
930}
931
932static void
c33a8a25 933bond_print_details(struct ds *ds, const struct bond *bond)
f620b43a 934{
fc1d4f01
EJ
935 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
936 const struct shash_node **sorted_slaves = NULL;
f620b43a 937 const struct bond_slave *slave;
fc1d4f01 938 int i;
f620b43a 939
c33a8a25
EJ
940 ds_put_format(ds, "---- %s ----\n", bond->name);
941 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
942 bond_mode_to_string(bond->balance));
943
c33a8a25 944 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 945
c33a8a25
EJ
946 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
947 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 948
1b137691 949 if (bond_is_balanced(bond)) {
c33a8a25 950 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
951 bond->next_rebalance - time_msec());
952 }
953
bdebeece
EJ
954 ds_put_cstr(ds, "lacp_status: ");
955 switch (bond->lacp_status) {
956 case LACP_NEGOTIATED:
957 ds_put_cstr(ds, "negotiated\n");
958 break;
959 case LACP_CONFIGURED:
960 ds_put_cstr(ds, "configured\n");
961 break;
962 case LACP_DISABLED:
963 ds_put_cstr(ds, "off\n");
964 break;
965 default:
966 ds_put_cstr(ds, "<unknown>\n");
967 break;
968 }
4d6fb5eb 969
f620b43a 970 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
971 shash_add(&slave_shash, slave->name, slave);
972 }
973 sorted_slaves = shash_sort(&slave_shash);
974
975 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 976 struct bond_entry *be;
f620b43a 977
fc1d4f01
EJ
978 slave = sorted_slaves[i]->data;
979
f620b43a 980 /* Basic info. */
c33a8a25 981 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
982 slave->name, slave->enabled ? "enabled" : "disabled");
983 if (slave == bond->active_slave) {
c33a8a25 984 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
985 }
986 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 987 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
988 slave->enabled ? "downdelay" : "updelay",
989 slave->delay_expires - time_msec());
990 }
991
c33a8a25 992 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 993 slave->may_enable ? "true" : "false");
4d6fb5eb 994
1b137691 995 if (!bond_is_balanced(bond)) {
f620b43a
BP
996 continue;
997 }
998
999 /* Hashes. */
f620b43a
BP
1000 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1001 int hash = be - bond->hash;
1002
1003 if (be->slave != slave) {
1004 continue;
1005 }
1006
c33a8a25 1007 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
f620b43a
BP
1008 hash, be->tx_bytes / 1024);
1009
7b9f1974 1010 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1011 }
1012 }
fc1d4f01
EJ
1013 shash_destroy(&slave_shash);
1014 free(sorted_slaves);
c33a8a25
EJ
1015 ds_put_cstr(ds, "\n");
1016}
1017
1018static void
1019bond_unixctl_show(struct unixctl_conn *conn,
1020 int argc, const char *argv[],
1021 void *aux OVS_UNUSED)
1022{
1023 struct ds ds = DS_EMPTY_INITIALIZER;
1024
1025 if (argc > 1) {
1026 const struct bond *bond = bond_find(argv[1]);
1027
1028 if (!bond) {
bde9f75d 1029 unixctl_command_reply_error(conn, "no such bond");
c33a8a25
EJ
1030 return;
1031 }
1032 bond_print_details(&ds, bond);
1033 } else {
1034 const struct bond *bond;
1035
1036 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
1037 bond_print_details(&ds, bond);
1038 }
1039 }
1040
bde9f75d 1041 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
1042 ds_destroy(&ds);
1043}
1044
1045static void
0e15264f
BP
1046bond_unixctl_migrate(struct unixctl_conn *conn,
1047 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1048 void *aux OVS_UNUSED)
1049{
0e15264f
BP
1050 const char *bond_s = argv[1];
1051 const char *hash_s = argv[2];
1052 const char *slave_s = argv[3];
f620b43a
BP
1053 struct bond *bond;
1054 struct bond_slave *slave;
1055 struct bond_entry *entry;
1056 int hash;
1057
f620b43a
BP
1058 bond = bond_find(bond_s);
1059 if (!bond) {
bde9f75d 1060 unixctl_command_reply_error(conn, "no such bond");
f620b43a
BP
1061 return;
1062 }
1063
1064 if (bond->balance != BM_SLB) {
bde9f75d 1065 unixctl_command_reply_error(conn, "not an SLB bond");
f620b43a
BP
1066 return;
1067 }
1068
1069 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1070 hash = atoi(hash_s) & BOND_MASK;
1071 } else {
bde9f75d 1072 unixctl_command_reply_error(conn, "bad hash");
f620b43a
BP
1073 return;
1074 }
1075
1076 slave = bond_lookup_slave(bond, slave_s);
1077 if (!slave) {
bde9f75d 1078 unixctl_command_reply_error(conn, "no such slave");
f620b43a
BP
1079 return;
1080 }
1081
1082 if (!slave->enabled) {
bde9f75d 1083 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
f620b43a
BP
1084 return;
1085 }
1086
1087 entry = &bond->hash[hash];
1088 tag_set_add(&bond->unixctl_tags, entry->tag);
1089 entry->slave = slave;
1090 entry->tag = tag_create_random();
bde9f75d 1091 unixctl_command_reply(conn, "migrated");
f620b43a
BP
1092}
1093
1094static void
0e15264f
BP
1095bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1096 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1097 void *aux OVS_UNUSED)
1098{
0e15264f
BP
1099 const char *bond_s = argv[1];
1100 const char *slave_s = argv[2];
f620b43a
BP
1101 struct bond *bond;
1102 struct bond_slave *slave;
1103
f620b43a
BP
1104 bond = bond_find(bond_s);
1105 if (!bond) {
bde9f75d 1106 unixctl_command_reply_error(conn, "no such bond");
f620b43a
BP
1107 return;
1108 }
1109
1110 slave = bond_lookup_slave(bond, slave_s);
1111 if (!slave) {
bde9f75d 1112 unixctl_command_reply_error(conn, "no such slave");
f620b43a
BP
1113 return;
1114 }
1115
1116 if (!slave->enabled) {
bde9f75d 1117 unixctl_command_reply_error(conn, "cannot make disabled slave active");
f620b43a
BP
1118 return;
1119 }
1120
1121 if (bond->active_slave != slave) {
1122 tag_set_add(&bond->unixctl_tags, bond_get_active_slave_tag(bond));
1123 bond->active_slave = slave;
1124 bond->active_slave->tag = tag_create_random();
1125 VLOG_INFO("bond %s: active interface is now %s",
1126 bond->name, slave->name);
1127 bond->send_learning_packets = true;
bde9f75d 1128 unixctl_command_reply(conn, "done");
f620b43a 1129 } else {
bde9f75d 1130 unixctl_command_reply(conn, "no change");
f620b43a
BP
1131 }
1132}
1133
1134static void
0e15264f 1135enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1136{
0e15264f
BP
1137 const char *bond_s = argv[1];
1138 const char *slave_s = argv[2];
f620b43a
BP
1139 struct bond *bond;
1140 struct bond_slave *slave;
1141
f620b43a
BP
1142 bond = bond_find(bond_s);
1143 if (!bond) {
bde9f75d 1144 unixctl_command_reply_error(conn, "no such bond");
f620b43a
BP
1145 return;
1146 }
1147
1148 slave = bond_lookup_slave(bond, slave_s);
1149 if (!slave) {
bde9f75d 1150 unixctl_command_reply_error(conn, "no such slave");
f620b43a
BP
1151 return;
1152 }
1153
1154 bond_enable_slave(slave, enable, &bond->unixctl_tags);
bde9f75d 1155 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
f620b43a
BP
1156}
1157
1158static void
0e15264f
BP
1159bond_unixctl_enable_slave(struct unixctl_conn *conn,
1160 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1161 void *aux OVS_UNUSED)
1162{
0e15264f 1163 enable_slave(conn, argv, true);
f620b43a
BP
1164}
1165
1166static void
0e15264f
BP
1167bond_unixctl_disable_slave(struct unixctl_conn *conn,
1168 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1169 void *aux OVS_UNUSED)
1170{
0e15264f 1171 enable_slave(conn, argv, false);
f620b43a
BP
1172}
1173
1174static void
0e15264f 1175bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1176 void *aux OVS_UNUSED)
1177{
0e15264f
BP
1178 const char *mac_s = argv[1];
1179 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1180 const char *basis_s = argc > 3 ? argv[3] : NULL;
f620b43a
BP
1181 uint8_t mac[ETH_ADDR_LEN];
1182 uint8_t hash;
1183 char *hash_cstr;
1184 unsigned int vlan;
672d18b2 1185 uint32_t basis;
f620b43a
BP
1186
1187 if (vlan_s) {
1188 if (sscanf(vlan_s, "%u", &vlan) != 1) {
bde9f75d 1189 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1190 return;
1191 }
1192 } else {
dc155bff 1193 vlan = 0;
f620b43a
BP
1194 }
1195
672d18b2
EJ
1196 if (basis_s) {
1197 if (sscanf(basis_s, "%"PRIu32, &basis) != 1) {
bde9f75d 1198 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1199 return;
1200 }
1201 } else {
1202 basis = 0;
1203 }
1204
f620b43a
BP
1205 if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
1206 == ETH_ADDR_SCAN_COUNT) {
672d18b2 1207 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1208
1209 hash_cstr = xasprintf("%u", hash);
bde9f75d 1210 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1211 free(hash_cstr);
1212 } else {
bde9f75d 1213 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1214 }
1215}
1216
1217void
1218bond_init(void)
1219{
0e15264f 1220 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1221 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1222 NULL);
0e15264f 1223 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1224 bond_unixctl_migrate, NULL);
0e15264f 1225 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1226 bond_unixctl_set_active_slave, NULL);
0e15264f 1227 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1228 bond_unixctl_enable_slave, NULL);
0e15264f 1229 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1230 bond_unixctl_disable_slave, NULL);
0e15264f 1231 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1232 bond_unixctl_hash, NULL);
f620b43a
BP
1233}
1234\f
95aafb2a
EJ
1235static void
1236bond_entry_reset(struct bond *bond)
1237{
1238 if (bond->balance != BM_AB) {
1239 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1240
1241 if (!bond->hash) {
1242 bond->hash = xmalloc(hash_len);
1243 }
1244 memset(bond->hash, 0, hash_len);
1245
1246 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1247 } else {
1248 free(bond->hash);
1249 bond->hash = NULL;
1250 }
1251}
1252
f620b43a
BP
1253static struct bond_slave *
1254bond_slave_lookup(struct bond *bond, const void *slave_)
1255{
1256 struct bond_slave *slave;
1257
1258 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1259 &bond->slaves) {
1260 if (slave->aux == slave_) {
1261 return slave;
1262 }
1263 }
1264
1265 return NULL;
1266}
1267
f620b43a
BP
1268static void
1269bond_enable_slave(struct bond_slave *slave, bool enable, struct tag_set *tags)
1270{
1271 slave->delay_expires = LLONG_MAX;
1272 if (enable != slave->enabled) {
1273 slave->enabled = enable;
1274 if (!slave->enabled) {
d28b9ead 1275 VLOG_INFO("interface %s: disabled", slave->name);
b3c18f66
EJ
1276 if (tags) {
1277 tag_set_add(tags, slave->tag);
1278 }
f620b43a 1279 } else {
d28b9ead 1280 VLOG_INFO("interface %s: enabled", slave->name);
f620b43a
BP
1281 slave->tag = tag_create_random();
1282 }
1283 }
1284}
1285
1286static void
1287bond_link_status_update(struct bond_slave *slave, struct tag_set *tags)
1288{
1289 struct bond *bond = slave->bond;
1290 bool up;
1291
296f6519 1292 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1293 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1294 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1295 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1296 slave->name, up ? "up" : "down");
1297 if (up == slave->enabled) {
1298 slave->delay_expires = LLONG_MAX;
1299 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1300 slave->name, up ? "disabled" : "enabled");
1301 } else {
bdebeece 1302 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1303 : up ? bond->updelay : bond->downdelay);
1304 slave->delay_expires = time_msec() + delay;
1305 if (delay) {
1306 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1307 "for %d ms",
1308 slave->name,
1309 up ? "enabled" : "disabled",
1310 up ? "up" : "down",
1311 delay);
1312 }
1313 }
1314 }
1315
1316 if (time_msec() >= slave->delay_expires) {
1317 bond_enable_slave(slave, up, tags);
1318 }
1319}
1320
f620b43a 1321static unsigned int
672d18b2 1322bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
f620b43a 1323{
672d18b2 1324 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
f620b43a
BP
1325}
1326
1327static unsigned int
672d18b2 1328bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
f620b43a
BP
1329{
1330 struct flow hash_flow = *flow;
d84d4b88 1331 hash_flow.vlan_tci = htons(vlan);
f620b43a
BP
1332
1333 /* The symmetric quality of this hash function is not required, but
1334 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1335 * purposes, so we use it out of convenience. */
672d18b2 1336 return flow_hash_symmetric_l4(&hash_flow, basis);
f620b43a
BP
1337}
1338
fb0b29a3
EJ
1339static unsigned int
1340bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1341{
cb22974d 1342 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1343
bdebeece 1344 return (bond->balance == BM_TCP
672d18b2
EJ
1345 ? bond_hash_tcp(flow, vlan, bond->basis)
1346 : bond_hash_src(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1347}
1348
f620b43a
BP
1349static struct bond_entry *
1350lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1351 uint16_t vlan)
1352{
fb0b29a3 1353 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1354}
1355
1356static struct bond_slave *
1357choose_output_slave(const struct bond *bond, const struct flow *flow,
bcd2633a 1358 struct flow_wildcards *wc, uint16_t vlan, tag_type *tags)
f620b43a
BP
1359{
1360 struct bond_entry *e;
1361
bdebeece
EJ
1362 if (bond->lacp_status == LACP_CONFIGURED) {
1363 /* LACP has been configured on this bond but negotiations were
1364 * unsuccussful. Drop all traffic. */
1365 return NULL;
1366 }
1367
f620b43a
BP
1368 switch (bond->balance) {
1369 case BM_AB:
1370 return bond->active_slave;
1371
f620b43a 1372 case BM_TCP:
bdebeece
EJ
1373 if (bond->lacp_status != LACP_NEGOTIATED) {
1374 /* Must have LACP negotiations for TCP balanced bonds. */
1375 return NULL;
1376 }
bcd2633a
JP
1377 if (wc) {
1378 flow_mask_hash_fields(wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1379 }
bdebeece
EJ
1380 /* Fall Through. */
1381 case BM_SLB:
bcd2633a
JP
1382 if (wc) {
1383 flow_mask_hash_fields(wc, NX_HASH_FIELDS_ETH_SRC);
1384 }
f620b43a
BP
1385 e = lookup_bond_entry(bond, flow, vlan);
1386 if (!e->slave || !e->slave->enabled) {
c804cadf
EJ
1387 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1388 struct bond_slave, hmap_node);
1389 if (!e->slave->enabled) {
1390 e->slave = bond->active_slave;
1391 }
f620b43a
BP
1392 e->tag = tag_create_random();
1393 }
00ed8314 1394 *tags |= e->tag;
f620b43a
BP
1395 return e->slave;
1396
1397 default:
1398 NOT_REACHED();
1399 }
1400}
1401
1402static struct bond_slave *
1403bond_choose_slave(const struct bond *bond)
1404{
1405 struct bond_slave *slave, *best;
1406
1407 /* Find an enabled slave. */
1408 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1409 if (slave->enabled) {
1410 return slave;
1411 }
1412 }
1413
1414 /* All interfaces are disabled. Find an interface that will be enabled
1415 * after its updelay expires. */
1416 best = NULL;
1417 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1418 if (slave->delay_expires != LLONG_MAX
296f6519 1419 && slave->may_enable
f620b43a
BP
1420 && (!best || slave->delay_expires < best->delay_expires)) {
1421 best = slave;
1422 }
1423 }
1424 return best;
1425}
1426
1427static void
1428bond_choose_active_slave(struct bond *bond, struct tag_set *tags)
1429{
1430 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1431 struct bond_slave *old_active_slave = bond->active_slave;
1432
1433 bond->active_slave = bond_choose_slave(bond);
1434 if (bond->active_slave) {
1435 if (bond->active_slave->enabled) {
1436 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1437 bond->name, bond->active_slave->name);
1438 } else {
1439 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1440 "remaining %lld ms updelay (since no interface was "
1441 "enabled)", bond->name, bond->active_slave->name,
1442 bond->active_slave->delay_expires - time_msec());
1443 bond_enable_slave(bond->active_slave, true, tags);
1444 }
1445
1446 if (!old_active_slave) {
1447 tag_set_add(tags, bond->no_slaves_tag);
1448 }
1449
1450 bond->send_learning_packets = true;
1451 } else if (old_active_slave) {
d28b9ead 1452 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1453 }
1454}
1455
1456/* Returns the tag for 'bond''s active slave, or 'bond''s no_slaves_tag if
1457 * there is no active slave. */
1458static tag_type
1459bond_get_active_slave_tag(const struct bond *bond)
1460{
1461 return (bond->active_slave
1462 ? bond->active_slave->tag
1463 : bond->no_slaves_tag);
1464}
1465
1466/* Attempts to make the sum of the bond slaves' statistics appear on the fake
1467 * bond interface. */
1468static void
1469bond_update_fake_slave_stats(struct bond *bond)
1470{
1471 struct netdev_stats bond_stats;
1472 struct bond_slave *slave;
1473 struct netdev *bond_dev;
1474
1475 memset(&bond_stats, 0, sizeof bond_stats);
1476
1477 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1478 struct netdev_stats slave_stats;
1479
1480 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1481 /* XXX: We swap the stats here because they are swapped back when
1482 * reported by the internal device. The reason for this is
1483 * internal devices normally represent packets going into the
1484 * system but when used as fake bond device they represent packets
1485 * leaving the system. We really should do this in the internal
1486 * device itself because changing it here reverses the counts from
1487 * the perspective of the switch. However, the internal device
1488 * doesn't know what type of device it represents so we have to do
1489 * it here for now. */
1490 bond_stats.tx_packets += slave_stats.rx_packets;
1491 bond_stats.tx_bytes += slave_stats.rx_bytes;
1492 bond_stats.rx_packets += slave_stats.tx_packets;
1493 bond_stats.rx_bytes += slave_stats.tx_bytes;
1494 }
1495 }
1496
18812dff 1497 if (!netdev_open(bond->name, "system", &bond_dev)) {
f620b43a
BP
1498 netdev_set_stats(bond_dev, &bond_stats);
1499 netdev_close(bond_dev);
1500 }
1501}