]> git.proxmox.com Git - mirror_ovs.git/blame - lib/bond.c
ofproto-dpif: Move tag_the_flow() to ofproto-dpif.c
[mirror_ovs.git] / lib / bond.c
CommitLineData
f620b43a 1/*
09a5d390 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
f620b43a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "bond.h"
20
21#include <limits.h>
22#include <stdint.h>
23#include <stdlib.h>
75fad143 24#include <math.h>
f620b43a
BP
25
26#include "coverage.h"
27#include "dynamic-string.h"
28#include "flow.h"
29#include "hmap.h"
bdebeece 30#include "lacp.h"
f620b43a
BP
31#include "list.h"
32#include "netdev.h"
33#include "odp-util.h"
34#include "ofpbuf.h"
35#include "packets.h"
36#include "poll-loop.h"
fc1d4f01 37#include "shash.h"
f620b43a
BP
38#include "tag.h"
39#include "timeval.h"
40#include "unixctl.h"
41#include "vlog.h"
42
43VLOG_DEFINE_THIS_MODULE(bond);
44
f620b43a
BP
45/* Bit-mask for hashing a flow down to a bucket.
46 * There are (BOND_MASK + 1) buckets. */
47#define BOND_MASK 0xff
48
49/* A hash bucket for mapping a flow to a slave.
50 * "struct bond" has an array of (BOND_MASK + 1) of these. */
51struct bond_entry {
52 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
53 uint64_t tx_bytes; /* Count of bytes recently transmitted. */
5dd165d3 54 tag_type tag; /* Tag for entry<->facet association. */
f620b43a
BP
55 struct list list_node; /* In bond_slave's 'entries' list. */
56};
57
58/* A bond slave, that is, one of the links comprising a bond. */
59struct bond_slave {
60 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
61 struct bond *bond; /* The bond that contains this slave. */
62 void *aux; /* Client-provided handle for this slave. */
63
64 struct netdev *netdev; /* Network device, owned by the client. */
1ea24138 65 unsigned int change_seq; /* Tracks changes in 'netdev'. */
f620b43a
BP
66 char *name; /* Name (a copy of netdev_get_name(netdev)). */
67
68 /* Link status. */
69 long long delay_expires; /* Time after which 'enabled' may change. */
f620b43a 70 bool enabled; /* May be chosen for flows? */
296f6519 71 bool may_enable; /* Client considers this slave bondable. */
f620b43a
BP
72 tag_type tag; /* Tag associated with this slave. */
73
74 /* Rebalancing info. Used only by bond_rebalance(). */
75 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
76 struct list entries; /* 'struct bond_entry's assigned here. */
77 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
78};
79
80/* A bond, that is, a set of network devices grouped to improve performance or
81 * robustness. */
82struct bond {
83 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
84 char *name; /* Name provided by client. */
85
86 /* Slaves. */
87 struct hmap slaves;
88
89 /* Bonding info. */
90 enum bond_mode balance; /* Balancing mode, one of BM_*. */
91 struct bond_slave *active_slave;
92 tag_type no_slaves_tag; /* Tag for flows when all slaves disabled. */
93 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
bdebeece 94 enum lacp_status lacp_status; /* Status of LACP negotiations. */
62904702 95 bool bond_revalidate; /* True if flows need revalidation. */
672d18b2 96 uint32_t basis; /* Basis for flow hash function. */
f620b43a
BP
97
98 /* SLB specific bonding info. */
99 struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */
100 int rebalance_interval; /* Interval between rebalances, in ms. */
101 long long int next_rebalance; /* Next rebalancing time. */
102 bool send_learning_packets;
103
f620b43a
BP
104 /* Legacy compatibility. */
105 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
106
107 /* Tag set saved for next bond_run(). This tag set is a kluge for cases
108 * where we can't otherwise provide revalidation feedback to the client.
109 * That's only unixctl commands now; I hope no other cases will arise. */
110 struct tag_set unixctl_tags;
03366a2d
EJ
111
112 int ref_cnt;
f620b43a
BP
113};
114
115static struct hmap all_bonds = HMAP_INITIALIZER(&all_bonds);
116
95aafb2a 117static void bond_entry_reset(struct bond *);
f620b43a 118static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_);
f620b43a
BP
119static void bond_enable_slave(struct bond_slave *, bool enable,
120 struct tag_set *);
121static void bond_link_status_update(struct bond_slave *, struct tag_set *);
122static void bond_choose_active_slave(struct bond *, struct tag_set *);
f620b43a 123static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
672d18b2
EJ
124 uint16_t vlan, uint32_t basis);
125static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
126 uint32_t basis);
f620b43a
BP
127static struct bond_entry *lookup_bond_entry(const struct bond *,
128 const struct flow *,
129 uint16_t vlan);
130static tag_type bond_get_active_slave_tag(const struct bond *);
131static struct bond_slave *choose_output_slave(const struct bond *,
132 const struct flow *,
bcd2633a 133 struct flow_wildcards *,
00ed8314 134 uint16_t vlan, tag_type *tags);
f620b43a
BP
135static void bond_update_fake_slave_stats(struct bond *);
136
137/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
138 * stores the mode in '*balance' and returns true. Otherwise returns false
139 * without modifying '*balance'. */
140bool
141bond_mode_from_string(enum bond_mode *balance, const char *s)
142{
143 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
144 *balance = BM_TCP;
145 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
146 *balance = BM_SLB;
147 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
148 *balance = BM_AB;
149 } else {
150 return false;
151 }
152 return true;
153}
154
155/* Returns a string representing 'balance'. */
156const char *
157bond_mode_to_string(enum bond_mode balance) {
158 switch (balance) {
159 case BM_TCP:
160 return "balance-tcp";
161 case BM_SLB:
162 return "balance-slb";
163 case BM_AB:
164 return "active-backup";
165 }
166 NOT_REACHED();
167}
168
f620b43a
BP
169\f
170/* Creates and returns a new bond whose configuration is initially taken from
171 * 's'.
172 *
173 * The caller should register each slave on the new bond by calling
174 * bond_slave_register(). */
175struct bond *
176bond_create(const struct bond_settings *s)
177{
178 struct bond *bond;
179
180 bond = xzalloc(sizeof *bond);
181 hmap_init(&bond->slaves);
182 bond->no_slaves_tag = tag_create_random();
f620b43a 183 bond->next_fake_iface_update = LLONG_MAX;
03366a2d 184 bond->ref_cnt = 1;
f620b43a
BP
185
186 bond_reconfigure(bond, s);
187
188 tag_set_init(&bond->unixctl_tags);
189
190 return bond;
191}
192
03366a2d
EJ
193struct bond *
194bond_ref(const struct bond *bond_)
195{
196 struct bond *bond = CONST_CAST(struct bond *, bond_);
197
198 ovs_assert(bond->ref_cnt > 0);
199 bond->ref_cnt++;
200 return bond;
201}
202
f620b43a
BP
203/* Frees 'bond'. */
204void
03366a2d 205bond_unref(struct bond *bond)
f620b43a
BP
206{
207 struct bond_slave *slave, *next_slave;
208
209 if (!bond) {
210 return;
211 }
212
03366a2d
EJ
213 ovs_assert(bond->ref_cnt > 0);
214 if (--bond->ref_cnt) {
215 return;
216 }
217
f620b43a
BP
218 hmap_remove(&all_bonds, &bond->hmap_node);
219
220 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
221 hmap_remove(&bond->slaves, &slave->hmap_node);
222 /* Client owns 'slave->netdev'. */
223 free(slave->name);
224 free(slave);
225 }
226 hmap_destroy(&bond->slaves);
227
228 free(bond->hash);
f620b43a
BP
229 free(bond->name);
230 free(bond);
231}
232
233/* Updates 'bond''s overall configuration to 's'.
234 *
235 * The caller should register each slave on 'bond' by calling
236 * bond_slave_register(). This is optional if none of the slaves'
4d6fb5eb 237 * configuration has changed. In any case it can't hurt.
59d7b2b6
EJ
238 *
239 * Returns true if the configuration has changed in such a way that requires
240 * flow revalidation.
241 * */
242bool
f620b43a
BP
243bond_reconfigure(struct bond *bond, const struct bond_settings *s)
244{
59d7b2b6
EJ
245 bool revalidate = false;
246
f620b43a
BP
247 if (!bond->name || strcmp(bond->name, s->name)) {
248 if (bond->name) {
249 hmap_remove(&all_bonds, &bond->hmap_node);
250 free(bond->name);
251 }
252 bond->name = xstrdup(s->name);
253 hmap_insert(&all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
254 }
255
f620b43a
BP
256 bond->updelay = s->up_delay;
257 bond->downdelay = s->down_delay;
bc1b010c
EJ
258
259 if (bond->rebalance_interval != s->rebalance_interval) {
260 bond->rebalance_interval = s->rebalance_interval;
261 revalidate = true;
262 }
f620b43a 263
59d7b2b6
EJ
264 if (bond->balance != s->balance) {
265 bond->balance = s->balance;
266 revalidate = true;
267 }
268
672d18b2
EJ
269 if (bond->basis != s->basis) {
270 bond->basis = s->basis;
271 revalidate = true;
272 }
273
f620b43a
BP
274 if (s->fake_iface) {
275 if (bond->next_fake_iface_update == LLONG_MAX) {
276 bond->next_fake_iface_update = time_msec();
277 }
278 } else {
279 bond->next_fake_iface_update = LLONG_MAX;
280 }
59d7b2b6 281
62904702
EJ
282 if (bond->bond_revalidate) {
283 revalidate = true;
284 bond->bond_revalidate = false;
285 }
286
95aafb2a
EJ
287 if (bond->balance == BM_AB || !bond->hash || revalidate) {
288 bond_entry_reset(bond);
289 }
290
59d7b2b6 291 return revalidate;
f620b43a
BP
292}
293
f8ddccd2 294static void
1ea24138 295bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
f8ddccd2
BP
296{
297 if (slave->netdev != netdev) {
f8ddccd2 298 slave->netdev = netdev;
1ea24138 299 slave->change_seq = 0;
f8ddccd2
BP
300 }
301}
302
f620b43a
BP
303/* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
304 * arbitrary client-provided pointer that uniquely identifies a slave within a
305 * bond. If 'slave_' already exists within 'bond' then this function
306 * reconfigures the existing slave.
307 *
308 * 'netdev' must be the network device that 'slave_' represents. It is owned
309 * by the client, so the client must not close it before either unregistering
310 * 'slave_' or destroying 'bond'.
4d6fb5eb 311 */
f620b43a 312void
df53d41c 313bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev)
f620b43a
BP
314{
315 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
316
317 if (!slave) {
318 slave = xzalloc(sizeof *slave);
319
320 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
321 slave->bond = bond;
322 slave->aux = slave_;
323 slave->delay_expires = LLONG_MAX;
244b2160 324 slave->name = xstrdup(netdev_get_name(netdev));
7321e30e 325 bond->bond_revalidate = true;
244b2160 326
b3c18f66 327 slave->enabled = false;
c8544aa1 328 bond_enable_slave(slave, netdev_get_carrier(netdev), NULL);
f620b43a
BP
329 }
330
1ea24138 331 bond_slave_set_netdev__(slave, netdev);
a6934aa9 332
f620b43a
BP
333 free(slave->name);
334 slave->name = xstrdup(netdev_get_name(netdev));
f620b43a
BP
335}
336
f8ddccd2
BP
/* Replaces the network device associated with 'slave_' by 'netdev'.  Does
 * nothing if 'slave_' is not registered on 'bond'.
 *
 * This suits a caller that closed and reopened the device it registered with
 * bond_slave_register() but has nothing else to change. */
void
bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
{
    struct bond_slave *slave = bond_slave_lookup(bond, slave_);

    if (!slave) {
        return;
    }
    bond_slave_set_netdev__(slave, netdev);
}
350
f620b43a
BP
351/* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
352 * then this function has no effect.
353 *
354 * Unregistering a slave invalidates all flows. */
355void
356bond_slave_unregister(struct bond *bond, const void *slave_)
357{
358 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
359 bool del_active;
360
361 if (!slave) {
362 return;
363 }
364
b3c18f66
EJ
365 bond_enable_slave(slave, false, NULL);
366
f620b43a
BP
367 del_active = bond->active_slave == slave;
368 if (bond->hash) {
369 struct bond_entry *e;
370 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
371 if (e->slave == slave) {
372 e->slave = NULL;
373 }
374 }
375 }
376
377 free(slave->name);
378
379 hmap_remove(&bond->slaves, &slave->hmap_node);
380 /* Client owns 'slave->netdev'. */
381 free(slave);
382
383 if (del_active) {
384 struct tag_set tags;
385
386 tag_set_init(&tags);
387 bond_choose_active_slave(bond, &tags);
388 bond->send_learning_packets = true;
389 }
390}
391
296f6519
EJ
392/* Should be called on each slave in 'bond' before bond_run() to indicate
393 * whether or not 'slave_' may be enabled. This function is intended to allow
394 * other protocols to have some impact on bonding decisions. For example LACP
395 * or high level link monitoring protocols may decide that a given slave should
396 * not be able to send traffic. */
4d6fb5eb 397void
296f6519 398bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
4d6fb5eb 399{
296f6519 400 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
4d6fb5eb
EJ
401}
402
f620b43a
BP
403/* Performs periodic maintenance on 'bond'. The caller must provide 'tags' to
404 * allow tagged flows to be invalidated.
405 *
406 * The caller should check bond_should_send_learning_packets() afterward. */
407void
bdebeece 408bond_run(struct bond *bond, struct tag_set *tags, enum lacp_status lacp_status)
f620b43a
BP
409{
410 struct bond_slave *slave;
411
bdebeece
EJ
412 if (bond->lacp_status != lacp_status) {
413 bond->lacp_status = lacp_status;
4592d0e2
EJ
414 bond->bond_revalidate = true;
415 }
4d6fb5eb 416
f620b43a
BP
417 /* Enable slaves based on link status and LACP feedback. */
418 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
419 bond_link_status_update(slave, tags);
1ea24138 420 slave->change_seq = netdev_change_seq(slave->netdev);
f620b43a
BP
421 }
422 if (!bond->active_slave || !bond->active_slave->enabled) {
423 bond_choose_active_slave(bond, tags);
424 }
425
426 /* Update fake bond interface stats. */
427 if (time_msec() >= bond->next_fake_iface_update) {
428 bond_update_fake_slave_stats(bond);
429 bond->next_fake_iface_update = time_msec() + 1000;
430 }
431
62904702 432 if (bond->bond_revalidate) {
df53d41c 433 struct bond_slave *slave;
dc9908b3 434
df53d41c 435 bond->bond_revalidate = false;
95aafb2a 436 bond_entry_reset(bond);
df53d41c
EJ
437 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
438 tag_set_add(tags, slave->tag);
dc9908b3 439 }
0008fbcb 440 tag_set_add(tags, bond->no_slaves_tag);
dc9908b3
EJ
441 }
442
f620b43a
BP
443 /* Invalidate any tags required by */
444 tag_set_union(tags, &bond->unixctl_tags);
445 tag_set_init(&bond->unixctl_tags);
446}
447
448/* Causes poll_block() to wake up when 'bond' needs something to be done. */
449void
450bond_wait(struct bond *bond)
451{
452 struct bond_slave *slave;
453
f620b43a
BP
454 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
455 if (slave->delay_expires != LLONG_MAX) {
456 poll_timer_wait_until(slave->delay_expires);
457 }
1ea24138
EJ
458
459 if (slave->change_seq != netdev_change_seq(slave->netdev)) {
460 poll_immediate_wake();
461 }
f620b43a
BP
462 }
463
464 if (bond->next_fake_iface_update != LLONG_MAX) {
465 poll_timer_wait_until(bond->next_fake_iface_update);
466 }
467
468 /* Ensure that any saved tags get revalidated right away. */
469 if (!tag_set_is_empty(&bond->unixctl_tags)) {
470 poll_immediate_wake();
471 }
472
473 /* We don't wait for bond->next_rebalance because rebalancing can only run
474 * at a flow account checkpoint. ofproto does checkpointing on its own
475 * schedule and bond_rebalance() gets called afterward, so we'd just be
476 * waking up for no purpose. */
477}
478\f
479/* MAC learning table interaction. */
480
481static bool
482may_send_learning_packets(const struct bond *bond)
483{
bdebeece 484 return bond->lacp_status == LACP_DISABLED
64e2748d 485 && (bond->balance == BM_SLB || bond->balance == BM_AB)
bdebeece 486 && bond->active_slave;
f620b43a
BP
487}
488
489/* Returns true if 'bond' needs the client to send out packets to assist with
490 * MAC learning on 'bond'. If this function returns true, then the client
491 * should iterate through its MAC learning table for the bridge on which 'bond'
492 * is located. For each MAC that has been learned on a port other than 'bond',
ea131871 493 * it should call bond_compose_learning_packet().
f620b43a 494 *
477879ea
BP
495 * This function will only return true if 'bond' is in SLB or active-backup
496 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
497 * necessary.
f620b43a
BP
498 *
499 * Calling this function resets the state that it checks. */
500bool
501bond_should_send_learning_packets(struct bond *bond)
502{
503 bool send = bond->send_learning_packets && may_send_learning_packets(bond);
504 bond->send_learning_packets = false;
505 return send;
506}
507
508/* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
509 *
ea131871
JG
510 * See bond_should_send_learning_packets() for description of usage. The
511 * caller should send the composed packet on the port associated with
512 * port_aux and takes ownership of the returned ofpbuf. */
513struct ofpbuf *
514bond_compose_learning_packet(struct bond *bond,
515 const uint8_t eth_src[ETH_ADDR_LEN],
516 uint16_t vlan, void **port_aux)
f620b43a
BP
517{
518 struct bond_slave *slave;
ea131871 519 struct ofpbuf *packet;
00ed8314 520 tag_type tags = 0;
f620b43a 521 struct flow flow;
f620b43a 522
cb22974d 523 ovs_assert(may_send_learning_packets(bond));
f620b43a
BP
524
525 memset(&flow, 0, sizeof flow);
526 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
bcd2633a 527 slave = choose_output_slave(bond, &flow, NULL, vlan, &tags);
f620b43a 528
ea131871 529 packet = ofpbuf_new(0);
2ea838ac 530 compose_rarp(packet, eth_src);
f620b43a 531 if (vlan) {
ea131871 532 eth_push_vlan(packet, htons(vlan));
f620b43a 533 }
f620b43a 534
ea131871
JG
535 *port_aux = slave->aux;
536 return packet;
f620b43a
BP
537}
538\f
539/* Checks whether a packet that arrived on 'slave_' within 'bond', with an
540 * Ethernet destination address of 'eth_dst', should be admitted.
541 *
542 * The return value is one of the following:
543 *
544 * - BV_ACCEPT: Admit the packet.
545 *
546 * - BV_DROP: Drop the packet.
547 *
548 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
549 * Ethernet source address and VLAN. If there is none, or if the packet
550 * is on the learned port, then admit the packet. If a different port has
551 * been learned, however, drop the packet (and do not use it for MAC
552 * learning).
553 */
554enum bond_verdict
555bond_check_admissibility(struct bond *bond, const void *slave_,
556 const uint8_t eth_dst[ETH_ADDR_LEN], tag_type *tags)
557{
9a1c6450
EJ
558 struct bond_slave *slave = bond_slave_lookup(bond, slave_);
559
560 /* LACP bonds have very loose admissibility restrictions because we can
561 * assume the remote switch is aware of the bond and will "do the right
562 * thing". However, as a precaution we drop packets on disabled slaves
563 * because no correctly implemented partner switch should be sending
bdebeece
EJ
564 * packets to them.
565 *
566 * If LACP is configured, but LACP negotiations have been unsuccessful, we
567 * drop all incoming traffic. */
568 switch (bond->lacp_status) {
569 case LACP_NEGOTIATED: return slave->enabled ? BV_ACCEPT : BV_DROP;
570 case LACP_CONFIGURED: return BV_DROP;
571 case LACP_DISABLED: break;
f620b43a
BP
572 }
573
574 /* Drop all multicast packets on inactive slaves. */
575 if (eth_addr_is_multicast(eth_dst)) {
576 *tags |= bond_get_active_slave_tag(bond);
577 if (bond->active_slave != bond_slave_lookup(bond, slave_)) {
578 return BV_DROP;
579 }
580 }
581
f931a4c9
BP
582 switch (bond->balance) {
583 case BM_AB:
584 /* Drop all packets which arrive on backup slaves. This is similar to
585 * how Linux bonding handles active-backup bonds. */
7ba7dcf0
EJ
586 *tags |= bond_get_active_slave_tag(bond);
587 if (bond->active_slave != slave) {
588 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
589
e6b2255c
BP
590 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
591 " slave (%s) destined for " ETH_ADDR_FMT,
592 slave->name, ETH_ADDR_ARGS(eth_dst));
7ba7dcf0
EJ
593 return BV_DROP;
594 }
f931a4c9
BP
595 return BV_ACCEPT;
596
597 case BM_TCP:
bdebeece
EJ
598 /* TCP balanced bonds require successful LACP negotiated. Based on the
599 * above check, LACP is off on this bond. Therfore, we drop all
600 * incoming traffic. */
601 return BV_DROP;
602
f931a4c9
BP
603 case BM_SLB:
604 /* Drop all packets for which we have learned a different input port,
605 * because we probably sent the packet on one slave and got it back on
606 * the other. Gratuitous ARP packets are an exception to this rule:
607 * the host has moved to another switch. The exception to the
608 * exception is if we locked the learning table to avoid reflections on
609 * bond slaves. */
610 return BV_DROP_IF_MOVED;
7ba7dcf0
EJ
611 }
612
f931a4c9 613 NOT_REACHED();
f620b43a
BP
614}
615
616/* Returns the slave (registered on 'bond' by bond_slave_register()) to which
617 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
618 * NULL if the packet should be dropped because no slaves are enabled.
619 *
620 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
621 * should be a VID only (i.e. excluding the PCP bits). Second,
622 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
623 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
624 * packet belongs to (so for an access port it will be the access port's VLAN).
625 *
626 * Adds a tag to '*tags' that associates the flow with the returned slave.
bcd2633a
JP
627 *
628 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
629 * significant in the selection. At some point earlier, 'wc' should
630 * have been initialized (e.g., by flow_wildcards_init_catchall()).
f620b43a
BP
631 */
632void *
633bond_choose_output_slave(struct bond *bond, const struct flow *flow,
bcd2633a
JP
634 struct flow_wildcards *wc, uint16_t vlan,
635 tag_type *tags)
f620b43a 636{
bcd2633a 637 struct bond_slave *slave = choose_output_slave(bond, flow, wc, vlan, tags);
f620b43a 638 if (slave) {
df53d41c 639 *tags |= slave->tag;
f620b43a
BP
640 return slave->aux;
641 } else {
642 *tags |= bond->no_slaves_tag;
643 return NULL;
644 }
645}
f620b43a
BP
646\f
647/* Rebalancing. */
648
1b137691
EJ
649static bool
650bond_is_balanced(const struct bond *bond)
651{
bc1b010c
EJ
652 return bond->rebalance_interval
653 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
1b137691
EJ
654}
655
f620b43a
BP
656/* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
657void
658bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
659 uint64_t n_bytes)
660{
1b137691 661 if (bond_is_balanced(bond)) {
f620b43a 662 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
f620b43a
BP
663 }
664}
665
666static struct bond_slave *
667bond_slave_from_bal_node(struct list *bal)
668{
669 return CONTAINER_OF(bal, struct bond_slave, bal_node);
670}
671
672static void
673log_bals(struct bond *bond, const struct list *bals)
674{
675 if (VLOG_IS_DBG_ENABLED()) {
676 struct ds ds = DS_EMPTY_INITIALIZER;
677 const struct bond_slave *slave;
678
679 LIST_FOR_EACH (slave, bal_node, bals) {
680 if (ds.length) {
681 ds_put_char(&ds, ',');
682 }
683 ds_put_format(&ds, " %s %"PRIu64"kB",
684 slave->name, slave->tx_bytes / 1024);
685
686 if (!slave->enabled) {
687 ds_put_cstr(&ds, " (disabled)");
688 }
689 if (!list_is_empty(&slave->entries)) {
690 struct bond_entry *e;
691
692 ds_put_cstr(&ds, " (");
693 LIST_FOR_EACH (e, list_node, &slave->entries) {
694 if (&e->list_node != list_front(&slave->entries)) {
695 ds_put_cstr(&ds, " + ");
696 }
697 ds_put_format(&ds, "h%td: %"PRIu64"kB",
698 e - bond->hash, e->tx_bytes / 1024);
699 }
700 ds_put_cstr(&ds, ")");
701 }
702 }
703 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
704 ds_destroy(&ds);
705 }
706}
707
708/* Shifts 'hash' from its current slave to 'to'. */
709static void
710bond_shift_load(struct bond_entry *hash, struct bond_slave *to,
711 struct tag_set *set)
712{
713 struct bond_slave *from = hash->slave;
714 struct bond *bond = from->bond;
715 uint64_t delta = hash->tx_bytes;
716
717 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
718 "from %s to %s (now carrying %"PRIu64"kB and "
719 "%"PRIu64"kB load, respectively)",
720 bond->name, delta / 1024, hash - bond->hash,
721 from->name, to->name,
722 (from->tx_bytes - delta) / 1024,
723 (to->tx_bytes + delta) / 1024);
724
725 /* Shift load away from 'from' to 'to'. */
726 from->tx_bytes -= delta;
727 to->tx_bytes += delta;
728
729 /* Arrange for flows to be revalidated. */
730 tag_set_add(set, hash->tag);
731 hash->slave = to;
732 hash->tag = tag_create_random();
733}
734
09a5d390
BP
735/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
736 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
f620b43a
BP
737 * given that doing so must decrease the ratio of the load on the two slaves by
738 * at least 0.1. Returns NULL if there is no appropriate entry.
739 *
740 * The list of entries isn't sorted. I don't know of a reason to prefer to
741 * shift away small hashes or large hashes. */
742static struct bond_entry *
743choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
744{
745 struct bond_entry *e;
746
747 if (list_is_short(&from->entries)) {
748 /* 'from' carries no more than one MAC hash, so shifting load away from
749 * it would be pointless. */
750 return NULL;
751 }
752
753 LIST_FOR_EACH (e, list_node, &from->entries) {
754 double old_ratio, new_ratio;
755 uint64_t delta;
756
757 if (to_tx_bytes == 0) {
758 /* Nothing on the new slave, move it. */
759 return e;
760 }
761
762 delta = e->tx_bytes;
763 old_ratio = (double)from->tx_bytes / to_tx_bytes;
764 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
75fad143
ZK
765 if (old_ratio - new_ratio > 0.1
766 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
767 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
768 and 'to' slave have the same load. Therefore, we only move an
769 entry if it decreases the load on 'from', and brings us closer
770 to equal traffic load. */
f620b43a
BP
771 return e;
772 }
773 }
774
775 return NULL;
776}
777
778/* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
779 * maintained. */
780static void
781insert_bal(struct list *bals, struct bond_slave *slave)
782{
783 struct bond_slave *pos;
784
785 LIST_FOR_EACH (pos, bal_node, bals) {
786 if (slave->tx_bytes > pos->tx_bytes) {
787 break;
788 }
789 }
790 list_insert(&pos->bal_node, &slave->bal_node);
791}
792
793/* Removes 'slave' from its current list and then inserts it into 'bals' so
794 * that descending order of 'tx_bytes' is maintained. */
795static void
796reinsert_bal(struct list *bals, struct bond_slave *slave)
797{
798 list_remove(&slave->bal_node);
799 insert_bal(bals, slave);
800}
801
802/* If 'bond' needs rebalancing, does so.
803 *
804 * The caller should have called bond_account() for each active flow, to ensure
805 * that flow data is consistently accounted at this point. */
806void
807bond_rebalance(struct bond *bond, struct tag_set *tags)
808{
809 struct bond_slave *slave;
810 struct bond_entry *e;
811 struct list bals;
812
1b137691 813 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
f620b43a
BP
814 return;
815 }
816 bond->next_rebalance = time_msec() + bond->rebalance_interval;
817
818 /* Add each bond_entry to its slave's 'entries' list.
819 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
820 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
821 slave->tx_bytes = 0;
822 list_init(&slave->entries);
823 }
824 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
825 if (e->slave && e->tx_bytes) {
826 e->slave->tx_bytes += e->tx_bytes;
827 list_push_back(&e->slave->entries, &e->list_node);
828 }
829 }
830
831 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
832 *
833 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
834 * with a proper list sort algorithm. */
835 list_init(&bals);
836 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
837 if (slave->enabled) {
838 insert_bal(&bals, slave);
839 }
840 }
841 log_bals(bond, &bals);
842
843 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
844 while (!list_is_short(&bals)) {
845 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
846 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
847 uint64_t overload;
848
849 overload = from->tx_bytes - to->tx_bytes;
850 if (overload < to->tx_bytes >> 5 || overload < 100000) {
851 /* The extra load on 'from' (and all less-loaded slaves), compared
852 * to that of 'to' (the least-loaded slave), is less than ~3%, or
853 * it is less than ~1Mbps. No point in rebalancing. */
854 break;
855 }
856
09a5d390
BP
857 /* 'from' is carrying significantly more load than 'to'. Pick a hash
858 * to move from 'from' to 'to'. */
f620b43a
BP
859 e = choose_entry_to_migrate(from, to->tx_bytes);
860 if (e) {
861 bond_shift_load(e, to, tags);
862
863 /* Delete element from from->entries.
864 *
865 * We don't add the element to to->hashes. That would only allow
866 * 'e' to be migrated to another slave in this rebalancing run, and
867 * there is no point in doing that. */
868 list_remove(&e->list_node);
869
870 /* Re-sort 'bals'. */
871 reinsert_bal(&bals, from);
872 reinsert_bal(&bals, to);
873 } else {
874 /* Can't usefully migrate anything away from 'from'.
875 * Don't reconsider it. */
876 list_remove(&from->bal_node);
877 }
878 }
879
880 /* Implement exponentially weighted moving average. A weight of 1/2 causes
881 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
882 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
883 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
884 e->tx_bytes /= 2;
885 if (!e->tx_bytes) {
886 e->slave = NULL;
887 }
888 }
889}
890\f
891/* Bonding unixctl user interface functions. */
892
893static struct bond *
894bond_find(const char *name)
895{
896 struct bond *bond;
897
898 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
899 &all_bonds) {
900 if (!strcmp(bond->name, name)) {
901 return bond;
902 }
903 }
904 return NULL;
905}
906
907static struct bond_slave *
908bond_lookup_slave(struct bond *bond, const char *slave_name)
909{
910 struct bond_slave *slave;
911
912 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
913 if (!strcmp(slave->name, slave_name)) {
914 return slave;
915 }
916 }
917 return NULL;
918}
919
920static void
921bond_unixctl_list(struct unixctl_conn *conn,
0e15264f
BP
922 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
923 void *aux OVS_UNUSED)
f620b43a
BP
924{
925 struct ds ds = DS_EMPTY_INITIALIZER;
926 const struct bond *bond;
927
928 ds_put_cstr(&ds, "bond\ttype\tslaves\n");
929
930 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
931 const struct bond_slave *slave;
932 size_t i;
933
934 ds_put_format(&ds, "%s\t%s\t",
935 bond->name, bond_mode_to_string(bond->balance));
936
937 i = 0;
938 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
939 if (i++ > 0) {
940 ds_put_cstr(&ds, ", ");
941 }
942 ds_put_cstr(&ds, slave->name);
943 }
944 ds_put_char(&ds, '\n');
945 }
bde9f75d 946 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
947 ds_destroy(&ds);
948}
949
950static void
c33a8a25 951bond_print_details(struct ds *ds, const struct bond *bond)
f620b43a 952{
fc1d4f01
EJ
953 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
954 const struct shash_node **sorted_slaves = NULL;
f620b43a 955 const struct bond_slave *slave;
fc1d4f01 956 int i;
f620b43a 957
c33a8a25
EJ
958 ds_put_format(ds, "---- %s ----\n", bond->name);
959 ds_put_format(ds, "bond_mode: %s\n",
f620b43a
BP
960 bond_mode_to_string(bond->balance));
961
c33a8a25 962 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
672d18b2 963
c33a8a25
EJ
964 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
965 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
f620b43a 966
1b137691 967 if (bond_is_balanced(bond)) {
c33a8a25 968 ds_put_format(ds, "next rebalance: %lld ms\n",
f620b43a
BP
969 bond->next_rebalance - time_msec());
970 }
971
bdebeece
EJ
972 ds_put_cstr(ds, "lacp_status: ");
973 switch (bond->lacp_status) {
974 case LACP_NEGOTIATED:
975 ds_put_cstr(ds, "negotiated\n");
976 break;
977 case LACP_CONFIGURED:
978 ds_put_cstr(ds, "configured\n");
979 break;
980 case LACP_DISABLED:
981 ds_put_cstr(ds, "off\n");
982 break;
983 default:
984 ds_put_cstr(ds, "<unknown>\n");
985 break;
986 }
4d6fb5eb 987
f620b43a 988 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
fc1d4f01
EJ
989 shash_add(&slave_shash, slave->name, slave);
990 }
991 sorted_slaves = shash_sort(&slave_shash);
992
993 for (i = 0; i < shash_count(&slave_shash); i++) {
f620b43a 994 struct bond_entry *be;
f620b43a 995
fc1d4f01
EJ
996 slave = sorted_slaves[i]->data;
997
f620b43a 998 /* Basic info. */
c33a8a25 999 ds_put_format(ds, "\nslave %s: %s\n",
f620b43a
BP
1000 slave->name, slave->enabled ? "enabled" : "disabled");
1001 if (slave == bond->active_slave) {
c33a8a25 1002 ds_put_cstr(ds, "\tactive slave\n");
f620b43a
BP
1003 }
1004 if (slave->delay_expires != LLONG_MAX) {
c33a8a25 1005 ds_put_format(ds, "\t%s expires in %lld ms\n",
f620b43a
BP
1006 slave->enabled ? "downdelay" : "updelay",
1007 slave->delay_expires - time_msec());
1008 }
1009
c33a8a25 1010 ds_put_format(ds, "\tmay_enable: %s\n",
296f6519 1011 slave->may_enable ? "true" : "false");
4d6fb5eb 1012
1b137691 1013 if (!bond_is_balanced(bond)) {
f620b43a
BP
1014 continue;
1015 }
1016
1017 /* Hashes. */
f620b43a
BP
1018 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1019 int hash = be - bond->hash;
1020
1021 if (be->slave != slave) {
1022 continue;
1023 }
1024
c33a8a25 1025 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
f620b43a
BP
1026 hash, be->tx_bytes / 1024);
1027
7b9f1974 1028 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
f620b43a
BP
1029 }
1030 }
fc1d4f01
EJ
1031 shash_destroy(&slave_shash);
1032 free(sorted_slaves);
c33a8a25
EJ
1033 ds_put_cstr(ds, "\n");
1034}
1035
1036static void
1037bond_unixctl_show(struct unixctl_conn *conn,
1038 int argc, const char *argv[],
1039 void *aux OVS_UNUSED)
1040{
1041 struct ds ds = DS_EMPTY_INITIALIZER;
1042
1043 if (argc > 1) {
1044 const struct bond *bond = bond_find(argv[1]);
1045
1046 if (!bond) {
bde9f75d 1047 unixctl_command_reply_error(conn, "no such bond");
c33a8a25
EJ
1048 return;
1049 }
1050 bond_print_details(&ds, bond);
1051 } else {
1052 const struct bond *bond;
1053
1054 HMAP_FOR_EACH (bond, hmap_node, &all_bonds) {
1055 bond_print_details(&ds, bond);
1056 }
1057 }
1058
bde9f75d 1059 unixctl_command_reply(conn, ds_cstr(&ds));
f620b43a
BP
1060 ds_destroy(&ds);
1061}
1062
1063static void
0e15264f
BP
1064bond_unixctl_migrate(struct unixctl_conn *conn,
1065 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1066 void *aux OVS_UNUSED)
1067{
0e15264f
BP
1068 const char *bond_s = argv[1];
1069 const char *hash_s = argv[2];
1070 const char *slave_s = argv[3];
f620b43a
BP
1071 struct bond *bond;
1072 struct bond_slave *slave;
1073 struct bond_entry *entry;
1074 int hash;
1075
f620b43a
BP
1076 bond = bond_find(bond_s);
1077 if (!bond) {
bde9f75d 1078 unixctl_command_reply_error(conn, "no such bond");
f620b43a
BP
1079 return;
1080 }
1081
1082 if (bond->balance != BM_SLB) {
bde9f75d 1083 unixctl_command_reply_error(conn, "not an SLB bond");
f620b43a
BP
1084 return;
1085 }
1086
1087 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1088 hash = atoi(hash_s) & BOND_MASK;
1089 } else {
bde9f75d 1090 unixctl_command_reply_error(conn, "bad hash");
f620b43a
BP
1091 return;
1092 }
1093
1094 slave = bond_lookup_slave(bond, slave_s);
1095 if (!slave) {
bde9f75d 1096 unixctl_command_reply_error(conn, "no such slave");
f620b43a
BP
1097 return;
1098 }
1099
1100 if (!slave->enabled) {
bde9f75d 1101 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
f620b43a
BP
1102 return;
1103 }
1104
1105 entry = &bond->hash[hash];
1106 tag_set_add(&bond->unixctl_tags, entry->tag);
1107 entry->slave = slave;
1108 entry->tag = tag_create_random();
bde9f75d 1109 unixctl_command_reply(conn, "migrated");
f620b43a
BP
1110}
1111
1112static void
0e15264f
BP
1113bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1114 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1115 void *aux OVS_UNUSED)
1116{
0e15264f
BP
1117 const char *bond_s = argv[1];
1118 const char *slave_s = argv[2];
f620b43a
BP
1119 struct bond *bond;
1120 struct bond_slave *slave;
1121
f620b43a
BP
1122 bond = bond_find(bond_s);
1123 if (!bond) {
bde9f75d 1124 unixctl_command_reply_error(conn, "no such bond");
f620b43a
BP
1125 return;
1126 }
1127
1128 slave = bond_lookup_slave(bond, slave_s);
1129 if (!slave) {
bde9f75d 1130 unixctl_command_reply_error(conn, "no such slave");
f620b43a
BP
1131 return;
1132 }
1133
1134 if (!slave->enabled) {
bde9f75d 1135 unixctl_command_reply_error(conn, "cannot make disabled slave active");
f620b43a
BP
1136 return;
1137 }
1138
1139 if (bond->active_slave != slave) {
1140 tag_set_add(&bond->unixctl_tags, bond_get_active_slave_tag(bond));
1141 bond->active_slave = slave;
1142 bond->active_slave->tag = tag_create_random();
1143 VLOG_INFO("bond %s: active interface is now %s",
1144 bond->name, slave->name);
1145 bond->send_learning_packets = true;
bde9f75d 1146 unixctl_command_reply(conn, "done");
f620b43a 1147 } else {
bde9f75d 1148 unixctl_command_reply(conn, "no change");
f620b43a
BP
1149 }
1150}
1151
1152static void
0e15264f 1153enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
f620b43a 1154{
0e15264f
BP
1155 const char *bond_s = argv[1];
1156 const char *slave_s = argv[2];
f620b43a
BP
1157 struct bond *bond;
1158 struct bond_slave *slave;
1159
f620b43a
BP
1160 bond = bond_find(bond_s);
1161 if (!bond) {
bde9f75d 1162 unixctl_command_reply_error(conn, "no such bond");
f620b43a
BP
1163 return;
1164 }
1165
1166 slave = bond_lookup_slave(bond, slave_s);
1167 if (!slave) {
bde9f75d 1168 unixctl_command_reply_error(conn, "no such slave");
f620b43a
BP
1169 return;
1170 }
1171
1172 bond_enable_slave(slave, enable, &bond->unixctl_tags);
bde9f75d 1173 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
f620b43a
BP
1174}
1175
1176static void
0e15264f
BP
1177bond_unixctl_enable_slave(struct unixctl_conn *conn,
1178 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1179 void *aux OVS_UNUSED)
1180{
0e15264f 1181 enable_slave(conn, argv, true);
f620b43a
BP
1182}
1183
1184static void
0e15264f
BP
1185bond_unixctl_disable_slave(struct unixctl_conn *conn,
1186 int argc OVS_UNUSED, const char *argv[],
f620b43a
BP
1187 void *aux OVS_UNUSED)
1188{
0e15264f 1189 enable_slave(conn, argv, false);
f620b43a
BP
1190}
1191
1192static void
0e15264f 1193bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
f620b43a
BP
1194 void *aux OVS_UNUSED)
1195{
0e15264f
BP
1196 const char *mac_s = argv[1];
1197 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1198 const char *basis_s = argc > 3 ? argv[3] : NULL;
f620b43a
BP
1199 uint8_t mac[ETH_ADDR_LEN];
1200 uint8_t hash;
1201 char *hash_cstr;
1202 unsigned int vlan;
672d18b2 1203 uint32_t basis;
f620b43a
BP
1204
1205 if (vlan_s) {
1206 if (sscanf(vlan_s, "%u", &vlan) != 1) {
bde9f75d 1207 unixctl_command_reply_error(conn, "invalid vlan");
f620b43a
BP
1208 return;
1209 }
1210 } else {
dc155bff 1211 vlan = 0;
f620b43a
BP
1212 }
1213
672d18b2
EJ
1214 if (basis_s) {
1215 if (sscanf(basis_s, "%"PRIu32, &basis) != 1) {
bde9f75d 1216 unixctl_command_reply_error(conn, "invalid basis");
672d18b2
EJ
1217 return;
1218 }
1219 } else {
1220 basis = 0;
1221 }
1222
f620b43a
BP
1223 if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
1224 == ETH_ADDR_SCAN_COUNT) {
672d18b2 1225 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
f620b43a
BP
1226
1227 hash_cstr = xasprintf("%u", hash);
bde9f75d 1228 unixctl_command_reply(conn, hash_cstr);
f620b43a
BP
1229 free(hash_cstr);
1230 } else {
bde9f75d 1231 unixctl_command_reply_error(conn, "invalid mac");
f620b43a
BP
1232 }
1233}
1234
1235void
1236bond_init(void)
1237{
0e15264f 1238 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
c33a8a25
EJ
1239 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1240 NULL);
0e15264f 1241 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
7ff2009a 1242 bond_unixctl_migrate, NULL);
0e15264f 1243 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
f620b43a 1244 bond_unixctl_set_active_slave, NULL);
0e15264f 1245 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
7ff2009a 1246 bond_unixctl_enable_slave, NULL);
0e15264f 1247 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
7ff2009a 1248 bond_unixctl_disable_slave, NULL);
0e15264f 1249 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
7ff2009a 1250 bond_unixctl_hash, NULL);
f620b43a
BP
1251}
1252\f
95aafb2a
EJ
1253static void
1254bond_entry_reset(struct bond *bond)
1255{
1256 if (bond->balance != BM_AB) {
1257 size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash;
1258
1259 if (!bond->hash) {
1260 bond->hash = xmalloc(hash_len);
1261 }
1262 memset(bond->hash, 0, hash_len);
1263
1264 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1265 } else {
1266 free(bond->hash);
1267 bond->hash = NULL;
1268 }
1269}
1270
f620b43a
BP
1271static struct bond_slave *
1272bond_slave_lookup(struct bond *bond, const void *slave_)
1273{
1274 struct bond_slave *slave;
1275
1276 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1277 &bond->slaves) {
1278 if (slave->aux == slave_) {
1279 return slave;
1280 }
1281 }
1282
1283 return NULL;
1284}
1285
f620b43a
BP
1286static void
1287bond_enable_slave(struct bond_slave *slave, bool enable, struct tag_set *tags)
1288{
1289 slave->delay_expires = LLONG_MAX;
1290 if (enable != slave->enabled) {
1291 slave->enabled = enable;
1292 if (!slave->enabled) {
d28b9ead 1293 VLOG_INFO("interface %s: disabled", slave->name);
b3c18f66
EJ
1294 if (tags) {
1295 tag_set_add(tags, slave->tag);
1296 }
f620b43a 1297 } else {
d28b9ead 1298 VLOG_INFO("interface %s: enabled", slave->name);
f620b43a
BP
1299 slave->tag = tag_create_random();
1300 }
1301 }
1302}
1303
1304static void
1305bond_link_status_update(struct bond_slave *slave, struct tag_set *tags)
1306{
1307 struct bond *bond = slave->bond;
1308 bool up;
1309
296f6519 1310 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
f620b43a
BP
1311 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1312 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1313 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1314 slave->name, up ? "up" : "down");
1315 if (up == slave->enabled) {
1316 slave->delay_expires = LLONG_MAX;
1317 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1318 slave->name, up ? "disabled" : "enabled");
1319 } else {
bdebeece 1320 int delay = (bond->lacp_status != LACP_DISABLED ? 0
f620b43a
BP
1321 : up ? bond->updelay : bond->downdelay);
1322 slave->delay_expires = time_msec() + delay;
1323 if (delay) {
1324 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1325 "for %d ms",
1326 slave->name,
1327 up ? "enabled" : "disabled",
1328 up ? "up" : "down",
1329 delay);
1330 }
1331 }
1332 }
1333
1334 if (time_msec() >= slave->delay_expires) {
1335 bond_enable_slave(slave, up, tags);
1336 }
1337}
1338
f620b43a 1339static unsigned int
672d18b2 1340bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
f620b43a 1341{
672d18b2 1342 return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis);
f620b43a
BP
1343}
1344
1345static unsigned int
672d18b2 1346bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
f620b43a
BP
1347{
1348 struct flow hash_flow = *flow;
d84d4b88 1349 hash_flow.vlan_tci = htons(vlan);
f620b43a
BP
1350
1351 /* The symmetric quality of this hash function is not required, but
1352 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1353 * purposes, so we use it out of convenience. */
672d18b2 1354 return flow_hash_symmetric_l4(&hash_flow, basis);
f620b43a
BP
1355}
1356
fb0b29a3
EJ
1357static unsigned int
1358bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1359{
cb22974d 1360 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
fb0b29a3 1361
bdebeece 1362 return (bond->balance == BM_TCP
672d18b2
EJ
1363 ? bond_hash_tcp(flow, vlan, bond->basis)
1364 : bond_hash_src(flow->dl_src, vlan, bond->basis));
fb0b29a3
EJ
1365}
1366
f620b43a
BP
1367static struct bond_entry *
1368lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1369 uint16_t vlan)
1370{
fb0b29a3 1371 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
f620b43a
BP
1372}
1373
1374static struct bond_slave *
1375choose_output_slave(const struct bond *bond, const struct flow *flow,
bcd2633a 1376 struct flow_wildcards *wc, uint16_t vlan, tag_type *tags)
f620b43a
BP
1377{
1378 struct bond_entry *e;
1379
bdebeece
EJ
1380 if (bond->lacp_status == LACP_CONFIGURED) {
1381 /* LACP has been configured on this bond but negotiations were
1382 * unsuccussful. Drop all traffic. */
1383 return NULL;
1384 }
1385
f620b43a
BP
1386 switch (bond->balance) {
1387 case BM_AB:
1388 return bond->active_slave;
1389
f620b43a 1390 case BM_TCP:
bdebeece
EJ
1391 if (bond->lacp_status != LACP_NEGOTIATED) {
1392 /* Must have LACP negotiations for TCP balanced bonds. */
1393 return NULL;
1394 }
bcd2633a
JP
1395 if (wc) {
1396 flow_mask_hash_fields(wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1397 }
bdebeece
EJ
1398 /* Fall Through. */
1399 case BM_SLB:
bcd2633a
JP
1400 if (wc) {
1401 flow_mask_hash_fields(wc, NX_HASH_FIELDS_ETH_SRC);
1402 }
f620b43a
BP
1403 e = lookup_bond_entry(bond, flow, vlan);
1404 if (!e->slave || !e->slave->enabled) {
c804cadf
EJ
1405 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
1406 struct bond_slave, hmap_node);
1407 if (!e->slave->enabled) {
1408 e->slave = bond->active_slave;
1409 }
f620b43a
BP
1410 e->tag = tag_create_random();
1411 }
00ed8314 1412 *tags |= e->tag;
f620b43a
BP
1413 return e->slave;
1414
1415 default:
1416 NOT_REACHED();
1417 }
1418}
1419
1420static struct bond_slave *
1421bond_choose_slave(const struct bond *bond)
1422{
1423 struct bond_slave *slave, *best;
1424
1425 /* Find an enabled slave. */
1426 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1427 if (slave->enabled) {
1428 return slave;
1429 }
1430 }
1431
1432 /* All interfaces are disabled. Find an interface that will be enabled
1433 * after its updelay expires. */
1434 best = NULL;
1435 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1436 if (slave->delay_expires != LLONG_MAX
296f6519 1437 && slave->may_enable
f620b43a
BP
1438 && (!best || slave->delay_expires < best->delay_expires)) {
1439 best = slave;
1440 }
1441 }
1442 return best;
1443}
1444
1445static void
1446bond_choose_active_slave(struct bond *bond, struct tag_set *tags)
1447{
1448 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1449 struct bond_slave *old_active_slave = bond->active_slave;
1450
1451 bond->active_slave = bond_choose_slave(bond);
1452 if (bond->active_slave) {
1453 if (bond->active_slave->enabled) {
1454 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1455 bond->name, bond->active_slave->name);
1456 } else {
1457 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1458 "remaining %lld ms updelay (since no interface was "
1459 "enabled)", bond->name, bond->active_slave->name,
1460 bond->active_slave->delay_expires - time_msec());
1461 bond_enable_slave(bond->active_slave, true, tags);
1462 }
1463
1464 if (!old_active_slave) {
1465 tag_set_add(tags, bond->no_slaves_tag);
1466 }
1467
1468 bond->send_learning_packets = true;
1469 } else if (old_active_slave) {
d28b9ead 1470 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
f620b43a
BP
1471 }
1472}
1473
1474/* Returns the tag for 'bond''s active slave, or 'bond''s no_slaves_tag if
1475 * there is no active slave. */
1476static tag_type
1477bond_get_active_slave_tag(const struct bond *bond)
1478{
1479 return (bond->active_slave
1480 ? bond->active_slave->tag
1481 : bond->no_slaves_tag);
1482}
1483
1484/* Attempts to make the sum of the bond slaves' statistics appear on the fake
1485 * bond interface. */
1486static void
1487bond_update_fake_slave_stats(struct bond *bond)
1488{
1489 struct netdev_stats bond_stats;
1490 struct bond_slave *slave;
1491 struct netdev *bond_dev;
1492
1493 memset(&bond_stats, 0, sizeof bond_stats);
1494
1495 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1496 struct netdev_stats slave_stats;
1497
1498 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1499 /* XXX: We swap the stats here because they are swapped back when
1500 * reported by the internal device. The reason for this is
1501 * internal devices normally represent packets going into the
1502 * system but when used as fake bond device they represent packets
1503 * leaving the system. We really should do this in the internal
1504 * device itself because changing it here reverses the counts from
1505 * the perspective of the switch. However, the internal device
1506 * doesn't know what type of device it represents so we have to do
1507 * it here for now. */
1508 bond_stats.tx_packets += slave_stats.rx_packets;
1509 bond_stats.tx_bytes += slave_stats.rx_bytes;
1510 bond_stats.rx_packets += slave_stats.tx_packets;
1511 bond_stats.rx_bytes += slave_stats.tx_bytes;
1512 }
1513 }
1514
18812dff 1515 if (!netdev_open(bond->name, "system", &bond_dev)) {
f620b43a
BP
1516 netdev_set_stats(bond_dev, &bond_stats);
1517 netdev_close(bond_dev);
1518 }
1519}