]> git.proxmox.com Git - mirror_ovs.git/blob - lib/reconnect.c
Suppress ovsdb-server log messages about connections from ovs-vsctl.
[mirror_ovs.git] / lib / reconnect.c
1 /*
2 * Copyright (c) 2008, 2009, 2010 Nicira Networks.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "reconnect.h"
19
20 #include <assert.h>
21 #include <stdlib.h>
22
23 #include "poll-loop.h"
24
25 #define THIS_MODULE VLM_reconnect
26 #include "vlog.h"
27
/* The states of the reconnect FSM, expressed as an X-macro so that the enum
 * and the state-name lookup stay in sync.  Each state is a distinct bit so
 * that sets of states can be tested with a single mask.  (Bit 2 is not
 * assigned to any state.) */
#define STATES                              \
    STATE(VOID,                1 << 0)      \
    STATE(BACKOFF,             1 << 1)      \
    STATE(CONNECT_IN_PROGRESS, 1 << 3)      \
    STATE(ACTIVE,              1 << 4)      \
    STATE(IDLE,                1 << 5)      \
    STATE(RECONNECT,           1 << 6)      \
    STATE(LISTENING,           1 << 7)

enum state {
#define STATE(NAME, VALUE) S_##NAME = VALUE,
    STATES
#undef STATE
};
41
42 static bool
43 is_connected_state(enum state state)
44 {
45 return (state & (S_ACTIVE | S_IDLE)) != 0;
46 }
47
48 struct reconnect {
49 /* Configuration. */
50 char *name;
51 int min_backoff;
52 int max_backoff;
53 int probe_interval;
54 bool passive;
55 enum vlog_level info; /* Used for informational messages. */
56
57 /* State. */
58 enum state state;
59 long long int state_entered;
60 int backoff;
61 long long int last_received;
62 long long int last_connected;
63 unsigned int max_tries;
64
65 /* These values are simply for statistics reporting, not otherwise used
66 * directly by anything internal. */
67 long long int creation_time;
68 unsigned int n_attempted_connections, n_successful_connections;
69 unsigned int total_connected_duration;
70 unsigned int seqno;
71 };
72
/* Internal helpers defined later in this file. */
static void reconnect_transition__(struct reconnect *fsm, long long int now,
                                   enum state state);
static long long int reconnect_deadline__(const struct reconnect *fsm);
static bool reconnect_may_retry(struct reconnect *fsm);
77
78 static const char *
79 reconnect_state_name__(enum state state)
80 {
81 switch (state) {
82 #define STATE(NAME, VALUE) case S_##NAME: return #NAME;
83 STATES
84 #undef STATE
85 }
86 return "***ERROR***";
87 }
88
89 /* Creates and returns a new reconnect FSM with default settings. The FSM is
90 * initially disabled. The caller will likely want to call reconnect_enable()
91 * and reconnect_set_name() on the returned object. */
92 struct reconnect *
93 reconnect_create(long long int now)
94 {
95 struct reconnect *fsm = xzalloc(sizeof *fsm);
96
97 fsm->name = xstrdup("void");
98 fsm->min_backoff = 1000;
99 fsm->max_backoff = 8000;
100 fsm->probe_interval = 5000;
101 fsm->passive = false;
102 fsm->info = VLL_INFO;
103
104 fsm->state = S_VOID;
105 fsm->state_entered = now;
106 fsm->backoff = 0;
107 fsm->last_received = now;
108 fsm->last_connected = now;
109 fsm->max_tries = UINT_MAX;
110 fsm->creation_time = now;
111
112 return fsm;
113 }
114
115 /* Frees 'fsm'. */
116 void
117 reconnect_destroy(struct reconnect *fsm)
118 {
119 if (fsm) {
120 free(fsm->name);
121 free(fsm);
122 }
123 }
124
125 /* If 'quiet' is true, 'fsm' will log informational messages at level VLL_DBG,
126 * by default keeping them out of log files. This is appropriate if the
127 * connection is one that is expected to be short-lived, so that the log
128 * messages are merely distracting.
129 *
130 * If 'quiet' is false, 'fsm' logs informational messages at level VLL_INFO.
131 * This is the default.
132 *
133 * This setting has no effect on the log level of debugging, warning, or error
134 * messages. */
135 void
136 reconnect_set_quiet(struct reconnect *fsm, bool quiet)
137 {
138 fsm->info = quiet ? VLL_DBG : VLL_INFO;
139 }
140
141 /* Returns 'fsm''s name. */
142 const char *
143 reconnect_get_name(const struct reconnect *fsm)
144 {
145 return fsm->name;
146 }
147
148 /* Sets 'fsm''s name to 'name'. If 'name' is null, then "void" is used
149 * instead.
150 *
151 * The name set for 'fsm' is used in log messages. */
152 void
153 reconnect_set_name(struct reconnect *fsm, const char *name)
154 {
155 free(fsm->name);
156 fsm->name = xstrdup(name ? name : "void");
157 }
158
159 /* Return the minimum number of milliseconds to back off between consecutive
160 * connection attempts. The default is 1000 ms. */
161 int
162 reconnect_get_min_backoff(const struct reconnect *fsm)
163 {
164 return fsm->min_backoff;
165 }
166
167 /* Return the maximum number of milliseconds to back off between consecutive
168 * connection attempts. The default is 8000 ms. */
169 int
170 reconnect_get_max_backoff(const struct reconnect *fsm)
171 {
172 return fsm->max_backoff;
173 }
174
175 /* Returns the "probe interval" for 'fsm' in milliseconds. If this is zero, it
176 * disables the connection keepalive feature. If it is nonzero, then if the
177 * interval passes while 'fsm' is connected and without reconnect_received()
178 * being called for 'fsm', reconnect_run() returns RECONNECT_PROBE. If the
179 * interval passes again without reconnect_received() being called,
180 * reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'. */
181 int
182 reconnect_get_probe_interval(const struct reconnect *fsm)
183 {
184 return fsm->probe_interval;
185 }
186
187 /* Limits the maximum number of times that 'fsm' will ask the client to try to
188 * reconnect to 'max_tries'. UINT_MAX (the default) means an unlimited number
189 * of tries.
190 *
191 * After the number of tries has expired, the 'fsm' will disable itself
192 * instead of backing off and retrying. */
193 void
194 reconnect_set_max_tries(struct reconnect *fsm, unsigned int max_tries)
195 {
196 fsm->max_tries = max_tries;
197 }
198
199 /* Returns the current remaining number of connection attempts, UINT_MAX if
200 * the number is unlimited. */
201 unsigned int
202 reconnect_get_max_tries(struct reconnect *fsm)
203 {
204 return fsm->max_tries;
205 }
206
207 /* Configures the backoff parameters for 'fsm'. 'min_backoff' is the minimum
208 * number of milliseconds, and 'max_backoff' is the maximum, between connection
209 * attempts.
210 *
211 * 'min_backoff' must be at least 1000, and 'max_backoff' must be greater than
212 * or equal to 'min_backoff'. */
213 void
214 reconnect_set_backoff(struct reconnect *fsm, int min_backoff, int max_backoff)
215 {
216 fsm->min_backoff = MAX(min_backoff, 1000);
217 fsm->max_backoff = max_backoff ? MAX(max_backoff, 1000) : 8000;
218 if (fsm->min_backoff > fsm->max_backoff) {
219 fsm->max_backoff = fsm->min_backoff;
220 }
221
222 if (fsm->state == S_BACKOFF && fsm->backoff > max_backoff) {
223 fsm->backoff = max_backoff;
224 }
225 }
226
227 /* Sets the "probe interval" for 'fsm' to 'probe_interval', in milliseconds.
228 * If this is zero, it disables the connection keepalive feature. If it is
229 * nonzero, then if the interval passes while 'fsm' is connected and without
230 * reconnect_received() being called for 'fsm', reconnect_run() returns
231 * RECONNECT_PROBE. If the interval passes again without reconnect_received()
232 * being called, reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'.
233 *
234 * If 'probe_interval' is nonzero, then it will be forced to a value of at
235 * least 1000 ms. */
236 void
237 reconnect_set_probe_interval(struct reconnect *fsm, int probe_interval)
238 {
239 fsm->probe_interval = probe_interval ? MAX(1000, probe_interval) : 0;
240 }
241
242 /* Returns true if 'fsm' is in passive mode, false if 'fsm' is in active mode
243 * (the default). */
244 bool
245 reconnect_is_passive(const struct reconnect *fsm)
246 {
247 return fsm->passive;
248 }
249
250 /* Configures 'fsm' for active or passive mode. In active mode (the default),
251 * the FSM is attempting to connect to a remote host. In passive mode, the FSM
252 * is listening for connections from a remote host. */
253 void
254 reconnect_set_passive(struct reconnect *fsm, bool passive, long long int now)
255 {
256 if (fsm->passive != passive) {
257 fsm->passive = passive;
258
259 if (passive
260 ? fsm->state & (S_CONNECT_IN_PROGRESS | S_RECONNECT)
261 : fsm->state == S_LISTENING && reconnect_may_retry(fsm)) {
262 reconnect_transition__(fsm, now, S_BACKOFF);
263 fsm->backoff = 0;
264 }
265 }
266 }
267
268 /* Returns true if 'fsm' has been enabled with reconnect_enable(). Calling
269 * another function that indicates a change in connection state, such as
270 * reconnect_disconnected() or reconnect_force_reconnect(), will also enable
271 * a reconnect FSM. */
272 bool
273 reconnect_is_enabled(const struct reconnect *fsm)
274 {
275 return fsm->state != S_VOID;
276 }
277
278 /* If 'fsm' is disabled (the default for newly created FSMs), enables it, so
279 * that the next call to reconnect_run() for 'fsm' will return
280 * RECONNECT_CONNECT.
281 *
282 * If 'fsm' is not disabled, this function has no effect. */
283 void
284 reconnect_enable(struct reconnect *fsm, long long int now)
285 {
286 if (fsm->state == S_VOID && reconnect_may_retry(fsm)) {
287 reconnect_transition__(fsm, now, S_BACKOFF);
288 fsm->backoff = 0;
289 }
290 }
291
292 /* Disables 'fsm'. Until 'fsm' is enabled again, reconnect_run() will always
293 * return 0. */
294 void
295 reconnect_disable(struct reconnect *fsm, long long int now)
296 {
297 if (fsm->state != S_VOID) {
298 reconnect_transition__(fsm, now, S_VOID);
299 }
300 }
301
302 /* If 'fsm' is enabled and currently connected (or attempting to connect),
303 * forces reconnect_run() for 'fsm' to return RECONNECT_DISCONNECT the next
304 * time it is called, which should cause the client to drop the connection (or
305 * attempt), back off, and then reconnect. */
306 void
307 reconnect_force_reconnect(struct reconnect *fsm, long long int now)
308 {
309 if (fsm->state & (S_CONNECT_IN_PROGRESS | S_ACTIVE | S_IDLE)) {
310 reconnect_transition__(fsm, now, S_RECONNECT);
311 }
312 }
313
314 /* Tell 'fsm' that the connection dropped or that a connection attempt failed.
315 * 'error' specifies the reason: a positive value represents an errno value,
316 * EOF indicates that the connection was closed by the peer (e.g. read()
317 * returned 0), and 0 indicates no specific error.
318 *
319 * The FSM will back off, then reconnect. */
320 void
321 reconnect_disconnected(struct reconnect *fsm, long long int now, int error)
322 {
323 if (!(fsm->state & (S_BACKOFF | S_VOID))) {
324 /* Report what happened. */
325 if (fsm->state & (S_ACTIVE | S_IDLE)) {
326 if (error > 0) {
327 VLOG_WARN("%s: connection dropped (%s)",
328 fsm->name, strerror(error));
329 } else if (error == EOF) {
330 VLOG(fsm->info, "%s: connection closed by peer", fsm->name);
331 } else {
332 VLOG(fsm->info, "%s: connection dropped", fsm->name);
333 }
334 } else if (fsm->state == S_LISTENING) {
335 if (error > 0) {
336 VLOG_WARN("%s: error listening for connections (%s)",
337 fsm->name, strerror(error));
338 } else {
339 VLOG(fsm->info, "%s: error listening for connections",
340 fsm->name);
341 }
342 } else {
343 const char *type = fsm->passive ? "listen" : "connection";
344 if (error > 0) {
345 VLOG_WARN("%s: %s attempt failed (%s)",
346 fsm->name, type, strerror(error));
347 } else {
348 VLOG(fsm->info, "%s: %s attempt timed out", fsm->name, type);
349 }
350 }
351
352 /* Back off. */
353 if (fsm->state & (S_ACTIVE | S_IDLE)
354 && (fsm->last_received - fsm->last_connected >= fsm->backoff
355 || fsm->passive)) {
356 fsm->backoff = fsm->passive ? 0 : fsm->min_backoff;
357 } else {
358 if (fsm->backoff < fsm->min_backoff) {
359 fsm->backoff = fsm->min_backoff;
360 } else if (fsm->backoff >= fsm->max_backoff / 2) {
361 fsm->backoff = fsm->max_backoff;
362 } else {
363 fsm->backoff *= 2;
364 }
365 if (fsm->passive) {
366 VLOG(fsm->info, "%s: waiting %.3g seconds before trying to "
367 "listen again", fsm->name, fsm->backoff / 1000.0);
368 } else {
369 VLOG(fsm->info, "%s: waiting %.3g seconds before reconnect",
370 fsm->name, fsm->backoff / 1000.0);
371 }
372 }
373
374 reconnect_transition__(fsm, now,
375 reconnect_may_retry(fsm) ? S_BACKOFF : S_VOID);
376 }
377 }
378
379 /* Tell 'fsm' that a connection or listening attempt is in progress.
380 *
381 * The FSM will start a timer, after which the connection or listening attempt
382 * will be aborted (by returning RECONNECT_DISCONNECT from reconect_run()). */
383 void
384 reconnect_connecting(struct reconnect *fsm, long long int now)
385 {
386 if (fsm->state != S_CONNECT_IN_PROGRESS) {
387 if (fsm->passive) {
388 VLOG(fsm->info, "%s: listening...", fsm->name);
389 } else {
390 VLOG(fsm->info, "%s: connecting...", fsm->name);
391 }
392 reconnect_transition__(fsm, now, S_CONNECT_IN_PROGRESS);
393 }
394 }
395
396 /* Tell 'fsm' that the client is listening for connection attempts. This state
397 * last indefinitely until the client reports some change.
398 *
399 * The natural progression from this state is for the client to report that a
400 * connection has been accepted or is in progress of being accepted, by calling
401 * reconnect_connecting() or reconnect_connected().
402 *
403 * The client may also report that listening failed (e.g. accept() returned an
404 * unexpected error such as ENOMEM) by calling reconnect_listen_error(), in
405 * which case the FSM will back off and eventually return RECONNECT_CONNECT
406 * from reconnect_run() to tell the client to try listening again. */
407 void
408 reconnect_listening(struct reconnect *fsm, long long int now)
409 {
410 if (fsm->state != S_LISTENING) {
411 VLOG(fsm->info, "%s: listening...", fsm->name);
412 reconnect_transition__(fsm, now, S_LISTENING);
413 }
414 }
415
416 /* Tell 'fsm' that the client's attempt to accept a connection failed
417 * (e.g. accept() returned an unexpected error such as ENOMEM).
418 *
419 * If the FSM is currently listening (reconnect_listening() was called), it
420 * will back off and eventually return RECONNECT_CONNECT from reconnect_run()
421 * to tell the client to try listening again. If there is an active
422 * connection, this will be delayed until that connection drops. */
423 void
424 reconnect_listen_error(struct reconnect *fsm, long long int now, int error)
425 {
426 if (fsm->state == S_LISTENING) {
427 reconnect_disconnected(fsm, now, error);
428 }
429 }
430
431 /* Tell 'fsm' that the connection was successful.
432 *
433 * The FSM will start the probe interval timer, which is reset by
434 * reconnect_received(). If the timer expires, a probe will be sent (by
435 * returning RECONNECT_PROBE from reconnect_run()). If the timer expires
436 * again without being reset, the connection will be aborted (by returning
437 * RECONNECT_DISCONNECT from reconnect_run()). */
438 void
439 reconnect_connected(struct reconnect *fsm, long long int now)
440 {
441 if (!is_connected_state(fsm->state)) {
442 reconnect_connecting(fsm, now);
443
444 VLOG(fsm->info, "%s: connected", fsm->name);
445 reconnect_transition__(fsm, now, S_ACTIVE);
446 fsm->last_connected = now;
447 }
448 }
449
/* Tells 'fsm' that the connection attempt failed, with 'error' interpreted as
 * for reconnect_disconnected().
 *
 * The FSM is first put into S_CONNECT_IN_PROGRESS (if it is not there
 * already) and then told about the disconnection, so it will back off and
 * attempt to reconnect. */
void
reconnect_connect_failed(struct reconnect *fsm, long long int now, int error)
{
    struct reconnect *target = fsm;

    reconnect_connecting(target, now);
    reconnect_disconnected(target, now, error);
}
459
460 /* Tell 'fsm' that some data was received. This resets the probe interval
461 * timer, so that the connection is known not to be idle. */
462 void
463 reconnect_received(struct reconnect *fsm, long long int now)
464 {
465 if (fsm->state != S_ACTIVE) {
466 reconnect_transition__(fsm, now, S_ACTIVE);
467 }
468 fsm->last_received = now;
469 }
470
471 static void
472 reconnect_transition__(struct reconnect *fsm, long long int now,
473 enum state state)
474 {
475 if (fsm->state == S_CONNECT_IN_PROGRESS) {
476 fsm->n_attempted_connections++;
477 if (state == S_ACTIVE) {
478 fsm->n_successful_connections++;
479 }
480 }
481 if (is_connected_state(fsm->state) != is_connected_state(state)) {
482 if (is_connected_state(fsm->state)) {
483 fsm->total_connected_duration += now - fsm->last_connected;
484 }
485 fsm->seqno++;
486 }
487
488 VLOG_DBG("%s: entering %s", fsm->name, reconnect_state_name__(state));
489 fsm->state = state;
490 fsm->state_entered = now;
491 }
492
493 static long long int
494 reconnect_deadline__(const struct reconnect *fsm)
495 {
496 assert(fsm->state_entered != LLONG_MIN);
497 switch (fsm->state) {
498 case S_VOID:
499 case S_LISTENING:
500 return LLONG_MAX;
501
502 case S_BACKOFF:
503 return fsm->state_entered + fsm->backoff;
504
505 case S_CONNECT_IN_PROGRESS:
506 return fsm->state_entered + MAX(1000, fsm->backoff);
507
508 case S_ACTIVE:
509 if (fsm->probe_interval) {
510 long long int base = MAX(fsm->last_received, fsm->state_entered);
511 return base + fsm->probe_interval;
512 }
513 return LLONG_MAX;
514
515 case S_IDLE:
516 return fsm->state_entered + fsm->probe_interval;
517
518 case S_RECONNECT:
519 return fsm->state_entered;
520 }
521
522 NOT_REACHED();
523 }
524
525 /* Assesses whether any action should be taken on 'fsm'. The return value is
526 * one of:
527 *
528 * - 0: The client need not take any action.
529 *
530 * - Active client, RECONNECT_CONNECT: The client should start a connection
531 * attempt and indicate this by calling reconnect_connecting(). If the
532 * connection attempt has definitely succeeded, it should call
533 * reconnect_connected(). If the connection attempt has definitely
534 * failed, it should call reconnect_connect_failed().
535 *
536 * The FSM is smart enough to back off correctly after successful
537 * connections that quickly abort, so it is OK to call
538 * reconnect_connected() after a low-level successful connection
539 * (e.g. connect()) even if the connection might soon abort due to a
540 * failure at a high-level (e.g. SSL negotiation failure).
541 *
542 * - Passive client, RECONNECT_CONNECT: The client should try to listen for
543 * a connection, if it is not already listening. It should call
544 * reconnect_listening() if successful, otherwise reconnect_connecting()
545 * or reconnected_connect_failed() if the attempt is in progress or
546 * definitely failed, respectively.
547 *
548 * A listening passive client should constantly attempt to accept a new
549 * connection and report an accepted connection with
550 * reconnect_connected().
551 *
552 * - RECONNECT_DISCONNECT: The client should abort the current connection
553 * or connection attempt or listen attempt and call
554 * reconnect_disconnected() or reconnect_connect_failed() to indicate it.
555 *
556 * - RECONNECT_PROBE: The client should send some kind of request to the
557 * peer that will elicit a response, to ensure that the connection is
558 * indeed in working order. (This will only be returned if the "probe
559 * interval" is nonzero--see reconnect_set_probe_interval()).
560 */
561 enum reconnect_action
562 reconnect_run(struct reconnect *fsm, long long int now)
563 {
564 if (now >= reconnect_deadline__(fsm)) {
565 switch (fsm->state) {
566 case S_VOID:
567 return 0;
568
569 case S_BACKOFF:
570 return RECONNECT_CONNECT;
571
572 case S_CONNECT_IN_PROGRESS:
573 return RECONNECT_DISCONNECT;
574
575 case S_ACTIVE:
576 VLOG_DBG("%s: idle %lld ms, sending inactivity probe", fsm->name,
577 now - MAX(fsm->last_received, fsm->state_entered));
578 reconnect_transition__(fsm, now, S_IDLE);
579 return RECONNECT_PROBE;
580
581 case S_IDLE:
582 VLOG_ERR("%s: no response to inactivity probe after %.3g "
583 "seconds, disconnecting",
584 fsm->name, (now - fsm->state_entered) / 1000.0);
585 return RECONNECT_DISCONNECT;
586
587 case S_RECONNECT:
588 return RECONNECT_DISCONNECT;
589
590 case S_LISTENING:
591 return 0;
592 }
593
594 NOT_REACHED();
595 } else {
596 return 0;
597 }
598 }
599
/* Arranges for the next call to poll_block() to wake up when reconnect_run()
 * should be called on 'fsm'.  Does nothing if 'fsm' currently has no
 * deadline. */
void
reconnect_wait(struct reconnect *fsm, long long int now)
{
    int wait_ms = reconnect_timeout(fsm, now);

    if (wait_ms < 0) {
        return;
    }
    poll_timer_wait(wait_ms);
}
610
/* Returns the number of milliseconds after which reconnect_run() should be
 * called on 'fsm' if nothing else notable happens in the meantime, or -1 if
 * this is currently unnecessary.
 *
 * The result is 0 if the deadline has already passed and is capped at
 * INT_MAX. */
int
reconnect_timeout(struct reconnect *fsm, long long int now)
{
    long long int deadline = reconnect_deadline__(fsm);
    long long int remaining;

    if (deadline == LLONG_MAX) {
        return -1;
    }

    remaining = deadline - now;
    if (remaining < 0) {
        return 0;
    }
    if (remaining > INT_MAX) {
        return INT_MAX;
    }
    return remaining;
}
624
625 /* Returns true if 'fsm' is currently believed to be connected, that is, if
626 * reconnect_connected() was called more recently than any call to
627 * reconnect_connect_failed() or reconnect_disconnected() or
628 * reconnect_disable(), and false otherwise. */
629 bool
630 reconnect_is_connected(const struct reconnect *fsm)
631 {
632 return is_connected_state(fsm->state);
633 }
634
635 /* Returns the number of milliseconds for which 'fsm' has been continuously
636 * connected to its peer. (If 'fsm' is not currently connected, this is 0.) */
637 unsigned int
638 reconnect_get_connection_duration(const struct reconnect *fsm,
639 long long int now)
640 {
641 return reconnect_is_connected(fsm) ? now - fsm->last_connected : 0;
642 }
643
644 /* Copies various statistics for 'fsm' into '*stats'. */
645 void
646 reconnect_get_stats(const struct reconnect *fsm, long long int now,
647 struct reconnect_stats *stats)
648 {
649 stats->creation_time = fsm->creation_time;
650 stats->last_received = fsm->last_received;
651 stats->last_connected = fsm->last_connected;
652 stats->backoff = fsm->backoff;
653 stats->seqno = fsm->seqno;
654 stats->is_connected = reconnect_is_connected(fsm);
655 stats->current_connection_duration
656 = reconnect_get_connection_duration(fsm, now);
657 stats->total_connected_duration = (stats->current_connection_duration
658 + fsm->total_connected_duration);
659 stats->n_attempted_connections = fsm->n_attempted_connections;
660 stats->n_successful_connections = fsm->n_successful_connections;
661 stats->state = reconnect_state_name__(fsm->state);
662 stats->state_elapsed = now - fsm->state_entered;
663 }
664
665 static bool
666 reconnect_may_retry(struct reconnect *fsm)
667 {
668 bool may_retry = fsm->max_tries > 0;
669 if (may_retry && fsm->max_tries != UINT_MAX) {
670 fsm->max_tries--;
671 }
672 return may_retry;
673 }