2 * Copyright (c) 2008, 2009, 2010, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 #include "reconnect.h"
22 #include "poll-loop.h"
24 #include "openvswitch/vlog.h"
26 VLOG_DEFINE_THIS_MODULE(reconnect
);
30 STATE(BACKOFF, 1 << 1) \
31 STATE(CONNECTING, 1 << 3) \
32 STATE(ACTIVE, 1 << 4) \
34 STATE(RECONNECT, 1 << 6) \
35 STATE(LISTENING, 1 << 7)
37 #define STATE(NAME, VALUE) S_##NAME = VALUE,
43 is_connected_state(enum state state
)
45 return (state
& (S_ACTIVE
| S_IDLE
)) != 0;
55 enum vlog_level info
; /* Used for informational messages. */
59 long long int state_entered
;
61 long long int last_activity
;
62 long long int last_connected
;
63 long long int last_disconnected
;
64 unsigned int max_tries
;
66 /* These values are simply for statistics reporting, not otherwise used
67 * directly by anything internal. */
68 long long int creation_time
;
69 unsigned int n_attempted_connections
, n_successful_connections
;
70 unsigned int total_connected_duration
;
74 static void reconnect_transition__(struct reconnect
*, long long int now
,
76 static long long int reconnect_deadline__(const struct reconnect
*);
77 static bool reconnect_may_retry(struct reconnect
*);
80 reconnect_state_name__(enum state state
)
83 #define STATE(NAME, VALUE) case S_##NAME: return #NAME;
90 /* Creates and returns a new reconnect FSM with default settings. The FSM is
91 * initially disabled. The caller will likely want to call reconnect_enable()
92 * and reconnect_set_name() on the returned object. */
94 reconnect_create(long long int now
)
96 struct reconnect
*fsm
= xzalloc(sizeof *fsm
);
98 fsm
->name
= xstrdup("void");
99 fsm
->min_backoff
= RECONNECT_DEFAULT_MIN_BACKOFF
;
100 fsm
->max_backoff
= RECONNECT_DEFAULT_MAX_BACKOFF
;
101 fsm
->probe_interval
= RECONNECT_DEFAULT_PROBE_INTERVAL
;
102 fsm
->passive
= false;
103 fsm
->info
= VLL_INFO
;
106 fsm
->state_entered
= now
;
108 fsm
->last_activity
= now
;
109 fsm
->last_connected
= LLONG_MAX
;
110 fsm
->last_disconnected
= LLONG_MAX
;
111 fsm
->max_tries
= UINT_MAX
;
112 fsm
->creation_time
= now
;
119 reconnect_destroy(struct reconnect
*fsm
)
127 /* If 'quiet' is true, 'fsm' will log informational messages at level VLL_DBG,
128 * by default keeping them out of log files. This is appropriate if the
129 * connection is one that is expected to be short-lived, so that the log
130 * messages are merely distracting.
132 * If 'quiet' is false, 'fsm' logs informational messages at level VLL_INFO.
133 * This is the default.
135 * This setting has no effect on the log level of debugging, warning, or error
138 reconnect_set_quiet(struct reconnect
*fsm
, bool quiet
)
140 fsm
->info
= quiet
? VLL_DBG
: VLL_INFO
;
143 /* Returns 'fsm''s name. */
145 reconnect_get_name(const struct reconnect
*fsm
)
150 /* Sets 'fsm''s name to 'name'. If 'name' is null, then "void" is used
153 * The name set for 'fsm' is used in log messages. */
155 reconnect_set_name(struct reconnect
*fsm
, const char *name
)
158 fsm
->name
= xstrdup(name
? name
: "void");
161 /* Return the minimum number of milliseconds to back off between consecutive
162 * connection attempts. The default is RECONNECT_DEFAULT_MIN_BACKOFF. */
164 reconnect_get_min_backoff(const struct reconnect
*fsm
)
166 return fsm
->min_backoff
;
169 /* Return the maximum number of milliseconds to back off between consecutive
170 * connection attempts. The default is RECONNECT_DEFAULT_MAX_BACKOFF. */
172 reconnect_get_max_backoff(const struct reconnect
*fsm
)
174 return fsm
->max_backoff
;
177 /* Returns the "probe interval" for 'fsm' in milliseconds. If this is zero, it
178 * disables the connection keepalive feature. If it is nonzero, then if the
179 * interval passes while 'fsm' is connected and without reconnect_activity()
180 * being called for 'fsm', reconnect_run() returns RECONNECT_PROBE. If the
181 * interval passes again without reconnect_activity() being called,
182 * reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'. */
184 reconnect_get_probe_interval(const struct reconnect
*fsm
)
186 return fsm
->probe_interval
;
189 /* Limits the maximum number of times that 'fsm' will ask the client to try to
190 * reconnect to 'max_tries'. UINT_MAX (the default) means an unlimited number
193 * After the number of tries has expired, the 'fsm' will disable itself
194 * instead of backing off and retrying. */
196 reconnect_set_max_tries(struct reconnect
*fsm
, unsigned int max_tries
)
198 fsm
->max_tries
= max_tries
;
201 /* Returns the current remaining number of connection attempts, UINT_MAX if
202 * the number is unlimited. */
204 reconnect_get_max_tries(struct reconnect
*fsm
)
206 return fsm
->max_tries
;
209 /* Configures the backoff parameters for 'fsm'. 'min_backoff' is the minimum
210 * number of milliseconds, and 'max_backoff' is the maximum, between connection
211 * attempts. The current backoff is also the duration that 'fsm' is willing to
212 * wait for a given connection to succeed or fail.
214 * 'min_backoff' must be at least 1000, and 'max_backoff' must be greater than
215 * or equal to 'min_backoff'.
217 * Pass 0 for 'min_backoff' or 'max_backoff' or both to use the defaults. */
219 reconnect_set_backoff(struct reconnect
*fsm
, int min_backoff
, int max_backoff
)
221 fsm
->min_backoff
= MAX(min_backoff
, 1000);
222 fsm
->max_backoff
= (max_backoff
223 ? MAX(max_backoff
, 1000)
224 : RECONNECT_DEFAULT_MAX_BACKOFF
);
225 if (fsm
->min_backoff
> fsm
->max_backoff
) {
226 fsm
->max_backoff
= fsm
->min_backoff
;
229 if (fsm
->state
== S_BACKOFF
&& fsm
->backoff
> max_backoff
) {
230 fsm
->backoff
= max_backoff
;
234 /* Sets the "probe interval" for 'fsm' to 'probe_interval', in milliseconds.
235 * If this is zero, it disables the connection keepalive feature. If it is
236 * nonzero, then if the interval passes while 'fsm' is connected and without
237 * reconnect_activity() being called for 'fsm', reconnect_run() returns
238 * RECONNECT_PROBE. If the interval passes again without reconnect_activity()
239 * being called, reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'.
241 * If 'probe_interval' is nonzero, then it will be forced to a value of at
244 reconnect_set_probe_interval(struct reconnect
*fsm
, int probe_interval
)
246 fsm
->probe_interval
= probe_interval
? MAX(1000, probe_interval
) : 0;
249 /* Returns true if 'fsm' is in passive mode, false if 'fsm' is in active mode
252 reconnect_is_passive(const struct reconnect
*fsm
)
257 /* Configures 'fsm' for active or passive mode. In active mode (the default),
258 * the FSM is attempting to connect to a remote host. In passive mode, the FSM
259 * is listening for connections from a remote host. */
261 reconnect_set_passive(struct reconnect
*fsm
, bool passive
, long long int now
)
263 if (fsm
->passive
!= passive
) {
264 fsm
->passive
= passive
;
267 ? fsm
->state
& (S_CONNECTING
| S_RECONNECT
)
268 : fsm
->state
== S_LISTENING
&& reconnect_may_retry(fsm
)) {
269 reconnect_transition__(fsm
, now
, S_BACKOFF
);
275 /* Returns true if 'fsm' has been enabled with reconnect_enable(). Calling
276 * another function that indicates a change in connection state, such as
277 * reconnect_disconnected() or reconnect_force_reconnect(), will also enable
278 * a reconnect FSM. */
280 reconnect_is_enabled(const struct reconnect
*fsm
)
282 return fsm
->state
!= S_VOID
;
285 /* If 'fsm' is disabled (the default for newly created FSMs), enables it, so
286 * that the next call to reconnect_run() for 'fsm' will return
289 * If 'fsm' is not disabled, this function has no effect. */
291 reconnect_enable(struct reconnect
*fsm
, long long int now
)
293 if (fsm
->state
== S_VOID
&& reconnect_may_retry(fsm
)) {
294 reconnect_transition__(fsm
, now
, S_BACKOFF
);
299 /* Disables 'fsm'. Until 'fsm' is enabled again, reconnect_run() will always
302 reconnect_disable(struct reconnect
*fsm
, long long int now
)
304 if (fsm
->state
!= S_VOID
) {
305 reconnect_transition__(fsm
, now
, S_VOID
);
309 /* If 'fsm' is enabled and currently connected (or attempting to connect),
310 * forces reconnect_run() for 'fsm' to return RECONNECT_DISCONNECT the next
311 * time it is called, which should cause the client to drop the connection (or
312 * attempt), back off, and then reconnect. */
314 reconnect_force_reconnect(struct reconnect
*fsm
, long long int now
)
316 if (fsm
->state
& (S_CONNECTING
| S_ACTIVE
| S_IDLE
)) {
317 reconnect_transition__(fsm
, now
, S_RECONNECT
);
321 /* Tell 'fsm' that the connection dropped or that a connection attempt failed.
322 * 'error' specifies the reason: a positive value represents an errno value,
323 * EOF indicates that the connection was closed by the peer (e.g. read()
324 * returned 0), and 0 indicates no specific error.
326 * The FSM will back off, then reconnect. */
328 reconnect_disconnected(struct reconnect
*fsm
, long long int now
, int error
)
330 if (!(fsm
->state
& (S_BACKOFF
| S_VOID
))) {
331 /* Report what happened. */
332 if (fsm
->state
& (S_ACTIVE
| S_IDLE
)) {
334 VLOG_WARN("%s: connection dropped (%s)",
335 fsm
->name
, ovs_strerror(error
));
336 } else if (error
== EOF
) {
337 VLOG(fsm
->info
, "%s: connection closed by peer", fsm
->name
);
339 VLOG(fsm
->info
, "%s: connection dropped", fsm
->name
);
341 } else if (fsm
->state
== S_LISTENING
) {
343 VLOG_WARN("%s: error listening for connections (%s)",
344 fsm
->name
, ovs_strerror(error
));
346 VLOG(fsm
->info
, "%s: error listening for connections",
350 const char *type
= fsm
->passive
? "listen" : "connection";
352 VLOG_INFO("%s: %s attempt failed (%s)",
353 fsm
->name
, type
, ovs_strerror(error
));
355 VLOG(fsm
->info
, "%s: %s attempt timed out", fsm
->name
, type
);
359 if (fsm
->state
& (S_ACTIVE
| S_IDLE
)) {
360 fsm
->last_disconnected
= now
;
363 if (fsm
->state
& (S_ACTIVE
| S_IDLE
)
364 && (fsm
->last_activity
- fsm
->last_connected
>= fsm
->backoff
366 fsm
->backoff
= fsm
->passive
? 0 : fsm
->min_backoff
;
368 if (fsm
->backoff
< fsm
->min_backoff
) {
369 fsm
->backoff
= fsm
->min_backoff
;
370 } else if (fsm
->backoff
>= fsm
->max_backoff
/ 2) {
371 fsm
->backoff
= fsm
->max_backoff
;
376 VLOG(fsm
->info
, "%s: waiting %.3g seconds before trying to "
377 "listen again", fsm
->name
, fsm
->backoff
/ 1000.0);
379 VLOG(fsm
->info
, "%s: waiting %.3g seconds before reconnect",
380 fsm
->name
, fsm
->backoff
/ 1000.0);
384 reconnect_transition__(fsm
, now
,
385 reconnect_may_retry(fsm
) ? S_BACKOFF
: S_VOID
);
389 /* Tell 'fsm' that a connection or listening attempt is in progress.
391 * The FSM will start a timer, after which the connection or listening attempt
392 * will be aborted (by returning RECONNECT_DISCONNECT from
393 * reconnect_run()). */
395 reconnect_connecting(struct reconnect
*fsm
, long long int now
)
397 if (fsm
->state
!= S_CONNECTING
) {
399 VLOG(fsm
->info
, "%s: listening...", fsm
->name
);
401 VLOG(fsm
->info
, "%s: connecting...", fsm
->name
);
403 reconnect_transition__(fsm
, now
, S_CONNECTING
);
407 /* Tell 'fsm' that the client is listening for connection attempts. This state
408 * last indefinitely until the client reports some change.
410 * The natural progression from this state is for the client to report that a
411 * connection has been accepted or is in progress of being accepted, by calling
412 * reconnect_connecting() or reconnect_connected().
414 * The client may also report that listening failed (e.g. accept() returned an
415 * unexpected error such as ENOMEM) by calling reconnect_listen_error(), in
416 * which case the FSM will back off and eventually return RECONNECT_CONNECT
417 * from reconnect_run() to tell the client to try listening again. */
419 reconnect_listening(struct reconnect
*fsm
, long long int now
)
421 if (fsm
->state
!= S_LISTENING
) {
422 VLOG(fsm
->info
, "%s: listening...", fsm
->name
);
423 reconnect_transition__(fsm
, now
, S_LISTENING
);
427 /* Tell 'fsm' that the client's attempt to accept a connection failed
428 * (e.g. accept() returned an unexpected error such as ENOMEM).
430 * If the FSM is currently listening (reconnect_listening() was called), it
431 * will back off and eventually return RECONNECT_CONNECT from reconnect_run()
432 * to tell the client to try listening again. If there is an active
433 * connection, this will be delayed until that connection drops. */
435 reconnect_listen_error(struct reconnect
*fsm
, long long int now
, int error
)
437 if (fsm
->state
== S_LISTENING
) {
438 reconnect_disconnected(fsm
, now
, error
);
442 /* Tell 'fsm' that the connection was successful.
444 * The FSM will start the probe interval timer, which is reset by
445 * reconnect_activity(). If the timer expires, a probe will be sent (by
446 * returning RECONNECT_PROBE from reconnect_run()). If the timer expires
447 * again without being reset, the connection will be aborted (by returning
448 * RECONNECT_DISCONNECT from reconnect_run()). */
450 reconnect_connected(struct reconnect
*fsm
, long long int now
)
452 if (!is_connected_state(fsm
->state
)) {
453 reconnect_connecting(fsm
, now
);
455 VLOG(fsm
->info
, "%s: connected", fsm
->name
);
456 reconnect_transition__(fsm
, now
, S_ACTIVE
);
457 fsm
->last_connected
= now
;
/* Tells 'fsm' that the connection attempt failed, with 'error' giving the
 * reason in the form that reconnect_disconnected() accepts.
 *
 * The failure is processed as a connection attempt (reconnect_connecting())
 * followed immediately by a disconnection (reconnect_disconnected()), so
 * 'fsm' backs off and then attempts to reconnect. */
void
reconnect_connect_failed(struct reconnect *fsm, long long int now, int error)
{
    /* Step 1: make sure the attempt itself has been reported. */
    reconnect_connecting(fsm, now);

    /* Step 2: report its failure. */
    reconnect_disconnected(fsm, now, error);
}
471 /* Tell 'fsm' that some activity has occurred on the connection. This resets
472 * the probe interval timer, so that the connection is known not to be idle. */
474 reconnect_activity(struct reconnect
*fsm
, long long int now
)
476 if (fsm
->state
!= S_ACTIVE
) {
477 reconnect_transition__(fsm
, now
, S_ACTIVE
);
479 fsm
->last_activity
= now
;
483 reconnect_transition__(struct reconnect
*fsm
, long long int now
,
486 if (fsm
->state
== S_CONNECTING
) {
487 fsm
->n_attempted_connections
++;
488 if (state
== S_ACTIVE
) {
489 fsm
->n_successful_connections
++;
492 if (is_connected_state(fsm
->state
) != is_connected_state(state
)) {
493 if (is_connected_state(fsm
->state
)) {
494 fsm
->total_connected_duration
+= now
- fsm
->last_connected
;
499 VLOG_DBG("%s: entering %s", fsm
->name
, reconnect_state_name__(state
));
501 fsm
->state_entered
= now
;
505 reconnect_deadline__(const struct reconnect
*fsm
)
507 ovs_assert(fsm
->state_entered
!= LLONG_MIN
);
508 switch (fsm
->state
) {
514 return fsm
->state_entered
+ fsm
->backoff
;
517 return fsm
->state_entered
+ MAX(1000, fsm
->backoff
);
520 if (fsm
->probe_interval
) {
521 long long int base
= MAX(fsm
->last_activity
, fsm
->state_entered
);
522 return base
+ fsm
->probe_interval
;
527 if (fsm
->probe_interval
) {
528 return fsm
->state_entered
+ fsm
->probe_interval
;
533 return fsm
->state_entered
;
539 /* Assesses whether any action should be taken on 'fsm'. The return value is
542 * - 0: The client need not take any action.
544 * - Active client, RECONNECT_CONNECT: The client should start a connection
545 * attempt and indicate this by calling reconnect_connecting(). If the
546 * connection attempt has definitely succeeded, it should call
547 * reconnect_connected(). If the connection attempt has definitely
548 * failed, it should call reconnect_connect_failed().
550 * The FSM is smart enough to back off correctly after successful
551 * connections that quickly abort, so it is OK to call
552 * reconnect_connected() after a low-level successful connection
553 * (e.g. connect()) even if the connection might soon abort due to a
554 * failure at a high-level (e.g. SSL negotiation failure).
556 * - Passive client, RECONNECT_CONNECT: The client should try to listen for
557 * a connection, if it is not already listening. It should call
558 * reconnect_listening() if successful, otherwise reconnect_connecting()
559 * or reconnected_connect_failed() if the attempt is in progress or
560 * definitely failed, respectively.
562 * A listening passive client should constantly attempt to accept a new
563 * connection and report an accepted connection with
564 * reconnect_connected().
566 * - RECONNECT_DISCONNECT: The client should abort the current connection
567 * or connection attempt or listen attempt and call
568 * reconnect_disconnected() or reconnect_connect_failed() to indicate it.
570 * - RECONNECT_PROBE: The client should send some kind of request to the
571 * peer that will elicit a response, to ensure that the connection is
572 * indeed in working order. (This will only be returned if the "probe
573 * interval" is nonzero--see reconnect_set_probe_interval()).
575 enum reconnect_action
576 reconnect_run(struct reconnect
*fsm
, long long int now
)
578 if (now
>= reconnect_deadline__(fsm
)) {
579 switch (fsm
->state
) {
584 return RECONNECT_CONNECT
;
587 return RECONNECT_DISCONNECT
;
590 VLOG_DBG("%s: idle %lld ms, sending inactivity probe", fsm
->name
,
591 now
- MAX(fsm
->last_activity
, fsm
->state_entered
));
592 reconnect_transition__(fsm
, now
, S_IDLE
);
593 return RECONNECT_PROBE
;
596 VLOG_ERR("%s: no response to inactivity probe after %.3g "
597 "seconds, disconnecting",
598 fsm
->name
, (now
- fsm
->state_entered
) / 1000.0);
599 return RECONNECT_DISCONNECT
;
602 return RECONNECT_DISCONNECT
;
614 /* Causes the next call to poll_block() to wake up when reconnect_run() should
615 * be called on 'fsm'. */
617 reconnect_wait(struct reconnect
*fsm
, long long int now
)
619 int timeout
= reconnect_timeout(fsm
, now
);
621 poll_timer_wait(timeout
);
625 /* Returns the number of milliseconds after which reconnect_run() should be
626 * called on 'fsm' if nothing else notable happens in the meantime, or a
627 * negative number if this is currently unnecessary. */
629 reconnect_timeout(struct reconnect
*fsm
, long long int now
)
631 long long int deadline
= reconnect_deadline__(fsm
);
632 if (deadline
!= LLONG_MAX
) {
633 long long int remaining
= deadline
- now
;
634 return MAX(0, MIN(INT_MAX
, remaining
));
639 /* Returns true if 'fsm' is currently believed to be connected, that is, if
640 * reconnect_connected() was called more recently than any call to
641 * reconnect_connect_failed() or reconnect_disconnected() or
642 * reconnect_disable(), and false otherwise. */
644 reconnect_is_connected(const struct reconnect
*fsm
)
646 return is_connected_state(fsm
->state
);
649 /* Returns the number of milliseconds since 'fsm' last successfully connected
650 * to its peer (even if it has since disconnected). Returns UINT_MAX if never
653 reconnect_get_last_connect_elapsed(const struct reconnect
*fsm
,
656 return fsm
->last_connected
== LLONG_MAX
? UINT_MAX
657 : now
- fsm
->last_connected
;
660 /* Returns the number of milliseconds since 'fsm' last disconnected
661 * from its peer (even if it has since reconnected). Returns UINT_MAX if never
664 reconnect_get_last_disconnect_elapsed(const struct reconnect
*fsm
,
667 return fsm
->last_disconnected
== LLONG_MAX
? UINT_MAX
668 : now
- fsm
->last_disconnected
;
671 /* Copies various statistics for 'fsm' into '*stats'. */
673 reconnect_get_stats(const struct reconnect
*fsm
, long long int now
,
674 struct reconnect_stats
*stats
)
676 stats
->creation_time
= fsm
->creation_time
;
677 stats
->last_activity
= fsm
->last_activity
;
678 stats
->last_connected
= fsm
->last_connected
;
679 stats
->last_disconnected
= fsm
->last_disconnected
;
680 stats
->backoff
= fsm
->backoff
;
681 stats
->seqno
= fsm
->seqno
;
682 stats
->is_connected
= reconnect_is_connected(fsm
);
683 stats
->msec_since_connect
684 = reconnect_get_last_connect_elapsed(fsm
, now
);
685 stats
->msec_since_disconnect
686 = reconnect_get_last_disconnect_elapsed(fsm
, now
);
687 stats
->total_connected_duration
= fsm
->total_connected_duration
688 + (is_connected_state(fsm
->state
)
689 ? reconnect_get_last_connect_elapsed(fsm
, now
) : 0);
690 stats
->n_attempted_connections
= fsm
->n_attempted_connections
;
691 stats
->n_successful_connections
= fsm
->n_successful_connections
;
692 stats
->state
= reconnect_state_name__(fsm
->state
);
693 stats
->state_elapsed
= now
- fsm
->state_entered
;
697 reconnect_may_retry(struct reconnect
*fsm
)
699 bool may_retry
= fsm
->max_tries
> 0;
700 if (may_retry
&& fsm
->max_tries
!= UINT_MAX
) {