2 * Copyright (c) 2008, 2009, 2010, 2012 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 #include "reconnect.h"
22 #include "poll-loop.h"
25 VLOG_DEFINE_THIS_MODULE(reconnect);
/* Connection FSM states.  Every state visible here is a distinct bit, which
 * lets code test membership in a set of states with a single mask, as in
 * (state & (S_ACTIVE | S_IDLE)).
 *
 * NOTE(review): S_VOID and S_IDLE are used elsewhere in this file, but their
 * entries are not visible in this excerpt -- confirm their values. */
29 STATE(BACKOFF, 1 << 1) \
30 STATE(CONNECTING, 1 << 3) \
31 STATE(ACTIVE, 1 << 4) \
33 STATE(RECONNECT, 1 << 6) \
34 STATE(LISTENING, 1 << 7)
/* Expands each STATE(NAME, VALUE) entry to the enumerator S_<NAME> = <VALUE>. */
36 #define STATE(NAME, VALUE) S_##NAME = VALUE,
42 is_connected_state(enum state state)
44 return (state & (S_ACTIVE | S_IDLE)) != 0;
54 enum vlog_level info; /* Used for informational messages. */
58 long long int state_entered; /* 'now' at the most recent state transition. */
60 long long int last_activity; /* 'now' at the last reconnect_activity() call (creation time initially). */
61 long long int last_connected; /* 'now' at the last reconnect_connected(); LLONG_MAX if never connected. */
62 long long int last_disconnected; /* 'now' when an established connection last dropped; LLONG_MAX if never. */
63 unsigned int max_tries; /* Remaining connection attempts; UINT_MAX means unlimited. */
65 /* These values are simply for statistics reporting, not otherwise used
66 * directly by anything internal. */
67 long long int creation_time; /* 'now' passed to reconnect_create(). */
68 unsigned int n_attempted_connections, n_successful_connections;
69 unsigned int total_connected_duration; /* Sum of completed connected periods (now - last_connected at each drop). */
/* Forward declarations of file-local helpers that are used before their
 * definitions. */
73 static void reconnect_transition__(struct reconnect *, long long int now,
75 static long long int reconnect_deadline__(const struct reconnect *);
76 static bool reconnect_may_retry(struct reconnect *);
/* Returns the name of 'state' as a string (e.g. "ACTIVE"), as used in debug
 * logging and in the statistics report.
 *
 * NOTE(review): the enclosing switch and whatever is returned for an
 * unrecognized state are not visible in this excerpt. */
79 reconnect_state_name__(enum state state)
/* Makes each state entry expand to a 'case' that returns the stringified
 * NAME. */
82 #define STATE(NAME, VALUE) case S_##NAME: return #NAME;
89 /* Creates and returns a new reconnect FSM with default settings. The FSM is
90 * initially disabled. The caller will likely want to call reconnect_enable()
91 * and reconnect_set_name() on the returned object. */
93 reconnect_create(long long int now)
95 struct reconnect *fsm = xzalloc(sizeof *fsm);
/* Configuration defaults. */
97 fsm->name = xstrdup("void");
98 fsm->min_backoff = RECONNECT_DEFAULT_MIN_BACKOFF;
99 fsm->max_backoff = RECONNECT_DEFAULT_MAX_BACKOFF;
100 fsm->probe_interval = RECONNECT_DEFAULT_PROBE_INTERVAL;
101 fsm->passive = false;
102 fsm->info = VLL_INFO;
/* Timestamps start at 'now'.  LLONG_MAX is the "never happened" sentinel
 * that the *_elapsed() accessors translate to UINT_MAX.
 *
 * NOTE(review): some initializations and the return statement are not
 * visible in this excerpt. */
105 fsm->state_entered = now;
107 fsm->last_activity = now;
108 fsm->last_connected = LLONG_MAX;
109 fsm->last_disconnected = LLONG_MAX;
110 fsm->max_tries = UINT_MAX; /* Unlimited connection attempts. */
111 fsm->creation_time = now;
/* Destroys 'fsm'.
 *
 * NOTE(review): the body is not visible in this excerpt.  Presumably it
 * frees fsm->name (allocated with xstrdup()) and 'fsm' itself -- confirm. */
118 reconnect_destroy(struct reconnect *fsm)
126 /* If 'quiet' is true, 'fsm' will log informational messages at level VLL_DBG,
127 * by default keeping them out of log files. This is appropriate if the
128 * connection is one that is expected to be short-lived, so that the log
129 * messages are merely distracting.
131 * If 'quiet' is false, 'fsm' logs informational messages at level VLL_INFO.
132 * This is the default.
134 * This setting has no effect on the log level of debugging, warning, or error
137 reconnect_set_quiet(struct reconnect *fsm, bool quiet)
139 fsm->info = quiet ? VLL_DBG : VLL_INFO;
142 /* Returns 'fsm''s name: the string given to reconnect_set_name(), or "void"
 * if none was ever set.
 *
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * returns fsm->name -- confirm. */
144 reconnect_get_name(const struct reconnect *fsm)
149 /* Sets 'fsm''s name to a copy of 'name'.  If 'name' is null, then "void" is
 * used instead.
 *
 * The name set for 'fsm' is used in log messages. */
154 reconnect_set_name(struct reconnect *fsm, const char *name)
/* NOTE(review): the previous fsm->name must be released before it is
 * overwritten here or it leaks; the line that would do so is not visible in
 * this excerpt -- confirm. */
157 fsm->name = xstrdup(name ? name : "void");
160 /* Return the minimum number of milliseconds to back off between consecutive
161 * connection attempts. The default is RECONNECT_DEFAULT_MIN_BACKOFF. */
163 reconnect_get_min_backoff(const struct reconnect *fsm)
165 return fsm->min_backoff;
168 /* Return the maximum number of milliseconds to back off between consecutive
169 * connection attempts. The default is RECONNECT_DEFAULT_MAX_BACKOFF. */
171 reconnect_get_max_backoff(const struct reconnect *fsm)
173 return fsm->max_backoff;
176 /* Returns the "probe interval" for 'fsm' in milliseconds. If this is zero, it
177 * disables the connection keepalive feature. If it is nonzero, then if the
178 * interval passes while 'fsm' is connected and without reconnect_activity()
179 * being called for 'fsm', reconnect_run() returns RECONNECT_PROBE. If the
180 * interval passes again without reconnect_activity() being called,
181 * reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'. */
183 reconnect_get_probe_interval(const struct reconnect *fsm)
185 return fsm->probe_interval;
188 /* Limits the maximum number of times that 'fsm' will ask the client to try to
189 * reconnect to 'max_tries'. UINT_MAX (the default) means an unlimited number
192 * After the number of tries has expired, the 'fsm' will disable itself
193 * instead of backing off and retrying. */
195 reconnect_set_max_tries(struct reconnect *fsm, unsigned int max_tries)
197 fsm->max_tries = max_tries;
200 /* Returns the current remaining number of connection attempts, UINT_MAX if
201 * the number is unlimited. */
203 reconnect_get_max_tries(struct reconnect *fsm)
205 return fsm->max_tries;
208 /* Configures the backoff parameters for 'fsm'. 'min_backoff' is the minimum
209 * number of milliseconds, and 'max_backoff' is the maximum, between connection
212 * 'min_backoff' must be at least 1000, and 'max_backoff' must be greater than
213 * or equal to 'min_backoff'.
215 * Pass 0 for 'min_backoff' or 'max_backoff' or both to use the defaults. */
217 reconnect_set_backoff(struct reconnect *fsm, int min_backoff, int max_backoff)
219 fsm->min_backoff = MAX(min_backoff, 1000);
220 fsm->max_backoff = (max_backoff
221 ? MAX(max_backoff, 1000)
222 : RECONNECT_DEFAULT_MAX_BACKOFF);
223 if (fsm->min_backoff > fsm->max_backoff) {
224 fsm->max_backoff = fsm->min_backoff;
227 if (fsm->state == S_BACKOFF && fsm->backoff > max_backoff) {
228 fsm->backoff = max_backoff;
232 /* Sets the "probe interval" for 'fsm' to 'probe_interval', in milliseconds.
233 * If this is zero, it disables the connection keepalive feature. If it is
234 * nonzero, then if the interval passes while 'fsm' is connected and without
235 * reconnect_activity() being called for 'fsm', reconnect_run() returns
236 * RECONNECT_PROBE. If the interval passes again without reconnect_activity()
237 * being called, reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'.
239 * If 'probe_interval' is nonzero, then it will be forced to a value of at
242 reconnect_set_probe_interval(struct reconnect *fsm, int probe_interval)
244 fsm->probe_interval = probe_interval ? MAX(1000, probe_interval) : 0;
247 /* Returns true if 'fsm' is in passive mode, false if 'fsm' is in active mode
 * (the default). */
/* NOTE(review): the body of this accessor is not visible in this excerpt;
 * presumably it returns fsm->passive, the flag maintained by
 * reconnect_set_passive() -- confirm. */
250 reconnect_is_passive(const struct reconnect *fsm)
255 /* Configures 'fsm' for active or passive mode. In active mode (the default),
256 * the FSM is attempting to connect to a remote host. In passive mode, the FSM
257 * is listening for connections from a remote host. */
259 reconnect_set_passive(struct reconnect *fsm, bool passive, long long int now)
/* Nothing to do unless the mode actually changes. */
261 if (fsm->passive != passive) {
262 fsm->passive = passive;
/* A state that only makes sense in the old mode is abandoned in favor of
 * S_BACKOFF.
 *
 * NOTE(review): the head of the condition below (presumably
 * "if (passive") and at least one statement after the transition are not
 * visible in this excerpt -- confirm. */
265 ? fsm->state & (S_CONNECTING | S_RECONNECT)
266 : fsm->state == S_LISTENING && reconnect_may_retry(fsm)) {
267 reconnect_transition__(fsm, now, S_BACKOFF);
273 /* Returns true if 'fsm' has been enabled with reconnect_enable(). Calling
274 * another function that indicates a change in connection state, such as
275 * reconnect_disconnected() or reconnect_force_reconnect(), will also enable
276 * a reconnect FSM. */
278 reconnect_is_enabled(const struct reconnect *fsm)
280 return fsm->state != S_VOID;
283 /* If 'fsm' is disabled (the default for newly created FSMs), enables it, so
 * that the next call to reconnect_run() for 'fsm' will return
 * RECONNECT_CONNECT.
 *
 * If 'fsm' is not disabled, this function has no effect. */
289 reconnect_enable(struct reconnect *fsm, long long int now)
/* Enabling consumes one try via reconnect_may_retry(); an FSM with no tries
 * left stays disabled.
 *
 * NOTE(review): one line inside this 'if', after the transition, is not
 * visible in this excerpt. */
291 if (fsm->state == S_VOID && reconnect_may_retry(fsm)) {
292 reconnect_transition__(fsm, now, S_BACKOFF);
297 /* Disables 'fsm'. Until 'fsm' is enabled again, reconnect_run() will always
300 reconnect_disable(struct reconnect *fsm, long long int now)
302 if (fsm->state != S_VOID) {
303 reconnect_transition__(fsm, now, S_VOID);
307 /* If 'fsm' is enabled and currently connected (or attempting to connect),
308 * forces reconnect_run() for 'fsm' to return RECONNECT_DISCONNECT the next
309 * time it is called, which should cause the client to drop the connection (or
310 * attempt), back off, and then reconnect. */
312 reconnect_force_reconnect(struct reconnect *fsm, long long int now)
314 if (fsm->state & (S_CONNECTING | S_ACTIVE | S_IDLE)) {
315 reconnect_transition__(fsm, now, S_RECONNECT);
319 /* Tell 'fsm' that the connection dropped or that a connection attempt failed.
320 * 'error' specifies the reason: a positive value represents an errno value,
321 * EOF indicates that the connection was closed by the peer (e.g. read()
322 * returned 0), and 0 indicates no specific error.
324 * The FSM will back off, then reconnect. */
326 reconnect_disconnected(struct reconnect *fsm, long long int now, int error)
/* An FSM that is already backing off or disabled has nothing to drop.
 *
 * NOTE(review): several conditionals in this function (the "error > 0"
 * tests and the 'else' arms that pick between the messages below) are not
 * visible in this excerpt. */
328 if (!(fsm->state & (S_BACKOFF | S_VOID))) {
329 /* Report what happened. */
330 if (fsm->state & (S_ACTIVE | S_IDLE)) {
332 VLOG_WARN("%s: connection dropped (%s)",
333 fsm->name, strerror(error));
334 } else if (error == EOF) {
335 VLOG(fsm->info, "%s: connection closed by peer", fsm->name);
337 VLOG(fsm->info, "%s: connection dropped", fsm->name);
339 } else if (fsm->state == S_LISTENING) {
341 VLOG_WARN("%s: error listening for connections (%s)",
342 fsm->name, strerror(error));
344 VLOG(fsm->info, "%s: error listening for connections",
348 const char *type = fsm->passive ? "listen" : "connection";
350 VLOG_WARN("%s: %s attempt failed (%s)",
351 fsm->name, type, strerror(error));
353 VLOG(fsm->info, "%s: %s attempt timed out", fsm->name, type);
/* Record the drop time only if a connection had actually been
 * established. */
357 if (fsm->state & (S_ACTIVE | S_IDLE)) {
358 fsm->last_disconnected = now;
/* A connection that was established and then saw activity for at least the
 * current backoff period restarts the backoff from its initial value (0 in
 * passive mode, min_backoff otherwise).
 *
 * NOTE(review): the remainder of this condition and the alternative
 * branch are not visible in this excerpt. */
361 if (fsm->state & (S_ACTIVE | S_IDLE)
362 && (fsm->last_activity - fsm->last_connected >= fsm->backoff
364 fsm->backoff = fsm->passive ? 0 : fsm->min_backoff;
/* Keep the backoff at or above min_backoff, and jump straight to
 * max_backoff once it has reached at least half of max_backoff. */
366 if (fsm->backoff < fsm->min_backoff) {
367 fsm->backoff = fsm->min_backoff;
368 } else if (fsm->backoff >= fsm->max_backoff / 2) {
369 fsm->backoff = fsm->max_backoff;
374 VLOG(fsm->info, "%s: waiting %.3g seconds before trying to "
375 "listen again", fsm->name, fsm->backoff / 1000.0);
377 VLOG(fsm->info, "%s: waiting %.3g seconds before reconnect",
378 fsm->name, fsm->backoff / 1000.0);
/* Back off if another try is allowed; otherwise disable the FSM. */
382 reconnect_transition__(fsm, now,
383 reconnect_may_retry(fsm) ? S_BACKOFF : S_VOID);
387 /* Tell 'fsm' that a connection or listening attempt is in progress.
389 * The FSM will start a timer, after which the connection or listening attempt
390 * will be aborted (by returning RECONNECT_DISCONNECT from
391 * reconnect_run()). */
393 reconnect_connecting(struct reconnect *fsm, long long int now)
/* Log and transition only on entry; a repeated call while already in
 * S_CONNECTING does nothing. */
395 if (fsm->state != S_CONNECTING) {
/* NOTE(review): the condition that chooses between the two messages below
 * is not visible in this excerpt; presumably it tests fsm->passive --
 * confirm. */
397 VLOG(fsm->info, "%s: listening...", fsm->name);
399 VLOG(fsm->info, "%s: connecting...", fsm->name);
401 reconnect_transition__(fsm, now, S_CONNECTING);
405 /* Tell 'fsm' that the client is listening for connection attempts. This state
406 * last indefinitely until the client reports some change.
408 * The natural progression from this state is for the client to report that a
409 * connection has been accepted or is in progress of being accepted, by calling
410 * reconnect_connecting() or reconnect_connected().
412 * The client may also report that listening failed (e.g. accept() returned an
413 * unexpected error such as ENOMEM) by calling reconnect_listen_error(), in
414 * which case the FSM will back off and eventually return RECONNECT_CONNECT
415 * from reconnect_run() to tell the client to try listening again. */
417 reconnect_listening(struct reconnect *fsm, long long int now)
419 if (fsm->state != S_LISTENING) {
420 VLOG(fsm->info, "%s: listening...", fsm->name);
421 reconnect_transition__(fsm, now, S_LISTENING);
425 /* Tell 'fsm' that the client's attempt to accept a connection failed
426 * (e.g. accept() returned an unexpected error such as ENOMEM).
428 * If the FSM is currently listening (reconnect_listening() was called), it
429 * will back off and eventually return RECONNECT_CONNECT from reconnect_run()
430 * to tell the client to try listening again. If there is an active
431 * connection, this will be delayed until that connection drops. */
433 reconnect_listen_error(struct reconnect *fsm, long long int now, int error)
435 if (fsm->state == S_LISTENING) {
436 reconnect_disconnected(fsm, now, error);
440 /* Tell 'fsm' that the connection was successful.
442 * The FSM will start the probe interval timer, which is reset by
443 * reconnect_activity(). If the timer expires, a probe will be sent (by
444 * returning RECONNECT_PROBE from reconnect_run()). If the timer expires
445 * again without being reset, the connection will be aborted (by returning
446 * RECONNECT_DISCONNECT from reconnect_run()). */
448 reconnect_connected(struct reconnect *fsm, long long int now)
450 if (!is_connected_state(fsm->state)) {
451 reconnect_connecting(fsm, now);
453 VLOG(fsm->info, "%s: connected", fsm->name);
454 reconnect_transition__(fsm, now, S_ACTIVE);
455 fsm->last_connected = now;
/* Tells 'fsm' that the connection attempt failed, with 'error' giving the
 * reason in the form that reconnect_disconnected() accepts.
 *
 * The FSM will back off and then attempt to reconnect. */
void
reconnect_connect_failed(struct reconnect *fsm, long long int now, int error)
{
    /* Enter S_CONNECTING (a no-op if already there) before reporting the
     * failure. */
    reconnect_connecting(fsm, now);
    reconnect_disconnected(fsm, now, error);
}
469 /* Tell 'fsm' that some activity has occurred on the connection. This resets
470 * the probe interval timer, so that the connection is known not to be idle. */
472 reconnect_activity(struct reconnect *fsm, long long int now)
474 if (fsm->state != S_ACTIVE) {
475 reconnect_transition__(fsm, now, S_ACTIVE);
477 fsm->last_activity = now;
/* Moves 'fsm' into a new state at time 'now', updating the connection
 * statistics on the way: leaving S_CONNECTING counts as one attempted
 * connection, and as a successful one when the new state is S_ACTIVE.
 *
 * NOTE(review): the second line of the parameter list and several
 * statements (including the assignment of the new state) are not visible in
 * this excerpt. */
481 reconnect_transition__(struct reconnect *fsm, long long int now,
484 if (fsm->state == S_CONNECTING) {
485 fsm->n_attempted_connections++;
486 if (state == S_ACTIVE) {
487 fsm->n_successful_connections++;
/* Crossing between "connected" and "not connected": when leaving a
 * connected state, add the length of the session that just ended to the
 * running total. */
490 if (is_connected_state(fsm->state) != is_connected_state(state)) {
491 if (is_connected_state(fsm->state)) {
492 fsm->total_connected_duration += now - fsm->last_connected;
497 VLOG_DBG("%s: entering %s", fsm->name, reconnect_state_name__(state));
499 fsm->state_entered = now;
/* Returns the time at or after which reconnect_run() has something to do
 * for 'fsm' in its current state.
 *
 * NOTE(review): the 'case' labels and some return statements of the switch
 * are not visible in this excerpt, so which formula belongs to which state
 * cannot be confirmed from here. */
503 reconnect_deadline__(const struct reconnect *fsm)
505 ovs_assert(fsm->state_entered != LLONG_MIN);
506 switch (fsm->state) {
512 return fsm->state_entered + fsm->backoff;
/* Here the wait is never shorter than 1000 ms, even for a small backoff. */
515 return fsm->state_entered + MAX(1000, fsm->backoff);
/* With probing enabled, the timer runs from the later of the last
 * activity and the time the state was entered. */
518 if (fsm->probe_interval) {
519 long long int base = MAX(fsm->last_activity, fsm->state_entered);
520 return base + fsm->probe_interval;
525 if (fsm->probe_interval) {
526 return fsm->state_entered + fsm->probe_interval;
/* A deadline equal to the entry time is already due on the next run. */
531 return fsm->state_entered;
537 /* Assesses whether any action should be taken on 'fsm'. The return value is
 * one of:
540 * - 0: The client need not take any action.
542 * - Active client, RECONNECT_CONNECT: The client should start a connection
543 * attempt and indicate this by calling reconnect_connecting(). If the
544 * connection attempt has definitely succeeded, it should call
545 * reconnect_connected(). If the connection attempt has definitely
546 * failed, it should call reconnect_connect_failed().
548 * The FSM is smart enough to back off correctly after successful
549 * connections that quickly abort, so it is OK to call
550 * reconnect_connected() after a low-level successful connection
551 * (e.g. connect()) even if the connection might soon abort due to a
552 * failure at a high-level (e.g. SSL negotiation failure).
554 * - Passive client, RECONNECT_CONNECT: The client should try to listen for
555 * a connection, if it is not already listening. It should call
556 * reconnect_listening() if successful, otherwise reconnect_connecting()
557 * or reconnect_connect_failed() if the attempt is in progress or
558 * definitely failed, respectively.
560 * A listening passive client should constantly attempt to accept a new
561 * connection and report an accepted connection with
562 * reconnect_connected().
564 * - RECONNECT_DISCONNECT: The client should abort the current connection
565 * or connection attempt or listen attempt and call
566 * reconnect_disconnected() or reconnect_connect_failed() to indicate it.
568 * - RECONNECT_PROBE: The client should send some kind of request to the
569 * peer that will elicit a response, to ensure that the connection is
570 * indeed in working order. (This will only be returned if the "probe
571 * interval" is nonzero--see reconnect_set_probe_interval()). */
573 enum reconnect_action
574 reconnect_run(struct reconnect *fsm, long long int now)
/* Nothing is done until the deadline of the current state has passed.
 *
 * NOTE(review): the 'case' labels of the switch below and the function's
 * final return are not visible in this excerpt, so the mapping from state
 * to action cannot be confirmed from here. */
576 if (now >= reconnect_deadline__(fsm)) {
577 switch (fsm->state) {
582 return RECONNECT_CONNECT;
585 return RECONNECT_DISCONNECT;
/* Probe interval expired: log how long the connection has been quiet,
 * move to S_IDLE, and ask the client to send a probe. */
588 VLOG_DBG("%s: idle %lld ms, sending inactivity probe", fsm->name,
589 now - MAX(fsm->last_activity, fsm->state_entered));
590 reconnect_transition__(fsm, now, S_IDLE);
591 return RECONNECT_PROBE;
/* The time spent in the current state is reported as the time the probe
 * went unanswered. */
594 VLOG_ERR("%s: no response to inactivity probe after %.3g "
595 "seconds, disconnecting",
596 fsm->name, (now - fsm->state_entered) / 1000.0);
597 return RECONNECT_DISCONNECT;
600 return RECONNECT_DISCONNECT;
612 /* Causes the next call to poll_block() to wake up when reconnect_run() should
613 * be called on 'fsm'. */
615 reconnect_wait(struct reconnect *fsm, long long int now)
617 int timeout = reconnect_timeout(fsm, now);
/* NOTE(review): reconnect_timeout() is documented to return a negative
 * number when no wakeup is needed; the guard that presumably skips the call
 * below in that case is not visible in this excerpt -- confirm. */
619 poll_timer_wait(timeout);
623 /* Returns the number of milliseconds after which reconnect_run() should be
624 * called on 'fsm' if nothing else notable happens in the meantime, or a
625 * negative number if this is currently unnecessary. */
627 reconnect_timeout(struct reconnect *fsm, long long int now)
629 long long int deadline = reconnect_deadline__(fsm);
/* A deadline of LLONG_MAX is treated as "no deadline".
 *
 * NOTE(review): the return for that case is not visible in this excerpt. */
630 if (deadline != LLONG_MAX) {
631 long long int remaining = deadline - now;
/* Clamp to [0, INT_MAX]: an overdue deadline yields 0, and a very distant
 * one still fits in an int. */
632 return MAX(0, MIN(INT_MAX, remaining));
637 /* Returns true if 'fsm' is currently believed to be connected, that is, if
638 * reconnect_connected() was called more recently than any call to
639 * reconnect_connect_failed() or reconnect_disconnected() or
640 * reconnect_disable(), and false otherwise. */
642 reconnect_is_connected(const struct reconnect *fsm)
644 return is_connected_state(fsm->state);
647 /* Returns the number of milliseconds since 'fsm' last successfully connected
648 * to its peer (even if it has since disconnected). Returns UINT_MAX if never
651 reconnect_get_last_connect_elapsed(const struct reconnect *fsm,
654 return fsm->last_connected == LLONG_MAX ? UINT_MAX
655 : now - fsm->last_connected;
658 /* Returns the number of milliseconds since 'fsm' last disconnected
659 * from its peer (even if it has since reconnected). Returns UINT_MAX if never
662 reconnect_get_last_disconnect_elapsed(const struct reconnect *fsm,
665 return fsm->last_disconnected == LLONG_MAX ? UINT_MAX
666 : now - fsm->last_disconnected;
669 /* Copies various statistics for 'fsm' into '*stats'. */
671 reconnect_get_stats(const struct reconnect *fsm, long long int now,
672 struct reconnect_stats *stats)
674 stats->creation_time = fsm->creation_time;
675 stats->last_activity = fsm->last_activity;
676 stats->last_connected = fsm->last_connected;
677 stats->last_disconnected = fsm->last_disconnected;
678 stats->backoff = fsm->backoff;
679 stats->seqno = fsm->seqno;
680 stats->is_connected = reconnect_is_connected(fsm);
681 stats->msec_since_connect
682 = reconnect_get_last_connect_elapsed(fsm, now);
683 stats->msec_since_disconnect
684 = reconnect_get_last_disconnect_elapsed(fsm, now);
685 stats->total_connected_duration = fsm->total_connected_duration
686 + (is_connected_state(fsm->state)
687 ? reconnect_get_last_connect_elapsed(fsm, now) : 0);
688 stats->n_attempted_connections = fsm->n_attempted_connections;
689 stats->n_successful_connections = fsm->n_successful_connections;
690 stats->state = reconnect_state_name__(fsm->state);
691 stats->state_elapsed = now - fsm->state_entered;
695 reconnect_may_retry(struct reconnect *fsm)
697 bool may_retry = fsm->max_tries > 0;
698 if (may_retry && fsm->max_tries != UINT_MAX) {