|
@@ -399,8 +399,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
|
|
mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
|
|
mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
|
|
mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
|
|
mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
|
|
|
|
|
|
- /* we won't reconnect after our valid conn goes away for
|
|
|
|
- * this hb iteration.. here so it shows up in the logs */
|
|
|
|
if (was_valid && !valid && err == 0)
|
|
if (was_valid && !valid && err == 0)
|
|
err = -ENOTCONN;
|
|
err = -ENOTCONN;
|
|
|
|
|
|
@@ -430,11 +428,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
|
|
|
|
|
|
if (!was_valid && valid) {
|
|
if (!was_valid && valid) {
|
|
o2quo_conn_up(o2net_num_from_nn(nn));
|
|
o2quo_conn_up(o2net_num_from_nn(nn));
|
|
- /* this is a bit of a hack. we only try reconnecting
|
|
|
|
- * when heartbeating starts until we get a connection.
|
|
|
|
- * if that connection then dies we don't try reconnecting.
|
|
|
|
- * the only way to start connecting again is to down
|
|
|
|
- * heartbeat and bring it back up. */
|
|
|
|
cancel_delayed_work(&nn->nn_connect_expired);
|
|
cancel_delayed_work(&nn->nn_connect_expired);
|
|
printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
|
|
printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
|
|
o2nm_this_node() > sc->sc_node->nd_num ?
|
|
o2nm_this_node() > sc->sc_node->nd_num ?
|
|
@@ -457,6 +450,18 @@ static void o2net_set_nn_state(struct o2net_node *nn,
|
|
delay = 0;
|
|
delay = 0;
|
|
mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
|
|
mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
|
|
queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
|
|
queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
|
|
|
|
+
|
|
|
|
+ /*
|
|
|
|
+ * Delay the expired work after idle timeout.
|
|
|
|
+ *
|
|
|
|
+ * We might have lots of failed connection attempts that run
|
|
|
|
+ * through here but we only cancel the connect_expired work when
|
|
|
|
+ * a connection attempt succeeds. So only the first enqueue of
|
|
|
|
+ * the connect_expired work will do anything. The rest will see
|
|
|
|
+ * that it's already queued and do nothing.
|
|
|
|
+ */
|
|
|
|
+ delay += msecs_to_jiffies(o2net_idle_timeout(NULL));
|
|
|
|
+ queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
|
|
}
|
|
}
|
|
|
|
|
|
/* keep track of the nn's sc ref for the caller */
|
|
/* keep track of the nn's sc ref for the caller */
|
|
@@ -1193,6 +1198,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
|
|
* shut down already */
|
|
* shut down already */
|
|
if (nn->nn_sc == sc) {
|
|
if (nn->nn_sc == sc) {
|
|
o2net_sc_reset_idle_timer(sc);
|
|
o2net_sc_reset_idle_timer(sc);
|
|
|
|
+ atomic_set(&nn->nn_timeout, 0);
|
|
o2net_set_nn_state(nn, sc, 1, 0);
|
|
o2net_set_nn_state(nn, sc, 1, 0);
|
|
}
|
|
}
|
|
spin_unlock(&nn->nn_lock);
|
|
spin_unlock(&nn->nn_lock);
|
|
@@ -1391,6 +1397,7 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
|
|
static void o2net_idle_timer(unsigned long data)
|
|
static void o2net_idle_timer(unsigned long data)
|
|
{
|
|
{
|
|
struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
|
|
struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
|
|
|
|
+ struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
|
|
struct timeval now;
|
|
struct timeval now;
|
|
|
|
|
|
do_gettimeofday(&now);
|
|
do_gettimeofday(&now);
|
|
@@ -1413,6 +1420,12 @@ static void o2net_idle_timer(unsigned long data)
|
|
sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
|
|
sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
|
|
sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
|
|
sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
|
|
|
|
|
|
|
|
+ /*
|
|
|
|
+ * Initialize the nn_timeout so that the next connection attempt
|
|
|
|
+ * will continue in o2net_start_connect.
|
|
|
|
+ */
|
|
|
|
+ atomic_set(&nn->nn_timeout, 1);
|
|
|
|
+
|
|
o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
|
|
o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
|
|
}
|
|
}
|
|
|
|
|
|
@@ -1447,6 +1460,7 @@ static void o2net_start_connect(struct work_struct *work)
|
|
struct socket *sock = NULL;
|
|
struct socket *sock = NULL;
|
|
struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
|
|
struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
|
|
int ret = 0, stop;
|
|
int ret = 0, stop;
|
|
|
|
+ unsigned int timeout;
|
|
|
|
|
|
/* if we're greater we initiate tx, otherwise we accept */
|
|
/* if we're greater we initiate tx, otherwise we accept */
|
|
if (o2nm_this_node() <= o2net_num_from_nn(nn))
|
|
if (o2nm_this_node() <= o2net_num_from_nn(nn))
|
|
@@ -1466,8 +1480,17 @@ static void o2net_start_connect(struct work_struct *work)
|
|
}
|
|
}
|
|
|
|
|
|
spin_lock(&nn->nn_lock);
|
|
spin_lock(&nn->nn_lock);
|
|
- /* see if we already have one pending or have given up */
|
|
|
|
- stop = (nn->nn_sc || nn->nn_persistent_error);
|
|
|
|
|
|
+ /*
|
|
|
|
+ * see if we already have one pending or have given up.
|
|
|
|
+ * For nn_timeout, it is set when we close the connection
|
|
|
|
+ * because of the idle time out. So it means that we have
|
|
|
|
+ * at least connected to that node successfully once,
|
|
|
|
+ * now try to connect to it again.
|
|
|
|
+ */
|
|
|
|
+ timeout = atomic_read(&nn->nn_timeout);
|
|
|
|
+ stop = (nn->nn_sc ||
|
|
|
|
+ (nn->nn_persistent_error &&
|
|
|
|
+ (nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
|
|
spin_unlock(&nn->nn_lock);
|
|
spin_unlock(&nn->nn_lock);
|
|
if (stop)
|
|
if (stop)
|
|
goto out;
|
|
goto out;
|
|
@@ -1579,6 +1602,7 @@ void o2net_disconnect_node(struct o2nm_node *node)
|
|
|
|
|
|
/* don't reconnect until it's heartbeating again */
|
|
/* don't reconnect until it's heartbeating again */
|
|
spin_lock(&nn->nn_lock);
|
|
spin_lock(&nn->nn_lock);
|
|
|
|
+ atomic_set(&nn->nn_timeout, 0);
|
|
o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
|
|
o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
|
|
spin_unlock(&nn->nn_lock);
|
|
spin_unlock(&nn->nn_lock);
|
|
|
|
|
|
@@ -1613,17 +1637,12 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
|
|
(msecs_to_jiffies(o2net_reconnect_delay(node)) + 1);
|
|
(msecs_to_jiffies(o2net_reconnect_delay(node)) + 1);
|
|
|
|
|
|
if (node_num != o2nm_this_node()) {
|
|
if (node_num != o2nm_this_node()) {
|
|
- /* heartbeat doesn't work unless a local node number is
|
|
|
|
- * configured and doing so brings up the o2net_wq, so we can
|
|
|
|
- * use it.. */
|
|
|
|
- queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
|
|
|
|
- msecs_to_jiffies(o2net_idle_timeout(node)));
|
|
|
|
-
|
|
|
|
/* believe it or not, accept and node hearbeating testing
|
|
/* believe it or not, accept and node hearbeating testing
|
|
* can succeed for this node before we got here.. so
|
|
* can succeed for this node before we got here.. so
|
|
* only use set_nn_state to clear the persistent error
|
|
* only use set_nn_state to clear the persistent error
|
|
* if that hasn't already happened */
|
|
* if that hasn't already happened */
|
|
spin_lock(&nn->nn_lock);
|
|
spin_lock(&nn->nn_lock);
|
|
|
|
+ atomic_set(&nn->nn_timeout, 0);
|
|
if (nn->nn_persistent_error)
|
|
if (nn->nn_persistent_error)
|
|
o2net_set_nn_state(nn, NULL, 0, 0);
|
|
o2net_set_nn_state(nn, NULL, 0, 0);
|
|
spin_unlock(&nn->nn_lock);
|
|
spin_unlock(&nn->nn_lock);
|
|
@@ -1747,6 +1766,7 @@ static int o2net_accept_one(struct socket *sock)
|
|
new_sock = NULL;
|
|
new_sock = NULL;
|
|
|
|
|
|
spin_lock(&nn->nn_lock);
|
|
spin_lock(&nn->nn_lock);
|
|
|
|
+ atomic_set(&nn->nn_timeout, 0);
|
|
o2net_set_nn_state(nn, sc, 0, 0);
|
|
o2net_set_nn_state(nn, sc, 0, 0);
|
|
spin_unlock(&nn->nn_lock);
|
|
spin_unlock(&nn->nn_lock);
|
|
|
|
|
|
@@ -1941,6 +1961,7 @@ int o2net_init(void)
|
|
for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
|
|
for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
|
|
struct o2net_node *nn = o2net_nn_from_num(i);
|
|
struct o2net_node *nn = o2net_nn_from_num(i);
|
|
|
|
|
|
|
|
+ atomic_set(&nn->nn_timeout, 0);
|
|
spin_lock_init(&nn->nn_lock);
|
|
spin_lock_init(&nn->nn_lock);
|
|
INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
|
|
INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
|
|
INIT_DELAYED_WORK(&nn->nn_connect_expired,
|
|
INIT_DELAYED_WORK(&nn->nn_connect_expired,
|