aboutsummaryrefslogtreecommitdiff
path: root/fs/dlm/lowcomms.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/dlm/lowcomms.c')
-rw-r--r--fs/dlm/lowcomms.c533
1 files changed, 414 insertions, 119 deletions
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2c622..d08e079ea5d 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -52,7 +52,7 @@
#include <linux/mutex.h>
#include <linux/sctp.h>
#include <linux/slab.h>
-#include <net/sctp/user.h>
+#include <net/sctp/sctp.h>
#include <net/ipv6.h>
#include "dlm_internal.h"
@@ -63,6 +63,9 @@
#define NEEDED_RMEM (4*1024*1024)
#define CONN_HASH_SIZE 32
+/* Number of messages to send before rescheduling */
+#define MAX_SEND_MSG_COUNT 25
+
struct cbuf {
unsigned int base;
unsigned int len;
@@ -108,6 +111,7 @@ struct connection {
#define CF_INIT_PENDING 4
#define CF_IS_OTHERCON 5
#define CF_CLOSE 6
+#define CF_APP_LIMITED 7
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
int (*rx_action) (struct connection *); /* What to do when active */
@@ -121,6 +125,7 @@ struct connection {
struct connection *othercon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
+ bool try_new_addr;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -135,8 +140,20 @@ struct writequeue_entry {
struct connection *con;
};
+struct dlm_node_addr {
+ struct list_head list;
+ int nodeid;
+ int addr_count;
+ int curr_addr_index;
+ struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+};
+
+static LIST_HEAD(dlm_node_addrs);
+static DEFINE_SPINLOCK(dlm_node_addrs_spin);
+
static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
static int dlm_local_count;
+static int dlm_allow_conn;
/* Work queues */
static struct workqueue_struct *recv_workqueue;
@@ -161,12 +178,11 @@ static inline int nodeid_hash(int nodeid)
static struct connection *__find_con(int nodeid)
{
int r;
- struct hlist_node *h;
struct connection *con;
r = nodeid_hash(nodeid);
- hlist_for_each_entry(con, h, &connection_hash[r], list) {
+ hlist_for_each_entry(con, &connection_hash[r], list) {
if (con->nodeid == nodeid)
return con;
}
@@ -216,13 +232,12 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
static void foreach_conn(void (*conn_func)(struct connection *c))
{
int i;
- struct hlist_node *h, *n;
+ struct hlist_node *n;
struct connection *con;
for (i = 0; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry_safe(con, h, n, &connection_hash[i], list){
+ hlist_for_each_entry_safe(con, n, &connection_hash[i], list)
conn_func(con);
- }
}
}
@@ -241,13 +256,12 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
static struct connection *assoc2con(int assoc_id)
{
int i;
- struct hlist_node *h;
struct connection *con;
mutex_lock(&connections_lock);
for (i = 0 ; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry(con, h, &connection_hash[i], list) {
+ hlist_for_each_entry(con, &connection_hash[i], list) {
if (con->sctp_assoc == assoc_id) {
mutex_unlock(&connections_lock);
return con;
@@ -258,33 +272,159 @@ static struct connection *assoc2con(int assoc_id)
return NULL;
}
-static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
+static struct dlm_node_addr *find_node_addr(int nodeid)
+{
+ struct dlm_node_addr *na;
+
+ list_for_each_entry(na, &dlm_node_addrs, list) {
+ if (na->nodeid == nodeid)
+ return na;
+ }
+ return NULL;
+}
+
+static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
+{
+ switch (x->ss_family) {
+ case AF_INET: {
+ struct sockaddr_in *sinx = (struct sockaddr_in *)x;
+ struct sockaddr_in *siny = (struct sockaddr_in *)y;
+ if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+ return 0;
+ if (sinx->sin_port != siny->sin_port)
+ return 0;
+ break;
+ }
+ case AF_INET6: {
+ struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
+ struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
+ if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+ return 0;
+ if (sinx->sin6_port != siny->sin6_port)
+ return 0;
+ break;
+ }
+ default:
+ return 0;
+ }
+ return 1;
+}
+
+static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
+ struct sockaddr *sa_out, bool try_new_addr)
{
- struct sockaddr_storage addr;
- int error;
+ struct sockaddr_storage sas;
+ struct dlm_node_addr *na;
if (!dlm_local_count)
return -1;
- error = dlm_nodeid_to_addr(nodeid, &addr);
- if (error)
- return error;
+ spin_lock(&dlm_node_addrs_spin);
+ na = find_node_addr(nodeid);
+ if (na && na->addr_count) {
+ if (try_new_addr) {
+ na->curr_addr_index++;
+ if (na->curr_addr_index == na->addr_count)
+ na->curr_addr_index = 0;
+ }
+
+ memcpy(&sas, na->addr[na->curr_addr_index ],
+ sizeof(struct sockaddr_storage));
+ }
+ spin_unlock(&dlm_node_addrs_spin);
+
+ if (!na)
+ return -EEXIST;
+
+ if (!na->addr_count)
+ return -ENOENT;
+
+ if (sas_out)
+ memcpy(sas_out, &sas, sizeof(struct sockaddr_storage));
+
+ if (!sa_out)
+ return 0;
if (dlm_local_addr[0]->ss_family == AF_INET) {
- struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
- struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
+ struct sockaddr_in *in4 = (struct sockaddr_in *) &sas;
+ struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out;
ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
} else {
- struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
- struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
- ipv6_addr_copy(&ret6->sin6_addr, &in6->sin6_addr);
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &sas;
+ struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) sa_out;
+ ret6->sin6_addr = in6->sin6_addr;
+ }
+
+ return 0;
+}
+
+static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
+{
+ struct dlm_node_addr *na;
+ int rv = -EEXIST;
+ int addr_i;
+
+ spin_lock(&dlm_node_addrs_spin);
+ list_for_each_entry(na, &dlm_node_addrs, list) {
+ if (!na->addr_count)
+ continue;
+
+ for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
+ if (addr_compare(na->addr[addr_i], addr)) {
+ *nodeid = na->nodeid;
+ rv = 0;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ spin_unlock(&dlm_node_addrs_spin);
+ return rv;
+}
+
+int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
+{
+ struct sockaddr_storage *new_addr;
+ struct dlm_node_addr *new_node, *na;
+
+ new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS);
+ if (!new_node)
+ return -ENOMEM;
+
+ new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS);
+ if (!new_addr) {
+ kfree(new_node);
+ return -ENOMEM;
+ }
+
+ memcpy(new_addr, addr, len);
+
+ spin_lock(&dlm_node_addrs_spin);
+ na = find_node_addr(nodeid);
+ if (!na) {
+ new_node->nodeid = nodeid;
+ new_node->addr[0] = new_addr;
+ new_node->addr_count = 1;
+ list_add(&new_node->list, &dlm_node_addrs);
+ spin_unlock(&dlm_node_addrs_spin);
+ return 0;
+ }
+
+ if (na->addr_count >= DLM_MAX_ADDR_COUNT) {
+ spin_unlock(&dlm_node_addrs_spin);
+ kfree(new_addr);
+ kfree(new_node);
+ return -ENOSPC;
}
+ na->addr[na->addr_count++] = new_addr;
+ spin_unlock(&dlm_node_addrs_spin);
+ kfree(new_node);
return 0;
}
/* Data available on socket or listen socket received a connect */
-static void lowcomms_data_ready(struct sock *sk, int count_unused)
+static void lowcomms_data_ready(struct sock *sk)
{
struct connection *con = sock2con(sk);
if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
@@ -295,7 +435,17 @@ static void lowcomms_write_space(struct sock *sk)
{
struct connection *con = sock2con(sk);
- if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+ if (!con)
+ return;
+
+ clear_bit(SOCK_NOSPACE, &con->sock->flags);
+
+ if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
+ con->sock->sk->sk_write_pending--;
+ clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+ }
+
+ if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
@@ -332,7 +482,7 @@ int dlm_lowcomms_connect_node(int nodeid)
}
/* Make a socket active */
-static int add_sock(struct socket *sock, struct connection *con)
+static void add_sock(struct socket *sock, struct connection *con)
{
con->sock = sock;
@@ -342,7 +492,6 @@ static int add_sock(struct socket *sock, struct connection *con)
con->sock->sk->sk_state_change = lowcomms_state_change;
con->sock->sk->sk_user_data = con;
con->sock->sk->sk_allocation = GFP_NOFS;
- return 0;
}
/* Add the port number to an IPv6 or 4 sockaddr and return the address
@@ -424,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
static void sctp_init_failed_foreach(struct connection *con)
{
+
+ /*
+ * Don't try to recover base con and handle race where the
+ * other node's assoc init creates a assoc and we get that
+ * notification, then we get a notification that our attempt
+ * failed due. This happens when we are still trying the primary
+ * address, but the other node has already tried secondary addrs
+ * and found one that worked.
+ */
+ if (!con->nodeid || con->sctp_assoc)
+ return;
+
+ log_print("Retrying SCTP association init for node %d\n", con->nodeid);
+
+ con->try_new_addr = true;
con->sctp_assoc = 0;
- if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+ if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
@@ -442,15 +606,62 @@ static void sctp_init_failed(void)
mutex_unlock(&connections_lock);
}
+static void retry_failed_sctp_send(struct connection *recv_con,
+ struct sctp_send_failed *sn_send_failed,
+ char *buf)
+{
+ int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
+ struct dlm_mhandle *mh;
+ struct connection *con;
+ char *retry_buf;
+ int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
+
+ log_print("Retry sending %d bytes to node id %d", len, nodeid);
+
+ if (!nodeid) {
+ log_print("Shouldn't resend data via listening connection.");
+ return;
+ }
+
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ log_print("Could not look up con for nodeid %d\n",
+ nodeid);
+ return;
+ }
+
+ mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
+ if (!mh) {
+ log_print("Could not allocate buf for retry.");
+ return;
+ }
+ memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
+ dlm_lowcomms_commit_buffer(mh);
+
+ /*
+ * If we got a assoc changed event before the send failed event then
+ * we only need to retry the send.
+ */
+ if (con->sctp_assoc) {
+ if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+ queue_work(send_workqueue, &con->swork);
+ } else
+ sctp_init_failed_foreach(con);
+}
+
/* Something happened to an association */
static void process_sctp_notification(struct connection *con,
struct msghdr *msg, char *buf)
{
union sctp_notification *sn = (union sctp_notification *)buf;
+ struct linger linger;
- if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+ switch (sn->sn_header.sn_type) {
+ case SCTP_SEND_FAILED:
+ retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
+ break;
+ case SCTP_ASSOC_CHANGE:
switch (sn->sn_assoc_change.sac_state) {
-
case SCTP_COMM_UP:
case SCTP_RESTART:
{
@@ -460,9 +671,6 @@ static void process_sctp_notification(struct connection *con,
int prim_len, ret;
int addr_len;
struct connection *new_con;
- sctp_peeloff_arg_t parg;
- int parglen = sizeof(parg);
- int err;
/*
* We get this before any data for an association.
@@ -497,13 +705,11 @@ static void process_sctp_notification(struct connection *con,
return;
}
make_sockaddr(&prim.ssp_addr, 0, &addr_len);
- if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
- int i;
+ if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
unsigned char *b=(unsigned char *)&prim.ssp_addr;
log_print("reject connect from unknown addr");
- for (i=0; i<sizeof(struct sockaddr_storage);i++)
- printk("%02x ", b[i]);
- printk("\n");
+ print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
+ b, sizeof(struct sockaddr_storage));
sctp_send_shutdown(prim.ssp_assoc_id);
return;
}
@@ -513,30 +719,35 @@ static void process_sctp_notification(struct connection *con,
return;
/* Peel off a new sock */
- parg.associd = sn->sn_assoc_change.sac_assoc_id;
- ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
- SCTP_SOCKOPT_PEELOFF,
- (void *)&parg, &parglen);
+ lock_sock(con->sock->sk);
+ ret = sctp_do_peeloff(con->sock->sk,
+ sn->sn_assoc_change.sac_assoc_id,
+ &new_con->sock);
+ release_sock(con->sock->sk);
if (ret < 0) {
log_print("Can't peel off a socket for "
"connection %d to node %d: err=%d",
- parg.associd, nodeid, ret);
- return;
- }
- new_con->sock = sockfd_lookup(parg.sd, &err);
- if (!new_con->sock) {
- log_print("sockfd_lookup error %d", err);
+ (int)sn->sn_assoc_change.sac_assoc_id,
+ nodeid, ret);
return;
}
add_sock(new_con->sock, new_con);
- sockfd_put(new_con->sock);
+
+ linger.l_onoff = 1;
+ linger.l_linger = 0;
+ ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER,
+ (char *)&linger, sizeof(linger));
+ if (ret < 0)
+ log_print("set socket option SO_LINGER failed");
log_print("connecting to %d sctp association %d",
nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
+ new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
+ new_con->try_new_addr = false;
/* Send any pending writes */
clear_bit(CF_CONNECT_PENDING, &new_con->flags);
- clear_bit(CF_INIT_PENDING, &con->flags);
+ clear_bit(CF_INIT_PENDING, &new_con->flags);
if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
queue_work(send_workqueue, &new_con->swork);
}
@@ -555,14 +766,10 @@ static void process_sctp_notification(struct connection *con,
}
break;
- /* We don't know which INIT failed, so clear the PENDING flags
- * on them all. if assoc_id is zero then it will then try
- * again */
-
case SCTP_CANT_STR_ASSOC:
{
+ /* Will retry init when we get the send failed notification */
log_print("Can't start SCTP association - retrying");
- sctp_init_failed();
}
break;
@@ -571,6 +778,8 @@ static void process_sctp_notification(struct connection *con,
(int)sn->sn_assoc_change.sac_assoc_id,
sn->sn_assoc_change.sac_state);
}
+ default:
+ ; /* fall through */
}
}
@@ -704,6 +913,13 @@ static int tcp_accept_from_sock(struct connection *con)
struct connection *newcon;
struct connection *addcon;
+ mutex_lock(&connections_lock);
+ if (!dlm_allow_conn) {
+ mutex_unlock(&connections_lock);
+ return -1;
+ }
+ mutex_unlock(&connections_lock);
+
memset(&peeraddr, 0, sizeof(peeraddr));
result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
IPPROTO_TCP, &newsock);
@@ -733,8 +949,11 @@ static int tcp_accept_from_sock(struct connection *con)
/* Get the new node's NODEID */
make_sockaddr(&peeraddr, 0, &len);
- if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
+ if (addr_to_nodeid(&peeraddr, &nodeid)) {
+ unsigned char *b=(unsigned char *)&peeraddr;
log_print("connect from non cluster node");
+ print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
+ b, sizeof(struct sockaddr_storage));
sock_release(newsock);
mutex_unlock(&con->sock_mutex);
return -1;
@@ -796,7 +1015,7 @@ static int tcp_accept_from_sock(struct connection *con)
/*
* Add it to the active queue in case we got data
- * beween processing the accept adding the socket
+ * between processing the accept adding the socket
* to the read_sockets list
*/
if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
@@ -820,6 +1039,24 @@ static void free_entry(struct writequeue_entry *e)
kfree(e);
}
+/*
+ * writequeue_entry_complete - try to delete and free write queue entry
+ * @e: write queue entry to try to delete
+ * @completed: bytes completed
+ *
+ * writequeue_lock must be held.
+ */
+static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
+{
+ e->offset += completed;
+ e->len -= completed;
+
+ if (e->len == 0 && e->users == 0) {
+ list_del(&e->list);
+ free_entry(e);
+ }
+}
+
/* Initiate an SCTP association.
This is a special case of send_to_sock() in that we don't yet have a
peeled-off socket for this association, so we use the listening socket
@@ -839,15 +1076,14 @@ static void sctp_init_assoc(struct connection *con)
int addrlen;
struct kvec iov[1];
+ mutex_lock(&con->sock_mutex);
if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
- return;
-
- if (con->retries++ > MAX_CONNECT_RETRIES)
- return;
+ goto unlock;
- if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
+ if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
+ con->try_new_addr)) {
log_print("no address for nodeid %d", con->nodeid);
- return;
+ goto unlock;
}
base_con = nodeid2con(0, 0);
BUG_ON(base_con == NULL);
@@ -865,17 +1101,25 @@ static void sctp_init_assoc(struct connection *con)
if (list_empty(&con->writequeue)) {
spin_unlock(&con->writequeue_lock);
log_print("writequeue empty for nodeid %d", con->nodeid);
- return;
+ goto unlock;
}
e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
len = e->len;
offset = e->offset;
- spin_unlock(&con->writequeue_lock);
/* Send the first block off the write queue */
iov[0].iov_base = page_address(e->page)+offset;
iov[0].iov_len = len;
+ spin_unlock(&con->writequeue_lock);
+
+ if (rem_addr.ss_family == AF_INET) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
+ log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
+ } else {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
+ log_print("Trying to connect to %pI6", &sin6->sin6_addr);
+ }
cmsg = CMSG_FIRSTHDR(&outmessage);
cmsg->cmsg_level = IPPROTO_SCTP;
@@ -883,8 +1127,9 @@ static void sctp_init_assoc(struct connection *con)
cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
sinfo = CMSG_DATA(cmsg);
memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
- sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
+ sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
outmessage.msg_controllen = cmsg->cmsg_len;
+ sinfo->sinfo_flags |= SCTP_ADDR_OVER;
ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
if (ret < 0) {
@@ -897,24 +1142,22 @@ static void sctp_init_assoc(struct connection *con)
}
else {
spin_lock(&con->writequeue_lock);
- e->offset += ret;
- e->len -= ret;
-
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
- free_entry(e);
- }
+ writequeue_entry_complete(e, ret);
spin_unlock(&con->writequeue_lock);
}
+
+unlock:
+ mutex_unlock(&con->sock_mutex);
}
/* Connect a new socket to its peer */
static void tcp_connect_to_sock(struct connection *con)
{
- int result = -EHOSTUNREACH;
struct sockaddr_storage saddr, src_addr;
int addr_len;
struct socket *sock = NULL;
+ int one = 1;
+ int result;
if (con->nodeid == 0) {
log_print("attempt to connect sock 0 foiled");
@@ -926,10 +1169,8 @@ static void tcp_connect_to_sock(struct connection *con)
goto out;
/* Some odd races can cause double-connects, ignore them */
- if (con->sock) {
- result = 0;
+ if (con->sock)
goto out;
- }
/* Create a socket to communicate with */
result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
@@ -938,8 +1179,11 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
memset(&saddr, 0, sizeof(saddr));
- if (dlm_nodeid_to_addr(con->nodeid, &saddr))
+ result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
+ if (result < 0) {
+ log_print("no address for nodeid %d", con->nodeid);
goto out_err;
+ }
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
@@ -960,8 +1204,12 @@ static void tcp_connect_to_sock(struct connection *con)
make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
log_print("connecting to %d", con->nodeid);
- result =
- sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
+
+ /* Turn off Nagle's algorithm */
+ kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+ sizeof(one));
+
+ result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
O_NONBLOCK);
if (result == -EINPROGRESS)
result = 0;
@@ -979,11 +1227,17 @@ out_err:
* Some errors are fatal and this list might need adjusting. For other
* errors we try again until the max number of retries is reached.
*/
- if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
- result != -ENETDOWN && result != -EINVAL
- && result != -EPROTONOSUPPORT) {
+ if (result != -EHOSTUNREACH &&
+ result != -ENETUNREACH &&
+ result != -ENETDOWN &&
+ result != -EINVAL &&
+ result != -EPROTONOSUPPORT) {
+ log_print("connect %d try %d error %d", con->nodeid,
+ con->retries, result);
+ mutex_unlock(&con->sock_mutex);
+ msleep(1000);
lowcomms_connect_sock(con);
- result = 0;
+ return;
}
out:
mutex_unlock(&con->sock_mutex);
@@ -1011,16 +1265,18 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
goto create_out;
}
+ /* Turn off Nagle's algorithm */
+ kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+ sizeof(one));
+
result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
(char *)&one, sizeof(one));
if (result < 0) {
log_print("Failed to set SO_REUSEADDR on socket: %d", result);
}
- sock->sk->sk_user_data = con;
con->rx_action = tcp_accept_from_sock;
con->connect_action = tcp_connect_to_sock;
- con->sock = sock;
/* Bind to our port */
make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
@@ -1057,7 +1313,7 @@ static void init_local(void)
int i;
dlm_local_count = 0;
- for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
+ for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) {
if (dlm_our_addr(&sas, i))
break;
@@ -1102,6 +1358,7 @@ static int sctp_listen_for_all(void)
int result = -EINVAL, num = 1, i, addr_len;
struct connection *con = nodeid2con(0, GFP_NOFS);
int bufsize = NEEDED_RMEM;
+ int one = 1;
if (!con)
return -ENOMEM;
@@ -1136,6 +1393,11 @@ static int sctp_listen_for_all(void)
goto create_delsock;
}
+ result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
+ sizeof(one));
+ if (result < 0)
+ log_print("Could not set SCTP NODELAY error %d\n", result);
+
/* Init con struct */
sock->sk->sk_user_data = con;
con->sock = sock;
@@ -1230,7 +1492,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
struct connection *con;
struct writequeue_entry *e;
int offset = 0;
- int users = 0;
con = nodeid2con(nodeid, allocation);
if (!con)
@@ -1244,7 +1505,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
} else {
offset = e->end;
e->end += len;
- users = e->users++;
+ e->users++;
}
spin_unlock(&con->writequeue_lock);
@@ -1259,7 +1520,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
spin_lock(&con->writequeue_lock);
offset = e->end;
e->end += len;
- users = e->users++;
+ e->users++;
list_add_tail(&e->list, &con->writequeue);
spin_unlock(&con->writequeue_lock);
goto got_one;
@@ -1297,6 +1558,7 @@ static void send_to_sock(struct connection *con)
const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
struct writequeue_entry *e;
int len, offset;
+ int count = 0;
mutex_lock(&con->sock_mutex);
if (con->sock == NULL)
@@ -1319,24 +1581,29 @@ static void send_to_sock(struct connection *con)
ret = kernel_sendpage(con->sock, e->page, offset, len,
msg_flags);
if (ret == -EAGAIN || ret == 0) {
+ if (ret == -EAGAIN &&
+ test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+ !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+ /* Notify TCP that we're limited by the
+ * application window size.
+ */
+ set_bit(SOCK_NOSPACE, &con->sock->flags);
+ con->sock->sk->sk_write_pending++;
+ }
cond_resched();
goto out;
- }
- if (ret <= 0)
+ } else if (ret < 0)
goto send_error;
}
- /* Don't starve people filling buffers */
+
+ /* Don't starve people filling buffers */
+ if (++count >= MAX_SEND_MSG_COUNT) {
cond_resched();
+ count = 0;
+ }
spin_lock(&con->writequeue_lock);
- e->offset += ret;
- e->len -= ret;
-
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
- free_entry(e);
- continue;
- }
+ writequeue_entry_complete(e, ret);
}
spin_unlock(&con->writequeue_lock);
out:
@@ -1353,7 +1620,6 @@ out_connect:
mutex_unlock(&con->sock_mutex);
if (!test_bit(CF_INIT_PENDING, &con->flags))
lowcomms_connect_sock(con);
- return;
}
static void clean_one_writequeue(struct connection *con)
@@ -1373,6 +1639,7 @@ static void clean_one_writequeue(struct connection *con)
int dlm_lowcomms_close(int nodeid)
{
struct connection *con;
+ struct dlm_node_addr *na;
log_print("closing connection to node %d", nodeid);
con = nodeid2con(nodeid, 0);
@@ -1387,6 +1654,17 @@ int dlm_lowcomms_close(int nodeid)
clean_one_writequeue(con);
close_connection(con, true);
}
+
+ spin_lock(&dlm_node_addrs_spin);
+ na = find_node_addr(nodeid);
+ if (na) {
+ list_del(&na->list);
+ while (na->addr_count--)
+ kfree(na->addr[na->addr_count]);
+ kfree(na);
+ }
+ spin_unlock(&dlm_node_addrs_spin);
+
return 0;
}
@@ -1430,20 +1708,19 @@ static void work_stop(void)
static int work_start(void)
{
- int error;
- recv_workqueue = create_workqueue("dlm_recv");
- error = IS_ERR(recv_workqueue);
- if (error) {
- log_print("can't start dlm_recv %d", error);
- return error;
- }
-
- send_workqueue = create_singlethread_workqueue("dlm_send");
- error = IS_ERR(send_workqueue);
- if (error) {
- log_print("can't start dlm_send %d", error);
+ recv_workqueue = alloc_workqueue("dlm_recv",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
+ if (!recv_workqueue) {
+ log_print("can't start dlm_recv");
+ return -ENOMEM;
+ }
+
+ send_workqueue = alloc_workqueue("dlm_send",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
+ if (!send_workqueue) {
+ log_print("can't start dlm_send");
destroy_workqueue(recv_workqueue);
- return error;
+ return -ENOMEM;
}
return 0;
@@ -1471,6 +1748,7 @@ void dlm_lowcomms_stop(void)
socket activity.
*/
mutex_lock(&connections_lock);
+ dlm_allow_conn = 0;
foreach_conn(stop_conn);
mutex_unlock(&connections_lock);
@@ -1498,7 +1776,7 @@ int dlm_lowcomms_start(void)
if (!dlm_local_count) {
error = -ENOTCONN;
log_print("no local IP address has been set");
- goto out;
+ goto fail;
}
error = -ENOMEM;
@@ -1506,7 +1784,13 @@ int dlm_lowcomms_start(void)
__alignof__(struct connection), 0,
NULL);
if (!con_cache)
- goto out;
+ goto fail;
+
+ error = work_start();
+ if (error)
+ goto fail_destroy;
+
+ dlm_allow_conn = 1;
/* Start listening */
if (dlm_config.ci_protocol == 0)
@@ -1516,20 +1800,31 @@ int dlm_lowcomms_start(void)
if (error)
goto fail_unlisten;
- error = work_start();
- if (error)
- goto fail_unlisten;
-
return 0;
fail_unlisten:
+ dlm_allow_conn = 0;
con = nodeid2con(0,0);
if (con) {
close_connection(con, false);
kmem_cache_free(con_cache, con);
}
+fail_destroy:
kmem_cache_destroy(con_cache);
-
-out:
+fail:
return error;
}
+
+void dlm_lowcomms_exit(void)
+{
+ struct dlm_node_addr *na, *safe;
+
+ spin_lock(&dlm_node_addrs_spin);
+ list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) {
+ list_del(&na->list);
+ while (na->addr_count--)
+ kfree(na->addr[na->addr_count]);
+ kfree(na);
+ }
+ spin_unlock(&dlm_node_addrs_spin);
+}