aboutsummaryrefslogtreecommitdiff
path: root/net/rds
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-09-14 10:37:28 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2009-09-14 10:37:28 -0700
commitd7e9660ad9d5e0845f52848bce31bcf5cdcdea6b (patch)
treec6c67d145771187b194d79d603742b31090a59d6 /net/rds
parentb8cb48aae1b8c50b37dcb7710363aa69a7a0d9ca (diff)
parent13af7a6ea502fcdd4c0e3d7de6e332b102309491 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1623 commits) netxen: update copyright netxen: fix tx timeout recovery netxen: fix file firmware leak netxen: improve pci memory access netxen: change firmware write size tg3: Fix return ring size breakage netxen: build fix for INET=n cdc-phonet: autoconfigure Phonet address Phonet: back-end for autoconfigured addresses Phonet: fix netlink address dump error handling ipv6: Add IFA_F_DADFAILED flag net: Add DEVTYPE support for Ethernet based devices mv643xx_eth.c: remove unused txq_set_wrr() ucc_geth: Fix hangs after switching from full to half duplex ucc_geth: Rearrange some code to avoid forward declarations phy/marvell: Make non-aneg speed/duplex forcing work for 88E1111 PHYs drivers/net/phy: introduce missing kfree drivers/net/wan: introduce missing kfree net: force bridge module(s) to be GPL Subject: [PATCH] appletalk: Fix skb leak when ipddp interface is not loaded ... Fixed up trivial conflicts: - arch/x86/include/asm/socket.h converted to <asm-generic/socket.h> in the x86 tree. The generic header has the same new #define's, so that works out fine. - drivers/net/tun.c fix conflict between 89f56d1e9 ("tun: reuse struct sock fields") that switched over to using 'tun->socket.sk' instead of the redundantly available (and thus removed) 'tun->sk', and 2b980dbd ("lsm: Add hooks to the TUN driver") which added a new 'tun->sk' use. Noted in 'next' by Stephen Rothwell.
Diffstat (limited to 'net/rds')
-rw-r--r--net/rds/Kconfig26
-rw-r--r--net/rds/Makefile11
-rw-r--r--net/rds/af_rds.c8
-rw-r--r--net/rds/bind.c3
-rw-r--r--net/rds/cong.c1
-rw-r--r--net/rds/connection.c54
-rw-r--r--net/rds/ib.c7
-rw-r--r--net/rds/ib.h18
-rw-r--r--net/rds/ib_cm.c62
-rw-r--r--net/rds/ib_rdma.c12
-rw-r--r--net/rds/ib_recv.c53
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/ib_sysctl.c12
-rw-r--r--net/rds/info.c3
-rw-r--r--net/rds/iw.c16
-rw-r--r--net/rds/iw.h1
-rw-r--r--net/rds/iw_rdma.c28
-rw-r--r--net/rds/iw_send.c2
-rw-r--r--net/rds/iw_stats.c2
-rw-r--r--net/rds/message.c6
-rw-r--r--net/rds/page.c1
-rw-r--r--net/rds/rdma_transport.c16
-rw-r--r--net/rds/rds.h9
-rw-r--r--net/rds/recv.c28
-rw-r--r--net/rds/send.c3
-rw-r--r--net/rds/stats.c6
-rw-r--r--net/rds/tcp.c320
-rw-r--r--net/rds/tcp.h93
-rw-r--r--net/rds/tcp_connect.c153
-rw-r--r--net/rds/tcp_listen.c199
-rw-r--r--net/rds/tcp_recv.c356
-rw-r--r--net/rds/tcp_send.c263
-rw-r--r--net/rds/tcp_stats.c74
-rw-r--r--net/rds/threads.c2
-rw-r--r--net/rds/transport.c31
35 files changed, 1744 insertions, 137 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index 796773b5df9..ec753b3ae72 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -1,14 +1,28 @@
config RDS
- tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)"
- depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL
- depends on INFINIBAND && INFINIBAND_ADDR_TRANS
+ tristate "The RDS Protocol (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
---help---
- RDS provides reliable, sequenced delivery of datagrams
- over Infiniband.
+ The RDS (Reliable Datagram Sockets) protocol provides reliable,
+ sequenced delivery of datagrams over Infiniband, iWARP,
+ or TCP.
+
+config RDS_RDMA
+ tristate "RDS over Infiniband and iWARP"
+ depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
+ ---help---
+ Allow RDS to use Infiniband and iWARP as a transport.
+ This transport supports RDMA operations.
+
+config RDS_TCP
+ tristate "RDS over TCP"
+ depends on RDS
+ ---help---
+ Allow RDS to use TCP as a transport.
+ This transport does not support RDMA operations.
config RDS_DEBUG
- bool "Debugging messages"
+ bool "RDS debugging messages"
depends on RDS
default n
diff --git a/net/rds/Makefile b/net/rds/Makefile
index 51f27585fa0..b46eca10968 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -1,13 +1,20 @@
obj-$(CONFIG_RDS) += rds.o
rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
recv.o send.o stats.o sysctl.o threads.o transport.o \
- loop.o page.o rdma.o \
- rdma_transport.o \
+ loop.o page.o rdma.o
+
+obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
+rds_rdma-objs := rdma_transport.o \
ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
ib_sysctl.o ib_rdma.o \
iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
iw_sysctl.o iw_rdma.o
+
+obj-$(CONFIG_RDS_TCP) += rds_tcp.o
+rds_tcp-objs := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
+ tcp_send.o tcp_stats.o
+
ifeq ($(CONFIG_RDS_DEBUG), y)
EXTRA_CFLAGS += -DDEBUG
endif
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index b11e7e52786..108ed2e671c 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -39,7 +39,6 @@
#include "rds.h"
#include "rdma.h"
-#include "rdma_transport.h"
/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
@@ -509,7 +508,6 @@ out:
static void __exit rds_exit(void)
{
- rds_rdma_exit();
sock_unregister(rds_family_ops.family);
proto_unregister(&rds_proto);
rds_conn_exit();
@@ -549,14 +547,8 @@ static int __init rds_init(void)
rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
- /* ib/iwarp transports currently compiled-in */
- ret = rds_rdma_init();
- if (ret)
- goto out_sock;
goto out;
-out_sock:
- sock_unregister(rds_family_ops.family);
out_proto:
proto_unregister(&rds_proto);
out_stats:
diff --git a/net/rds/bind.c b/net/rds/bind.c
index c17cc39160c..5d95fc007f1 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -187,6 +187,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (trans == NULL) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
+ if (printk_ratelimit())
+ printk(KERN_INFO "RDS: rds_bind() could not find a transport, "
+ "load rds_tcp or rds_rdma?\n");
goto out;
}
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 710e4599d76..dd2711df640 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -254,6 +254,7 @@ void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
}
}
+EXPORT_SYMBOL_GPL(rds_cong_map_updated);
int rds_cong_updated_since(unsigned long *recent)
{
diff --git a/net/rds/connection.c b/net/rds/connection.c
index d14445c4830..cc8b568c0c8 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -126,7 +126,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp,
int is_outgoing)
{
- struct rds_connection *conn, *tmp, *parent = NULL;
+ struct rds_connection *conn, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
unsigned long flags;
int ret;
@@ -155,7 +155,6 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
}
INIT_HLIST_NODE(&conn->c_hash_node);
- conn->c_version = RDS_PROTOCOL_3_0;
conn->c_laddr = laddr;
conn->c_faddr = faddr;
spin_lock_init(&conn->c_lock);
@@ -211,26 +210,40 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
trans->t_name ? trans->t_name : "[unknown]",
is_outgoing ? "(outgoing)" : "");
+ /*
+ * Since we ran without holding the conn lock, someone could
+ * have created the same conn (either normal or passive) in the
+ * interim. We check while holding the lock. If we won, we complete
+ * init and return our conn. If we lost, we rollback and return the
+ * other one.
+ */
spin_lock_irqsave(&rds_conn_lock, flags);
- if (parent == NULL) {
- tmp = rds_conn_lookup(head, laddr, faddr, trans);
- if (tmp == NULL)
- hlist_add_head(&conn->c_hash_node, head);
- } else {
- tmp = parent->c_passive;
- if (!tmp)
+ if (parent) {
+ /* Creating passive conn */
+ if (parent->c_passive) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = parent->c_passive;
+ } else {
parent->c_passive = conn;
- }
-
- if (tmp) {
- trans->conn_free(conn->c_transport_data);
- kmem_cache_free(rds_conn_slab, conn);
- conn = tmp;
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
+ }
} else {
- rds_cong_add_conn(conn);
- rds_conn_count++;
+ /* Creating normal conn */
+ struct rds_connection *found;
+
+ found = rds_conn_lookup(head, laddr, faddr, trans);
+ if (found) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = found;
+ } else {
+ hlist_add_head(&conn->c_hash_node, head);
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
+ }
}
-
spin_unlock_irqrestore(&rds_conn_lock, flags);
out:
@@ -242,12 +255,14 @@ struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
{
return __rds_conn_create(laddr, faddr, trans, gfp, 0);
}
+EXPORT_SYMBOL_GPL(rds_conn_create);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
return __rds_conn_create(laddr, faddr, trans, gfp, 1);
}
+EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
void rds_conn_destroy(struct rds_connection *conn)
{
@@ -290,6 +305,7 @@ void rds_conn_destroy(struct rds_connection *conn)
rds_conn_count--;
}
+EXPORT_SYMBOL_GPL(rds_conn_destroy);
static void rds_conn_message_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
@@ -393,6 +409,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
spin_unlock_irqrestore(&rds_conn_lock, flags);
}
+EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
static int rds_conn_info_visitor(struct rds_connection *conn,
void *buffer)
@@ -468,6 +485,7 @@ void rds_conn_drop(struct rds_connection *conn)
atomic_set(&conn->c_state, RDS_CONN_ERROR);
queue_work(rds_wq, &conn->c_down_w);
}
+EXPORT_SYMBOL_GPL(rds_conn_drop);
/*
* An error occurred on the connection
diff --git a/net/rds/ib.c b/net/rds/ib.c
index b9bcd32431e..536ebe5d3f6 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -43,11 +43,14 @@
unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
+unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
module_param(fmr_pool_size, int, 0444);
MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
module_param(fmr_message_size, int, 0444);
MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
+module_param(rds_ib_retry_count, int, 0444);
+MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
struct list_head rds_ib_devices;
@@ -82,9 +85,6 @@ void rds_ib_add_one(struct ib_device *device)
rds_ibdev->max_wrs = dev_attr->max_qp_wr;
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
- rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
- rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift;
- rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1);
rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
rds_ibdev->max_fmrs = dev_attr->max_fmr ?
min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
@@ -282,6 +282,7 @@ struct rds_transport rds_ib_transport = {
.flush_mrs = rds_ib_flush_mrs,
.t_owner = THIS_MODULE,
.t_name = "infiniband",
+ .t_type = RDS_TRANS_IB
};
int __init rds_ib_init(void)
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 455ae73047f..1378b854cac 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -15,6 +15,8 @@
#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
+#define RDS_IB_DEFAULT_RETRY_COUNT 2
+
#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
extern struct list_head rds_ib_devices;
@@ -157,9 +159,6 @@ struct rds_ib_device {
struct ib_pd *pd;
struct ib_mr *mr;
struct rds_ib_mr_pool *mr_pool;
- int fmr_page_shift;
- int fmr_page_size;
- u64 fmr_page_mask;
unsigned int fmr_max_remaps;
unsigned int max_fmrs;
int max_sge;
@@ -247,6 +246,7 @@ extern struct ib_client rds_ib_client;
extern unsigned int fmr_pool_size;
extern unsigned int fmr_message_size;
+extern unsigned int rds_ib_retry_count;
extern spinlock_t ib_nodev_conns_lock;
extern struct list_head ib_nodev_conns;
@@ -355,17 +355,25 @@ extern ctl_table rds_ib_sysctl_table[];
/*
* Helper functions for getting/setting the header and data SGEs in
* RDS packets (not RDMA)
+ *
+ * From version 3.1 onwards, header is in front of data in the sge.
*/
static inline struct ib_sge *
rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
{
- return &sge[0];
+ if (ic->conn->c_version > RDS_PROTOCOL_3_0)
+ return &sge[0];
+ else
+ return &sge[1];
}
static inline struct ib_sge *
rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
{
- return &sge[1];
+ if (ic->conn->c_version > RDS_PROTOCOL_3_0)
+ return &sge[1];
+ else
+ return &sge[0];
}
#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f8e40e1a603..c2d372f13db 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -98,21 +98,34 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
struct ib_qp_attr qp_attr;
int err;
- if (event->param.conn.private_data_len) {
+ if (event->param.conn.private_data_len >= sizeof(*dp)) {
dp = event->param.conn.private_data;
- rds_ib_set_protocol(conn,
+ /* make sure it isn't empty data */
+ if (dp->dp_protocol_major) {
+ rds_ib_set_protocol(conn,
RDS_PROTOCOL(dp->dp_protocol_major,
- dp->dp_protocol_minor));
- rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ dp->dp_protocol_minor));
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ }
}
printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
- &conn->c_laddr,
+ &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version),
ic->i_flowctl ? ", flow control" : "");
+ /*
+ * Init rings and fill recv. this needs to wait until protocol negotiation
+ * is complete, since ring layout is different from 3.0 to 3.1.
+ */
+ rds_ib_send_init_ring(ic);
+ rds_ib_recv_init_ring(ic);
+ /* Post receive buffers - as a side effect, this will update
+ * the posted credit count. */
+ rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
/* Tune RNR behavior */
rds_ib_tune_rnr(ic, &qp_attr);
@@ -145,7 +158,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
/* XXX tune these? */
conn_param->responder_resources = 1;
conn_param->initiator_depth = 1;
- conn_param->retry_count = 7;
+ conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
conn_param->rnr_retry_count = 7;
if (dp) {
@@ -190,9 +203,9 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break;
default:
- printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
- "on connection to %pI4\n", event->event,
- &conn->c_faddr);
+ rds_ib_conn_error(conn, "RDS/IB: Fatal QP Event %u "
+ "- connection %pI4->%pI4, reconnecting\n",
+ event->event, &conn->c_laddr, &conn->c_faddr);
break;
}
}
@@ -321,7 +334,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
rdsdebug("send allocation failed\n");
goto out;
}
- rds_ib_send_init_ring(ic);
+ memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
if (ic->i_recvs == NULL) {
@@ -329,14 +342,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
rdsdebug("recv allocation failed\n");
goto out;
}
+ memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
- rds_ib_recv_init_ring(ic);
rds_ib_recv_init_ack(ic);
- /* Post receive buffers - as a side effect, this will update
- * the posted credit count. */
- rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
-
rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
ic->i_send_cq, ic->i_recv_cq);
@@ -344,19 +353,32 @@ out:
return ret;
}
-static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp)
+static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
{
+ const struct rds_ib_connect_private *dp = event->param.conn.private_data;
u16 common;
u32 version = 0;
- /* rdma_cm private data is odd - when there is any private data in the
+ /*
+ * rdma_cm private data is odd - when there is any private data in the
* request, we will be given a pretty large buffer without telling us the
* original size. The only way to tell the difference is by looking at
* the contents, which are initialized to zero.
* If the protocol version fields aren't set, this is a connection attempt
* from an older version. This could could be 3.0 or 2.0 - we can't tell.
- * We really should have changed this for OFED 1.3 :-( */
- if (dp->dp_protocol_major == 0)
+ * We really should have changed this for OFED 1.3 :-(
+ */
+
+ /* Be paranoid. RDS always has privdata */
+ if (!event->param.conn.private_data_len) {
+ printk(KERN_NOTICE "RDS incoming connection has no private data, "
+ "rejecting\n");
+ return 0;
+ }
+
+ /* Even if len is crap *now* I still want to check it. -ASG */
+ if (event->param.conn.private_data_len < sizeof (*dp)
+ || dp->dp_protocol_major == 0)
return RDS_PROTOCOL_3_0;
common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
@@ -388,7 +410,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
int err, destroy = 1;
/* Check whether the remote protocol version matches ours. */
- version = rds_ib_protocol_compatible(dp);
+ version = rds_ib_protocol_compatible(event);
if (!version)
goto out;
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 81033af9302..ef3ab5b7283 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -211,7 +211,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
pool->fmr_attr.max_pages = fmr_message_size;
pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
- pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift;
+ pool->fmr_attr.page_shift = PAGE_SHIFT;
pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
/* We never allow more than max_items MRs to be allocated.
@@ -349,13 +349,13 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
- if (dma_addr & ~rds_ibdev->fmr_page_mask) {
+ if (dma_addr & ~PAGE_MASK) {
if (i > 0)
return -EINVAL;
else
++page_cnt;
}
- if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) {
+ if ((dma_addr + dma_len) & ~PAGE_MASK) {
if (i < sg_dma_len - 1)
return -EINVAL;
else
@@ -365,7 +365,7 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
len += dma_len;
}
- page_cnt += len >> rds_ibdev->fmr_page_shift;
+ page_cnt += len >> PAGE_SHIFT;
if (page_cnt > fmr_message_size)
return -EINVAL;
@@ -378,9 +378,9 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
- for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size)
+ for (j = 0; j < dma_len; j += PAGE_SIZE)
dma_pages[page_cnt++] =
- (dma_addr & rds_ibdev->fmr_page_mask) + j;
+ (dma_addr & PAGE_MASK) + j;
}
ret = ib_map_phys_fmr(ibmr->fmr,
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 5709bad2832..cd7a6cfcab0 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -555,6 +555,47 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
return rds_ib_get_ack(ic);
}
+static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
+ struct rds_ib_recv_work *recv,
+ u32 data_len)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
+ void *addr;
+ u32 misplaced_hdr_bytes;
+
+ /*
+ * Support header at the front (RDS 3.1+) as well as header-at-end.
+ *
+ * Cases:
+ * 1) header all in header buff (great!)
+ * 2) header all in data page (copy all to header buff)
+ * 3) header split across hdr buf + data page
+ * (move bit in hdr buff to end before copying other bit from data page)
+ */
+ if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
+ return hdr_buff;
+
+ if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
+ addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
+ memcpy(hdr_buff,
+ addr + recv->r_frag->f_offset + data_len,
+ sizeof(struct rds_header));
+ kunmap_atomic(addr, KM_SOFTIRQ0);
+ return hdr_buff;
+ }
+
+ misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
+
+ memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
+
+ addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
+ memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
+ sizeof(struct rds_header) - misplaced_hdr_bytes);
+ kunmap_atomic(addr, KM_SOFTIRQ0);
+ return hdr_buff;
+}
+
/*
* It's kind of lame that we're copying from the posted receive pages into
* long-lived bitmaps. We could have posted the bitmaps and rdma written into
@@ -645,7 +686,7 @@ struct rds_ib_ack_state {
};
static void rds_ib_process_recv(struct rds_connection *conn,
- struct rds_ib_recv_work *recv, u32 byte_len,
+ struct rds_ib_recv_work *recv, u32 data_len,
struct rds_ib_ack_state *state)
{
struct rds_ib_connection *ic = conn->c_transport_data;
@@ -655,9 +696,9 @@ static void rds_ib_process_recv(struct rds_connection *conn,
/* XXX shut down the connection if port 0,0 are seen? */
rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
- byte_len);
+ data_len);
- if (byte_len < sizeof(struct rds_header)) {
+ if (data_len < sizeof(struct rds_header)) {
rds_ib_conn_error(conn, "incoming message "
"from %pI4 didn't inclue a "
"header, disconnecting and "
@@ -665,9 +706,9 @@ static void rds_ib_process_recv(struct rds_connection *conn,
&conn->c_faddr);
return;
}
- byte_len -= sizeof(struct rds_header);
+ data_len -= sizeof(struct rds_header);
- ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+ ihdr = rds_ib_get_header(conn, recv, data_len);
/* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) {
@@ -687,7 +728,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
if (ihdr->h_credit)
rds_ib_send_add_credits(conn, ihdr->h_credit);
- if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
/* This is an ACK-only packet. The fact that it gets
* special treatment here is that historically, ACKs
* were rather special beasts.
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 02e3e3d50d4..8d8488306fe 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -39,7 +39,7 @@
DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
-static char *rds_ib_stat_names[] = {
+static const char *const rds_ib_stat_names[] = {
"ib_connect_raced",
"ib_listen_closed_stale",
"ib_tx_cq_call",
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index d87830db93a..84b5ffcb280 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -53,7 +53,17 @@ unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
-unsigned int rds_ib_sysctl_flow_control = 1;
+/*
+ * This sysctl does nothing.
+ *
+ * Backwards compatibility with RDS 3.0 wire protocol
+ * disables initial FC credit exchange.
+ * If it's ever possible to drop 3.0 support,
+ * setting this to 1 and moving init/refill of send/recv
+ * rings from ib_cm_connect_complete() back into ib_setup_qp()
+ * will cause credits to be added before protocol negotiation.
+ */
+unsigned int rds_ib_sysctl_flow_control = 0;
ctl_table rds_ib_sysctl_table[] = {
{
diff --git a/net/rds/info.c b/net/rds/info.c
index 62aeef37aef..814a91a6f4a 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -79,6 +79,7 @@ void rds_info_register_func(int optname, rds_info_func func)
rds_info_funcs[offset] = func;
spin_unlock(&rds_info_lock);
}
+EXPORT_SYMBOL_GPL(rds_info_register_func);
void rds_info_deregister_func(int optname, rds_info_func func)
{
@@ -91,6 +92,7 @@ void rds_info_deregister_func(int optname, rds_info_func func)
rds_info_funcs[offset] = NULL;
spin_unlock(&rds_info_lock);
}
+EXPORT_SYMBOL_GPL(rds_info_deregister_func);
/*
* Typically we hold an atomic kmap across multiple rds_info_copy() calls
@@ -137,6 +139,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
}
}
}
+EXPORT_SYMBOL_GPL(rds_info_copy);
/*
* @optval points to the userspace buffer that the information snapshot
diff --git a/net/rds/iw.c b/net/rds/iw.c
index d16e1cbc8e8..db224f7c293 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -83,23 +83,16 @@ void rds_iw_add_one(struct ib_device *device)
rds_iwdev->max_wrs = dev_attr->max_qp_wr;
rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
- rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1);
-
rds_iwdev->dev = device;
rds_iwdev->pd = ib_alloc_pd(device);
if (IS_ERR(rds_iwdev->pd))
goto free_dev;
if (!rds_iwdev->dma_local_lkey) {
- if (device->node_type != RDMA_NODE_RNIC) {
- rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
- IB_ACCESS_LOCAL_WRITE);
- } else {
- rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_LOCAL_WRITE);
- }
+ rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(rds_iwdev->mr))
goto err_pd;
} else
@@ -291,6 +284,7 @@ struct rds_transport rds_iw_transport = {
.flush_mrs = rds_iw_flush_mrs,
.t_owner = THIS_MODULE,
.t_name = "iwarp",
+ .t_type = RDS_TRANS_IWARP,
.t_prefer_loopback = 1,
};
diff --git a/net/rds/iw.h b/net/rds/iw.h
index 0715dde323e..dd72b62bd50 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -181,7 +181,6 @@ struct rds_iw_device {
struct ib_pd *pd;
struct ib_mr *mr;
struct rds_iw_mr_pool *mr_pool;
- int page_shift;
int max_sge;
unsigned int max_wrs;
unsigned int dma_local_lkey:1;
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index d