diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-14 10:37:28 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-14 10:37:28 -0700 |
commit | d7e9660ad9d5e0845f52848bce31bcf5cdcdea6b (patch) | |
tree | c6c67d145771187b194d79d603742b31090a59d6 /net/rds | |
parent | b8cb48aae1b8c50b37dcb7710363aa69a7a0d9ca (diff) | |
parent | 13af7a6ea502fcdd4c0e3d7de6e332b102309491 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1623 commits)
netxen: update copyright
netxen: fix tx timeout recovery
netxen: fix file firmware leak
netxen: improve pci memory access
netxen: change firmware write size
tg3: Fix return ring size breakage
netxen: build fix for INET=n
cdc-phonet: autoconfigure Phonet address
Phonet: back-end for autoconfigured addresses
Phonet: fix netlink address dump error handling
ipv6: Add IFA_F_DADFAILED flag
net: Add DEVTYPE support for Ethernet based devices
mv643xx_eth.c: remove unused txq_set_wrr()
ucc_geth: Fix hangs after switching from full to half duplex
ucc_geth: Rearrange some code to avoid forward declarations
phy/marvell: Make non-aneg speed/duplex forcing work for 88E1111 PHYs
drivers/net/phy: introduce missing kfree
drivers/net/wan: introduce missing kfree
net: force bridge module(s) to be GPL
Subject: [PATCH] appletalk: Fix skb leak when ipddp interface is not loaded
...
Fixed up trivial conflicts:
- arch/x86/include/asm/socket.h
converted to <asm-generic/socket.h> in the x86 tree. The generic
header has the same new #define's, so that works out fine.
- drivers/net/tun.c
fix conflict between 89f56d1e9 ("tun: reuse struct sock fields") that
switched over to using 'tun->socket.sk' instead of the redundantly
available (and thus removed) 'tun->sk', and 2b980dbd ("lsm: Add hooks
to the TUN driver") which added a new 'tun->sk' use.
Noted in 'next' by Stephen Rothwell.
Diffstat (limited to 'net/rds')
35 files changed, 1744 insertions, 137 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 796773b5df9..ec753b3ae72 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -1,14 +1,28 @@ config RDS - tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)" - depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL - depends on INFINIBAND && INFINIBAND_ADDR_TRANS + tristate "The RDS Protocol (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL ---help--- - RDS provides reliable, sequenced delivery of datagrams - over Infiniband. + The RDS (Reliable Datagram Sockets) protocol provides reliable, + sequenced delivery of datagrams over Infiniband, iWARP, + or TCP. + +config RDS_RDMA + tristate "RDS over Infiniband and iWARP" + depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS + ---help--- + Allow RDS to use Infiniband and iWARP as a transport. + This transport supports RDMA operations. + +config RDS_TCP + tristate "RDS over TCP" + depends on RDS + ---help--- + Allow RDS to use TCP as a transport. + This transport does not support RDMA operations. config RDS_DEBUG - bool "Debugging messages" + bool "RDS debugging messages" depends on RDS default n diff --git a/net/rds/Makefile b/net/rds/Makefile index 51f27585fa0..b46eca10968 100644 --- a/net/rds/Makefile +++ b/net/rds/Makefile @@ -1,13 +1,20 @@ obj-$(CONFIG_RDS) += rds.o rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ recv.o send.o stats.o sysctl.o threads.o transport.o \ - loop.o page.o rdma.o \ - rdma_transport.o \ + loop.o page.o rdma.o + +obj-$(CONFIG_RDS_RDMA) += rds_rdma.o +rds_rdma-objs := rdma_transport.o \ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ ib_sysctl.o ib_rdma.o \ iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ iw_sysctl.o iw_rdma.o + +obj-$(CONFIG_RDS_TCP) += rds_tcp.o +rds_tcp-objs := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \ + tcp_send.o tcp_stats.o + ifeq ($(CONFIG_RDS_DEBUG), y) EXTRA_CFLAGS += -DDEBUG endif diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index b11e7e52786..108ed2e671c 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -39,7 +39,6 @@ #include "rds.h" #include "rdma.h" -#include "rdma_transport.h" /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); @@ -509,7 +508,6 @@ out: static void __exit rds_exit(void) { - rds_rdma_exit(); sock_unregister(rds_family_ops.family); proto_unregister(&rds_proto); rds_conn_exit(); @@ -549,14 +547,8 @@ static int __init rds_init(void) rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); - /* ib/iwarp transports currently compiled-in */ - ret = rds_rdma_init(); - if (ret) - goto out_sock; goto out; -out_sock: - sock_unregister(rds_family_ops.family); out_proto: proto_unregister(&rds_proto); out_stats: diff --git a/net/rds/bind.c b/net/rds/bind.c index c17cc39160c..5d95fc007f1 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -187,6 +187,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (trans == NULL) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); + if (printk_ratelimit()) + printk(KERN_INFO "RDS: rds_bind() could not find a transport, " + "load rds_tcp or rds_rdma?\n"); goto out; } diff --git a/net/rds/cong.c b/net/rds/cong.c index 710e4599d76..dd2711df640 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -254,6 +254,7 @@ void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) read_unlock_irqrestore(&rds_cong_monitor_lock, flags); } } +EXPORT_SYMBOL_GPL(rds_cong_map_updated); int rds_cong_updated_since(unsigned long *recent) { diff --git a/net/rds/connection.c b/net/rds/connection.c index d14445c4830..cc8b568c0c8 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -126,7 +126,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp, int is_outgoing) { - struct rds_connection *conn, *tmp, *parent = NULL; + struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); unsigned long flags; int ret; @@ -155,7 +155,6 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, } INIT_HLIST_NODE(&conn->c_hash_node); - conn->c_version = RDS_PROTOCOL_3_0; conn->c_laddr = laddr; conn->c_faddr = faddr; spin_lock_init(&conn->c_lock); @@ -211,26 +210,40 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, trans->t_name ? trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : ""); + /* + * Since we ran without holding the conn lock, someone could + * have created the same conn (either normal or passive) in the + * interim. We check while holding the lock. If we won, we complete + * init and return our conn. If we lost, we rollback and return the + * other one. + */ spin_lock_irqsave(&rds_conn_lock, flags); - if (parent == NULL) { - tmp = rds_conn_lookup(head, laddr, faddr, trans); - if (tmp == NULL) - hlist_add_head(&conn->c_hash_node, head); - } else { - tmp = parent->c_passive; - if (!tmp) + if (parent) { + /* Creating passive conn */ + if (parent->c_passive) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rds_conn_slab, conn); + conn = parent->c_passive; + } else { parent->c_passive = conn; - } - - if (tmp) { - trans->conn_free(conn->c_transport_data); - kmem_cache_free(rds_conn_slab, conn); - conn = tmp; + rds_cong_add_conn(conn); + rds_conn_count++; + } } else { - rds_cong_add_conn(conn); - rds_conn_count++; + /* Creating normal conn */ + struct rds_connection *found; + + found = rds_conn_lookup(head, laddr, faddr, trans); + if (found) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rds_conn_slab, conn); + conn = found; + } else { + hlist_add_head(&conn->c_hash_node, head); + rds_cong_add_conn(conn); + rds_conn_count++; + } } - spin_unlock_irqrestore(&rds_conn_lock, flags); out: @@ -242,12 +255,14 @@ struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, { return __rds_conn_create(laddr, faddr, trans, gfp, 0); } +EXPORT_SYMBOL_GPL(rds_conn_create); struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp) { return __rds_conn_create(laddr, faddr, trans, gfp, 1); } +EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); void rds_conn_destroy(struct rds_connection *conn) { @@ -290,6 +305,7 @@ void rds_conn_destroy(struct rds_connection *conn) rds_conn_count--; } +EXPORT_SYMBOL_GPL(rds_conn_destroy); static void rds_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -393,6 +409,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, spin_unlock_irqrestore(&rds_conn_lock, flags); } +EXPORT_SYMBOL_GPL(rds_for_each_conn_info); static int rds_conn_info_visitor(struct rds_connection *conn, void *buffer) @@ -468,6 +485,7 @@ void rds_conn_drop(struct rds_connection *conn) atomic_set(&conn->c_state, RDS_CONN_ERROR); queue_work(rds_wq, &conn->c_down_w); } +EXPORT_SYMBOL_GPL(rds_conn_drop); /* * An error occurred on the connection diff --git a/net/rds/ib.c b/net/rds/ib.c index b9bcd32431e..536ebe5d3f6 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -43,11 +43,14 @@ unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ +unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; module_param(fmr_pool_size, int, 0444); MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); module_param(fmr_message_size, int, 0444); MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); +module_param(rds_ib_retry_count, int, 0444); +MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); struct list_head rds_ib_devices; @@ -82,9 +85,6 @@ void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_wrs = dev_attr->max_qp_wr; rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); - rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); - rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift; - rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1); rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; rds_ibdev->max_fmrs = dev_attr->max_fmr ? min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : @@ -282,6 +282,7 @@ struct rds_transport rds_ib_transport = { .flush_mrs = rds_ib_flush_mrs, .t_owner = THIS_MODULE, .t_name = "infiniband", + .t_type = RDS_TRANS_IB }; int __init rds_ib_init(void) diff --git a/net/rds/ib.h b/net/rds/ib.h index 455ae73047f..1378b854cac 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -15,6 +15,8 @@ #define RDS_IB_DEFAULT_RECV_WR 1024 #define RDS_IB_DEFAULT_SEND_WR 256 +#define RDS_IB_DEFAULT_RETRY_COUNT 2 + #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ extern struct list_head rds_ib_devices; @@ -157,9 +159,6 @@ struct rds_ib_device { struct ib_pd *pd; struct ib_mr *mr; struct rds_ib_mr_pool *mr_pool; - int fmr_page_shift; - int fmr_page_size; - u64 fmr_page_mask; unsigned int fmr_max_remaps; unsigned int max_fmrs; int max_sge; @@ -247,6 +246,7 @@ extern struct ib_client rds_ib_client; extern unsigned int fmr_pool_size; extern unsigned int fmr_message_size; +extern unsigned int rds_ib_retry_count; extern spinlock_t ib_nodev_conns_lock; extern struct list_head ib_nodev_conns; @@ -355,17 +355,25 @@ extern ctl_table rds_ib_sysctl_table[]; /* * Helper functions for getting/setting the header and data SGEs in * RDS packets (not RDMA) + * + * From version 3.1 onwards, header is in front of data in the sge. */ static inline struct ib_sge * rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) { - return &sge[0]; + if (ic->conn->c_version > RDS_PROTOCOL_3_0) + return &sge[0]; + else + return &sge[1]; } static inline struct ib_sge * rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) { - return &sge[1]; + if (ic->conn->c_version > RDS_PROTOCOL_3_0) + return &sge[1]; + else + return &sge[0]; } #endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index f8e40e1a603..c2d372f13db 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -98,21 +98,34 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even struct ib_qp_attr qp_attr; int err; - if (event->param.conn.private_data_len) { + if (event->param.conn.private_data_len >= sizeof(*dp)) { dp = event->param.conn.private_data; - rds_ib_set_protocol(conn, + /* make sure it isn't empty data */ + if (dp->dp_protocol_major) { + rds_ib_set_protocol(conn, RDS_PROTOCOL(dp->dp_protocol_major, - dp->dp_protocol_minor)); - rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + dp->dp_protocol_minor)); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } } printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", - &conn->c_laddr, + &conn->c_faddr, RDS_PROTOCOL_MAJOR(conn->c_version), RDS_PROTOCOL_MINOR(conn->c_version), ic->i_flowctl ? ", flow control" : ""); + /* + * Init rings and fill recv. this needs to wait until protocol negotiation + * is complete, since ring layout is different from 3.0 to 3.1. + */ + rds_ib_send_init_ring(ic); + rds_ib_recv_init_ring(ic); + /* Post receive buffers - as a side effect, this will update + * the posted credit count. */ + rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); + /* Tune RNR behavior */ rds_ib_tune_rnr(ic, &qp_attr); @@ -145,7 +158,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, /* XXX tune these? */ conn_param->responder_resources = 1; conn_param->initiator_depth = 1; - conn_param->retry_count = 7; + conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); conn_param->rnr_retry_count = 7; if (dp) { @@ -190,9 +203,9 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); break; default: - printk(KERN_WARNING "RDS/ib: unhandled QP event %u " - "on connection to %pI4\n", event->event, - &conn->c_faddr); + rds_ib_conn_error(conn, "RDS/IB: Fatal QP Event %u " + "- connection %pI4->%pI4, reconnecting\n", + event->event, &conn->c_laddr, &conn->c_faddr); break; } } @@ -321,7 +334,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) rdsdebug("send allocation failed\n"); goto out; } - rds_ib_send_init_ring(ic); + memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); if (ic->i_recvs == NULL) { @@ -329,14 +342,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn) rdsdebug("recv allocation failed\n"); goto out; } + memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); - rds_ib_recv_init_ring(ic); rds_ib_recv_init_ack(ic); - /* Post receive buffers - as a side effect, this will update - * the posted credit count. */ - rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); - rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, ic->i_send_cq, ic->i_recv_cq); @@ -344,19 +353,32 @@ out: return ret; } -static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp) +static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) { + const struct rds_ib_connect_private *dp = event->param.conn.private_data; u16 common; u32 version = 0; - /* rdma_cm private data is odd - when there is any private data in the + /* + * rdma_cm private data is odd - when there is any private data in the * request, we will be given a pretty large buffer without telling us the * original size. The only way to tell the difference is by looking at * the contents, which are initialized to zero. * If the protocol version fields aren't set, this is a connection attempt * from an older version. This could could be 3.0 or 2.0 - we can't tell. - * We really should have changed this for OFED 1.3 :-( */ - if (dp->dp_protocol_major == 0) + * We really should have changed this for OFED 1.3 :-( + */ + + /* Be paranoid. RDS always has privdata */ + if (!event->param.conn.private_data_len) { + printk(KERN_NOTICE "RDS incoming connection has no private data, " + "rejecting\n"); + return 0; + } + + /* Even if len is crap *now* I still want to check it. -ASG */ + if (event->param.conn.private_data_len < sizeof (*dp) + || dp->dp_protocol_major == 0) return RDS_PROTOCOL_3_0; common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; @@ -388,7 +410,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, int err, destroy = 1; /* Check whether the remote protocol version matches ours. */ - version = rds_ib_protocol_compatible(dp); + version = rds_ib_protocol_compatible(event); if (!version) goto out; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 81033af9302..ef3ab5b7283 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -211,7 +211,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) pool->fmr_attr.max_pages = fmr_message_size; pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; - pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift; + pool->fmr_attr.page_shift = PAGE_SHIFT; pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4; /* We never allow more than max_items MRs to be allocated. @@ -349,13 +349,13 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); - if (dma_addr & ~rds_ibdev->fmr_page_mask) { + if (dma_addr & ~PAGE_MASK) { if (i > 0) return -EINVAL; else ++page_cnt; } - if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) { + if ((dma_addr + dma_len) & ~PAGE_MASK) { if (i < sg_dma_len - 1) return -EINVAL; else @@ -365,7 +365,7 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm len += dma_len; } - page_cnt += len >> rds_ibdev->fmr_page_shift; + page_cnt += len >> PAGE_SHIFT; if (page_cnt > fmr_message_size) return -EINVAL; @@ -378,9 +378,9 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); - for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size) + for (j = 0; j < dma_len; j += PAGE_SIZE) dma_pages[page_cnt++] = - (dma_addr & rds_ibdev->fmr_page_mask) + j; + (dma_addr & PAGE_MASK) + j; } ret = ib_map_phys_fmr(ibmr->fmr, diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 5709bad2832..cd7a6cfcab0 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -555,6 +555,47 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) return rds_ib_get_ack(ic); } +static struct rds_header *rds_ib_get_header(struct rds_connection *conn, + struct rds_ib_recv_work *recv, + u32 data_len) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs]; + void *addr; + u32 misplaced_hdr_bytes; + + /* + * Support header at the front (RDS 3.1+) as well as header-at-end. + * + * Cases: + * 1) header all in header buff (great!) + * 2) header all in data page (copy all to header buff) + * 3) header split across hdr buf + data page + * (move bit in hdr buff to end before copying other bit from data page) + */ + if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE) + return hdr_buff; + + if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) { + addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); + memcpy(hdr_buff, + addr + recv->r_frag->f_offset + data_len, + sizeof(struct rds_header)); + kunmap_atomic(addr, KM_SOFTIRQ0); + return hdr_buff; + } + + misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len)); + + memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes); + + addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); + memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len, + sizeof(struct rds_header) - misplaced_hdr_bytes); + kunmap_atomic(addr, KM_SOFTIRQ0); + return hdr_buff; +} + /* * It's kind of lame that we're copying from the posted receive pages into * long-lived bitmaps. We could have posted the bitmaps and rdma written into @@ -645,7 +686,7 @@ struct rds_ib_ack_state { }; static void rds_ib_process_recv(struct rds_connection *conn, - struct rds_ib_recv_work *recv, u32 byte_len, + struct rds_ib_recv_work *recv, u32 data_len, struct rds_ib_ack_state *state) { struct rds_ib_connection *ic = conn->c_transport_data; @@ -655,9 +696,9 @@ static void rds_ib_process_recv(struct rds_connection *conn, /* XXX shut down the connection if port 0,0 are seen? */ rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv, - byte_len); + data_len); - if (byte_len < sizeof(struct rds_header)) { + if (data_len < sizeof(struct rds_header)) { rds_ib_conn_error(conn, "incoming message " "from %pI4 didn't inclue a " "header, disconnecting and " @@ -665,9 +706,9 @@ static void rds_ib_process_recv(struct rds_connection *conn, &conn->c_faddr); return; } - byte_len -= sizeof(struct rds_header); + data_len -= sizeof(struct rds_header); - ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + ihdr = rds_ib_get_header(conn, recv, data_len); /* Validate the checksum. */ if (!rds_message_verify_checksum(ihdr)) { @@ -687,7 +728,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, if (ihdr->h_credit) rds_ib_send_add_credits(conn, ihdr->h_credit); - if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) { /* This is an ACK-only packet. The fact that it gets * special treatment here is that historically, ACKs * were rather special beasts. diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 02e3e3d50d4..8d8488306fe 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -39,7 +39,7 @@ DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; -static char *rds_ib_stat_names[] = { +static const char *const rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", "ib_tx_cq_call", diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index d87830db93a..84b5ffcb280 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -53,7 +53,17 @@ unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20); static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1; static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; -unsigned int rds_ib_sysctl_flow_control = 1; +/* + * This sysctl does nothing. + * + * Backwards compatibility with RDS 3.0 wire protocol + * disables initial FC credit exchange. + * If it's ever possible to drop 3.0 support, + * setting this to 1 and moving init/refill of send/recv + * rings from ib_cm_connect_complete() back into ib_setup_qp() + * will cause credits to be added before protocol negotiation. + */ +unsigned int rds_ib_sysctl_flow_control = 0; ctl_table rds_ib_sysctl_table[] = { { diff --git a/net/rds/info.c b/net/rds/info.c index 62aeef37aef..814a91a6f4a 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -79,6 +79,7 @@ void rds_info_register_func(int optname, rds_info_func func) rds_info_funcs[offset] = func; spin_unlock(&rds_info_lock); } +EXPORT_SYMBOL_GPL(rds_info_register_func); void rds_info_deregister_func(int optname, rds_info_func func) { @@ -91,6 +92,7 @@ void rds_info_deregister_func(int optname, rds_info_func func) rds_info_funcs[offset] = NULL; spin_unlock(&rds_info_lock); } +EXPORT_SYMBOL_GPL(rds_info_deregister_func); /* * Typically we hold an atomic kmap across multiple rds_info_copy() calls @@ -137,6 +139,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data, } } } +EXPORT_SYMBOL_GPL(rds_info_copy); /* * @optval points to the userspace buffer that the information snapshot diff --git a/net/rds/iw.c b/net/rds/iw.c index d16e1cbc8e8..db224f7c293 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -83,23 +83,16 @@ void rds_iw_add_one(struct ib_device *device) rds_iwdev->max_wrs = dev_attr->max_qp_wr; rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); - rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1); - rds_iwdev->dev = device; rds_iwdev->pd = ib_alloc_pd(device); if (IS_ERR(rds_iwdev->pd)) goto free_dev; if (!rds_iwdev->dma_local_lkey) { - if (device->node_type != RDMA_NODE_RNIC) { - rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, - IB_ACCESS_LOCAL_WRITE); - } else { - rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE); - } + rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE); if (IS_ERR(rds_iwdev->mr)) goto err_pd; } else @@ -291,6 +284,7 @@ struct rds_transport rds_iw_transport = { .flush_mrs = rds_iw_flush_mrs, .t_owner = THIS_MODULE, .t_name = "iwarp", + .t_type = RDS_TRANS_IWARP, .t_prefer_loopback = 1, }; diff --git a/net/rds/iw.h b/net/rds/iw.h index 0715dde323e..dd72b62bd50 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h @@ -181,7 +181,6 @@ struct rds_iw_device { struct ib_pd *pd; struct ib_mr *mr; struct rds_iw_mr_pool *mr_pool; - int page_shift; int max_sge; unsigned int max_wrs; unsigned int dma_local_lkey:1; diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index d |