diff options
Diffstat (limited to 'net/rds')
46 files changed, 4528 insertions, 1992 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 796773b5df9..f2c670ba7b9 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -1,14 +1,28 @@ config RDS - tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)" - depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL - depends on INFINIBAND && INFINIBAND_ADDR_TRANS + tristate "The RDS Protocol" + depends on INET ---help--- - RDS provides reliable, sequenced delivery of datagrams - over Infiniband. + The RDS (Reliable Datagram Sockets) protocol provides reliable, + sequenced delivery of datagrams over Infiniband, iWARP, + or TCP. + +config RDS_RDMA + tristate "RDS over Infiniband and iWARP" + depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS + ---help--- + Allow RDS to use Infiniband and iWARP as a transport. + This transport supports RDMA operations. + +config RDS_TCP + tristate "RDS over TCP" + depends on RDS + ---help--- + Allow RDS to use TCP as a transport. + This transport does not support RDMA operations. config RDS_DEBUG - bool "Debugging messages" + bool "RDS debugging messages" depends on RDS default n diff --git a/net/rds/Makefile b/net/rds/Makefile index 51f27585fa0..56d3f6023ce 100644 --- a/net/rds/Makefile +++ b/net/rds/Makefile @@ -1,14 +1,19 @@ obj-$(CONFIG_RDS) += rds.o rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ recv.o send.o stats.o sysctl.o threads.o transport.o \ - loop.o page.o rdma.o \ - rdma_transport.o \ + loop.o page.o rdma.o + +obj-$(CONFIG_RDS_RDMA) += rds_rdma.o +rds_rdma-y := rdma_transport.o \ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ ib_sysctl.o ib_rdma.o \ iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ iw_sysctl.o iw_rdma.o -ifeq ($(CONFIG_RDS_DEBUG), y) -EXTRA_CFLAGS += -DDEBUG -endif + +obj-$(CONFIG_RDS_TCP) += rds_tcp.o +rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \ + tcp_send.o tcp_stats.o + +ccflags-$(CONFIG_RDS_DEBUG) := -DDEBUG diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 20cf16fc572..424ff622ab5 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -33,14 +33,21 @@ #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> +#include <linux/gfp.h> #include <linux/in.h> #include <linux/poll.h> -#include <linux/version.h> #include <net/sock.h> #include "rds.h" -#include "rdma.h" -#include "rdma_transport.h" + +char *rds_str_array(char **array, size_t elements, size_t index) +{ + if ((index < elements) && array[index]) + return array[index]; + else + return "unknown"; +} +EXPORT_SYMBOL(rds_str_array); /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); @@ -61,9 +68,8 @@ static int rds_release(struct socket *sock) { struct sock *sk = sock->sk; struct rds_sock *rs; - unsigned long flags; - if (sk == NULL) + if (!sk) goto out; rs = rds_sk_to_rs(sk); @@ -74,15 +80,25 @@ static int rds_release(struct socket *sock) * with the socket. */ rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); + + /* + * the binding lookup hash uses rcu, we need to + * make sure we sychronize_rcu before we free our + * entry + */ rds_remove_bound(rs); + synchronize_rcu(); + rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); rds_sock_count--; - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); + + rds_trans_put(rs->rs_transport); sock->sk = NULL; sock_put(sk); @@ -159,9 +175,10 @@ static unsigned int rds_poll(struct file *file, struct socket *sock, unsigned int mask = 0; unsigned long flags; - poll_wait(file, sk->sk_sleep, wait); + poll_wait(file, sk_sleep(sk), wait); - poll_wait(file, &rds_poll_waitq, wait); + if (rs->rs_seen_congestion) + poll_wait(file, &rds_poll_waitq, wait); read_lock_irqsave(&rs->rs_recv_lock, flags); if (!rs->rs_cong_monitor) { @@ -176,13 +193,17 @@ static unsigned int rds_poll(struct file *file, struct socket *sock, mask |= (POLLIN | POLLRDNORM); spin_unlock(&rs->rs_lock); } - if (!list_empty(&rs->rs_recv_queue) - || !list_empty(&rs->rs_notify_queue)) + if (!list_empty(&rs->rs_recv_queue) || + !list_empty(&rs->rs_notify_queue)) mask |= (POLLIN | POLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (POLLOUT | POLLWRNORM); read_unlock_irqrestore(&rs->rs_recv_lock, flags); + /* clear state any time we wake a seen-congested socket */ + if (mask) + rs->rs_seen_congestion = 0; + return mask; } @@ -250,7 +271,7 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, } static int rds_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret; @@ -267,6 +288,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, case RDS_GET_MR: ret = rds_get_mr(rs, optval, optlen); break; + case RDS_GET_MR_FOR_DEST: + ret = rds_get_mr_for_dest(rs, optval, optlen); + break; case RDS_FREE_MR: ret = rds_free_mr(rs, optval, optlen); break; @@ -307,8 +331,8 @@ static int rds_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) ret = -EINVAL; else - if (put_user(rs->rs_recverr, (int __user *) optval) - || put_user(sizeof(int), optlen)) + if (put_user(rs->rs_recverr, (int __user *) optval) || + put_user(sizeof(int), optlen)) ret = -EFAULT; else ret = 0; @@ -361,7 +385,7 @@ static struct proto rds_proto = { .obj_size = sizeof(struct rds_sock), }; -static struct proto_ops rds_proto_ops = { +static const struct proto_ops rds_proto_ops = { .family = AF_RDS, .owner = THIS_MODULE, .release = rds_release, @@ -384,7 +408,6 @@ static struct proto_ops rds_proto_ops = { static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { - unsigned long flags; struct rds_sock *rs; sock_init_data(sock, sk); @@ -401,15 +424,16 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); rds_sock_count++; - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); return 0; } -static int rds_create(struct net *net, struct socket *sock, int protocol) +static int rds_create(struct net *net, struct socket *sock, int protocol, + int kern) { struct sock *sk; @@ -433,7 +457,7 @@ void rds_sock_put(struct rds_sock *rs) sock_put(rds_rs_to_sk(rs)); } -static struct net_proto_family rds_family_ops = { +static const struct net_proto_family rds_family_ops = { .family = AF_RDS, .create = rds_create, .owner = THIS_MODULE, @@ -444,17 +468,14 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, struct rds_info_lengths *lens) { struct rds_sock *rs; - struct sock *sk; struct rds_incoming *inc; - unsigned long flags; unsigned int total = 0; len /= sizeof(struct rds_info_message); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_for_each_entry(rs, &rds_sock_list, rs_item) { - sk = rds_rs_to_sk(rs); read_lock(&rs->rs_recv_lock); /* XXX too lazy to maintain counts.. */ @@ -468,7 +489,7 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, read_unlock(&rs->rs_recv_lock); } - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); lens->nr = total; lens->each = sizeof(struct rds_info_message); @@ -480,11 +501,10 @@ static void rds_sock_info(struct socket *sock, unsigned int len, { struct rds_info_socket sinfo; struct rds_sock *rs; - unsigned long flags; len /= sizeof(struct rds_info_socket); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); if (len < rds_sock_count) goto out; @@ -505,12 +525,11 @@ out: lens->nr = rds_sock_count; lens->each = sizeof(struct rds_info_socket); - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); } -static void __exit rds_exit(void) +static void rds_exit(void) { - rds_rdma_exit(); sock_unregister(rds_family_ops.family); proto_unregister(&rds_proto); rds_conn_exit(); @@ -524,7 +543,7 @@ static void __exit rds_exit(void) } module_exit(rds_exit); -static int __init rds_init(void) +static int rds_init(void) { int ret; @@ -550,14 +569,8 @@ static int __init rds_init(void) rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); - /* ib/iwarp transports currently compiled-in */ - ret = rds_rdma_init(); - if (ret) - goto out_sock; goto out; -out_sock: - sock_unregister(rds_family_ops.family); out_proto: proto_unregister(&rds_proto); out_stats: diff --git a/net/rds/bind.c b/net/rds/bind.c index c17cc39160c..a2e6562da75 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -34,45 +34,52 @@ #include <net/sock.h> #include <linux/in.h> #include <linux/if_arp.h> +#include <linux/jhash.h> +#include <linux/ratelimit.h> #include "rds.h" -/* - * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't - * particularly zippy. - * - * This is now called for every incoming frame so we arguably care much more - * about it than we used to. - */ +#define BIND_HASH_SIZE 1024 +static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; static DEFINE_SPINLOCK(rds_bind_lock); -static struct rb_root rds_bind_tree = RB_ROOT; -static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, - struct rds_sock *insert) +static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) +{ + return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & + (BIND_HASH_SIZE - 1)); +} + +static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, + struct rds_sock *insert) { - struct rb_node **p = &rds_bind_tree.rb_node; - struct rb_node *parent = NULL; struct rds_sock *rs; + struct hlist_head *head = hash_to_bucket(addr, port); u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); - while (*p) { - parent = *p; - rs = rb_entry(parent, struct rds_sock, rs_bound_node); - + rcu_read_lock(); + hlist_for_each_entry_rcu(rs, head, rs_bound_node) { cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); - if (needle < cmp) - p = &(*p)->rb_left; - else if (needle > cmp) - p = &(*p)->rb_right; - else + if (cmp == needle) { + rcu_read_unlock(); return rs; + } } + rcu_read_unlock(); if (insert) { - rb_link_node(&insert->rs_bound_node, parent, p); - rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); + /* + * make sure our addr and port are set before + * we are added to the list, other people + * in rcu will find us as soon as the + * hlist_add_head_rcu is done + */ + insert->rs_bound_addr = addr; + insert->rs_bound_port = port; + rds_sock_addref(insert); + + hlist_add_head_rcu(&insert->rs_bound_node, head); } return NULL; } @@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { struct rds_sock *rs; - unsigned long flags; - spin_lock_irqsave(&rds_bind_lock, flags); - rs = rds_bind_tree_walk(addr, port, NULL); + rs = rds_bind_lookup(addr, port, NULL); + if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); else rs = NULL; - spin_unlock_irqrestore(&rds_bind_lock, flags); rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); @@ -112,7 +117,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) rover = be16_to_cpu(*port); last = rover; } else { - rover = max_t(u16, net_random(), 2); + rover = max_t(u16, prandom_u32(), 2); last = rover - 1; } @@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) do { if (rover == 0) rover++; - if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { - *port = cpu_to_be16(rover); + if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + *port = rs->rs_bound_port; ret = 0; + rdsdebug("rs %p binding to %pI4:%d\n", + rs, &addr, (int)ntohs(*port)); break; } } while (rover++ != last); - if (ret == 0) { - rs->rs_bound_addr = addr; - rs->rs_bound_port = *port; - rds_sock_addref(rs); - - rdsdebug("rs %p binding to %pI4:%d\n", - rs, &addr, (int)ntohs(*port)); - } - spin_unlock_irqrestore(&rds_bind_lock, flags); return ret; @@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs) rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); - rb_erase(&rs->rs_bound_node, &rds_bind_tree); + hlist_del_init_rcu(&rs->rs_bound_node); rds_sock_put(rs); rs->rs_bound_addr = 0; } @@ -184,9 +182,11 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; trans = rds_trans_get_preferred(sin->sin_addr.s_addr); - if (trans == NULL) { + if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); + printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, " + "load rds_tcp or rds_rdma?\n"); goto out; } @@ -195,5 +195,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out: release_sock(sk); + + /* we might have called rds_remove_bound on error */ + if (ret) + synchronize_rcu(); return ret; } diff --git a/net/rds/cong.c b/net/rds/cong.c index 710e4599d76..e5b65acd650 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -30,10 +30,11 @@ * SOFTWARE. * */ +#include <linux/slab.h> #include <linux/types.h> #include <linux/rbtree.h> - -#include <asm-generic/bitops/le.h> +#include <linux/bitops.h> +#include <linux/export.h> #include "rds.h" @@ -140,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) unsigned long flags; map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); - if (map == NULL) + if (!map) return NULL; map->m_addr = addr; @@ -158,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) ret = rds_cong_tree_walk(addr, map); spin_unlock_irqrestore(&rds_cong_lock, flags); - if (ret == NULL) { + if (!ret) { ret = map; map = NULL; } @@ -204,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn) conn->c_lcong = rds_cong_from_addr(conn->c_laddr); conn->c_fcong = rds_cong_from_addr(conn->c_faddr); - if (conn->c_lcong == NULL || conn->c_fcong == NULL) + if (!(conn->c_lcong && conn->c_fcong)) return -ENOMEM; return 0; @@ -220,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map) list_for_each_entry(conn, &map->m_conn_list, c_map_item) { if (!test_and_set_bit(0, &conn->c_map_queued)) { rds_stats_inc(s_cong_update_queued); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + rds_send_xmit(conn); } } @@ -254,6 +255,7 @@ void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) read_unlock_irqrestore(&rds_cong_monitor_lock, flags); } } +EXPORT_SYMBOL_GPL(rds_cong_map_updated); int rds_cong_updated_since(unsigned long *recent) { @@ -283,7 +285,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - generic___set_le_bit(off, (void *)map->m_page_addrs[i]); + __set_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) @@ -297,7 +299,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - generic___clear_le_bit(off, (void *)map->m_page_addrs[i]); + __clear_bit_le(off, (void *)map->m_page_addrs[i]); } static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) @@ -308,7 +310,7 @@ static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - return generic_test_le_bit(off, (void *)map->m_page_addrs[i]); + return test_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_add_socket(struct rds_sock *rs) diff --git a/net/rds/connection.c b/net/rds/connection.c index 273f064930a..378c3a6acf8 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -32,11 +32,12 @@ */ #include <linux/kernel.h> #include <linux/list.h> +#include <linux/slab.h> +#include <linux/export.h> #include <net/inet_hashtables.h> #include "rds.h" #include "loop.h" -#include "rdma.h" #define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) @@ -50,10 +51,16 @@ static struct kmem_cache *rds_conn_slab; static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) { + static u32 rds_hash_secret __read_mostly; + + unsigned long hash; + + net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); + /* Pass NULL, don't need struct net for hash */ - unsigned long hash = inet_ehashfn(NULL, - be32_to_cpu(laddr), 0, - be32_to_cpu(faddr), 0); + hash = __inet_ehashfn(be32_to_cpu(laddr), 0, + be32_to_cpu(faddr), 0, + rds_hash_secret); return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; } @@ -62,26 +69,14 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ } while (0) -static inline int rds_conn_is_sending(struct rds_connection *conn) -{ - int ret = 0; - - if (!mutex_trylock(&conn->c_send_lock)) - ret = 1; - else - mutex_unlock(&conn->c_send_lock); - - return ret; -} - +/* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, __be32 laddr, __be32 faddr, struct rds_transport *trans) { struct rds_connection *conn, *ret = NULL; - struct hlist_node *pos; - hlist_for_each_entry(conn, pos, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (conn->c_faddr == faddr && conn->c_laddr == laddr && conn->c_trans == trans) { ret = conn; @@ -99,7 +94,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, * and receiving over this connection again in the future. It is up to * the transport to have serialized this call with its send and recv. */ -void rds_conn_reset(struct rds_connection *conn) +static void rds_conn_reset(struct rds_connection *conn) { rdsdebug("connection %pI4 to %pI4 reset\n", &conn->c_laddr, &conn->c_faddr); @@ -126,17 +121,16 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp, int is_outgoing) { - struct rds_connection *conn, *tmp, *parent = NULL; + struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); + struct rds_transport *loop_trans; unsigned long flags; int ret; - spin_lock_irqsave(&rds_conn_lock, flags); + rcu_read_lock(); conn = rds_conn_lookup(head, laddr, faddr, trans); - if (conn - && conn->c_loopback - && conn->c_trans != &rds_loop_transport - && !is_outgoing) { + if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && + !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. * We need a second connection object into which we @@ -144,26 +138,23 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, parent = conn; conn = parent->c_passive; } - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); if (conn) goto out; - conn = kmem_cache_alloc(rds_conn_slab, gfp); - if (conn == NULL) { + conn = kmem_cache_zalloc(rds_conn_slab, gfp); + if (!conn) { conn = ERR_PTR(-ENOMEM); goto out; } - memset(conn, 0, sizeof(*conn)); - INIT_HLIST_NODE(&conn->c_hash_node); - conn->c_version = RDS_PROTOCOL_3_0; conn->c_laddr = laddr; conn->c_faddr = faddr; spin_lock_init(&conn->c_lock); conn->c_next_tx_seq = 1; - mutex_init(&conn->c_send_lock); + init_waitqueue_head(&conn->c_waitq); INIT_LIST_HEAD(&conn->c_send_queue); INIT_LIST_HEAD(&conn->c_retrans); @@ -179,7 +170,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ - if (rds_trans_get_preferred(faddr)) { + loop_trans = rds_trans_get_preferred(faddr); + if (loop_trans) { + rds_trans_put(loop_trans); conn->c_loopback = 1; if (is_outgoing && trans->t_prefer_loopback) { /* "outgoing" connection - and the transport @@ -213,26 +206,40 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, trans->t_name ? trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : ""); + /* + * Since we ran without holding the conn lock, someone could + * have created the same conn (either normal or passive) in the + * interim. We check while holding the lock. If we won, we complete + * init and return our conn. If we lost, we rollback and return the + * other one. + */ spin_lock_irqsave(&rds_conn_lock, flags); - if (parent == NULL) { - tmp = rds_conn_lookup(head, laddr, faddr, trans); - if (tmp == NULL) - hlist_add_head(&conn->c_hash_node, head); - } else { - tmp = parent->c_passive; - if (!tmp) + if (parent) { + /* Creating passive conn */ + if (parent->c_passive) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rds_conn_slab, conn); + conn = parent->c_passive; + } else { parent->c_passive = conn; - } - - if (tmp) { - trans->conn_free(conn->c_transport_data); - kmem_cache_free(rds_conn_slab, conn); - conn = tmp; + rds_cong_add_conn(conn); + rds_conn_count++; + } } else { - rds_cong_add_conn(conn); - rds_conn_count++; + /* Creating normal conn */ + struct rds_connection *found; + + found = rds_conn_lookup(head, laddr, faddr, trans); + if (found) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rds_conn_slab, conn); + conn = found; + } else { + hlist_add_head_rcu(&conn->c_hash_node, head); + rds_cong_add_conn(conn); + rds_conn_count++; + } } - spin_unlock_irqrestore(&rds_conn_lock, flags); out: @@ -244,28 +251,100 @@ struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, { return __rds_conn_create(laddr, faddr, trans, gfp, 0); } +EXPORT_SYMBOL_GPL(rds_conn_create); struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp) { return __rds_conn_create(laddr, faddr, trans, gfp, 1); } +EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); + +void rds_conn_shutdown(struct rds_connection *conn) +{ + /* shut it down unless it's down already */ + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { + /* + * Quiesce the connection mgmt handlers before we start tearing + * things down. We don't hold the mutex for the entire + * duration of the shutdown operation, else we may be + * deadlocking with the CM handler. Instead, the CM event + * handler is supposed to check for state DISCONNECTING + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) + && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { + rds_conn_error(conn, "shutdown called in state %d\n", + atomic_read(&conn->c_state)); + mutex_unlock(&conn->c_cm_lock); + return; + } + mutex_unlock(&conn->c_cm_lock); + + wait_event(conn->c_waitq, + !test_bit(RDS_IN_XMIT, &conn->c_flags)); + + conn->c_trans->conn_shutdown(conn); + rds_conn_reset(conn); + + if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { + /* This can happen - eg when we're in the middle of tearing + * down the connection, and someone unloads the rds module. + * Quite reproduceable with loopback connections. + * Mostly harmless. + */ + rds_conn_error(conn, + "%s: failed to transition to state DOWN, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + return; + } + } + /* Then reconnect if it's still live. + * The passive side of an IB loopback connection is never added + * to the conn hash, so we never trigger a reconnect on this + * conn - the reconnect is always triggered by the active peer. */ + cancel_delayed_work_sync(&conn->c_conn_w); + rcu_read_lock(); + if (!hlist_unhashed(&conn->c_hash_node)) { + rcu_read_unlock(); + rds_queue_reconnect(conn); + } else { + rcu_read_unlock(); + } +} + +/* + * Stop and free a connection. + * + * This can only be used in very limited circumstances. It assumes that once + * the conn has been shutdown that no one else is referencing the connection. + * We can only ensure this in the rmmod path in the current code. + */ void rds_conn_destroy(struct rds_connection *conn) { struct rds_message *rm, *rtmp; + unsigned long flags; rdsdebug("freeing conn %p for %pI4 -> " "%pI4\n", conn, &conn->c_laddr, &conn->c_faddr); - hlist_del_init(&conn->c_hash_node); + /* Ensure conn will not be scheduled for reconnect */ + spin_lock_irq(&rds_conn_lock); + hlist_del_init_rcu(&conn->c_hash_node); + spin_unlock_irq(&rds_conn_lock); + synchronize_rcu(); - /* wait for the rds thread to shut it down */ - atomic_set(&conn->c_state, RDS_CONN_ERROR); - cancel_delayed_work(&conn->c_conn_w); - queue_work(rds_wq, &conn->c_down_w); - flush_workqueue(rds_wq); + /* shut the connection down */ + rds_conn_drop(conn); + flush_work(&conn->c_down_w); + + /* make sure lingering queued work won't try to ref the conn */ + cancel_delayed_work_sync(&conn->c_send_w); + cancel_delayed_work_sync(&conn->c_recv_w); /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, @@ -290,8 +369,11 @@ void rds_conn_destroy(struct rds_connection *conn) BUG_ON(!list_empty(&conn->c_retrans)); kmem_cache_free(rds_conn_slab, conn); + spin_lock_irqsave(&rds_conn_lock, flags); rds_conn_count--; + spin_unlock_irqrestore(&rds_conn_lock, flags); } +EXPORT_SYMBOL_GPL(rds_conn_destroy); static void rds_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -299,27 +381,26 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, int want_send) { struct hlist_head *head; - struct hlist_node *pos; struct list_head *list; struct rds_connection *conn; struct rds_message *rm; - unsigned long flags; unsigned int total = 0; + unsigned long flags; size_t i; len /= sizeof(struct rds_info_message); - spin_lock_irqsave(&rds_conn_lock, flags); + rcu_read_lock(); for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { - hlist_for_each_entry(conn, pos, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (want_send) list = &conn->c_send_queue; else list = &conn->c_retrans; - spin_lock(&conn->c_lock); + spin_lock_irqsave(&conn->c_lock, flags); /* XXX too lazy to maintain counts.. */ list_for_each_entry(rm, list, m_conn_item) { @@ -330,11 +411,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, conn->c_faddr, 0); } - spin_unlock(&conn->c_lock); + spin_unlock_irqrestore(&conn->c_lock, flags); } } - - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); lens->nr = total; lens->each = sizeof(struct rds_info_message); @@ -363,20 +443,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, { uint64_t buffer[(item_len + 7) / 8]; struct hlist_head *head; - struct hlist_node *pos; - struct hlist_node *tmp; struct rds_connection *conn; - unsigned long flags; size_t i; - spin_lock_irqsave(&rds_conn_lock, flags); + rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { - hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { /* XXX no c_lock usage.. */ if (!visitor(conn, buffer)) @@ -392,9 +469,9 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, lens->nr++; } } - - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(rds_for_each_conn_info); static int rds_conn_info_visitor(struct rds_connection *conn, void *buffer) @@ -409,8 +486,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn, sizeof(cinfo->transport)); cinfo->flags = 0; - rds_conn_info_set(cinfo->flags, - rds_conn_is_sending(conn), SENDING); + rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags), + SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, @@ -430,12 +507,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_connection)); } -int __init rds_conn_init(void) +int rds_conn_init(void) { rds_conn_slab = kmem_cache_create("rds_connection", sizeof(struct rds_connection), 0, 0, NULL); - if (rds_conn_slab == NULL) + if (!rds_conn_slab) return -ENOMEM; rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); @@ -470,6 +547,19 @@ void rds_conn_drop(struct rds_connection *conn) atomic_set(&conn->c_state, RDS_CONN_ERROR); queue_work(rds_wq, &conn->c_down_w); } +EXPORT_SYMBOL_GPL(rds_conn_drop); + +/* + * If the connection is down, trigger a connect. We may have scheduled a + * delayed reconnect however - in this case we should not interfere. + */ +void rds_conn_connect_if_down(struct rds_connection *conn) +{ + if (rds_conn_state(conn) == RDS_CONN_DOWN && + !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); +} +EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); /* * An error occurred on the connection diff --git a/net/rds/ib.c b/net/rds/ib.c index 4933b380985..ba2dffeff60 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -37,25 +37,89 @@ #include <linux/inetdevice.h> #include <linux/if_arp.h> #include <linux/delay.h> +#include <linux/slab.h> +#include <linux/module.h> #include "rds.h" #include "ib.h" -unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; +static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ +unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; module_param(fmr_pool_size, int, 0444); MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); module_param(fmr_message_size, int, 0444); MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); +module_param(rds_ib_retry_count, int, 0444); +MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); +/* + * we have a clumsy combination of RCU and a rwsem protecting this list + * because it is used both in the get_mr fast path and while blocking in + * the FMR flushing path. + */ +DECLARE_RWSEM(rds_ib_devices_lock); struct list_head rds_ib_devices; /* NOTE: if also grabbing ibdev lock, grab this first */ DEFINE_SPINLOCK(ib_nodev_conns_lock); LIST_HEAD(ib_nodev_conns); -void rds_ib_add_one(struct ib_device *device) +static void rds_ib_nodev_connect(void) +{ + struct rds_ib_connection *ic; + + spin_lock(&ib_nodev_conns_lock); + list_for_each_entry(ic, &ib_nodev_conns, ib_node) + rds_conn_connect_if_down(ic->conn); + spin_unlock(&ib_nodev_conns_lock); +} + +static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_connection *ic; + unsigned long flags; + + spin_lock_irqsave(&rds_ibdev->spinlock, flags); + list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) + rds_conn_drop(ic->conn); + spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); +} + +/* + * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references + * from interrupt context so we push freing off into a work struct in krdsd. + */ +static void rds_ib_dev_free(struct work_struct *work) +{ + struct rds_ib_ipaddr *i_ipaddr, *i_next; + struct rds_ib_device *rds_ibdev = container_of(work, + struct rds_ib_device, free_work); + + if (rds_ibdev->mr_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + if (rds_ibdev->mr) + ib_dereg_mr(rds_ibdev->mr); + if (rds_ibdev->pd) + ib_dealloc_pd(rds_ibdev->pd); + + list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + } + + kfree(rds_ibdev); +} + +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) +{ + BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); + if (atomic_dec_and_test(&rds_ibdev->refcount)) + queue_work(rds_wq, &rds_ibdev->free_work); +} + +static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; struct ib_device_attr *dev_attr; @@ -73,85 +137,124 @@ void rds_ib_add_one(struct ib_device *device) goto free_attr; } - rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); + rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, + ibdev_to_node(device)); if (!rds_ibdev) goto free_attr; spin_lock_init(&rds_ibdev->spinlock); + atomic_set(&rds_ibdev->refcount, 1); + INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); rds_ibdev->max_wrs = dev_attr->max_qp_wr; rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); - rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); - rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift; - rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1); rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; rds_ibdev->max_fmrs = dev_attr->max_fmr ? min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : fmr_pool_size; + rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device); - if (IS_ERR(rds_ibdev->pd)) - goto free_dev; + if (IS_ERR(rds_ibdev->pd)) { + rds_ibdev->pd = NULL; + goto put_dev; + } - rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rds_ibdev->mr)) - goto err_pd; + rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_ibdev->mr)) { + rds_ibdev->mr = NULL; + goto put_dev; + } rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); if (IS_ERR(rds_ibdev->mr_pool)) { rds_ibdev->mr_pool = NULL; - goto err_mr; + goto put_dev; } INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); - list_add_tail(&rds_ibdev->list, &rds_ib_devices); + + down_write(&rds_ib_devices_lock); + list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); + up_write(&rds_ib_devices_lock); + atomic_inc(&rds_ibdev->refcount); ib_set_client_data(device, &rds_ib_client, rds_ibdev); + atomic_inc(&rds_ibdev->refcount); - goto free_attr; + rds_ib_nodev_connect(); -err_mr: - ib_dereg_mr(rds_ibdev->mr); -err_pd: - ib_dealloc_pd(rds_ibdev->pd); -free_dev: - kfree(rds_ibdev); +put_dev: + rds_ib_dev_put(rds_ibdev); free_attr: kfree(dev_attr); } -void rds_ib_remove_one(struct ib_device *device) +/* + * New connections use this to find the device to associate with the + * connection. It's not in the fast path so we're not concerned about the + * performance of the IB call. (As of this writing, it uses an interrupt + * blocking spinlock to serialize walking a per-device list of all registered + * clients.) + * + * RCU is used to handle incoming connections racing with device teardown. + * Rather than use a lock to serialize removal from the client_data and + * getting a new reference, we use an RCU grace period. The destruction + * path removes the device from client_data and then waits for all RCU + * readers to finish. + * + * A new connection can get NULL from this if its arriving on a + * device that is in the process of being removed. + */ +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) { struct rds_ib_device *rds_ibdev; - struct rds_ib_ipaddr *i_ipaddr, *i_next; + rcu_read_lock(); rds_ibdev = ib_get_client_data(device, &rds_ib_client); - if (!rds_ibdev) - return; + if (rds_ibdev) + atomic_inc(&rds_ibdev->refcount); + rcu_read_unlock(); + return rds_ibdev; +} - list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { - list_del(&i_ipaddr->list); - kfree(i_ipaddr); - } +/* + * The IB stack is letting us know that a device is going away. This can + * happen if the underlying HCA driver is removed or if PCI hotplug is removing + * the pci function, for example. + * + * This can be called at any time and can be racing with any other RDS path. + */ +static void rds_ib_remove_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; - rds_ib_destroy_conns(rds_ibdev); + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (!rds_ibdev) + return; - if (rds_ibdev->mr_pool) - rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + rds_ib_dev_shutdown(rds_ibdev); - ib_dereg_mr(rds_ibdev->mr); + /* stop connection attempts from getting a reference to this device. */ + ib_set_client_data(device, &rds_ib_client, NULL); - while (ib_dealloc_pd(rds_ibdev->pd)) { - rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); - msleep(1); - } + down_write(&rds_ib_devices_lock); + list_del_rcu(&rds_ibdev->list); + up_write(&rds_ib_devices_lock); - list_del(&rds_ibdev->list); - kfree(rds_ibdev); + /* + * This synchronize rcu is waiting for readers of both the ib + * client data and the devices list to finish before we drop + * both of those references. + */ + synchronize_rcu(); + rds_ib_dev_put(rds_ibdev); + rds_ib_dev_put(rds_ibdev); } struct ib_client rds_ib_client = { @@ -182,10 +285,10 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, ic = conn->c_transport_data; dev_addr = &ic->i_cm_id->route.addr.dev_addr; - ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); - ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + rds_ibdev = ic->rds_ibdev; iinfo->max_send_wr = ic->i_send_ring.w_nr; iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; @@ -223,9 +326,9 @@ static int rds_ib_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); - if (!cm_id) - return -EADDRNOTAVAIL; + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; @@ -235,7 +338,8 @@ static int rds_ib_laddr_check(__be32 addr) ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); /* due to this, we will claim to support iWARP devices unless we check node_type. */ - if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) + if (ret || !cm_id->device || + cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; rdsdebug("addr %pI4 ret %d node type %d\n", @@ -247,11 +351,18 @@ static int rds_ib_laddr_check(__be32 addr) return ret; } +static void rds_ib_unregister_client(void) +{ + ib_unregister_client(&rds_ib_client); + /* wait for rds_ib_dev_free() to complete */ + flush_workqueue(rds_wq); +} + void rds_ib_exit(void) { rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_ib_unregister_client(); rds_ib_destroy_nodev_conns(); - ib_unregister_client(&rds_ib_client); rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); @@ -261,15 +372,14 @@ struct rds_transport rds_ib_transport = { .laddr_check = rds_ib_laddr_check, .xmit_complete = rds_ib_xmit_complete, .xmit = rds_ib_xmit, - .xmit_cong_map = NULL, .xmit_rdma = rds_ib_xmit_rdma, + .xmit_atomic = rds_ib_xmit_atomic, .recv = rds_ib_recv, .conn_alloc = rds_ib_conn_alloc, .conn_free = rds_ib_conn_free, .conn_connect = rds_ib_conn_connect, .conn_shutdown = rds_ib_conn_shutdown, .inc_copy_to_user = rds_ib_inc_copy_to_user, - .inc_purge = rds_ib_inc_purge, .inc_free = rds_ib_inc_free, .cm_initiate_connect = rds_ib_cm_initiate_connect, .cm_handle_connect = rds_ib_cm_handle_connect, @@ -282,9 +392,10 @@ struct rds_transport rds_ib_transport = { .flush_mrs = rds_ib_flush_mrs, .t_owner = THIS_MODULE, .t_name = "infiniband", + .t_type = RDS_TRANS_IB }; -int __init rds_ib_init(void) +int rds_ib_init(void) { int ret; @@ -315,7 +426,7 @@ out_recv: out_sysctl: rds_ib_sysctl_exit(); out_ibreg: - ib_unregister_client(&rds_ib_client); + rds_ib_unregister_client(); out: return ret; } diff --git a/net/rds/ib.h b/net/rds/ib.h index 069206cae73..7280ab8810c 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -3,11 +3,14 @@ #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> +#include <linux/interrupt.h> +#include <linux/pci.h> +#include <linux/slab.h> #include "rds.h" #include "rdma_transport.h" #define RDS_FMR_SIZE 256 -#define RDS_FMR_POOL_SIZE 4096 +#define RDS_FMR_POOL_SIZE 8192 #define RDS_IB_MAX_SGE 8 #define RDS_IB_RECV_SGE 2 @@ -15,8 +18,13 @@ #define RDS_IB_DEFAULT_RECV_WR 1024 #define RDS_IB_DEFAULT_SEND_WR 256 +#define RDS_IB_DEFAULT_RETRY_COUNT 2 + #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ +#define RDS_IB_RECYCLE_BATCH_COUNT 32 + +extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; /* @@ -24,20 +32,29 @@ extern struct list_head rds_ib_devices; * try and minimize the amount of memory tied up both the device and * socket receive queues. */ -/* page offset of the final full frag that fits in the page */ -#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) struct rds_page_frag { struct list_head f_item; - struct page *f_page; - unsigned long f_offset; - dma_addr_t f_mapped; + struct list_head f_cache_entry; + struct scatterlist f_sg; }; struct rds_ib_incoming { struct list_head ii_frags; + struct list_head ii_cache_entry; struct rds_incoming ii_inc; }; +struct rds_ib_cache_head { + struct list_head *first; + unsigned long count; +}; + +struct rds_ib_refill_cache { + struct rds_ib_cache_head __percpu *percpu; + struct list_head *xfer; + struct list_head *ready; +}; + struct rds_ib_connect_private { /* Add new fields at the end, and don't permute existing fields. */ __be32 dp_saddr; @@ -51,8 +68,7 @@ struct rds_ib_connect_private { }; struct rds_ib_send_work { - struct rds_message *s_rm; - struct rds_rdma_op *s_op; + void *s_op; struct ib_send_wr s_wr; struct ib_sge s_sge[RDS_IB_MAX_SGE]; unsigned long s_queued; @@ -90,12 +106,14 @@ struct rds_ib_connection { /* tx */ struct rds_ib_work_ring i_send_ring; - struct rds_message *i_rm; + struct rm_data_op *i_data_op; struct rds_header *i_send_hdrs; u64 i_send_hdrs_dma; struct rds_ib_send_work *i_sends; + atomic_t i_signaled_sends; /* rx */ + struct tasklet_struct i_recv_tasklet; struct mutex i_recv_mutex; struct rds_ib_work_ring i_recv_ring; struct rds_ib_incoming *i_ibinc; @@ -103,8 +121,9 @@ struct rds_ib_connection { struct rds_header *i_recv_hdrs; u64 i_recv_hdrs_dma; struct rds_ib_recv_work *i_recvs; - struct rds_page_frag i_frag; u64 i_ack_recv; /* last ACK received */ + struct rds_ib_refill_cache i_cache_incs; + struct rds_ib_refill_cache i_cache_frags; /* sending acks */ unsigned long i_ack_flags; @@ -135,7 +154,6 @@ struct rds_ib_connection { /* Batched completions */ unsigned int i_unsignaled_wrs; - long i_unsignaled_bytes; }; /* This assumes that atomic_t is at least 32 bits */ @@ -157,16 +175,20 @@ struct rds_ib_device { struct ib_pd *pd; struct ib_mr *mr; struct rds_ib_mr_pool *mr_pool; - int fmr_page_shift; - int fmr_page_size; - u64 fmr_page_mask; unsigned int fmr_max_remaps; unsigned int max_fmrs; int max_sge; unsigned int max_wrs; + unsigned int max_initiator_depth; + unsigned int max_responder_resources; spinlock_t spinlock; /* protect the above */ + atomic_t refcount; + struct work_struct free_work; }; +#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device) +#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) + /* bits for i_ack_flags */ #define IB_ACK_IN_FLIGHT 0 #define IB_ACK_REQUESTED 1 @@ -202,6 +224,8 @@ struct rds_ib_statistics { uint64_t s_ib_rdma_mr_pool_flush; uint64_t s_ib_rdma_mr_pool_wait; uint64_t s_ib_rdma_mr_pool_depleted; + uint64_t s_ib_atomic_cswp; + uint64_t s_ib_atomic_fadd; }; extern struct workqueue_struct *rds_ib_wq; @@ -241,12 +265,12 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, /* ib.c */ extern struct rds_transport rds_ib_transport; -extern void rds_ib_add_one(struct ib_device *device); -extern void rds_ib_remove_one(struct ib_device *device); +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); extern struct ib_client rds_ib_client; -extern unsigned int fmr_pool_size; extern unsigned int fmr_message_size; +extern unsigned int rds_ib_retry_count; extern spinlock_t ib_nodev_conns_lock; extern struct list_head ib_nodev_conns; @@ -257,7 +281,7 @@ void rds_ib_conn_free(void *arg); int rds_ib_conn_connect(struct rds_connection *conn); void rds_ib_conn_shutdown(struct rds_connection *conn); void rds_ib_state_change(struct sock *sk); -int __init rds_ib_listen_init(void); +int rds_ib_listen_init(void); void rds_ib_listen_stop(void); void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, @@ -274,15 +298,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); -void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock); -static inline void rds_ib_destroy_nodev_conns(void) -{ - __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock); -} -static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev) -{ - __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock); -} +void rds_ib_destroy_nodev_conns(void); struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); @@ -293,16 +309,17 @@ void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); /* ib_recv.c */ -int __init rds_ib_recv_init(void); +int rds_ib_recv_init(void); void rds_ib_recv_exit(void); int rds_ib_recv(struct rds_connection *conn); -int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill); -void rds_ib_inc_purge(struct rds_incoming *inc); +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); +void rds_ib_recv_free_caches(struct rds_ib_connection *ic); +void rds_ib_recv_refill(struct rds_connection *conn, int prefill); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_recv_tasklet_fn(unsigned long data); void rds_ib_recv_init_ring(struct rds_ib_connection *ic); void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); void rds_ib_recv_init_ack(struct rds_ib_connection *ic); @@ -323,17 +340,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); extern wait_queue_head_t rds_ib_ring_empty_wait; /* ib_send.c */ +char *rds_ib_wc_status_str(enum ib_wc_status status); void rds_ib_xmit_complete(struct rds_connection *conn); int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); void rds_ib_send_init_ring(struct rds_ib_connection *ic); void rds_ib_send_clear_ring(struct rds_ib_connection *ic); -int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, - u32 *adv_credits, int need_posted); + u32 *adv_credits, int need_posted, int max_posted); +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); @@ -342,7 +361,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); /* ib_sysctl.c */ -int __init rds_ib_sysctl_init(void); +int rds_ib_sysctl_init(void); void rds_ib_sysctl_exit(void); extern unsigned long rds_ib_sysctl_max_send_wr; extern unsigned long rds_ib_sysctl_max_recv_wr; @@ -350,22 +369,5 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs; extern unsigned long rds_ib_sysctl_max_unsig_bytes; extern unsigned long rds_ib_sysctl_max_recv_allocation; extern unsigned int rds_ib_sysctl_flow_control; -extern ctl_table rds_ib_sysctl_table[]; - -/* - * Helper functions for getting/setting the header and data SGEs in - * RDS packets (not RDMA) - */ -static inline struct ib_sge * -rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) -{ - return &sge[0]; -} - -static inline struct ib_sge * -rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) -{ - return &sge[1]; -} #endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index f8e40e1a603..31b74f5e61a 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -32,11 +32,43 @@ */ #include <linux/kernel.h> #include <linux/in.h> +#include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/ratelimit.h> #include "rds.h" #include "ib.h" +static char *rds_ib_event_type_strings[] = { +#define RDS_IB_EVENT_STRING(foo) \ + [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) + RDS_IB_EVENT_STRING(CQ_ERR), + RDS_IB_EVENT_STRING(QP_FATAL), + RDS_IB_EVENT_STRING(QP_REQ_ERR), + RDS_IB_EVENT_STRING(QP_ACCESS_ERR), + RDS_IB_EVENT_STRING(COMM_EST), + RDS_IB_EVENT_STRING(SQ_DRAINED), + RDS_IB_EVENT_STRING(PATH_MIG), + RDS_IB_EVENT_STRING(PATH_MIG_ERR), + RDS_IB_EVENT_STRING(DEVICE_FATAL), + RDS_IB_EVENT_STRING(PORT_ACTIVE), + RDS_IB_EVENT_STRING(PORT_ERR), + RDS_IB_EVENT_STRING(LID_CHANGE), + RDS_IB_EVENT_STRING(PKEY_CHANGE), + RDS_IB_EVENT_STRING(SM_CHANGE), + RDS_IB_EVENT_STRING(SRQ_ERR), + RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED), + RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED), + RDS_IB_EVENT_STRING(CLIENT_REREGISTER), +#undef RDS_IB_EVENT_STRING +}; + +static char *rds_ib_event_str(enum ib_event_type type) +{ + return rds_str_array(rds_ib_event_type_strings, + ARRAY_SIZE(rds_ib_event_type_strings), type); +}; + /* * Set the selected protocol version */ @@ -94,24 +126,46 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even { const struct rds_ib_connect_private *dp = NULL; struct rds_ib_connection *ic = conn->c_transport_data; - struct rds_ib_device *rds_ibdev; struct ib_qp_attr qp_attr; int err; - if (event->param.conn.private_data_len) { + if (event->param.conn.private_data_len >= sizeof(*dp)) { dp = event->param.conn.private_data; - rds_ib_set_protocol(conn, + /* make sure it isn't empty data */ + if (dp->dp_protocol_major) { + rds_ib_set_protocol(conn, RDS_PROTOCOL(dp->dp_protocol_major, - dp->dp_protocol_minor)); - rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + dp->dp_protocol_minor)); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } + } + + if (conn->c_version < RDS_PROTOCOL(3,1)) { + printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," + " no longer supported\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); + rds_conn_destroy(conn); + return; + } else { + printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); } - printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", - &conn->c_laddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); + /* + * Init rings and fill recv. this needs to wait until protocol negotiation + * is complete, since ring layout is different from 3.0 to 3.1. + */ + rds_ib_send_init_ring(ic); + rds_ib_recv_init_ring(ic); + /* Post receive buffers - as a side effect, this will update + * the posted credit count. */ + rds_ib_recv_refill(conn, 1); /* Tune RNR behavior */ rds_ib_tune_rnr(ic, &qp_attr); @@ -121,12 +175,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even if (err) printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); - /* update ib_device with this local ipaddr & conn */ - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); - err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); + /* update ib_device with this local ipaddr */ + err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); if (err) - printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); - rds_ib_add_conn(rds_ibdev, conn); + printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", + err); /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ @@ -139,18 +192,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, struct rdma_conn_param *conn_param, struct rds_ib_connect_private *dp, - u32 protocol_version) + u32 protocol_version, + u32 max_responder_resources, + u32 max_initiator_depth) { + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + memset(conn_param, 0, sizeof(struct rdma_conn_param)); - /* XXX tune these? */ - conn_param->responder_resources = 1; - conn_param->initiator_depth = 1; - conn_param->retry_count = 7; + + conn_param->responder_resources = + min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources); + conn_param->initiator_depth = + min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth); + conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); conn_param->rnr_retry_count = 7; if (dp) { - struct rds_ib_connection *ic = conn->c_transport_data; - memset(dp, 0, sizeof(*dp)); dp->dp_saddr = conn->c_laddr; dp->dp_daddr = conn->c_faddr; @@ -175,7 +233,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, static void rds_ib_cq_event_handler(struct ib_event *event, void *data) { - rdsdebug("event %u data %p\n", event->event, data); + rdsdebug("event %u (%s) data %p\n", + event->event, rds_ib_event_str(event->event), data); } static void rds_ib_qp_event_handler(struct ib_event *event, void *data) @@ -183,16 +242,19 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) struct rds_connection *conn = data; struct rds_ib_connection *ic = conn->c_transport_data; - rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); + rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event, + rds_ib_event_str(event->event)); switch (event->event) { case IB_EVENT_COMM_EST: rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); break; default: - printk(KERN_WARNING "RDS/ib: unhandled QP event %u " - "on connection to %pI4\n", event->event, - &conn->c_faddr); + rdsdebug("Fatal QP Event %u (%s) " + "- connection %pI4->%pI4, reconnecting\n", + event->event, rds_ib_event_str(event->event), + &conn->c_laddr, &conn->c_faddr); + rds_conn_drop(conn); break; } } @@ -209,18 +271,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct rds_ib_device *rds_ibdev; int ret; - /* rds_ib_add_one creates a rds_ib_device object per IB device, - * and allocates a protection domain, memory range and FMR pool - * for each. If that fails for any reason, it will not register - * the rds_ibdev at all. + /* + * It's normal to see a null device if an incoming connection races + * with device removal, so we don't print a warning. */ - rds_ibdev = ib_get_client_data(dev, &rds_ib_client); - if (rds_ibdev == NULL) { - if (printk_ratelimit()) - printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", - dev->name); + rds_ibdev = rds_ib_get_client_data(dev); + if (!rds_ibdev) return -EOPNOTSUPP; - } + + /* add the conn now so that connection establishment has the dev */ + rds_ib_add_conn(rds_ibdev, conn); if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); @@ -291,7 +351,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_send_ring.w_nr * sizeof(struct rds_header), &ic->i_send_hdrs_dma, GFP_KERNEL); - if (ic->i_send_hdrs == NULL) { + if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent send failed\n"); goto out; @@ -301,7 +361,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_recv_ring.w_nr * sizeof(struct rds_header), &ic->i_recv_hdrs_dma, GFP_KERNEL); - if (ic->i_recv_hdrs == NULL) { + if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent recv failed\n"); goto out; @@ -309,54 +369,64 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), &ic->i_ack_dma, GFP_KERNEL); - if (ic->i_ack == NULL) { + if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent ack failed\n"); goto out; } - ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); - if (ic->i_sends == NULL) { + ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), + ibdev_to_node(dev)); + if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); goto out; } - rds_ib_send_init_ring(ic); - ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); - if (ic->i_recvs == NULL) { + ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), + ibdev_to_node(dev)); + if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); goto out; } - rds_ib_recv_init_ring(ic); rds_ib_recv_init_ack(ic); - /* Post receive buffers - as a side effect, this will update - * the posted credit count. */ - rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); - rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, ic->i_send_cq, ic->i_recv_cq); out: + rds_ib_dev_put(rds_ibdev); return ret; } -static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp) +static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) { + const struct rds_ib_connect_private *dp = event->param.conn.private_data; u16 common; u32 version = 0; - /* rdma_cm private data is odd - when there is any private data in the + /* + * rdma_cm private data is odd - when there is any private data in the * request, we will be given a pretty large buffer without telling us the * original size. The only way to tell the difference is by looking at * the contents, which are initialized to zero. * If the protocol version fields aren't set, this is a connection attempt * from an older version. This could could be 3.0 or 2.0 - we can't tell. - * We really should have changed this for OFED 1.3 :-( */ - if (dp->dp_protocol_major == 0) + * We really should have changed this for OFED 1.3 :-( + */ + + /* Be paranoid. RDS always has privdata */ + if (!event->param.conn.private_data_len) { + printk(KERN_NOTICE "RDS incoming connection has no private data, " + "rejecting\n"); + return 0; + } + + /* Even if len is crap *now* I still want to check it. -ASG */ + if (event->param.conn.private_data_len < sizeof (*dp) || + dp->dp_protocol_major == 0) return RDS_PROTOCOL_3_0; common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; @@ -364,13 +434,11 @@ static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp) version = RDS_PROTOCOL_3_0; while ((common >>= 1) != 0) version++; - } else if (printk_ratelimit()) { - printk(KERN_NOTICE "RDS: Connection from %pI4 using " - "incompatible protocol version %u.%u\n", - &dp->dp_saddr, - dp->dp_protocol_major, - dp->dp_protocol_minor); - } + } else + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", + &dp->dp_saddr, + dp->dp_protocol_major, + dp->dp_protocol_minor); return version; } @@ -385,10 +453,10 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, struct rds_ib_connection *ic = NULL; struct rdma_conn_param conn_param; u32 version; - int err, destroy = 1; + int err = 1, destroy = 1; /* Check whether the remote protocol version matches ours. */ - version = rds_ib_protocol_compatible(dp); + version = rds_ib_protocol_compatible(event); if (!version) goto out; @@ -424,7 +492,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, /* Wait and see - our connect may still be succeeding */ rds_ib_stats_inc(s_ib_connect_raced); } - mutex_unlock(&conn->c_cm_lock); goto out; } @@ -454,20 +521,20 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, goto out; } - rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, + event->param.conn.responder_resources, + event->param.conn.initiator_depth); /* rdma_accept() calls rdma_reject() internally if it fails */ err = rdma_accept(cm_id, &conn_param); - mutex_unlock(&conn->c_cm_lock); - if (err) { + if (err) rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); - goto out; - } - - return 0; out: - rdma_reject(cm_id, NULL, 0); + if (conn) + mutex_unlock(&conn->c_cm_lock); + if (err) + rdma_reject(cm_id, NULL, 0); return destroy; } @@ -491,8 +558,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) goto out; } - rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); - + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, + UINT_MAX, UINT_MAX); ret = rdma_connect(cm_id, &conn_param); if (ret) rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); @@ -517,7 +584,7 @@ int rds_ib_conn_connect(struct rds_connection *conn) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP); + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); ic->i_cm_id = NULL; @@ -576,9 +643,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ic->i_cm_id, err); } + /* + * We want to wait for tx and rx completion to finish + * before we tear down the connection, but we have to be + * careful not to get stuck waiting on a send ring that + * only has unsignaled sends in it. We've shutdown new + * sends before getting here so by waiting for signaled + * sends to complete we're ensured that there will be no + * more tx processing. + */ wait_event(rds_ib_ring_empty_wait, - rds_ib_ring_empty(&ic->i_send_ring) && - rds_ib_ring_empty(&ic->i_recv_ring)); + rds_ib_ring_empty(&ic->i_recv_ring) && + (atomic_read(&ic->i_signaled_sends) == 0)); + tasklet_kill(&ic->i_recv_tasklet); if (ic->i_send_hdrs) ib_dma_free_coherent(dev, @@ -629,9 +706,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) BUG_ON(ic->rds_ibdev); /* Clear pending transmit */ - if (ic->i_rm) { - rds_message_put(ic->i_rm); - ic->i_rm = NULL; + if (ic->i_data_op) { + struct rds_message *rm; + + rm = container_of(ic->i_data_op, struct rds_message, data); + rds_message_put(rm); + ic->i_data_op = NULL; } /* Clear the ACK state */ @@ -665,17 +745,27 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) { struct rds_ib_connection *ic; unsigned long flags; + int ret; /* XXX too lazy? */ - ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); - if (ic == NULL) + ic = kzalloc(sizeof(struct rds_ib_connection), gfp); + if (!ic) return -ENOMEM; + ret = rds_ib_recv_alloc_caches(ic); + if (ret) { + kfree(ic); + return ret; + } + INIT_LIST_HEAD(&ic->ib_node); + tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, + (unsigned long) ic); mutex_init(&ic->i_recv_mutex); #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&ic->i_ack_lock); #endif + atomic_set(&ic->i_signaled_sends, 0); /* * rds_ib_conn_shutdown() waits for these to be emptied so they @@ -717,6 +807,8 @@ void rds_ib_conn_free(void *arg) list_del(&ic->ib_node); spin_unlock_irq(lock_ptr); + rds_ib_recv_free_caches(ic); + kfree(ic); } diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 81033af9302..e8fdb172adb 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -31,11 +31,15 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/rculist.h> +#include <linux/llist.h> #include "rds.h" -#include "rdma.h" #include "ib.h" +static DEFINE_PER_CPU(unsigned long, clean_list_grace); +#define CLEAN_LIST_BUSY_BIT 0 /* * This is stored as mr->r_trans_private. @@ -44,7 +48,11 @@ struct rds_ib_mr { struct rds_ib_device *device; struct rds_ib_mr_pool *pool; struct ib_fmr *fmr; - struct list_head list; + + struct llist_node llnode; + + /* unmap_list is for freeing */ + struct list_head unmap_list; unsigned int remap_count; struct scatterlist *sg; @@ -58,14 +66,16 @@ struct rds_ib_mr { */ struct rds_ib_mr_pool { struct mutex flush_lock; /* serialize fmr invalidate */ - struct work_struct flush_worker; /* flush worker */ + struct delayed_work flush_worker; /* flush worker */ - spinlock_t list_lock; /* protect variables below */ atomic_t item_count; /* total # of MRs */ atomic_t dirty_count; /* # dirty of MRs */ - struct list_head drop_list; /* MRs that have reached their max_maps limit */ - struct list_head free_list; /* unused MRs */ - struct list_head clean_list; /* unused & unamapped MRs */ + + struct llist_head drop_list; /* MRs that have reached their max_maps limit */ + struct llist_head free_list; /* unused MRs */ + struct llist_head clean_list; /* global unused & unamapped MRs */ + wait_queue_head_t flush_wait; + atomic_t free_pinned; /* memory pinned by free MRs */ unsigned long max_items; unsigned long max_items_soft; @@ -73,7 +83,7 @@ struct rds_ib_mr_pool { struct ib_fmr_attr fmr_attr; }; -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all); +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); static void rds_ib_mr_pool_flush_worker(struct work_struct *work); @@ -82,16 +92,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) struct rds_ib_device *rds_ibdev; struct rds_ib_ipaddr *i_ipaddr; - list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { - spin_lock_irq(&rds_ibdev->spinlock); - list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { - spin_unlock_irq(&rds_ibdev->spinlock); + atomic_inc(&rds_ibdev->refcount); + rcu_read_unlock(); return rds_ibdev; } } - spin_unlock_irq(&rds_ibdev->spinlock); } + rcu_read_unlock(); return NULL; } @@ -107,7 +118,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) i_ipaddr->ipaddr = ipaddr; spin_lock_irq(&rds_ibdev->spinlock); - list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list); + list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); spin_unlock_irq(&rds_ibdev->spinlock); return 0; @@ -115,17 +126,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { - struct rds_ib_ipaddr *i_ipaddr, *next; + struct rds_ib_ipaddr *i_ipaddr; + struct rds_ib_ipaddr *to_free = NULL; + spin_lock_irq(&rds_ibdev->spinlock); - list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { - list_del(&i_ipaddr->list); - kfree(i_ipaddr); + list_del_rcu(&i_ipaddr->list); + to_free = i_ipaddr; break; } } spin_unlock_irq(&rds_ibdev->spinlock); + + if (to_free) { + synchronize_rcu(); + kfree(to_free); + } } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) @@ -133,8 +151,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr); - if (rds_ibdev_old) + if (rds_ibdev_old) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + rds_ib_dev_put(rds_ibdev_old); + } return rds_ib_add_ipaddr(rds_ibdev, ipaddr); } @@ -149,12 +169,13 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); - spin_lock_irq(&rds_ibdev->spinlock); + spin_lock(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); - spin_unlock_irq(&rds_ibdev->spinlock); + spin_unlock(&rds_ibdev->spinlock); spin_unlock_irq(&ib_nodev_conns_lock); ic->rds_ibdev = rds_ibdev; + atomic_inc(&rds_ibdev->refcount); } void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) @@ -174,24 +195,21 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection * spin_unlock(&ib_nodev_conns_lock); ic->rds_ibdev = NULL; + rds_ib_dev_put(rds_ibdev); } -void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock) +void rds_ib_destroy_nodev_conns(void) { struct rds_ib_connection *ic, *_ic; LIST_HEAD(tmp_list); /* avoid calling conn_destroy with irqs off */ - spin_lock_irq(list_lock); - list_splice(list, &tmp_list); - INIT_LIST_HEAD(list); - spin_unlock_irq(list_lock); - - list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) { - if (ic->conn->c_passive) - rds_conn_destroy(ic->conn->c_passive); + spin_lock_irq(&ib_nodev_conns_lock); + list_splice(&ib_nodev_conns, &tmp_list); + spin_unlock_irq(&ib_nodev_conns_lock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) rds_conn_destroy(ic->conn); - } } struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) @@ -202,16 +220,16 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) if (!pool) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&pool->free_list); - INIT_LIST_HEAD(&pool->drop_list); - INIT_LIST_HEAD(&pool->clean_list); + init_llist_head(&pool->free_list); + init_llist_head(&pool->drop_list); + init_llist_head(&pool->clean_list); mutex_init(&pool->flush_lock); - spin_lock_init(&pool->list_lock); - INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); + init_waitqueue_head(&pool->flush_wait); + INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); pool->fmr_attr.max_pages = fmr_message_size; pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; - pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift; + pool->fmr_attr.page_shift = PAGE_SHIFT; pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4; /* We never allow more than max_items MRs to be allocated. @@ -235,34 +253,52 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { - flush_workqueue(rds_wq); - rds_ib_flush_mr_pool(pool, 1); - BUG_ON(atomic_read(&pool->item_count)); - BUG_ON(atomic_read(&pool->free_pinned)); + cancel_delayed_work_sync(&pool->flush_worker); + rds_ib_flush_mr_pool(pool, 1, NULL); + WARN_ON(atomic_read(&pool->item_count)); + WARN_ON(atomic_read(&pool->free_pinned)); kfree(pool); } static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; - unsigned long flags; + struct llist_node *ret; + unsigned long *flag; - spin_lock_irqsave(&pool->list_lock, flags); - if (!list_empty(&pool->clean_list)) { - ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list); - list_del_init(&ibmr->list); - } - spin_unlock_irqrestore(&pool->list_lock, flags); + preempt_disable(); + flag = &__get_cpu_var(clean_list_grace); + set_bit(CLEAN_LIST_BUSY_BIT, flag); + ret = llist_del_first(&pool->clean_list); + if (ret) + ibmr = llist_entry(ret, struct rds_ib_mr, llnode); + clear_bit(CLEAN_LIST_BUSY_BIT, flag); + preempt_enable(); return ibmr; } +static inline void wait_clean_list_grace(void) +{ + int cpu; + unsigned long *flag; + + for_each_online_cpu(cpu) { + flag = &per_cpu(clean_list_grace, cpu); + while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) + cpu_relax(); + } +} + static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) { struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; struct rds_ib_mr *ibmr = NULL; int err = 0, iter = 0; + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) + schedule_delayed_work(&pool->flush_worker, 10); + while (1) { ibmr = rds_ib_reuse_fmr(pool); if (ibmr) @@ -289,19 +325,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) /* We do have some empty MRs. Flush them out. */ rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, &ibmr); + if (ibmr) + return ibmr; } - ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); if (!ibmr) { err = -ENOMEM; goto out_no_cigar; } + memset(ibmr, 0, sizeof(*ibmr)); + ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE), + IB_ACCESS_REMOTE_WRITE| + IB_ACCESS_REMOTE_ATOMIC), &pool->fmr_attr); if (IS_ERR(ibmr->fmr)) { err = PTR_ERR(ibmr->fmr); @@ -349,13 +390,13 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); - if (dma_addr & ~rds_ibdev->fmr_page_mask) { + if (dma_addr & ~PAGE_MASK) { if (i > 0) return -EINVAL; else ++page_cnt; } - if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) { + if ((dma_addr + dma_len) & ~PAGE_MASK) { if (i < sg_dma_len - 1) return -EINVAL; else @@ -365,11 +406,12 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm len += dma_len; } - page_cnt += len >> rds_ibdev->fmr_page_shift; + page_cnt += len >> PAGE_SHIFT; if (page_cnt > fmr_message_size) return -EINVAL; - dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); + dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); if (!dma_pages) return -ENOMEM; @@ -378,9 +420,9 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); - for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size) + for (j = 0; j < dma_len; j += PAGE_SIZE) dma_pages[page_cnt++] = - (dma_addr & rds_ibdev->fmr_page_mask) + j; + (dma_addr & PAGE_MASK) + j; } ret = ib_map_phys_fmr(ibmr->fmr, @@ -443,6 +485,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) /* FIXME we need a way to tell a r/w MR * from a r/o MR */ + BUG_ON(irqs_disabled()); set_page_dirty(page); put_page(page); } @@ -478,33 +521,107 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr } /* + * given an llist of mrs, put them all into the list_head for more processing + */ +static void llist_append_to_list(struct llist_head *llist, struct list_head *list) +{ + struct rds_ib_mr *ibmr; + struct llist_node *node; + struct llist_node *next; + + node = llist_del_all(llist); + while (node) { + next = node->next; + ibmr = llist_entry(node, struct rds_ib_mr, llnode); + list_add_tail(&ibmr->unmap_list, list); + node = next; + } +} + +/* + * this takes a list head of mrs and turns it into linked llist nodes + * of clusters. Each cluster has linked llist nodes of + * MR_CLUSTER_SIZE mrs that are ready for reuse. + */ +static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, + struct list_head *list, + struct llist_node **nodes_head, + struct llist_node **nodes_tail) +{ + struct rds_ib_mr *ibmr; + struct llist_node *cur = NULL; + struct llist_node **next = nodes_head; + + list_for_each_entry(ibmr, list, unmap_list) { + cur = &ibmr->llnode; + *next = cur; + next = &cur->next; + } + *next = NULL; + *nodes_tail = cur; +} + +/* * Flush our pool of MRs. * At a minimum, all currently unused MRs are unmapped. * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to this limit. */ -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, + int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr, *next; + struct llist_node *clean_nodes; + struct llist_node *clean_tail; LIST_HEAD(unmap_list); LIST_HEAD(fmr_list); unsigned long unpinned = 0; - unsigned long flags; unsigned int nfreed = 0, ncleaned = 0, free_goal; int ret = 0; rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); - mutex_lock(&pool->flush_lock); + if (ibmr_ret) { + DEFINE_WAIT(wait); + while(!mutex_trylock(&pool->flush_lock)) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + + prepare_to_wait(&pool->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (llist_empty(&pool->clean_list)) + schedule(); + + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + } + finish_wait(&pool->flush_wait, &wait); + } else + mutex_lock(&pool->flush_lock); + + if (ibmr_ret) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + goto out; + } + } - spin_lock_irqsave(&pool->list_lock, flags); /* Get the list of all MRs to be dropped. Ordering matters - - * we want to put drop_list ahead of free_list. */ - list_splice_init(&pool->free_list, &unmap_list); - list_splice_init(&pool->drop_list, &unmap_list); + * we want to put drop_list ahead of free_list. + */ + llist_append_to_list(&pool->drop_list, &unmap_list); + llist_append_to_list(&pool->free_list, &unmap_list); if (free_all) - list_splice_init(&pool->clean_list, &unmap_list); - spin_unlock_irqrestore(&pool->list_lock, flags); + llist_append_to_list(&pool->clean_list, &unmap_list); free_goal = rds_ib_flush_goal(pool, free_all); @@ -512,19 +629,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) goto out; /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ - list_for_each_entry(ibmr, &unmap_list, list) + list_for_each_entry(ibmr, &unmap_list, unmap_list) list_add(&ibmr->fmr->list, &fmr_list); + ret = ib_unmap_fmr(&fmr_list); if (ret) printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); /* Now we can destroy the DMA mapping and unpin any pages */ - list_for_each_entry_safe(ibmr, next, &unmap_list, list) { + list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { unpinned += ibmr->sg_len; __rds_ib_teardown_mr(ibmr); if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { rds_ib_stats_inc(s_ib_rdma_mr_free); - list_del(&ibmr->list); + list_del(&ibmr->unmap_list); ib_dealloc_fmr(ibmr->fmr); kfree(ibmr); nfreed++; @@ -532,9 +650,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) ncleaned++; } - spin_lock_irqsave(&pool->list_lock, flags); - list_splice(&unmap_list, &pool->clean_list); - spin_unlock_irqrestore(&pool->list_lock, flags); + if (!list_empty(&unmap_list)) { + /* we have to make sure that none of the things we're about + * to put on the clean list would race with other cpus trying + * to pull items off. The llist would explode if we managed to + * remove something from the clean list and then add it back again + * while another CPU was spinning on that same item in llist_del_first. + * + * This is pretty unlikely, but just in case wait for an llist grace period + * here before adding anything back into the clean list. + */ + wait_clean_list_grace(); + + list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail); + if (ibmr_ret) + *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); + + /* more than one entry in llist nodes */ + if (clean_nodes->next) + llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list); + + } atomic_sub(unpinned, &pool->free_pinned); atomic_sub(ncleaned, &pool->dirty_count); @@ -542,14 +678,17 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) out: mutex_unlock(&pool->flush_lock); + if (waitqueue_active(&pool->flush_wait)) + wake_up(&pool->flush_wait); +out_nolock: return ret; } static void rds_ib_mr_pool_flush_worker(struct work_struct *work) { - struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); + struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, NULL); } void rds_ib_free_mr(void *trans_private, int invalidate) @@ -557,47 +696,48 @@ void rds_ib_free_mr(void *trans_private, int invalidate) struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; - unsigned long flags; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); /* Return it to the pool's free list */ - spin_lock_irqsave(&pool->list_lock, flags); if (ibmr->remap_count >= pool->fmr_attr.max_maps) - list_add(&ibmr->list, &pool->drop_list); + llist_add(&ibmr->llnode, &pool->drop_list); else - list_add(&ibmr->list, &pool->free_list); + llist_add(&ibmr->llnode, &pool->free_list); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); - spin_unlock_irqrestore(&pool->list_lock, flags); /* If we've pinned too many pages, request a flush */ - if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned - || atomic_read(&pool->dirty_count) >= pool->max_items / 10) - queue_work(rds_wq, &pool->flush_worker); + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || + atomic_read(&pool->dirty_count) >= pool->max_items / 10) + schedule_delayed_work(&pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked * as use_once and invalidate at the same time. */ - queue_work(rds_wq, &pool->flush_worker); + schedule_delayed_work(&pool->flush_worker, 10); } } + + rds_ib_dev_put(rds_ibdev); } void rds_ib_flush_mrs(void) { struct rds_ib_device *rds_ibdev; + down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; if (pool) - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, NULL); } + up_read(&rds_ib_devices_lock); } void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, @@ -629,6 +769,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); ibmr->device = rds_ibdev; + rds_ibdev = NULL; out: if (ret) { @@ -636,5 +777,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, rds_ib_free_mr(ibmr, 0); ibmr = ERR_PTR(ret); } + if (rds_ibdev) + rds_ib_dev_put(rds_ibdev); return ibmr; } + diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 36d931573ff..d67de453c35 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -31,6 +31,7 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/pci.h> #include <linux/dma-mapping.h> #include <rdma/rdma_cm.h> @@ -42,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab; static struct kmem_cache *rds_ib_frag_slab; static atomic_t rds_ib_allocation = ATOMIC_INIT(0); -static void rds_ib_frag_drop_page(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, frag->f_page); - __free_page(frag->f_page); - frag->f_page = NULL; -} - -static void rds_ib_frag_free(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, frag->f_page); - BUG_ON(frag->f_page != NULL); - kmem_cache_free(rds_ib_frag_slab, frag); -} - -/* - * We map a page at a time. Its fragments are posted in order. This - * is called in fragment order as the fragments get send completion events. - * Only the last frag in the page performs the unmapping. - * - * It's OK for ring cleanup to call this in whatever order it likes because - * DMA is not in flight and so we can unmap while other ring entries still - * hold page references in their frags. - */ -static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic, - struct rds_ib_recv_work *recv) -{ - struct rds_page_frag *frag = recv->r_frag; - - rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); - if (frag->f_mapped) - ib_dma_unmap_page(ic->i_cm_id->device, - frag->f_mapped, - RDS_FRAG_SIZE, DMA_FROM_DEVICE); - frag->f_mapped = 0; -} - void rds_ib_recv_init_ring(struct rds_ib_connection *ic) { struct rds_ib_recv_work *recv; @@ -94,18 +59,163 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) recv->r_wr.sg_list = recv->r_sge; recv->r_wr.num_sge = RDS_IB_RECV_SGE; - sge = rds_ib_data_sge(ic, recv->r_sge); + sge = &recv->r_sge[0]; + sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + + sge = &recv->r_sge[1]; sge->addr = 0; sge->length = RDS_FRAG_SIZE; sge->lkey = ic->i_mr->lkey; + } +} - sge = rds_ib_header_sge(ic, recv->r_sge); - sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; +/* + * The entire 'from' list, including the from element itself, is put on + * to the tail of the 'to' list. + */ +static void list_splice_entire_tail(struct list_head *from, + struct list_head *to) +{ + struct list_head *from_last = from->prev; + + list_splice_tail(from_last, to); + list_add_tail(from_last, to); +} + +static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache) +{ + struct list_head *tmp; + + tmp = xchg(&cache->xfer, NULL); + if (tmp) { + if (cache->ready) + list_splice_entire_tail(tmp, cache->ready); + else + cache->ready = tmp; } } +static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) +{ + struct rds_ib_cache_head *head; + int cpu; + + cache->percpu = alloc_percpu(struct rds_ib_cache_head); + if (!cache->percpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + head->first = NULL; + head->count = 0; + } + cache->xfer = NULL; + cache->ready = NULL; + + return 0; +} + +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic) +{ + int ret; + + ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs); + if (!ret) { + ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags); + if (ret) + free_percpu(ic->i_cache_incs.percpu); + } + + return ret; +} + +static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache, + struct list_head *caller_list) +{ + struct rds_ib_cache_head *head; + int cpu; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + if (head->first) { + list_splice_entire_tail(head->first, caller_list); + head->first = NULL; + } + } + + if (cache->ready) { + list_splice_entire_tail(cache->ready, caller_list); + cache->ready = NULL; + } +} + +void rds_ib_recv_free_caches(struct rds_ib_connection *ic) +{ + struct rds_ib_incoming *inc; + struct rds_ib_incoming *inc_tmp; + struct rds_page_frag *frag; + struct rds_page_frag *frag_tmp; + LIST_HEAD(list); + + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list); + free_percpu(ic->i_cache_incs.percpu); + + list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) { + list_del(&inc->ii_cache_entry); + WARN_ON(!list_empty(&inc->ii_frags)); + kmem_cache_free(rds_ib_incoming_slab, inc); + } + + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); + rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list); + free_percpu(ic->i_cache_frags.percpu); + + list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) { + list_del(&frag->f_cache_entry); + WARN_ON(!list_empty(&frag->f_item)); + kmem_cache_free(rds_ib_frag_slab, frag); + } +} + +/* fwd decl */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache); +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache); + + +/* Recycle frag and attached recv buffer f_sg */ +static void rds_ib_frag_free(struct rds_ib_connection *ic, + struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); + + rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); +} + +/* Recycle inc after freeing attached frags */ +void rds_ib_inc_free(struct rds_incoming *inc) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + struct rds_ib_connection *ic = inc->i_conn->c_transport_data; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + + /* Free attached frags */ + list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + rds_ib_frag_free(ic, frag); + } + BUG_ON(!list_empty(&ibinc->ii_frags)); + + rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); + rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs); +} + static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, struct rds_ib_recv_work *recv) { @@ -114,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, recv->r_ibinc = NULL; } if (recv->r_frag) { - rds_ib_recv_unmap_page(ic, recv); - if (recv->r_frag->f_page) - rds_ib_frag_drop_page(recv->r_frag); - rds_ib_frag_free(recv->r_frag); + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); + rds_ib_frag_free(ic, recv->r_frag); recv->r_frag = NULL; } } @@ -128,83 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) for (i = 0; i < ic->i_recv_ring.w_nr; i++) rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); +} - if (ic->i_frag.f_page) - rds_ib_frag_drop_page(&ic->i_frag); +static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic, + gfp_t slab_mask) +{ + struct rds_ib_incoming *ibinc; + struct list_head *cache_item; + int avail_allocs; + + cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs); + if (cache_item) { + ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry); + } else { + avail_allocs = atomic_add_unless(&rds_ib_allocation, + 1, rds_ib_sysctl_max_recv_allocation); + if (!avail_allocs) { + rds_ib_stats_inc(s_ib_rx_alloc_limit); + return NULL; + } + ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask); + if (!ibinc) { + atomic_dec(&rds_ib_allocation); + return NULL; + } + } + INIT_LIST_HEAD(&ibinc->ii_frags); + rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); + + return ibinc; +} + +static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic, + gfp_t slab_mask, gfp_t page_mask) +{ + struct rds_page_frag *frag; + struct list_head *cache_item; + int ret; + + cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); + if (cache_item) { + frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); + } else { + frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); + if (!frag) + return NULL; + + sg_init_table(&frag->f_sg, 1); + ret = rds_page_remainder_alloc(&frag->f_sg, + RDS_FRAG_SIZE, page_mask); + if (ret) { + kmem_cache_free(rds_ib_frag_slab, frag); + return NULL; + } + } + + INIT_LIST_HEAD(&frag->f_item); + + return frag; } static int rds_ib_recv_refill_one(struct rds_connection *conn, - struct rds_ib_recv_work *recv, - gfp_t kptr_gfp, gfp_t page_gfp) + struct rds_ib_recv_work *recv, int prefill) { struct rds_ib_connection *ic = conn->c_transport_data; - dma_addr_t dma_addr; struct ib_sge *sge; int ret = -ENOMEM; + gfp_t slab_mask = GFP_NOWAIT; + gfp_t page_mask = GFP_NOWAIT; - if (recv->r_ibinc == NULL) { - if (atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) { - rds_ib_stats_inc(s_ib_rx_alloc_limit); - goto out; - } - recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, - kptr_gfp); - if (recv->r_ibinc == NULL) - goto out; - atomic_inc(&rds_ib_allocation); - INIT_LIST_HEAD(&recv->r_ibinc->ii_frags); - rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); + if (prefill) { + slab_mask = GFP_KERNEL; + page_mask = GFP_HIGHUSER; } - if (recv->r_frag == NULL) { - recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); - if (recv->r_frag == NULL) - goto out; - INIT_LIST_HEAD(&recv->r_frag->f_item); - recv->r_frag->f_page = NULL; - } + if (!ic->i_cache_incs.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + if (!ic->i_cache_frags.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); - if (ic->i_frag.f_page == NULL) { - ic->i_frag.f_page = alloc_page(page_gfp); - if (ic->i_frag.f_page == NULL) + /* + * ibinc was taken from recv if recv contained the start of a message. + * recvs that were continuations will still have this allocated. + */ + if (!recv->r_ibinc) { + recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask); + if (!recv->r_ibinc) goto out; - ic->i_frag.f_offset = 0; } - dma_addr = ib_dma_map_page(ic->i_cm_id->device, - ic->i_frag.f_page, - ic->i_frag.f_offset, - RDS_FRAG_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) + WARN_ON(recv->r_frag); /* leak! */ + recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask); + if (!recv->r_frag) goto out; - /* - * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() - * must be called on this recv. This happens as completions hit - * in order or on connection shutdown. - */ - recv->r_frag->f_page = ic->i_frag.f_page; - recv->r_frag->f_offset = ic->i_frag.f_offset; - recv->r_frag->f_mapped = dma_addr; - - sge = rds_ib_data_sge(ic, recv->r_sge); - sge->addr = dma_addr; - sge->length = RDS_FRAG_SIZE; + ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, + 1, DMA_FROM_DEVICE); + WARN_ON(ret != 1); - sge = rds_ib_header_sge(ic, recv->r_sge); + sge = &recv->r_sge[0]; sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); sge->length = sizeof(struct rds_header); - get_page(recv->r_frag->f_page); - - if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { - ic->i_frag.f_offset += RDS_FRAG_SIZE; - } else { - put_page(ic->i_frag.f_page); - ic->i_frag.f_page = NULL; - ic->i_frag.f_offset = 0; - } + sge = &recv->r_sge[1]; + sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg); + sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg); ret = 0; out: @@ -214,13 +350,11 @@ out: /* * This tries to allocate and post unused work requests after making sure that * they have all the allocations they need to queue received fragments into - * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc - * pairs don't go unmatched. + * sockets. * * -1 is returned if posting fails due to temporary resource exhaustion. */ -int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill) +void rds_ib_recv_refill(struct rds_connection *conn, int prefill) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_recv_work *recv; @@ -229,33 +363,33 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, int ret = 0; u32 pos; - while ((prefill || rds_conn_up(conn)) - && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + while ((prefill || rds_conn_up(conn)) && + rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { if (pos >= ic->i_recv_ring.w_nr) { printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", pos); - ret = -EINVAL; break; } recv = &ic->i_recvs[pos]; - ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); + ret = rds_ib_recv_refill_one(conn, recv, prefill); if (ret) { - ret = -1; break; } /* XXX when can this fail? */ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, - recv->r_ibinc, recv->r_frag->f_page, - (long) recv->r_frag->f_mapped, ret); + recv->r_ibinc, sg_page(&recv->r_frag->f_sg), + (long) ib_sg_dma_address( + ic->i_cm_id->device, + &recv->r_frag->f_sg), + ret); if (ret) { rds_ib_conn_error(conn, "recv post on " "%pI4 returned %d, disconnecting and " "reconnecting\n", &conn->c_faddr, ret); - ret = -1; break; } @@ -268,37 +402,74 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, if (ret) rds_ib_ring_unalloc(&ic->i_recv_ring, 1); - return ret; } -void rds_ib_inc_purge(struct rds_incoming *inc) +/* + * We want to recycle several types of recv allocations, like incs and frags. + * To use this, the *_free() function passes in the ptr to a list_head within + * the recyclee, as well as the cache to put it on. + * + * First, we put the memory on a percpu list. When this reaches a certain size, + * We move it to an intermediate non-percpu list in a lockless manner, with some + * xchg/compxchg wizardry. + * + * N.B. Instead of a list_head as the anchor, we use a single pointer, which can + * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and + * list_empty() will return true with one element is actually present. + */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache) { - struct rds_ib_incoming *ibinc; - struct rds_page_frag *frag; - struct rds_page_frag *pos; + unsigned long flags; + struct list_head *old, *chpfirst; - ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); - rdsdebug("purging ibinc %p inc %p\n", ibinc, inc); + local_irq_save(flags); - list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { - list_del_init(&frag->f_item); - rds_ib_frag_drop_page(frag); - rds_ib_frag_free(frag); - } + chpfirst = __this_cpu_read(cache->percpu->first); + if (!chpfirst) + INIT_LIST_HEAD(new_item); + else /* put on front */ + list_add_tail(new_item, chpfirst); + + __this_cpu_write(cache->percpu->first, new_item); + __this_cpu_inc(cache->percpu->count); + + if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT) + goto end; + + /* + * Return our per-cpu first list to the cache's xfer by atomically + * grabbing the current xfer list, appending it to our per-cpu list, + * and then atomically returning that entire list back to the + * cache's xfer list as long as it's still empty. + */ + do { + old = xchg(&cache->xfer, NULL); + if (old) + list_splice_entire_tail(old, chpfirst); + old = cmpxchg(&cache->xfer, NULL, chpfirst); + } while (old); + + + __this_cpu_write(cache->percpu->first, NULL); + __this_cpu_write(cache->percpu->count, 0); +end: + local_irq_restore(flags); } -void rds_ib_inc_free(struct rds_incoming *inc) +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache) { - struct rds_ib_incoming *ibinc; - - ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + struct list_head *head = cache->ready; + + if (head) { + if (!list_empty(head)) { + cache->ready = head->next; + list_del_init(head); + } else + cache->ready = NULL; + } - rds_ib_inc_purge(inc); - rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); - BUG_ON(!list_empty(&ibinc->ii_frags)); - kmem_cache_free(rds_ib_incoming_slab, ibinc); - atomic_dec(&rds_ib_allocation); - BUG_ON(atomic_read(&rds_ib_allocation) < 0); + return head; } int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, @@ -334,13 +505,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, to_copy = min_t(unsigned long, to_copy, len - copied); rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " - "[%p, %lu] + %lu\n", + "[%p, %u] + %lu\n", to_copy, iov->iov_base, iov->iov_len, iov_off, - frag->f_page, frag->f_offset, frag_off); + sg_page(&frag->f_sg), frag->f_sg.offset, frag_off); /* XXX needs + offset for multiple recvs per page */ - ret = rds_page_copy_to_user(frag->f_page, - frag->f_offset + frag_off, + ret = rds_page_copy_to_user(sg_page(&frag->f_sg), + frag->f_sg.offset + frag_off, iov->iov_base + iov_off, to_copy); if (ret) { @@ -427,7 +598,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -435,7 +606,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, static u64 rds_ib_get_ack(struct rds_ib_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } @@ -467,8 +638,8 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); rds_ib_stats_inc(s_ib_ack_send_failure); - /* Need to finesse this later. */ - BUG(); + + rds_ib_conn_error(ic->conn, "sending ack failed\n"); } else rds_ib_stats_inc(s_ib_ack_sent); } @@ -524,7 +695,7 @@ void rds_ib_attempt_ack(struct rds_ib_connection *ic) } /* Can we get a send credit? */ - if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) { + if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { rds_ib_stats_inc(s_ib_tx_throttle); clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); return; @@ -596,7 +767,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ - addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); + addr = kmap_atomic(sg_page(&frag->f_sg)); src = addr + frag_off; dst = (void *)map->m_page_addrs[map_page] + map_off; @@ -606,7 +777,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, uncongested |= ~(*src) & *dst; *dst++ = *src++; } - kunmap_atomic(addr, KM_SOFTIRQ0); + kunmap_atomic(addr); copied += to_copy; @@ -645,7 +816,7 @@ struct rds_ib_ack_state { }; static void rds_ib_process_recv(struct rds_connection *conn, - struct rds_ib_recv_work *recv, u32 byte_len, + struct rds_ib_recv_work *recv, u32 data_len, struct rds_ib_ack_state *state) { struct rds_ib_connection *ic = conn->c_transport_data; @@ -655,17 +826,17 @@ static void rds_ib_process_recv(struct rds_connection *conn, /* XXX shut down the connection if port 0,0 are seen? */ rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv, - byte_len); + data_len); - if (byte_len < sizeof(struct rds_header)) { + if (data_len < sizeof(struct rds_header)) { rds_ib_conn_error(conn, "incoming message " - "from %pI4 didn't inclue a " + "from %pI4 didn't include a " "header, disconnecting and " "reconnecting\n", &conn->c_faddr); return; } - byte_len -= sizeof(struct rds_header); + data_len -= sizeof(struct rds_header); ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; @@ -687,7 +858,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, if (ihdr->h_credit) rds_ib_send_add_credits(conn, ihdr->h_credit); - if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) { /* This is an ACK-only packet. The fact that it gets * special treatment here is that historically, ACKs * were rather special beasts. @@ -699,12 +870,12 @@ static void rds_ib_process_recv(struct rds_connection *conn, * the inc is freed. We don't go that route, so we have to drop the * page ref ourselves. We can't just leave the page on the recv * because that confuses the dma mapping of pages and each recv's use - * of a partial page. We can leave the frag, though, it will be - * reused. + * of a partial page. * * FIXME: Fold this into the code path below. */ - rds_ib_frag_drop_page(recv->r_frag); + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; return; } @@ -714,7 +885,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, * into the inc and save the inc so we can hang upcoming fragments * off its list. */ - if (ibinc == NULL) { + if (!ibinc) { ibinc = recv->r_ibinc; recv->r_ibinc = NULL; ic->i_ibinc = ibinc; @@ -729,10 +900,10 @@ static void rds_ib_process_recv(struct rds_connection *conn, hdr = &ibinc->ii_inc.i_hdr; /* We can't just use memcmp here; fragments of a * single message may carry different ACKs */ - if (hdr->h_sequence != ihdr->h_sequence - || hdr->h_len != ihdr->h_len - || hdr->h_sport != ihdr->h_sport - || hdr->h_dport != ihdr->h_dport) { + if (hdr->h_sequence != ihdr->h_sequence || + hdr->h_len != ihdr->h_len || + hdr->h_sport != ihdr->h_sport || + hdr->h_dport != ihdr->h_dport) { rds_ib_conn_error(conn, "fragment header mismatch; forcing reconnect\n"); return; @@ -752,8 +923,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, rds_ib_cong_recv(conn, ibinc); else { rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, - &ibinc->ii_inc, GFP_ATOMIC, - KM_SOFTIRQ0); + &ibinc->ii_inc, GFP_ATOMIC); state->ack_next = be64_to_cpu(hdr->h_sequence); state->ack_next_valid = 1; } @@ -783,45 +953,67 @@ void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) { struct rds_connection *conn = context; struct rds_ib_connection *ic = conn->c_transport_data; - struct ib_wc wc; - struct rds_ib_ack_state state = { 0, }; - struct rds_ib_recv_work *recv; rdsdebug("conn %p cq %p\n", conn, cq); rds_ib_stats_inc(s_ib_rx_cq_call); - ib_req_notify_cq(cq, IB_CQ_SOLICITED); + tasklet_schedule(&ic->i_recv_tasklet); +} - while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, +static inline void rds_poll_cq(struct rds_ib_connection *ic, + struct rds_ib_ack_state *state) +{ + struct rds_connection *conn = ic->conn; + struct ib_wc wc; + struct rds_ib_recv_work *recv; + + while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, + rds_ib_wc_status_str(wc.status), wc.byte_len, be32_to_cpu(wc.ex.imm_data)); rds_ib_stats_inc(s_ib_rx_cq_event); recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; - rds_ib_recv_unmap_page(ic, recv); + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); /* * Also process recvs in connecting state because it is possible * to get a recv completion _before_ the rdmacm ESTABLISHED * event is processed. */ - if (rds_conn_up(conn) || rds_conn_connecting(conn)) { + if (wc.status == IB_WC_SUCCESS) { + rds_ib_process_recv(conn, recv, wc.byte_len, state); + } else { /* We expect errors as the qp is drained during shutdown */ - if (wc.status == IB_WC_SUCCESS) { - rds_ib_process_recv(conn, recv, wc.byte_len, &state); - } else { - rds_ib_conn_error(conn, "recv completion on " - "%pI4 had status %u, disconnecting and " - "reconnecting\n", &conn->c_faddr, - wc.status); - } + if (rds_conn_up(conn) || rds_conn_connecting(conn)) + rds_ib_conn_error(conn, "recv completion on %pI4 had " + "status %u (%s), disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc.status, + rds_ib_wc_status_str(wc.status)); } + /* + * It's very important that we only free this ring entry if we've truly + * freed the resources allocated to the entry. The refilling path can + * leak if we don't. + */ rds_ib_ring_free(&ic->i_recv_ring, 1); } +} + +void rds_ib_recv_tasklet_fn(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *) data; + struct rds_connection *conn = ic->conn; + struct rds_ib_ack_state state = { 0, }; + + rds_poll_cq(ic, &state); + ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + rds_poll_cq(ic, &state); if (state.ack_next_valid) rds_ib_set_ack(ic, state.ack_next, state.ack_required); @@ -838,11 +1030,8 @@ void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) if (rds_ib_ring_empty(&ic->i_recv_ring)) rds_ib_stats_inc(s_ib_rx_ring_empty); - /* - * If the ring is running low, then schedule the thread to refill. - */ if (rds_ib_ring_low(&ic->i_recv_ring)) - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + rds_ib_recv_refill(conn, 0); } int rds_ib_recv(struct rds_connection *conn) @@ -851,25 +1040,13 @@ int rds_ib_recv(struct rds_connection *conn) int ret = 0; rdsdebug("conn %p\n", conn); - - /* - * If we get a temporary posting failure in this context then - * we're really low and we want the caller to back off for a bit. - */ - mutex_lock(&ic->i_recv_mutex); - if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) - ret = -ENOMEM; - else - rds_ib_stats_inc(s_ib_rx_refill_from_thread); - mutex_unlock(&ic->i_recv_mutex); - if (rds_conn_up(conn)) rds_ib_attempt_ack(ic); return ret; } -int __init rds_ib_recv_init(void) +int rds_ib_recv_init(void) { struct sysinfo si; int ret = -ENOMEM; @@ -880,14 +1057,14 @@ int __init rds_ib_recv_init(void) rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", sizeof(struct rds_ib_incoming), - 0, 0, NULL); - if (rds_ib_incoming_slab == NULL) + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_incoming_slab) goto out; rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", sizeof(struct rds_page_frag), - 0, 0, NULL); - if (rds_ib_frag_slab == NULL) + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_frag_slab) kmem_cache_destroy(rds_ib_incoming_slab); else ret = 0; diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c index 99a6ccae964..ff97e8eda85 100644 --- a/net/rds/ib_ring.c +++ b/net/rds/ib_ring.c @@ -137,7 +137,7 @@ int rds_ib_ring_empty(struct rds_ib_work_ring *ring) int rds_ib_ring_low(struct rds_ib_work_ring *ring) { - return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2); + return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1); } /* diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index cb6c52cb1c4..1dde91e3dc7 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -34,13 +34,52 @@ #include <linux/in.h> #include <linux/device.h> #include <linux/dmapool.h> +#include <linux/ratelimit.h> #include "rds.h" -#include "rdma.h" #include "ib.h" -static void rds_ib_send_rdma_complete(struct rds_message *rm, - int wc_status) +static char *rds_ib_wc_status_strings[] = { +#define RDS_IB_WC_STATUS_STR(foo) \ + [IB_WC_##foo] = __stringify(IB_WC_##foo) + RDS_IB_WC_STATUS_STR(SUCCESS), + RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), + RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), + RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), + RDS_IB_WC_STATUS_STR(MW_BIND_ERR), + RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), + RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_OP_ERR), + RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), + RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), + RDS_IB_WC_STATUS_STR(INV_EECN_ERR), + RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), + RDS_IB_WC_STATUS_STR(FATAL_ERR), + RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), + RDS_IB_WC_STATUS_STR(GENERAL_ERR), +#undef RDS_IB_WC_STATUS_STR +}; + +char *rds_ib_wc_status_str(enum ib_wc_status status) +{ + return rds_str_array(rds_ib_wc_status_strings, + ARRAY_SIZE(rds_ib_wc_status_strings), status); +} + +/* + * Convert IB-specific error message to RDS error message and call core + * completion handler. + */ +static void rds_ib_send_complete(struct rds_message *rm, + int wc_status, + void (*complete)(struct rds_message *rm, int status)) { int notify_status; @@ -60,69 +99,124 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm, notify_status = RDS_RDMA_OTHER_ERROR; break; } - rds_rdma_send_complete(rm, notify_status); + complete(rm, notify_status); +} + +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); } static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, - struct rds_rdma_op *op) + struct rm_rdma_op *op, + int wc_status) { - if (op->r_mapped) { + if (op->op_mapped) { ib_dma_unmap_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, - op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->r_mapped = 0; + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; } + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * we would need to take an event for the rdma WR. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. + */ + rds_ib_send_complete(container_of(op, struct rds_message, rdma), + wc_status, rds_rdma_send_complete); + + if (op->op_write) + rds_stats_add(s_send_rdma_bytes, op->op_bytes); + else + rds_stats_add(s_recv_rdma_bytes, op->op_bytes); } -static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, - struct rds_ib_send_work *send, - int wc_status) +static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, + struct rm_atomic_op *op, + int wc_status) { - struct rds_message *rm = send->s_rm; - - rdsdebug("ic %p send %p rm %p\n", ic, send, rm); - - ib_dma_unmap_sg(ic->i_cm_id->device, - rm->m_sg, rm->m_nents, - DMA_TO_DEVICE); - - if (rm->m_rdma_op != NULL) { - rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); - - /* If the user asked for a completion notification on this - * message, we can implement three different semantics: - * 1. Notify when we received the ACK on the RDS message - * that was queued with the RDMA. This provides reliable - * notification of RDMA status at the expense of a one-way - * packet delay. - * 2. Notify when the IB stack gives us the completion event for - * the RDMA operation. - * 3. Notify when the IB stack gives us the completion event for - * the accompanying RDS messages. - * Here, we implement approach #3. To implement approach #2, - * call rds_rdma_send_complete from the cq_handler. To implement #1, - * don't call rds_rdma_send_complete at all, and fall back to the notify - * handling in the ACK processing code. - * - * Note: There's no need to explicitly sync any RDMA buffers using - * ib_dma_sync_sg_for_cpu - the completion for the RDMA - * operation itself unmapped the RDMA buffers, which takes care - * of synching. - */ - rds_ib_send_rdma_complete(rm, wc_status); + /* unmap atomic recvbuf */ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, + DMA_FROM_DEVICE); + op->op_mapped = 0; + } - if (rm->m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); - else - rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + rds_ib_send_complete(container_of(op, struct rds_message, atomic), + wc_status, rds_atomic_send_complete); + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) + rds_ib_stats_inc(s_ib_atomic_cswp); + else + rds_ib_stats_inc(s_ib_atomic_fadd); +} + +/* + * Unmap the resources associated with a struct send_work. + * + * Returns the rm for no good reason other than it is unobtainable + * other than by switching on wr.opcode, currently, and the caller, + * the event handler, needs it. + */ +static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + int wc_status) +{ + struct rds_message *rm = NULL; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, data); + rds_ib_send_unmap_data(ic, send->s_op, wc_status); + } + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, rdma); + rds_ib_send_unmap_rdma(ic, send->s_op, wc_status); + } + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_ATOMIC_CMP_AND_SWP: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, atomic); + rds_ib_send_unmap_atomic(ic, send->s_op, wc_status); + } + break; + default: + printk_ratelimited(KERN_NOTICE + "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; } - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); + send->s_wr.opcode = 0xdead; - rds_message_put(rm); - send->s_rm = NULL; + return rm; } void rds_ib_send_init_ring(struct rds_ib_connection *ic) @@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { struct ib_sge *sge; - send->s_rm = NULL; send->s_op = NULL; send->s_wr.wr_id = i; send->s_wr.sg_list = send->s_sge; - send->s_wr.num_sge = 1; - send->s_wr.opcode = IB_WR_SEND; - send->s_wr.send_flags = 0; send->s_wr.ex.imm_data = 0; - sge = rds_ib_data_sge(ic, send->s_sge); - sge->lkey = ic->i_mr->lkey; - - sge = rds_ib_header_sge(ic, send->s_sge); + sge = &send->s_sge[0]; sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); sge->lkey = ic->i_mr->lkey; + + send->s_sge[1].lkey = ic->i_mr->lkey; } } @@ -159,16 +248,24 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic) u32 i; for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { - if (send->s_wr.opcode == 0xdead) - continue; - if (send->s_rm) - rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); - if (send->s_op) - rds_ib_send_unmap_rdma(ic, send->s_op); + if (send->s_op && send->s_wr.opcode != 0xdead) + rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR); } } /* + * The only fast path caller always has a non-zero nr, so we don't + * bother testing nr before performing the atomic sub. + */ +static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) +{ + if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); + BUG_ON(atomic_read(&ic->i_signaled_sends) < 0); +} + +/* * The _oldest/_free ring operations here race cleanly with the alloc/unalloc * operations performed in the send path. As the sender allocs and potentially * unallocs the next free entry in the ring it doesn't alter which is @@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) { struct rds_connection *conn = context; struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_message *rm = NULL; struct ib_wc wc; struct rds_ib_send_work *send; u32 completed; u32 oldest; u32 i = 0; int ret; + int nr_sig = 0; rdsdebug("cq %p conn %p\n", cq, conn); rds_ib_stats_inc(s_ib_tx_cq_call); @@ -192,13 +291,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) rdsdebug("ib_req_notify_cq send failed: %d\n", ret); while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, + rds_ib_wc_status_str(wc.status), wc.byte_len, be32_to_cpu(wc.ex.imm_data)); rds_ib_stats_inc(s_ib_tx_cq_event); if (wc.wr_id == RDS_IB_ACK_WR_ID) { - if (ic->i_ack_queued + HZ/2 < jiffies) + if (time_after(jiffies, ic->i_ack_queued + HZ/2)) rds_ib_stats_inc(s_ib_tx_stalled); rds_ib_ack_send_complete(ic); continue; @@ -210,58 +310,41 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) for (i = 0; i < completed; i++) { send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; - /* In the error case, wc.opcode sometimes contains garbage */ - switch (send->s_wr.opcode) { - case IB_WR_SEND: - if (send->s_rm) - rds_ib_send_unmap_rm(ic, send, wc.status); - break; - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_READ: - /* Nothing to be done - the SG list will be unmapped - * when the SEND completes. */ - break; - default: - if (printk_ratelimit()) - printk(KERN_NOTICE - "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", - __func__, send->s_wr.opcode); - break; - } + rm = rds_ib_send_unmap_op(ic, send, wc.status); - send->s_wr.opcode = 0xdead; - send->s_wr.num_sge = 1; - if (send->s_queued + HZ/2 < jiffies) + if (time_after(jiffies, send->s_queued + HZ/2)) rds_ib_stats_inc(s_ib_tx_stalled); - /* If a RDMA operation produced an error, signal this right - * away. If we don't, the subsequent SEND that goes with this - * RDMA will be canceled with ERR_WFLUSH, and the application - * never learn that the RDMA failed. */ - if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { - struct rds_message *rm; - - rm = rds_send_get_message(conn, send->s_op); - if (rm) - rds_ib_send_rdma_complete(rm, wc.status); + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + } + rds_message_put(rm); + send->s_op = NULL; } oldest = (oldest + 1) % ic->i_send_ring.w_nr; } rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + nr_sig = 0; - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) - || test_bit(0, &conn->c_map_queued)) + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued)) queue_delayed_work(rds_wq, &conn->c_send_w, 0); /* We expect errors as the qp is drained during shutdown */ if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, - "send completion on %pI4 " - "had status %u, disconnecting and reconnecting\n", - &conn->c_faddr, wc.status); + rds_ib_conn_error(conn, "send completion on %pI4 had status " + "%u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, wc.status, + rds_ib_wc_status_str(wc.status)); } } } @@ -272,7 +355,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) * * Conceptually, we have two counters: * - send credits: this tells us how many WRs we're allowed - * to submit without overruning the reciever's queue. For + * to submit without overruning the receiver's queue. For * each SEND WR we post, we decrement this by one. * * - posted credits: this tells us how many WRs we recently @@ -290,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) * credits (see rds_ib_send_add_credits below). * * The RDS send code is essentially single-threaded; rds_send_xmit - * grabs c_send_lock to ensure exclusive access to the send ring. + * sets RDS_IN_XMIT to ensure exclusive access to the send ring. * However, the ACK sending code is independent and can race with * message SENDs. * @@ -311,7 +394,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) * and using atomic_cmpxchg when updating the two counters. */ int rds_ib_send_grab_credits(struct rds_ib_connection *ic, - u32 wanted, u32 *adv_credits, int need_posted) + u32 wanted, u32 *adv_credits, int need_posted, int max_posted) { unsigned int avail, posted, got = 0, advertise; long oldval, newval; @@ -351,7 +434,7 @@ try_again: * available. */ if (posted && (got || need_posted)) { - advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT); + advertise = min_t(unsigned int, posted, max_posted); newval -= IB_SET_POST_CREDITS(advertise); } @@ -409,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } -static inline void -rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, - struct rds_ib_send_work *send, unsigned int pos, - unsigned long buffer, unsigned int length, - int send_flags) +static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + bool notify) { - struct ib_sge *sge; - - WARN_ON(pos != send - ic->i_sends); - - send->s_wr.send_flags = send_flags; - send->s_wr.opcode = IB_WR_SEND; - send->s_wr.num_sge = 2; - send->s_wr.next = NULL; - send->s_queued = jiffies; - send->s_op = NULL; - - if (length != 0) { - sge = rds_ib_data_sge(ic, send->s_sge); - sge->addr = buffer; - sge->length = length; - sge->lkey = ic->i_mr->lkey; - - sge = rds_ib_header_sge(ic, send->s_sge); - } else { - /* We're sending a packet with no payload. There is only - * one SGE */ - send->s_wr.num_sge = 1; - sge = &send->s_sge[0]; + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0 || notify) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED; + return 1; } - - sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + return 0; } /* @@ -471,17 +535,27 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, u32 pos; u32 i; u32 work_alloc; - u32 credit_alloc; + u32 credit_alloc = 0; u32 posted; u32 adv_credits = 0; int send_flags = 0; - int sent; + int bytes_sent = 0; int ret; int flow_controlled = 0; + int nr_sig = 0; BUG_ON(off % RDS_FRAG_SIZE); BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); + /* Do not send cong updates to IB loopback */ + if (conn->c_loopback + && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + scat = &rm->data.op_sg[sg]; + ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length); + return sizeof(struct rds_header) + ret; + } + /* FIXME we may overallocate here */ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) i = 1; @@ -496,17 +570,16 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, goto out; } - credit_alloc = work_alloc; if (ic->i_flowctl) { - credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0); + credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); adv_credits += posted; if (credit_alloc < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); work_alloc = credit_alloc; - flow_controlled++; + flow_controlled = 1; } if (work_alloc == 0) { - rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); rds_ib_stats_inc(s_ib_tx_throttle); ret = -ENOMEM; goto out; @@ -514,31 +587,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } /* map the message the first time we see it */ - if (ic->i_rm == NULL) { - /* - printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", - be16_to_cpu(rm->m_inc.i_hdr.h_dport), - rm->m_inc.i_hdr.h_flags, - be32_to_cpu(rm->m_inc.i_hdr.h_len)); - */ - if (rm->m_nents) { - rm->m_count = ib_dma_map_sg(dev, - rm->m_sg, rm->m_nents, DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); - if (rm->m_count == 0) { + if (!ic->i_data_op) { + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); ret = -ENOMEM; /* XXX ? */ goto out; } } else { - rm->m_count = 0; + rm->data.op_count = 0; } - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; rds_message_addref(rm); - ic->i_rm = rm; + ic->i_data_op = &rm->data; /* Finalize the header */ if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) @@ -548,10 +615,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->m_rdma_op) { + if (rm->rdma.op_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -571,18 +638,12 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* * Update adv_credits since we reset the ACK_REQUIRED bit. */ - rds_ib_send_grab_credits(ic, 0, &posted, 1); - adv_credits += posted; - BUG_ON(adv_credits > 255); - } else if (ic->i_rm != rm) - BUG(); - - send = &ic->i_sends[pos]; - first = send; - prev = NULL; - scat = &rm->m_sg[sg]; - sent = 0; - i = 0; + if (ic->i_flowctl) { + rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } + } /* Sometimes you want to put a fence between an RDMA * READ and the following SEND. @@ -590,81 +651,64 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + if (rm->rdma.op_active && rm->rdma.op_fence) send_flags = IB_SEND_FENCE; - /* - * We could be copying the header into the unused tail of the page. - * That would need to be changed in the future when those pages might - * be mapped userspace pages or page cache pages. So instead we always - * use a second sge and our long-lived ring of mapped headers. We send - * the header after the data so that the data payload can be aligned on - * the receiver. - */ + /* Each frag gets a header. Msgs may be 0 bytes */ + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &ic->i_data_op->op_sg[sg]; + i = 0; + do { + unsigned int len = 0; - /* handle a 0-len message */ - if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { - rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); - goto add_header; - } + /* Set up the header */ + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; - /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { - unsigned int len; + send->s_sge[0].addr = ic->i_send_hdrs_dma + + (pos * sizeof(struct rds_header)); + send->s_sge[0].length = sizeof(struct rds_header); - send = &ic->i_sends[pos]; + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); - len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); - rds_ib_xmit_populate_wr(ic, send, pos, - ib_sg_dma_address(dev, scat) + off, len, - send_flags); + /* Set up the data, if present */ + if (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]) { + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + send->s_wr.num_sge = 2; - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time - * on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - } + send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; + send->s_sge[1].length = len; - ic->i_unsignaled_bytes -= len; - if (ic->i_unsignaled_bytes <= 0) { - ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + bytes_sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } } + rds_ib_set_wr_signal_state(ic, send, 0); + /* * Always signal the last one if we're stopping due to flow control. */ - if (flow_controlled && i == (work_alloc-1)) + if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; + rdsdebug("send %p wr %p num_sge %u next %p\n", send, &send->s_wr, send->s_wr.num_sge, send->s_wr.next); - sent += len; - off += len; - if (off == ib_sg_dma_len(dev, scat)) { - scat++; - off = 0; - } - -add_header: - /* Tack on the header after the data. The header SGE should already - * have been set up to point to the right header buffer. */ - memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); - - if (0) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; - - printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", - be16_to_cpu(hdr->h_dport), - hdr->h_flags, - be32_to_cpu(hdr->h_len)); - } - if (adv_credits) { + if (ic->i_flowctl && adv_credits) { struct rds_header *hdr = &ic->i_send_hdrs[pos]; /* add credit and redo the header checksum */ @@ -679,20 +723,25 @@ add_header: prev = send; pos = (pos + 1) % ic->i_send_ring.w_nr; - } + send = &ic->i_sends[pos]; + i++; + + } while (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]); /* Account the RDS header in the number of bytes we sent, but just once. * The caller has no concept of fragmentation. */ if (hdr_off == 0) - sent += sizeof(struct rds_header); + bytes_sent += sizeof(struct rds_header); /* if we finished the message then send completion owns it */ - if (scat == &rm->m_sg[rm->m_count]) { - prev->s_rm = ic->i_rm; - prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - ic->i_rm = NULL; + if (scat == &rm->data.op_sg[rm->data.op_count]) { + prev->s_op = ic->i_data_op; + prev->s_wr.send_flags |= IB_SEND_SOLICITED; + ic->i_data_op = NULL; } + /* Put back wrs & credits we didn't use */ if (i < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); work_alloc = i; @@ -700,6 +749,9 @@ add_header: if (ic->i_flowctl && i < credit_alloc) rds_ib_send_add_credits(conn, credit_alloc - i); + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + /* XXX need to worry about failed_wr and partial sends. */ failed_wr = &first->s_wr; ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); @@ -710,32 +762,127 @@ add_header: printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); - if (prev->s_rm) { - ic->i_rm = prev->s_rm; - prev->s_rm = NULL; + rds_ib_sub_signaled(ic, nr_sig); + if (prev->s_op) { + ic->i_data_op = prev->s_op; + prev->s_op = NULL; } - /* Finesse this later */ - BUG(); + + rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); goto out; } - ret = sent; + ret = bytes_sent; out: BUG_ON(adv_credits); return ret; } -int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +/* + * Issue atomic operation. + * A simplified version of the rdma case, we always map 1 SG, and + * only 8 bytes, for the return value from the atomic operation. + */ +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct ib_send_wr *failed_wr; + struct rds_ib_device *rds_ibdev; + u32 pos; + u32 work_alloc; + int ret; + int nr_sig = 0; + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); + if (work_alloc != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + /* address of send request in ring */ + send = &ic->i_sends[pos]; + send->s_queued = jiffies; + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; + send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; + send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; + send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; + } else { /* FADD */ + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; + send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; + send->s_wr.wr.atomic.swap = 0; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; + send->s_wr.wr.atomic.swap_mask = 0; + } + nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; + send->s_wr.wr.atomic.rkey = op->op_rkey; + send->s_op = op; + rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); + + /* map 8 byte retval buffer to the device */ + ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); + rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); + if (ret != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + /* Convert our struct scatterlist to struct ib_sge */ + send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].lkey = ic->i_mr->lkey; + + rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, + send->s_sge[0].addr, send->s_sge[0].length); + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + failed_wr = &send->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); + rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, + send, &send->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &send->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + goto out; + } + + if (unlikely(failed_wr != &send->s_wr)) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &send->s_wr); + } + +out: + return ret; +} + +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_send_work *send = NULL; struct rds_ib_send_work *first; struct rds_ib_send_work *prev; struct ib_send_wr *failed_wr; - struct rds_ib_device *rds_ibdev; struct scatterlist *scat; unsigned long len; - u64 remote_addr = op->r_remote_addr; + u64 remote_addr = op->op_remote_addr; + u32 max_sge = ic->rds_ibdev->max_sge; u32 pos; u32 work_alloc; u32 i; @@ -743,29 +890,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) int sent; int ret; int num_sge; - - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); - - /* map the message the first time we see it */ - if (!op->r_mapped) { - op->r_count = ib_dma_map_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, (op->r_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); - if (op->r_count == 0) { + int nr_sig = 0; + + /* map the op the first time we see it */ + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); ret = -ENOMEM; /* XXX ? */ goto out; } - op->r_mapped = 1; + op->op_mapped = 1; } /* * Instead of knowing how to return a partial rdma read/write we insist that there * be enough work requests to send the entire message. */ - i = ceil(op->r_count, rds_ibdev->max_sge); + i = ceil(op->op_count, max_sge); work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc != i) { @@ -778,30 +924,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &op->r_sg[0]; + scat = &op->op_sg[0]; sent = 0; - num_sge = op->r_count; + num_sge = op->op_count; - for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { send->s_wr.send_flags = 0; send->s_queued = jiffies; - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - send->s_wr.send_flags = IB_SEND_SIGNALED; - } + send->s_op = NULL; + + nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); - send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; + send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; send->s_wr.wr.rdma.remote_addr = remote_addr; - send->s_wr.wr.rdma.rkey = op->r_key; - send->s_op = op; + send->s_wr.wr.rdma.rkey = op->op_rkey; - if (num_sge > rds_ibdev->max_sge) { - send->s_wr.num_sge = rds_ibdev->max_sge; - num_sge -= rds_ibdev->max_sge; + if (num_sge > max_sge) { + send->s_wr.num_sge = max_sge; + num_sge -= max_sge; } else { send->s_wr.num_sge = num_sge; } @@ -811,7 +951,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) if (prev) prev->s_wr.next = &send->s_wr; - for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = ib_sg_dma_len(ic->i_cm_id->device, scat); send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); @@ -833,15 +973,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) send = ic->i_sends; } - /* if we finished the message then send completion owns it */ - if (scat == &op->r_sg[op->r_count]) - prev->s_wr.send_flags = IB_SEND_SIGNALED; + /* give a reference to the last op */ + if (scat == &op->op_sg[op->op_count]) { + prev->s_op = op; + rds_message_addref(container_of(op, struct rds_message, rdma)); + } if (i < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); work_alloc = i; } + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + failed_wr = &first->s_wr; ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, @@ -851,6 +996,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); goto out; } diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 02e3e3d50d4..2d5965d6e97 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -37,9 +37,9 @@ #include "rds.h" #include "ib.h" -DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); -static char *rds_ib_stat_names[] = { +static const char *const rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", "ib_tx_cq_call", @@ -67,6 +67,8 @@ static char *rds_ib_stat_names[] = { "ib_rdma_mr_pool_flush", "ib_rdma_mr_pool_wait", "ib_rdma_mr_pool_depleted", + "ib_atomic_cswp", + "ib_atomic_fadd", }; unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index d87830db93a..e4e41b3afce 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -49,89 +49,73 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16; static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; -unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20); -static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1; -static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; - -unsigned int rds_ib_sysctl_flow_control = 1; +/* + * This sysctl does nothing. + * + * Backwards compatibility with RDS 3.0 wire protocol + * disables initial FC credit exchange. + * If it's ever possible to drop 3.0 support, + * setting this to 1 and moving init/refill of send/recv + * rings from ib_cm_connect_complete() back into ib_setup_qp() + * will cause credits to be added before protocol negotiation. + */ +unsigned int rds_ib_sysctl_flow_control = 0; -ctl_table rds_ib_sysctl_table[] = { +static struct ctl_table rds_ib_sysctl_table[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "max_send_wr", .data = &rds_ib_sysctl_max_send_wr, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &rds_ib_sysctl_max_wr_min, .extra2 = &rds_ib_sysctl_max_wr_max, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "max_recv_wr", .data = &rds_ib_sysctl_max_recv_wr, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &rds_ib_sysctl_max_wr_min, .extra2 = &rds_ib_sysctl_max_wr_max, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "max_unsignaled_wr", .data = &rds_ib_sysctl_max_unsig_wrs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &rds_ib_sysctl_max_unsig_wr_min, .extra2 = &rds_ib_sysctl_max_unsig_wr_max, }, { - .ctl_name = CTL_UNNUMBERED, - .procname = "max_unsignaled_bytes", - .data = &rds_ib_sysctl_max_unsig_bytes, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - .extra1 = &rds_ib_sysctl_max_unsig_bytes_min, - .extra2 = &rds_ib_sysctl_max_unsig_bytes_max, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "max_recv_allocation", .data = &rds_ib_sysctl_max_recv_allocation, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "flow_control", .data = &rds_ib_sysctl_flow_control, .maxlen = sizeof(rds_ib_sysctl_flow_control), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0} -}; - -static struct ctl_path rds_ib_sysctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "rds", .ctl_name = CTL_UNNUMBERED, }, - { .procname = "ib", .ctl_name = CTL_UNNUMBERED, }, { } }; void rds_ib_sysctl_exit(void) { if (rds_ib_sysctl_hdr) - unregister_sysctl_table(rds_ib_sysctl_hdr); + unregister_net_sysctl_table(rds_ib_sysctl_hdr); } -int __init rds_ib_sysctl_init(void) +int rds_ib_sysctl_init(void) { - rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); - if (rds_ib_sysctl_hdr == NULL) + rds_ib_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/ib", rds_ib_sysctl_table); + if (!rds_ib_sysctl_hdr) return -ENOMEM; return 0; } diff --git a/net/rds/info.c b/net/rds/info.c index 1d885535214..9a6b4f66187 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -32,7 +32,9 @@ */ #include <linux/percpu.h> #include <linux/seq_file.h> +#include <linux/slab.h> #include <linux/proc_fs.h> +#include <linux/export.h> #include "rds.h" @@ -75,10 +77,11 @@ void rds_info_register_func(int optname, rds_info_func func) BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); spin_lock(&rds_info_lock); - BUG_ON(rds_info_funcs[offset] != NULL); + BUG_ON(rds_info_funcs[offset]); rds_info_funcs[offset] = func; spin_unlock(&rds_info_lock); } +EXPORT_SYMBOL_GPL(rds_info_register_func); void rds_info_deregister_func(int optname, rds_info_func func) { @@ -91,6 +94,7 @@ void rds_info_deregister_func(int optname, rds_info_func func) rds_info_funcs[offset] = NULL; spin_unlock(&rds_info_lock); } +EXPORT_SYMBOL_GPL(rds_info_deregister_func); /* * Typically we hold an atomic kmap across multiple rds_info_copy() calls @@ -99,8 +103,8 @@ void rds_info_deregister_func(int optname, rds_info_func func) */ void rds_info_iter_unmap(struct rds_info_iterator *iter) { - if (iter->addr != NULL) { - kunmap_atomic(iter->addr, KM_USER0); + if (iter->addr) { + kunmap_atomic(iter->addr); iter->addr = NULL; } } @@ -114,8 +118,8 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data, unsigned long this; while (bytes) { - if (iter->addr == NULL) - iter->addr = kmap_atomic(*iter->pages, KM_USER0); + if (!iter->addr) + iter->addr = kmap_atomic(*iter->pages); this = min(bytes, PAGE_SIZE - iter->offset); @@ -130,13 +134,14 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data, iter->offset += this; if (iter->offset == PAGE_SIZE) { - kunmap_atomic(iter->addr, KM_USER0); + kunmap_atomic(iter->addr); iter->addr = NULL; iter->offset = 0; iter->pages++; } } } +EXPORT_SYMBOL_GPL(rds_info_copy); /* * @optval points to the userspace buffer that the information snapshot @@ -184,14 +189,11 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, >> PAGE_SHIFT; pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { + if (!pages) { ret = -ENOMEM; goto out; } - down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0, - pages, NULL); - up_read(¤t->mm->mmap_sem); + ret = get_user_pages_fast(start, nr_pages, 1, pages); if (ret != nr_pages) { if (ret > 0) nr_pages = ret; @@ -205,7 +207,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, call_func: func = rds_info_funcs[optname - RDS_INFO_FIRST]; - if (func == NULL) { + if (!func) { ret = -ENOPROTOOPT; goto out; } @@ -233,7 +235,7 @@ call_func: ret = -EFAULT; out: - for (i = 0; pages != NULL && i < nr_pages; i++) + for (i = 0; pages && i < nr_pages; i++) put_page(pages[i]); kfree(pages); diff --git a/net/rds/iw.c b/net/rds/iw.c index b732efb5b63..589935661d6 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -37,6 +37,8 @@ #include <linux/inetdevice.h> #include <linux/if_arp.h> #include <linux/delay.h> +#include <linux/slab.h> +#include <linux/module.h> #include "rds.h" #include "iw.h" @@ -55,7 +57,7 @@ struct list_head rds_iw_devices; DEFINE_SPINLOCK(iw_nodev_conns_lock); LIST_HEAD(iw_nodev_conns); -void rds_iw_add_one(struct ib_device *device) +static void rds_iw_add_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; struct ib_device_attr *dev_attr; @@ -83,23 +85,16 @@ void rds_iw_add_one(struct ib_device *device) rds_iwdev->max_wrs = dev_attr->max_qp_wr; rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); - rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1); - rds_iwdev->dev = device; rds_iwdev->pd = ib_alloc_pd(device); if (IS_ERR(rds_iwdev->pd)) goto free_dev; if (!rds_iwdev->dma_local_lkey) { - if (device->node_type != RDMA_NODE_RNIC) { - rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, - IB_ACCESS_LOCAL_WRITE); - } else { - rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE); - } + rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE); if (IS_ERR(rds_iwdev->mr)) goto err_pd; } else @@ -130,7 +125,7 @@ free_attr: kfree(dev_attr); } -void rds_iw_remove_one(struct ib_device *device) +static void rds_iw_remove_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; struct rds_iw_cm_id *i_cm_id, *next; @@ -191,8 +186,8 @@ static int rds_iw_conn_info_visitor(struct rds_connection *conn, ic = conn->c_transport_data; dev_addr = &ic->i_cm_id->route.addr.dev_addr; - ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); - ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); iinfo->max_send_wr = ic->i_send_ring.w_nr; @@ -232,9 +227,9 @@ static int rds_iw_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); - if (!cm_id) - return -EADDRNOTAVAIL; + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; @@ -244,7 +239,8 @@ static int rds_iw_laddr_check(__be32 addr) ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); /* due to this, we will claim to support IB devices unless we check node_type. */ - if (ret || cm_id->device->node_type != RDMA_NODE_RNIC) + if (ret || !cm_id->device || + cm_id->device->node_type != RDMA_NODE_RNIC) ret = -EADDRNOTAVAIL; rdsdebug("addr %pI4 ret %d node type %d\n", @@ -270,7 +266,6 @@ struct rds_transport rds_iw_transport = { .laddr_check = rds_iw_laddr_check, .xmit_complete = rds_iw_xmit_complete, .xmit = rds_iw_xmit, - .xmit_cong_map = NULL, .xmit_rdma = rds_iw_xmit_rdma, .recv = rds_iw_recv, .conn_alloc = rds_iw_conn_alloc, @@ -278,7 +273,6 @@ struct rds_transport rds_iw_transport = { .conn_connect = rds_iw_conn_connect, .conn_shutdown = rds_iw_conn_shutdown, .inc_copy_to_user = rds_iw_inc_copy_to_user, - .inc_purge = rds_iw_inc_purge, .inc_free = rds_iw_inc_free, .cm_initiate_connect = rds_iw_cm_initiate_connect, .cm_handle_connect = rds_iw_cm_handle_connect, @@ -291,10 +285,11 @@ struct rds_transport rds_iw_transport = { .flush_mrs = rds_iw_flush_mrs, .t_owner = THIS_MODULE, .t_name = "iwarp", + .t_type = RDS_TRANS_IWARP, .t_prefer_loopback = 1, }; -int __init rds_iw_init(void) +int rds_iw_init(void) { int ret; diff --git a/net/rds/iw.h b/net/rds/iw.h index b4fb2725289..04ce3b193f7 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h @@ -1,6 +1,7 @@ #ifndef _RDS_IW_H #define _RDS_IW_H +#include <linux/interrupt.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include "rds.h" @@ -70,7 +71,7 @@ struct rds_iw_send_work { struct rds_message *s_rm; /* We should really put these into a union: */ - struct rds_rdma_op *s_op; + struct rm_rdma_op *s_op; struct rds_iw_mapping *s_mapping; struct ib_mr *s_mr; struct ib_fast_reg_page_list *s_page_list; @@ -119,6 +120,7 @@ struct rds_iw_connection { struct rds_iw_send_work *i_sends; /* rx */ + struct tasklet_struct i_recv_tasklet; struct mutex i_recv_mutex; struct rds_iw_work_ring i_recv_ring; struct rds_iw_incoming *i_iwinc; @@ -181,7 +183,6 @@ struct rds_iw_device { struct ib_pd *pd; struct ib_mr *mr; struct rds_iw_mr_pool *mr_pool; - int page_shift; int max_sge; unsigned int max_wrs; unsigned int dma_local_lkey:1; @@ -268,8 +269,6 @@ static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) /* ib.c */ extern struct rds_transport rds_iw_transport; -extern void rds_iw_add_one(struct ib_device *device); -extern void rds_iw_remove_one(struct ib_device *device); extern struct ib_client rds_iw_client; extern unsigned int fastreg_pool_size; @@ -284,7 +283,7 @@ void rds_iw_conn_free(void *arg); int rds_iw_conn_connect(struct rds_connection *conn); void rds_iw_conn_shutdown(struct rds_connection *conn); void rds_iw_state_change(struct sock *sk); -int __init rds_iw_listen_init(void); +int rds_iw_listen_init(void); void rds_iw_listen_stop(void); void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, @@ -318,19 +317,18 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, void rds_iw_sync_mr(void *trans_private, int dir); void rds_iw_free_mr(void *trans_private, int invalidate); void rds_iw_flush_mrs(void); -void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); /* ib_recv.c */ -int __init rds_iw_recv_init(void); +int rds_iw_recv_init(void); void rds_iw_recv_exit(void); int rds_iw_recv(struct rds_connection *conn); int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, gfp_t page_gfp, int prefill); -void rds_iw_inc_purge(struct rds_incoming *inc); void rds_iw_inc_free(struct rds_incoming *inc); int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_iw_recv_tasklet_fn(unsigned long data); void rds_iw_recv_init_ring(struct rds_iw_connection *ic); void rds_iw_recv_clear_ring(struct rds_iw_connection *ic); void rds_iw_recv_init_ack(struct rds_iw_connection *ic); @@ -357,11 +355,11 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); void rds_iw_send_init_ring(struct rds_iw_connection *ic); void rds_iw_send_clear_ring(struct rds_iw_connection *ic); -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, - u32 *adv_credits, int need_posted); + u32 *adv_credits, int need_posted, int max_posted); /* ib_stats.c */ DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats); @@ -370,7 +368,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); /* ib_sysctl.c */ -int __init rds_iw_sysctl_init(void); +int rds_iw_sysctl_init(void); void rds_iw_sysctl_exit(void); extern unsigned long rds_iw_sysctl_max_send_wr; extern unsigned long rds_iw_sysctl_max_recv_wr; @@ -378,7 +376,6 @@ extern unsigned long rds_iw_sysctl_max_unsig_wrs; extern unsigned long rds_iw_sysctl_max_unsig_bytes; extern unsigned long rds_iw_sysctl_max_recv_allocation; extern unsigned int rds_iw_sysctl_flow_control; -extern ctl_table rds_iw_sysctl_table[]; /* * Helper functions for getting/setting the header and data SGEs in diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index a416b0d492b..a91e1db62ee 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -32,7 +32,9 @@ */ #include <linux/kernel.h> #include <linux/in.h> +#include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/ratelimit.h> #include "rds.h" #include "iw.h" @@ -156,9 +158,11 @@ static void rds_iw_qp_event_handler(struct ib_event *event, void *data) case IB_EVENT_QP_REQ_ERR: case IB_EVENT_QP_FATAL: default: - rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n", + rdsdebug("Fatal QP Event %u " + "- connection %pI4->%pI4, reconnecting\n", event->event, &conn->c_laddr, &conn->c_faddr); + rds_conn_drop(conn); break; } } @@ -178,7 +182,7 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, unsigned int send_size, recv_size; int ret; - /* The offset of 1 is to accomodate the additional ACK WR. */ + /* The offset of 1 is to accommodate the additional ACK WR. */ send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); rds_iw_ring_resize(send_ring, send_size - 1); @@ -254,9 +258,8 @@ static int rds_iw_setup_qp(struct rds_connection *conn) * the rds_iwdev at all. */ rds_iwdev = ib_get_client_data(dev, &rds_iw_client); - if (rds_iwdev == NULL) { - if (printk_ratelimit()) - printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", + if (!rds_iwdev) { + printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n", dev->name); return -EOPNOTSUPP; } @@ -289,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_send_ring.w_nr * sizeof(struct rds_header), &ic->i_send_hdrs_dma, GFP_KERNEL); - if (ic->i_send_hdrs == NULL) { + if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent send failed\n"); goto out; @@ -299,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_recv_ring.w_nr * sizeof(struct rds_header), &ic->i_recv_hdrs_dma, GFP_KERNEL); - if (ic->i_recv_hdrs == NULL) { + if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent recv failed\n"); goto out; @@ -307,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), &ic->i_ack_dma, GFP_KERNEL); - if (ic->i_ack == NULL) { + if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent ack failed\n"); goto out; } ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); - if (ic->i_sends == NULL) { + if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); goto out; @@ -322,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) rds_iw_send_init_ring(ic); ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); - if (ic->i_recvs == NULL) { + if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); goto out; @@ -362,13 +365,12 @@ static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp) version = RDS_PROTOCOL_3_0; while ((common >>= 1) != 0) version++; - } else if (printk_ratelimit()) { - printk(KERN_NOTICE "RDS: Connection from %pI4 using " + } + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using " "incompatible protocol version %u.%u\n", &dp->dp_saddr, dp->dp_protocol_major, dp->dp_protocol_minor); - } return version; } @@ -449,6 +451,7 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, err = rds_iw_setup_qp(conn); if (err) { rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); + mutex_unlock(&conn->c_cm_lock); goto out; } @@ -518,7 +521,7 @@ int rds_iw_conn_connect(struct rds_connection *conn) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP); + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); ic->i_cm_id = NULL; @@ -691,11 +694,13 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) unsigned long flags; /* XXX too lazy? */ - ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); - if (ic == NULL) + ic = kzalloc(sizeof(struct rds_iw_connection), gfp); + if (!ic) return -ENOMEM; INIT_LIST_HEAD(&ic->iw_node); + tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn, + (unsigned long) ic); mutex_init(&ic->i_recv_mutex); #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&ic->i_ack_lock); diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index dcdb37da80f..a817705ce2d 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -31,9 +31,10 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/ratelimit.h> #include "rds.h" -#include "rdma.h" #include "iw.h" @@ -83,7 +84,8 @@ static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, struct list_head *unmap_list, - struct list_head *kill_list); + struct list_head *kill_list, + int *unpinned); static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id) @@ -122,7 +124,7 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd #else /* FIXME - needs to compare the local and remote * ipaddr/port tuple, but the ipaddr is the only - * available infomation in the rds_sock (as the rest are + * available information in the rds_sock (as the rest are * zero'ed. It doesn't appear to be properly populated * during connection setup... */ @@ -157,7 +159,8 @@ static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id * return 0; } -void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) +static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, + struct rdma_cm_id *cm_id) { struct rds_iw_cm_id *i_cm_id; @@ -206,9 +209,9 @@ void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *con BUG_ON(list_empty(&ic->iw_node)); list_del(&ic->iw_node); - spin_lock_irq(&rds_iwdev->spinlock); + spin_lock(&rds_iwdev->spinlock); list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); - spin_unlock_irq(&rds_iwdev->spinlock); + spin_unlock(&rds_iwdev->spinlock); spin_unlock_irq(&iw_nodev_conns_lock); ic->rds_iwdev = rds_iwdev; @@ -245,11 +248,8 @@ void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock) INIT_LIST_HEAD(list); spin_unlock_irq(list_lock); - list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) { - if (ic->conn->c_passive) - rds_conn_destroy(ic->conn->c_passive); + list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) rds_conn_destroy(ic->conn); - } } static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg, @@ -263,18 +263,12 @@ static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg, } static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, - struct rds_iw_scatterlist *sg, - unsigned int dma_page_shift) + struct rds_iw_scatterlist *sg) { struct ib_device *dev = rds_iwdev->dev; u64 *dma_pages = NULL; - u64 dma_mask; - unsigned int dma_page_size; int i, j, ret; - dma_page_size = 1 << dma_page_shift; - dma_mask = dma_page_size - 1; - WARN_ON(sg->dma_len); sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL); @@ -295,18 +289,18 @@ static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, sg->bytes += dma_len; end_addr = dma_addr + dma_len; - if (dma_addr & dma_mask) { + if (dma_addr & PAGE_MASK) { if (i > 0) goto out_unmap; - dma_addr &= ~dma_mask; + dma_addr &= ~PAGE_MASK; } - if (end_addr & dma_mask) { + if (end_addr & PAGE_MASK) { if (i < sg->dma_len - 1) goto out_unmap; - end_addr = (end_addr + dma_mask) & ~dma_mask; + end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK; } - sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift; + sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT; } /* Now gather the dma addrs into one list */ @@ -325,8 +319,8 @@ static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, u64 end_addr; end_addr = dma_addr + dma_len; - dma_addr &= ~dma_mask; - for (; dma_addr < end_addr; dma_addr += dma_page_size) + dma_addr &= ~PAGE_MASK; + for (; dma_addr < end_addr; dma_addr += PAGE_SIZE) dma_pages[j++] = dma_addr; BUG_ON(j > sg->dma_npages); } @@ -483,17 +477,6 @@ void rds_iw_sync_mr(void *trans_private, int direction) } } -static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all) -{ - unsigned int item_count; - - item_count = atomic_read(&pool->item_count); - if (free_all) - return item_count; - - return 0; -} - /* * Flush our pool of MRs. * At a minimum, all currently unused MRs are unmapped. @@ -506,7 +489,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) LIST_HEAD(unmap_list); LIST_HEAD(kill_list); unsigned long flags; - unsigned int nfreed = 0, ncleaned = 0, free_goal; + unsigned int nfreed = 0, ncleaned = 0, unpinned = 0; int ret = 0; rds_iw_stats_inc(s_iw_rdma_mr_pool_flush); @@ -520,8 +503,6 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) list_splice_init(&pool->clean_list, &kill_list); spin_unlock_irqrestore(&pool->list_lock, flags); - free_goal = rds_iw_flush_goal(pool, free_all); - /* Batched invalidate of dirty MRs. * For FMR based MRs, the mappings on the unmap list are * actually members of an ibmr (ibmr->mapping). They either @@ -531,7 +512,8 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) * will be destroyed by the unmap function. */ if (!list_empty(&unmap_list)) { - ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list); + ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, + &kill_list, &unpinned); /* If we've been asked to destroy all MRs, move those * that were simply cleaned to the kill list */ if (free_all) @@ -555,6 +537,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) spin_unlock_irqrestore(&pool->list_lock, flags); } + atomic_sub(unpinned, &pool->free_pinned); atomic_sub(ncleaned, &pool->dirty_count); atomic_sub(nfreed, &pool->item_count); @@ -582,8 +565,8 @@ void rds_iw_free_mr(void *trans_private, int invalidate) rds_iw_free_fastreg(pool, ibmr); /* If we've pinned too many pages, request a flush */ - if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned - || atomic_read(&pool->dirty_count) >= pool->max_items / 10) + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || + atomic_read(&pool->dirty_count) >= pool->max_items / 10) queue_work(rds_wq, &pool->flush_worker); if (invalidate) { @@ -727,7 +710,7 @@ static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping) f_wr.wr.fast_reg.rkey = mapping->m_rkey; f_wr.wr.fast_reg.page_list = ibmr->page_list; f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len; - f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift; + f_wr.wr.fast_reg.page_shift = PAGE_SHIFT; f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; @@ -737,8 +720,8 @@ static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping) failed_wr = &f_wr; ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr); BUG_ON(failed_wr != &f_wr); - if (ret && printk_ratelimit()) - printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", + if (ret) + printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", __func__, __LINE__, ret); return ret; } @@ -759,8 +742,8 @@ static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr) failed_wr = &s_wr; ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr); - if (ret && printk_ratelimit()) { - printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", + if (ret) { + printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", __func__, __LINE__, ret); goto out; } @@ -780,9 +763,7 @@ static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len); - dma_pages = rds_iw_map_scatterlist(rds_iwdev, - &mapping->m_sg, - rds_iwdev->page_shift); + dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg); if (IS_ERR(dma_pages)) { ret = PTR_ERR(dma_pages); dma_pages = NULL; @@ -837,7 +818,8 @@ static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, struct list_head *unmap_list, - struct list_head *kill_list) + struct list_head *kill_list, + int *unpinned) { struct rds_iw_mapping *mapping, *next; unsigned int ncleaned = 0; @@ -864,6 +846,7 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, spin_lock_irqsave(&pool->list_lock, flags); list_for_each_entry_safe(mapping, next, unmap_list, m_list) { + *unpinned += mapping->m_sg.len; list_move(&mapping->m_list, &laundered); ncleaned++; } diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index fde470fa50d..aa8bf678600 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -31,6 +31,7 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/pci.h> #include <linux/dma-mapping.h> #include <rdma/rdma_cm.h> @@ -52,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag) static void rds_iw_frag_free(struct rds_page_frag *frag) { rdsdebug("frag %p page %p\n", frag, frag->f_page); - BUG_ON(frag->f_page != NULL); + BUG_ON(frag->f_page); kmem_cache_free(rds_iw_frag_slab, frag); } @@ -142,31 +143,32 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, struct ib_sge *sge; int ret = -ENOMEM; - if (recv->r_iwinc == NULL) { - if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) { + if (!recv->r_iwinc) { + if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { rds_iw_stats_inc(s_iw_rx_alloc_limit); goto out; } recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, kptr_gfp); - if (recv->r_iwinc == NULL) + if (!recv->r_iwinc) { + atomic_dec(&rds_iw_allocation); goto out; - atomic_inc(&rds_iw_allocation); + } INIT_LIST_HEAD(&recv->r_iwinc->ii_frags); rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); } - if (recv->r_frag == NULL) { + if (!recv->r_frag) { recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); - if (recv->r_frag == NULL) + if (!recv->r_frag) goto out; INIT_LIST_HEAD(&recv->r_frag->f_item); recv->r_frag->f_page = NULL; } - if (ic->i_frag.f_page == NULL) { + if (!ic->i_frag.f_page) { ic->i_frag.f_page = alloc_page(page_gfp); - if (ic->i_frag.f_page == NULL) + if (!ic->i_frag.f_page) goto out; ic->i_frag.f_offset = 0; } @@ -229,8 +231,8 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, int ret = 0; u32 pos; - while ((prefill || rds_conn_up(conn)) - && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + while ((prefill || rds_conn_up(conn)) && + rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { if (pos >= ic->i_recv_ring.w_nr) { printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", pos); @@ -271,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, return ret; } -void rds_iw_inc_purge(struct rds_incoming *inc) +static void rds_iw_inc_purge(struct rds_incoming *inc) { struct rds_iw_incoming *iwinc; struct rds_page_frag *frag; @@ -427,7 +429,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -435,7 +437,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, static u64 rds_iw_get_ack(struct rds_iw_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } @@ -467,8 +469,8 @@ static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credi set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); rds_iw_stats_inc(s_iw_ack_send_failure); - /* Need to finesse this later. */ - BUG(); + + rds_iw_conn_error(ic->conn, "sending ack failed\n"); } else rds_iw_stats_inc(s_iw_ack_sent); } @@ -524,7 +526,7 @@ void rds_iw_attempt_ack(struct rds_iw_connection *ic) } /* Can we get a send credit? */ - if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) { + if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { rds_iw_stats_inc(s_iw_tx_throttle); clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); return; @@ -596,7 +598,7 @@ static void rds_iw_cong_recv(struct rds_connection *conn, to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ - addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); + addr = kmap_atomic(frag->f_page); src = addr + frag_off; dst = (void *)map->m_page_addrs[map_page] + map_off; @@ -606,7 +608,7 @@ static void rds_iw_cong_recv(struct rds_connection *conn, uncongested |= ~(*src) & *dst; *dst++ = *src++; } - kunmap_atomic(addr, KM_SOFTIRQ0); + kunmap_atomic(addr); copied += to_copy; @@ -659,7 +661,7 @@ static void rds_iw_process_recv(struct rds_connection *conn, if (byte_len < sizeof(struct rds_header)) { rds_iw_conn_error(conn, "incoming message " - "from %pI4 didn't inclue a " + "from %pI4 didn't include a " "header, disconnecting and " "reconnecting\n", &conn->c_faddr); @@ -714,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn, * into the inc and save the inc so we can hang upcoming fragments * off its list. */ - if (iwinc == NULL) { + if (!iwinc) { iwinc = recv->r_iwinc; recv->r_iwinc = NULL; ic->i_iwinc = iwinc; @@ -729,10 +731,10 @@ static void rds_iw_process_recv(struct rds_connection *conn, hdr = &iwinc->ii_inc.i_hdr; /* We can't just use memcmp here; fragments of a * single message may carry different ACKs */ - if (hdr->h_sequence != ihdr->h_sequence - || hdr->h_len != ihdr->h_len - || hdr->h_sport != ihdr->h_sport - || hdr->h_dport != ihdr->h_dport) { + if (hdr->h_sequence != ihdr->h_sequence || + hdr->h_len != ihdr->h_len || + hdr->h_sport != ihdr->h_sport || + hdr->h_dport != ihdr->h_dport) { rds_iw_conn_error(conn, "fragment header mismatch; forcing reconnect\n"); return; @@ -752,8 +754,7 @@ static void rds_iw_process_recv(struct rds_connection *conn, rds_iw_cong_recv(conn, iwinc); else { rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, - &iwinc->ii_inc, GFP_ATOMIC, - KM_SOFTIRQ0); + &iwinc->ii_inc, GFP_ATOMIC); state->ack_next = be64_to_cpu(hdr->h_sequence); state->ack_next_valid = 1; } @@ -783,17 +784,22 @@ void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) { struct rds_connection *conn = context; struct rds_iw_connection *ic = conn->c_transport_data; - struct ib_wc wc; - struct rds_iw_ack_state state = { 0, }; - struct rds_iw_recv_work *recv; rdsdebug("conn %p cq %p\n", conn, cq); rds_iw_stats_inc(s_iw_rx_cq_call); - ib_req_notify_cq(cq, IB_CQ_SOLICITED); + tasklet_schedule(&ic->i_recv_tasklet); +} - while (ib_poll_cq(cq, 1, &wc) > 0) { +static inline void rds_poll_cq(struct rds_iw_connection *ic, + struct rds_iw_ack_state *state) +{ + struct rds_connection *conn = ic->conn; + struct ib_wc wc; + struct rds_iw_recv_work *recv; + + while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", (unsigned long long)wc.wr_id, wc.status, wc.byte_len, be32_to_cpu(wc.ex.imm_data)); @@ -811,7 +817,7 @@ void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) if (rds_conn_up(conn) || rds_conn_connecting(conn)) { /* We expect errors as the qp is drained during shutdown */ if (wc.status == IB_WC_SUCCESS) { - rds_iw_process_recv(conn, recv, wc.byte_len, &state); + rds_iw_process_recv(conn, recv, wc.byte_len, state); } else { rds_iw_conn_error(conn, "recv completion on " "%pI4 had status %u, disconnecting and " @@ -822,6 +828,17 @@ void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) rds_iw_ring_free(&ic->i_recv_ring, 1); } +} + +void rds_iw_recv_tasklet_fn(unsigned long data) +{ + struct rds_iw_connection *ic = (struct rds_iw_connection *) data; + struct rds_connection *conn = ic->conn; + struct rds_iw_ack_state state = { 0, }; + + rds_poll_cq(ic, &state); + ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + rds_poll_cq(ic, &state); if (state.ack_next_valid) rds_iw_set_ack(ic, state.ack_next, state.ack_required); @@ -869,7 +886,7 @@ int rds_iw_recv(struct rds_connection *conn) return ret; } -int __init rds_iw_recv_init(void) +int rds_iw_recv_init(void) { struct sysinfo si; int ret = -ENOMEM; @@ -881,13 +898,13 @@ int __init rds_iw_recv_init(void) rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", sizeof(struct rds_iw_incoming), 0, 0, NULL); - if (rds_iw_incoming_slab == NULL) + if (!rds_iw_incoming_slab) goto out; rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", sizeof(struct rds_page_frag), 0, 0, NULL); - if (rds_iw_frag_slab == NULL) + if (!rds_iw_frag_slab) kmem_cache_destroy(rds_iw_incoming_slab); else ret = 0; diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c index d422d4b5dee..da8e3b63f66 100644 --- a/net/rds/iw_ring.c +++ b/net/rds/iw_ring.c @@ -137,7 +137,7 @@ int rds_iw_ring_empty(struct rds_iw_work_ring *ring) int rds_iw_ring_low(struct rds_iw_work_ring *ring) { - return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2); + return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1); } diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 22dd38ffd60..9105ea03aec 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -34,9 +34,9 @@ #include <linux/in.h> #include <linux/device.h> #include <linux/dmapool.h> +#include <linux/ratelimit.h> #include "rds.h" -#include "rdma.h" #include "iw.h" static void rds_iw_send_rdma_complete(struct rds_message *rm, @@ -64,13 +64,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm, } static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, - struct rds_rdma_op *op) + struct rm_rdma_op *op) { - if (op->r_mapped) { + if (op->op_mapped) { ib_dma_unmap_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, - op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->r_mapped = 0; + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; } } @@ -83,11 +83,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, rdsdebug("ic %p send %p rm %p\n", ic, send, rm); ib_dma_unmap_sg(ic->i_cm_id->device, - rm->m_sg, rm->m_nents, + rm->data.op_sg, rm->data.op_nents, DMA_TO_DEVICE); - if (rm->m_rdma_op != NULL) { - rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); + if (rm->rdma.op_active) { + rds_iw_send_unmap_rdma(ic, &rm->rdma); /* If the user asked for a completion notification on this * message, we can implement three different semantics: @@ -111,10 +111,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, */ rds_iw_send_rdma_complete(rm, wc_status); - if (rm->m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); + if (rm->rdma.op_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); else - rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); } /* If anyone waited for this message to get flushed out, wake @@ -232,7 +232,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) } if (wc.wr_id == RDS_IW_ACK_WR_ID) { - if (ic->i_ack_queued + HZ/2 < jiffies) + if (time_after(jiffies, ic->i_ack_queued + HZ/2)) rds_iw_stats_inc(s_iw_tx_stalled); rds_iw_ack_send_complete(ic); continue; @@ -259,8 +259,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) * when the SEND completes. */ break; default: - if (printk_ratelimit()) - printk(KERN_NOTICE + printk_ratelimited(KERN_NOTICE "RDS/IW: %s: unexpected opcode 0x%x in WR!\n", __func__, send->s_wr.opcode); break; @@ -268,7 +267,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) send->s_wr.opcode = 0xdead; send->s_wr.num_sge = 1; - if (send->s_queued + HZ/2 < jiffies) + if (time_after(jiffies, send->s_queued + HZ/2)) rds_iw_stats_inc(s_iw_tx_stalled); /* If a RDMA operation produced an error, signal this right @@ -288,8 +287,8 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) rds_iw_ring_free(&ic->i_send_ring, completed); - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) - || test_bit(0, &conn->c_map_queued)) + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued)) queue_delayed_work(rds_wq, &conn->c_send_w, 0); /* We expect errors as the qp is drained during shutdown */ @@ -308,7 +307,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) * * Conceptually, we have two counters: * - send credits: this tells us how many WRs we're allowed - * to submit without overruning the reciever's queue. For + * to submit without overruning the receiver's queue. For * each SEND WR we post, we decrement this by one. * * - posted credits: this tells us how many WRs we recently @@ -347,7 +346,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) * and using atomic_cmpxchg when updating the two counters. */ int rds_iw_send_grab_credits(struct rds_iw_connection *ic, - u32 wanted, u32 *adv_credits, int need_posted) + u32 wanted, u32 *adv_credits, int need_posted, int max_posted) { unsigned int avail, posted, got = 0, advertise; long oldval, newval; @@ -387,7 +386,7 @@ try_again: * available. */ if (posted && (got || need_posted)) { - advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT); + advertise = min_t(unsigned int, posted, max_posted); newval -= IB_SET_POST_CREDITS(advertise); } @@ -519,8 +518,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); /* Fastreg support */ - if (rds_rdma_cookie_key(rm->m_rdma_cookie) - && !ic->i_fastreg_posted) { + if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) { ret = -EAGAIN; goto out; } @@ -541,7 +539,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, credit_alloc = work_alloc; if (ic->i_flowctl) { - credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0); + credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); adv_credits += posted; if (credit_alloc < work_alloc) { rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); @@ -549,7 +547,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, flow_controlled++; } if (work_alloc == 0) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); rds_iw_stats_inc(s_iw_tx_throttle); ret = -ENOMEM; goto out; @@ -557,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, } /* map the message the first time we see it */ - if (ic->i_rm == NULL) { + if (!ic->i_rm) { /* printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", be16_to_cpu(rm->m_inc.i_hdr.h_dport), rm->m_inc.i_hdr.h_flags, be32_to_cpu(rm->m_inc.i_hdr.h_len)); */ - if (rm->m_nents) { - rm->m_count = ib_dma_map_sg(dev, - rm->m_sg, rm->m_nents, DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); - if (rm->m_count == 0) { + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); ret = -ENOMEM; /* XXX ? */ goto out; } } else { - rm->m_count = 0; + rm->data.op_count = 0; } ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; @@ -591,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->m_rdma_op) { + if (rm->rdma.op_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -614,16 +614,15 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, /* * Update adv_credits since we reset the ACK_REQUIRED bit. */ - rds_iw_send_grab_credits(ic, 0, &posted, 1); + rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); adv_credits += posted; BUG_ON(adv_credits > 255); - } else if (ic->i_rm != rm) - BUG(); + } send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &rm->m_sg[sg]; + scat = &rm->data.op_sg[sg]; sent = 0; i = 0; @@ -633,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + if (rm->rdma.op_active && rm->rdma.op_fence) send_flags = IB_SEND_FENCE; /* @@ -652,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, } /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { + for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) { unsigned int len; send = &ic->i_sends[pos]; @@ -730,7 +729,7 @@ add_header: sent += sizeof(struct rds_header); /* if we finished the message then send completion owns it */ - if (scat == &rm->m_sg[rm->m_count]) { + if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_rm = ic->i_rm; prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; ic->i_rm = NULL; @@ -779,14 +778,14 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey; send->s_wr.wr.fast_reg.page_list = send->s_page_list; send->s_wr.wr.fast_reg.page_list_len = nent; - send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift; + send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT; send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE; send->s_wr.wr.fast_reg.iova_start = sg_addr; ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); } -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) { struct rds_iw_connection *ic = conn->c_transport_data; struct rds_iw_send_work *send = NULL; @@ -796,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) struct rds_iw_device *rds_iwdev; struct scatterlist *scat; unsigned long len; - u64 remote_addr = op->r_remote_addr; + u64 remote_addr = op->op_remote_addr; u32 pos, fr_pos; u32 work_alloc; u32 i; @@ -808,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); /* map the message the first time we see it */ - if (!op->r_mapped) { - op->r_count = ib_dma_map_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, (op->r_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); - if (op->r_count == 0) { + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); ret = -ENOMEM; /* XXX ? */ goto out; } - op->r_mapped = 1; + op->op_mapped = 1; } - if (!op->r_write) { + if (!op->op_write) { /* Alloc space on the send queue for the fastreg */ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); if (work_alloc != 1) { @@ -837,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * Instead of knowing how to return a partial rdma read/write we insist that there * be enough work requests to send the entire message. */ - i = ceil(op->r_count, rds_iwdev->max_sge); + i = ceil(op->op_count, rds_iwdev->max_sge); work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc != i) { @@ -848,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) } send = &ic->i_sends[pos]; - if (!op->r_write) { + if (!op->op_write) { first = prev = &ic->i_sends[fr_pos]; } else { first = send; prev = NULL; } - scat = &op->r_sg[0]; + scat = &op->op_sg[0]; sent = 0; - num_sge = op->r_count; + num_sge = op->op_count; - for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { send->s_wr.send_flags = 0; send->s_queued = jiffies; @@ -875,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * for local access after RDS is finished with it, using * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. */ - if (op->r_write) + if (op->op_write) send->s_wr.opcode = IB_WR_RDMA_WRITE; else send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; send->s_wr.wr.rdma.remote_addr = remote_addr; - send->s_wr.wr.rdma.rkey = op->r_key; + send->s_wr.wr.rdma.rkey = op->op_rkey; send->s_op = op; if (num_sge > rds_iwdev->max_sge) { @@ -895,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) if (prev) prev->s_wr.next = &send->s_wr; - for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = ib_sg_dma_len(ic->i_cm_id->device, scat); if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) @@ -929,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) } /* if we finished the message then send completion owns it */ - if (scat == &op->r_sg[op->r_count]) + if (scat == &op->op_sg[op->op_count]) first->s_wr.send_flags = IB_SEND_SIGNALED; if (i < work_alloc) { @@ -943,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * adapters do not allow using the lkey for this at all. To bypass this use a * fastreg_mr (or possibly a dma_mr) */ - if (!op->r_write) { + if (!op->op_write) { rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], - op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); + op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); work_alloc++; } diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c index ccc7e8f0bf0..5fe67f6a1d8 100644 --- a/net/rds/iw_stats.c +++ b/net/rds/iw_stats.c @@ -37,9 +37,9 @@ #include "rds.h" #include "iw.h" -DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats); -static char *rds_iw_stat_names[] = { +static const char *const rds_iw_stat_names[] = { "iw_connect_raced", "iw_listen_closed_stale", "iw_tx_cq_call", diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c index 9590678cd61..139239d2cb2 100644 --- a/net/rds/iw_sysctl.c +++ b/net/rds/iw_sysctl.c @@ -55,83 +55,69 @@ static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; unsigned int rds_iw_sysctl_flow_control = 1; -ctl_table rds_iw_sysctl_table[] = { +static struct ctl_table rds_iw_sysctl_table[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "max_send_wr", .data = &rds_iw_sysctl_max_send_wr, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &rds_iw_sysctl_max_wr_min, .extra2 = &rds_iw_sysctl_max_wr_max, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "max_recv_wr", .data = &rds_iw_sysctl_max_recv_wr, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &rds_iw_sysctl_max_wr_min, .extra2 = &rds_iw_sysctl_max_wr_max, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "max_unsignaled_wr", .data = &rds_iw_sysctl_max_unsig_wrs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &rds_iw_sysctl_max_unsig_wr_min, .extra2 = &rds_iw_sysctl_max_unsig_wr_max, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "max_unsignaled_bytes", .data = &rds_iw_sysctl_max_unsig_bytes, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &rds_iw_sysctl_max_unsig_bytes_min, .extra2 = &rds_iw_sysctl_max_unsig_bytes_max, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "max_recv_allocation", .data = &rds_iw_sysctl_max_recv_allocation, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "flow_control", .data = &rds_iw_sysctl_flow_control, .maxlen = sizeof(rds_iw_sysctl_flow_control), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0} -}; - -static struct ctl_path rds_iw_sysctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "rds", .ctl_name = CTL_UNNUMBERED, }, - { .procname = "iw", .ctl_name = CTL_UNNUMBERED, }, { } }; void rds_iw_sysctl_exit(void) { - if (rds_iw_sysctl_hdr) - unregister_sysctl_table(rds_iw_sysctl_hdr); + unregister_net_sysctl_table(rds_iw_sysctl_hdr); } -int __init rds_iw_sysctl_init(void) +int rds_iw_sysctl_init(void) { - rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); - if (rds_iw_sysctl_hdr == NULL) + rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table); + if (!rds_iw_sysctl_hdr) return -ENOMEM; return 0; } diff --git a/net/rds/loop.c b/net/rds/loop.c index 4a61997f554..6b12b68541a 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -31,6 +31,7 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/in.h> #include "rds.h" @@ -60,39 +61,42 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { + struct scatterlist *sgp = &rm->data.op_sg[sg]; + int ret = sizeof(struct rds_header) + + be32_to_cpu(rm->m_inc.i_hdr.h_len); + + /* Do not send cong updates to loopback */ + if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off); + goto out; + } + BUG_ON(hdr_off || sg || off); rds_inc_init(&rm->m_inc, conn, conn->c_laddr); - rds_message_addref(rm); /* for the inc */ + /* For the embedded inc. Matching put is in loop_inc_free() */ + rds_message_addref(rm); rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, - GFP_KERNEL, KM_USER0); + GFP_KERNEL); rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), NULL); rds_inc_put(&rm->m_inc); - - return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); +out: + return ret; } -static int rds_loop_xmit_cong_map(struct rds_connection *conn, - struct rds_cong_map *map, - unsigned long offset) +/* + * See rds_loop_xmit(). Since our inc is embedded in the rm, we + * make sure the rm lives at least until the inc is done. + */ +static void rds_loop_inc_free(struct rds_incoming *inc) { - unsigned long i; - - BUG_ON(offset); - BUG_ON(map != conn->c_lcong); - - for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { - memcpy((void *)conn->c_fcong->m_page_addrs[i], - (void *)map->m_page_addrs[i], PAGE_SIZE); - } - - rds_cong_map_updated(conn->c_fcong, ~(u64) 0); - - return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + rds_message_put(rm); } /* we need to at least give the thread something to succeed */ @@ -117,8 +121,8 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) struct rds_loop_connection *lc; unsigned long flags; - lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); - if (lc == NULL) + lc = kzalloc(sizeof(struct rds_loop_connection), gfp); + if (!lc) return -ENOMEM; INIT_LIST_HEAD(&lc->loop_node); @@ -135,8 +139,12 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) static void rds_loop_conn_free(void *arg) { struct rds_loop_connection *lc = arg; + unsigned long flags; + rdsdebug("lc %p\n", lc); + spin_lock_irqsave(&loop_conns_lock, flags); list_del(&lc->loop_node); + spin_unlock_irqrestore(&loop_conns_lock, flags); kfree(lc); } @@ -175,14 +183,12 @@ void rds_loop_exit(void) */ struct rds_transport rds_loop_transport = { .xmit = rds_loop_xmit, - .xmit_cong_map = rds_loop_xmit_cong_map, .recv = rds_loop_recv, .conn_alloc = rds_loop_conn_alloc, .conn_free = rds_loop_conn_free, .conn_connect = rds_loop_conn_connect, .conn_shutdown = rds_loop_conn_shutdown, .inc_copy_to_user = rds_message_inc_copy_to_user, - .inc_purge = rds_message_inc_purge, - .inc_free = rds_message_inc_free, + .inc_free = rds_loop_inc_free, .t_name = "loopback", }; diff --git a/net/rds/message.c b/net/rds/message.c index 5a15dc8d0cd..aba232f9f30 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -31,11 +31,10 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/export.h> #include "rds.h" -#include "rdma.h" - -static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq); static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_NONE] = 0, @@ -50,6 +49,7 @@ void rds_message_addref(struct rds_message *rm) rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); atomic_inc(&rm->m_refcount); } +EXPORT_SYMBOL_GPL(rds_message_addref); /* * This relies on dma_map_sg() not touching sg[].page during merging. @@ -61,29 +61,28 @@ static void rds_message_purge(struct rds_message *rm) if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; - for (i = 0; i < rm->m_nents; i++) { - rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); + for (i = 0; i < rm->data.op_nents; i++) { + rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); /* XXX will have to put_page for page refs */ - __free_page(sg_page(&rm->m_sg[i])); + __free_page(sg_page(&rm->data.op_sg[i])); } - rm->m_nents = 0; + rm->data.op_nents = 0; - if (rm->m_rdma_op) - rds_rdma_free_op(rm->m_rdma_op); - if (rm->m_rdma_mr) - rds_mr_put(rm->m_rdma_mr); -} + if (rm->rdma.op_active) + rds_rdma_free_op(&rm->rdma); + if (rm->rdma.op_rdma_mr) + rds_mr_put(rm->rdma.op_rdma_mr); -void rds_message_inc_purge(struct rds_incoming *inc) -{ - struct rds_message *rm = container_of(inc, struct rds_message, m_inc); - rds_message_purge(rm); + if (rm->atomic.op_active) + rds_atomic_free_op(&rm->atomic); + if (rm->atomic.op_rdma_mr) + rds_mr_put(rm->atomic.op_rdma_mr); } void rds_message_put(struct rds_message *rm) { rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); - + WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm); if (atomic_dec_and_test(&rm->m_refcount)) { BUG_ON(!list_empty(&rm->m_sock_item)); BUG_ON(!list_empty(&rm->m_conn_item)); @@ -92,12 +91,7 @@ void rds_message_put(struct rds_message *rm) kfree(rm); } } - -void rds_message_inc_free(struct rds_incoming *inc) -{ - struct rds_message *rm = container_of(inc, struct rds_message, m_inc); - rds_message_put(rm); -} +EXPORT_SYMBOL_GPL(rds_message_put); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq) @@ -108,9 +102,10 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport, hdr->h_sequence = cpu_to_be64(seq); hdr->h_exthdr[0] = RDS_EXTHDR_NONE; } +EXPORT_SYMBOL_GPL(rds_message_populate_header); -int rds_message_add_extension(struct rds_header *hdr, - unsigned int type, const void *data, unsigned int len) +int rds_message_add_extension(struct rds_header *hdr, unsigned int type, + const void *data, unsigned int len) { unsigned int ext_len = sizeof(u8) + len; unsigned char *dst; @@ -119,8 +114,7 @@ int rds_message_add_extension(struct rds_header *hdr, if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) return 0; - if (type >= __RDS_EXTHDR_MAX - || len != rds_exthdr_size[type]) + if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type]) return 0; if (ext_len >= RDS_HEADER_EXT_SPACE) @@ -133,6 +127,7 @@ int rds_message_add_extension(struct rds_header *hdr, dst[len] = RDS_EXTHDR_NONE; return 1; } +EXPORT_SYMBOL_GPL(rds_message_add_extension); /* * If a message has extension headers, retrieve them here. @@ -180,26 +175,6 @@ none: return RDS_EXTHDR_NONE; } -int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version) -{ - struct rds_ext_header_version ext_hdr; - - ext_hdr.h_version = cpu_to_be32(version); - return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr)); -} - -int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version) -{ - struct rds_ext_header_version ext_hdr; - unsigned int pos = 0, len = sizeof(ext_hdr); - - /* We assume the version extension is the only one present */ - if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION) - return 0; - *version = be32_to_cpu(ext_hdr.h_version); - return 1; -} - int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) { struct rds_ext_header_rdma_dest ext_hdr; @@ -208,42 +183,80 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o ext_hdr.h_rdma_offset = cpu_to_be32(offset); return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr)); } +EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); -struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) +/* + * Each rds_message is allocated with extra space for the scatterlist entries + * rds ops will need. This is to minimize memory allocation count. Then, each rds op + * can grab SGs when initializing its part of the rds_message. + */ +struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) { struct rds_message *rm; - rm = kzalloc(sizeof(struct rds_message) + - (nents * sizeof(struct scatterlist)), gfp); + if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message)) + return NULL; + + rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp); if (!rm) goto out; - if (nents) - sg_init_table(rm->m_sg, nents); + rm->m_used_sgs = 0; + rm->m_total_sgs = extra_len / sizeof(struct scatterlist); + atomic_set(&rm->m_refcount, 1); INIT_LIST_HEAD(&rm->m_sock_item); INIT_LIST_HEAD(&rm->m_conn_item); spin_lock_init(&rm->m_rs_lock); + init_waitqueue_head(&rm->m_flush_wait); out: return rm; } +/* + * RDS ops use this to grab SG entries from the rm's sg pool. + */ +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) +{ + struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; + struct scatterlist *sg_ret; + + WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); + WARN_ON(!nents); + + if (rm->m_used_sgs + nents > rm->m_total_sgs) + return NULL; + + sg_ret = &sg_first[rm->m_used_sgs]; + sg_init_table(sg_ret, nents); + rm->m_used_sgs += nents; + + return sg_ret; +} + struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) { struct rds_message *rm; unsigned int i; + int num_sgs = ceil(total_len, PAGE_SIZE); + int extra_bytes = num_sgs * sizeof(struct scatterlist); - rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); - if (rm == NULL) + rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); + if (!rm) return ERR_PTR(-ENOMEM); set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); - rm->m_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); + if (!rm->data.op_sg) { + rds_message_put(rm); + return ERR_PTR(-ENOMEM); + } - for (i = 0; i < rm->m_nents; ++i) { - sg_set_page(&rm->m_sg[i], + for (i = 0; i < rm->data.op_nents; ++i) { + sg_set_page(&rm->data.op_sg[i], virt_to_page(page_addrs[i]), PAGE_SIZE, 0); } @@ -251,40 +264,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in return rm; } -struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, size_t total_len) { unsigned long to_copy; unsigned long iov_off; unsigned long sg_off; - struct rds_message *rm; struct iovec *iov; struct scatterlist *sg; - int ret; - - rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); - if (rm == NULL) { - ret = -ENOMEM; - goto out; - } + int ret = 0; rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); /* * now allocate and copy in the data payload. */ - sg = rm->m_sg; + sg = rm->data.op_sg; iov = first_iov; iov_off = 0; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ while (total_len) { - if (sg_page(sg) == NULL) { + if (!sg_page(sg)) { ret = rds_page_remainder_alloc(sg, total_len, GFP_HIGHUSER); if (ret) goto out; - rm->m_nents++; + rm->data.op_nents++; sg_off = 0; } @@ -315,14 +321,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, sg++; } - ret = 0; out: - if (ret) { - if (rm) - rds_message_put(rm); - rm = ERR_PTR(ret); - } - return rm; + return ret; } int rds_message_inc_copy_to_user(struct rds_incoming *inc, @@ -343,7 +343,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, iov = first_iov; iov_off = 0; - sg = rm->m_sg; + sg = rm->data.op_sg; vec_off = 0; copied = 0; @@ -389,14 +389,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, */ void rds_message_wait(struct rds_message *rm) { - wait_event(rds_message_flush_waitq, + wait_event_interruptible(rm->m_flush_wait, !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); } void rds_message_unmapped(struct rds_message *rm) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); - if (waitqueue_active(&rds_message_flush_waitq)) - wake_up(&rds_message_flush_waitq); + wake_up_interruptible(&rm->m_flush_wait); } +EXPORT_SYMBOL_GPL(rds_message_unmapped); diff --git a/net/rds/page.c b/net/rds/page.c index c460743a89a..9005a2c920e 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -31,6 +31,9 @@ * */ #include <linux/highmem.h> +#include <linux/gfp.h> +#include <linux/cpu.h> +#include <linux/export.h> #include "rds.h" @@ -39,7 +42,8 @@ struct rds_page_remainder { unsigned long r_offset; }; -DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, + rds_page_remainders); /* * returns 0 on success or -errno on failure. @@ -56,37 +60,26 @@ int rds_page_copy_user(struct page *page, unsigned long offset, unsigned long ret; void *addr; - if (to_user) + addr = kmap(page); + if (to_user) { rds_stats_add(s_copy_to_user, bytes); - else + ret = copy_to_user(ptr, addr + offset, bytes); + } else { rds_stats_add(s_copy_from_user, bytes); - - addr = kmap_atomic(page, KM_USER0); - if (to_user) - ret = __copy_to_user_inatomic(ptr, addr + offset, bytes); - else - ret = __copy_from_user_inatomic(addr + offset, ptr, bytes); - kunmap_atomic(addr, KM_USER0); - - if (ret) { - addr = kmap(page); - if (to_user) - ret = copy_to_user(ptr, addr + offset, bytes); - else - ret = copy_from_user(addr + offset, ptr, bytes); - kunmap(page); - if (ret) - return -EFAULT; + ret = copy_from_user(addr + offset, ptr, bytes); } + kunmap(page); - return 0; + return ret ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(rds_page_copy_user); -/* - * Message allocation uses this to build up regions of a message. +/** + * rds_page_remainder_alloc - build up regions of a message. * - * @bytes - the number of bytes needed. - * @gfp - the waiting behaviour of the allocation + * @scat: Scatter list for message + * @bytes: the number of bytes needed. + * @gfp: the waiting behaviour of the allocation * * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to * kmap the pages, etc. @@ -114,7 +107,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, /* jump straight to allocation if we're trying for a huge page */ if (bytes >= PAGE_SIZE) { page = alloc_page(gfp); - if (page == NULL) { + if (!page) { ret = -ENOMEM; } else { sg_set_page(scat, page, PAGE_SIZE, 0); @@ -160,7 +153,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, rem = &per_cpu(rds_page_remainders, get_cpu()); local_irq_save(flags); - if (page == NULL) { + if (!page) { ret = -ENOMEM; break; } @@ -184,6 +177,7 @@ out: ret ? 0 : scat->length); return ret; } +EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); static int rds_page_remainder_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index eaeeb91e111..4e37c1cbe8b 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -31,10 +31,11 @@ * */ #include <linux/pagemap.h> +#include <linux/slab.h> #include <linux/rbtree.h> #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ -#include "rdma.h" +#include "rds.h" /* * XXX @@ -129,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs) { struct rds_mr *mr; struct rb_node *node; + unsigned long flags; /* Release any MRs associated with this socket */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); while ((node = rb_first(&rs->rs_rdma_keys))) { mr = container_of(node, struct rds_mr, r_rb_node); if (mr->r_trans == rs->rs_transport) mr->r_invalidate = 0; + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + rds_destroy_mr(mr); rds_mr_put(mr); + spin_lock_irqsave(&rs->rs_rdma_lock, flags); } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (rs->rs_transport && rs->rs_transport->flush_mrs) rs->rs_transport->flush_mrs(); @@ -150,12 +159,9 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, { int ret; - down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, user_addr, - nr_pages, write, 0, pages, NULL); - up_read(¤t->mm->mmap_sem); + ret = get_user_pages_fast(user_addr, nr_pages, write, pages); - if (0 <= ret && (unsigned) ret < nr_pages) { + if (ret >= 0 && ret < nr_pages) { while (ret--) put_page(pages[ret]); ret = -EFAULT; @@ -183,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } - if (rs->rs_transport->get_mr == NULL) { + if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out; } @@ -199,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, /* XXX clamp nr_pages to limit the size of this alloc? */ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { + if (!pages) { ret = -ENOMEM; goto out; } mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); - if (mr == NULL) { + if (!mr) { ret = -ENOMEM; goto out; } @@ -232,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to * the zero page. */ - ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); + ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); if (ret < 0) goto out; nents = ret; sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); - if (sg == NULL) { + if (!sg) { ret = -ENOMEM; goto out; } @@ -320,6 +326,30 @@ int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen) return __rds_rdma_map(rs, &args, NULL, NULL); } +int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_get_mr_for_dest_args args; + struct rds_get_mr_args new_args; + + if (optlen != sizeof(struct rds_get_mr_for_dest_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval, + sizeof(struct rds_get_mr_for_dest_args))) + return -EFAULT; + + /* + * Initially, just behave like get_mr(). + * TODO: Implement get_mr as wrapper around this + * and deprecate it. + */ + new_args.vec = args.vec; + new_args.cookie_addr = args.cookie_addr; + new_args.flags = args.flags; + + return __rds_rdma_map(rs, &new_args, NULL, NULL); +} + /* * Free the MR indicated by the given R_Key */ @@ -384,132 +414,217 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); - if (mr && (mr->r_use_once || force)) { + if (!mr) { + printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + return; + } + + if (mr->r_use_once || force) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); zot_me = 1; - } else if (mr) - atomic_inc(&mr->r_refcount); + } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); /* May have to issue a dma_sync on this memory region. * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ - if (mr != NULL) { - if (mr->r_trans->sync_mr) - mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); - - /* If the MR was marked as invalidate, this will - * trigger an async flush. */ - if (zot_me) - rds_destroy_mr(mr); - rds_mr_put(mr); - } + if (mr->r_trans->sync_mr) + mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); + + /* If the MR was marked as invalidate, this will + * trigger an async flush. */ + if (zot_me) + rds_destroy_mr(mr); + rds_mr_put(mr); } -void rds_rdma_free_op(struct rds_rdma_op *ro) +void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; - for (i = 0; i < ro->r_nents; i++) { - struct page *page = sg_page(&ro->r_sg[i]); + for (i = 0; i < ro->op_nents; i++) { + struct page *page = sg_page(&ro->op_sg[i]); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ - if (!ro->r_write) + if (!ro->op_write) { + BUG_ON(irqs_disabled()); set_page_dirty(page); + } put_page(page); } - kfree(ro->r_notifier); - kfree(ro); + kfree(ro->op_notifier); + ro->op_notifier = NULL; + ro->op_active = 0; +} + +void rds_atomic_free_op(struct rm_atomic_op *ao) +{ + struct page *page = sg_page(ao->op_sg); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + set_page_dirty(page); + put_page(page); + + kfree(ao->op_notifier); + ao->op_notifier = NULL; + ao->op_active = 0; } + /* - * args is a pointer to an in-kernel copy in the sendmsg cmsg. + * Count the number of pages needed to describe an incoming iovec array. */ -static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, - struct rds_rdma_args *args) +static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) +{ + int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + /* figure out the number of pages in the vector */ + for (i = 0; i < nr_iovecs; i++) { + nr_pages = rds_pages_in_vec(&iov[i]); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages; +} + +int rds_rdma_extra_size(struct rds_rdma_args *args) { struct rds_iovec vec; - struct rds_rdma_op *op = NULL; + struct rds_iovec __user *local_vec; + int tot_pages = 0; unsigned int nr_pages; - unsigned int max_pages; + unsigned int i; + + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + + /* figure out the number of pages in the vector */ + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) + return -EFAULT; + + nr_pages = rds_pages_in_vec(&vec); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages * sizeof(struct scatterlist); +} + +/* + * The application asks for a RDMA transfer. + * Extract all arguments and set up the rdma_op + */ +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct rds_rdma_args *args; + struct rm_rdma_op *op = &rm->rdma; + int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; - struct rds_iovec __user *local_vec; - struct scatterlist *sg; - unsigned int nr; + struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack; + int iov_size; unsigned int i, j; - int ret; + int ret = 0; + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) + || rm->rdma.op_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); if (rs->rs_bound_addr == 0) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } - if (args->nr_local > (u64)UINT_MAX) { + if (args->nr_local > UIO_MAXIOV) { ret = -EMSGSIZE; goto out; } - nr_pages = 0; - max_pages = 0; - - local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; - - /* figure out the number of pages in the vector */ - for (i = 0; i < args->nr_local; i++) { - if (copy_from_user(&vec, &local_vec[i], - sizeof(struct rds_iovec))) { - ret = -EFAULT; - goto out; - } - - nr = rds_pages_in_vec(&vec); - if (nr == 0) { - ret = -EINVAL; + /* Check whether to allocate the iovec area */ + iov_size = args->nr_local * sizeof(struct rds_iovec); + if (args->nr_local > UIO_FASTIOV) { + iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL); + if (!iovs) { + ret = -ENOMEM; goto out; } + } - max_pages = max(nr, max_pages); - nr_pages += nr; + if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) { + ret = -EFAULT; + goto out; } - pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { - ret = -ENOMEM; + nr_pages = rds_rdma_pages(iovs, args->nr_local); + if (nr_pages < 0) { + ret = -EINVAL; goto out; } - op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); - if (op == NULL) { + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { ret = -ENOMEM; goto out; } - op->r_write = !!(args->flags & RDS_RDMA_READWRITE); - op->r_fence = !!(args->flags & RDS_RDMA_FENCE); - op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); - op->r_recverr = rs->rs_recverr; + op->op_write = !!(args->flags & RDS_RDMA_READWRITE); + op->op_fence = !!(args->flags & RDS_RDMA_FENCE); + op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->op_silent = !!(args->flags & RDS_RDMA_SILENT); + op->op_active = 1; + op->op_recverr = rs->rs_recverr; WARN_ON(!nr_pages); - sg_init_table(op->r_sg, nr_pages); + op->op_sg = rds_message_alloc_sgs(rm, nr_pages); + if (!op->op_sg) { + ret = -ENOMEM; + goto out; + } - if (op->r_notify || op->r_recverr) { + if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ - op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); - if (!op->r_notifier) { + op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); + if (!op->op_notifier) { ret = -ENOMEM; goto out; } - op->r_notifier->n_user_token = args->user_token; - op->r_notifier->n_status = RDS_RDMA_SUCCESS; + op->op_notifier->n_user_token = args->user_token; + op->op_notifier->n_status = RDS_RDMA_SUCCESS; } /* The cookie contains the R_Key of the remote memory region, and @@ -519,68 +634,55 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, * destination address (which is really an offset into the MR) * FIXME: We may want to move this into ib_rdma.c */ - op->r_key = rds_rdma_cookie_key(args->cookie); - op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); + op->op_rkey = rds_rdma_cookie_key(args->cookie); + op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); nr_bytes = 0; rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", (unsigned long long)args->nr_local, (unsigned long long)args->remote_vec.addr, - op->r_key); + op->op_rkey); for (i = 0; i < args->nr_local; i++) { - if (copy_from_user(&vec, &local_vec[i], - sizeof(struct rds_iovec))) { - ret = -EFAULT; - goto out; - } + struct rds_iovec *iov = &iovs[i]; + /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ + unsigned int nr = rds_pages_in_vec(iov); - nr = rds_pages_in_vec(&vec); - if (nr == 0) { - ret = -EINVAL; - goto out; - } - - rs->rs_user_addr = vec.addr; - rs->rs_user_bytes = vec.bytes; + rs->rs_user_addr = iov->addr; + rs->rs_user_bytes = iov->bytes; - /* did the user change the vec under us? */ - if (nr > max_pages || op->r_nents + nr > nr_pages) { - ret = -EINVAL; - goto out; - } /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ - ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); + ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if (ret < 0) goto out; - rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n", - nr_bytes, nr, vec.bytes, vec.addr); + rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", + nr_bytes, nr, iov->bytes, iov->addr); - nr_bytes += vec.bytes; + nr_bytes += iov->bytes; for (j = 0; j < nr; j++) { - unsigned int offset = vec.addr & ~PAGE_MASK; + unsigned int offset = iov->addr & ~PAGE_MASK; + struct scatterlist *sg; - sg = &op->r_sg[op->r_nents + j]; + sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], - min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), + min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); - rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n", - sg->offset, sg->length, vec.addr, vec.bytes); + rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", + sg->offset, sg->length, iov->addr, iov->bytes); - vec.addr += sg->length; - vec.bytes -= sg->length; + iov->addr += sg->length; + iov->bytes -= sg->length; } - op->r_nents += nr; + op->op_nents += nr; } - if (nr_bytes > args->remote_vec.bytes) { rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", nr_bytes, @@ -588,38 +690,18 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, ret = -EINVAL; goto out; } - op->r_bytes = nr_bytes; + op->op_bytes = nr_bytes; - ret = 0; out: + if (iovs != iovstack) + sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size); kfree(pages); - if (ret) { - if (op) - rds_rdma_free_op(op); - op = ERR_PTR(ret); - } - return op; -} - -/* - * The application asks for a RDMA transfer. - * Extract all arguments and set up the rdma_op - */ -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg) -{ - struct rds_rdma_op *op; - - if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) - || rm->m_rdma_op != NULL) - return -EINVAL; + if (ret) + rds_rdma_free_op(op); + else + rds_stats_inc(s_send_rdma); - op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); - if (IS_ERR(op)) - return PTR_ERR(op); - rds_stats_inc(s_send_rdma); - rm->m_rdma_op = op; - return 0; + return ret; } /* @@ -634,8 +716,8 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, u32 r_key; int err = 0; - if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) - || rm->m_rdma_cookie != 0) + if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) || + rm->m_rdma_cookie != 0) return -EINVAL; memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); @@ -649,7 +731,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); - if (mr == NULL) + if (!mr) err = -EINVAL; /* invalid r_key */ else atomic_inc(&mr->r_refcount); @@ -657,7 +739,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, if (mr) { mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); - rm->m_rdma_mr = mr; + rm->rdma.op_rdma_mr = mr; } return err; } @@ -671,9 +753,106 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { - if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) - || rm->m_rdma_cookie != 0) + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) || + rm->m_rdma_cookie != 0) + return -EINVAL; + + return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); +} + +/* + * Fill in rds_message for an atomic request. + */ +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct page *page = NULL; + struct rds_atomic_args *args; + int ret = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) + || rm->atomic.op_active) return -EINVAL; - return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); + args = CMSG_DATA(cmsg); + + /* Nonmasked & masked cmsg ops converted to masked hw ops */ + switch (cmsg->cmsg_type) { + case RDS_CMSG_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = 0; + break; + case RDS_CMSG_MASKED_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->m_fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; + break; + case RDS_CMSG_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->cswp.compare; + rm->atomic.op_m_cswp.swap = args->cswp.swap; + rm->atomic.op_m_cswp.compare_mask = ~0; + rm->atomic.op_m_cswp.swap_mask = ~0; + break; + case RDS_CMSG_MASKED_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->m_cswp.compare; + rm->atomic.op_m_cswp.swap = args->m_cswp.swap; + rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; + rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; + break; + default: + BUG(); /* should never happen */ + } + + rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); + rm->atomic.op_active = 1; + rm->atomic.op_recverr = rs->rs_recverr; + rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); + if (!rm->atomic.op_sg) { + ret = -ENOMEM; + goto err; + } + + /* verify 8 byte-aligned */ + if (args->local_addr & 0x7) { + ret = -EFAULT; + goto err; + } + + ret = rds_pin_pages(args->local_addr, 1, &page, 1); + if (ret != 1) + goto err; + ret = 0; + + sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); + + if (rm->atomic.op_notify || rm->atomic.op_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); + if (!rm->atomic.op_notifier) { + ret = -ENOMEM; + goto err; + } + + rm->atomic.op_notifier->n_user_token = args->user_token; + rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; + } + + rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); + rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); + + return ret; +err: + if (page) + put_page(page); + kfree(rm->atomic.op_notifier); + + return ret; } diff --git a/net/rds/rdma.h b/net/rds/rdma.h deleted file mode 100644 index 425512098b0..00000000000 --- a/net/rds/rdma.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef _RDS_RDMA_H -#define _RDS_RDMA_H - -#include <linux/rbtree.h> -#include <linux/spinlock.h> -#include <linux/scatterlist.h> - -#include "rds.h" - -struct rds_mr { - struct rb_node r_rb_node; - atomic_t r_refcount; - u32 r_key; - - /* A copy of the creation flags */ - unsigned int r_use_once:1; - unsigned int r_invalidate:1; - unsigned int r_write:1; - - /* This is for RDS_MR_DEAD. - * It would be nice & consistent to make this part of the above - * bit field here, but we need to use test_and_set_bit. - */ - unsigned long r_state; - struct rds_sock *r_sock; /* back pointer to the socket that owns us */ - struct rds_transport *r_trans; - void *r_trans_private; -}; - -/* Flags for mr->r_state */ -#define RDS_MR_DEAD 0 - -struct rds_rdma_op { - u32 r_key; - u64 r_remote_addr; - unsigned int r_write:1; - unsigned int r_fence:1; - unsigned int r_notify:1; - unsigned int r_recverr:1; - unsigned int r_mapped:1; - struct rds_notifier *r_notifier; - unsigned int r_bytes; - unsigned int r_nents; - unsigned int r_count; - struct scatterlist r_sg[0]; -}; - -static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) -{ - return r_key | (((u64) offset) << 32); -} - -static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) -{ - return cookie; -} - -static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) -{ - return cookie >> 32; -} - -int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); -int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); -void rds_rdma_drop_keys(struct rds_sock *rs); -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -void rds_rdma_free_op(struct rds_rdma_op *ro); -void rds_rdma_send_complete(struct rds_message *rm, int); - -extern void __rds_put_mr_final(struct rds_mr *mr); -static inline void rds_mr_put(struct rds_mr *mr) -{ - if (atomic_dec_and_test(&mr->r_refcount)) - __rds_put_mr_final(mr); -} - -#endif diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 7b19024f970..6cd9d1deafc 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -30,11 +30,40 @@ * SOFTWARE. * */ +#include <linux/module.h> #include <rdma/rdma_cm.h> #include "rdma_transport.h" -static struct rdma_cm_id *rds_iw_listen_id; +static struct rdma_cm_id *rds_rdma_listen_id; + +static char *rds_cm_event_strings[] = { +#define RDS_CM_EVENT_STRING(foo) \ + [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo) + RDS_CM_EVENT_STRING(ADDR_RESOLVED), + RDS_CM_EVENT_STRING(ADDR_ERROR), + RDS_CM_EVENT_STRING(ROUTE_RESOLVED), + RDS_CM_EVENT_STRING(ROUTE_ERROR), + RDS_CM_EVENT_STRING(CONNECT_REQUEST), + RDS_CM_EVENT_STRING(CONNECT_RESPONSE), + RDS_CM_EVENT_STRING(CONNECT_ERROR), + RDS_CM_EVENT_STRING(UNREACHABLE), + RDS_CM_EVENT_STRING(REJECTED), + RDS_CM_EVENT_STRING(ESTABLISHED), + RDS_CM_EVENT_STRING(DISCONNECTED), + RDS_CM_EVENT_STRING(DEVICE_REMOVAL), + RDS_CM_EVENT_STRING(MULTICAST_JOIN), + RDS_CM_EVENT_STRING(MULTICAST_ERROR), + RDS_CM_EVENT_STRING(ADDR_CHANGE), + RDS_CM_EVENT_STRING(TIMEWAIT_EXIT), +#undef RDS_CM_EVENT_STRING +}; + +static char *rds_cm_event_str(enum rdma_cm_event_type type) +{ + return rds_str_array(rds_cm_event_strings, + ARRAY_SIZE(rds_cm_event_strings), type); +}; int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) @@ -44,8 +73,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rds_transport *trans; int ret = 0; - rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, - event->event); + rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, + event->event, rds_cm_event_str(event->event)); if (cm_id->device->node_type == RDMA_NODE_RNIC) trans = &rds_iw_transport; @@ -101,7 +130,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_DISCONNECTED: - printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection " + rdsdebug("DISCONNECT event - dropping connection " "%pI4->%pI4\n", &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); @@ -109,8 +138,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, default: /* things like device disconnect? */ - printk(KERN_ERR "unknown event %u\n", event->event); - BUG(); + printk(KERN_ERR "RDS: unknown event %u (%s)!\n", + event->event, rds_cm_event_str(event->event)); break; } @@ -118,26 +147,28 @@ out: if (conn) mutex_unlock(&conn->c_cm_lock); - rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); + rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, + rds_cm_event_str(event->event), ret); return ret; } -static int __init rds_rdma_listen_init(void) +static int rds_rdma_listen_init(void) { struct sockaddr_in sin; struct rdma_cm_id *cm_id; int ret; - cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP); + cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP, + IB_QPT_RC); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); - printk(KERN_ERR "RDS/IW: failed to setup listener, " + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " "rdma_create_id() returned %d\n", ret); - goto out; + return ret; } - sin.sin_family = PF_INET, + sin.sin_family = AF_INET; sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); sin.sin_port = (__force u16)htons(RDS_PORT); @@ -147,21 +178,21 @@ static int __init rds_rdma_listen_init(void) */ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); if (ret) { - printk(KERN_ERR "RDS/IW: failed to setup listener, " + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " "rdma_bind_addr() returned %d\n", ret); goto out; } ret = rdma_listen(cm_id, 128); if (ret) { - printk(KERN_ERR "RDS/IW: failed to setup listener, " + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " "rdma_listen() returned %d\n", ret); goto out; } rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); - rds_iw_listen_id = cm_id; + rds_rdma_listen_id = cm_id; cm_id = NULL; out: if (cm_id) @@ -171,14 +202,14 @@ out: static void rds_rdma_listen_stop(void) { - if (rds_iw_listen_id) { - rdsdebug("cm %p\n", rds_iw_listen_id); - rdma_destroy_id(rds_iw_listen_id); - rds_iw_listen_id = NULL; + if (rds_rdma_listen_id) { + rdsdebug("cm %p\n", rds_rdma_listen_id); + rdma_destroy_id(rds_rdma_listen_id); + rds_rdma_listen_id = NULL; } } -int __init rds_rdma_init(void) +static int rds_rdma_init(void) { int ret; @@ -203,12 +234,18 @@ err_iw_init: out: return ret; } +module_init(rds_rdma_init); -void rds_rdma_exit(void) +static void rds_rdma_exit(void) { /* stop listening first to ensure no new connections are attempted */ rds_rdma_listen_stop(); rds_ib_exit(); rds_iw_exit(); } +module_exit(rds_rdma_exit); + +MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); +MODULE_DESCRIPTION("RDS: IB/iWARP transport"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index 2f2c7d976c2..faba4e38269 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -11,10 +11,6 @@ int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); -/* from rdma_transport.c */ -int rds_rdma_init(void); -void rds_rdma_exit(void); - /* from ib.c */ extern struct rds_transport rds_ib_transport; int rds_ib_init(void); diff --git a/net/rds/rds.h b/net/rds/rds.h index 71794449ca4..48f8ffc60f8 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -36,8 +36,8 @@ #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else /* sigh, pr_debug() causes unused variable warnings */ -static inline void __attribute__ ((format (printf, 1, 2))) -rdsdebug(char *fmt, ...) +static inline __printf(1, 2) +void rdsdebug(char *fmt, ...) { } #endif @@ -50,7 +50,6 @@ rdsdebug(char *fmt, ...) #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) #define RDS_CONG_MAP_BYTES (65536 / 8) -#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long)) #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) @@ -80,6 +79,7 @@ enum { /* Bits for c_flags */ #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 +#define RDS_IN_XMIT 2 struct rds_connection { struct hlist_node c_hash_node; @@ -91,12 +91,13 @@ struct rds_connection { struct rds_cong_map *c_lcong; struct rds_cong_map *c_fcong; - struct mutex c_send_lock; /* protect send ring */ struct rds_message *c_xmit_rm; unsigned long c_xmit_sg; unsigned int c_xmit_hdr_off; unsigned int c_xmit_data_off; + unsigned int c_xmit_atomic_sent; unsigned int c_xmit_rdma_sent; + unsigned int c_xmit_data_sent; spinlock_t c_lock; /* protect msg queues */ u64 c_next_tx_seq; @@ -116,11 +117,10 @@ struct rds_connection { struct delayed_work c_conn_w; struct work_struct c_down_w; struct mutex c_cm_lock; /* protect conn state & cm */ + wait_queue_head_t c_waitq; struct list_head c_map_item; unsigned long c_map_queued; - unsigned long c_map_offset; - unsigned long c_map_bytes; unsigned int c_unacked_packets; unsigned int c_unacked_bytes; @@ -132,7 +132,7 @@ struct rds_connection { #define RDS_FLAG_CONG_BITMAP 0x01 #define RDS_FLAG_ACK_REQUIRED 0x02 #define RDS_FLAG_RETRANSMITTED 0x04 -#define RDS_MAX_ADV_CREDIT 127 +#define RDS_MAX_ADV_CREDIT 255 /* * Maximum space available for extension headers. @@ -206,6 +206,48 @@ struct rds_incoming { rds_rdma_cookie_t i_rdma_cookie; }; +struct rds_mr { + struct rb_node r_rb_node; + atomic_t r_refcount; + u32 r_key; + + /* A copy of the creation flags */ + unsigned int r_use_once:1; + unsigned int r_invalidate:1; + unsigned int r_write:1; + + /* This is for RDS_MR_DEAD. + * It would be nice & consistent to make this part of the above + * bit field here, but we need to use test_and_set_bit. + */ + unsigned long r_state; + struct rds_sock *r_sock; /* back pointer to the socket that owns us */ + struct rds_transport *r_trans; + void *r_trans_private; +}; + +/* Flags for mr->r_state */ +#define RDS_MR_DEAD 0 + +static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) +{ + return r_key | (((u64) offset) << 32); +} + +static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) +{ + return cookie; +} + +static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) +{ + return cookie >> 32; +} + +/* atomic operation types */ +#define RDS_ATOMIC_TYPE_CSWP 0 +#define RDS_ATOMIC_TYPE_FADD 1 + /* * m_sock_item and m_conn_item are on lists that are serialized under * conn->c_lock. m_sock_item has additional meaning in that once it is empty @@ -258,13 +300,71 @@ struct rds_message { * -> rs->rs_lock */ spinlock_t m_rs_lock; + wait_queue_head_t m_flush_wait; + struct rds_sock *m_rs; - struct rds_rdma_op *m_rdma_op; + + /* cookie to send to remote, in rds header */ rds_rdma_cookie_t m_rdma_cookie; - struct rds_mr *m_rdma_mr; - unsigned int m_nents; - unsigned int m_count; - struct scatterlist m_sg[0]; + + unsigned int m_used_sgs; + unsigned int m_total_sgs; + + void *m_final_op; + + struct { + struct rm_atomic_op { + int op_type; + union { + struct { + uint64_t compare; + uint64_t swap; + uint64_t compare_mask; + uint64_t swap_mask; + } op_m_cswp; + struct { + uint64_t add; + uint64_t nocarry_mask; + } op_m_fadd; + }; + + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } atomic; + struct rm_rdma_op { + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_write:1; + unsigned int op_fence:1; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + unsigned int op_bytes; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } rdma; + struct rm_data_op { + unsigned int op_active:1; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + } data; + }; }; /* @@ -305,17 +405,19 @@ struct rds_notifier { * transport is responsible for other serialization, including * rds_recv_incoming(). This is called in process context but * should try hard not to block. - * - * @xmit_cong_map: This asks the transport to send the local bitmap down the - * given connection. XXX get a better story about the bitmap - * flag and header. */ +#define RDS_TRANS_IB 0 +#define RDS_TRANS_IWARP 1 +#define RDS_TRANS_TCP 2 +#define RDS_TRANS_COUNT 3 + struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; struct module *t_owner; unsigned int t_prefer_loopback:1; + unsigned int t_type; int (*laddr_check)(__be32 addr); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); @@ -326,13 +428,11 @@ struct rds_transport { void (*xmit_complete)(struct rds_connection *conn); int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); - int (*xmit_cong_map)(struct rds_connection *conn, - struct rds_cong_map *map, unsigned long offset); - int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op); + int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); + int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); int (*recv)(struct rds_connection *conn); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, size_t size); - void (*inc_purge)(struct rds_incoming *inc); void (*inc_free)(struct rds_incoming *inc); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, @@ -361,17 +461,11 @@ struct rds_sock { * bound_addr used for both incoming and outgoing, no INADDR_ANY * support. */ - struct rb_node rs_bound_node; + struct hlist_node rs_bound_node; __be32 rs_bound_addr; __be32 rs_conn_addr; __be16 rs_bound_port; __be16 rs_conn_port; - - /* - * This is only used to communicate the transport between bind and - * initiating connections. All other trans use is referenced through - * the connection. - */ struct rds_transport *rs_transport; /* @@ -382,6 +476,8 @@ struct rds_sock { /* flag indicating we were congested or not */ int rs_congested; + /* seen congestion (ENOBUFS) when sending? */ + int rs_seen_congestion; /* rs_lock protects all these adjacent members before the newline */ spinlock_t rs_lock; @@ -458,8 +554,8 @@ struct rds_statistics { uint64_t s_recv_ping; uint64_t s_send_queue_empty; uint64_t s_send_queue_full; - uint64_t s_send_sem_contention; - uint64_t s_send_sem_queue_raced; + uint64_t s_send_lock_contention; + uint64_t s_send_lock_queue_raced; uint64_t s_send_immediate_retry; uint64_t s_send_delayed_retry; uint64_t s_send_drop_acked; @@ -479,12 +575,13 @@ struct rds_statistics { }; /* af_rds.c */ +char *rds_str_array(char **array, size_t elements, size_t index); void rds_sock_addref(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs); static inline void __rds_wake_sk_sleep(struct sock *sk) { - wait_queue_head_t *waitq = sk->sk_sleep; + wait_queue_head_t *waitq = sk_sleep(sk); if (!sock_flag(sk, SOCK_DEAD) && waitq) wake_up(waitq); @@ -513,22 +610,23 @@ void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* conn.c */ -int __init rds_conn_init(void); +int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); +void rds_conn_shutdown(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn); -void rds_conn_reset(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); +void rds_conn_connect_if_down(struct rds_connection *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), size_t item_len); -void __rds_conn_error(struct rds_connection *conn, const char *, ...) - __attribute__ ((format (printf, 2, 3))); +__printf(2, 3) +void __rds_conn_error(struct rds_connection *conn, const char *, ...); #define rds_conn_error(conn, fmt...) \ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) @@ -558,7 +656,8 @@ rds_conn_connecting(struct rds_connection *conn) /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); -struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, size_t total_len); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, @@ -567,12 +666,9 @@ int rds_message_add_extension(struct rds_header *hdr, unsigned int type, const void *data, unsigned int len); int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); -int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version); -int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, size_t size); -void rds_message_inc_purge(struct rds_incoming *inc); void rds_message_inc_free(struct rds_incoming *inc); void rds_message_addref(struct rds_message *rm); void rds_message_put(struct rds_message *rm); @@ -606,10 +702,9 @@ void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr); -void rds_inc_addref(struct rds_incoming *inc); void rds_inc_put(struct rds_incoming *inc); void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, - struct rds_incoming *inc, gfp_t gfp, enum km_type km); + struct rds_incoming *inc, gfp_t gfp); int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int msg_flags); void rds_clear_recv_queue(struct rds_sock *rs); @@ -628,14 +723,38 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); -int rds_send_acked_before(struct rds_connection *conn, u64 seq); -void rds_send_remove_from_sock(struct list_head *messages, int status); int rds_send_pong(struct rds_connection *conn, __be16 dport); struct rds_message *rds_send_get_message(struct rds_connection *, - struct rds_rdma_op *); + struct rm_rdma_op *); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +void rds_rdma_drop_keys(struct rds_sock *rs); +int rds_rdma_extra_size(struct rds_rdma_args *args); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +void rds_rdma_free_op(struct rm_rdma_op *ro); +void rds_atomic_free_op(struct rm_atomic_op *ao); +void rds_rdma_send_complete(struct rds_message *rm, int wc_status); +void rds_atomic_send_complete(struct rds_message *rm, int wc_status); +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); + +void __rds_put_mr_final(struct rds_mr *mr); +static inline void rds_mr_put(struct rds_mr *mr) +{ + if (atomic_dec_and_test(&mr->r_refcount)) + __rds_put_mr_final(mr); +} /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); @@ -649,13 +768,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); put_cpu(); \ } while (0) #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) -int __init rds_stats_init(void); +int rds_stats_init(void); void rds_stats_exit(void); void rds_stats_info_copy(struct rds_info_iterator *iter, - uint64_t *values, char **names, size_t nr); + uint64_t *values, const char *const *names, + size_t nr); /* sysctl.c */ -int __init rds_sysctl_init(void); +int rds_sysctl_init(void); void rds_sysctl_exit(void); extern unsigned long rds_sysctl_sndbuf_min; extern unsigned long rds_sysctl_sndbuf_default; @@ -669,9 +789,10 @@ extern unsigned long rds_sysctl_trace_flags; extern unsigned int rds_sysctl_trace_level; /* threads.c */ -int __init rds_threads_init(void); +int rds_threads_init(void); void rds_threads_exit(void); extern struct workqueue_struct *rds_wq; +void rds_queue_reconnect(struct rds_connection *conn); void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); @@ -682,9 +803,10 @@ void rds_connect_complete(struct rds_connection *conn); int rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); struct rds_transport *rds_trans_get_preferred(__be32 addr); +void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); -int __init rds_trans_init(void); +int rds_trans_init(void); void rds_trans_exit(void); #endif diff --git a/net/rds/recv.c b/net/rds/recv.c index f2118c51cfa..bd82522534f 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -31,11 +31,12 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> #include <net/sock.h> #include <linux/in.h> +#include <linux/export.h> #include "rds.h" -#include "rdma.h" void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr) @@ -46,8 +47,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, inc->i_saddr = saddr; inc->i_rdma_cookie = 0; } +EXPORT_SYMBOL_GPL(rds_inc_init); -void rds_inc_addref(struct rds_incoming *inc) +static void rds_inc_addref(struct rds_incoming *inc) { rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); atomic_inc(&inc->i_refcount); @@ -62,6 +64,7 @@ void rds_inc_put(struct rds_incoming *inc) inc->i_conn->c_trans->inc_free(inc); } } +EXPORT_SYMBOL_GPL(rds_inc_put); static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, struct rds_cong_map *map, @@ -152,7 +155,7 @@ static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock * tell us which roles the addrs in the conn are playing for this message. */ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, - struct rds_incoming *inc, gfp_t gfp, enum km_type km) + struct rds_incoming *inc, gfp_t gfp) { struct rds_sock *rs = NULL; struct sock *sk; @@ -192,8 +195,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, * XXX we could spend more on the wire to get more robust failure * detection, arguably worth it to avoid data corruption. */ - if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq - && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { + if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq && + (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { rds_stats_inc(s_recv_drop_old_seq); goto out; } @@ -206,7 +209,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, } rs = rds_find_bound(daddr, inc->i_hdr.h_dport); - if (rs == NULL) { + if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; } @@ -237,6 +240,7 @@ out: if (rs) rds_sock_put(rs); } +EXPORT_SYMBOL_GPL(rds_recv_incoming); /* * be very careful here. This is being called as the condition in @@ -246,7 +250,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) { unsigned long flags; - if (*inc == NULL) { + if (!*inc) { read_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&rs->rs_recv_queue)) { *inc = list_entry(rs->rs_recv_queue.next, @@ -292,7 +296,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) { struct rds_notifier *notifier; - struct rds_rdma_notify cmsg; + struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */ unsigned int count = 0, max_messages = ~0U; unsigned long flags; LIST_HEAD(copy); @@ -329,10 +333,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) if (msghdr) { cmsg.user_token = notifier->n_user_token; - cmsg.status = notifier->n_status; + cmsg.status = notifier->n_status; err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, - sizeof(cmsg), &cmsg); + sizeof(cmsg), &cmsg); if (err) break; } @@ -398,7 +402,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, struct rds_sock *rs = rds_sk_to_rs(sk); long timeo; int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; - struct sockaddr_in *sin; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct rds_incoming *inc = NULL; /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ @@ -409,27 +413,28 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (msg_flags & MSG_OOB) goto out; - /* If there are pending notifications, do those - and nothing else */ - if (!list_empty(&rs->rs_notify_queue)) { - ret = rds_notify_queue_get(rs, msg); - goto out; - } + while (1) { + /* If there are pending notifications, do those - and nothing else */ + if (!list_empty(&rs->rs_notify_queue)) { + ret = rds_notify_queue_get(rs, msg); + break; + } - if (rs->rs_cong_notify) { - ret = rds_notify_cong(rs, msg); - goto out; - } + if (rs->rs_cong_notify) { + ret = rds_notify_cong(rs, msg); + break; + } - while (1) { if (!rds_next_incoming(rs, &inc)) { if (nonblock) { ret = -EAGAIN; break; } - timeo = wait_event_interruptible_timeout(*sk->sk_sleep, - rds_next_incoming(rs, &inc), - timeo); + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), + (!list_empty(&rs->rs_notify_queue) || + rs->rs_cong_notify || + rds_next_incoming(rs, &inc)), timeo); rdsdebug("recvmsg woke inc %p timeo %ld\n", inc, timeo); if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) @@ -474,12 +479,12 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rds_stats_inc(s_recv_delivered); - sin = (struct sockaddr_in *)msg->msg_name; if (sin) { sin->sin_family = AF_INET; sin->sin_port = inc->i_hdr.h_sport; sin->sin_addr.s_addr = inc->i_saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + msg->msg_namelen = sizeof(*sin); } break; } diff --git a/net/rds/send.c b/net/rds/send.c index 104fe033203..23718160d71 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -31,12 +31,15 @@ * */ #include <linux/kernel.h> +#include <linux/moduleparam.h> +#include <linux/gfp.h> #include <net/sock.h> #include <linux/in.h> #include <linux/list.h> +#include <linux/ratelimit.h> +#include <linux/export.h> #include "rds.h" -#include "rdma.h" /* When transmitting messages in rds_send_xmit, we need to emerge from * time to time and briefly release the CPU. Otherwise the softlock watchdog @@ -52,8 +55,11 @@ static int send_batch_count = 64; module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); +static void rds_send_remove_from_sock(struct list_head *messages, int status); + /* - * Reset the send state. Caller must hold c_send_lock when calling here. + * Reset the send state. Callers must ensure that this doesn't race with + * rds_send_xmit(). */ void rds_send_reset(struct rds_connection *conn) { @@ -61,18 +67,22 @@ void rds_send_reset(struct rds_connection *conn) unsigned long flags; if (conn->c_xmit_rm) { + rm = conn->c_xmit_rm; + conn->c_xmit_rm = NULL; /* Tell the user the RDMA op is no longer mapped by the * transport. This isn't entirely true (it's flushed out * independently) but as the connection is down, there's * no ongoing RDMA to/from that memory */ - rds_message_unmapped(conn->c_xmit_rm); - rds_message_put(conn->c_xmit_rm); - conn->c_xmit_rm = NULL; + rds_message_unmapped(rm); + rds_message_put(rm); } + conn->c_xmit_sg = 0; conn->c_xmit_hdr_off = 0; conn->c_xmit_data_off = 0; + conn->c_xmit_atomic_sent = 0; conn->c_xmit_rdma_sent = 0; + conn->c_xmit_data_sent = 0; conn->c_map_queued = 0; @@ -89,8 +99,27 @@ void rds_send_reset(struct rds_connection *conn) spin_unlock_irqrestore(&conn->c_lock, flags); } +static int acquire_in_xmit(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0; +} + +static void release_in_xmit(struct rds_connection *conn) +{ + clear_bit(RDS_IN_XMIT, &conn->c_flags); + smp_mb__after_atomic(); + /* + * We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + /* - * We're making the concious trade-off here to only send one message + * We're making the conscious trade-off here to only send one message * down the connection at a time. * Pro: * - tx queueing is a simple fifo list @@ -108,102 +137,69 @@ int rds_send_xmit(struct rds_connection *conn) struct rds_message *rm; unsigned long flags; unsigned int tmp; - unsigned int send_quota = send_batch_count; struct scatterlist *sg; int ret = 0; - int was_empty = 0; LIST_HEAD(to_be_dropped); +restart: + /* * sendmsg calls here after having queued its message on the send * queue. We only have one task feeding the connection at a time. If * another thread is already feeding the queue then we back off. This * avoids blocking the caller and trading per-connection data between * caches per message. - * - * The sem holder will issue a retry if they notice that someone queued - * a message after they stopped walking the send queue but before they - * dropped the sem. */ - if (!mutex_trylock(&conn->c_send_lock)) { - rds_stats_inc(s_send_sem_contention); + if (!acquire_in_xmit(conn)) { + rds_stats_inc(s_send_lock_contention); ret = -ENOMEM; goto out; } + /* + * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, + * we do the opposite to avoid races. + */ + if (!rds_conn_up(conn)) { + release_in_xmit(conn); + ret = 0; + goto out; + } + if (conn->c_trans->xmit_prepare) conn->c_trans->xmit_prepare(conn); /* * spin trying to push headers and data down the connection until - * the connection doens't make forward progress. + * the connection doesn't make forward progress. */ - while (--send_quota) { - /* - * See if need to send a congestion map update if we're - * between sending messages. The send_sem protects our sole - * use of c_map_offset and _bytes. - * Note this is used only by transports that define a special - * xmit_cong_map function. For all others, we create allocate - * a cong_map message and treat it just like any other send. - */ - if (conn->c_map_bytes) { - ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, - conn->c_map_offset); - if (ret <= 0) - break; - - conn->c_map_offset += ret; - conn->c_map_bytes -= ret; - if (conn->c_map_bytes) - continue; - } + while (1) { - /* If we're done sending the current message, clear the - * offset and S/G temporaries. - */ rm = conn->c_xmit_rm; - if (rm != NULL && - conn->c_xmit_hdr_off == sizeof(struct rds_header) && - conn->c_xmit_sg == rm->m_nents) { - conn->c_xmit_rm = NULL; - conn->c_xmit_sg = 0; - conn->c_xmit_hdr_off = 0; - conn->c_xmit_data_off = 0; - conn->c_xmit_rdma_sent = 0; - /* Release the reference to the previous message. */ - rds_message_put(rm); - rm = NULL; - } - - /* If we're asked to send a cong map update, do so. + /* + * If between sending messages, we can send a pending congestion + * map update. */ - if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { - if (conn->c_trans->xmit_cong_map != NULL) { - conn->c_map_offset = 0; - conn->c_map_bytes = sizeof(struct rds_header) + - RDS_CONG_MAP_BYTES; - continue; - } - + if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { rm = rds_cong_update_alloc(conn); if (IS_ERR(rm)) { ret = PTR_ERR(rm); break; } + rm->data.op_active = 1; conn->c_xmit_rm = rm; } /* - * Grab the next message from the send queue, if there is one. + * If not already working on one, grab the next message. * * c_xmit_rm holds a ref while we're sending this message down * the connction. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it. */ - if (rm == NULL) { + if (!rm) { unsigned int len; spin_lock_irqsave(&conn->c_lock, flags); @@ -223,10 +219,8 @@ int rds_send_xmit(struct rds_connection *conn) spin_unlock_irqrestore(&conn->c_lock, flags); - if (rm == NULL) { - was_empty = 1; + if (!rm) break; - } /* Unfortunately, the way Infiniband deals with * RDMA to a bad MR key is by moving the entire @@ -235,20 +229,19 @@ int rds_send_xmit(struct rds_connection *conn) * connection. * Therefore, we never retransmit messages with RDMA ops. */ - if (rm->m_rdma_op - && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { + if (rm->rdma.op_active && + test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { spin_lock_irqsave(&conn->c_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); spin_unlock_irqrestore(&conn->c_lock, flags); - rds_message_put(rm); continue; } /* Require an ACK every once in a while */ len = ntohl(rm->m_inc.i_hdr.h_len); - if (conn->c_unacked_packets == 0 - || conn->c_unacked_bytes < len) { + if (conn->c_unacked_packets == 0 || + conn->c_unacked_bytes < len) { __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); conn->c_unacked_packets = rds_sysctl_max_unacked_packets; @@ -262,23 +255,55 @@ int rds_send_xmit(struct rds_connection *conn) conn->c_xmit_rm = rm; } - /* - * Try and send an rdma message. Let's see if we can - * keep this simple and require that the transport either - * send the whole rdma or none of it. - */ - if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) { - ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op); + /* The transport either sends the whole rdma or none of it */ + if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { + rm->m_final_op = &rm->rdma; + ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); if (ret) break; conn->c_xmit_rdma_sent = 1; + /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); } - if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || - conn->c_xmit_sg < rm->m_nents) { + if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { + rm->m_final_op = &rm->atomic; + ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); + if (ret) + break; + conn->c_xmit_atomic_sent = 1; + + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); + } + + /* + * A number of cases require an RDS header to be sent + * even if there is no data. + * We permit 0-byte sends; rds-ping depends on this. + * However, if there are exclusively attached silent ops, + * we skip the hdr/data send, to enable silent operation. + */ + if (rm->data.op_nents == 0) { + int ops_present; + int all_ops_are_silent = 1; + + ops_present = (rm->atomic.op_active || rm->rdma.op_active); + if (rm->atomic.op_active && !rm->atomic.op_silent) + all_ops_are_silent = 0; + if (rm->rdma.op_active && !rm->rdma.op_silent) + all_ops_are_silent = 0; + + if (ops_present && all_ops_are_silent + && !rm->m_rdma_cookie) + rm->data.op_active = 0; + } + + if (rm->data.op_active && !conn->c_xmit_data_sent) { + rm->m_final_op = &rm->data; ret = conn->c_trans->xmit(conn, rm, conn->c_xmit_hdr_off, conn->c_xmit_sg, @@ -294,7 +319,7 @@ int rds_send_xmit(struct rds_connection *conn) ret -= tmp; } - sg = &rm->m_sg[conn->c_xmit_sg]; + sg = &rm->data.op_sg[conn->c_xmit_sg]; while (ret) { tmp = min_t(int, ret, sg->length - conn->c_xmit_data_off); @@ -305,49 +330,63 @@ int rds_send_xmit(struct rds_connection *conn) sg++; conn->c_xmit_sg++; BUG_ON(ret != 0 && - conn->c_xmit_sg == rm->m_nents); + conn->c_xmit_sg == rm->data.op_nents); } } + + if (conn->c_xmit_hdr_off == sizeof(struct rds_header) && + (conn->c_xmit_sg == rm->data.op_nents)) + conn->c_xmit_data_sent = 1; } - } - /* Nuke any messages we decided not to retransmit. */ - if (!list_empty(&to_be_dropped)) - rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); + /* + * A rm will only take multiple times through this loop + * if there is a data op. Thus, if the data is sent (or there was + * none), then we're done with the rm. + */ + if (!rm->data.op_active || conn->c_xmit_data_sent) { + conn->c_xmit_rm = NULL; + conn->c_xmit_sg = 0; + conn->c_xmit_hdr_off = 0; + conn->c_xmit_data_off = 0; + conn->c_xmit_rdma_sent = 0; + conn->c_xmit_atomic_sent = 0; + conn->c_xmit_data_sent = 0; + + rds_message_put(rm); + } + } if (conn->c_trans->xmit_complete) conn->c_trans->xmit_complete(conn); - /* - * We might be racing with another sender who queued a message but - * backed off on noticing that we held the c_send_lock. If we check - * for queued messages after dropping the sem then either we'll - * see the queued message or the queuer will get the sem. If we - * notice the queued message then we trigger an immediate retry. - * - * We need to be careful only to do this when we stopped processing - * the send queue because it was empty. It's the only way we - * stop processing the loop when the transport hasn't taken - * responsibility for forward progress. - */ - mutex_unlock(&conn->c_send_lock); + release_in_xmit(conn); - if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { - /* We exhausted the send quota, but there's work left to - * do. Return and (re-)schedule the send worker. - */ - ret = -EAGAIN; + /* Nuke any messages we decided not to retransmit. */ + if (!list_empty(&to_be_dropped)) { + /* irqs on here, so we can put(), unlike above */ + list_for_each_entry(rm, &to_be_dropped, m_conn_item) + rds_message_put(rm); + rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); } - if (ret == 0 && was_empty) { - /* A simple bit test would be way faster than taking the - * spin lock */ - spin_lock_irqsave(&conn->c_lock, flags); + /* + * Other senders can queue a message after we last test the send queue + * but before we clear RDS_IN_XMIT. In that case they'd back off and + * not try and send their newly queued message. We need to check the + * send queue after having cleared RDS_IN_XMIT so that their message + * doesn't get stuck on the send queue. + * + * If the transport cannot continue (i.e ret != 0), then it must + * call us when more room is available, such as from the tx + * completion handler. + */ + if (ret == 0) { + smp_mb(); if (!list_empty(&conn->c_send_queue)) { - rds_stats_inc(s_send_sem_queue_raced); - ret = -EAGAIN; + rds_stats_inc(s_send_lock_queue_raced); + goto restart; } - spin_unlock_irqrestore(&conn->c_lock, flags); } out: return ret; @@ -375,52 +414,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, } /* - * Returns true if there are no messages on the send and retransmit queues - * which have a sequence number greater than or equal to the given sequence - * number. + * This is pretty similar to what happens below in the ACK + * handling code - except that we call here as soon as we get + * the IB send completion on the RDMA op and the accompanying + * message. */ -int rds_send_acked_before(struct rds_connection *conn, u64 seq) +void rds_rdma_send_complete(struct rds_message *rm, int status) { - struct rds_message *rm, *tmp; - int ret = 1; + struct rds_sock *rs = NULL; + struct rm_rdma_op *ro; + struct rds_notifier *notifier; + unsigned long flags; - spin_lock(&conn->c_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags); - list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) - ret = 0; - break; - } + ro = &rm->rdma; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && + ro->op_active && ro->op_notify && ro->op_notifier) { + notifier = ro->op_notifier; + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); - list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) - ret = 0; - break; + notifier->n_status = status; + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + + ro->op_notifier = NULL; } - spin_unlock(&conn->c_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags); - return ret; + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } } +EXPORT_SYMBOL_GPL(rds_rdma_send_complete); /* - * This is pretty similar to what happens below in the ACK - * handling code - except that we call here as soon as we get - * the IB send completion on the RDMA op and the accompanying - * message. + * Just like above, except looks at atomic op */ -void rds_rdma_send_complete(struct rds_message *rm, int status) +void rds_atomic_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; - struct rds_rdma_op *ro; + struct rm_atomic_op *ao; struct rds_notifier *notifier; + unsigned long flags; - spin_lock(&rm->m_rs_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags); - ro = rm->m_rdma_op; + ao = &rm->atomic; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) - && ro && ro->r_notify && ro->r_notifier) { - notifier = ro->r_notifier; + && ao->op_active && ao->op_notify && ao->op_notifier) { + notifier = ao->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); @@ -429,16 +476,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); - ro->r_notifier = NULL; + ao->op_notifier = NULL; } - spin_unlock(&rm->m_rs_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } +EXPORT_SYMBOL_GPL(rds_atomic_send_complete); /* * This is the same as rds_rdma_send_complete except we @@ -446,15 +494,23 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) * socket, socket lock) and can just move the notifier. */ static inline void -__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) +__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) { - struct rds_rdma_op *ro; + struct rm_rdma_op *ro; + struct rm_atomic_op *ao; + + ro = &rm->rdma; + if (ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_notifier->n_status = status; + list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); + ro->op_notifier = NULL; + } - ro = rm->m_rdma_op; - if (ro && ro->r_notify && ro->r_notifier) { - ro->r_notifier->n_status = status; - list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); - ro->r_notifier = NULL; + ao = &rm->atomic; + if (ao->op_active && ao->op_notify && ao->op_notifier) { + ao->op_notifier->n_status = status; + list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); + ao->op_notifier = NULL; } /* No need to wake the app - caller does this */ @@ -466,7 +522,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status * So speed is not an issue here. */ struct rds_message *rds_send_get_message(struct rds_connection *conn, - struct rds_rdma_op *op) + struct rm_rdma_op *op) { struct rds_message *rm, *tmp, *found = NULL; unsigned long flags; @@ -474,7 +530,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, spin_lock_irqsave(&conn->c_lock, flags); list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (rm->m_rdma_op == op) { + if (&rm->rdma == op) { atomic_inc(&rm->m_refcount); found = rm; goto out; @@ -482,7 +538,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, } list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (rm->m_rdma_op == op) { + if (&rm->rdma == op) { atomic_inc(&rm->m_refcount); found = rm; break; @@ -494,6 +550,7 @@ out: return found; } +EXPORT_SYMBOL_GPL(rds_send_get_message); /* * This removes messages from the socket's list if they're on it. The list @@ -503,14 +560,15 @@ out: * removing the messages from the 'messages' list regardless of if it found * the messages on the socket list or not. */ -void rds_send_remove_from_sock(struct list_head *messages, int status) +static void rds_send_remove_from_sock(struct list_head *messages, int status) { - unsigned long flags = 0; /* silence gcc :P */ + unsigned long flags; struct rds_sock *rs = NULL; struct rds_message *rm; - local_irq_save(flags); while (!list_empty(messages)) { + int was_on_sock = 0; + rm = list_entry(messages->next, struct rds_message, m_conn_item); list_del_init(&rm->m_conn_item); @@ -525,52 +583,52 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) * while we're messing with it. It does not prevent the * message from being removed from the socket, though. */ - spin_lock(&rm->m_rs_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags); if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) goto unlock_and_drop; if (rs != rm->m_rs) { if (rs) { - spin_unlock(&rs->rs_lock); rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } rs = rm->m_rs; - spin_lock(&rs->rs_lock); sock_hold(rds_rs_to_sk(rs)); } + spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { - struct rds_rdma_op *ro = rm->m_rdma_op; + struct rm_rdma_op *ro = &rm->rdma; struct rds_notifier *notifier; list_del_init(&rm->m_sock_item); rds_send_sndbuf_remove(rs, rm); - if (ro && ro->r_notifier - && (status || ro->r_notify)) { - notifier = ro->r_notifier; + if (ro->op_active && ro->op_notifier && + (ro->op_notify || (ro->op_recverr && status))) { + notifier = ro->op_notifier; list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); if (!notifier->n_status) notifier->n_status = status; - rm->m_rdma_op->r_notifier = NULL; + rm->rdma.op_notifier = NULL; } - rds_message_put(rm); + was_on_sock = 1; rm->m_rs = NULL; } + spin_unlock(&rs->rs_lock); unlock_and_drop: - spin_unlock(&rm->m_rs_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); + if (was_on_sock) + rds_message_put(rm); } if (rs) { - spin_unlock(&rs->rs_lock); rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } - local_irq_restore(flags); } /* @@ -603,21 +661,21 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, /* order flag updates with spin locks */ if (!list_empty(&list)) - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&conn->c_lock, flags); /* now remove the messages from the sock list as needed */ rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); } +EXPORT_SYMBOL_GPL(rds_send_drop_acked); void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; - unsigned long flags, flags2; + unsigned long flags; LIST_HEAD(list); - int wake = 0; /* get all the messages we're dropping under the rs lock */ spin_lock_irqsave(&rs->rs_lock, flags); @@ -627,58 +685,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) dest->sin_port != rm->m_inc.i_hdr.h_dport)) continue; - wake = 1; list_move(&rm->m_sock_item, &list); rds_send_sndbuf_remove(rs, rm); clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); - - /* If this is a RDMA operation, notify the app. */ - __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); } /* order flag updates with the rs lock */ - if (wake) - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&rs->rs_lock, flags); - if (wake) - rds_wake_sk_sleep(rs); - - conn = NULL; + if (list_empty(&list)) + return; - /* now remove the messages from the conn list as needed */ + /* Remove the messages from the conn */ list_for_each_entry(rm, &list, m_sock_item) { - /* We do this here rather than in the loop above, so that - * we don't have to nest m_rs_lock under rs->rs_lock */ - spin_lock_irqsave(&rm->m_rs_lock, flags2); - rm->m_rs = NULL; - spin_unlock_irqrestore(&rm->m_rs_lock, flags2); + conn = rm->m_inc.i_conn; + + spin_lock_irqsave(&conn->c_lock, flags); /* - * If we see this flag cleared then we're *sure* that someone - * else beat us to removing it from the conn. If we race - * with their flag update we'll get the lock and then really - * see that the flag has been cleared. + * Maybe someone else beat us to removing rm from the conn. + * If we race with their flag update we'll get the lock and + * then really see that the flag has been cleared. */ - if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) + if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { + spin_unlock_irqrestore(&conn->c_lock, flags); continue; - - if (conn != rm->m_inc.i_conn) { - if (conn) - spin_unlock_irqrestore(&conn->c_lock, flags); - conn = rm->m_inc.i_conn; - spin_lock_irqsave(&conn->c_lock, flags); } + list_del_init(&rm->m_conn_item); + spin_unlock_irqrestore(&conn->c_lock, flags); - if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { - list_del_init(&rm->m_conn_item); - rds_message_put(rm); - } + /* + * Couldn't grab m_rs_lock in top loop (lock ordering), + * but we can now. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); + + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + rds_message_put(rm); } - if (conn) - spin_unlock_irqrestore(&conn->c_lock, flags); + rds_wake_sk_sleep(rs); while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); @@ -758,6 +812,63 @@ out: return *queued; } +/* + * rds_message is getting to be quite complicated, and we'd like to allocate + * it all in one go. This figures out how big it needs to be up front. + */ +static int rds_rm_size(struct msghdr *msg, int data_len) +{ + struct cmsghdr *cmsg; + int size = 0; + int cmsg_groups = 0; + int retval; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + cmsg_groups |= 1; + retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); + if (retval < 0) + return retval; + size += retval; + + break; + + case RDS_CMSG_RDMA_DEST: + case RDS_CMSG_RDMA_MAP: + cmsg_groups |= 2; + /* these are valid but do no add any size */ + break; + + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + cmsg_groups |= 1; + size += sizeof(struct scatterlist); + break; + + default: + return -EINVAL; + } + + } + + size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + + /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ + if (cmsg_groups == 3) + return -EINVAL; + + return size; +} + static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr) { @@ -772,7 +883,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, continue; /* As a side effect, RDMA_DEST and RDMA_MAP will set - * rm->m_rdma_cookie and rm->m_rdma_mr. + * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. */ switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: @@ -788,6 +899,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, if (!ret) *allocated_mr = 1; break; + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + ret = rds_cmsg_atomic(rs, rm, cmsg); + break; default: return -EINVAL; @@ -805,7 +922,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); __be32 daddr; __be16 dport; struct rds_message *rm = NULL; @@ -813,12 +930,11 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, int ret = 0; int queued = 0, allocated_mr = 0; int nonblock = msg->msg_flags & MSG_DONTWAIT; - long timeo = sock_rcvtimeo(sk, nonblock); + long timeo = sock_sndtimeo(sk, nonblock); /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { - printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags); ret = -EOPNOTSUPP; goto out; } @@ -845,19 +961,31 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; } - rm = rds_message_copy_from_user(msg->msg_iov, payload_len); - if (IS_ERR(rm)) { - ret = PTR_ERR(rm); - rm = NULL; + /* size of rm including all sgs */ + ret = rds_rm_size(msg, payload_len); + if (ret < 0) + goto out; + + rm = rds_message_alloc(ret, GFP_KERNEL); + if (!rm) { + ret = -ENOMEM; goto out; } - rm->m_daddr = daddr; + /* Attach data to the rm */ + if (payload_len) { + rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + if (!rm->data.op_sg) { + ret = -ENOMEM; + goto out; + } + ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); + if (ret) + goto out; + } + rm->data.op_active = 1; - /* Parse any control messages the user may have included. */ - ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); - if (ret) - goto out; + rm->m_daddr = daddr; /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ @@ -874,26 +1002,32 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rs->rs_conn = conn; } - if ((rm->m_rdma_cookie || rm->m_rdma_op) - && conn->c_trans->xmit_rdma == NULL) { - if (printk_ratelimit()) - printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", - rm->m_rdma_op, conn->c_trans->xmit_rdma); + /* Parse any control messages the user may have included. */ + ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); + if (ret) + goto out; + + if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { + printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", + &rm->rdma, conn->c_trans->xmit_rdma); + ret = -EOPNOTSUPP; + goto out; + } + + if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { + printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", + &rm->atomic, conn->c_trans->xmit_atomic); ret = -EOPNOTSUPP; goto out; } - /* If the connection is down, trigger a connect. We may - * have scheduled a delayed reconnect however - in this case - * we should not interfere. - */ - if (rds_conn_state(conn) == RDS_CONN_DOWN - && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + rds_conn_connect_if_down(conn); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); - if (ret) + if (ret) { + rs->rs_seen_congestion = 1; goto out; + } while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport, &queued)) { @@ -908,7 +1042,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; } - timeo = wait_event_interruptible_timeout(*sk->sk_sleep, + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport, @@ -931,7 +1065,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rds_stats_inc(s_send_queued); if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - rds_send_worker(&conn->c_send_w.work); + rds_send_xmit(conn); rds_message_put(rm); return payload_len; @@ -959,20 +1093,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) int ret = 0; rm = rds_message_alloc(0, GFP_ATOMIC); - if (rm == NULL) { + if (!rm) { ret = -ENOMEM; goto out; } rm->m_daddr = conn->c_faddr; + rm->data.op_active = 1; - /* If the connection is down, trigger a connect. We may - * have scheduled a delayed reconnect however - in this case - * we should not interfere. - */ - if (rds_conn_state(conn) == RDS_CONN_DOWN - && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + rds_conn_connect_if_down(conn); ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); if (ret) @@ -992,7 +1121,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + rds_message_put(rm); return 0; diff --git a/net/rds/stats.c b/net/rds/stats.c index 637146893cf..73be187d389 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -33,14 +33,16 @@ #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/export.h> #include "rds.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); +EXPORT_PER_CPU_SYMBOL_GPL(rds_stats); /* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */ -static char *rds_stat_names[] = { +static const char *const rds_stat_names[] = { "conn_reset", "recv_drop_bad_checksum", "recv_drop_old_seq", @@ -56,8 +58,8 @@ static char *rds_stat_names[] = { "recv_ping", "send_queue_empty", "send_queue_full", - "send_sem_contention", - "send_sem_queue_raced", + "send_lock_contention", + "send_lock_queue_raced", "send_immediate_retry", "send_delayed_retry", "send_drop_acked", @@ -77,7 +79,7 @@ static char *rds_stat_names[] = { }; void rds_stats_info_copy(struct rds_info_iterator *iter, - uint64_t *values, char **names, size_t nr) + uint64_t *values, const char *const *names, size_t nr) { struct rds_info_counter ctr; size_t i; @@ -85,11 +87,13 @@ void rds_stats_info_copy(struct rds_info_iterator *iter, for (i = 0; i < nr; i++) { BUG_ON(strlen(names[i]) >= sizeof(ctr.name)); strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); + ctr.name[sizeof(ctr.name) - 1] = '\0'; ctr.value = values[i]; rds_info_copy(iter, &ctr, sizeof(ctr)); } } +EXPORT_SYMBOL_GPL(rds_stats_info_copy); /* * This gives global counters across all the transports. The strings @@ -141,7 +145,7 @@ void rds_stats_exit(void) rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); } -int __init rds_stats_init(void) +int rds_stats_init(void) { rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); return 0; diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index 307dc5c1be1..c3b0cd43eb5 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -49,74 +49,61 @@ unsigned int rds_sysctl_max_unacked_bytes = (16 << 20); unsigned int rds_sysctl_ping_enable = 1; -static ctl_table rds_sysctl_rds_table[] = { +static struct ctl_table rds_sysctl_rds_table[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "reconnect_min_delay_ms", .data = &rds_sysctl_reconnect_min_jiffies, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, .extra1 = &rds_sysctl_reconnect_min, .extra2 = &rds_sysctl_reconnect_max_jiffies, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "reconnect_max_delay_ms", .data = &rds_sysctl_reconnect_max_jiffies, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, .extra1 = &rds_sysctl_reconnect_min_jiffies, .extra2 = &rds_sysctl_reconnect_max, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "max_unacked_packets", .data = &rds_sysctl_max_unacked_packets, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "max_unacked_bytes", .data = &rds_sysctl_max_unacked_bytes, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "ping_enable", .data = &rds_sysctl_ping_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0} -}; - -static struct ctl_path rds_sysctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "rds", .ctl_name = CTL_UNNUMBERED, }, { } }; - void rds_sysctl_exit(void) { - if (rds_sysctl_reg_table) - unregister_sysctl_table(rds_sysctl_reg_table); + unregister_net_sysctl_table(rds_sysctl_reg_table); } -int __init rds_sysctl_init(void) +int rds_sysctl_init(void) { rds_sysctl_reconnect_min = msecs_to_jiffies(1); rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; - rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); - if (rds_sysctl_reg_table == NULL) + rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table); + if (!rds_sysctl_reg_table) return -ENOMEM; return 0; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c new file mode 100644 index 00000000000..edac9ef2bc8 --- /dev/null +++ b/net/rds/tcp.c @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/in.h> +#include <linux/module.h> +#include <net/tcp.h> + +#include "rds.h" +#include "tcp.h" + +/* only for info exporting */ +static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); +static LIST_HEAD(rds_tcp_tc_list); +static unsigned int rds_tcp_tc_count; + +/* Track rds_tcp_connection structs so they can be cleaned up */ +static DEFINE_SPINLOCK(rds_tcp_conn_lock); +static LIST_HEAD(rds_tcp_conn_list); + +static struct kmem_cache *rds_tcp_conn_slab; + +#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024) + +/* doing it this way avoids calling tcp_sk() */ +void rds_tcp_nonagle(struct socket *sock) +{ + mm_segment_t oldfs = get_fs(); + int val = 1; + + set_fs(KERNEL_DS); + sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val, + sizeof(val)); + set_fs(oldfs); +} + +void rds_tcp_tune(struct socket *sock) +{ + struct sock *sk = sock->sk; + + rds_tcp_nonagle(sock); + + /* + * We're trying to saturate gigabit with the default, + * see svc_sock_setbufsize(). + */ + lock_sock(sk); + sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE; + sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; + release_sock(sk); +} + +u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) +{ + return tcp_sk(tc->t_sock->sk)->snd_nxt; +} + +u32 rds_tcp_snd_una(struct rds_tcp_connection *tc) +{ + return tcp_sk(tc->t_sock->sk)->snd_una; +} + +void rds_tcp_restore_callbacks(struct socket *sock, + struct rds_tcp_connection *tc) +{ + rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc); + write_lock_bh(&sock->sk->sk_callback_lock); + + /* done under the callback_lock to serialize with write_space */ + spin_lock(&rds_tcp_tc_list_lock); + list_del_init(&tc->t_list_item); + rds_tcp_tc_count--; + spin_unlock(&rds_tcp_tc_list_lock); + + tc->t_sock = NULL; + + sock->sk->sk_write_space = tc->t_orig_write_space; + sock->sk->sk_data_ready = tc->t_orig_data_ready; + sock->sk->sk_state_change = tc->t_orig_state_change; + sock->sk->sk_user_data = NULL; + + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +/* + * This is the only path that sets tc->t_sock. Send and receive trust that + * it is set. The RDS_CONN_CONNECTED bit protects those paths from being + * called while it isn't set. + */ +void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + + rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc); + write_lock_bh(&sock->sk->sk_callback_lock); + + /* done under the callback_lock to serialize with write_space */ + spin_lock(&rds_tcp_tc_list_lock); + list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); + rds_tcp_tc_count++; + spin_unlock(&rds_tcp_tc_list_lock); + + /* accepted sockets need our listen data ready undone */ + if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready) + sock->sk->sk_data_ready = sock->sk->sk_user_data; + + tc->t_sock = sock; + tc->conn = conn; + tc->t_orig_data_ready = sock->sk->sk_data_ready; + tc->t_orig_write_space = sock->sk->sk_write_space; + tc->t_orig_state_change = sock->sk->sk_state_change; + + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = rds_tcp_data_ready; + sock->sk->sk_write_space = rds_tcp_write_space; + sock->sk->sk_state_change = rds_tcp_state_change; + + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void rds_tcp_tc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_info_tcp_socket tsinfo; + struct rds_tcp_connection *tc; + unsigned long flags; + struct sockaddr_in sin; + int sinlen; + + spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); + + if (len / sizeof(tsinfo) < rds_tcp_tc_count) + goto out; + + list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + + sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0); + tsinfo.local_addr = sin.sin_addr.s_addr; + tsinfo.local_port = sin.sin_port; + sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1); + tsinfo.peer_addr = sin.sin_addr.s_addr; + tsinfo.peer_port = sin.sin_port; + + tsinfo.hdr_rem = tc->t_tinc_hdr_rem; + tsinfo.data_rem = tc->t_tinc_data_rem; + tsinfo.last_sent_nxt = tc->t_last_sent_nxt; + tsinfo.last_expected_una = tc->t_last_expected_una; + tsinfo.last_seen_una = tc->t_last_seen_una; + + rds_info_copy(iter, &tsinfo, sizeof(tsinfo)); + } + +out: + lens->nr = rds_tcp_tc_count; + lens->each = sizeof(tsinfo); + + spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); +} + +static int rds_tcp_laddr_check(__be32 addr) +{ + if (inet_addr_type(&init_net, addr) == RTN_LOCAL) + return 0; + return -EADDRNOTAVAIL; +} + +static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_tcp_connection *tc; + + tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); + if (!tc) + return -ENOMEM; + + tc->t_sock = NULL; + tc->t_tinc = NULL; + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; + + conn->c_transport_data = tc; + + spin_lock_irq(&rds_tcp_conn_lock); + list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); + spin_unlock_irq(&rds_tcp_conn_lock); + + rdsdebug("alloced tc %p\n", conn->c_transport_data); + return 0; +} + +static void rds_tcp_conn_free(void *arg) +{ + struct rds_tcp_connection *tc = arg; + unsigned long flags; + rdsdebug("freeing tc %p\n", tc); + + spin_lock_irqsave(&rds_tcp_conn_lock, flags); + list_del(&tc->t_tcp_node); + spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); + + kmem_cache_free(rds_tcp_conn_slab, tc); +} + +static void rds_tcp_destroy_conns(void) +{ + struct rds_tcp_connection *tc, *_tc; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&rds_tcp_conn_lock); + list_splice(&rds_tcp_conn_list, &tmp_list); + INIT_LIST_HEAD(&rds_tcp_conn_list); + spin_unlock_irq(&rds_tcp_conn_lock); + + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { + if (tc->conn->c_passive) + rds_conn_destroy(tc->conn->c_passive); + rds_conn_destroy(tc->conn); + } +} + +static void rds_tcp_exit(void) +{ + rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + rds_tcp_listen_stop(); + rds_tcp_destroy_conns(); + rds_trans_unregister(&rds_tcp_transport); + rds_tcp_recv_exit(); + kmem_cache_destroy(rds_tcp_conn_slab); +} +module_exit(rds_tcp_exit); + +struct rds_transport rds_tcp_transport = { + .laddr_check = rds_tcp_laddr_check, + .xmit_prepare = rds_tcp_xmit_prepare, + .xmit_complete = rds_tcp_xmit_complete, + .xmit = rds_tcp_xmit, + .recv = rds_tcp_recv, + .conn_alloc = rds_tcp_conn_alloc, + .conn_free = rds_tcp_conn_free, + .conn_connect = rds_tcp_conn_connect, + .conn_shutdown = rds_tcp_conn_shutdown, + .inc_copy_to_user = rds_tcp_inc_copy_to_user, + .inc_free = rds_tcp_inc_free, + .stats_info_copy = rds_tcp_stats_info_copy, + .exit = rds_tcp_exit, + .t_owner = THIS_MODULE, + .t_name = "tcp", + .t_type = RDS_TRANS_TCP, + .t_prefer_loopback = 1, +}; + +static int rds_tcp_init(void) +{ + int ret; + + rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", + sizeof(struct rds_tcp_connection), + 0, 0, NULL); + if (!rds_tcp_conn_slab) { + ret = -ENOMEM; + goto out; + } + + ret = rds_tcp_recv_init(); + if (ret) + goto out_slab; + + ret = rds_trans_register(&rds_tcp_transport); + if (ret) + goto out_recv; + + ret = rds_tcp_listen_init(); + if (ret) + goto out_register; + + rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + + goto out; + +out_register: + rds_trans_unregister(&rds_tcp_transport); +out_recv: + rds_tcp_recv_exit(); +out_slab: + kmem_cache_destroy(rds_tcp_conn_slab); +out: + return ret; +} +module_init(rds_tcp_init); + +MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); +MODULE_DESCRIPTION("RDS: TCP transport"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/net/rds/tcp.h b/net/rds/tcp.h new file mode 100644 index 00000000000..65637491f72 --- /dev/null +++ b/net/rds/tcp.h @@ -0,0 +1,88 @@ +#ifndef _RDS_TCP_H +#define _RDS_TCP_H + +#define RDS_TCP_PORT 16385 + +struct rds_tcp_incoming { + struct rds_incoming ti_inc; + struct sk_buff_head ti_skb_list; +}; + +struct rds_tcp_connection { + + struct list_head t_tcp_node; + struct rds_connection *conn; + struct socket *t_sock; + void *t_orig_write_space; + void *t_orig_data_ready; + void *t_orig_state_change; + + struct rds_tcp_incoming *t_tinc; + size_t t_tinc_hdr_rem; + size_t t_tinc_data_rem; + + /* XXX error report? */ + struct work_struct t_conn_w; + struct work_struct t_send_w; + struct work_struct t_down_w; + struct work_struct t_recv_w; + + /* for info exporting only */ + struct list_head t_list_item; + u32 t_last_sent_nxt; + u32 t_last_expected_una; + u32 t_last_seen_una; +}; + +struct rds_tcp_statistics { + uint64_t s_tcp_data_ready_calls; + uint64_t s_tcp_write_space_calls; + uint64_t s_tcp_sndbuf_full; + uint64_t s_tcp_connect_raced; + uint64_t s_tcp_listen_closed_stale; +}; + +/* tcp.c */ +void rds_tcp_tune(struct socket *sock); +void rds_tcp_nonagle(struct socket *sock); +void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); +void rds_tcp_restore_callbacks(struct socket *sock, + struct rds_tcp_connection *tc); +u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); +u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); +u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); +extern struct rds_transport rds_tcp_transport; + +/* tcp_connect.c */ +int rds_tcp_conn_connect(struct rds_connection *conn); +void rds_tcp_conn_shutdown(struct rds_connection *conn); +void rds_tcp_state_change(struct sock *sk); + +/* tcp_listen.c */ +int rds_tcp_listen_init(void); +void rds_tcp_listen_stop(void); +void rds_tcp_listen_data_ready(struct sock *sk); + +/* tcp_recv.c */ +int rds_tcp_recv_init(void); +void rds_tcp_recv_exit(void); +void rds_tcp_data_ready(struct sock *sk); +int rds_tcp_recv(struct rds_connection *conn); +void rds_tcp_inc_free(struct rds_incoming *inc); +int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, + size_t size); + +/* tcp_send.c */ +void rds_tcp_xmit_prepare(struct rds_connection *conn); +void rds_tcp_xmit_complete(struct rds_connection *conn); +int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_tcp_write_space(struct sock *sk); + +/* tcp_stats.c */ +DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); +#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member) +unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +#endif diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c new file mode 100644 index 00000000000..a65ee78db0c --- /dev/null +++ b/net/rds/tcp_connect.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <net/tcp.h> + +#include "rds.h" +#include "tcp.h" + +void rds_tcp_state_change(struct sock *sk) +{ + void (*state_change)(struct sock *sk); + struct rds_connection *conn; + struct rds_tcp_connection *tc; + + read_lock(&sk->sk_callback_lock); + conn = sk->sk_user_data; + if (!conn) { + state_change = sk->sk_state_change; + goto out; + } + tc = conn->c_transport_data; + state_change = tc->t_orig_state_change; + + rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); + + switch(sk->sk_state) { + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + rds_connect_complete(conn); + break; + case TCP_CLOSE: + rds_conn_drop(conn); + default: + break; + } +out: + read_unlock(&sk->sk_callback_lock); + state_change(sk); +} + +int rds_tcp_conn_connect(struct rds_connection *conn) +{ + struct socket *sock = NULL; + struct sockaddr_in src, dest; + int ret; + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) + goto out; + + rds_tcp_tune(sock); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); + if (ret) { + rdsdebug("bind failed with %d at address %pI4\n", + ret, &conn->c_laddr); + goto out; + } + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_TCP_PORT); + + /* + * once we call connect() we can start getting callbacks and they + * own the socket + */ + rds_tcp_set_callbacks(sock, conn); + ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), + O_NONBLOCK); + sock = NULL; + + rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); + if (ret == -EINPROGRESS) + ret = 0; + +out: + if (sock) + sock_release(sock); + return ret; +} + +/* + * Before killing the tcp socket this needs to serialize with callbacks. The + * caller has already grabbed the sending sem so we're serialized with other + * senders. + * + * TCP calls the callbacks with the sock lock so we hold it while we reset the + * callbacks to those set by TCP. Our callbacks won't execute again once we + * hold the sock lock. + */ +void rds_tcp_conn_shutdown(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + struct socket *sock = tc->t_sock; + + rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock); + + if (sock) { + sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); + lock_sock(sock->sk); + rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ + + release_sock(sock->sk); + sock_release(sock); + } + + if (tc->t_tinc) { + rds_inc_put(&tc->t_tinc->ti_inc); + tc->t_tinc = NULL; + } + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; +} diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c new file mode 100644 index 00000000000..23ab4dcd1d9 --- /dev/null +++ b/net/rds/tcp_listen.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/in.h> +#include <net/tcp.h> + +#include "rds.h" +#include "tcp.h" + +/* + * cheesy, but simple.. + */ +static void rds_tcp_accept_worker(struct work_struct *work); +static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); +static struct socket *rds_tcp_listen_sock; + +static int rds_tcp_accept_one(struct socket *sock) +{ + struct socket *new_sock = NULL; + struct rds_connection *conn; + int ret; + struct inet_sock *inet; + + ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, + sock->sk->sk_protocol, &new_sock); + if (ret) + goto out; + + new_sock->type = sock->type; + new_sock->ops = sock->ops; + ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + if (ret < 0) + goto out; + + rds_tcp_tune(new_sock); + + inet = inet_sk(new_sock->sk); + + rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", + &inet->inet_saddr, ntohs(inet->inet_sport), + &inet->inet_daddr, ntohs(inet->inet_dport)); + + conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr, + &rds_tcp_transport, GFP_KERNEL); + if (IS_ERR(conn)) { + ret = PTR_ERR(conn); + goto out; + } + + /* + * see the comment above rds_queue_delayed_reconnect() + */ + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) + rds_tcp_stats_inc(s_tcp_listen_closed_stale); + else + rds_tcp_stats_inc(s_tcp_connect_raced); + rds_conn_drop(conn); + ret = 0; + goto out; + } + + rds_tcp_set_callbacks(new_sock, conn); + rds_connect_complete(conn); + new_sock = NULL; + ret = 0; + +out: + if (new_sock) + sock_release(new_sock); + return ret; +} + +static void rds_tcp_accept_worker(struct work_struct *work) +{ + while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0) + cond_resched(); +} + +void rds_tcp_listen_data_ready(struct sock *sk) +{ + void (*ready)(struct sock *sk); + + rdsdebug("listen data ready sk %p\n", sk); + + read_lock(&sk->sk_callback_lock); + ready = sk->sk_user_data; + if (!ready) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + /* + * ->sk_data_ready is also called for a newly established child socket + * before it has been accepted and the accepter has set up their + * data_ready.. we only want to queue listen work for our listening + * socket + */ + if (sk->sk_state == TCP_LISTEN) + queue_work(rds_wq, &rds_tcp_listen_work); + +out: + read_unlock(&sk->sk_callback_lock); + ready(sk); +} + +int rds_tcp_listen_init(void) +{ + struct sockaddr_in sin; + struct socket *sock = NULL; + int ret; + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) + goto out; + + sock->sk->sk_reuse = SK_CAN_REUSE; + rds_tcp_nonagle(sock); + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = rds_tcp_listen_data_ready; + write_unlock_bh(&sock->sk->sk_callback_lock); + + sin.sin_family = PF_INET; + sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); + sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + + ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + if (ret < 0) + goto out; + + ret = sock->ops->listen(sock, 64); + if (ret < 0) + goto out; + + rds_tcp_listen_sock = sock; + sock = NULL; +out: + if (sock) + sock_release(sock); + return ret; +} + +void rds_tcp_listen_stop(void) +{ + struct socket *sock = rds_tcp_listen_sock; + struct sock *sk; + + if (!sock) + return; + + sk = sock->sk; + + /* serialize with and prevent further callbacks */ + lock_sock(sk); + write_lock_bh(&sk->sk_callback_lock); + if (sk->sk_user_data) { + sk->sk_data_ready = sk->sk_user_data; + sk->sk_user_data = NULL; + } + write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); + + /* wait for accepts to stop and close the socket */ + flush_workqueue(rds_wq); + sock_release(sock); + rds_tcp_listen_sock = NULL; +} diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c new file mode 100644 index 00000000000..9ae6e0a264e --- /dev/null +++ b/net/rds/tcp_recv.c @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <net/tcp.h> + +#include "rds.h" +#include "tcp.h" + +static struct kmem_cache *rds_tcp_incoming_slab; + +static void rds_tcp_inc_purge(struct rds_incoming *inc) +{ + struct rds_tcp_incoming *tinc; + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + rdsdebug("purging tinc %p inc %p\n", tinc, inc); + skb_queue_purge(&tinc->ti_skb_list); +} + +void rds_tcp_inc_free(struct rds_incoming *inc) +{ + struct rds_tcp_incoming *tinc; + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + rds_tcp_inc_purge(inc); + rdsdebug("freeing tinc %p inc %p\n", tinc, inc); + kmem_cache_free(rds_tcp_incoming_slab, tinc); +} + +/* + * this is pretty lame, but, whatever. + */ +int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, + size_t size) +{ + struct rds_tcp_incoming *tinc; + struct iovec *iov, tmp; + struct sk_buff *skb; + unsigned long to_copy, skb_off; + int ret = 0; + + if (size == 0) + goto out; + + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + iov = first_iov; + tmp = *iov; + + skb_queue_walk(&tinc->ti_skb_list, skb) { + skb_off = 0; + while (skb_off < skb->len) { + while (tmp.iov_len == 0) { + iov++; + tmp = *iov; + } + + to_copy = min(tmp.iov_len, size); + to_copy = min(to_copy, skb->len - skb_off); + + rdsdebug("ret %d size %zu skb %p skb_off %lu " + "skblen %d iov_base %p iov_len %zu cpy %lu\n", + ret, size, skb, skb_off, skb->len, + tmp.iov_base, tmp.iov_len, to_copy); + + /* modifies tmp as it copies */ + if (skb_copy_datagram_iovec(skb, skb_off, &tmp, + to_copy)) { + ret = -EFAULT; + goto out; + } + + rds_stats_add(s_copy_to_user, to_copy); + size -= to_copy; + ret += to_copy; + skb_off += to_copy; + if (size == 0) + goto out; + } + } +out: + return ret; +} + +/* + * We have a series of skbs that have fragmented pieces of the congestion + * bitmap. They must add up to the exact size of the congestion bitmap. We + * use the skb helpers to copy those into the pages that make up the in-memory + * congestion bitmap for the remote address of this connection. We then tell + * the congestion core that the bitmap has been changed so that it can wake up + * sleepers. + * + * This is racing with sending paths which are using test_bit to see if the + * bitmap indicates that their recipient is congested. + */ + +static void rds_tcp_cong_recv(struct rds_connection *conn, + struct rds_tcp_incoming *tinc) +{ + struct sk_buff *skb; + unsigned int to_copy, skb_off; + unsigned int map_off; + unsigned int map_page; + struct rds_cong_map *map; + int ret; + + /* catch completely corrupt packets */ + if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map_page = 0; + map_off = 0; + map = conn->c_fcong; + + skb_queue_walk(&tinc->ti_skb_list, skb) { + skb_off = 0; + while (skb_off < skb->len) { + to_copy = min_t(unsigned int, PAGE_SIZE - map_off, + skb->len - skb_off); + + BUG_ON(map_page >= RDS_CONG_MAP_PAGES); + + /* only returns 0 or -error */ + ret = skb_copy_bits(skb, skb_off, + (void *)map->m_page_addrs[map_page] + map_off, + to_copy); + BUG_ON(ret != 0); + + skb_off += to_copy; + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + } + } + + rds_cong_map_updated(map, ~(u64) 0); +} + +struct rds_tcp_desc_arg { + struct rds_connection *conn; + gfp_t gfp; +}; + +static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct rds_tcp_desc_arg *arg = desc->arg.data; + struct rds_connection *conn = arg->conn; + struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_incoming *tinc = tc->t_tinc; + struct sk_buff *clone; + size_t left = len, to_copy; + + rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset, + len); + + /* + * tcp_read_sock() interprets partial progress as an indication to stop + * processing. + */ + while (left) { + if (!tinc) { + tinc = kmem_cache_alloc(rds_tcp_incoming_slab, + arg->gfp); + if (!tinc) { + desc->error = -ENOMEM; + goto out; + } + tc->t_tinc = tinc; + rdsdebug("alloced tinc %p\n", tinc); + rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr); + /* + * XXX * we might be able to use the __ variants when + * we've already serialized at a higher level. + */ + skb_queue_head_init(&tinc->ti_skb_list); + } + + if (left && tc->t_tinc_hdr_rem) { + to_copy = min(tc->t_tinc_hdr_rem, left); + rdsdebug("copying %zu header from skb %p\n", to_copy, + skb); + skb_copy_bits(skb, offset, + (char *)&tinc->ti_inc.i_hdr + + sizeof(struct rds_header) - + tc->t_tinc_hdr_rem, + to_copy); + tc->t_tinc_hdr_rem -= to_copy; + left -= to_copy; + offset += to_copy; + + if (tc->t_tinc_hdr_rem == 0) { + /* could be 0 for a 0 len message */ + tc->t_tinc_data_rem = + be32_to_cpu(tinc->ti_inc.i_hdr.h_len); + } + } + + if (left && tc->t_tinc_data_rem) { + clone = skb_clone(skb, arg->gfp); + if (!clone) { + desc->error = -ENOMEM; + goto out; + } + + to_copy = min(tc->t_tinc_data_rem, left); + pskb_pull(clone, offset); + pskb_trim(clone, to_copy); + skb_queue_tail(&tinc->ti_skb_list, clone); + + rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " + "clone %p data %p len %d\n", + skb, skb->data, skb->len, offset, to_copy, + clone, clone->data, clone->len); + + tc->t_tinc_data_rem -= to_copy; + left -= to_copy; + offset += to_copy; + } + + if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { + if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_tcp_cong_recv(conn, tinc); + else + rds_recv_incoming(conn, conn->c_faddr, + conn->c_laddr, &tinc->ti_inc, + arg->gfp); + + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; + tc->t_tinc = NULL; + rds_inc_put(&tinc->ti_inc); + tinc = NULL; + } + } +out: + rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n", + len, left, skb->len, + skb_queue_len(&tc->t_sock->sk->sk_receive_queue)); + return len - left; +} + +/* the caller has to hold the sock lock */ +static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + struct socket *sock = tc->t_sock; + read_descriptor_t desc; + struct rds_tcp_desc_arg arg; + + /* It's like glib in the kernel! */ + arg.conn = conn; + arg.gfp = gfp; + desc.arg.data = &arg; + desc.error = 0; + desc.count = 1; /* give more than one skb per call */ + + tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); + rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, + desc.error); + + return desc.error; +} + +/* + * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from + * data_ready. + * + * if we fail to allocate we're in trouble.. blindly wait some time before + * trying again to see if the VM can free up something for us. + */ +int rds_tcp_recv(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + struct socket *sock = tc->t_sock; + int ret = 0; + + rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); + + lock_sock(sock->sk); + ret = rds_tcp_read_sock(conn, GFP_KERNEL); + release_sock(sock->sk); + + return ret; +} + +void rds_tcp_data_ready(struct sock *sk) +{ + void (*ready)(struct sock *sk); + struct rds_connection *conn; + struct rds_tcp_connection *tc; + + rdsdebug("data ready sk %p\n", sk); + + read_lock(&sk->sk_callback_lock); + conn = sk->sk_user_data; + if (!conn) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + tc = conn->c_transport_data; + ready = tc->t_orig_data_ready; + rds_tcp_stats_inc(s_tcp_data_ready_calls); + + if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM) + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +out: + read_unlock(&sk->sk_callback_lock); + ready(sk); +} + +int rds_tcp_recv_init(void) +{ + rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", + sizeof(struct rds_tcp_incoming), + 0, 0, NULL); + if (!rds_tcp_incoming_slab) + return -ENOMEM; + return 0; +} + +void rds_tcp_recv_exit(void) +{ + kmem_cache_destroy(rds_tcp_incoming_slab); +} diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c new file mode 100644 index 00000000000..53b17ca0dff --- /dev/null +++ b/net/rds/tcp_send.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <net/tcp.h> + +#include "rds.h" +#include "tcp.h" + +static void rds_tcp_cork(struct socket *sock, int val) +{ + mm_segment_t oldfs; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val, + sizeof(val)); + set_fs(oldfs); +} + +void rds_tcp_xmit_prepare(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + + rds_tcp_cork(tc->t_sock, 1); +} + +void rds_tcp_xmit_complete(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + + rds_tcp_cork(tc->t_sock, 0); +} + +/* the core send_sem serializes this with other xmit and shutdown */ +static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) +{ + struct kvec vec = { + .iov_base = data, + .iov_len = len, + }; + struct msghdr msg = { + .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL, + }; + + return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len); +} + +/* the core send_sem serializes this with other xmit and shutdown */ +int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + int done = 0; + int ret = 0; + + if (hdr_off == 0) { + /* + * m_ack_seq is set to the sequence number of the last byte of + * header and data. see rds_tcp_is_acked(). + */ + tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc); + rm->m_ack_seq = tc->t_last_sent_nxt + + sizeof(struct rds_header) + + be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; + smp_mb__before_atomic(); + set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); + tc->t_last_expected_una = rm->m_ack_seq + 1; + + rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", + rm, rds_tcp_snd_nxt(tc), + (unsigned long long)rm->m_ack_seq); + } + + if (hdr_off < sizeof(struct rds_header)) { + /* see rds_tcp_write_space() */ + set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags); + + ret = rds_tcp_sendmsg(tc->t_sock, + (void *)&rm->m_inc.i_hdr + hdr_off, + sizeof(rm->m_inc.i_hdr) - hdr_off); + if (ret < 0) + goto out; + done += ret; + if (hdr_off + done != sizeof(struct rds_header)) + goto out; + } + + while (sg < rm->data.op_nents) { + ret = tc->t_sock->ops->sendpage(tc->t_sock, + sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, + rm->data.op_sg[sg].length - off, + MSG_DONTWAIT|MSG_NOSIGNAL); + rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, + ret); + if (ret <= 0) + break; + + off += ret; + done += ret; + if (off == rm->data.op_sg[sg].length) { + off = 0; + sg++; + } + } + +out: + if (ret <= 0) { + /* write_space will hit after EAGAIN, all else fatal */ + if (ret == -EAGAIN) { + rds_tcp_stats_inc(s_tcp_sndbuf_full); + ret = 0; + } else { + printk(KERN_WARNING "RDS/tcp: send to %pI4 " + "returned %d, disconnecting and reconnecting\n", + &conn->c_faddr, ret); + rds_conn_drop(conn); + } + } + if (done == 0) + done = ret; + return done; +} + +/* + * rm->m_ack_seq is set to the tcp sequence number that corresponds to the + * last byte of the message, including the header. This means that the + * entire message has been received if rm->m_ack_seq is "before" the next + * unacked byte of the TCP sequence space. We have to do very careful + * wrapping 32bit comparisons here. + */ +static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) +{ + if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags)) + return 0; + return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0; +} + +void rds_tcp_write_space(struct sock *sk) +{ + void (*write_space)(struct sock *sk); + struct rds_connection *conn; + struct rds_tcp_connection *tc; + + read_lock(&sk->sk_callback_lock); + conn = sk->sk_user_data; + if (!conn) { + write_space = sk->sk_write_space; + goto out; + } + + tc = conn->c_transport_data; + rdsdebug("write_space for tc %p\n", tc); + write_space = tc->t_orig_write_space; + rds_tcp_stats_inc(s_tcp_write_space_calls); + + rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc)); + tc->t_last_seen_una = rds_tcp_snd_una(tc); + rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked); + + if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + +out: + read_unlock(&sk->sk_callback_lock); + + /* + * write_space is only called when data leaves tcp's send queue if + * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put + * data in tcp's send queue because we use write_space to parse the + * sequence numbers and notice that rds messages have been fully + * received. + * + * tcp's write_space clears SOCK_NOSPACE if the send queue has more + * than a certain amount of space. So we need to set it again *after* + * we call tcp's write_space or else we might only get called on the + * first of a series of incoming tcp acks. + */ + write_space(sk); + + if (sk->sk_socket) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +} diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c new file mode 100644 index 00000000000..f8a7954f1f5 --- /dev/null +++ b/net/rds/tcp_stats.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "rds.h" +#include "tcp.h" + +DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats) + ____cacheline_aligned; + +static const char * const rds_tcp_stat_names[] = { + "tcp_data_ready_calls", + "tcp_write_space_calls", + "tcp_sndbuf_full", + "tcp_connect_raced", + "tcp_listen_closed_stale", +}; + +unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_tcp_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_tcp_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names, + ARRAY_SIZE(rds_tcp_stat_names)); +out: + return ARRAY_SIZE(rds_tcp_stat_names); +} diff --git a/net/rds/threads.c b/net/rds/threads.c index 828a1bf9ea9..65eaefcab24 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -32,6 +32,7 @@ */ #include <linux/kernel.h> #include <linux/random.h> +#include <linux/export.h> #include "rds.h" @@ -61,13 +62,14 @@ * * Transition to state DISCONNECTING/DOWN: * - Inside the shutdown worker; synchronizes with xmit path - * through c_send_lock, and with connection management callbacks + * through RDS_IN_XMIT, and with connection management callbacks * via c_cm_lock. * * For receive callbacks, we rely on the underlying transport * (TCP, IB/RDMA) to provide the necessary synchronisation. */ struct workqueue_struct *rds_wq; +EXPORT_SYMBOL_GPL(rds_wq); void rds_connect_complete(struct rds_connection *conn) { @@ -89,6 +91,7 @@ void rds_connect_complete(struct rds_connection *conn) queue_delayed_work(rds_wq, &conn->c_send_w, 0); queue_delayed_work(rds_wq, &conn->c_recv_w, 0); } +EXPORT_SYMBOL_GPL(rds_connect_complete); /* * This random exponential backoff is relied on to eventually resolve racing @@ -108,7 +111,7 @@ void rds_connect_complete(struct rds_connection *conn) * We should *always* start with a random backoff; otherwise a broken connection * will always take several iterations to be re-established. */ -static void rds_queue_reconnect(struct rds_connection *conn) +void rds_queue_reconnect(struct rds_connection *conn) { unsigned long rand; @@ -154,58 +157,6 @@ void rds_connect_worker(struct work_struct *work) } } -void rds_shutdown_worker(struct work_struct *work) -{ - struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); - - /* shut it down unless it's down already */ - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { - /* - * Quiesce the connection mgmt handlers before we start tearing - * things down. We don't hold the mutex for the entire - * duration of the shutdown operation, else we may be - * deadlocking with the CM handler. Instead, the CM event - * handler is supposed to check for state DISCONNECTING - */ - mutex_lock(&conn->c_cm_lock); - if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) - && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { - rds_conn_error(conn, "shutdown called in state %d\n", - atomic_read(&conn->c_state)); - mutex_unlock(&conn->c_cm_lock); - return; - } - mutex_unlock(&conn->c_cm_lock); - - mutex_lock(&conn->c_send_lock); - conn->c_trans->conn_shutdown(conn); - rds_conn_reset(conn); - mutex_unlock(&conn->c_send_lock); - - if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { - /* This can happen - eg when we're in the middle of tearing - * down the connection, and someone unloads the rds module. - * Quite reproduceable with loopback connections. - * Mostly harmless. - */ - rds_conn_error(conn, - "%s: failed to transition to state DOWN, " - "current state is %d\n", - __func__, - atomic_read(&conn->c_state)); - return; - } - } - - /* Then reconnect if it's still live. - * The passive side of an IB loopback connection is never added - * to the conn hash, so we never trigger a reconnect on this - * conn - the reconnect is always triggered by the active peer. */ - cancel_delayed_work(&conn->c_conn_w); - if (!hlist_unhashed(&conn->c_hash_node)) - rds_queue_reconnect(conn); -} - void rds_send_worker(struct work_struct *work) { struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); @@ -250,15 +201,22 @@ void rds_recv_worker(struct work_struct *work) } } +void rds_shutdown_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); + + rds_conn_shutdown(conn); +} + void rds_threads_exit(void) { destroy_workqueue(rds_wq); } -int __init rds_threads_init(void) +int rds_threads_init(void) { rds_wq = create_singlethread_workqueue("krdsd"); - if (rds_wq == NULL) + if (!rds_wq) return -ENOMEM; return 0; diff --git a/net/rds/transport.c b/net/rds/transport.c index 767da61ad2f..7f2ac4fec36 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -37,7 +37,7 @@ #include "rds.h" #include "loop.h" -static LIST_HEAD(rds_transports); +static struct rds_transport *transports[RDS_TRANS_COUNT]; static DECLARE_RWSEM(rds_trans_sem); int rds_trans_register(struct rds_transport *trans) @@ -46,35 +46,52 @@ int rds_trans_register(struct rds_transport *trans) down_write(&rds_trans_sem); - list_add_tail(&trans->t_item, &rds_transports); - printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); + if (transports[trans->t_type]) + printk(KERN_ERR "RDS Transport type %d already registered\n", + trans->t_type); + else { + transports[trans->t_type] = trans; + printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); + } up_write(&rds_trans_sem); return 0; } +EXPORT_SYMBOL_GPL(rds_trans_register); void rds_trans_unregister(struct rds_transport *trans) { down_write(&rds_trans_sem); - list_del_init(&trans->t_item); + transports[trans->t_type] = NULL; printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name); up_write(&rds_trans_sem); } +EXPORT_SYMBOL_GPL(rds_trans_unregister); + +void rds_trans_put(struct rds_transport *trans) +{ + if (trans && trans->t_owner) + module_put(trans->t_owner); +} struct rds_transport *rds_trans_get_preferred(__be32 addr) { - struct rds_transport *trans; struct rds_transport *ret = NULL; + struct rds_transport *trans; + unsigned int i; if (IN_LOOPBACK(ntohl(addr))) return &rds_loop_transport; down_read(&rds_trans_sem); - list_for_each_entry(trans, &rds_transports, t_item) { - if (trans->laddr_check(addr) == 0) { + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + + if (trans && (trans->laddr_check(addr) == 0) && + (!trans->t_owner || try_module_get(trans->t_owner))) { ret = trans; break; } @@ -97,12 +114,15 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, struct rds_transport *trans; unsigned int total = 0; unsigned int part; + int i; rds_info_iter_unmap(iter); down_read(&rds_trans_sem); - list_for_each_entry(trans, &rds_transports, t_item) { - if (trans->stats_info_copy == NULL) + for (i = 0; i < RDS_TRANS_COUNT; i++) + { + trans = transports[i]; + if (!trans || !trans->stats_info_copy) continue; part = trans->stats_info_copy(iter, avail); |
