diff options
Diffstat (limited to 'net/rds')
43 files changed, 2791 insertions, 1897 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig index ec753b3ae72..f2c670ba7b9 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -1,7 +1,7 @@ config RDS - tristate "The RDS Protocol (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + tristate "The RDS Protocol" + depends on INET ---help--- The RDS (Reliable Datagram Sockets) protocol provides reliable, sequenced delivery of datagrams over Infiniband, iWARP, diff --git a/net/rds/Makefile b/net/rds/Makefile index b46eca10968..56d3f6023ce 100644 --- a/net/rds/Makefile +++ b/net/rds/Makefile @@ -4,7 +4,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ loop.o page.o rdma.o obj-$(CONFIG_RDS_RDMA) += rds_rdma.o -rds_rdma-objs := rdma_transport.o \ +rds_rdma-y := rdma_transport.o \ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ ib_sysctl.o ib_rdma.o \ iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ @@ -12,10 +12,8 @@ rds_rdma-objs := rdma_transport.o \ obj-$(CONFIG_RDS_TCP) += rds_tcp.o -rds_tcp-objs := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \ +rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \ tcp_send.o tcp_stats.o -ifeq ($(CONFIG_RDS_DEBUG), y) -EXTRA_CFLAGS += -DDEBUG -endif +ccflags-$(CONFIG_RDS_DEBUG) := -DDEBUG diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 853c52be781..424ff622ab5 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -33,12 +33,21 @@ #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> +#include <linux/gfp.h> #include <linux/in.h> #include <linux/poll.h> #include <net/sock.h> #include "rds.h" -#include "rdma.h" + +char *rds_str_array(char **array, size_t elements, size_t index) +{ + if ((index < elements) && array[index]) + return array[index]; + else + return "unknown"; +} +EXPORT_SYMBOL(rds_str_array); /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); @@ -59,9 +68,8 @@ static int rds_release(struct socket *sock) { struct sock *sk = sock->sk; struct rds_sock *rs; - unsigned 
long flags; - if (sk == NULL) + if (!sk) goto out; rs = rds_sk_to_rs(sk); @@ -72,15 +80,25 @@ static int rds_release(struct socket *sock) * with the socket. */ rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); + + /* + * the binding lookup hash uses rcu, we need to + * make sure we sychronize_rcu before we free our + * entry + */ rds_remove_bound(rs); + synchronize_rcu(); + rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); rds_sock_count--; - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); + + rds_trans_put(rs->rs_transport); sock->sk = NULL; sock_put(sk); @@ -157,9 +175,10 @@ static unsigned int rds_poll(struct file *file, struct socket *sock, unsigned int mask = 0; unsigned long flags; - poll_wait(file, sk->sk_sleep, wait); + poll_wait(file, sk_sleep(sk), wait); - poll_wait(file, &rds_poll_waitq, wait); + if (rs->rs_seen_congestion) + poll_wait(file, &rds_poll_waitq, wait); read_lock_irqsave(&rs->rs_recv_lock, flags); if (!rs->rs_cong_monitor) { @@ -181,6 +200,10 @@ static unsigned int rds_poll(struct file *file, struct socket *sock, mask |= (POLLOUT | POLLWRNORM); read_unlock_irqrestore(&rs->rs_recv_lock, flags); + /* clear state any time we wake a seen-congested socket */ + if (mask) + rs->rs_seen_congestion = 0; + return mask; } @@ -385,7 +408,6 @@ static const struct proto_ops rds_proto_ops = { static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { - unsigned long flags; struct rds_sock *rs; sock_init_data(sock, sk); @@ -402,10 +424,10 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); rds_sock_count++; - spin_unlock_irqrestore(&rds_sock_lock, flags); + 
spin_unlock_bh(&rds_sock_lock); return 0; } @@ -446,17 +468,14 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, struct rds_info_lengths *lens) { struct rds_sock *rs; - struct sock *sk; struct rds_incoming *inc; - unsigned long flags; unsigned int total = 0; len /= sizeof(struct rds_info_message); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_for_each_entry(rs, &rds_sock_list, rs_item) { - sk = rds_rs_to_sk(rs); read_lock(&rs->rs_recv_lock); /* XXX too lazy to maintain counts.. */ @@ -470,7 +489,7 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, read_unlock(&rs->rs_recv_lock); } - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); lens->nr = total; lens->each = sizeof(struct rds_info_message); @@ -482,11 +501,10 @@ static void rds_sock_info(struct socket *sock, unsigned int len, { struct rds_info_socket sinfo; struct rds_sock *rs; - unsigned long flags; len /= sizeof(struct rds_info_socket); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); if (len < rds_sock_count) goto out; @@ -507,10 +525,10 @@ out: lens->nr = rds_sock_count; lens->each = sizeof(struct rds_info_socket); - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); } -static void __exit rds_exit(void) +static void rds_exit(void) { sock_unregister(rds_family_ops.family); proto_unregister(&rds_proto); @@ -525,7 +543,7 @@ static void __exit rds_exit(void) } module_exit(rds_exit); -static int __init rds_init(void) +static int rds_init(void) { int ret; diff --git a/net/rds/bind.c b/net/rds/bind.c index 5d95fc007f1..a2e6562da75 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -34,45 +34,52 @@ #include <net/sock.h> #include <linux/in.h> #include <linux/if_arp.h> +#include <linux/jhash.h> +#include <linux/ratelimit.h> #include "rds.h" -/* - * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't - * particularly zippy. 
- * - * This is now called for every incoming frame so we arguably care much more - * about it than we used to. - */ +#define BIND_HASH_SIZE 1024 +static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; static DEFINE_SPINLOCK(rds_bind_lock); -static struct rb_root rds_bind_tree = RB_ROOT; -static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, - struct rds_sock *insert) +static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) +{ + return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & + (BIND_HASH_SIZE - 1)); +} + +static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, + struct rds_sock *insert) { - struct rb_node **p = &rds_bind_tree.rb_node; - struct rb_node *parent = NULL; struct rds_sock *rs; + struct hlist_head *head = hash_to_bucket(addr, port); u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); - while (*p) { - parent = *p; - rs = rb_entry(parent, struct rds_sock, rs_bound_node); - + rcu_read_lock(); + hlist_for_each_entry_rcu(rs, head, rs_bound_node) { cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); - if (needle < cmp) - p = &(*p)->rb_left; - else if (needle > cmp) - p = &(*p)->rb_right; - else + if (cmp == needle) { + rcu_read_unlock(); return rs; + } } + rcu_read_unlock(); if (insert) { - rb_link_node(&insert->rs_bound_node, parent, p); - rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); + /* + * make sure our addr and port are set before + * we are added to the list, other people + * in rcu will find us as soon as the + * hlist_add_head_rcu is done + */ + insert->rs_bound_addr = addr; + insert->rs_bound_port = port; + rds_sock_addref(insert); + + hlist_add_head_rcu(&insert->rs_bound_node, head); } return NULL; } @@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { struct rds_sock *rs; - unsigned long flags; - spin_lock_irqsave(&rds_bind_lock, 
flags); - rs = rds_bind_tree_walk(addr, port, NULL); + rs = rds_bind_lookup(addr, port, NULL); + if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); else rs = NULL; - spin_unlock_irqrestore(&rds_bind_lock, flags); rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); @@ -112,7 +117,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) rover = be16_to_cpu(*port); last = rover; } else { - rover = max_t(u16, net_random(), 2); + rover = max_t(u16, prandom_u32(), 2); last = rover - 1; } @@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) do { if (rover == 0) rover++; - if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { - *port = cpu_to_be16(rover); + if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + *port = rs->rs_bound_port; ret = 0; + rdsdebug("rs %p binding to %pI4:%d\n", + rs, &addr, (int)ntohs(*port)); break; } } while (rover++ != last); - if (ret == 0) { - rs->rs_bound_addr = addr; - rs->rs_bound_port = *port; - rds_sock_addref(rs); - - rdsdebug("rs %p binding to %pI4:%d\n", - rs, &addr, (int)ntohs(*port)); - } - spin_unlock_irqrestore(&rds_bind_lock, flags); return ret; @@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs) rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); - rb_erase(&rs->rs_bound_node, &rds_bind_tree); + hlist_del_init_rcu(&rs->rs_bound_node); rds_sock_put(rs); rs->rs_bound_addr = 0; } @@ -184,11 +182,10 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; trans = rds_trans_get_preferred(sin->sin_addr.s_addr); - if (trans == NULL) { + if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); - if (printk_ratelimit()) - printk(KERN_INFO "RDS: rds_bind() could not find a transport, " + printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, " "load rds_tcp or rds_rdma?\n"); goto out; } @@ -198,5 +195,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int 
addr_len) out: release_sock(sk); + + /* we might have called rds_remove_bound on error */ + if (ret) + synchronize_rcu(); return ret; } diff --git a/net/rds/cong.c b/net/rds/cong.c index 6d06cac2649..e5b65acd650 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -30,10 +30,11 @@ * SOFTWARE. * */ +#include <linux/slab.h> #include <linux/types.h> #include <linux/rbtree.h> - -#include <asm-generic/bitops/le.h> +#include <linux/bitops.h> +#include <linux/export.h> #include "rds.h" @@ -140,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) unsigned long flags; map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); - if (map == NULL) + if (!map) return NULL; map->m_addr = addr; @@ -158,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) ret = rds_cong_tree_walk(addr, map); spin_unlock_irqrestore(&rds_cong_lock, flags); - if (ret == NULL) { + if (!ret) { ret = map; map = NULL; } @@ -204,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn) conn->c_lcong = rds_cong_from_addr(conn->c_laddr); conn->c_fcong = rds_cong_from_addr(conn->c_faddr); - if (conn->c_lcong == NULL || conn->c_fcong == NULL) + if (!(conn->c_lcong && conn->c_fcong)) return -ENOMEM; return 0; @@ -218,11 +219,9 @@ void rds_cong_queue_updates(struct rds_cong_map *map) spin_lock_irqsave(&rds_cong_lock, flags); list_for_each_entry(conn, &map->m_conn_list, c_map_item) { - if (conn->c_loopback) - continue; if (!test_and_set_bit(0, &conn->c_map_queued)) { rds_stats_inc(s_cong_update_queued); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + rds_send_xmit(conn); } } @@ -286,7 +285,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - generic___set_le_bit(off, (void *)map->m_page_addrs[i]); + __set_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) @@ -300,7 +299,7 @@ void 
rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - generic___clear_le_bit(off, (void *)map->m_page_addrs[i]); + __clear_bit_le(off, (void *)map->m_page_addrs[i]); } static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) @@ -311,7 +310,7 @@ static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - return generic_test_le_bit(off, (void *)map->m_page_addrs[i]); + return test_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_add_socket(struct rds_sock *rs) diff --git a/net/rds/connection.c b/net/rds/connection.c index 278f607ab60..378c3a6acf8 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -32,11 +32,12 @@ */ #include <linux/kernel.h> #include <linux/list.h> +#include <linux/slab.h> +#include <linux/export.h> #include <net/inet_hashtables.h> #include "rds.h" #include "loop.h" -#include "rdma.h" #define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) @@ -50,10 +51,16 @@ static struct kmem_cache *rds_conn_slab; static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) { + static u32 rds_hash_secret __read_mostly; + + unsigned long hash; + + net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); + /* Pass NULL, don't need struct net for hash */ - unsigned long hash = inet_ehashfn(NULL, - be32_to_cpu(laddr), 0, - be32_to_cpu(faddr), 0); + hash = __inet_ehashfn(be32_to_cpu(laddr), 0, + be32_to_cpu(faddr), 0, + rds_hash_secret); return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; } @@ -62,26 +69,14 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ } while (0) -static inline int rds_conn_is_sending(struct rds_connection *conn) -{ - int ret = 0; - - if (!mutex_trylock(&conn->c_send_lock)) 
- ret = 1; - else - mutex_unlock(&conn->c_send_lock); - - return ret; -} - +/* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, __be32 laddr, __be32 faddr, struct rds_transport *trans) { struct rds_connection *conn, *ret = NULL; - struct hlist_node *pos; - hlist_for_each_entry(conn, pos, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (conn->c_faddr == faddr && conn->c_laddr == laddr && conn->c_trans == trans) { ret = conn; @@ -99,7 +94,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, * and receiving over this connection again in the future. It is up to * the transport to have serialized this call with its send and recv. */ -void rds_conn_reset(struct rds_connection *conn) +static void rds_conn_reset(struct rds_connection *conn) { rdsdebug("connection %pI4 to %pI4 reset\n", &conn->c_laddr, &conn->c_faddr); @@ -128,10 +123,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, { struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); + struct rds_transport *loop_trans; unsigned long flags; int ret; - spin_lock_irqsave(&rds_conn_lock, flags); + rcu_read_lock(); conn = rds_conn_lookup(head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && !is_outgoing) { @@ -142,12 +138,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, parent = conn; conn = parent->c_passive; } - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); if (conn) goto out; conn = kmem_cache_zalloc(rds_conn_slab, gfp); - if (conn == NULL) { + if (!conn) { conn = ERR_PTR(-ENOMEM); goto out; } @@ -158,7 +154,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, spin_lock_init(&conn->c_lock); conn->c_next_tx_seq = 1; - mutex_init(&conn->c_send_lock); + init_waitqueue_head(&conn->c_waitq); 
INIT_LIST_HEAD(&conn->c_send_queue); INIT_LIST_HEAD(&conn->c_retrans); @@ -174,7 +170,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ - if (rds_trans_get_preferred(faddr)) { + loop_trans = rds_trans_get_preferred(faddr); + if (loop_trans) { + rds_trans_put(loop_trans); conn->c_loopback = 1; if (is_outgoing && trans->t_prefer_loopback) { /* "outgoing" connection - and the transport @@ -237,7 +235,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, kmem_cache_free(rds_conn_slab, conn); conn = found; } else { - hlist_add_head(&conn->c_hash_node, head); + hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; } @@ -262,21 +260,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); +void rds_conn_shutdown(struct rds_connection *conn) +{ + /* shut it down unless it's down already */ + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { + /* + * Quiesce the connection mgmt handlers before we start tearing + * things down. We don't hold the mutex for the entire + * duration of the shutdown operation, else we may be + * deadlocking with the CM handler. 
Instead, the CM event + * handler is supposed to check for state DISCONNECTING + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) + && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { + rds_conn_error(conn, "shutdown called in state %d\n", + atomic_read(&conn->c_state)); + mutex_unlock(&conn->c_cm_lock); + return; + } + mutex_unlock(&conn->c_cm_lock); + + wait_event(conn->c_waitq, + !test_bit(RDS_IN_XMIT, &conn->c_flags)); + + conn->c_trans->conn_shutdown(conn); + rds_conn_reset(conn); + + if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { + /* This can happen - eg when we're in the middle of tearing + * down the connection, and someone unloads the rds module. + * Quite reproduceable with loopback connections. + * Mostly harmless. + */ + rds_conn_error(conn, + "%s: failed to transition to state DOWN, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + return; + } + } + + /* Then reconnect if it's still live. + * The passive side of an IB loopback connection is never added + * to the conn hash, so we never trigger a reconnect on this + * conn - the reconnect is always triggered by the active peer. */ + cancel_delayed_work_sync(&conn->c_conn_w); + rcu_read_lock(); + if (!hlist_unhashed(&conn->c_hash_node)) { + rcu_read_unlock(); + rds_queue_reconnect(conn); + } else { + rcu_read_unlock(); + } +} + +/* + * Stop and free a connection. + * + * This can only be used in very limited circumstances. It assumes that once + * the conn has been shutdown that no one else is referencing the connection. + * We can only ensure this in the rmmod path in the current code. 
+ */ void rds_conn_destroy(struct rds_connection *conn) { struct rds_message *rm, *rtmp; + unsigned long flags; rdsdebug("freeing conn %p for %pI4 -> " "%pI4\n", conn, &conn->c_laddr, &conn->c_faddr); - hlist_del_init(&conn->c_hash_node); + /* Ensure conn will not be scheduled for reconnect */ + spin_lock_irq(&rds_conn_lock); + hlist_del_init_rcu(&conn->c_hash_node); + spin_unlock_irq(&rds_conn_lock); + synchronize_rcu(); - /* wait for the rds thread to shut it down */ - atomic_set(&conn->c_state, RDS_CONN_ERROR); - cancel_delayed_work(&conn->c_conn_w); - queue_work(rds_wq, &conn->c_down_w); - flush_workqueue(rds_wq); + /* shut the connection down */ + rds_conn_drop(conn); + flush_work(&conn->c_down_w); + + /* make sure lingering queued work won't try to ref the conn */ + cancel_delayed_work_sync(&conn->c_send_w); + cancel_delayed_work_sync(&conn->c_recv_w); /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, @@ -301,7 +369,9 @@ void rds_conn_destroy(struct rds_connection *conn) BUG_ON(!list_empty(&conn->c_retrans)); kmem_cache_free(rds_conn_slab, conn); + spin_lock_irqsave(&rds_conn_lock, flags); rds_conn_count--; + spin_unlock_irqrestore(&rds_conn_lock, flags); } EXPORT_SYMBOL_GPL(rds_conn_destroy); @@ -311,27 +381,26 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, int want_send) { struct hlist_head *head; - struct hlist_node *pos; struct list_head *list; struct rds_connection *conn; struct rds_message *rm; - unsigned long flags; unsigned int total = 0; + unsigned long flags; size_t i; len /= sizeof(struct rds_info_message); - spin_lock_irqsave(&rds_conn_lock, flags); + rcu_read_lock(); for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { - hlist_for_each_entry(conn, pos, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (want_send) list = &conn->c_send_queue; else list = &conn->c_retrans; - spin_lock(&conn->c_lock); + spin_lock_irqsave(&conn->c_lock, flags); /* XXX 
too lazy to maintain counts.. */ list_for_each_entry(rm, list, m_conn_item) { @@ -342,11 +411,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, conn->c_faddr, 0); } - spin_unlock(&conn->c_lock); + spin_unlock_irqrestore(&conn->c_lock, flags); } } - - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); lens->nr = total; lens->each = sizeof(struct rds_info_message); @@ -375,20 +443,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, { uint64_t buffer[(item_len + 7) / 8]; struct hlist_head *head; - struct hlist_node *pos; - struct hlist_node *tmp; struct rds_connection *conn; - unsigned long flags; size_t i; - spin_lock_irqsave(&rds_conn_lock, flags); + rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { - hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { /* XXX no c_lock usage.. 
*/ if (!visitor(conn, buffer)) @@ -404,8 +469,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, lens->nr++; } } - - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); @@ -422,8 +486,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn, sizeof(cinfo->transport)); cinfo->flags = 0; - rds_conn_info_set(cinfo->flags, - rds_conn_is_sending(conn), SENDING); + rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags), + SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, @@ -443,12 +507,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_connection)); } -int __init rds_conn_init(void) +int rds_conn_init(void) { rds_conn_slab = kmem_cache_create("rds_connection", sizeof(struct rds_connection), 0, 0, NULL); - if (rds_conn_slab == NULL) + if (!rds_conn_slab) return -ENOMEM; rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); @@ -486,6 +550,18 @@ void rds_conn_drop(struct rds_connection *conn) EXPORT_SYMBOL_GPL(rds_conn_drop); /* + * If the connection is down, trigger a connect. We may have scheduled a + * delayed reconnect however - in this case we should not interfere. 
+ */ +void rds_conn_connect_if_down(struct rds_connection *conn) +{ + if (rds_conn_state(conn) == RDS_CONN_DOWN && + !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); +} +EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); + +/* * An error occurred on the connection */ void diff --git a/net/rds/ib.c b/net/rds/ib.c index 3b899236104..ba2dffeff60 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -37,11 +37,13 @@ #include <linux/inetdevice.h> #include <linux/if_arp.h> #include <linux/delay.h> +#include <linux/slab.h> +#include <linux/module.h> #include "rds.h" #include "ib.h" -unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; +static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; @@ -52,13 +54,72 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); +/* + * we have a clumsy combination of RCU and a rwsem protecting this list + * because it is used both in the get_mr fast path and while blocking in + * the FMR flushing path. 
+ */ +DECLARE_RWSEM(rds_ib_devices_lock); struct list_head rds_ib_devices; /* NOTE: if also grabbing ibdev lock, grab this first */ DEFINE_SPINLOCK(ib_nodev_conns_lock); LIST_HEAD(ib_nodev_conns); -void rds_ib_add_one(struct ib_device *device) +static void rds_ib_nodev_connect(void) +{ + struct rds_ib_connection *ic; + + spin_lock(&ib_nodev_conns_lock); + list_for_each_entry(ic, &ib_nodev_conns, ib_node) + rds_conn_connect_if_down(ic->conn); + spin_unlock(&ib_nodev_conns_lock); +} + +static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_connection *ic; + unsigned long flags; + + spin_lock_irqsave(&rds_ibdev->spinlock, flags); + list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) + rds_conn_drop(ic->conn); + spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); +} + +/* + * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references + * from interrupt context so we push freing off into a work struct in krdsd. + */ +static void rds_ib_dev_free(struct work_struct *work) +{ + struct rds_ib_ipaddr *i_ipaddr, *i_next; + struct rds_ib_device *rds_ibdev = container_of(work, + struct rds_ib_device, free_work); + + if (rds_ibdev->mr_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + if (rds_ibdev->mr) + ib_dereg_mr(rds_ibdev->mr); + if (rds_ibdev->pd) + ib_dealloc_pd(rds_ibdev->pd); + + list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + } + + kfree(rds_ibdev); +} + +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) +{ + BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); + if (atomic_dec_and_test(&rds_ibdev->refcount)) + queue_work(rds_wq, &rds_ibdev->free_work); +} + +static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; struct ib_device_attr *dev_attr; @@ -76,11 +137,14 @@ void rds_ib_add_one(struct ib_device *device) goto free_attr; } - rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); + rds_ibdev = 
kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, + ibdev_to_node(device)); if (!rds_ibdev) goto free_attr; spin_lock_init(&rds_ibdev->spinlock); + atomic_set(&rds_ibdev->refcount, 1); + INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); rds_ibdev->max_wrs = dev_attr->max_qp_wr; rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); @@ -90,68 +154,107 @@ void rds_ib_add_one(struct ib_device *device) min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : fmr_pool_size; + rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device); - if (IS_ERR(rds_ibdev->pd)) - goto free_dev; + if (IS_ERR(rds_ibdev->pd)) { + rds_ibdev->pd = NULL; + goto put_dev; + } - rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rds_ibdev->mr)) - goto err_pd; + rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_ibdev->mr)) { + rds_ibdev->mr = NULL; + goto put_dev; + } rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); if (IS_ERR(rds_ibdev->mr_pool)) { rds_ibdev->mr_pool = NULL; - goto err_mr; + goto put_dev; } INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); - list_add_tail(&rds_ibdev->list, &rds_ib_devices); + + down_write(&rds_ib_devices_lock); + list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); + up_write(&rds_ib_devices_lock); + atomic_inc(&rds_ibdev->refcount); ib_set_client_data(device, &rds_ib_client, rds_ibdev); + atomic_inc(&rds_ibdev->refcount); - goto free_attr; + rds_ib_nodev_connect(); -err_mr: - ib_dereg_mr(rds_ibdev->mr); -err_pd: - ib_dealloc_pd(rds_ibdev->pd); -free_dev: - kfree(rds_ibdev); +put_dev: + rds_ib_dev_put(rds_ibdev); free_attr: kfree(dev_attr); } -void rds_ib_remove_one(struct ib_device *device) +/* + * New connections use this to find the device to associate with the + * connection. 
It's not in the fast path so we're not concerned about the + * performance of the IB call. (As of this writing, it uses an interrupt + * blocking spinlock to serialize walking a per-device list of all registered + * clients.) + * + * RCU is used to handle incoming connections racing with device teardown. + * Rather than use a lock to serialize removal from the client_data and + * getting a new reference, we use an RCU grace period. The destruction + * path removes the device from client_data and then waits for all RCU + * readers to finish. + * + * A new connection can get NULL from this if its arriving on a + * device that is in the process of being removed. + */ +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) { struct rds_ib_device *rds_ibdev; - struct rds_ib_ipaddr *i_ipaddr, *i_next; + rcu_read_lock(); rds_ibdev = ib_get_client_data(device, &rds_ib_client); - if (!rds_ibdev) - return; + if (rds_ibdev) + atomic_inc(&rds_ibdev->refcount); + rcu_read_unlock(); + return rds_ibdev; +} - list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { - list_del(&i_ipaddr->list); - kfree(i_ipaddr); - } +/* + * The IB stack is letting us know that a device is going away. This can + * happen if the underlying HCA driver is removed or if PCI hotplug is removing + * the pci function, for example. + * + * This can be called at any time and can be racing with any other RDS path. + */ +static void rds_ib_remove_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; - rds_ib_destroy_conns(rds_ibdev); + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (!rds_ibdev) + return; - if (rds_ibdev->mr_pool) - rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + rds_ib_dev_shutdown(rds_ibdev); - ib_dereg_mr(rds_ibdev->mr); + /* stop connection attempts from getting a reference to this device. 
*/ + ib_set_client_data(device, &rds_ib_client, NULL); - while (ib_dealloc_pd(rds_ibdev->pd)) { - rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); - msleep(1); - } + down_write(&rds_ib_devices_lock); + list_del_rcu(&rds_ibdev->list); + up_write(&rds_ib_devices_lock); - list_del(&rds_ibdev->list); - kfree(rds_ibdev); + /* + * This synchronize rcu is waiting for readers of both the ib + * client data and the devices list to finish before we drop + * both of those references. + */ + synchronize_rcu(); + rds_ib_dev_put(rds_ibdev); + rds_ib_dev_put(rds_ibdev); } struct ib_client rds_ib_client = { @@ -185,7 +288,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + rds_ibdev = ic->rds_ibdev; iinfo->max_send_wr = ic->i_send_ring.w_nr; iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; @@ -223,7 +326,7 @@ static int rds_ib_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); @@ -235,7 +338,8 @@ static int rds_ib_laddr_check(__be32 addr) ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); /* due to this, we will claim to support iWARP devices unless we check node_type. 
*/ - if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) + if (ret || !cm_id->device || + cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; rdsdebug("addr %pI4 ret %d node type %d\n", @@ -247,11 +351,18 @@ static int rds_ib_laddr_check(__be32 addr) return ret; } +static void rds_ib_unregister_client(void) +{ + ib_unregister_client(&rds_ib_client); + /* wait for rds_ib_dev_free() to complete */ + flush_workqueue(rds_wq); +} + void rds_ib_exit(void) { rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_ib_unregister_client(); rds_ib_destroy_nodev_conns(); - ib_unregister_client(&rds_ib_client); rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); @@ -261,15 +372,14 @@ struct rds_transport rds_ib_transport = { .laddr_check = rds_ib_laddr_check, .xmit_complete = rds_ib_xmit_complete, .xmit = rds_ib_xmit, - .xmit_cong_map = NULL, .xmit_rdma = rds_ib_xmit_rdma, + .xmit_atomic = rds_ib_xmit_atomic, .recv = rds_ib_recv, .conn_alloc = rds_ib_conn_alloc, .conn_free = rds_ib_conn_free, .conn_connect = rds_ib_conn_connect, .conn_shutdown = rds_ib_conn_shutdown, .inc_copy_to_user = rds_ib_inc_copy_to_user, - .inc_purge = rds_ib_inc_purge, .inc_free = rds_ib_inc_free, .cm_initiate_connect = rds_ib_cm_initiate_connect, .cm_handle_connect = rds_ib_cm_handle_connect, @@ -285,7 +395,7 @@ struct rds_transport rds_ib_transport = { .t_type = RDS_TRANS_IB }; -int __init rds_ib_init(void) +int rds_ib_init(void) { int ret; @@ -316,7 +426,7 @@ out_recv: out_sysctl: rds_ib_sysctl_exit(); out_ibreg: - ib_unregister_client(&rds_ib_client); + rds_ib_unregister_client(); out: return ret; } diff --git a/net/rds/ib.h b/net/rds/ib.h index 64df4e79b29..7280ab8810c 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -3,11 +3,14 @@ #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> +#include <linux/interrupt.h> +#include <linux/pci.h> +#include <linux/slab.h> #include "rds.h" #include "rdma_transport.h" #define RDS_FMR_SIZE 256 
-#define RDS_FMR_POOL_SIZE 4096 +#define RDS_FMR_POOL_SIZE 8192 #define RDS_IB_MAX_SGE 8 #define RDS_IB_RECV_SGE 2 @@ -19,6 +22,9 @@ #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ +#define RDS_IB_RECYCLE_BATCH_COUNT 32 + +extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; /* @@ -26,20 +32,29 @@ extern struct list_head rds_ib_devices; * try and minimize the amount of memory tied up both the device and * socket receive queues. */ -/* page offset of the final full frag that fits in the page */ -#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) struct rds_page_frag { struct list_head f_item; - struct page *f_page; - unsigned long f_offset; - dma_addr_t f_mapped; + struct list_head f_cache_entry; + struct scatterlist f_sg; }; struct rds_ib_incoming { struct list_head ii_frags; + struct list_head ii_cache_entry; struct rds_incoming ii_inc; }; +struct rds_ib_cache_head { + struct list_head *first; + unsigned long count; +}; + +struct rds_ib_refill_cache { + struct rds_ib_cache_head __percpu *percpu; + struct list_head *xfer; + struct list_head *ready; +}; + struct rds_ib_connect_private { /* Add new fields at the end, and don't permute existing fields. 
*/ __be32 dp_saddr; @@ -53,8 +68,7 @@ struct rds_ib_connect_private { }; struct rds_ib_send_work { - struct rds_message *s_rm; - struct rds_rdma_op *s_op; + void *s_op; struct ib_send_wr s_wr; struct ib_sge s_sge[RDS_IB_MAX_SGE]; unsigned long s_queued; @@ -92,10 +106,11 @@ struct rds_ib_connection { /* tx */ struct rds_ib_work_ring i_send_ring; - struct rds_message *i_rm; + struct rm_data_op *i_data_op; struct rds_header *i_send_hdrs; u64 i_send_hdrs_dma; struct rds_ib_send_work *i_sends; + atomic_t i_signaled_sends; /* rx */ struct tasklet_struct i_recv_tasklet; @@ -106,8 +121,9 @@ struct rds_ib_connection { struct rds_header *i_recv_hdrs; u64 i_recv_hdrs_dma; struct rds_ib_recv_work *i_recvs; - struct rds_page_frag i_frag; u64 i_ack_recv; /* last ACK received */ + struct rds_ib_refill_cache i_cache_incs; + struct rds_ib_refill_cache i_cache_frags; /* sending acks */ unsigned long i_ack_flags; @@ -138,7 +154,6 @@ struct rds_ib_connection { /* Batched completions */ unsigned int i_unsignaled_wrs; - long i_unsignaled_bytes; }; /* This assumes that atomic_t is at least 32 bits */ @@ -164,9 +179,16 @@ struct rds_ib_device { unsigned int max_fmrs; int max_sge; unsigned int max_wrs; + unsigned int max_initiator_depth; + unsigned int max_responder_resources; spinlock_t spinlock; /* protect the above */ + atomic_t refcount; + struct work_struct free_work; }; +#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device) +#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) + /* bits for i_ack_flags */ #define IB_ACK_IN_FLIGHT 0 #define IB_ACK_REQUESTED 1 @@ -202,6 +224,8 @@ struct rds_ib_statistics { uint64_t s_ib_rdma_mr_pool_flush; uint64_t s_ib_rdma_mr_pool_wait; uint64_t s_ib_rdma_mr_pool_depleted; + uint64_t s_ib_atomic_cswp; + uint64_t s_ib_atomic_fadd; }; extern struct workqueue_struct *rds_ib_wq; @@ -241,11 +265,10 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, /* ib.c */ extern struct rds_transport rds_ib_transport; -extern 
void rds_ib_add_one(struct ib_device *device); -extern void rds_ib_remove_one(struct ib_device *device); +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); extern struct ib_client rds_ib_client; -extern unsigned int fmr_pool_size; extern unsigned int fmr_message_size; extern unsigned int rds_ib_retry_count; @@ -258,7 +281,7 @@ void rds_ib_conn_free(void *arg); int rds_ib_conn_connect(struct rds_connection *conn); void rds_ib_conn_shutdown(struct rds_connection *conn); void rds_ib_state_change(struct sock *sk); -int __init rds_ib_listen_init(void); +int rds_ib_listen_init(void); void rds_ib_listen_stop(void); void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, @@ -275,15 +298,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); -void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock); -static inline void rds_ib_destroy_nodev_conns(void) -{ - __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock); -} -static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev) -{ - __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock); -} +void rds_ib_destroy_nodev_conns(void); struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); @@ -294,12 +309,12 @@ void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); /* ib_recv.c */ -int __init rds_ib_recv_init(void); +int rds_ib_recv_init(void); void rds_ib_recv_exit(void); int 
rds_ib_recv(struct rds_connection *conn); -int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill); -void rds_ib_inc_purge(struct rds_incoming *inc); +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); +void rds_ib_recv_free_caches(struct rds_ib_connection *ic); +void rds_ib_recv_refill(struct rds_connection *conn, int prefill); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); @@ -325,17 +340,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); extern wait_queue_head_t rds_ib_ring_empty_wait; /* ib_send.c */ +char *rds_ib_wc_status_str(enum ib_wc_status status); void rds_ib_xmit_complete(struct rds_connection *conn); int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); void rds_ib_send_init_ring(struct rds_ib_connection *ic); void rds_ib_send_clear_ring(struct rds_ib_connection *ic); -int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, u32 *adv_credits, int need_posted, int max_posted); +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); @@ -344,7 +361,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); /* ib_sysctl.c */ -int __init rds_ib_sysctl_init(void); +int rds_ib_sysctl_init(void); void rds_ib_sysctl_exit(void); extern unsigned long rds_ib_sysctl_max_send_wr; extern unsigned long 
rds_ib_sysctl_max_recv_wr; @@ -352,30 +369,5 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs; extern unsigned long rds_ib_sysctl_max_unsig_bytes; extern unsigned long rds_ib_sysctl_max_recv_allocation; extern unsigned int rds_ib_sysctl_flow_control; -extern ctl_table rds_ib_sysctl_table[]; - -/* - * Helper functions for getting/setting the header and data SGEs in - * RDS packets (not RDMA) - * - * From version 3.1 onwards, header is in front of data in the sge. - */ -static inline struct ib_sge * -rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) -{ - if (ic->conn->c_version > RDS_PROTOCOL_3_0) - return &sge[0]; - else - return &sge[1]; -} - -static inline struct ib_sge * -rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) -{ - if (ic->conn->c_version > RDS_PROTOCOL_3_0) - return &sge[1]; - else - return &sge[0]; -} #endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 647cb8ffc39..31b74f5e61a 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -32,11 +32,43 @@ */ #include <linux/kernel.h> #include <linux/in.h> +#include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/ratelimit.h> #include "rds.h" #include "ib.h" +static char *rds_ib_event_type_strings[] = { +#define RDS_IB_EVENT_STRING(foo) \ + [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) + RDS_IB_EVENT_STRING(CQ_ERR), + RDS_IB_EVENT_STRING(QP_FATAL), + RDS_IB_EVENT_STRING(QP_REQ_ERR), + RDS_IB_EVENT_STRING(QP_ACCESS_ERR), + RDS_IB_EVENT_STRING(COMM_EST), + RDS_IB_EVENT_STRING(SQ_DRAINED), + RDS_IB_EVENT_STRING(PATH_MIG), + RDS_IB_EVENT_STRING(PATH_MIG_ERR), + RDS_IB_EVENT_STRING(DEVICE_FATAL), + RDS_IB_EVENT_STRING(PORT_ACTIVE), + RDS_IB_EVENT_STRING(PORT_ERR), + RDS_IB_EVENT_STRING(LID_CHANGE), + RDS_IB_EVENT_STRING(PKEY_CHANGE), + RDS_IB_EVENT_STRING(SM_CHANGE), + RDS_IB_EVENT_STRING(SRQ_ERR), + RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED), + RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED), + RDS_IB_EVENT_STRING(CLIENT_REREGISTER), +#undef RDS_IB_EVENT_STRING +}; + 
+static char *rds_ib_event_str(enum ib_event_type type) +{ + return rds_str_array(rds_ib_event_type_strings, + ARRAY_SIZE(rds_ib_event_type_strings), type); +}; + /* * Set the selected protocol version */ @@ -94,7 +126,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even { const struct rds_ib_connect_private *dp = NULL; struct rds_ib_connection *ic = conn->c_transport_data; - struct rds_ib_device *rds_ibdev; struct ib_qp_attr qp_attr; int err; @@ -110,11 +141,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } } - printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", - &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); + if (conn->c_version < RDS_PROTOCOL(3,1)) { + printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," + " no longer supported\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); + rds_conn_destroy(conn); + return; + } else { + printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + } /* * Init rings and fill recv. this needs to wait until protocol negotiation @@ -124,7 +165,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even rds_ib_recv_init_ring(ic); /* Post receive buffers - as a side effect, this will update * the posted credit count. 
*/ - rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); + rds_ib_recv_refill(conn, 1); /* Tune RNR behavior */ rds_ib_tune_rnr(ic, &qp_attr); @@ -134,12 +175,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even if (err) printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); - /* update ib_device with this local ipaddr & conn */ - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); - err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); + /* update ib_device with this local ipaddr */ + err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); if (err) - printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); - rds_ib_add_conn(rds_ibdev, conn); + printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", + err); /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ @@ -152,18 +192,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, struct rdma_conn_param *conn_param, struct rds_ib_connect_private *dp, - u32 protocol_version) + u32 protocol_version, + u32 max_responder_resources, + u32 max_initiator_depth) { + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + memset(conn_param, 0, sizeof(struct rdma_conn_param)); - /* XXX tune these? 
*/ - conn_param->responder_resources = 1; - conn_param->initiator_depth = 1; + + conn_param->responder_resources = + min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources); + conn_param->initiator_depth = + min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth); conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); conn_param->rnr_retry_count = 7; if (dp) { - struct rds_ib_connection *ic = conn->c_transport_data; - memset(dp, 0, sizeof(*dp)); dp->dp_saddr = conn->c_laddr; dp->dp_daddr = conn->c_faddr; @@ -188,7 +233,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, static void rds_ib_cq_event_handler(struct ib_event *event, void *data) { - rdsdebug("event %u data %p\n", event->event, data); + rdsdebug("event %u (%s) data %p\n", + event->event, rds_ib_event_str(event->event), data); } static void rds_ib_qp_event_handler(struct ib_event *event, void *data) @@ -196,16 +242,19 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) struct rds_connection *conn = data; struct rds_ib_connection *ic = conn->c_transport_data; - rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); + rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event, + rds_ib_event_str(event->event)); switch (event->event) { case IB_EVENT_COMM_EST: rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); break; default: - rds_ib_conn_error(conn, "RDS/IB: Fatal QP Event %u " + rdsdebug("Fatal QP Event %u (%s) " "- connection %pI4->%pI4, reconnecting\n", - event->event, &conn->c_laddr, &conn->c_faddr); + event->event, rds_ib_event_str(event->event), + &conn->c_laddr, &conn->c_faddr); + rds_conn_drop(conn); break; } } @@ -222,18 +271,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct rds_ib_device *rds_ibdev; int ret; - /* rds_ib_add_one creates a rds_ib_device object per IB device, - * and allocates a protection domain, memory range and FMR pool - * for each. 
If that fails for any reason, it will not register - * the rds_ibdev at all. + /* + * It's normal to see a null device if an incoming connection races + * with device removal, so we don't print a warning. */ - rds_ibdev = ib_get_client_data(dev, &rds_ib_client); - if (rds_ibdev == NULL) { - if (printk_ratelimit()) - printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", - dev->name); + rds_ibdev = rds_ib_get_client_data(dev); + if (!rds_ibdev) return -EOPNOTSUPP; - } + + /* add the conn now so that connection establishment has the dev */ + rds_ib_add_conn(rds_ibdev, conn); if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); @@ -304,7 +351,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_send_ring.w_nr * sizeof(struct rds_header), &ic->i_send_hdrs_dma, GFP_KERNEL); - if (ic->i_send_hdrs == NULL) { + if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent send failed\n"); goto out; @@ -314,7 +361,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_recv_ring.w_nr * sizeof(struct rds_header), &ic->i_recv_hdrs_dma, GFP_KERNEL); - if (ic->i_recv_hdrs == NULL) { + if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent recv failed\n"); goto out; @@ -322,27 +369,27 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), &ic->i_ack_dma, GFP_KERNEL); - if (ic->i_ack == NULL) { + if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent ack failed\n"); goto out; } - ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); - if (ic->i_sends == NULL) { + ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), + ibdev_to_node(dev)); + if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); goto out; } - memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); - ic->i_recvs = 
vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); - if (ic->i_recvs == NULL) { + ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), + ibdev_to_node(dev)); + if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); goto out; } - memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); rds_ib_recv_init_ack(ic); @@ -350,6 +397,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_send_cq, ic->i_recv_cq); out: + rds_ib_dev_put(rds_ibdev); return ret; } @@ -386,13 +434,11 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) version = RDS_PROTOCOL_3_0; while ((common >>= 1) != 0) version++; - } else if (printk_ratelimit()) { - printk(KERN_NOTICE "RDS: Connection from %pI4 using " - "incompatible protocol version %u.%u\n", - &dp->dp_saddr, - dp->dp_protocol_major, - dp->dp_protocol_minor); - } + } else + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", + &dp->dp_saddr, + dp->dp_protocol_major, + dp->dp_protocol_minor); return version; } @@ -407,7 +453,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, struct rds_ib_connection *ic = NULL; struct rdma_conn_param conn_param; u32 version; - int err, destroy = 1; + int err = 1, destroy = 1; /* Check whether the remote protocol version matches ours. 
*/ version = rds_ib_protocol_compatible(event); @@ -446,7 +492,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, /* Wait and see - our connect may still be succeeding */ rds_ib_stats_inc(s_ib_connect_raced); } - mutex_unlock(&conn->c_cm_lock); goto out; } @@ -476,20 +521,20 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, goto out; } - rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, + event->param.conn.responder_resources, + event->param.conn.initiator_depth); /* rdma_accept() calls rdma_reject() internally if it fails */ err = rdma_accept(cm_id, &conn_param); - mutex_unlock(&conn->c_cm_lock); - if (err) { + if (err) rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); - goto out; - } - - return 0; out: - rdma_reject(cm_id, NULL, 0); + if (conn) + mutex_unlock(&conn->c_cm_lock); + if (err) + rdma_reject(cm_id, NULL, 0); return destroy; } @@ -513,8 +558,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) goto out; } - rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); - + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, + UINT_MAX, UINT_MAX); ret = rdma_connect(cm_id, &conn_param); if (ret) rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); @@ -539,7 +584,7 @@ int rds_ib_conn_connect(struct rds_connection *conn) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP); + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); ic->i_cm_id = NULL; @@ -598,9 +643,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ic->i_cm_id, err); } + /* + * We want to wait for tx and rx completion to finish + * before we tear down the connection, but we have to be + * careful not to get stuck waiting on a send ring that + * only has unsignaled sends in it. 
We've shutdown new + * sends before getting here so by waiting for signaled + * sends to complete we're ensured that there will be no + * more tx processing. + */ wait_event(rds_ib_ring_empty_wait, - rds_ib_ring_empty(&ic->i_send_ring) && - rds_ib_ring_empty(&ic->i_recv_ring)); + rds_ib_ring_empty(&ic->i_recv_ring) && + (atomic_read(&ic->i_signaled_sends) == 0)); + tasklet_kill(&ic->i_recv_tasklet); if (ic->i_send_hdrs) ib_dma_free_coherent(dev, @@ -651,9 +706,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) BUG_ON(ic->rds_ibdev); /* Clear pending transmit */ - if (ic->i_rm) { - rds_message_put(ic->i_rm); - ic->i_rm = NULL; + if (ic->i_data_op) { + struct rds_message *rm; + + rm = container_of(ic->i_data_op, struct rds_message, data); + rds_message_put(rm); + ic->i_data_op = NULL; } /* Clear the ACK state */ @@ -687,12 +745,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) { struct rds_ib_connection *ic; unsigned long flags; + int ret; /* XXX too lazy? */ - ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); - if (ic == NULL) + ic = kzalloc(sizeof(struct rds_ib_connection), gfp); + if (!ic) return -ENOMEM; + ret = rds_ib_recv_alloc_caches(ic); + if (ret) { + kfree(ic); + return ret; + } + INIT_LIST_HEAD(&ic->ib_node); tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, (unsigned long) ic); @@ -700,6 +765,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&ic->i_ack_lock); #endif + atomic_set(&ic->i_signaled_sends, 0); /* * rds_ib_conn_shutdown() waits for these to be emptied so they @@ -741,6 +807,8 @@ void rds_ib_conn_free(void *arg) list_del(&ic->ib_node); spin_unlock_irq(lock_ptr); + rds_ib_recv_free_caches(ic); + kfree(ic); } diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 4b0da865a72..e8fdb172adb 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -31,11 +31,15 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> +#include 
<linux/rculist.h> +#include <linux/llist.h> #include "rds.h" -#include "rdma.h" #include "ib.h" +static DEFINE_PER_CPU(unsigned long, clean_list_grace); +#define CLEAN_LIST_BUSY_BIT 0 /* * This is stored as mr->r_trans_private. @@ -44,7 +48,11 @@ struct rds_ib_mr { struct rds_ib_device *device; struct rds_ib_mr_pool *pool; struct ib_fmr *fmr; - struct list_head list; + + struct llist_node llnode; + + /* unmap_list is for freeing */ + struct list_head unmap_list; unsigned int remap_count; struct scatterlist *sg; @@ -58,14 +66,16 @@ struct rds_ib_mr { */ struct rds_ib_mr_pool { struct mutex flush_lock; /* serialize fmr invalidate */ - struct work_struct flush_worker; /* flush worker */ + struct delayed_work flush_worker; /* flush worker */ - spinlock_t list_lock; /* protect variables below */ atomic_t item_count; /* total # of MRs */ atomic_t dirty_count; /* # dirty of MRs */ - struct list_head drop_list; /* MRs that have reached their max_maps limit */ - struct list_head free_list; /* unused MRs */ - struct list_head clean_list; /* unused & unamapped MRs */ + + struct llist_head drop_list; /* MRs that have reached their max_maps limit */ + struct llist_head free_list; /* unused MRs */ + struct llist_head clean_list; /* global unused & unamapped MRs */ + wait_queue_head_t flush_wait; + atomic_t free_pinned; /* memory pinned by free MRs */ unsigned long max_items; unsigned long max_items_soft; @@ -73,7 +83,7 @@ struct rds_ib_mr_pool { struct ib_fmr_attr fmr_attr; }; -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all); +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); static void rds_ib_mr_pool_flush_worker(struct work_struct *work); @@ -82,16 +92,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) struct rds_ib_device *rds_ibdev; struct rds_ib_ipaddr *i_ipaddr; - list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { - 
spin_lock_irq(&rds_ibdev->spinlock); - list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { - spin_unlock_irq(&rds_ibdev->spinlock); + atomic_inc(&rds_ibdev->refcount); + rcu_read_unlock(); return rds_ibdev; } } - spin_unlock_irq(&rds_ibdev->spinlock); } + rcu_read_unlock(); return NULL; } @@ -107,7 +118,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) i_ipaddr->ipaddr = ipaddr; spin_lock_irq(&rds_ibdev->spinlock); - list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list); + list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); spin_unlock_irq(&rds_ibdev->spinlock); return 0; @@ -115,17 +126,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { - struct rds_ib_ipaddr *i_ipaddr, *next; + struct rds_ib_ipaddr *i_ipaddr; + struct rds_ib_ipaddr *to_free = NULL; + spin_lock_irq(&rds_ibdev->spinlock); - list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { - list_del(&i_ipaddr->list); - kfree(i_ipaddr); + list_del_rcu(&i_ipaddr->list); + to_free = i_ipaddr; break; } } spin_unlock_irq(&rds_ibdev->spinlock); + + if (to_free) { + synchronize_rcu(); + kfree(to_free); + } } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) @@ -133,8 +151,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr); - if (rds_ibdev_old) + if (rds_ibdev_old) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + rds_ib_dev_put(rds_ibdev_old); + } return rds_ib_add_ipaddr(rds_ibdev, ipaddr); } @@ -149,12 +169,13 @@ void 
rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); - spin_lock_irq(&rds_ibdev->spinlock); + spin_lock(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); - spin_unlock_irq(&rds_ibdev->spinlock); + spin_unlock(&rds_ibdev->spinlock); spin_unlock_irq(&ib_nodev_conns_lock); ic->rds_ibdev = rds_ibdev; + atomic_inc(&rds_ibdev->refcount); } void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) @@ -174,18 +195,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection * spin_unlock(&ib_nodev_conns_lock); ic->rds_ibdev = NULL; + rds_ib_dev_put(rds_ibdev); } -void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock) +void rds_ib_destroy_nodev_conns(void) { struct rds_ib_connection *ic, *_ic; LIST_HEAD(tmp_list); /* avoid calling conn_destroy with irqs off */ - spin_lock_irq(list_lock); - list_splice(list, &tmp_list); - INIT_LIST_HEAD(list); - spin_unlock_irq(list_lock); + spin_lock_irq(&ib_nodev_conns_lock); + list_splice(&ib_nodev_conns, &tmp_list); + spin_unlock_irq(&ib_nodev_conns_lock); list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) rds_conn_destroy(ic->conn); @@ -199,12 +220,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) if (!pool) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&pool->free_list); - INIT_LIST_HEAD(&pool->drop_list); - INIT_LIST_HEAD(&pool->clean_list); + init_llist_head(&pool->free_list); + init_llist_head(&pool->drop_list); + init_llist_head(&pool->clean_list); mutex_init(&pool->flush_lock); - spin_lock_init(&pool->list_lock); - INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); + init_waitqueue_head(&pool->flush_wait); + INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); pool->fmr_attr.max_pages = fmr_message_size; pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; @@ -232,34 +253,52 @@ void 
rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { - flush_workqueue(rds_wq); - rds_ib_flush_mr_pool(pool, 1); - BUG_ON(atomic_read(&pool->item_count)); - BUG_ON(atomic_read(&pool->free_pinned)); + cancel_delayed_work_sync(&pool->flush_worker); + rds_ib_flush_mr_pool(pool, 1, NULL); + WARN_ON(atomic_read(&pool->item_count)); + WARN_ON(atomic_read(&pool->free_pinned)); kfree(pool); } static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; - unsigned long flags; + struct llist_node *ret; + unsigned long *flag; - spin_lock_irqsave(&pool->list_lock, flags); - if (!list_empty(&pool->clean_list)) { - ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list); - list_del_init(&ibmr->list); - } - spin_unlock_irqrestore(&pool->list_lock, flags); + preempt_disable(); + flag = &__get_cpu_var(clean_list_grace); + set_bit(CLEAN_LIST_BUSY_BIT, flag); + ret = llist_del_first(&pool->clean_list); + if (ret) + ibmr = llist_entry(ret, struct rds_ib_mr, llnode); + clear_bit(CLEAN_LIST_BUSY_BIT, flag); + preempt_enable(); return ibmr; } +static inline void wait_clean_list_grace(void) +{ + int cpu; + unsigned long *flag; + + for_each_online_cpu(cpu) { + flag = &per_cpu(clean_list_grace, cpu); + while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) + cpu_relax(); + } +} + static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) { struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; struct rds_ib_mr *ibmr = NULL; int err = 0, iter = 0; + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) + schedule_delayed_work(&pool->flush_worker, 10); + while (1) { ibmr = rds_ib_reuse_fmr(pool); if (ibmr) @@ -286,19 +325,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) /* We do have some empty MRs. Flush them out. 
*/ rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, &ibmr); + if (ibmr) + return ibmr; } - ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); if (!ibmr) { err = -ENOMEM; goto out_no_cigar; } + memset(ibmr, 0, sizeof(*ibmr)); + ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE), + IB_ACCESS_REMOTE_WRITE| + IB_ACCESS_REMOTE_ATOMIC), &pool->fmr_attr); if (IS_ERR(ibmr->fmr)) { err = PTR_ERR(ibmr->fmr); @@ -366,7 +410,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm if (page_cnt > fmr_message_size) return -EINVAL; - dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); + dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); if (!dma_pages) return -ENOMEM; @@ -440,6 +485,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) /* FIXME we need a way to tell a r/w MR * from a r/o MR */ + BUG_ON(irqs_disabled()); set_page_dirty(page); put_page(page); } @@ -475,33 +521,107 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr } /* + * given an llist of mrs, put them all into the list_head for more processing + */ +static void llist_append_to_list(struct llist_head *llist, struct list_head *list) +{ + struct rds_ib_mr *ibmr; + struct llist_node *node; + struct llist_node *next; + + node = llist_del_all(llist); + while (node) { + next = node->next; + ibmr = llist_entry(node, struct rds_ib_mr, llnode); + list_add_tail(&ibmr->unmap_list, list); + node = next; + } +} + +/* + * this takes a list head of mrs and turns it into linked llist nodes + * of clusters. Each cluster has linked llist nodes of + * MR_CLUSTER_SIZE mrs that are ready for reuse. 
+ */ +static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, + struct list_head *list, + struct llist_node **nodes_head, + struct llist_node **nodes_tail) +{ + struct rds_ib_mr *ibmr; + struct llist_node *cur = NULL; + struct llist_node **next = nodes_head; + + list_for_each_entry(ibmr, list, unmap_list) { + cur = &ibmr->llnode; + *next = cur; + next = &cur->next; + } + *next = NULL; + *nodes_tail = cur; +} + +/* * Flush our pool of MRs. * At a minimum, all currently unused MRs are unmapped. * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to this limit. */ -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, + int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr, *next; + struct llist_node *clean_nodes; + struct llist_node *clean_tail; LIST_HEAD(unmap_list); LIST_HEAD(fmr_list); unsigned long unpinned = 0; - unsigned long flags; unsigned int nfreed = 0, ncleaned = 0, free_goal; int ret = 0; rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); - mutex_lock(&pool->flush_lock); + if (ibmr_ret) { + DEFINE_WAIT(wait); + while(!mutex_trylock(&pool->flush_lock)) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + + prepare_to_wait(&pool->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (llist_empty(&pool->clean_list)) + schedule(); + + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + } + finish_wait(&pool->flush_wait, &wait); + } else + mutex_lock(&pool->flush_lock); + + if (ibmr_ret) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + goto out; + } + } - spin_lock_irqsave(&pool->list_lock, flags); /* Get the list of all MRs to be dropped. Ordering matters - - * we want to put drop_list ahead of free_list. 
*/ - list_splice_init(&pool->free_list, &unmap_list); - list_splice_init(&pool->drop_list, &unmap_list); + * we want to put drop_list ahead of free_list. + */ + llist_append_to_list(&pool->drop_list, &unmap_list); + llist_append_to_list(&pool->free_list, &unmap_list); if (free_all) - list_splice_init(&pool->clean_list, &unmap_list); - spin_unlock_irqrestore(&pool->list_lock, flags); + llist_append_to_list(&pool->clean_list, &unmap_list); free_goal = rds_ib_flush_goal(pool, free_all); @@ -509,19 +629,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) goto out; /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ - list_for_each_entry(ibmr, &unmap_list, list) + list_for_each_entry(ibmr, &unmap_list, unmap_list) list_add(&ibmr->fmr->list, &fmr_list); + ret = ib_unmap_fmr(&fmr_list); if (ret) printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); /* Now we can destroy the DMA mapping and unpin any pages */ - list_for_each_entry_safe(ibmr, next, &unmap_list, list) { + list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { unpinned += ibmr->sg_len; __rds_ib_teardown_mr(ibmr); if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { rds_ib_stats_inc(s_ib_rdma_mr_free); - list_del(&ibmr->list); + list_del(&ibmr->unmap_list); ib_dealloc_fmr(ibmr->fmr); kfree(ibmr); nfreed++; @@ -529,9 +650,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) ncleaned++; } - spin_lock_irqsave(&pool->list_lock, flags); - list_splice(&unmap_list, &pool->clean_list); - spin_unlock_irqrestore(&pool->list_lock, flags); + if (!list_empty(&unmap_list)) { + /* we have to make sure that none of the things we're about + * to put on the clean list would race with other cpus trying + * to pull items off. The llist would explode if we managed to + * remove something from the clean list and then add it back again + * while another CPU was spinning on that same item in llist_del_first. 
+ * + * This is pretty unlikely, but just in case wait for an llist grace period + * here before adding anything back into the clean list. + */ + wait_clean_list_grace(); + + list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail); + if (ibmr_ret) + *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); + + /* more than one entry in llist nodes */ + if (clean_nodes->next) + llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list); + + } atomic_sub(unpinned, &pool->free_pinned); atomic_sub(ncleaned, &pool->dirty_count); @@ -539,14 +678,17 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) out: mutex_unlock(&pool->flush_lock); + if (waitqueue_active(&pool->flush_wait)) + wake_up(&pool->flush_wait); +out_nolock: return ret; } static void rds_ib_mr_pool_flush_worker(struct work_struct *work) { - struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); + struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, NULL); } void rds_ib_free_mr(void *trans_private, int invalidate) @@ -554,47 +696,48 @@ void rds_ib_free_mr(void *trans_private, int invalidate) struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; - unsigned long flags; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); /* Return it to the pool's free list */ - spin_lock_irqsave(&pool->list_lock, flags); if (ibmr->remap_count >= pool->fmr_attr.max_maps) - list_add(&ibmr->list, &pool->drop_list); + llist_add(&ibmr->llnode, &pool->drop_list); else - list_add(&ibmr->list, &pool->free_list); + llist_add(&ibmr->llnode, &pool->free_list); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); - spin_unlock_irqrestore(&pool->list_lock, flags); /* If we've pinned too many pages, request a flush */ if 
(atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 10) - queue_work(rds_wq, &pool->flush_worker); + schedule_delayed_work(&pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked * as use_once and invalidate at the same time. */ - queue_work(rds_wq, &pool->flush_worker); + schedule_delayed_work(&pool->flush_worker, 10); } } + + rds_ib_dev_put(rds_ibdev); } void rds_ib_flush_mrs(void) { struct rds_ib_device *rds_ibdev; + down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; if (pool) - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, NULL); } + up_read(&rds_ib_devices_lock); } void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, @@ -626,6 +769,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); ibmr->device = rds_ibdev; + rds_ibdev = NULL; out: if (ret) { @@ -633,5 +777,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, rds_ib_free_mr(ibmr, 0); ibmr = ERR_PTR(ret); } + if (rds_ibdev) + rds_ib_dev_put(rds_ibdev); return ibmr; } + diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 04dc0d3f3c9..d67de453c35 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -31,6 +31,7 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/pci.h> #include <linux/dma-mapping.h> #include <rdma/rdma_cm.h> @@ -42,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab; static struct kmem_cache *rds_ib_frag_slab; static atomic_t rds_ib_allocation = ATOMIC_INIT(0); -static void rds_ib_frag_drop_page(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, frag->f_page); - __free_page(frag->f_page); - frag->f_page = NULL; -} - 
-static void rds_ib_frag_free(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, frag->f_page); - BUG_ON(frag->f_page != NULL); - kmem_cache_free(rds_ib_frag_slab, frag); -} - -/* - * We map a page at a time. Its fragments are posted in order. This - * is called in fragment order as the fragments get send completion events. - * Only the last frag in the page performs the unmapping. - * - * It's OK for ring cleanup to call this in whatever order it likes because - * DMA is not in flight and so we can unmap while other ring entries still - * hold page references in their frags. - */ -static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic, - struct rds_ib_recv_work *recv) -{ - struct rds_page_frag *frag = recv->r_frag; - - rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); - if (frag->f_mapped) - ib_dma_unmap_page(ic->i_cm_id->device, - frag->f_mapped, - RDS_FRAG_SIZE, DMA_FROM_DEVICE); - frag->f_mapped = 0; -} - void rds_ib_recv_init_ring(struct rds_ib_connection *ic) { struct rds_ib_recv_work *recv; @@ -94,18 +59,163 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) recv->r_wr.sg_list = recv->r_sge; recv->r_wr.num_sge = RDS_IB_RECV_SGE; - sge = rds_ib_data_sge(ic, recv->r_sge); + sge = &recv->r_sge[0]; + sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + + sge = &recv->r_sge[1]; sge->addr = 0; sge->length = RDS_FRAG_SIZE; sge->lkey = ic->i_mr->lkey; + } +} - sge = rds_ib_header_sge(ic, recv->r_sge); - sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; +/* + * The entire 'from' list, including the from element itself, is put on + * to the tail of the 'to' list. 
+ */ +static void list_splice_entire_tail(struct list_head *from, + struct list_head *to) +{ + struct list_head *from_last = from->prev; + + list_splice_tail(from_last, to); + list_add_tail(from_last, to); +} + +static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache) +{ + struct list_head *tmp; + + tmp = xchg(&cache->xfer, NULL); + if (tmp) { + if (cache->ready) + list_splice_entire_tail(tmp, cache->ready); + else + cache->ready = tmp; } } +static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) +{ + struct rds_ib_cache_head *head; + int cpu; + + cache->percpu = alloc_percpu(struct rds_ib_cache_head); + if (!cache->percpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + head->first = NULL; + head->count = 0; + } + cache->xfer = NULL; + cache->ready = NULL; + + return 0; +} + +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic) +{ + int ret; + + ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs); + if (!ret) { + ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags); + if (ret) + free_percpu(ic->i_cache_incs.percpu); + } + + return ret; +} + +static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache, + struct list_head *caller_list) +{ + struct rds_ib_cache_head *head; + int cpu; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + if (head->first) { + list_splice_entire_tail(head->first, caller_list); + head->first = NULL; + } + } + + if (cache->ready) { + list_splice_entire_tail(cache->ready, caller_list); + cache->ready = NULL; + } +} + +void rds_ib_recv_free_caches(struct rds_ib_connection *ic) +{ + struct rds_ib_incoming *inc; + struct rds_ib_incoming *inc_tmp; + struct rds_page_frag *frag; + struct rds_page_frag *frag_tmp; + LIST_HEAD(list); + + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list); + free_percpu(ic->i_cache_incs.percpu); + + list_for_each_entry_safe(inc, inc_tmp, &list, 
ii_cache_entry) { + list_del(&inc->ii_cache_entry); + WARN_ON(!list_empty(&inc->ii_frags)); + kmem_cache_free(rds_ib_incoming_slab, inc); + } + + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); + rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list); + free_percpu(ic->i_cache_frags.percpu); + + list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) { + list_del(&frag->f_cache_entry); + WARN_ON(!list_empty(&frag->f_item)); + kmem_cache_free(rds_ib_frag_slab, frag); + } +} + +/* fwd decl */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache); +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache); + + +/* Recycle frag and attached recv buffer f_sg */ +static void rds_ib_frag_free(struct rds_ib_connection *ic, + struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); + + rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); +} + +/* Recycle inc after freeing attached frags */ +void rds_ib_inc_free(struct rds_incoming *inc) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + struct rds_ib_connection *ic = inc->i_conn->c_transport_data; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + + /* Free attached frags */ + list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + rds_ib_frag_free(ic, frag); + } + BUG_ON(!list_empty(&ibinc->ii_frags)); + + rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); + rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs); +} + static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, struct rds_ib_recv_work *recv) { @@ -114,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, recv->r_ibinc = NULL; } if (recv->r_frag) { - rds_ib_recv_unmap_page(ic, recv); - if (recv->r_frag->f_page) - rds_ib_frag_drop_page(recv->r_frag); - rds_ib_frag_free(recv->r_frag); + 
ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); + rds_ib_frag_free(ic, recv->r_frag); recv->r_frag = NULL; } } @@ -128,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) for (i = 0; i < ic->i_recv_ring.w_nr; i++) rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); - - if (ic->i_frag.f_page) - rds_ib_frag_drop_page(&ic->i_frag); } -static int rds_ib_recv_refill_one(struct rds_connection *conn, - struct rds_ib_recv_work *recv, - gfp_t kptr_gfp, gfp_t page_gfp) +static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic, + gfp_t slab_mask) { - struct rds_ib_connection *ic = conn->c_transport_data; - dma_addr_t dma_addr; - struct ib_sge *sge; - int ret = -ENOMEM; + struct rds_ib_incoming *ibinc; + struct list_head *cache_item; + int avail_allocs; - if (recv->r_ibinc == NULL) { - if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) { + cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs); + if (cache_item) { + ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry); + } else { + avail_allocs = atomic_add_unless(&rds_ib_allocation, + 1, rds_ib_sysctl_max_recv_allocation); + if (!avail_allocs) { rds_ib_stats_inc(s_ib_rx_alloc_limit); - goto out; + return NULL; } - recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, - kptr_gfp); - if (recv->r_ibinc == NULL) { + ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask); + if (!ibinc) { atomic_dec(&rds_ib_allocation); - goto out; + return NULL; } - INIT_LIST_HEAD(&recv->r_ibinc->ii_frags); - rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); } + INIT_LIST_HEAD(&ibinc->ii_frags); + rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); - if (recv->r_frag == NULL) { - recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); - if (recv->r_frag == NULL) - goto out; - INIT_LIST_HEAD(&recv->r_frag->f_item); - recv->r_frag->f_page = NULL; + return ibinc; +} + +static struct rds_page_frag 
*rds_ib_refill_one_frag(struct rds_ib_connection *ic, + gfp_t slab_mask, gfp_t page_mask) +{ + struct rds_page_frag *frag; + struct list_head *cache_item; + int ret; + + cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); + if (cache_item) { + frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); + } else { + frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); + if (!frag) + return NULL; + + sg_init_table(&frag->f_sg, 1); + ret = rds_page_remainder_alloc(&frag->f_sg, + RDS_FRAG_SIZE, page_mask); + if (ret) { + kmem_cache_free(rds_ib_frag_slab, frag); + return NULL; + } } - if (ic->i_frag.f_page == NULL) { - ic->i_frag.f_page = alloc_page(page_gfp); - if (ic->i_frag.f_page == NULL) - goto out; - ic->i_frag.f_offset = 0; + INIT_LIST_HEAD(&frag->f_item); + + return frag; +} + +static int rds_ib_recv_refill_one(struct rds_connection *conn, + struct rds_ib_recv_work *recv, int prefill) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_sge *sge; + int ret = -ENOMEM; + gfp_t slab_mask = GFP_NOWAIT; + gfp_t page_mask = GFP_NOWAIT; + + if (prefill) { + slab_mask = GFP_KERNEL; + page_mask = GFP_HIGHUSER; } - dma_addr = ib_dma_map_page(ic->i_cm_id->device, - ic->i_frag.f_page, - ic->i_frag.f_offset, - RDS_FRAG_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) - goto out; + if (!ic->i_cache_incs.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + if (!ic->i_cache_frags.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); /* - * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() - * must be called on this recv. This happens as completions hit - * in order or on connection shutdown. + * ibinc was taken from recv if recv contained the start of a message. + * recvs that were continuations will still have this allocated. 
*/ - recv->r_frag->f_page = ic->i_frag.f_page; - recv->r_frag->f_offset = ic->i_frag.f_offset; - recv->r_frag->f_mapped = dma_addr; + if (!recv->r_ibinc) { + recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask); + if (!recv->r_ibinc) + goto out; + } - sge = rds_ib_data_sge(ic, recv->r_sge); - sge->addr = dma_addr; - sge->length = RDS_FRAG_SIZE; + WARN_ON(recv->r_frag); /* leak! */ + recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask); + if (!recv->r_frag) + goto out; - sge = rds_ib_header_sge(ic, recv->r_sge); + ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, + 1, DMA_FROM_DEVICE); + WARN_ON(ret != 1); + + sge = &recv->r_sge[0]; sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); sge->length = sizeof(struct rds_header); - get_page(recv->r_frag->f_page); - - if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { - ic->i_frag.f_offset += RDS_FRAG_SIZE; - } else { - put_page(ic->i_frag.f_page); - ic->i_frag.f_page = NULL; - ic->i_frag.f_offset = 0; - } + sge = &recv->r_sge[1]; + sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg); + sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg); ret = 0; out: @@ -215,13 +350,11 @@ out: /* * This tries to allocate and post unused work requests after making sure that * they have all the allocations they need to queue received fragments into - * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc - * pairs don't go unmatched. + * sockets. * * -1 is returned if posting fails due to temporary resource exhaustion. 
*/ -int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill) +void rds_ib_recv_refill(struct rds_connection *conn, int prefill) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_recv_work *recv; @@ -235,28 +368,28 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, if (pos >= ic->i_recv_ring.w_nr) { printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", pos); - ret = -EINVAL; break; } recv = &ic->i_recvs[pos]; - ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); + ret = rds_ib_recv_refill_one(conn, recv, prefill); if (ret) { - ret = -1; break; } /* XXX when can this fail? */ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, - recv->r_ibinc, recv->r_frag->f_page, - (long) recv->r_frag->f_mapped, ret); + recv->r_ibinc, sg_page(&recv->r_frag->f_sg), + (long) ib_sg_dma_address( + ic->i_cm_id->device, + &recv->r_frag->f_sg), + ret); if (ret) { rds_ib_conn_error(conn, "recv post on " "%pI4 returned %d, disconnecting and " "reconnecting\n", &conn->c_faddr, ret); - ret = -1; break; } @@ -269,37 +402,74 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, if (ret) rds_ib_ring_unalloc(&ic->i_recv_ring, 1); - return ret; } -void rds_ib_inc_purge(struct rds_incoming *inc) +/* + * We want to recycle several types of recv allocations, like incs and frags. + * To use this, the *_free() function passes in the ptr to a list_head within + * the recyclee, as well as the cache to put it on. + * + * First, we put the memory on a percpu list. When this reaches a certain size, + * We move it to an intermediate non-percpu list in a lockless manner, with some + * xchg/cmpxchg wizardry. + * + * N.B. Instead of a list_head as the anchor, we use a single pointer, which can + * be NULL and xchg'd. 
The list is actually empty when the pointer is NULL, and + * list_empty() will return true when one element is actually present. + */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache) { - struct rds_ib_incoming *ibinc; - struct rds_page_frag *frag; - struct rds_page_frag *pos; + unsigned long flags; + struct list_head *old, *chpfirst; - ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); - rdsdebug("purging ibinc %p inc %p\n", ibinc, inc); + local_irq_save(flags); - list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { - list_del_init(&frag->f_item); - rds_ib_frag_drop_page(frag); - rds_ib_frag_free(frag); - } + chpfirst = __this_cpu_read(cache->percpu->first); + if (!chpfirst) + INIT_LIST_HEAD(new_item); + else /* put on front */ + list_add_tail(new_item, chpfirst); + + __this_cpu_write(cache->percpu->first, new_item); + __this_cpu_inc(cache->percpu->count); + + if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT) + goto end; + + /* + * Return our per-cpu first list to the cache's xfer by atomically + * grabbing the current xfer list, appending it to our per-cpu list, + * and then atomically returning that entire list back to the + * cache's xfer list as long as it's still empty. 
+ */ + do { + old = xchg(&cache->xfer, NULL); + if (old) + list_splice_entire_tail(old, chpfirst); + old = cmpxchg(&cache->xfer, NULL, chpfirst); + } while (old); + + + __this_cpu_write(cache->percpu->first, NULL); + __this_cpu_write(cache->percpu->count, 0); +end: + local_irq_restore(flags); } -void rds_ib_inc_free(struct rds_incoming *inc) +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache) { - struct rds_ib_incoming *ibinc; - - ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + struct list_head *head = cache->ready; + + if (head) { + if (!list_empty(head)) { + cache->ready = head->next; + list_del_init(head); + } else + cache->ready = NULL; + } - rds_ib_inc_purge(inc); - rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); - BUG_ON(!list_empty(&ibinc->ii_frags)); - kmem_cache_free(rds_ib_incoming_slab, ibinc); - atomic_dec(&rds_ib_allocation); - BUG_ON(atomic_read(&rds_ib_allocation) < 0); + return head; } int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, @@ -335,13 +505,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, to_copy = min_t(unsigned long, to_copy, len - copied); rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " - "[%p, %lu] + %lu\n", + "[%p, %u] + %lu\n", to_copy, iov->iov_base, iov->iov_len, iov_off, - frag->f_page, frag->f_offset, frag_off); + sg_page(&frag->f_sg), frag->f_sg.offset, frag_off); /* XXX needs + offset for multiple recvs per page */ - ret = rds_page_copy_to_user(frag->f_page, - frag->f_offset + frag_off, + ret = rds_page_copy_to_user(sg_page(&frag->f_sg), + frag->f_sg.offset + frag_off, iov->iov_base + iov_off, to_copy); if (ret) { @@ -428,7 +598,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -436,7 +606,7 @@ static void 
rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, static u64 rds_ib_get_ack(struct rds_ib_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } @@ -468,8 +638,8 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); rds_ib_stats_inc(s_ib_ack_send_failure); - /* Need to finesse this later. */ - BUG(); + + rds_ib_conn_error(ic->conn, "sending ack failed\n"); } else rds_ib_stats_inc(s_ib_ack_sent); } @@ -556,47 +726,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) return rds_ib_get_ack(ic); } -static struct rds_header *rds_ib_get_header(struct rds_connection *conn, - struct rds_ib_recv_work *recv, - u32 data_len) -{ - struct rds_ib_connection *ic = conn->c_transport_data; - void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs]; - void *addr; - u32 misplaced_hdr_bytes; - - /* - * Support header at the front (RDS 3.1+) as well as header-at-end. - * - * Cases: - * 1) header all in header buff (great!) 
- * 2) header all in data page (copy all to header buff) - * 3) header split across hdr buf + data page - * (move bit in hdr buff to end before copying other bit from data page) - */ - if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE) - return hdr_buff; - - if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) { - addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); - memcpy(hdr_buff, - addr + recv->r_frag->f_offset + data_len, - sizeof(struct rds_header)); - kunmap_atomic(addr, KM_SOFTIRQ0); - return hdr_buff; - } - - misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len)); - - memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes); - - addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); - memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len, - sizeof(struct rds_header) - misplaced_hdr_bytes); - kunmap_atomic(addr, KM_SOFTIRQ0); - return hdr_buff; -} - /* * It's kind of lame that we're copying from the posted receive pages into * long-lived bitmaps. We could have posted the bitmaps and rdma written into @@ -638,7 +767,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); BUG_ON(to_copy & 7); /* Must be 64bit aligned. 
*/ - addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); + addr = kmap_atomic(sg_page(&frag->f_sg)); src = addr + frag_off; dst = (void *)map->m_page_addrs[map_page] + map_off; @@ -648,7 +777,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, uncongested |= ~(*src) & *dst; *dst++ = *src++; } - kunmap_atomic(addr, KM_SOFTIRQ0); + kunmap_atomic(addr); copied += to_copy; @@ -701,7 +830,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, if (data_len < sizeof(struct rds_header)) { rds_ib_conn_error(conn, "incoming message " - "from %pI4 didn't inclue a " + "from %pI4 didn't include a " "header, disconnecting and " "reconnecting\n", &conn->c_faddr); @@ -709,7 +838,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, } data_len -= sizeof(struct rds_header); - ihdr = rds_ib_get_header(conn, recv, data_len); + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; /* Validate the checksum. */ if (!rds_message_verify_checksum(ihdr)) { @@ -741,12 +870,12 @@ static void rds_ib_process_recv(struct rds_connection *conn, * the inc is freed. We don't go that route, so we have to drop the * page ref ourselves. We can't just leave the page on the recv * because that confuses the dma mapping of pages and each recv's use - * of a partial page. We can leave the frag, though, it will be - * reused. + * of a partial page. * * FIXME: Fold this into the code path below. */ - rds_ib_frag_drop_page(recv->r_frag); + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; return; } @@ -756,7 +885,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, * into the inc and save the inc so we can hang upcoming fragments * off its list. 
*/ - if (ibinc == NULL) { + if (!ibinc) { ibinc = recv->r_ibinc; recv->r_ibinc = NULL; ic->i_ibinc = ibinc; @@ -794,8 +923,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, rds_ib_cong_recv(conn, ibinc); else { rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, - &ibinc->ii_inc, GFP_ATOMIC, - KM_SOFTIRQ0); + &ibinc->ii_inc, GFP_ATOMIC); state->ack_next = be64_to_cpu(hdr->h_sequence); state->ack_next_valid = 1; } @@ -841,32 +969,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, struct rds_ib_recv_work *recv; while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, + rds_ib_wc_status_str(wc.status), wc.byte_len, be32_to_cpu(wc.ex.imm_data)); rds_ib_stats_inc(s_ib_rx_cq_event); recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; - rds_ib_recv_unmap_page(ic, recv); + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); /* * Also process recvs in connecting state because it is possible * to get a recv completion _before_ the rdmacm ESTABLISHED * event is processed. 
*/ - if (rds_conn_up(conn) || rds_conn_connecting(conn)) { + if (wc.status == IB_WC_SUCCESS) { + rds_ib_process_recv(conn, recv, wc.byte_len, state); + } else { /* We expect errors as the qp is drained during shutdown */ - if (wc.status == IB_WC_SUCCESS) { - rds_ib_process_recv(conn, recv, wc.byte_len, state); - } else { - rds_ib_conn_error(conn, "recv completion on " - "%pI4 had status %u, disconnecting and " - "reconnecting\n", &conn->c_faddr, - wc.status); - } + if (rds_conn_up(conn) || rds_conn_connecting(conn)) + rds_ib_conn_error(conn, "recv completion on %pI4 had " + "status %u (%s), disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc.status, + rds_ib_wc_status_str(wc.status)); } + /* + * It's very important that we only free this ring entry if we've truly + * freed the resources allocated to the entry. The refilling path can + * leak if we don't. + */ rds_ib_ring_free(&ic->i_recv_ring, 1); } } @@ -896,11 +1030,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data) if (rds_ib_ring_empty(&ic->i_recv_ring)) rds_ib_stats_inc(s_ib_rx_ring_empty); - /* - * If the ring is running low, then schedule the thread to refill. - */ if (rds_ib_ring_low(&ic->i_recv_ring)) - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + rds_ib_recv_refill(conn, 0); } int rds_ib_recv(struct rds_connection *conn) @@ -909,25 +1040,13 @@ int rds_ib_recv(struct rds_connection *conn) int ret = 0; rdsdebug("conn %p\n", conn); - - /* - * If we get a temporary posting failure in this context then - * we're really low and we want the caller to back off for a bit. 
- */ - mutex_lock(&ic->i_recv_mutex); - if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) - ret = -ENOMEM; - else - rds_ib_stats_inc(s_ib_rx_refill_from_thread); - mutex_unlock(&ic->i_recv_mutex); - if (rds_conn_up(conn)) rds_ib_attempt_ack(ic); return ret; } -int __init rds_ib_recv_init(void) +int rds_ib_recv_init(void) { struct sysinfo si; int ret = -ENOMEM; @@ -938,14 +1057,14 @@ int __init rds_ib_recv_init(void) rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", sizeof(struct rds_ib_incoming), - 0, 0, NULL); - if (rds_ib_incoming_slab == NULL) + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_incoming_slab) goto out; rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", sizeof(struct rds_page_frag), - 0, 0, NULL); - if (rds_ib_frag_slab == NULL) + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_frag_slab) kmem_cache_destroy(rds_ib_incoming_slab); else ret = 0; diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index a10fab6886d..1dde91e3dc7 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -34,13 +34,52 @@ #include <linux/in.h> #include <linux/device.h> #include <linux/dmapool.h> +#include <linux/ratelimit.h> #include "rds.h" -#include "rdma.h" #include "ib.h" -static void rds_ib_send_rdma_complete(struct rds_message *rm, - int wc_status) +static char *rds_ib_wc_status_strings[] = { +#define RDS_IB_WC_STATUS_STR(foo) \ + [IB_WC_##foo] = __stringify(IB_WC_##foo) + RDS_IB_WC_STATUS_STR(SUCCESS), + RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), + RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), + RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), + RDS_IB_WC_STATUS_STR(MW_BIND_ERR), + RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), + RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_OP_ERR), + RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), + 
RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), + RDS_IB_WC_STATUS_STR(INV_EECN_ERR), + RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), + RDS_IB_WC_STATUS_STR(FATAL_ERR), + RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), + RDS_IB_WC_STATUS_STR(GENERAL_ERR), +#undef RDS_IB_WC_STATUS_STR +}; + +char *rds_ib_wc_status_str(enum ib_wc_status status) +{ + return rds_str_array(rds_ib_wc_status_strings, + ARRAY_SIZE(rds_ib_wc_status_strings), status); +} + +/* + * Convert IB-specific error message to RDS error message and call core + * completion handler. + */ +static void rds_ib_send_complete(struct rds_message *rm, + int wc_status, + void (*complete)(struct rds_message *rm, int status)) { int notify_status; @@ -60,69 +99,124 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm, notify_status = RDS_RDMA_OTHER_ERROR; break; } - rds_rdma_send_complete(rm, notify_status); + complete(rm, notify_status); +} + +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); } static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, - struct rds_rdma_op *op) + struct rm_rdma_op *op, + int wc_status) { - if (op->r_mapped) { + if (op->op_mapped) { ib_dma_unmap_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, - op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->r_mapped = 0; + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; } + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. 
Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * we would need to take an event for the rdma WR. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. + */ + rds_ib_send_complete(container_of(op, struct rds_message, rdma), + wc_status, rds_rdma_send_complete); + + if (op->op_write) + rds_stats_add(s_send_rdma_bytes, op->op_bytes); + else + rds_stats_add(s_recv_rdma_bytes, op->op_bytes); } -static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, - struct rds_ib_send_work *send, - int wc_status) +static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, + struct rm_atomic_op *op, + int wc_status) { - struct rds_message *rm = send->s_rm; - - rdsdebug("ic %p send %p rm %p\n", ic, send, rm); - - ib_dma_unmap_sg(ic->i_cm_id->device, - rm->m_sg, rm->m_nents, - DMA_TO_DEVICE); - - if (rm->m_rdma_op != NULL) { - rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); - - /* If the user asked for a completion notification on this - * message, we can implement three different semantics: - * 1. Notify when we received the ACK on the RDS message - * that was queued with the RDMA. This provides reliable - * notification of RDMA status at the expense of a one-way - * packet delay. - * 2. Notify when the IB stack gives us the completion event for - * the RDMA operation. - * 3. Notify when the IB stack gives us the completion event for - * the accompanying RDS messages. - * Here, we implement approach #3. To implement approach #2, - * call rds_rdma_send_complete from the cq_handler. 
To implement #1, - * don't call rds_rdma_send_complete at all, and fall back to the notify - * handling in the ACK processing code. - * - * Note: There's no need to explicitly sync any RDMA buffers using - * ib_dma_sync_sg_for_cpu - the completion for the RDMA - * operation itself unmapped the RDMA buffers, which takes care - * of synching. - */ - rds_ib_send_rdma_complete(rm, wc_status); + /* unmap atomic recvbuf */ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, + DMA_FROM_DEVICE); + op->op_mapped = 0; + } - if (rm->m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); - else - rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + rds_ib_send_complete(container_of(op, struct rds_message, atomic), + wc_status, rds_atomic_send_complete); + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) + rds_ib_stats_inc(s_ib_atomic_cswp); + else + rds_ib_stats_inc(s_ib_atomic_fadd); +} + +/* + * Unmap the resources associated with a struct send_work. + * + * Returns the rm for no good reason other than it is unobtainable + * other than by switching on wr.opcode, currently, and the caller, + * the event handler, needs it. 
+ */ +static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + int wc_status) +{ + struct rds_message *rm = NULL; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, data); + rds_ib_send_unmap_data(ic, send->s_op, wc_status); + } + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, rdma); + rds_ib_send_unmap_rdma(ic, send->s_op, wc_status); + } + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_ATOMIC_CMP_AND_SWP: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, atomic); + rds_ib_send_unmap_atomic(ic, send->s_op, wc_status); + } + break; + default: + printk_ratelimited(KERN_NOTICE + "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; } - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); + send->s_wr.opcode = 0xdead; - rds_message_put(rm); - send->s_rm = NULL; + return rm; } void rds_ib_send_init_ring(struct rds_ib_connection *ic) @@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { struct ib_sge *sge; - send->s_rm = NULL; send->s_op = NULL; send->s_wr.wr_id = i; send->s_wr.sg_list = send->s_sge; - send->s_wr.num_sge = 1; - send->s_wr.opcode = IB_WR_SEND; - send->s_wr.send_flags = 0; send->s_wr.ex.imm_data = 0; - sge = rds_ib_data_sge(ic, send->s_sge); - sge->lkey = ic->i_mr->lkey; - - sge = rds_ib_header_sge(ic, send->s_sge); + sge = &send->s_sge[0]; sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); sge->lkey = ic->i_mr->lkey; + + send->s_sge[1].lkey = ic->i_mr->lkey; } } @@ -159,16 +248,24 @@ void 
rds_ib_send_clear_ring(struct rds_ib_connection *ic) u32 i; for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { - if (send->s_wr.opcode == 0xdead) - continue; - if (send->s_rm) - rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); - if (send->s_op) - rds_ib_send_unmap_rdma(ic, send->s_op); + if (send->s_op && send->s_wr.opcode != 0xdead) + rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR); } } /* + * The only fast path caller always has a non-zero nr, so we don't + * bother testing nr before performing the atomic sub. + */ +static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) +{ + if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); + BUG_ON(atomic_read(&ic->i_signaled_sends) < 0); +} + +/* * The _oldest/_free ring operations here race cleanly with the alloc/unalloc * operations performed in the send path. As the sender allocs and potentially * unallocs the next free entry in the ring it doesn't alter which is @@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) { struct rds_connection *conn = context; struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_message *rm = NULL; struct ib_wc wc; struct rds_ib_send_work *send; u32 completed; u32 oldest; u32 i = 0; int ret; + int nr_sig = 0; rdsdebug("cq %p conn %p\n", cq, conn); rds_ib_stats_inc(s_ib_tx_cq_call); @@ -192,13 +291,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) rdsdebug("ib_req_notify_cq send failed: %d\n", ret); while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, + rds_ib_wc_status_str(wc.status), wc.byte_len, be32_to_cpu(wc.ex.imm_data)); rds_ib_stats_inc(s_ib_tx_cq_event); if (wc.wr_id == 
RDS_IB_ACK_WR_ID) { - if (ic->i_ack_queued + HZ/2 < jiffies) + if (time_after(jiffies, ic->i_ack_queued + HZ/2)) rds_ib_stats_inc(s_ib_tx_stalled); rds_ib_ack_send_complete(ic); continue; @@ -210,47 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) for (i = 0; i < completed; i++) { send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; - /* In the error case, wc.opcode sometimes contains garbage */ - switch (send->s_wr.opcode) { - case IB_WR_SEND: - if (send->s_rm) - rds_ib_send_unmap_rm(ic, send, wc.status); - break; - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_READ: - /* Nothing to be done - the SG list will be unmapped - * when the SEND completes. */ - break; - default: - if (printk_ratelimit()) - printk(KERN_NOTICE - "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", - __func__, send->s_wr.opcode); - break; - } + rm = rds_ib_send_unmap_op(ic, send, wc.status); - send->s_wr.opcode = 0xdead; - send->s_wr.num_sge = 1; - if (send->s_queued + HZ/2 < jiffies) + if (time_after(jiffies, send->s_queued + HZ/2)) rds_ib_stats_inc(s_ib_tx_stalled); - /* If a RDMA operation produced an error, signal this right - * away. If we don't, the subsequent SEND that goes with this - * RDMA will be canceled with ERR_WFLUSH, and the application - * never learn that the RDMA failed. 
*/ - if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { - struct rds_message *rm; - - rm = rds_send_get_message(conn, send->s_op); - if (rm) - rds_ib_send_rdma_complete(rm, wc.status); + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + } + rds_message_put(rm); + send->s_op = NULL; } oldest = (oldest + 1) % ic->i_send_ring.w_nr; } rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + nr_sig = 0; if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || test_bit(0, &conn->c_map_queued)) @@ -258,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) /* We expect errors as the qp is drained during shutdown */ if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, - "send completion on %pI4 " - "had status %u, disconnecting and reconnecting\n", - &conn->c_faddr, wc.status); + rds_ib_conn_error(conn, "send completion on %pI4 had status " + "%u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, wc.status, + rds_ib_wc_status_str(wc.status)); } } } @@ -272,7 +355,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) * * Conceptually, we have two counters: * - send credits: this tells us how many WRs we're allowed - * to submit without overruning the reciever's queue. For + * to submit without overruning the receiver's queue. For * each SEND WR we post, we decrement this by one. * * - posted credits: this tells us how many WRs we recently @@ -290,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) * credits (see rds_ib_send_add_credits below). * * The RDS send code is essentially single-threaded; rds_send_xmit - * grabs c_send_lock to ensure exclusive access to the send ring. + * sets RDS_IN_XMIT to ensure exclusive access to the send ring. 
* However, the ACK sending code is independent and can race with * message SENDs. * @@ -409,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } -static inline void -rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, - struct rds_ib_send_work *send, unsigned int pos, - unsigned long buffer, unsigned int length, - int send_flags) +static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + bool notify) { - struct ib_sge *sge; - - WARN_ON(pos != send - ic->i_sends); - - send->s_wr.send_flags = send_flags; - send->s_wr.opcode = IB_WR_SEND; - send->s_wr.num_sge = 2; - send->s_wr.next = NULL; - send->s_queued = jiffies; - send->s_op = NULL; - - if (length != 0) { - sge = rds_ib_data_sge(ic, send->s_sge); - sge->addr = buffer; - sge->length = length; - sge->lkey = ic->i_mr->lkey; - - sge = rds_ib_header_sge(ic, send->s_sge); - } else { - /* We're sending a packet with no payload. There is only - * one SGE */ - send->s_wr.num_sge = 1; - sge = &send->s_sge[0]; + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. 
+ */ + if (ic->i_unsignaled_wrs-- == 0 || notify) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED; + return 1; } - - sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + return 0; } /* @@ -471,17 +535,27 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, u32 pos; u32 i; u32 work_alloc; - u32 credit_alloc; + u32 credit_alloc = 0; u32 posted; u32 adv_credits = 0; int send_flags = 0; - int sent; + int bytes_sent = 0; int ret; int flow_controlled = 0; + int nr_sig = 0; BUG_ON(off % RDS_FRAG_SIZE); BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); + /* Do not send cong updates to IB loopback */ + if (conn->c_loopback + && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + scat = &rm->data.op_sg[sg]; + ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length); + return sizeof(struct rds_header) + ret; + } + /* FIXME we may overallocate here */ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) i = 1; @@ -496,14 +570,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, goto out; } - credit_alloc = work_alloc; if (ic->i_flowctl) { credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); adv_credits += posted; if (credit_alloc < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); work_alloc = credit_alloc; - flow_controlled++; + flow_controlled = 1; } if (work_alloc == 0) { set_bit(RDS_LL_SEND_FULL, &conn->c_flags); @@ -514,31 +587,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } /* map the message the first time we see it */ - if (ic->i_rm == NULL) { - /* - printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", - be16_to_cpu(rm->m_inc.i_hdr.h_dport), - rm->m_inc.i_hdr.h_flags, - be32_to_cpu(rm->m_inc.i_hdr.h_len)); - */ - if (rm->m_nents) 
{ - rm->m_count = ib_dma_map_sg(dev, - rm->m_sg, rm->m_nents, DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); - if (rm->m_count == 0) { + if (!ic->i_data_op) { + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); ret = -ENOMEM; /* XXX ? */ goto out; } } else { - rm->m_count = 0; + rm->data.op_count = 0; } - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; rds_message_addref(rm); - ic->i_rm = rm; + ic->i_data_op = &rm->data; /* Finalize the header */ if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) @@ -548,10 +615,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->m_rdma_op) { + if (rm->rdma.op_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -571,18 +638,12 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* * Update adv_credits since we reset the ACK_REQUIRED bit. 
*/ - rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); - adv_credits += posted; - BUG_ON(adv_credits > 255); - } else if (ic->i_rm != rm) - BUG(); - - send = &ic->i_sends[pos]; - first = send; - prev = NULL; - scat = &rm->m_sg[sg]; - sent = 0; - i = 0; + if (ic->i_flowctl) { + rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } + } /* Sometimes you want to put a fence between an RDMA * READ and the following SEND. @@ -590,81 +651,64 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + if (rm->rdma.op_active && rm->rdma.op_fence) send_flags = IB_SEND_FENCE; - /* - * We could be copying the header into the unused tail of the page. - * That would need to be changed in the future when those pages might - * be mapped userspace pages or page cache pages. So instead we always - * use a second sge and our long-lived ring of mapped headers. We send - * the header after the data so that the data payload can be aligned on - * the receiver. - */ + /* Each frag gets a header. 
Msgs may be 0 bytes */ + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &ic->i_data_op->op_sg[sg]; + i = 0; + do { + unsigned int len = 0; - /* handle a 0-len message */ - if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { - rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); - goto add_header; - } + /* Set up the header */ + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; - /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { - unsigned int len; + send->s_sge[0].addr = ic->i_send_hdrs_dma + + (pos * sizeof(struct rds_header)); + send->s_sge[0].length = sizeof(struct rds_header); - send = &ic->i_sends[pos]; + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); - len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); - rds_ib_xmit_populate_wr(ic, send, pos, - ib_sg_dma_address(dev, scat) + off, len, - send_flags); + /* Set up the data, if present */ + if (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]) { + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + send->s_wr.num_sge = 2; - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time - * on the wire. 
- */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - } + send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; + send->s_sge[1].length = len; - ic->i_unsignaled_bytes -= len; - if (ic->i_unsignaled_bytes <= 0) { - ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + bytes_sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } } + rds_ib_set_wr_signal_state(ic, send, 0); + /* * Always signal the last one if we're stopping due to flow control. */ - if (flow_controlled && i == (work_alloc-1)) + if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; + rdsdebug("send %p wr %p num_sge %u next %p\n", send, &send->s_wr, send->s_wr.num_sge, send->s_wr.next); - sent += len; - off += len; - if (off == ib_sg_dma_len(dev, scat)) { - scat++; - off = 0; - } - -add_header: - /* Tack on the header after the data. The header SGE should already - * have been set up to point to the right header buffer. */ - memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); - - if (0) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; - - printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", - be16_to_cpu(hdr->h_dport), - hdr->h_flags, - be32_to_cpu(hdr->h_len)); - } - if (adv_credits) { + if (ic->i_flowctl && adv_credits) { struct rds_header *hdr = &ic->i_send_hdrs[pos]; /* add credit and redo the header checksum */ @@ -679,20 +723,25 @@ add_header: prev = send; pos = (pos + 1) % ic->i_send_ring.w_nr; - } + send = &ic->i_sends[pos]; + i++; + + } while (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]); /* Account the RDS header in the number of bytes we sent, but just once. 
* The caller has no concept of fragmentation. */ if (hdr_off == 0) - sent += sizeof(struct rds_header); + bytes_sent += sizeof(struct rds_header); /* if we finished the message then send completion owns it */ - if (scat == &rm->m_sg[rm->m_count]) { - prev->s_rm = ic->i_rm; - prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - ic->i_rm = NULL; + if (scat == &rm->data.op_sg[rm->data.op_count]) { + prev->s_op = ic->i_data_op; + prev->s_wr.send_flags |= IB_SEND_SOLICITED; + ic->i_data_op = NULL; } + /* Put back wrs & credits we didn't use */ if (i < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); work_alloc = i; @@ -700,6 +749,9 @@ add_header: if (ic->i_flowctl && i < credit_alloc) rds_ib_send_add_credits(conn, credit_alloc - i); + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + /* XXX need to worry about failed_wr and partial sends. */ failed_wr = &first->s_wr; ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); @@ -710,32 +762,127 @@ add_header: printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); - if (prev->s_rm) { - ic->i_rm = prev->s_rm; - prev->s_rm = NULL; + rds_ib_sub_signaled(ic, nr_sig); + if (prev->s_op) { + ic->i_data_op = prev->s_op; + prev->s_op = NULL; } - /* Finesse this later */ - BUG(); + + rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); goto out; } - ret = sent; + ret = bytes_sent; out: BUG_ON(adv_credits); return ret; } -int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +/* + * Issue atomic operation. + * A simplified version of the rdma case, we always map 1 SG, and + * only 8 bytes, for the return value from the atomic operation. 
+ */ +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct ib_send_wr *failed_wr; + struct rds_ib_device *rds_ibdev; + u32 pos; + u32 work_alloc; + int ret; + int nr_sig = 0; + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); + if (work_alloc != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + /* address of send request in ring */ + send = &ic->i_sends[pos]; + send->s_queued = jiffies; + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; + send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; + send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; + send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; + } else { /* FADD */ + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; + send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; + send->s_wr.wr.atomic.swap = 0; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; + send->s_wr.wr.atomic.swap_mask = 0; + } + nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; + send->s_wr.wr.atomic.rkey = op->op_rkey; + send->s_op = op; + rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); + + /* map 8 byte retval buffer to the device */ + ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); + rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); + if (ret != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? 
*/ + goto out; + } + + /* Convert our struct scatterlist to struct ib_sge */ + send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].lkey = ic->i_mr->lkey; + + rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, + send->s_sge[0].addr, send->s_sge[0].length); + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + failed_wr = &send->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); + rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, + send, &send->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &send->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + goto out; + } + + if (unlikely(failed_wr != &send->s_wr)) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &send->s_wr); + } + +out: + return ret; +} + +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_send_work *send = NULL; struct rds_ib_send_work *first; struct rds_ib_send_work *prev; struct ib_send_wr *failed_wr; - struct rds_ib_device *rds_ibdev; struct scatterlist *scat; unsigned long len; - u64 remote_addr = op->r_remote_addr; + u64 remote_addr = op->op_remote_addr; + u32 max_sge = ic->rds_ibdev->max_sge; u32 pos; u32 work_alloc; u32 i; @@ -743,29 +890,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) int sent; int ret; int num_sge; - - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); - - /* map the message the first time we see it */ - if (!op->r_mapped) { - op->r_count = ib_dma_map_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, (op->r_write) ? 
- DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); - if (op->r_count == 0) { + int nr_sig = 0; + + /* map the op the first time we see it */ + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); ret = -ENOMEM; /* XXX ? */ goto out; } - op->r_mapped = 1; + op->op_mapped = 1; } /* * Instead of knowing how to return a partial rdma read/write we insist that there * be enough work requests to send the entire message. */ - i = ceil(op->r_count, rds_ibdev->max_sge); + i = ceil(op->op_count, max_sge); work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc != i) { @@ -778,30 +924,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &op->r_sg[0]; + scat = &op->op_sg[0]; sent = 0; - num_sge = op->r_count; + num_sge = op->op_count; - for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { send->s_wr.send_flags = 0; send->s_queued = jiffies; - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - send->s_wr.send_flags = IB_SEND_SIGNALED; - } + send->s_op = NULL; + + nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); - send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; + send->s_wr.opcode = op->op_write ? 
IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; send->s_wr.wr.rdma.remote_addr = remote_addr; - send->s_wr.wr.rdma.rkey = op->r_key; - send->s_op = op; + send->s_wr.wr.rdma.rkey = op->op_rkey; - if (num_sge > rds_ibdev->max_sge) { - send->s_wr.num_sge = rds_ibdev->max_sge; - num_sge -= rds_ibdev->max_sge; + if (num_sge > max_sge) { + send->s_wr.num_sge = max_sge; + num_sge -= max_sge; } else { send->s_wr.num_sge = num_sge; } @@ -811,7 +951,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) if (prev) prev->s_wr.next = &send->s_wr; - for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = ib_sg_dma_len(ic->i_cm_id->device, scat); send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); @@ -833,15 +973,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) send = ic->i_sends; } - /* if we finished the message then send completion owns it */ - if (scat == &op->r_sg[op->r_count]) - prev->s_wr.send_flags = IB_SEND_SIGNALED; + /* give a reference to the last op */ + if (scat == &op->op_sg[op->op_count]) { + prev->s_op = op; + rds_message_addref(container_of(op, struct rds_message, rdma)); + } if (i < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); work_alloc = i; } + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + failed_wr = &first->s_wr; ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, @@ -851,6 +996,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); goto out; } diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index d2c904dd6fb..2d5965d6e97 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -67,6 
+67,8 @@ static const char *const rds_ib_stat_names[] = { "ib_rdma_mr_pool_flush", "ib_rdma_mr_pool_wait", "ib_rdma_mr_pool_depleted", + "ib_atomic_cswp", + "ib_atomic_fadd", }; unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index 03f01cb4e0f..e4e41b3afce 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16; static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; -unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20); -static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1; -static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; - /* * This sysctl does nothing. * @@ -65,7 +61,7 @@ static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; */ unsigned int rds_ib_sysctl_flow_control = 0; -ctl_table rds_ib_sysctl_table[] = { +static struct ctl_table rds_ib_sysctl_table[] = { { .procname = "max_send_wr", .data = &rds_ib_sysctl_max_send_wr, @@ -94,15 +90,6 @@ ctl_table rds_ib_sysctl_table[] = { .extra2 = &rds_ib_sysctl_max_unsig_wr_max, }, { - .procname = "max_unsignaled_bytes", - .data = &rds_ib_sysctl_max_unsig_bytes, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_ib_sysctl_max_unsig_bytes_min, - .extra2 = &rds_ib_sysctl_max_unsig_bytes_max, - }, - { .procname = "max_recv_allocation", .data = &rds_ib_sysctl_max_recv_allocation, .maxlen = sizeof(unsigned long), @@ -119,23 +106,16 @@ ctl_table rds_ib_sysctl_table[] = { { } }; -static struct ctl_path rds_ib_sysctl_path[] = { - { .procname = "net", }, - { .procname = "rds", }, - { .procname = "ib", }, - { } -}; - void rds_ib_sysctl_exit(void) { if (rds_ib_sysctl_hdr) - unregister_sysctl_table(rds_ib_sysctl_hdr); + unregister_net_sysctl_table(rds_ib_sysctl_hdr); } -int __init rds_ib_sysctl_init(void) +int rds_ib_sysctl_init(void) { - 
rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); - if (rds_ib_sysctl_hdr == NULL) + rds_ib_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/ib", rds_ib_sysctl_table); + if (!rds_ib_sysctl_hdr) return -ENOMEM; return 0; } diff --git a/net/rds/info.c b/net/rds/info.c index 814a91a6f4a..9a6b4f66187 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -32,7 +32,9 @@ */ #include <linux/percpu.h> #include <linux/seq_file.h> +#include <linux/slab.h> #include <linux/proc_fs.h> +#include <linux/export.h> #include "rds.h" @@ -75,7 +77,7 @@ void rds_info_register_func(int optname, rds_info_func func) BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); spin_lock(&rds_info_lock); - BUG_ON(rds_info_funcs[offset] != NULL); + BUG_ON(rds_info_funcs[offset]); rds_info_funcs[offset] = func; spin_unlock(&rds_info_lock); } @@ -101,8 +103,8 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func); */ void rds_info_iter_unmap(struct rds_info_iterator *iter) { - if (iter->addr != NULL) { - kunmap_atomic(iter->addr, KM_USER0); + if (iter->addr) { + kunmap_atomic(iter->addr); iter->addr = NULL; } } @@ -116,8 +118,8 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data, unsigned long this; while (bytes) { - if (iter->addr == NULL) - iter->addr = kmap_atomic(*iter->pages, KM_USER0); + if (!iter->addr) + iter->addr = kmap_atomic(*iter->pages); this = min(bytes, PAGE_SIZE - iter->offset); @@ -132,7 +134,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data, iter->offset += this; if (iter->offset == PAGE_SIZE) { - kunmap_atomic(iter->addr, KM_USER0); + kunmap_atomic(iter->addr); iter->addr = NULL; iter->offset = 0; iter->pages++; @@ -187,7 +189,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, >> PAGE_SHIFT; pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { + if (!pages) { ret = -ENOMEM; goto out; } @@ -205,7 +207,7 @@ int rds_info_getsockopt(struct socket *sock, int 
optname, char __user *optval, call_func: func = rds_info_funcs[optname - RDS_INFO_FIRST]; - if (func == NULL) { + if (!func) { ret = -ENOPROTOOPT; goto out; } @@ -233,7 +235,7 @@ call_func: ret = -EFAULT; out: - for (i = 0; pages != NULL && i < nr_pages; i++) + for (i = 0; pages && i < nr_pages; i++) put_page(pages[i]); kfree(pages); diff --git a/net/rds/iw.c b/net/rds/iw.c index b28fa8525b2..589935661d6 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -37,6 +37,8 @@ #include <linux/inetdevice.h> #include <linux/if_arp.h> #include <linux/delay.h> +#include <linux/slab.h> +#include <linux/module.h> #include "rds.h" #include "iw.h" @@ -55,7 +57,7 @@ struct list_head rds_iw_devices; DEFINE_SPINLOCK(iw_nodev_conns_lock); LIST_HEAD(iw_nodev_conns); -void rds_iw_add_one(struct ib_device *device) +static void rds_iw_add_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; struct ib_device_attr *dev_attr; @@ -123,7 +125,7 @@ free_attr: kfree(dev_attr); } -void rds_iw_remove_one(struct ib_device *device) +static void rds_iw_remove_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; struct rds_iw_cm_id *i_cm_id, *next; @@ -225,7 +227,7 @@ static int rds_iw_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); @@ -237,7 +239,8 @@ static int rds_iw_laddr_check(__be32 addr) ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); /* due to this, we will claim to support IB devices unless we check node_type. 
*/ - if (ret || cm_id->device->node_type != RDMA_NODE_RNIC) + if (ret || !cm_id->device || + cm_id->device->node_type != RDMA_NODE_RNIC) ret = -EADDRNOTAVAIL; rdsdebug("addr %pI4 ret %d node type %d\n", @@ -263,7 +266,6 @@ struct rds_transport rds_iw_transport = { .laddr_check = rds_iw_laddr_check, .xmit_complete = rds_iw_xmit_complete, .xmit = rds_iw_xmit, - .xmit_cong_map = NULL, .xmit_rdma = rds_iw_xmit_rdma, .recv = rds_iw_recv, .conn_alloc = rds_iw_conn_alloc, @@ -271,7 +273,6 @@ struct rds_transport rds_iw_transport = { .conn_connect = rds_iw_conn_connect, .conn_shutdown = rds_iw_conn_shutdown, .inc_copy_to_user = rds_iw_inc_copy_to_user, - .inc_purge = rds_iw_inc_purge, .inc_free = rds_iw_inc_free, .cm_initiate_connect = rds_iw_cm_initiate_connect, .cm_handle_connect = rds_iw_cm_handle_connect, @@ -288,7 +289,7 @@ struct rds_transport rds_iw_transport = { .t_prefer_loopback = 1, }; -int __init rds_iw_init(void) +int rds_iw_init(void) { int ret; diff --git a/net/rds/iw.h b/net/rds/iw.h index eef2f0c2847..04ce3b193f7 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h @@ -1,6 +1,7 @@ #ifndef _RDS_IW_H #define _RDS_IW_H +#include <linux/interrupt.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include "rds.h" @@ -70,7 +71,7 @@ struct rds_iw_send_work { struct rds_message *s_rm; /* We should really put these into a union: */ - struct rds_rdma_op *s_op; + struct rm_rdma_op *s_op; struct rds_iw_mapping *s_mapping; struct ib_mr *s_mr; struct ib_fast_reg_page_list *s_page_list; @@ -268,8 +269,6 @@ static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) /* ib.c */ extern struct rds_transport rds_iw_transport; -extern void rds_iw_add_one(struct ib_device *device); -extern void rds_iw_remove_one(struct ib_device *device); extern struct ib_client rds_iw_client; extern unsigned int fastreg_pool_size; @@ -284,7 +283,7 @@ void rds_iw_conn_free(void *arg); int rds_iw_conn_connect(struct rds_connection *conn); void rds_iw_conn_shutdown(struct rds_connection 
*conn); void rds_iw_state_change(struct sock *sk); -int __init rds_iw_listen_init(void); +int rds_iw_listen_init(void); void rds_iw_listen_stop(void); void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, @@ -318,15 +317,13 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, void rds_iw_sync_mr(void *trans_private, int dir); void rds_iw_free_mr(void *trans_private, int invalidate); void rds_iw_flush_mrs(void); -void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); /* ib_recv.c */ -int __init rds_iw_recv_init(void); +int rds_iw_recv_init(void); void rds_iw_recv_exit(void); int rds_iw_recv(struct rds_connection *conn); int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, gfp_t page_gfp, int prefill); -void rds_iw_inc_purge(struct rds_incoming *inc); void rds_iw_inc_free(struct rds_incoming *inc); int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); @@ -358,7 +355,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); void rds_iw_send_init_ring(struct rds_iw_connection *ic); void rds_iw_send_clear_ring(struct rds_iw_connection *ic); -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, @@ -371,7 +368,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); /* ib_sysctl.c */ -int __init rds_iw_sysctl_init(void); +int rds_iw_sysctl_init(void); void rds_iw_sysctl_exit(void); extern unsigned long rds_iw_sysctl_max_send_wr; extern unsigned long 
rds_iw_sysctl_max_recv_wr; @@ -379,7 +376,6 @@ extern unsigned long rds_iw_sysctl_max_unsig_wrs; extern unsigned long rds_iw_sysctl_max_unsig_bytes; extern unsigned long rds_iw_sysctl_max_recv_allocation; extern unsigned int rds_iw_sysctl_flow_control; -extern ctl_table rds_iw_sysctl_table[]; /* * Helper functions for getting/setting the header and data SGEs in diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index 394cf6b4d0a..a91e1db62ee 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -32,7 +32,9 @@ */ #include <linux/kernel.h> #include <linux/in.h> +#include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/ratelimit.h> #include "rds.h" #include "iw.h" @@ -156,9 +158,11 @@ static void rds_iw_qp_event_handler(struct ib_event *event, void *data) case IB_EVENT_QP_REQ_ERR: case IB_EVENT_QP_FATAL: default: - rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n", + rdsdebug("Fatal QP Event %u " + "- connection %pI4->%pI4, reconnecting\n", event->event, &conn->c_laddr, &conn->c_faddr); + rds_conn_drop(conn); break; } } @@ -178,7 +182,7 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, unsigned int send_size, recv_size; int ret; - /* The offset of 1 is to accomodate the additional ACK WR. */ + /* The offset of 1 is to accommodate the additional ACK WR. */ send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); rds_iw_ring_resize(send_ring, send_size - 1); @@ -254,9 +258,8 @@ static int rds_iw_setup_qp(struct rds_connection *conn) * the rds_iwdev at all. 
*/ rds_iwdev = ib_get_client_data(dev, &rds_iw_client); - if (rds_iwdev == NULL) { - if (printk_ratelimit()) - printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", + if (!rds_iwdev) { + printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n", dev->name); return -EOPNOTSUPP; } @@ -289,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_send_ring.w_nr * sizeof(struct rds_header), &ic->i_send_hdrs_dma, GFP_KERNEL); - if (ic->i_send_hdrs == NULL) { + if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent send failed\n"); goto out; @@ -299,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_recv_ring.w_nr * sizeof(struct rds_header), &ic->i_recv_hdrs_dma, GFP_KERNEL); - if (ic->i_recv_hdrs == NULL) { + if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent recv failed\n"); goto out; @@ -307,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), &ic->i_ack_dma, GFP_KERNEL); - if (ic->i_ack == NULL) { + if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent ack failed\n"); goto out; } ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); - if (ic->i_sends == NULL) { + if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); goto out; @@ -322,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) rds_iw_send_init_ring(ic); ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); - if (ic->i_recvs == NULL) { + if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); goto out; @@ -362,13 +365,12 @@ static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp) version = RDS_PROTOCOL_3_0; while ((common >>= 1) != 0) version++; - } else if (printk_ratelimit()) { - printk(KERN_NOTICE "RDS: Connection from %pI4 using " + } + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 
using " "incompatible protocol version %u.%u\n", &dp->dp_saddr, dp->dp_protocol_major, dp->dp_protocol_minor); - } return version; } @@ -449,6 +451,7 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, err = rds_iw_setup_qp(conn); if (err) { rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); + mutex_unlock(&conn->c_cm_lock); goto out; } @@ -518,7 +521,7 @@ int rds_iw_conn_connect(struct rds_connection *conn) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP); + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); ic->i_cm_id = NULL; @@ -691,8 +694,8 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) unsigned long flags; /* XXX too lazy? */ - ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); - if (ic == NULL) + ic = kzalloc(sizeof(struct rds_iw_connection), gfp); + if (!ic) return -ENOMEM; INIT_LIST_HEAD(&ic->iw_node); diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index 9eda11cca95..a817705ce2d 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -31,9 +31,10 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/ratelimit.h> #include "rds.h" -#include "rdma.h" #include "iw.h" @@ -83,7 +84,8 @@ static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, struct list_head *unmap_list, - struct list_head *kill_list); + struct list_head *kill_list, + int *unpinned); static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id) @@ -122,7 +124,7 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd #else /* FIXME - needs to 
compare the local and remote * ipaddr/port tuple, but the ipaddr is the only - * available infomation in the rds_sock (as the rest are + * available information in the rds_sock (as the rest are * zero'ed. It doesn't appear to be properly populated * during connection setup... */ @@ -157,7 +159,8 @@ static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id * return 0; } -void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) +static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, + struct rdma_cm_id *cm_id) { struct rds_iw_cm_id *i_cm_id; @@ -206,9 +209,9 @@ void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *con BUG_ON(list_empty(&ic->iw_node)); list_del(&ic->iw_node); - spin_lock_irq(&rds_iwdev->spinlock); + spin_lock(&rds_iwdev->spinlock); list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); - spin_unlock_irq(&rds_iwdev->spinlock); + spin_unlock(&rds_iwdev->spinlock); spin_unlock_irq(&iw_nodev_conns_lock); ic->rds_iwdev = rds_iwdev; @@ -474,17 +477,6 @@ void rds_iw_sync_mr(void *trans_private, int direction) } } -static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all) -{ - unsigned int item_count; - - item_count = atomic_read(&pool->item_count); - if (free_all) - return item_count; - - return 0; -} - /* * Flush our pool of MRs. * At a minimum, all currently unused MRs are unmapped. 
@@ -497,7 +489,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) LIST_HEAD(unmap_list); LIST_HEAD(kill_list); unsigned long flags; - unsigned int nfreed = 0, ncleaned = 0, free_goal; + unsigned int nfreed = 0, ncleaned = 0, unpinned = 0; int ret = 0; rds_iw_stats_inc(s_iw_rdma_mr_pool_flush); @@ -511,8 +503,6 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) list_splice_init(&pool->clean_list, &kill_list); spin_unlock_irqrestore(&pool->list_lock, flags); - free_goal = rds_iw_flush_goal(pool, free_all); - /* Batched invalidate of dirty MRs. * For FMR based MRs, the mappings on the unmap list are * actually members of an ibmr (ibmr->mapping). They either @@ -522,7 +512,8 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) * will be destroyed by the unmap function. */ if (!list_empty(&unmap_list)) { - ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list); + ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, + &kill_list, &unpinned); /* If we've been asked to destroy all MRs, move those * that were simply cleaned to the kill list */ if (free_all) @@ -546,6 +537,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) spin_unlock_irqrestore(&pool->list_lock, flags); } + atomic_sub(unpinned, &pool->free_pinned); atomic_sub(ncleaned, &pool->dirty_count); atomic_sub(nfreed, &pool->item_count); @@ -728,8 +720,8 @@ static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping) failed_wr = &f_wr; ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr); BUG_ON(failed_wr != &f_wr); - if (ret && printk_ratelimit()) - printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", + if (ret) + printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", __func__, __LINE__, ret); return ret; } @@ -750,8 +742,8 @@ static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr) failed_wr = &s_wr; ret = ib_post_send(ibmr->cm_id->qp, &s_wr, 
&failed_wr); - if (ret && printk_ratelimit()) { - printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", + if (ret) { + printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", __func__, __LINE__, ret); goto out; } @@ -826,7 +818,8 @@ static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, struct list_head *unmap_list, - struct list_head *kill_list) + struct list_head *kill_list, + int *unpinned) { struct rds_iw_mapping *mapping, *next; unsigned int ncleaned = 0; @@ -853,6 +846,7 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, spin_lock_irqsave(&pool->list_lock, flags); list_for_each_entry_safe(mapping, next, unmap_list, m_list) { + *unpinned += mapping->m_sg.len; list_move(&mapping->m_list, &laundered); ncleaned++; } diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 54af7d6b92d..aa8bf678600 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -31,6 +31,7 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/pci.h> #include <linux/dma-mapping.h> #include <rdma/rdma_cm.h> @@ -52,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag) static void rds_iw_frag_free(struct rds_page_frag *frag) { rdsdebug("frag %p page %p\n", frag, frag->f_page); - BUG_ON(frag->f_page != NULL); + BUG_ON(frag->f_page); kmem_cache_free(rds_iw_frag_slab, frag); } @@ -142,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, struct ib_sge *sge; int ret = -ENOMEM; - if (recv->r_iwinc == NULL) { + if (!recv->r_iwinc) { if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { rds_iw_stats_inc(s_iw_rx_alloc_limit); goto out; } recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, kptr_gfp); - if (recv->r_iwinc == NULL) { + if (!recv->r_iwinc) { atomic_dec(&rds_iw_allocation); goto out; } @@ -157,17 +158,17 @@ static int rds_iw_recv_refill_one(struct 
rds_connection *conn, rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); } - if (recv->r_frag == NULL) { + if (!recv->r_frag) { recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); - if (recv->r_frag == NULL) + if (!recv->r_frag) goto out; INIT_LIST_HEAD(&recv->r_frag->f_item); recv->r_frag->f_page = NULL; } - if (ic->i_frag.f_page == NULL) { + if (!ic->i_frag.f_page) { ic->i_frag.f_page = alloc_page(page_gfp); - if (ic->i_frag.f_page == NULL) + if (!ic->i_frag.f_page) goto out; ic->i_frag.f_offset = 0; } @@ -272,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, return ret; } -void rds_iw_inc_purge(struct rds_incoming *inc) +static void rds_iw_inc_purge(struct rds_incoming *inc) { struct rds_iw_incoming *iwinc; struct rds_page_frag *frag; @@ -428,7 +429,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -436,7 +437,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, static u64 rds_iw_get_ack(struct rds_iw_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } @@ -468,8 +469,8 @@ static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credi set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); rds_iw_stats_inc(s_iw_ack_send_failure); - /* Need to finesse this later. */ - BUG(); + + rds_iw_conn_error(ic->conn, "sending ack failed\n"); } else rds_iw_stats_inc(s_iw_ack_sent); } @@ -597,7 +598,7 @@ static void rds_iw_cong_recv(struct rds_connection *conn, to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); BUG_ON(to_copy & 7); /* Must be 64bit aligned. 
*/ - addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); + addr = kmap_atomic(frag->f_page); src = addr + frag_off; dst = (void *)map->m_page_addrs[map_page] + map_off; @@ -607,7 +608,7 @@ static void rds_iw_cong_recv(struct rds_connection *conn, uncongested |= ~(*src) & *dst; *dst++ = *src++; } - kunmap_atomic(addr, KM_SOFTIRQ0); + kunmap_atomic(addr); copied += to_copy; @@ -660,7 +661,7 @@ static void rds_iw_process_recv(struct rds_connection *conn, if (byte_len < sizeof(struct rds_header)) { rds_iw_conn_error(conn, "incoming message " - "from %pI4 didn't inclue a " + "from %pI4 didn't include a " "header, disconnecting and " "reconnecting\n", &conn->c_faddr); @@ -715,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn, * into the inc and save the inc so we can hang upcoming fragments * off its list. */ - if (iwinc == NULL) { + if (!iwinc) { iwinc = recv->r_iwinc; recv->r_iwinc = NULL; ic->i_iwinc = iwinc; @@ -753,8 +754,7 @@ static void rds_iw_process_recv(struct rds_connection *conn, rds_iw_cong_recv(conn, iwinc); else { rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, - &iwinc->ii_inc, GFP_ATOMIC, - KM_SOFTIRQ0); + &iwinc->ii_inc, GFP_ATOMIC); state->ack_next = be64_to_cpu(hdr->h_sequence); state->ack_next_valid = 1; } @@ -886,7 +886,7 @@ int rds_iw_recv(struct rds_connection *conn) return ret; } -int __init rds_iw_recv_init(void) +int rds_iw_recv_init(void) { struct sysinfo si; int ret = -ENOMEM; @@ -898,13 +898,13 @@ int __init rds_iw_recv_init(void) rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", sizeof(struct rds_iw_incoming), 0, 0, NULL); - if (rds_iw_incoming_slab == NULL) + if (!rds_iw_incoming_slab) goto out; rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", sizeof(struct rds_page_frag), 0, 0, NULL); - if (rds_iw_frag_slab == NULL) + if (!rds_iw_frag_slab) kmem_cache_destroy(rds_iw_incoming_slab); else ret = 0; diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 1379e9d66a7..9105ea03aec 100644 --- 
a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -34,9 +34,9 @@ #include <linux/in.h> #include <linux/device.h> #include <linux/dmapool.h> +#include <linux/ratelimit.h> #include "rds.h" -#include "rdma.h" #include "iw.h" static void rds_iw_send_rdma_complete(struct rds_message *rm, @@ -64,13 +64,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm, } static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, - struct rds_rdma_op *op) + struct rm_rdma_op *op) { - if (op->r_mapped) { + if (op->op_mapped) { ib_dma_unmap_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, - op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->r_mapped = 0; + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; } } @@ -83,11 +83,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, rdsdebug("ic %p send %p rm %p\n", ic, send, rm); ib_dma_unmap_sg(ic->i_cm_id->device, - rm->m_sg, rm->m_nents, + rm->data.op_sg, rm->data.op_nents, DMA_TO_DEVICE); - if (rm->m_rdma_op != NULL) { - rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); + if (rm->rdma.op_active) { + rds_iw_send_unmap_rdma(ic, &rm->rdma); /* If the user asked for a completion notification on this * message, we can implement three different semantics: @@ -111,10 +111,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, */ rds_iw_send_rdma_complete(rm, wc_status); - if (rm->m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); + if (rm->rdma.op_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); else - rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); } /* If anyone waited for this message to get flushed out, wake @@ -232,7 +232,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) } if (wc.wr_id == RDS_IW_ACK_WR_ID) { - if (ic->i_ack_queued + HZ/2 < jiffies) + if (time_after(jiffies, ic->i_ack_queued + HZ/2)) 
rds_iw_stats_inc(s_iw_tx_stalled); rds_iw_ack_send_complete(ic); continue; @@ -259,8 +259,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) * when the SEND completes. */ break; default: - if (printk_ratelimit()) - printk(KERN_NOTICE + printk_ratelimited(KERN_NOTICE "RDS/IW: %s: unexpected opcode 0x%x in WR!\n", __func__, send->s_wr.opcode); break; @@ -268,7 +267,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) send->s_wr.opcode = 0xdead; send->s_wr.num_sge = 1; - if (send->s_queued + HZ/2 < jiffies) + if (time_after(jiffies, send->s_queued + HZ/2)) rds_iw_stats_inc(s_iw_tx_stalled); /* If a RDMA operation produced an error, signal this right @@ -308,7 +307,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) * * Conceptually, we have two counters: * - send credits: this tells us how many WRs we're allowed - * to submit without overruning the reciever's queue. For + * to submit without overruning the receiver's queue. For * each SEND WR we post, we decrement this by one. 
* * - posted credits: this tells us how many WRs we recently @@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, } /* map the message the first time we see it */ - if (ic->i_rm == NULL) { + if (!ic->i_rm) { /* printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", be16_to_cpu(rm->m_inc.i_hdr.h_dport), rm->m_inc.i_hdr.h_flags, be32_to_cpu(rm->m_inc.i_hdr.h_len)); */ - if (rm->m_nents) { - rm->m_count = ib_dma_map_sg(dev, - rm->m_sg, rm->m_nents, DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); - if (rm->m_count == 0) { + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); ret = -ENOMEM; /* XXX ? */ goto out; } } else { - rm->m_count = 0; + rm->data.op_count = 0; } ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; @@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. 
*/ - if (rm->m_rdma_op) { + if (rm->rdma.op_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -616,13 +617,12 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); adv_credits += posted; BUG_ON(adv_credits > 255); - } else if (ic->i_rm != rm) - BUG(); + } send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &rm->m_sg[sg]; + scat = &rm->data.op_sg[sg]; sent = 0; i = 0; @@ -632,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + if (rm->rdma.op_active && rm->rdma.op_fence) send_flags = IB_SEND_FENCE; /* @@ -651,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, } /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { + for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) { unsigned int len; send = &ic->i_sends[pos]; @@ -729,7 +729,7 @@ add_header: sent += sizeof(struct rds_header); /* if we finished the message then send completion owns it */ - if (scat == &rm->m_sg[rm->m_count]) { + if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_rm = ic->i_rm; prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; ic->i_rm = NULL; @@ -785,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); } -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) { struct rds_iw_connection *ic = 
conn->c_transport_data; struct rds_iw_send_work *send = NULL; @@ -795,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) struct rds_iw_device *rds_iwdev; struct scatterlist *scat; unsigned long len; - u64 remote_addr = op->r_remote_addr; + u64 remote_addr = op->op_remote_addr; u32 pos, fr_pos; u32 work_alloc; u32 i; @@ -807,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); /* map the message the first time we see it */ - if (!op->r_mapped) { - op->r_count = ib_dma_map_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, (op->r_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); - if (op->r_count == 0) { + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); ret = -ENOMEM; /* XXX ? */ goto out; } - op->r_mapped = 1; + op->op_mapped = 1; } - if (!op->r_write) { + if (!op->op_write) { /* Alloc space on the send queue for the fastreg */ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); if (work_alloc != 1) { @@ -836,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * Instead of knowing how to return a partial rdma read/write we insist that there * be enough work requests to send the entire message. 
*/ - i = ceil(op->r_count, rds_iwdev->max_sge); + i = ceil(op->op_count, rds_iwdev->max_sge); work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc != i) { @@ -847,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) } send = &ic->i_sends[pos]; - if (!op->r_write) { + if (!op->op_write) { first = prev = &ic->i_sends[fr_pos]; } else { first = send; prev = NULL; } - scat = &op->r_sg[0]; + scat = &op->op_sg[0]; sent = 0; - num_sge = op->r_count; + num_sge = op->op_count; - for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { send->s_wr.send_flags = 0; send->s_queued = jiffies; @@ -874,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * for local access after RDS is finished with it, using * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. */ - if (op->r_write) + if (op->op_write) send->s_wr.opcode = IB_WR_RDMA_WRITE; else send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; send->s_wr.wr.rdma.remote_addr = remote_addr; - send->s_wr.wr.rdma.rkey = op->r_key; + send->s_wr.wr.rdma.rkey = op->op_rkey; send->s_op = op; if (num_sge > rds_iwdev->max_sge) { @@ -894,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) if (prev) prev->s_wr.next = &send->s_wr; - for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = ib_sg_dma_len(ic->i_cm_id->device, scat); if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) @@ -928,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) } /* if we finished the message then send completion owns it */ - if (scat == &op->r_sg[op->r_count]) + if (scat == &op->op_sg[op->op_count]) first->s_wr.send_flags = IB_SEND_SIGNALED; if (i < work_alloc) { @@ -942,9 +942,9 @@ int 
rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * adapters do not allow using the lkey for this at all. To bypass this use a * fastreg_mr (or possibly a dma_mr) */ - if (!op->r_write) { + if (!op->op_write) { rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], - op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); + op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); work_alloc++; } diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c index 1c4428a61a0..139239d2cb2 100644 --- a/net/rds/iw_sysctl.c +++ b/net/rds/iw_sysctl.c @@ -55,7 +55,7 @@ static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; unsigned int rds_iw_sysctl_flow_control = 1; -ctl_table rds_iw_sysctl_table[] = { +static struct ctl_table rds_iw_sysctl_table[] = { { .procname = "max_send_wr", .data = &rds_iw_sysctl_max_send_wr, @@ -109,23 +109,15 @@ ctl_table rds_iw_sysctl_table[] = { { } }; -static struct ctl_path rds_iw_sysctl_path[] = { - { .procname = "net", }, - { .procname = "rds", }, - { .procname = "iw", }, - { } -}; - void rds_iw_sysctl_exit(void) { - if (rds_iw_sysctl_hdr) - unregister_sysctl_table(rds_iw_sysctl_hdr); + unregister_net_sysctl_table(rds_iw_sysctl_hdr); } -int __init rds_iw_sysctl_init(void) +int rds_iw_sysctl_init(void) { - rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); - if (rds_iw_sysctl_hdr == NULL) + rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table); + if (!rds_iw_sysctl_hdr) return -ENOMEM; return 0; } diff --git a/net/rds/loop.c b/net/rds/loop.c index 4a61997f554..6b12b68541a 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -31,6 +31,7 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/in.h> #include "rds.h" @@ -60,39 +61,42 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { + struct scatterlist *sgp = &rm->data.op_sg[sg]; + int ret = 
sizeof(struct rds_header) + + be32_to_cpu(rm->m_inc.i_hdr.h_len); + + /* Do not send cong updates to loopback */ + if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off); + goto out; + } + BUG_ON(hdr_off || sg || off); rds_inc_init(&rm->m_inc, conn, conn->c_laddr); - rds_message_addref(rm); /* for the inc */ + /* For the embedded inc. Matching put is in loop_inc_free() */ + rds_message_addref(rm); rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, - GFP_KERNEL, KM_USER0); + GFP_KERNEL); rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), NULL); rds_inc_put(&rm->m_inc); - - return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); +out: + return ret; } -static int rds_loop_xmit_cong_map(struct rds_connection *conn, - struct rds_cong_map *map, - unsigned long offset) +/* + * See rds_loop_xmit(). Since our inc is embedded in the rm, we + * make sure the rm lives at least until the inc is done. 
+ */ +static void rds_loop_inc_free(struct rds_incoming *inc) { - unsigned long i; - - BUG_ON(offset); - BUG_ON(map != conn->c_lcong); - - for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { - memcpy((void *)conn->c_fcong->m_page_addrs[i], - (void *)map->m_page_addrs[i], PAGE_SIZE); - } - - rds_cong_map_updated(conn->c_fcong, ~(u64) 0); - - return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + rds_message_put(rm); } /* we need to at least give the thread something to succeed */ @@ -117,8 +121,8 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) struct rds_loop_connection *lc; unsigned long flags; - lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); - if (lc == NULL) + lc = kzalloc(sizeof(struct rds_loop_connection), gfp); + if (!lc) return -ENOMEM; INIT_LIST_HEAD(&lc->loop_node); @@ -135,8 +139,12 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) static void rds_loop_conn_free(void *arg) { struct rds_loop_connection *lc = arg; + unsigned long flags; + rdsdebug("lc %p\n", lc); + spin_lock_irqsave(&loop_conns_lock, flags); list_del(&lc->loop_node); + spin_unlock_irqrestore(&loop_conns_lock, flags); kfree(lc); } @@ -175,14 +183,12 @@ void rds_loop_exit(void) */ struct rds_transport rds_loop_transport = { .xmit = rds_loop_xmit, - .xmit_cong_map = rds_loop_xmit_cong_map, .recv = rds_loop_recv, .conn_alloc = rds_loop_conn_alloc, .conn_free = rds_loop_conn_free, .conn_connect = rds_loop_conn_connect, .conn_shutdown = rds_loop_conn_shutdown, .inc_copy_to_user = rds_message_inc_copy_to_user, - .inc_purge = rds_message_inc_purge, - .inc_free = rds_message_inc_free, + .inc_free = rds_loop_inc_free, .t_name = "loopback", }; diff --git a/net/rds/message.c b/net/rds/message.c index 73e600ffd87..aba232f9f30 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -31,11 +31,10 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> +#include 
<linux/export.h> #include "rds.h" -#include "rdma.h" - -static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq); static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_NONE] = 0, @@ -62,29 +61,28 @@ static void rds_message_purge(struct rds_message *rm) if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; - for (i = 0; i < rm->m_nents; i++) { - rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); + for (i = 0; i < rm->data.op_nents; i++) { + rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); /* XXX will have to put_page for page refs */ - __free_page(sg_page(&rm->m_sg[i])); + __free_page(sg_page(&rm->data.op_sg[i])); } - rm->m_nents = 0; + rm->data.op_nents = 0; - if (rm->m_rdma_op) - rds_rdma_free_op(rm->m_rdma_op); - if (rm->m_rdma_mr) - rds_mr_put(rm->m_rdma_mr); -} + if (rm->rdma.op_active) + rds_rdma_free_op(&rm->rdma); + if (rm->rdma.op_rdma_mr) + rds_mr_put(rm->rdma.op_rdma_mr); -void rds_message_inc_purge(struct rds_incoming *inc) -{ - struct rds_message *rm = container_of(inc, struct rds_message, m_inc); - rds_message_purge(rm); + if (rm->atomic.op_active) + rds_atomic_free_op(&rm->atomic); + if (rm->atomic.op_rdma_mr) + rds_mr_put(rm->atomic.op_rdma_mr); } void rds_message_put(struct rds_message *rm) { rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); - + WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm); if (atomic_dec_and_test(&rm->m_refcount)) { BUG_ON(!list_empty(&rm->m_sock_item)); BUG_ON(!list_empty(&rm->m_conn_item)); @@ -95,12 +93,6 @@ void rds_message_put(struct rds_message *rm) } EXPORT_SYMBOL_GPL(rds_message_put); -void rds_message_inc_free(struct rds_incoming *inc) -{ - struct rds_message *rm = container_of(inc, struct rds_message, m_inc); - rds_message_put(rm); -} - void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq) { @@ -112,8 +104,8 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 
sport, } EXPORT_SYMBOL_GPL(rds_message_populate_header); -int rds_message_add_extension(struct rds_header *hdr, - unsigned int type, const void *data, unsigned int len) +int rds_message_add_extension(struct rds_header *hdr, unsigned int type, + const void *data, unsigned int len) { unsigned int ext_len = sizeof(u8) + len; unsigned char *dst; @@ -183,26 +175,6 @@ none: return RDS_EXTHDR_NONE; } -int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version) -{ - struct rds_ext_header_version ext_hdr; - - ext_hdr.h_version = cpu_to_be32(version); - return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr)); -} - -int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version) -{ - struct rds_ext_header_version ext_hdr; - unsigned int pos = 0, len = sizeof(ext_hdr); - - /* We assume the version extension is the only one present */ - if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION) - return 0; - *version = be32_to_cpu(ext_hdr.h_version); - return 1; -} - int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) { struct rds_ext_header_rdma_dest ext_hdr; @@ -213,41 +185,78 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o } EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); -struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) +/* + * Each rds_message is allocated with extra space for the scatterlist entries + * rds ops will need. This is to minimize memory allocation count. Then, each rds op + * can grab SGs when initializing its part of the rds_message. 
+ */ +struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) { struct rds_message *rm; - rm = kzalloc(sizeof(struct rds_message) + - (nents * sizeof(struct scatterlist)), gfp); + if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message)) + return NULL; + + rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp); if (!rm) goto out; - if (nents) - sg_init_table(rm->m_sg, nents); + rm->m_used_sgs = 0; + rm->m_total_sgs = extra_len / sizeof(struct scatterlist); + atomic_set(&rm->m_refcount, 1); INIT_LIST_HEAD(&rm->m_sock_item); INIT_LIST_HEAD(&rm->m_conn_item); spin_lock_init(&rm->m_rs_lock); + init_waitqueue_head(&rm->m_flush_wait); out: return rm; } +/* + * RDS ops use this to grab SG entries from the rm's sg pool. + */ +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) +{ + struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; + struct scatterlist *sg_ret; + + WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); + WARN_ON(!nents); + + if (rm->m_used_sgs + nents > rm->m_total_sgs) + return NULL; + + sg_ret = &sg_first[rm->m_used_sgs]; + sg_init_table(sg_ret, nents); + rm->m_used_sgs += nents; + + return sg_ret; +} + struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) { struct rds_message *rm; unsigned int i; + int num_sgs = ceil(total_len, PAGE_SIZE); + int extra_bytes = num_sgs * sizeof(struct scatterlist); - rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); - if (rm == NULL) + rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); + if (!rm) return ERR_PTR(-ENOMEM); set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); - rm->m_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); + if (!rm->data.op_sg) { + rds_message_put(rm); + return ERR_PTR(-ENOMEM); + } - for (i = 0; i < rm->m_nents; ++i) { - sg_set_page(&rm->m_sg[i], + for (i = 0; i < 
rm->data.op_nents; ++i) { + sg_set_page(&rm->data.op_sg[i], virt_to_page(page_addrs[i]), PAGE_SIZE, 0); } @@ -255,40 +264,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in return rm; } -struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, size_t total_len) { unsigned long to_copy; unsigned long iov_off; unsigned long sg_off; - struct rds_message *rm; struct iovec *iov; struct scatterlist *sg; - int ret; - - rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); - if (rm == NULL) { - ret = -ENOMEM; - goto out; - } + int ret = 0; rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); /* * now allocate and copy in the data payload. */ - sg = rm->m_sg; + sg = rm->data.op_sg; iov = first_iov; iov_off = 0; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ while (total_len) { - if (sg_page(sg) == NULL) { + if (!sg_page(sg)) { ret = rds_page_remainder_alloc(sg, total_len, GFP_HIGHUSER); if (ret) goto out; - rm->m_nents++; + rm->data.op_nents++; sg_off = 0; } @@ -319,14 +321,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, sg++; } - ret = 0; out: - if (ret) { - if (rm) - rds_message_put(rm); - rm = ERR_PTR(ret); - } - return rm; + return ret; } int rds_message_inc_copy_to_user(struct rds_incoming *inc, @@ -347,7 +343,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, iov = first_iov; iov_off = 0; - sg = rm->m_sg; + sg = rm->data.op_sg; vec_off = 0; copied = 0; @@ -393,15 +389,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, */ void rds_message_wait(struct rds_message *rm) { - wait_event(rds_message_flush_waitq, + wait_event_interruptible(rm->m_flush_wait, !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); } void rds_message_unmapped(struct rds_message *rm) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); - if (waitqueue_active(&rds_message_flush_waitq)) - 
wake_up(&rds_message_flush_waitq); + wake_up_interruptible(&rm->m_flush_wait); } EXPORT_SYMBOL_GPL(rds_message_unmapped); diff --git a/net/rds/page.c b/net/rds/page.c index 36790122dfd..9005a2c920e 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -31,6 +31,9 @@ * */ #include <linux/highmem.h> +#include <linux/gfp.h> +#include <linux/cpu.h> +#include <linux/export.h> #include "rds.h" @@ -39,7 +42,8 @@ struct rds_page_remainder { unsigned long r_offset; }; -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, + rds_page_remainders); /* * returns 0 on success or -errno on failure. @@ -56,38 +60,26 @@ int rds_page_copy_user(struct page *page, unsigned long offset, unsigned long ret; void *addr; - if (to_user) + addr = kmap(page); + if (to_user) { rds_stats_add(s_copy_to_user, bytes); - else + ret = copy_to_user(ptr, addr + offset, bytes); + } else { rds_stats_add(s_copy_from_user, bytes); - - addr = kmap_atomic(page, KM_USER0); - if (to_user) - ret = __copy_to_user_inatomic(ptr, addr + offset, bytes); - else - ret = __copy_from_user_inatomic(addr + offset, ptr, bytes); - kunmap_atomic(addr, KM_USER0); - - if (ret) { - addr = kmap(page); - if (to_user) - ret = copy_to_user(ptr, addr + offset, bytes); - else - ret = copy_from_user(addr + offset, ptr, bytes); - kunmap(page); - if (ret) - return -EFAULT; + ret = copy_from_user(addr + offset, ptr, bytes); } + kunmap(page); - return 0; + return ret ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(rds_page_copy_user); -/* - * Message allocation uses this to build up regions of a message. +/** + * rds_page_remainder_alloc - build up regions of a message. * - * @bytes - the number of bytes needed. - * @gfp - the waiting behaviour of the allocation + * @scat: Scatter list for message + * @bytes: the number of bytes needed. + * @gfp: the waiting behaviour of the allocation * * @gfp is always ored with __GFP_HIGHMEM. 
Callers must be prepared to * kmap the pages, etc. @@ -115,7 +107,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, /* jump straight to allocation if we're trying for a huge page */ if (bytes >= PAGE_SIZE) { page = alloc_page(gfp); - if (page == NULL) { + if (!page) { ret = -ENOMEM; } else { sg_set_page(scat, page, PAGE_SIZE, 0); @@ -161,7 +153,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, rem = &per_cpu(rds_page_remainders, get_cpu()); local_irq_save(flags); - if (page == NULL) { + if (!page) { ret = -ENOMEM; break; } @@ -185,6 +177,7 @@ out: ret ? 0 : scat->length); return ret; } +EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); static int rds_page_remainder_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 4c64daa1f5d..4e37c1cbe8b 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -31,10 +31,11 @@ * */ #include <linux/pagemap.h> +#include <linux/slab.h> #include <linux/rbtree.h> #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ -#include "rdma.h" +#include "rds.h" /* * XXX @@ -129,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs) { struct rds_mr *mr; struct rb_node *node; + unsigned long flags; /* Release any MRs associated with this socket */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); while ((node = rb_first(&rs->rs_rdma_keys))) { mr = container_of(node, struct rds_mr, r_rb_node); if (mr->r_trans == rs->rs_transport) mr->r_invalidate = 0; + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + rds_destroy_mr(mr); rds_mr_put(mr); + spin_lock_irqsave(&rs->rs_rdma_lock, flags); } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (rs->rs_transport && rs->rs_transport->flush_mrs) rs->rs_transport->flush_mrs(); @@ -180,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } - if 
(rs->rs_transport->get_mr == NULL) { + if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out; } @@ -196,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, /* XXX clamp nr_pages to limit the size of this alloc? */ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { + if (!pages) { ret = -ENOMEM; goto out; } mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); - if (mr == NULL) { + if (!mr) { ret = -ENOMEM; goto out; } @@ -229,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to * the zero page. */ - ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); + ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); if (ret < 0) goto out; nents = ret; sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); - if (sg == NULL) { + if (!sg) { ret = -ENOMEM; goto out; } @@ -405,132 +414,217 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); - if (mr && (mr->r_use_once || force)) { + if (!mr) { + printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + return; + } + + if (mr->r_use_once || force) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); zot_me = 1; - } else if (mr) - atomic_inc(&mr->r_refcount); + } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); /* May have to issue a dma_sync on this memory region. * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ - if (mr != NULL) { - if (mr->r_trans->sync_mr) - mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); - - /* If the MR was marked as invalidate, this will - * trigger an async flush. 
*/ - if (zot_me) - rds_destroy_mr(mr); - rds_mr_put(mr); - } + if (mr->r_trans->sync_mr) + mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); + + /* If the MR was marked as invalidate, this will + * trigger an async flush. */ + if (zot_me) + rds_destroy_mr(mr); + rds_mr_put(mr); } -void rds_rdma_free_op(struct rds_rdma_op *ro) +void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; - for (i = 0; i < ro->r_nents; i++) { - struct page *page = sg_page(&ro->r_sg[i]); + for (i = 0; i < ro->op_nents; i++) { + struct page *page = sg_page(&ro->op_sg[i]); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ - if (!ro->r_write) + if (!ro->op_write) { + BUG_ON(irqs_disabled()); set_page_dirty(page); + } put_page(page); } - kfree(ro->r_notifier); - kfree(ro); + kfree(ro->op_notifier); + ro->op_notifier = NULL; + ro->op_active = 0; +} + +void rds_atomic_free_op(struct rm_atomic_op *ao) +{ + struct page *page = sg_page(ao->op_sg); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + set_page_dirty(page); + put_page(page); + + kfree(ao->op_notifier); + ao->op_notifier = NULL; + ao->op_active = 0; } + /* - * args is a pointer to an in-kernel copy in the sendmsg cmsg. + * Count the number of pages needed to describe an incoming iovec array. */ -static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, - struct rds_rdma_args *args) +static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) +{ + int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + /* figure out the number of pages in the vector */ + for (i = 0; i < nr_iovecs; i++) { + nr_pages = rds_pages_in_vec(&iov[i]); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. 
+ */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages; +} + +int rds_rdma_extra_size(struct rds_rdma_args *args) { struct rds_iovec vec; - struct rds_rdma_op *op = NULL; + struct rds_iovec __user *local_vec; + int tot_pages = 0; unsigned int nr_pages; - unsigned int max_pages; + unsigned int i; + + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + + /* figure out the number of pages in the vector */ + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) + return -EFAULT; + + nr_pages = rds_pages_in_vec(&vec); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages * sizeof(struct scatterlist); +} + +/* + * The application asks for a RDMA transfer. + * Extract all arguments and set up the rdma_op + */ +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct rds_rdma_args *args; + struct rm_rdma_op *op = &rm->rdma; + int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; - struct rds_iovec __user *local_vec; - struct scatterlist *sg; - unsigned int nr; + struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack; + int iov_size; unsigned int i, j; - int ret; + int ret = 0; + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) + || rm->rdma.op_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); if (rs->rs_bound_addr == 0) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } - if (args->nr_local > (u64)UINT_MAX) { + if (args->nr_local > UIO_MAXIOV) { ret = -EMSGSIZE; goto out; } - nr_pages = 0; - max_pages = 0; - - local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; - - /* figure out the number of pages in the vector */ - for (i = 0; i < args->nr_local; i++) { - 
if (copy_from_user(&vec, &local_vec[i], - sizeof(struct rds_iovec))) { - ret = -EFAULT; - goto out; - } - - nr = rds_pages_in_vec(&vec); - if (nr == 0) { - ret = -EINVAL; + /* Check whether to allocate the iovec area */ + iov_size = args->nr_local * sizeof(struct rds_iovec); + if (args->nr_local > UIO_FASTIOV) { + iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL); + if (!iovs) { + ret = -ENOMEM; goto out; } + } - max_pages = max(nr, max_pages); - nr_pages += nr; + if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) { + ret = -EFAULT; + goto out; } - pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { - ret = -ENOMEM; + nr_pages = rds_rdma_pages(iovs, args->nr_local); + if (nr_pages < 0) { + ret = -EINVAL; goto out; } - op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); - if (op == NULL) { + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { ret = -ENOMEM; goto out; } - op->r_write = !!(args->flags & RDS_RDMA_READWRITE); - op->r_fence = !!(args->flags & RDS_RDMA_FENCE); - op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); - op->r_recverr = rs->rs_recverr; + op->op_write = !!(args->flags & RDS_RDMA_READWRITE); + op->op_fence = !!(args->flags & RDS_RDMA_FENCE); + op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->op_silent = !!(args->flags & RDS_RDMA_SILENT); + op->op_active = 1; + op->op_recverr = rs->rs_recverr; WARN_ON(!nr_pages); - sg_init_table(op->r_sg, nr_pages); + op->op_sg = rds_message_alloc_sgs(rm, nr_pages); + if (!op->op_sg) { + ret = -ENOMEM; + goto out; + } - if (op->r_notify || op->r_recverr) { + if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. 
*/ - op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); - if (!op->r_notifier) { + op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); + if (!op->op_notifier) { ret = -ENOMEM; goto out; } - op->r_notifier->n_user_token = args->user_token; - op->r_notifier->n_status = RDS_RDMA_SUCCESS; + op->op_notifier->n_user_token = args->user_token; + op->op_notifier->n_status = RDS_RDMA_SUCCESS; } /* The cookie contains the R_Key of the remote memory region, and @@ -540,68 +634,55 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, * destination address (which is really an offset into the MR) * FIXME: We may want to move this into ib_rdma.c */ - op->r_key = rds_rdma_cookie_key(args->cookie); - op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); + op->op_rkey = rds_rdma_cookie_key(args->cookie); + op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); nr_bytes = 0; rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", (unsigned long long)args->nr_local, (unsigned long long)args->remote_vec.addr, - op->r_key); + op->op_rkey); for (i = 0; i < args->nr_local; i++) { - if (copy_from_user(&vec, &local_vec[i], - sizeof(struct rds_iovec))) { - ret = -EFAULT; - goto out; - } - - nr = rds_pages_in_vec(&vec); - if (nr == 0) { - ret = -EINVAL; - goto out; - } + struct rds_iovec *iov = &iovs[i]; + /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ + unsigned int nr = rds_pages_in_vec(iov); - rs->rs_user_addr = vec.addr; - rs->rs_user_bytes = vec.bytes; + rs->rs_user_addr = iov->addr; + rs->rs_user_bytes = iov->bytes; - /* did the user change the vec under us? */ - if (nr > max_pages || op->r_nents + nr > nr_pages) { - ret = -EINVAL; - goto out; - } /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. 
*/ - ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); + ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if (ret < 0) goto out; - rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n", - nr_bytes, nr, vec.bytes, vec.addr); + rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", + nr_bytes, nr, iov->bytes, iov->addr); - nr_bytes += vec.bytes; + nr_bytes += iov->bytes; for (j = 0; j < nr; j++) { - unsigned int offset = vec.addr & ~PAGE_MASK; + unsigned int offset = iov->addr & ~PAGE_MASK; + struct scatterlist *sg; - sg = &op->r_sg[op->r_nents + j]; + sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], - min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), + min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); - rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n", - sg->offset, sg->length, vec.addr, vec.bytes); + rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", + sg->offset, sg->length, iov->addr, iov->bytes); - vec.addr += sg->length; - vec.bytes -= sg->length; + iov->addr += sg->length; + iov->bytes -= sg->length; } - op->r_nents += nr; + op->op_nents += nr; } - if (nr_bytes > args->remote_vec.bytes) { rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", nr_bytes, @@ -609,38 +690,18 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, ret = -EINVAL; goto out; } - op->r_bytes = nr_bytes; + op->op_bytes = nr_bytes; - ret = 0; out: + if (iovs != iovstack) + sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size); kfree(pages); - if (ret) { - if (op) - rds_rdma_free_op(op); - op = ERR_PTR(ret); - } - return op; -} - -/* - * The application asks for a RDMA transfer. 
- * Extract all arguments and set up the rdma_op - */ -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg) -{ - struct rds_rdma_op *op; - - if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || - rm->m_rdma_op != NULL) - return -EINVAL; + if (ret) + rds_rdma_free_op(op); + else + rds_stats_inc(s_send_rdma); - op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); - if (IS_ERR(op)) - return PTR_ERR(op); - rds_stats_inc(s_send_rdma); - rm->m_rdma_op = op; - return 0; + return ret; } /* @@ -670,7 +731,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); - if (mr == NULL) + if (!mr) err = -EINVAL; /* invalid r_key */ else atomic_inc(&mr->r_refcount); @@ -678,7 +739,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, if (mr) { mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); - rm->m_rdma_mr = mr; + rm->rdma.op_rdma_mr = mr; } return err; } @@ -696,5 +757,102 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, rm->m_rdma_cookie != 0) return -EINVAL; - return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); + return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); +} + +/* + * Fill in rds_message for an atomic request. 
+ */ +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct page *page = NULL; + struct rds_atomic_args *args; + int ret = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) + || rm->atomic.op_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); + + /* Nonmasked & masked cmsg ops converted to masked hw ops */ + switch (cmsg->cmsg_type) { + case RDS_CMSG_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = 0; + break; + case RDS_CMSG_MASKED_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->m_fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; + break; + case RDS_CMSG_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->cswp.compare; + rm->atomic.op_m_cswp.swap = args->cswp.swap; + rm->atomic.op_m_cswp.compare_mask = ~0; + rm->atomic.op_m_cswp.swap_mask = ~0; + break; + case RDS_CMSG_MASKED_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->m_cswp.compare; + rm->atomic.op_m_cswp.swap = args->m_cswp.swap; + rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; + rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; + break; + default: + BUG(); /* should never happen */ + } + + rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); + rm->atomic.op_active = 1; + rm->atomic.op_recverr = rs->rs_recverr; + rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); + if (!rm->atomic.op_sg) { + ret = -ENOMEM; + goto err; + } + + /* verify 8 byte-aligned */ + if (args->local_addr & 0x7) { + ret = -EFAULT; + goto err; + } + + ret = rds_pin_pages(args->local_addr, 1, &page, 1); + if (ret != 1) + goto err; + ret = 0; + + sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); + + if 
(rm->atomic.op_notify || rm->atomic.op_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); + if (!rm->atomic.op_notifier) { + ret = -ENOMEM; + goto err; + } + + rm->atomic.op_notifier->n_user_token = args->user_token; + rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; + } + + rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); + rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); + + return ret; +err: + if (page) + put_page(page); + kfree(rm->atomic.op_notifier); + + return ret; } diff --git a/net/rds/rdma.h b/net/rds/rdma.h deleted file mode 100644 index 909c39835a5..00000000000 --- a/net/rds/rdma.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef _RDS_RDMA_H -#define _RDS_RDMA_H - -#include <linux/rbtree.h> -#include <linux/spinlock.h> -#include <linux/scatterlist.h> - -#include "rds.h" - -struct rds_mr { - struct rb_node r_rb_node; - atomic_t r_refcount; - u32 r_key; - - /* A copy of the creation flags */ - unsigned int r_use_once:1; - unsigned int r_invalidate:1; - unsigned int r_write:1; - - /* This is for RDS_MR_DEAD. - * It would be nice & consistent to make this part of the above - * bit field here, but we need to use test_and_set_bit. 
- */ - unsigned long r_state; - struct rds_sock *r_sock; /* back pointer to the socket that owns us */ - struct rds_transport *r_trans; - void *r_trans_private; -}; - -/* Flags for mr->r_state */ -#define RDS_MR_DEAD 0 - -struct rds_rdma_op { - u32 r_key; - u64 r_remote_addr; - unsigned int r_write:1; - unsigned int r_fence:1; - unsigned int r_notify:1; - unsigned int r_recverr:1; - unsigned int r_mapped:1; - struct rds_notifier *r_notifier; - unsigned int r_bytes; - unsigned int r_nents; - unsigned int r_count; - struct scatterlist r_sg[0]; -}; - -static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) -{ - return r_key | (((u64) offset) << 32); -} - -static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) -{ - return cookie; -} - -static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) -{ - return cookie >> 32; -} - -int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); -int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); -int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); -void rds_rdma_drop_keys(struct rds_sock *rs); -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -void rds_rdma_free_op(struct rds_rdma_op *ro); -void rds_rdma_send_complete(struct rds_message *rm, int); - -extern void __rds_put_mr_final(struct rds_mr *mr); -static inline void rds_mr_put(struct rds_mr *mr) -{ - if (atomic_dec_and_test(&mr->r_refcount)) - __rds_put_mr_final(mr); -} - -#endif diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 9ece910ea39..6cd9d1deafc 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -30,12 +30,41 @@ * 
SOFTWARE. * */ +#include <linux/module.h> #include <rdma/rdma_cm.h> #include "rdma_transport.h" static struct rdma_cm_id *rds_rdma_listen_id; +static char *rds_cm_event_strings[] = { +#define RDS_CM_EVENT_STRING(foo) \ + [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo) + RDS_CM_EVENT_STRING(ADDR_RESOLVED), + RDS_CM_EVENT_STRING(ADDR_ERROR), + RDS_CM_EVENT_STRING(ROUTE_RESOLVED), + RDS_CM_EVENT_STRING(ROUTE_ERROR), + RDS_CM_EVENT_STRING(CONNECT_REQUEST), + RDS_CM_EVENT_STRING(CONNECT_RESPONSE), + RDS_CM_EVENT_STRING(CONNECT_ERROR), + RDS_CM_EVENT_STRING(UNREACHABLE), + RDS_CM_EVENT_STRING(REJECTED), + RDS_CM_EVENT_STRING(ESTABLISHED), + RDS_CM_EVENT_STRING(DISCONNECTED), + RDS_CM_EVENT_STRING(DEVICE_REMOVAL), + RDS_CM_EVENT_STRING(MULTICAST_JOIN), + RDS_CM_EVENT_STRING(MULTICAST_ERROR), + RDS_CM_EVENT_STRING(ADDR_CHANGE), + RDS_CM_EVENT_STRING(TIMEWAIT_EXIT), +#undef RDS_CM_EVENT_STRING +}; + +static char *rds_cm_event_str(enum rdma_cm_event_type type) +{ + return rds_str_array(rds_cm_event_strings, + ARRAY_SIZE(rds_cm_event_strings), type); +}; + int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -44,8 +73,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rds_transport *trans; int ret = 0; - rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, - event->event); + rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, + event->event, rds_cm_event_str(event->event)); if (cm_id->device->node_type == RDMA_NODE_RNIC) trans = &rds_iw_transport; @@ -101,7 +130,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_DISCONNECTED: - printk(KERN_WARNING "RDS/RDMA: DISCONNECT event - dropping connection " + rdsdebug("DISCONNECT event - dropping connection " "%pI4->%pI4\n", &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); @@ -109,8 +138,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, default: /* things like device disconnect? 
*/ - printk(KERN_ERR "unknown event %u\n", event->event); - BUG(); + printk(KERN_ERR "RDS: unknown event %u (%s)!\n", + event->event, rds_cm_event_str(event->event)); break; } @@ -118,26 +147,28 @@ out: if (conn) mutex_unlock(&conn->c_cm_lock); - rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); + rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, + rds_cm_event_str(event->event), ret); return ret; } -static int __init rds_rdma_listen_init(void) +static int rds_rdma_listen_init(void) { struct sockaddr_in sin; struct rdma_cm_id *cm_id; int ret; - cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP); + cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP, + IB_QPT_RC); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); printk(KERN_ERR "RDS/RDMA: failed to setup listener, " "rdma_create_id() returned %d\n", ret); - goto out; + return ret; } - sin.sin_family = AF_INET, + sin.sin_family = AF_INET; sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); sin.sin_port = (__force u16)htons(RDS_PORT); @@ -178,7 +209,7 @@ static void rds_rdma_listen_stop(void) } } -int __init rds_rdma_init(void) +static int rds_rdma_init(void) { int ret; @@ -205,7 +236,7 @@ out: } module_init(rds_rdma_init); -void rds_rdma_exit(void) +static void rds_rdma_exit(void) { /* stop listening first to ensure no new connections are attempted */ rds_rdma_listen_stop(); diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index 2f2c7d976c2..faba4e38269 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -11,10 +11,6 @@ int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); -/* from rdma_transport.c */ -int rds_rdma_init(void); -void rds_rdma_exit(void); - /* from ib.c */ extern struct rds_transport rds_ib_transport; int rds_ib_init(void); diff --git a/net/rds/rds.h b/net/rds/rds.h index 85d6f897ecc..48f8ffc60f8 100644 --- 
a/net/rds/rds.h +++ b/net/rds/rds.h @@ -36,8 +36,8 @@ #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else /* sigh, pr_debug() causes unused variable warnings */ -static inline void __attribute__ ((format (printf, 1, 2))) -rdsdebug(char *fmt, ...) +static inline __printf(1, 2) +void rdsdebug(char *fmt, ...) { } #endif @@ -50,7 +50,6 @@ rdsdebug(char *fmt, ...) #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) #define RDS_CONG_MAP_BYTES (65536 / 8) -#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long)) #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) @@ -80,6 +79,7 @@ enum { /* Bits for c_flags */ #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 +#define RDS_IN_XMIT 2 struct rds_connection { struct hlist_node c_hash_node; @@ -91,12 +91,13 @@ struct rds_connection { struct rds_cong_map *c_lcong; struct rds_cong_map *c_fcong; - struct mutex c_send_lock; /* protect send ring */ struct rds_message *c_xmit_rm; unsigned long c_xmit_sg; unsigned int c_xmit_hdr_off; unsigned int c_xmit_data_off; + unsigned int c_xmit_atomic_sent; unsigned int c_xmit_rdma_sent; + unsigned int c_xmit_data_sent; spinlock_t c_lock; /* protect msg queues */ u64 c_next_tx_seq; @@ -116,11 +117,10 @@ struct rds_connection { struct delayed_work c_conn_w; struct work_struct c_down_w; struct mutex c_cm_lock; /* protect conn state & cm */ + wait_queue_head_t c_waitq; struct list_head c_map_item; unsigned long c_map_queued; - unsigned long c_map_offset; - unsigned long c_map_bytes; unsigned int c_unacked_packets; unsigned int c_unacked_bytes; @@ -206,6 +206,48 @@ struct rds_incoming { rds_rdma_cookie_t i_rdma_cookie; }; +struct rds_mr { + struct rb_node r_rb_node; + atomic_t r_refcount; + u32 r_key; + + /* A copy of the creation flags */ + unsigned int r_use_once:1; + unsigned int r_invalidate:1; + unsigned int r_write:1; + + /* This is for RDS_MR_DEAD. 
+ * It would be nice & consistent to make this part of the above + * bit field here, but we need to use test_and_set_bit. + */ + unsigned long r_state; + struct rds_sock *r_sock; /* back pointer to the socket that owns us */ + struct rds_transport *r_trans; + void *r_trans_private; +}; + +/* Flags for mr->r_state */ +#define RDS_MR_DEAD 0 + +static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) +{ + return r_key | (((u64) offset) << 32); +} + +static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) +{ + return cookie; +} + +static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) +{ + return cookie >> 32; +} + +/* atomic operation types */ +#define RDS_ATOMIC_TYPE_CSWP 0 +#define RDS_ATOMIC_TYPE_FADD 1 + /* * m_sock_item and m_conn_item are on lists that are serialized under * conn->c_lock. m_sock_item has additional meaning in that once it is empty @@ -258,13 +300,71 @@ struct rds_message { * -> rs->rs_lock */ spinlock_t m_rs_lock; + wait_queue_head_t m_flush_wait; + struct rds_sock *m_rs; - struct rds_rdma_op *m_rdma_op; + + /* cookie to send to remote, in rds header */ rds_rdma_cookie_t m_rdma_cookie; - struct rds_mr *m_rdma_mr; - unsigned int m_nents; - unsigned int m_count; - struct scatterlist m_sg[0]; + + unsigned int m_used_sgs; + unsigned int m_total_sgs; + + void *m_final_op; + + struct { + struct rm_atomic_op { + int op_type; + union { + struct { + uint64_t compare; + uint64_t swap; + uint64_t compare_mask; + uint64_t swap_mask; + } op_m_cswp; + struct { + uint64_t add; + uint64_t nocarry_mask; + } op_m_fadd; + }; + + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } atomic; + struct rm_rdma_op { + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_write:1; + unsigned int op_fence:1; + 
unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + unsigned int op_bytes; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } rdma; + struct rm_data_op { + unsigned int op_active:1; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + } data; + }; }; /* @@ -305,10 +405,6 @@ struct rds_notifier { * transport is responsible for other serialization, including * rds_recv_incoming(). This is called in process context but * should try hard not to block. - * - * @xmit_cong_map: This asks the transport to send the local bitmap down the - * given connection. XXX get a better story about the bitmap - * flag and header. */ #define RDS_TRANS_IB 0 @@ -332,13 +428,11 @@ struct rds_transport { void (*xmit_complete)(struct rds_connection *conn); int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); - int (*xmit_cong_map)(struct rds_connection *conn, - struct rds_cong_map *map, unsigned long offset); - int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op); + int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); + int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); int (*recv)(struct rds_connection *conn); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, size_t size); - void (*inc_purge)(struct rds_incoming *inc); void (*inc_free)(struct rds_incoming *inc); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, @@ -367,17 +461,11 @@ struct rds_sock { * bound_addr used for both incoming and outgoing, no INADDR_ANY * support. 
*/ - struct rb_node rs_bound_node; + struct hlist_node rs_bound_node; __be32 rs_bound_addr; __be32 rs_conn_addr; __be16 rs_bound_port; __be16 rs_conn_port; - - /* - * This is only used to communicate the transport between bind and - * initiating connections. All other trans use is referenced through - * the connection. - */ struct rds_transport *rs_transport; /* @@ -388,6 +476,8 @@ struct rds_sock { /* flag indicating we were congested or not */ int rs_congested; + /* seen congestion (ENOBUFS) when sending? */ + int rs_seen_congestion; /* rs_lock protects all these adjacent members before the newline */ spinlock_t rs_lock; @@ -464,8 +554,8 @@ struct rds_statistics { uint64_t s_recv_ping; uint64_t s_send_queue_empty; uint64_t s_send_queue_full; - uint64_t s_send_sem_contention; - uint64_t s_send_sem_queue_raced; + uint64_t s_send_lock_contention; + uint64_t s_send_lock_queue_raced; uint64_t s_send_immediate_retry; uint64_t s_send_delayed_retry; uint64_t s_send_drop_acked; @@ -485,12 +575,13 @@ struct rds_statistics { }; /* af_rds.c */ +char *rds_str_array(char **array, size_t elements, size_t index); void rds_sock_addref(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs); static inline void __rds_wake_sk_sleep(struct sock *sk) { - wait_queue_head_t *waitq = sk->sk_sleep; + wait_queue_head_t *waitq = sk_sleep(sk); if (!sock_flag(sk, SOCK_DEAD) && waitq) wake_up(waitq); @@ -519,22 +610,23 @@ void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* conn.c */ -int __init rds_conn_init(void); +int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); +void rds_conn_shutdown(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn); -void 
rds_conn_reset(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); +void rds_conn_connect_if_down(struct rds_connection *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), size_t item_len); -void __rds_conn_error(struct rds_connection *conn, const char *, ...) - __attribute__ ((format (printf, 2, 3))); +__printf(2, 3) +void __rds_conn_error(struct rds_connection *conn, const char *, ...); #define rds_conn_error(conn, fmt...) \ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) @@ -564,7 +656,8 @@ rds_conn_connecting(struct rds_connection *conn) /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); -struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, size_t total_len); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, @@ -573,12 +666,9 @@ int rds_message_add_extension(struct rds_header *hdr, unsigned int type, const void *data, unsigned int len); int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); -int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version); -int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, size_t size); -void rds_message_inc_purge(struct rds_incoming *inc); void rds_message_inc_free(struct rds_incoming *inc); void rds_message_addref(struct rds_message *rm); void rds_message_put(struct 
rds_message *rm); @@ -612,10 +702,9 @@ void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr); -void rds_inc_addref(struct rds_incoming *inc); void rds_inc_put(struct rds_incoming *inc); void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, - struct rds_incoming *inc, gfp_t gfp, enum km_type km); + struct rds_incoming *inc, gfp_t gfp); int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int msg_flags); void rds_clear_recv_queue(struct rds_sock *rs); @@ -634,14 +723,38 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); -int rds_send_acked_before(struct rds_connection *conn, u64 seq); -void rds_send_remove_from_sock(struct list_head *messages, int status); int rds_send_pong(struct rds_connection *conn, __be16 dport); struct rds_message *rds_send_get_message(struct rds_connection *, - struct rds_rdma_op *); + struct rm_rdma_op *); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +void rds_rdma_drop_keys(struct rds_sock *rs); +int rds_rdma_extra_size(struct rds_rdma_args *args); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +void rds_rdma_free_op(struct rm_rdma_op *ro); +void 
rds_atomic_free_op(struct rm_atomic_op *ao); +void rds_rdma_send_complete(struct rds_message *rm, int wc_status); +void rds_atomic_send_complete(struct rds_message *rm, int wc_status); +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); + +void __rds_put_mr_final(struct rds_mr *mr); +static inline void rds_mr_put(struct rds_mr *mr) +{ + if (atomic_dec_and_test(&mr->r_refcount)) + __rds_put_mr_final(mr); +} /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); @@ -655,14 +768,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); put_cpu(); \ } while (0) #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) -int __init rds_stats_init(void); +int rds_stats_init(void); void rds_stats_exit(void); void rds_stats_info_copy(struct rds_info_iterator *iter, uint64_t *values, const char *const *names, size_t nr); /* sysctl.c */ -int __init rds_sysctl_init(void); +int rds_sysctl_init(void); void rds_sysctl_exit(void); extern unsigned long rds_sysctl_sndbuf_min; extern unsigned long rds_sysctl_sndbuf_default; @@ -676,9 +789,10 @@ extern unsigned long rds_sysctl_trace_flags; extern unsigned int rds_sysctl_trace_level; /* threads.c */ -int __init rds_threads_init(void); +int rds_threads_init(void); void rds_threads_exit(void); extern struct workqueue_struct *rds_wq; +void rds_queue_reconnect(struct rds_connection *conn); void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); @@ -689,9 +803,10 @@ void rds_connect_complete(struct rds_connection *conn); int rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); struct rds_transport *rds_trans_get_preferred(__be32 addr); +void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); -int __init 
rds_trans_init(void); +int rds_trans_init(void); void rds_trans_exit(void); #endif diff --git a/net/rds/recv.c b/net/rds/recv.c index b426d67f760..bd82522534f 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -31,11 +31,12 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> #include <net/sock.h> #include <linux/in.h> +#include <linux/export.h> #include "rds.h" -#include "rdma.h" void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr) @@ -48,12 +49,11 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, } EXPORT_SYMBOL_GPL(rds_inc_init); -void rds_inc_addref(struct rds_incoming *inc) +static void rds_inc_addref(struct rds_incoming *inc) { rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); atomic_inc(&inc->i_refcount); } -EXPORT_SYMBOL_GPL(rds_inc_addref); void rds_inc_put(struct rds_incoming *inc) { @@ -155,7 +155,7 @@ static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock * tell us which roles the addrs in the conn are playing for this message. 
*/ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, - struct rds_incoming *inc, gfp_t gfp, enum km_type km) + struct rds_incoming *inc, gfp_t gfp) { struct rds_sock *rs = NULL; struct sock *sk; @@ -209,7 +209,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, } rs = rds_find_bound(daddr, inc->i_hdr.h_dport); - if (rs == NULL) { + if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; } @@ -250,7 +250,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) { unsigned long flags; - if (*inc == NULL) { + if (!*inc) { read_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&rs->rs_recv_queue)) { *inc = list_entry(rs->rs_recv_queue.next, @@ -296,7 +296,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) { struct rds_notifier *notifier; - struct rds_rdma_notify cmsg; + struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */ unsigned int count = 0, max_messages = ~0U; unsigned long flags; LIST_HEAD(copy); @@ -333,10 +333,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) if (msghdr) { cmsg.user_token = notifier->n_user_token; - cmsg.status = notifier->n_status; + cmsg.status = notifier->n_status; err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, - sizeof(cmsg), &cmsg); + sizeof(cmsg), &cmsg); if (err) break; } @@ -402,7 +402,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, struct rds_sock *rs = rds_sk_to_rs(sk); long timeo; int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; - struct sockaddr_in *sin; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct rds_incoming *inc = NULL; /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. 
*/ @@ -431,7 +431,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, break; } - timeo = wait_event_interruptible_timeout(*sk->sk_sleep, + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), (!list_empty(&rs->rs_notify_queue) || rs->rs_cong_notify || rds_next_incoming(rs, &inc)), timeo); @@ -479,12 +479,12 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rds_stats_inc(s_recv_delivered); - sin = (struct sockaddr_in *)msg->msg_name; if (sin) { sin->sin_family = AF_INET; sin->sin_port = inc->i_hdr.h_sport; sin->sin_addr.s_addr = inc->i_saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + msg->msg_namelen = sizeof(*sin); } break; } diff --git a/net/rds/send.c b/net/rds/send.c index b2fccfc2076..23718160d71 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -31,12 +31,15 @@ * */ #include <linux/kernel.h> +#include <linux/moduleparam.h> +#include <linux/gfp.h> #include <net/sock.h> #include <linux/in.h> #include <linux/list.h> +#include <linux/ratelimit.h> +#include <linux/export.h> #include "rds.h" -#include "rdma.h" /* When transmitting messages in rds_send_xmit, we need to emerge from * time to time and briefly release the CPU. Otherwise the softlock watchdog @@ -52,8 +55,11 @@ static int send_batch_count = 64; module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); +static void rds_send_remove_from_sock(struct list_head *messages, int status); + /* - * Reset the send state. Caller must hold c_send_lock when calling here. + * Reset the send state. Callers must ensure that this doesn't race with + * rds_send_xmit(). */ void rds_send_reset(struct rds_connection *conn) { @@ -61,18 +67,22 @@ void rds_send_reset(struct rds_connection *conn) unsigned long flags; if (conn->c_xmit_rm) { + rm = conn->c_xmit_rm; + conn->c_xmit_rm = NULL; /* Tell the user the RDMA op is no longer mapped by the * transport. 
This isn't entirely true (it's flushed out * independently) but as the connection is down, there's * no ongoing RDMA to/from that memory */ - rds_message_unmapped(conn->c_xmit_rm); - rds_message_put(conn->c_xmit_rm); - conn->c_xmit_rm = NULL; + rds_message_unmapped(rm); + rds_message_put(rm); } + conn->c_xmit_sg = 0; conn->c_xmit_hdr_off = 0; conn->c_xmit_data_off = 0; + conn->c_xmit_atomic_sent = 0; conn->c_xmit_rdma_sent = 0; + conn->c_xmit_data_sent = 0; conn->c_map_queued = 0; @@ -89,8 +99,27 @@ void rds_send_reset(struct rds_connection *conn) spin_unlock_irqrestore(&conn->c_lock, flags); } +static int acquire_in_xmit(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0; +} + +static void release_in_xmit(struct rds_connection *conn) +{ + clear_bit(RDS_IN_XMIT, &conn->c_flags); + smp_mb__after_atomic(); + /* + * We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + /* - * We're making the concious trade-off here to only send one message + * We're making the conscious trade-off here to only send one message * down the connection at a time. * Pro: * - tx queueing is a simple fifo list @@ -108,102 +137,69 @@ int rds_send_xmit(struct rds_connection *conn) struct rds_message *rm; unsigned long flags; unsigned int tmp; - unsigned int send_quota = send_batch_count; struct scatterlist *sg; int ret = 0; - int was_empty = 0; LIST_HEAD(to_be_dropped); +restart: + /* * sendmsg calls here after having queued its message on the send * queue. We only have one task feeding the connection at a time. If * another thread is already feeding the queue then we back off. This * avoids blocking the caller and trading per-connection data between * caches per message. 
- * - * The sem holder will issue a retry if they notice that someone queued - * a message after they stopped walking the send queue but before they - * dropped the sem. */ - if (!mutex_trylock(&conn->c_send_lock)) { - rds_stats_inc(s_send_sem_contention); + if (!acquire_in_xmit(conn)) { + rds_stats_inc(s_send_lock_contention); ret = -ENOMEM; goto out; } + /* + * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, + * we do the opposite to avoid races. + */ + if (!rds_conn_up(conn)) { + release_in_xmit(conn); + ret = 0; + goto out; + } + if (conn->c_trans->xmit_prepare) conn->c_trans->xmit_prepare(conn); /* * spin trying to push headers and data down the connection until - * the connection doens't make forward progress. + * the connection doesn't make forward progress. */ - while (--send_quota) { - /* - * See if need to send a congestion map update if we're - * between sending messages. The send_sem protects our sole - * use of c_map_offset and _bytes. - * Note this is used only by transports that define a special - * xmit_cong_map function. For all others, we create allocate - * a cong_map message and treat it just like any other send. - */ - if (conn->c_map_bytes) { - ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, - conn->c_map_offset); - if (ret <= 0) - break; - - conn->c_map_offset += ret; - conn->c_map_bytes -= ret; - if (conn->c_map_bytes) - continue; - } + while (1) { - /* If we're done sending the current message, clear the - * offset and S/G temporaries. - */ rm = conn->c_xmit_rm; - if (rm != NULL && - conn->c_xmit_hdr_off == sizeof(struct rds_header) && - conn->c_xmit_sg == rm->m_nents) { - conn->c_xmit_rm = NULL; - conn->c_xmit_sg = 0; - conn->c_xmit_hdr_off = 0; - conn->c_xmit_data_off = 0; - conn->c_xmit_rdma_sent = 0; - - /* Release the reference to the previous message. */ - rds_message_put(rm); - rm = NULL; - } - /* If we're asked to send a cong map update, do so. 
+ /* + * If between sending messages, we can send a pending congestion + * map update. */ - if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { - if (conn->c_trans->xmit_cong_map != NULL) { - conn->c_map_offset = 0; - conn->c_map_bytes = sizeof(struct rds_header) + - RDS_CONG_MAP_BYTES; - continue; - } - + if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { rm = rds_cong_update_alloc(conn); if (IS_ERR(rm)) { ret = PTR_ERR(rm); break; } + rm->data.op_active = 1; conn->c_xmit_rm = rm; } /* - * Grab the next message from the send queue, if there is one. + * If not already working on one, grab the next message. * * c_xmit_rm holds a ref while we're sending this message down * the connction. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it. */ - if (rm == NULL) { + if (!rm) { unsigned int len; spin_lock_irqsave(&conn->c_lock, flags); @@ -223,10 +219,8 @@ int rds_send_xmit(struct rds_connection *conn) spin_unlock_irqrestore(&conn->c_lock, flags); - if (rm == NULL) { - was_empty = 1; + if (!rm) break; - } /* Unfortunately, the way Infiniband deals with * RDMA to a bad MR key is by moving the entire @@ -235,13 +229,12 @@ int rds_send_xmit(struct rds_connection *conn) * connection. * Therefore, we never retransmit messages with RDMA ops. */ - if (rm->m_rdma_op && + if (rm->rdma.op_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { spin_lock_irqsave(&conn->c_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); spin_unlock_irqrestore(&conn->c_lock, flags); - rds_message_put(rm); continue; } @@ -262,23 +255,55 @@ int rds_send_xmit(struct rds_connection *conn) conn->c_xmit_rm = rm; } - /* - * Try and send an rdma message. Let's see if we can - * keep this simple and require that the transport either - * send the whole rdma or none of it. 
- */ - if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) { - ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op); + /* The transport either sends the whole rdma or none of it */ + if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { + rm->m_final_op = &rm->rdma; + ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); if (ret) break; conn->c_xmit_rdma_sent = 1; + + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); + } + + if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { + rm->m_final_op = &rm->atomic; + ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); + if (ret) + break; + conn->c_xmit_atomic_sent = 1; + /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); } - if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || - conn->c_xmit_sg < rm->m_nents) { + /* + * A number of cases require an RDS header to be sent + * even if there is no data. + * We permit 0-byte sends; rds-ping depends on this. + * However, if there are exclusively attached silent ops, + * we skip the hdr/data send, to enable silent operation. 
+ */ + if (rm->data.op_nents == 0) { + int ops_present; + int all_ops_are_silent = 1; + + ops_present = (rm->atomic.op_active || rm->rdma.op_active); + if (rm->atomic.op_active && !rm->atomic.op_silent) + all_ops_are_silent = 0; + if (rm->rdma.op_active && !rm->rdma.op_silent) + all_ops_are_silent = 0; + + if (ops_present && all_ops_are_silent + && !rm->m_rdma_cookie) + rm->data.op_active = 0; + } + + if (rm->data.op_active && !conn->c_xmit_data_sent) { + rm->m_final_op = &rm->data; ret = conn->c_trans->xmit(conn, rm, conn->c_xmit_hdr_off, conn->c_xmit_sg, @@ -294,7 +319,7 @@ int rds_send_xmit(struct rds_connection *conn) ret -= tmp; } - sg = &rm->m_sg[conn->c_xmit_sg]; + sg = &rm->data.op_sg[conn->c_xmit_sg]; while (ret) { tmp = min_t(int, ret, sg->length - conn->c_xmit_data_off); @@ -305,49 +330,63 @@ int rds_send_xmit(struct rds_connection *conn) sg++; conn->c_xmit_sg++; BUG_ON(ret != 0 && - conn->c_xmit_sg == rm->m_nents); + conn->c_xmit_sg == rm->data.op_nents); } } + + if (conn->c_xmit_hdr_off == sizeof(struct rds_header) && + (conn->c_xmit_sg == rm->data.op_nents)) + conn->c_xmit_data_sent = 1; } - } - /* Nuke any messages we decided not to retransmit. */ - if (!list_empty(&to_be_dropped)) - rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); + /* + * A rm will only take multiple times through this loop + * if there is a data op. Thus, if the data is sent (or there was + * none), then we're done with the rm. + */ + if (!rm->data.op_active || conn->c_xmit_data_sent) { + conn->c_xmit_rm = NULL; + conn->c_xmit_sg = 0; + conn->c_xmit_hdr_off = 0; + conn->c_xmit_data_off = 0; + conn->c_xmit_rdma_sent = 0; + conn->c_xmit_atomic_sent = 0; + conn->c_xmit_data_sent = 0; + + rds_message_put(rm); + } + } if (conn->c_trans->xmit_complete) conn->c_trans->xmit_complete(conn); - /* - * We might be racing with another sender who queued a message but - * backed off on noticing that we held the c_send_lock. 
If we check - * for queued messages after dropping the sem then either we'll - * see the queued message or the queuer will get the sem. If we - * notice the queued message then we trigger an immediate retry. - * - * We need to be careful only to do this when we stopped processing - * the send queue because it was empty. It's the only way we - * stop processing the loop when the transport hasn't taken - * responsibility for forward progress. - */ - mutex_unlock(&conn->c_send_lock); + release_in_xmit(conn); - if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { - /* We exhausted the send quota, but there's work left to - * do. Return and (re-)schedule the send worker. - */ - ret = -EAGAIN; + /* Nuke any messages we decided not to retransmit. */ + if (!list_empty(&to_be_dropped)) { + /* irqs on here, so we can put(), unlike above */ + list_for_each_entry(rm, &to_be_dropped, m_conn_item) + rds_message_put(rm); + rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); } - if (ret == 0 && was_empty) { - /* A simple bit test would be way faster than taking the - * spin lock */ - spin_lock_irqsave(&conn->c_lock, flags); + /* + * Other senders can queue a message after we last test the send queue + * but before we clear RDS_IN_XMIT. In that case they'd back off and + * not try and send their newly queued message. We need to check the + * send queue after having cleared RDS_IN_XMIT so that their message + * doesn't get stuck on the send queue. + * + * If the transport cannot continue (i.e ret != 0), then it must + * call us when more room is available, such as from the tx + * completion handler. 
+ */ + if (ret == 0) { + smp_mb(); if (!list_empty(&conn->c_send_queue)) { - rds_stats_inc(s_send_sem_queue_raced); - ret = -EAGAIN; + rds_stats_inc(s_send_lock_queue_raced); + goto restart; } - spin_unlock_irqrestore(&conn->c_lock, flags); } out: return ret; @@ -375,52 +414,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, } /* - * Returns true if there are no messages on the send and retransmit queues - * which have a sequence number greater than or equal to the given sequence - * number. + * This is pretty similar to what happens below in the ACK + * handling code - except that we call here as soon as we get + * the IB send completion on the RDMA op and the accompanying + * message. */ -int rds_send_acked_before(struct rds_connection *conn, u64 seq) +void rds_rdma_send_complete(struct rds_message *rm, int status) { - struct rds_message *rm, *tmp; - int ret = 1; + struct rds_sock *rs = NULL; + struct rm_rdma_op *ro; + struct rds_notifier *notifier; + unsigned long flags; - spin_lock(&conn->c_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags); - list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) - ret = 0; - break; - } + ro = &rm->rdma; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && + ro->op_active && ro->op_notify && ro->op_notifier) { + notifier = ro->op_notifier; + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); - list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) - ret = 0; - break; + notifier->n_status = status; + spin_lock(&rs->rs_lock); + list_add_tail(&notifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + + ro->op_notifier = NULL; } - spin_unlock(&conn->c_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags); - return ret; + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } } +EXPORT_SYMBOL_GPL(rds_rdma_send_complete); /* - * This is pretty similar to what 
happens below in the ACK - * handling code - except that we call here as soon as we get - * the IB send completion on the RDMA op and the accompanying - * message. + * Just like above, except looks at atomic op */ -void rds_rdma_send_complete(struct rds_message *rm, int status) +void rds_atomic_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; - struct rds_rdma_op *ro; + struct rm_atomic_op *ao; struct rds_notifier *notifier; + unsigned long flags; - spin_lock(&rm->m_rs_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags); - ro = rm->m_rdma_op; - if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro && ro->r_notify && ro->r_notifier) { - notifier = ro->r_notifier; + ao = &rm->atomic; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) + && ao->op_active && ao->op_notify && ao->op_notifier) { + notifier = ao->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); @@ -429,17 +476,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) list_add_tail(&notifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); - ro->r_notifier = NULL; + ao->op_notifier = NULL; } - spin_unlock(&rm->m_rs_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } -EXPORT_SYMBOL_GPL(rds_rdma_send_complete); +EXPORT_SYMBOL_GPL(rds_atomic_send_complete); /* * This is the same as rds_rdma_send_complete except we @@ -447,15 +494,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete); * socket, socket lock) and can just move the notifier. 
*/ static inline void -__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) +__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) { - struct rds_rdma_op *ro; + struct rm_rdma_op *ro; + struct rm_atomic_op *ao; + + ro = &rm->rdma; + if (ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_notifier->n_status = status; + list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); + ro->op_notifier = NULL; + } - ro = rm->m_rdma_op; - if (ro && ro->r_notify && ro->r_notifier) { - ro->r_notifier->n_status = status; - list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); - ro->r_notifier = NULL; + ao = &rm->atomic; + if (ao->op_active && ao->op_notify && ao->op_notifier) { + ao->op_notifier->n_status = status; + list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); + ao->op_notifier = NULL; } /* No need to wake the app - caller does this */ @@ -467,7 +522,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status * So speed is not an issue here. 
*/ struct rds_message *rds_send_get_message(struct rds_connection *conn, - struct rds_rdma_op *op) + struct rm_rdma_op *op) { struct rds_message *rm, *tmp, *found = NULL; unsigned long flags; @@ -475,7 +530,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, spin_lock_irqsave(&conn->c_lock, flags); list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (rm->m_rdma_op == op) { + if (&rm->rdma == op) { atomic_inc(&rm->m_refcount); found = rm; goto out; @@ -483,7 +538,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, } list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (rm->m_rdma_op == op) { + if (&rm->rdma == op) { atomic_inc(&rm->m_refcount); found = rm; break; @@ -505,14 +560,15 @@ EXPORT_SYMBOL_GPL(rds_send_get_message); * removing the messages from the 'messages' list regardless of if it found * the messages on the socket list or not. */ -void rds_send_remove_from_sock(struct list_head *messages, int status) +static void rds_send_remove_from_sock(struct list_head *messages, int status) { - unsigned long flags = 0; /* silence gcc :P */ + unsigned long flags; struct rds_sock *rs = NULL; struct rds_message *rm; - local_irq_save(flags); while (!list_empty(messages)) { + int was_on_sock = 0; + rm = list_entry(messages->next, struct rds_message, m_conn_item); list_del_init(&rm->m_conn_item); @@ -527,51 +583,52 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) * while we're messing with it. It does not prevent the * message from being removed from the socket, though. 
*/ - spin_lock(&rm->m_rs_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags); if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) goto unlock_and_drop; if (rs != rm->m_rs) { if (rs) { - spin_unlock(&rs->rs_lock); rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } rs = rm->m_rs; - spin_lock(&rs->rs_lock); sock_hold(rds_rs_to_sk(rs)); } + spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { - struct rds_rdma_op *ro = rm->m_rdma_op; + struct rm_rdma_op *ro = &rm->rdma; struct rds_notifier *notifier; list_del_init(&rm->m_sock_item); rds_send_sndbuf_remove(rs, rm); - if (ro && ro->r_notifier && (status || ro->r_notify)) { - notifier = ro->r_notifier; + if (ro->op_active && ro->op_notifier && + (ro->op_notify || (ro->op_recverr && status))) { + notifier = ro->op_notifier; list_add_tail(&notifier->n_list, &rs->rs_notify_queue); if (!notifier->n_status) notifier->n_status = status; - rm->m_rdma_op->r_notifier = NULL; + rm->rdma.op_notifier = NULL; } - rds_message_put(rm); + was_on_sock = 1; rm->m_rs = NULL; } + spin_unlock(&rs->rs_lock); unlock_and_drop: - spin_unlock(&rm->m_rs_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); + if (was_on_sock) + rds_message_put(rm); } if (rs) { - spin_unlock(&rs->rs_lock); rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } - local_irq_restore(flags); } /* @@ -604,7 +661,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, /* order flag updates with spin locks */ if (!list_empty(&list)) - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&conn->c_lock, flags); @@ -617,9 +674,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; - unsigned long flags, flags2; + unsigned long flags; LIST_HEAD(list); - int wake = 0; /* get all the messages we're dropping under the rs lock */ spin_lock_irqsave(&rs->rs_lock, flags); @@ -629,58 +685,54 @@ void rds_send_drop_to(struct rds_sock 
*rs, struct sockaddr_in *dest) dest->sin_port != rm->m_inc.i_hdr.h_dport)) continue; - wake = 1; list_move(&rm->m_sock_item, &list); rds_send_sndbuf_remove(rs, rm); clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); - - /* If this is a RDMA operation, notify the app. */ - __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); } /* order flag updates with the rs lock */ - if (wake) - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&rs->rs_lock, flags); - if (wake) - rds_wake_sk_sleep(rs); - - conn = NULL; + if (list_empty(&list)) + return; - /* now remove the messages from the conn list as needed */ + /* Remove the messages from the conn */ list_for_each_entry(rm, &list, m_sock_item) { - /* We do this here rather than in the loop above, so that - * we don't have to nest m_rs_lock under rs->rs_lock */ - spin_lock_irqsave(&rm->m_rs_lock, flags2); - rm->m_rs = NULL; - spin_unlock_irqrestore(&rm->m_rs_lock, flags2); + conn = rm->m_inc.i_conn; + + spin_lock_irqsave(&conn->c_lock, flags); /* - * If we see this flag cleared then we're *sure* that someone - * else beat us to removing it from the conn. If we race - * with their flag update we'll get the lock and then really - * see that the flag has been cleared. + * Maybe someone else beat us to removing rm from the conn. + * If we race with their flag update we'll get the lock and + * then really see that the flag has been cleared. 
*/ - if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) + if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { + spin_unlock_irqrestore(&conn->c_lock, flags); continue; - - if (conn != rm->m_inc.i_conn) { - if (conn) - spin_unlock_irqrestore(&conn->c_lock, flags); - conn = rm->m_inc.i_conn; - spin_lock_irqsave(&conn->c_lock, flags); } + list_del_init(&rm->m_conn_item); + spin_unlock_irqrestore(&conn->c_lock, flags); - if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { - list_del_init(&rm->m_conn_item); - rds_message_put(rm); - } + /* + * Couldn't grab m_rs_lock in top loop (lock ordering), + * but we can now. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); + + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + rds_message_put(rm); } - if (conn) - spin_unlock_irqrestore(&conn->c_lock, flags); + rds_wake_sk_sleep(rs); while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); @@ -760,6 +812,63 @@ out: return *queued; } +/* + * rds_message is getting to be quite complicated, and we'd like to allocate + * it all in one go. This figures out how big it needs to be up front. 
+ */ +static int rds_rm_size(struct msghdr *msg, int data_len) +{ + struct cmsghdr *cmsg; + int size = 0; + int cmsg_groups = 0; + int retval; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + cmsg_groups |= 1; + retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); + if (retval < 0) + return retval; + size += retval; + + break; + + case RDS_CMSG_RDMA_DEST: + case RDS_CMSG_RDMA_MAP: + cmsg_groups |= 2; + /* these are valid but do no add any size */ + break; + + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + cmsg_groups |= 1; + size += sizeof(struct scatterlist); + break; + + default: + return -EINVAL; + } + + } + + size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + + /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ + if (cmsg_groups == 3) + return -EINVAL; + + return size; +} + static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr) { @@ -774,7 +883,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, continue; /* As a side effect, RDMA_DEST and RDMA_MAP will set - * rm->m_rdma_cookie and rm->m_rdma_mr. + * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. 
*/ switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: @@ -790,6 +899,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, if (!ret) *allocated_mr = 1; break; + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + ret = rds_cmsg_atomic(rs, rm, cmsg); + break; default: return -EINVAL; @@ -807,7 +922,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); __be32 daddr; __be16 dport; struct rds_message *rm = NULL; @@ -815,12 +930,11 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, int ret = 0; int queued = 0, allocated_mr = 0; int nonblock = msg->msg_flags & MSG_DONTWAIT; - long timeo = sock_rcvtimeo(sk, nonblock); + long timeo = sock_sndtimeo(sk, nonblock); /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { - printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags); ret = -EOPNOTSUPP; goto out; } @@ -847,12 +961,29 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; } - rm = rds_message_copy_from_user(msg->msg_iov, payload_len); - if (IS_ERR(rm)) { - ret = PTR_ERR(rm); - rm = NULL; + /* size of rm including all sgs */ + ret = rds_rm_size(msg, payload_len); + if (ret < 0) goto out; + + rm = rds_message_alloc(ret, GFP_KERNEL); + if (!rm) { + ret = -ENOMEM; + goto out; + } + + /* Attach data to the rm */ + if (payload_len) { + rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + if (!rm->data.op_sg) { + ret = -ENOMEM; + goto out; + } + ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); + if (ret) + goto out; } + rm->data.op_active = 1; 
rm->m_daddr = daddr; @@ -876,26 +1007,27 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (ret) goto out; - if ((rm->m_rdma_cookie || rm->m_rdma_op) && - conn->c_trans->xmit_rdma == NULL) { - if (printk_ratelimit()) - printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", - rm->m_rdma_op, conn->c_trans->xmit_rdma); + if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { + printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", + &rm->rdma, conn->c_trans->xmit_rdma); ret = -EOPNOTSUPP; goto out; } - /* If the connection is down, trigger a connect. We may - * have scheduled a delayed reconnect however - in this case - * we should not interfere. - */ - if (rds_conn_state(conn) == RDS_CONN_DOWN && - !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { + printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", + &rm->atomic, conn->c_trans->xmit_atomic); + ret = -EOPNOTSUPP; + goto out; + } + + rds_conn_connect_if_down(conn); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); - if (ret) + if (ret) { + rs->rs_seen_congestion = 1; goto out; + } while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport, &queued)) { @@ -910,7 +1042,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; } - timeo = wait_event_interruptible_timeout(*sk->sk_sleep, + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport, @@ -933,7 +1065,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rds_stats_inc(s_send_queued); if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - rds_send_worker(&conn->c_send_w.work); + rds_send_xmit(conn); rds_message_put(rm); return payload_len; @@ -961,20 +1093,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) int ret = 0; rm = rds_message_alloc(0, 
GFP_ATOMIC); - if (rm == NULL) { + if (!rm) { ret = -ENOMEM; goto out; } rm->m_daddr = conn->c_faddr; + rm->data.op_active = 1; - /* If the connection is down, trigger a connect. We may - * have scheduled a delayed reconnect however - in this case - * we should not interfere. - */ - if (rds_conn_state(conn) == RDS_CONN_DOWN && - !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + rds_conn_connect_if_down(conn); ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); if (ret) @@ -994,7 +1121,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + rds_message_put(rm); return 0; diff --git a/net/rds/stats.c b/net/rds/stats.c index 7598eb07cfb..73be187d389 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -33,6 +33,7 @@ #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/export.h> #include "rds.h" @@ -57,8 +58,8 @@ static const char *const rds_stat_names[] = { "recv_ping", "send_queue_empty", "send_queue_full", - "send_sem_contention", - "send_sem_queue_raced", + "send_lock_contention", + "send_lock_queue_raced", "send_immediate_retry", "send_delayed_retry", "send_drop_acked", @@ -86,6 +87,7 @@ void rds_stats_info_copy(struct rds_info_iterator *iter, for (i = 0; i < nr; i++) { BUG_ON(strlen(names[i]) >= sizeof(ctr.name)); strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); + ctr.name[sizeof(ctr.name) - 1] = '\0'; ctr.value = values[i]; rds_info_copy(iter, &ctr, sizeof(ctr)); @@ -143,7 +145,7 @@ void rds_stats_exit(void) rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); } -int __init rds_stats_init(void) +int rds_stats_init(void) { rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); return 0; diff --git a/net/rds/sysctl.c 
b/net/rds/sysctl.c index 7829a20325d..c3b0cd43eb5 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -49,7 +49,7 @@ unsigned int rds_sysctl_max_unacked_bytes = (16 << 20); unsigned int rds_sysctl_ping_enable = 1; -static ctl_table rds_sysctl_rds_table[] = { +static struct ctl_table rds_sysctl_rds_table[] = { { .procname = "reconnect_min_delay_ms", .data = &rds_sysctl_reconnect_min_jiffies, @@ -92,26 +92,18 @@ static ctl_table rds_sysctl_rds_table[] = { { } }; -static struct ctl_path rds_sysctl_path[] = { - { .procname = "net", }, - { .procname = "rds", }, - { } -}; - - void rds_sysctl_exit(void) { - if (rds_sysctl_reg_table) - unregister_sysctl_table(rds_sysctl_reg_table); + unregister_net_sysctl_table(rds_sysctl_reg_table); } -int __init rds_sysctl_init(void) +int rds_sysctl_init(void) { rds_sysctl_reconnect_min = msecs_to_jiffies(1); rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; - rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); - if (rds_sysctl_reg_table == NULL) + rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table); + if (!rds_sysctl_reg_table) return -ENOMEM; return 0; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index b5198aee45d..edac9ef2bc8 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -31,7 +31,9 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/in.h> +#include <linux/module.h> #include <net/tcp.h> #include "rds.h" @@ -40,7 +42,7 @@ /* only for info exporting */ static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); static LIST_HEAD(rds_tcp_tc_list); -unsigned int rds_tcp_tc_count; +static unsigned int rds_tcp_tc_count; /* Track rds_tcp_connection structs so they can be cleaned up */ static DEFINE_SPINLOCK(rds_tcp_conn_lock); @@ -199,7 +201,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) struct rds_tcp_connection *tc; tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); - if (tc == NULL) + if (!tc) return -ENOMEM; tc->t_sock = 
NULL; @@ -220,7 +222,13 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) static void rds_tcp_conn_free(void *arg) { struct rds_tcp_connection *tc = arg; + unsigned long flags; rdsdebug("freeing tc %p\n", tc); + + spin_lock_irqsave(&rds_tcp_conn_lock, flags); + list_del(&tc->t_tcp_node); + spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); + kmem_cache_free(rds_tcp_conn_slab, tc); } @@ -242,7 +250,7 @@ static void rds_tcp_destroy_conns(void) } } -void rds_tcp_exit(void) +static void rds_tcp_exit(void) { rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); rds_tcp_listen_stop(); @@ -257,7 +265,6 @@ struct rds_transport rds_tcp_transport = { .laddr_check = rds_tcp_laddr_check, .xmit_prepare = rds_tcp_xmit_prepare, .xmit_complete = rds_tcp_xmit_complete, - .xmit_cong_map = rds_tcp_xmit_cong_map, .xmit = rds_tcp_xmit, .recv = rds_tcp_recv, .conn_alloc = rds_tcp_conn_alloc, @@ -265,7 +272,6 @@ struct rds_transport rds_tcp_transport = { .conn_connect = rds_tcp_conn_connect, .conn_shutdown = rds_tcp_conn_shutdown, .inc_copy_to_user = rds_tcp_inc_copy_to_user, - .inc_purge = rds_tcp_inc_purge, .inc_free = rds_tcp_inc_free, .stats_info_copy = rds_tcp_stats_info_copy, .exit = rds_tcp_exit, @@ -275,14 +281,14 @@ struct rds_transport rds_tcp_transport = { .t_prefer_loopback = 1, }; -int __init rds_tcp_init(void) +static int rds_tcp_init(void) { int ret; rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", sizeof(struct rds_tcp_connection), 0, 0, NULL); - if (rds_tcp_conn_slab == NULL) { + if (!rds_tcp_conn_slab) { ret = -ENOMEM; goto out; } diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 844fa6b9cf5..65637491f72 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -43,8 +43,6 @@ struct rds_tcp_statistics { }; /* tcp.c */ -int __init rds_tcp_init(void); -void rds_tcp_exit(void); void rds_tcp_tune(struct socket *sock); void rds_tcp_nonagle(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); 
@@ -61,16 +59,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn); void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ -int __init rds_tcp_listen_init(void); +int rds_tcp_listen_init(void); void rds_tcp_listen_stop(void); -void rds_tcp_listen_data_ready(struct sock *sk, int bytes); +void rds_tcp_listen_data_ready(struct sock *sk); /* tcp_recv.c */ -int __init rds_tcp_recv_init(void); +int rds_tcp_recv_init(void); void rds_tcp_recv_exit(void); -void rds_tcp_data_ready(struct sock *sk, int bytes); +void rds_tcp_data_ready(struct sock *sk); int rds_tcp_recv(struct rds_connection *conn); -void rds_tcp_inc_purge(struct rds_incoming *inc); void rds_tcp_inc_free(struct rds_incoming *inc); int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); @@ -81,8 +78,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn); int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); void rds_tcp_write_space(struct sock *sk); -int rds_tcp_xmit_cong_map(struct rds_connection *conn, - struct rds_cong_map *map, unsigned long offset); /* tcp_stats.c */ DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 211522f9a9a..a65ee78db0c 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -45,7 +45,7 @@ void rds_tcp_state_change(struct sock *sk) read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; - if (conn == NULL) { + if (!conn) { state_change = sk->sk_state_change; goto out; } @@ -90,8 +90,8 @@ int rds_tcp_conn_connect(struct rds_connection *conn) ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); if (ret) { - rdsdebug("bind failed with %d at address %u.%u.%u.%u\n", - ret, NIPQUAD(conn->c_laddr)); + rdsdebug("bind failed with %d at address %pI4\n", + ret, &conn->c_laddr); goto out; } @@ -108,8 +108,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn) 
O_NONBLOCK); sock = NULL; - rdsdebug("connect to address %u.%u.%u.%u returned %d\n", - NIPQUAD(conn->c_faddr), ret); + rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; @@ -142,7 +141,7 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn) release_sock(sock->sk); sock_release(sock); - }; + } if (tc->t_tinc) { rds_inc_put(&tc->t_tinc->ti_inc); diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 45474a43686..23ab4dcd1d9 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -31,6 +31,7 @@ * */ #include <linux/kernel.h> +#include <linux/gfp.h> #include <linux/in.h> #include <net/tcp.h> @@ -66,9 +67,9 @@ static int rds_tcp_accept_one(struct socket *sock) inet = inet_sk(new_sock->sk); - rdsdebug("accepted tcp %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n", - NIPQUAD(inet->inet_saddr), ntohs(inet->inet_sport), - NIPQUAD(inet->inet_daddr), ntohs(inet->inet_dport)); + rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", + &inet->inet_saddr, ntohs(inet->inet_sport), + &inet->inet_daddr, ntohs(inet->inet_dport)); conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr, &rds_tcp_transport, GFP_KERNEL); @@ -107,15 +108,15 @@ static void rds_tcp_accept_worker(struct work_struct *work) cond_resched(); } -void rds_tcp_listen_data_ready(struct sock *sk, int bytes) +void rds_tcp_listen_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); rdsdebug("listen data ready sk %p\n", sk); read_lock(&sk->sk_callback_lock); ready = sk->sk_user_data; - if (ready == NULL) { /* check for teardown race */ + if (!ready) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } @@ -131,10 +132,10 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) out: read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } -int __init rds_tcp_listen_init(void) +int rds_tcp_listen_init(void) { struct sockaddr_in sin; struct socket *sock = NULL; @@ -144,7 
+145,7 @@ int __init rds_tcp_listen_init(void) if (ret < 0) goto out; - sock->sk->sk_reuse = 1; + sock->sk->sk_reuse = SK_CAN_REUSE; rds_tcp_nonagle(sock); write_lock_bh(&sock->sk->sk_callback_lock); @@ -152,7 +153,7 @@ int __init rds_tcp_listen_init(void) sock->sk->sk_data_ready = rds_tcp_listen_data_ready; write_unlock_bh(&sock->sk->sk_callback_lock); - sin.sin_family = PF_INET, + sin.sin_family = PF_INET; sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); sin.sin_port = (__force u16)htons(RDS_TCP_PORT); @@ -177,7 +178,7 @@ void rds_tcp_listen_stop(void) struct socket *sock = rds_tcp_listen_sock; struct sock *sk; - if (sock == NULL) + if (!sock) return; sk = sock->sk; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index c00dafffbb5..9ae6e0a264e 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -31,6 +31,7 @@ * */ #include <linux/kernel.h> +#include <linux/slab.h> #include <net/tcp.h> #include "rds.h" @@ -38,7 +39,7 @@ static struct kmem_cache *rds_tcp_incoming_slab; -void rds_tcp_inc_purge(struct rds_incoming *inc) +static void rds_tcp_inc_purge(struct rds_incoming *inc) { struct rds_tcp_incoming *tinc; tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); @@ -97,6 +98,7 @@ int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, goto out; } + rds_stats_add(s_copy_to_user, to_copy); size -= to_copy; ret += to_copy; skb_off += to_copy; @@ -167,7 +169,6 @@ static void rds_tcp_cong_recv(struct rds_connection *conn, struct rds_tcp_desc_arg { struct rds_connection *conn; gfp_t gfp; - enum km_type km; }; static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, @@ -188,10 +189,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, * processing. 
*/ while (left) { - if (tinc == NULL) { + if (!tinc) { tinc = kmem_cache_alloc(rds_tcp_incoming_slab, arg->gfp); - if (tinc == NULL) { + if (!tinc) { desc->error = -ENOMEM; goto out; } @@ -227,7 +228,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, if (left && tc->t_tinc_data_rem) { clone = skb_clone(skb, arg->gfp); - if (clone == NULL) { + if (!clone) { desc->error = -ENOMEM; goto out; } @@ -253,7 +254,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, else rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, &tinc->ti_inc, - arg->gfp, arg->km); + arg->gfp); tc->t_tinc_hdr_rem = sizeof(struct rds_header); tc->t_tinc_data_rem = 0; @@ -270,7 +271,7 @@ out: } /* the caller has to hold the sock lock */ -int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km) +static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp) { struct rds_tcp_connection *tc = conn->c_transport_data; struct socket *sock = tc->t_sock; @@ -280,7 +281,6 @@ int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km) /* It's like glib in the kernel! 
*/ arg.conn = conn; arg.gfp = gfp; - arg.km = km; desc.arg.data = &arg; desc.error = 0; desc.count = 1; /* give more than one skb per call */ @@ -308,23 +308,23 @@ int rds_tcp_recv(struct rds_connection *conn) rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); lock_sock(sock->sk); - ret = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0); + ret = rds_tcp_read_sock(conn, GFP_KERNEL); release_sock(sock->sk); return ret; } -void rds_tcp_data_ready(struct sock *sk, int bytes) +void rds_tcp_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); struct rds_connection *conn; struct rds_tcp_connection *tc; - rdsdebug("data ready sk %p bytes %d\n", sk, bytes); + rdsdebug("data ready sk %p\n", sk); read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; - if (conn == NULL) { /* check for teardown race */ + if (!conn) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } @@ -333,19 +333,19 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) ready = tc->t_orig_data_ready; rds_tcp_stats_inc(s_tcp_data_ready_calls); - if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM) + if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM) queue_delayed_work(rds_wq, &conn->c_recv_w, 0); out: read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } -int __init rds_tcp_recv_init(void) +int rds_tcp_recv_init(void) { rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", sizeof(struct rds_tcp_incoming), 0, 0, NULL); - if (rds_tcp_incoming_slab == NULL) + if (!rds_tcp_incoming_slab) return -ENOMEM; return 0; } diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index ab545e0cd5d..53b17ca0dff 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -63,7 +63,7 @@ void rds_tcp_xmit_complete(struct rds_connection *conn) } /* the core send_sem serializes this with other xmit and shutdown */ -int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) +static int 
rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) { struct kvec vec = { .iov_base = data, @@ -77,56 +77,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) } /* the core send_sem serializes this with other xmit and shutdown */ -int rds_tcp_xmit_cong_map(struct rds_connection *conn, - struct rds_cong_map *map, unsigned long offset) -{ - static struct rds_header rds_tcp_map_header = { - .h_flags = RDS_FLAG_CONG_BITMAP, - }; - struct rds_tcp_connection *tc = conn->c_transport_data; - unsigned long i; - int ret; - int copied = 0; - - /* Some problem claims cpu_to_be32(constant) isn't a constant. */ - rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES); - - if (offset < sizeof(struct rds_header)) { - ret = rds_tcp_sendmsg(tc->t_sock, - (void *)&rds_tcp_map_header + offset, - sizeof(struct rds_header) - offset); - if (ret <= 0) - return ret; - offset += ret; - copied = ret; - if (offset < sizeof(struct rds_header)) - return ret; - } - - offset -= sizeof(struct rds_header); - i = offset / PAGE_SIZE; - offset = offset % PAGE_SIZE; - BUG_ON(i >= RDS_CONG_MAP_PAGES); - - do { - ret = tc->t_sock->ops->sendpage(tc->t_sock, - virt_to_page(map->m_page_addrs[i]), - offset, PAGE_SIZE - offset, - MSG_DONTWAIT); - if (ret <= 0) - break; - copied += ret; - offset += ret; - if (offset == PAGE_SIZE) { - offset = 0; - i++; - } - } while (i < RDS_CONG_MAP_PAGES); - - return copied ? 
copied : ret; -} - -/* the core send_sem serializes this with other xmit and shutdown */ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { @@ -143,7 +93,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, rm->m_ack_seq = tc->t_last_sent_nxt + sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); tc->t_last_expected_una = rm->m_ack_seq + 1; @@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, goto out; } - while (sg < rm->m_nents) { + while (sg < rm->data.op_nents) { ret = tc->t_sock->ops->sendpage(tc->t_sock, - sg_page(&rm->m_sg[sg]), - rm->m_sg[sg].offset + off, - rm->m_sg[sg].length - off, + sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, + rm->data.op_sg[sg].length - off, MSG_DONTWAIT|MSG_NOSIGNAL); - rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]), - rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off, + rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, ret); if (ret <= 0) break; off += ret; done += ret; - if (off == rm->m_sg[sg].length) { + if (off == rm->data.op_sg[sg].length) { off = 0; sg++; } @@ -193,9 +143,9 @@ out: rds_tcp_stats_inc(s_tcp_sndbuf_full); ret = 0; } else { - printk(KERN_WARNING "RDS/tcp: send to %u.%u.%u.%u " + printk(KERN_WARNING "RDS/tcp: send to %pI4 " "returned %d, disconnecting and reconnecting\n", - NIPQUAD(conn->c_faddr), ret); + &conn->c_faddr, ret); rds_conn_drop(conn); } } @@ -226,7 +176,7 @@ void rds_tcp_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; - if (conn == NULL) { + if (!conn) { write_space = sk->sk_write_space; goto out; } @@ -240,7 +190,9 @@ void rds_tcp_write_space(struct sock *sk) 
tc->t_last_seen_una = rds_tcp_snd_una(tc); rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + out: read_unlock(&sk->sk_callback_lock); diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c index d5898d03cd6..f8a7954f1f5 100644 --- a/net/rds/tcp_stats.c +++ b/net/rds/tcp_stats.c @@ -40,7 +40,7 @@ DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats) ____cacheline_aligned; -static const char const *rds_tcp_stat_names[] = { +static const char * const rds_tcp_stat_names[] = { "tcp_data_ready_calls", "tcp_write_space_calls", "tcp_sndbuf_full", diff --git a/net/rds/threads.c b/net/rds/threads.c index 00fa10e59af..65eaefcab24 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -32,6 +32,7 @@ */ #include <linux/kernel.h> #include <linux/random.h> +#include <linux/export.h> #include "rds.h" @@ -61,7 +62,7 @@ * * Transition to state DISCONNECTING/DOWN: * - Inside the shutdown worker; synchronizes with xmit path - * through c_send_lock, and with connection management callbacks + * through RDS_IN_XMIT, and with connection management callbacks * via c_cm_lock. * * For receive callbacks, we rely on the underlying transport @@ -110,7 +111,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete); * We should *always* start with a random backoff; otherwise a broken connection * will always take several iterations to be re-established. 
*/ -static void rds_queue_reconnect(struct rds_connection *conn) +void rds_queue_reconnect(struct rds_connection *conn) { unsigned long rand; @@ -156,58 +157,6 @@ void rds_connect_worker(struct work_struct *work) } } -void rds_shutdown_worker(struct work_struct *work) -{ - struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); - - /* shut it down unless it's down already */ - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { - /* - * Quiesce the connection mgmt handlers before we start tearing - * things down. We don't hold the mutex for the entire - * duration of the shutdown operation, else we may be - * deadlocking with the CM handler. Instead, the CM event - * handler is supposed to check for state DISCONNECTING - */ - mutex_lock(&conn->c_cm_lock); - if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && - !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { - rds_conn_error(conn, "shutdown called in state %d\n", - atomic_read(&conn->c_state)); - mutex_unlock(&conn->c_cm_lock); - return; - } - mutex_unlock(&conn->c_cm_lock); - - mutex_lock(&conn->c_send_lock); - conn->c_trans->conn_shutdown(conn); - rds_conn_reset(conn); - mutex_unlock(&conn->c_send_lock); - - if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { - /* This can happen - eg when we're in the middle of tearing - * down the connection, and someone unloads the rds module. - * Quite reproduceable with loopback connections. - * Mostly harmless. - */ - rds_conn_error(conn, - "%s: failed to transition to state DOWN, " - "current state is %d\n", - __func__, - atomic_read(&conn->c_state)); - return; - } - } - - /* Then reconnect if it's still live. - * The passive side of an IB loopback connection is never added - * to the conn hash, so we never trigger a reconnect on this - * conn - the reconnect is always triggered by the active peer. 
*/ - cancel_delayed_work(&conn->c_conn_w); - if (!hlist_unhashed(&conn->c_hash_node)) - rds_queue_reconnect(conn); -} - void rds_send_worker(struct work_struct *work) { struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); @@ -252,15 +201,22 @@ void rds_recv_worker(struct work_struct *work) } } +void rds_shutdown_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); + + rds_conn_shutdown(conn); +} + void rds_threads_exit(void) { destroy_workqueue(rds_wq); } -int __init rds_threads_init(void) +int rds_threads_init(void) { rds_wq = create_singlethread_workqueue("krdsd"); - if (rds_wq == NULL) + if (!rds_wq) return -ENOMEM; return 0; diff --git a/net/rds/transport.c b/net/rds/transport.c index 7e106790135..7f2ac4fec36 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans) } EXPORT_SYMBOL_GPL(rds_trans_unregister); +void rds_trans_put(struct rds_transport *trans) +{ + if (trans && trans->t_owner) + module_put(trans->t_owner); +} + struct rds_transport *rds_trans_get_preferred(__be32 addr) { struct rds_transport *ret = NULL; - int i; + struct rds_transport *trans; + unsigned int i; if (IN_LOOPBACK(ntohl(addr))) return &rds_loop_transport; down_read(&rds_trans_sem); - for (i = 0; i < RDS_TRANS_COUNT; i++) - { - if (transports[i] && (transports[i]->laddr_check(addr) == 0)) { - ret = transports[i]; + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + + if (trans && (trans->laddr_check(addr) == 0) && + (!trans->t_owner || try_module_get(trans->t_owner))) { + ret = trans; break; } } |
