From d1e09ebf426ff34b4b6bbd6212b820edeb992bd4 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 7 Jul 2012 15:13:47 -0700 Subject: RDMA/ocrdma: Fix assignment of max_srq_sge in device query We want to set attr->max_srq_sge to dev->attr.max_srq_sge, not to itself. This was detected by Coverity (CID 709210). Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 2e2e7aecc99..b2f9784beb4 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -97,7 +97,7 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) min(dev->attr.max_ord_per_qp, dev->attr.max_ird_per_qp); attr->max_qp_init_rd_atom = dev->attr.max_ord_per_qp; attr->max_srq = (dev->attr.max_qp - 1); - attr->max_srq_sge = attr->max_srq_sge; + attr->max_srq_sge = dev->attr.max_srq_sge; attr->max_srq_wr = dev->attr.max_rqe; attr->local_ca_ack_delay = dev->attr.local_ca_ack_delay; attr->max_fast_reg_page_list_len = 0; -- cgit v1.2.3-18-g5258 From 5b0ec991c0576c54db75803fbcb0ef5bebfa0828 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 14 Jun 2012 20:31:39 +0000 Subject: RDMA/cma: Bind to a specific address family The RDMA CM uses a single port space for all associated (tcp, udp, etc.) port bindings, regardless of the address family that the user binds to. The result is that if a user binds to AF_INET, but does not specify an IP address, the bind will occur for AF_INET6. This causes an attempt to bind to the same port using AF_INET6 to fail, and connection requests to AF_INET6 will match with the AF_INET listener. Align the behavior with sockets and restrict the bind to AF_INET only. If a user binds to AF_INET6, we bind the port to AF_INET6 and AF_INET depending on the value of bindv6only. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 2e826f9702c..c10c45a0716 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -140,6 +140,7 @@ struct rdma_id_private { u8 srq; u8 tos; u8 reuseaddr; + u8 afonly; }; struct cma_multicast { @@ -1573,6 +1574,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; + dev_id_priv->afonly = id_priv->afonly; ret = rdma_listen(id, id_priv->backlog); if (ret) @@ -2187,22 +2189,24 @@ static int cma_check_port(struct rdma_bind_list *bind_list, struct hlist_node *node; addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - if (cma_any_addr(addr) && !reuseaddr) - return -EADDRNOTAVAIL; - hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { if (id_priv == cur_id) continue; - if ((cur_id->state == RDMA_CM_LISTEN) || - !reuseaddr || !cur_id->reuseaddr) { - cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; - if (cma_any_addr(cur_addr)) - return -EADDRNOTAVAIL; + if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && + cur_id->reuseaddr) + continue; - if (!cma_addr_cmp(addr, cur_addr)) - return -EADDRINUSE; - } + cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; + if (id_priv->afonly && cur_id->afonly && + (addr->sa_family != cur_addr->sa_family)) + continue; + + if (cma_any_addr(addr) || cma_any_addr(cur_addr)) + return -EADDRNOTAVAIL; + + if (!cma_addr_cmp(addr, cur_addr)) + return -EADDRINUSE; } return 0; } @@ -2371,6 +2375,12 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) } memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); + if (addr->sa_family == AF_INET) + id_priv->afonly = 1; +#if IS_ENABLED(CONFIG_IPV6) + else if (addr->sa_family == AF_INET6) + id_priv->afonly = init_net.ipv6.sysctl.bindv6only; +#endif ret = cma_get_port(id_priv); if (ret) goto err2; -- cgit v1.2.3-18-g5258 From 406b6a25f85271397739a7e9f5af1df665b8a0d0 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 14 Jun 2012 20:31:39 +0000 Subject: RDMA/cma: Listen on specific address family The rdma_cm maps IPv4 and IPv6 addresses to the same service ID. This prevents apps from listening only for IPv4 or IPv6 addresses. It also results in an app binding to an IPv4 address receiving connection requests for an IPv6 address. Change this to match socket behavior: restrict listens on IPv4 addresses to only IPv4 addresses, and if a listen is on an IPv6 address, allow it to receive either IPv4 or IPv6 addresses, based on its address family binding. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index c10c45a0716..454e7ea111e 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1298,8 +1298,10 @@ static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, } else { cma_set_ip_ver(cma_data, 4); cma_set_ip_ver(cma_mask, 0xF); - cma_data->dst_addr.ip4.addr = ip4_addr; - cma_mask->dst_addr.ip4.addr = htonl(~0); + if (!cma_any_addr(addr)) { + cma_data->dst_addr.ip4.addr = ip4_addr; + cma_mask->dst_addr.ip4.addr = htonl(~0); + } } break; case AF_INET6: @@ -1313,9 +1315,11 @@ static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, } else { cma_set_ip_ver(cma_data, 6); cma_set_ip_ver(cma_mask, 0xF); - cma_data->dst_addr.ip6 = ip6_addr; - memset(&cma_mask->dst_addr.ip6, 0xFF, - sizeof cma_mask->dst_addr.ip6); + if (!cma_any_addr(addr)) { + cma_data->dst_addr.ip6 = ip6_addr; + memset(&cma_mask->dst_addr.ip6, 0xFF, + sizeof cma_mask->dst_addr.ip6); + } } break; default: @@ -1500,7 +1504,7 @@ static int cma_ib_listen(struct rdma_id_private *id_priv) addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; svc_id = cma_get_service_id(id_priv->id.ps, addr); - if (cma_any_addr(addr)) + if (cma_any_addr(addr) && !id_priv->afonly) ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); else { cma_set_compare_data(id_priv->id.ps, addr, &compare_data); -- cgit v1.2.3-18-g5258 From 68602120e496a31d8e3b36d0bfc7d9d2456fb05c Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 14 Jun 2012 20:31:39 +0000 Subject: RDMA/cma: Allow user to restrict listens to bound address family Provide an option for the user to specify that listens should only accept connections where the incoming address family matches that of the locally bound address. This is used to support the equivalent of IPV6_V6ONLY socket option, which allows an app to only accept connection requests directed to IPv6 addresses. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 35 +++++++++++++++++++++++++++++++---- drivers/infiniband/core/ucma.c | 7 +++++++ 2 files changed, 38 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 454e7ea111e..8734a6af35d 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -99,6 +99,10 @@ struct rdma_bind_list { unsigned short port; }; +enum { + CMA_OPTION_AFONLY, +}; + /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. @@ -137,6 +141,7 @@ struct rdma_id_private { u32 qkey; u32 qp_num; pid_t owner; + u32 options; u8 srq; u8 tos; u8 reuseaddr; @@ -2104,6 +2109,26 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) } EXPORT_SYMBOL(rdma_set_reuseaddr); +int rdma_set_afonly(struct rdma_cm_id *id, int afonly) +{ + struct rdma_id_private *id_priv; + unsigned long flags; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irqsave(&id_priv->lock, flags); + if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { + id_priv->options |= (1 << CMA_OPTION_AFONLY); + id_priv->afonly = afonly; + ret = 0; + } else { + ret = -EINVAL; + } + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_set_afonly); + static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { @@ -2379,12 +2404,14 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) } memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); - if (addr->sa_family == AF_INET) - id_priv->afonly = 1; + if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { + if (addr->sa_family == AF_INET) + id_priv->afonly = 1; #if IS_ENABLED(CONFIG_IPV6) - else if (addr->sa_family == AF_INET6) - id_priv->afonly = init_net.ipv6.sysctl.bindv6only; + else if (addr->sa_family == AF_INET6) + id_priv->afonly = init_net.ipv6.sysctl.bindv6only; #endif + } ret = cma_get_port(id_priv); if (ret) goto err2; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 8002ae642cf..893cb879462 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -909,6 +909,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, } ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0); break; + case RDMA_OPTION_ID_AFONLY: + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } + ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0); + break; default: ret = -ENOSYS; } -- cgit v1.2.3-18-g5258 From f747c34af4f56cc239e04505bd583dd3bdcfe49d Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 5 Jul 2012 14:16:54 -0700 Subject: RDMA/cxgb4: Fix endianness of addition to mpa->private_data_size sparse correctly warns that if mpa->private_data_size is __be16, then doing += on it is wrong, even if we do += htons() -- on a little endian system, carries will go the wrong way. Fix this up by doing the addition in native byte order. Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/cm.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index b18870c455a..51f42061dae 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -548,8 +548,8 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, } if (mpa_rev_to_use == 2) { - mpa->private_data_size += - htons(sizeof(struct mpa_v2_conn_params)); + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); mpa_v2_params.ird = htons((u16)ep->ird); mpa_v2_params.ord = htons((u16)ep->ord); @@ -635,8 +635,8 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { mpa->flags |= MPA_ENHANCED_RDMA_CONN; - mpa->private_data_size += - htons(sizeof(struct mpa_v2_conn_params)); + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); mpa_v2_params.ird = htons(((u16)ep->ird) | (peer2peer ? MPA_V2_PEER2PEER_MODEL : 0)); @@ -715,8 +715,8 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { mpa->flags |= MPA_ENHANCED_RDMA_CONN; - mpa->private_data_size += - htons(sizeof(struct mpa_v2_conn_params)); + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); mpa_v2_params.ird = htons((u16)ep->ird); mpa_v2_params.ord = htons((u16)ep->ord); if (peer2peer && (ep->mpa_attr.p2p_type != -- cgit v1.2.3-18-g5258 From d90f9b3591b3b5fa86178e318008fc1c531a84dc Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 5 Jul 2012 22:39:34 -0700 Subject: IB: Use IS_ENABLED(CONFIG_IPV6) Instead of testing defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 4 ++-- drivers/infiniband/core/cma.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 7 +++---- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 6ef660c1332..28058ae33d3 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -129,7 +129,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) dev_put(dev); break; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6) case AF_INET6: rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { @@ -243,7 +243,7 @@ out: return ret; } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6) static int addr6_resolve(struct sockaddr_in6 *src_in, struct sockaddr_in6 *dst_in, struct rdma_dev_addr *addr) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 2e826f9702c..4d8f592b1e5 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2278,7 +2278,7 @@ static int cma_get_port(struct rdma_id_private *id_priv) static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index b050e629e9c..5a044526e4f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -202,8 +202,7 @@ static int ocrdma_build_sgid_tbl(struct ocrdma_dev *dev) return 0; } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) || \ -defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) +#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_VLAN_8021Q) static int ocrdma_inet6addr_event(struct notifier_block *notifier, unsigned long event, void *ptr) @@ -549,7 +548,7 @@ static struct ocrdma_driver ocrdma_drv = { static void ocrdma_unregister_inet6addr_notifier(void) { -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&ocrdma_inet6addr_notifier); #endif } @@ -558,7 +557,7 @@ static int __init ocrdma_init_module(void) { int status; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6) status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier); if (status) return status; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 014504d8e43..fd3871e3f65 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1376,7 +1376,7 @@ static void ipoib_cm_skb_reap(struct work_struct *work) if (skb->protocol == htons(ETH_P_IP)) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); #endif -- cgit v1.2.3-18-g5258 From b1d8eb5a213640f1be98a90e73a241d15b70045c Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 19 Jun 2012 11:21:35 +0300 Subject: IB/mlx4: Add debug prints Define pr_fmt and add some pr_debug prints. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/mad.c | 19 +++++++++++++++++++ drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/mlx4/mlx4_ib.h | 10 ++++++++++ drivers/infiniband/hw/mlx4/qp.c | 27 +++++++++++++++++++++++++-- 4 files changed, 55 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 259b0670b51..84786a9fb64 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -242,6 +242,25 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, int err; struct ib_port_attr pattr; + if (in_wc && in_wc->qp->qp_num) { + pr_debug("received MAD: slid:%d sqpn:%d " + "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n", + in_wc->slid, in_wc->src_qp, + in_wc->dlid_path_bits, + in_wc->qp->qp_num, + in_wc->wc_flags, + in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, + be16_to_cpu(in_mad->mad_hdr.attr_id)); + if (in_wc->wc_flags & IB_WC_GRH) { + pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->sgid.global.subnet_prefix), + be64_to_cpu(in_grh->sgid.global.interface_id)); + pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->dgid.global.subnet_prefix), + be64_to_cpu(in_grh->dgid.global.interface_id)); + } + } + slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 3530c41fcd1..5266b49c46e 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -50,7 +50,7 @@ #include "mlx4_ib.h" #include "user.h" -#define DRV_NAME "mlx4_ib" +#define DRV_NAME MLX4_IB_DRV_NAME #define DRV_VERSION "1.0" #define DRV_RELDATE "April 4, 2008" diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index ff36655d23d..5f298afaa81 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -44,6 +44,16 @@ #include #include +#define MLX4_IB_DRV_NAME "mlx4_ib" + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__ + +#define mlx4_ib_warn(ibdev, format, arg...) \ + dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg) + enum { MLX4_IB_SQ_MIN_WQE_SHIFT = 6, MLX4_IB_MAX_HEADROOM = 2048 diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 8d4ed24aef9..84b26963c8d 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1335,11 +1335,21 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { + pr_debug("qpn 0x%x: invalid attribute mask specified " + "for transition %d to %d. qp_type %d," + " attr_mask 0x%x\n", + ibqp->qp_num, cur_state, new_state, + ibqp->qp_type, attr_mask); goto out; + } if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) { + pr_debug("qpn 0x%x: invalid port number (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->port_num, cur_state, + new_state, ibqp->qp_type); goto out; } @@ -1350,17 +1360,30 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_PKEY_INDEX) { int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) + if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) { + pr_debug("qpn 0x%x: invalid pkey index (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->pkey_index, cur_state, + new_state, ibqp->qp_type); goto out; + } } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) { + pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_rd_atomic, cur_state, + new_state, ibqp->qp_type); goto out; } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) { + pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_dest_rd_atomic, cur_state, + new_state, ibqp->qp_type); goto out; } -- cgit v1.2.3-18-g5258 From aeab97ed1503bedbe14d1e1c5ab7b90253a67664 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Tue, 19 Jun 2012 11:21:38 +0300 Subject: IB/sa: Add GuidInfoRecord query support This query is needed for SRIOV alias GUID support. The query is implemented per the IB Spec definition in section 15.2.5.18 (GuidInfoRecord). Signed-off-by: Erez Shitrit Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/core/sa_query.c | 133 +++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index fbbfa24cf57..a8905abc56e 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -94,6 +94,12 @@ struct ib_sa_path_query { struct ib_sa_query sa_query; }; +struct ib_sa_guidinfo_query { + void (*callback)(int, struct ib_sa_guidinfo_rec *, void *); + void *context; + struct ib_sa_query sa_query; +}; + struct ib_sa_mcmember_query { void (*callback)(int, struct ib_sa_mcmember_rec *, void *); void *context; @@ -347,6 +353,34 @@ static const struct ib_field service_rec_table[] = { .size_bits = 2*64 }, }; +#define GUIDINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ + .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ + .field_name = "sa_guidinfo_rec:" #field + +static const struct ib_field guidinfo_rec_table[] = { + { GUIDINFO_REC_FIELD(lid), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { GUIDINFO_REC_FIELD(block_num), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 8 }, + { GUIDINFO_REC_FIELD(res1), + .offset_words = 0, + .offset_bits = 24, + .size_bits = 8 }, + { GUIDINFO_REC_FIELD(res2), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { GUIDINFO_REC_FIELD(guid_info_list), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 512 }, +}; + static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); @@ -945,6 +979,105 @@ err1: return ret; } +/* Support GuidInfoRecord */ +static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_guidinfo_query *query = + container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); + + if (mad) { + struct ib_sa_guidinfo_rec rec; + + ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query)); +} + +int ib_sa_guid_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_guidinfo_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + if (method != IB_MGMT_METHOD_GET && + method != IB_MGMT_METHOD_SET && + method != IB_SA_METHOD_DELETE) { + return -EINVAL; + } + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kmalloc(sizeof *query, gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL; + query->sa_query.release = ib_sa_guidinfo_rec_release; + + mad->mad_hdr.method = method; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec, + mad->data); + + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_guid_info_rec_query); + static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { -- cgit v1.2.3-18-g5258 From 3045f0920367e625bbec7d66fadb444e673515af Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 19 Jun 2012 11:21:39 +0300 Subject: IB/core: Move CM_xxx_ATTR_ID macros from cm_msgs.h to ib_cm.h These macros will be reused by the mlx4 SRIOV-IB CM paravirtualization code, and there is no reason to have them declared both in the IB core in the mlx4 IB driver. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/core/cm_msgs.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h index 7da9b210234..be068f47e47 100644 --- a/drivers/infiniband/core/cm_msgs.h +++ b/drivers/infiniband/core/cm_msgs.h @@ -44,18 +44,6 @@ #define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */ -#define CM_REQ_ATTR_ID cpu_to_be16(0x0010) -#define CM_MRA_ATTR_ID cpu_to_be16(0x0011) -#define CM_REJ_ATTR_ID cpu_to_be16(0x0012) -#define CM_REP_ATTR_ID cpu_to_be16(0x0013) -#define CM_RTU_ATTR_ID cpu_to_be16(0x0014) -#define CM_DREQ_ATTR_ID cpu_to_be16(0x0015) -#define CM_DREP_ATTR_ID cpu_to_be16(0x0016) -#define CM_SIDR_REQ_ATTR_ID cpu_to_be16(0x0017) -#define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018) -#define CM_LAP_ATTR_ID cpu_to_be16(0x0019) -#define CM_APR_ATTR_ID cpu_to_be16(0x001A) - enum cm_msg_sequence { CM_MSG_SEQUENCE_REQ, CM_MSG_SEQUENCE_LAP, -- cgit v1.2.3-18-g5258 From 354dff1bd8ccd41b6e8421226d586d35e7fb8920 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 27 Jun 2012 18:33:05 -0400 Subject: IB/qib: Fix UC MR refs for immediate operations An MR reference leak exists when handling UC RDMA writes with immediate data because we manipulate the reference counts as if the operation had been a send. This patch moves the last_imm label so that the RDMA write operations with immediate data converge at the cq building code. The copy/mr deref code is now done correctly prior to the branch to last_imm. Reviewed-by: Edward Mascarenhas Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_uc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index ce7387ff5d9..70b4cb710f9 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -403,7 +403,6 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto rewind; wc.opcode = IB_WC_RECV; -last_imm: qib_copy_sge(&qp->r_sge, data, tlen, 0); while (qp->s_rdma_read_sge.num_sge) { atomic_dec(&qp->s_rdma_read_sge.sge.mr->refcount); @@ -411,6 +410,7 @@ last_imm: qp->s_rdma_read_sge.sge = *qp->s_rdma_read_sge.sg_list++; } +last_imm: wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; wc.qp = &qp->ibqp; @@ -509,6 +509,12 @@ rdma_last_imm: } wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + qib_copy_sge(&qp->r_sge, data, tlen, 1); + while (qp->r_sge.num_sge) { + atomic_dec(&qp->r_sge.sge.mr->refcount); + if (--qp->r_sge.num_sge) + qp->r_sge.sge = *qp->r_sge.sg_list++; + } goto last_imm; case OP(RDMA_WRITE_LAST): -- cgit v1.2.3-18-g5258 From 6a82649f217023863d6b1740017e6c3dd6685327 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 27 Jun 2012 18:33:12 -0400 Subject: IB/qib: Avoid returning EBUSY from MR deregister A timing issue can occur where qib_mr_dereg can return -EBUSY if the MR use count is not zero. This can occur if the MR is de-registered while RDMA read response packets are being progressed from the SDMA ring. The suspicion is that the peer sent an RDMA read request, which has already been copied across to the peer. The peer sees the completion of his request and then communicates to the responder that the MR is not needed any longer. The responder tries to de-register the MR, catching some responses remaining in the SDMA ring holding the MR use count. The code now uses a get/put paradigm to track MR use counts and coordinates with the MR de-registration process using a completion when the count has reached zero. A timeout on the delay is in place to catch other EBUSY issues. The reference count protocol is as follows: - The return to the user counts as 1 - A reference from the lk_table or the qib_ibdev counts as 1. - Transient I/O operations increase/decrease as necessary A lot of code duplication has been folded into the new routines init_qib_mregion() and deinit_qib_mregion(). Additionally, explicit initialization of fields to zero is now handled by kzalloc(). Also, duplicated code 'while.*num_sge' that decrements reference counts have been consolidated in qib_put_ss(). Reviewed-by: Ramkrishna Vepa Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_keys.c | 84 +++++++----- drivers/infiniband/hw/qib/qib_mr.c | 242 +++++++++++++++++++--------------- drivers/infiniband/hw/qib/qib_qp.c | 21 +-- drivers/infiniband/hw/qib/qib_rc.c | 24 ++-- drivers/infiniband/hw/qib/qib_ruc.c | 14 +- drivers/infiniband/hw/qib/qib_uc.c | 33 +---- drivers/infiniband/hw/qib/qib_ud.c | 12 +- drivers/infiniband/hw/qib/qib_verbs.c | 10 +- drivers/infiniband/hw/qib/qib_verbs.h | 28 +++- 9 files changed, 244 insertions(+), 224 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c index 8fd19a47df0..8b5ee3aa8e3 100644 --- a/drivers/infiniband/hw/qib/qib_keys.c +++ b/drivers/infiniband/hw/qib/qib_keys.c @@ -35,21 +35,40 @@ /** * qib_alloc_lkey - allocate an lkey - * @rkt: lkey table in which to allocate the lkey * @mr: memory region that this lkey protects + * @dma_region: 0->normal key, 1->restricted DMA key + * + * Returns 0 if successful, otherwise returns -errno. + * + * Increments mr reference count and sets published + * as required. + * + * Sets the lkey field mr for non-dma regions. * - * Returns 1 if successful, otherwise returns 0. */ -int qib_alloc_lkey(struct qib_lkey_table *rkt, struct qib_mregion *mr) +int qib_alloc_lkey(struct qib_mregion *mr, int dma_region) { unsigned long flags; u32 r; u32 n; - int ret; + int ret = 0; + struct qib_ibdev *dev = to_idev(mr->pd->device); + struct qib_lkey_table *rkt = &dev->lk_table; spin_lock_irqsave(&rkt->lock, flags); + /* special case for dma_mr lkey == 0 */ + if (dma_region) { + /* should the dma_mr be relative to the pd? */ + if (!dev->dma_mr) { + qib_get_mr(mr); + dev->dma_mr = mr; + mr->lkey_published = 1; + } + goto success; + } + /* Find the next available LKEY */ r = rkt->next; n = r; @@ -57,11 +76,8 @@ int qib_alloc_lkey(struct qib_lkey_table *rkt, struct qib_mregion *mr) if (rkt->table[r] == NULL) break; r = (r + 1) & (rkt->max - 1); - if (r == n) { - spin_unlock_irqrestore(&rkt->lock, flags); - ret = 0; + if (r == n) goto bail; - } } rkt->next = (r + 1) & (rkt->max - 1); /* @@ -76,46 +92,50 @@ int qib_alloc_lkey(struct qib_lkey_table *rkt, struct qib_mregion *mr) mr->lkey |= 1 << 8; rkt->gen++; } + qib_get_mr(mr); rkt->table[r] = mr; + mr->lkey_published = 1; +success: spin_unlock_irqrestore(&rkt->lock, flags); - - ret = 1; - -bail: +out: return ret; +bail: + spin_unlock_irqrestore(&rkt->lock, flags); + ret = -ENOMEM; + goto out; } /** * qib_free_lkey - free an lkey - * @rkt: table from which to free the lkey - * @lkey: lkey id to free + * @mr: mr to free from tables */ -int qib_free_lkey(struct qib_ibdev *dev, struct qib_mregion *mr) +void qib_free_lkey(struct qib_mregion *mr) { unsigned long flags; u32 lkey = mr->lkey; u32 r; - int ret; + struct qib_ibdev *dev = to_idev(mr->pd->device); + struct qib_lkey_table *rkt = &dev->lk_table; + + spin_lock_irqsave(&rkt->lock, flags); + if (!mr->lkey_published) + goto out; + mr->lkey_published = 0; + spin_lock_irqsave(&dev->lk_table.lock, flags); if (lkey == 0) { if (dev->dma_mr && dev->dma_mr == mr) { - ret = atomic_read(&dev->dma_mr->refcount); - if (!ret) - dev->dma_mr = NULL; - } else - ret = 0; + qib_put_mr(dev->dma_mr); + dev->dma_mr = NULL; + } } else { r = lkey >> (32 - ib_qib_lkey_table_size); - ret = atomic_read(&dev->lk_table.table[r]->refcount); - if (!ret) - dev->lk_table.table[r] = NULL; + qib_put_mr(dev->dma_mr); + rkt->table[r] = NULL; } +out: spin_unlock_irqrestore(&dev->lk_table.lock, flags); - - if (ret) - ret = -EBUSY; - return ret; } /** @@ -150,7 +170,7 @@ int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, goto bail; if (!dev->dma_mr) goto bail; - atomic_inc(&dev->dma_mr->refcount); + qib_get_mr(dev->dma_mr); spin_unlock_irqrestore(&rkt->lock, flags); isge->mr = dev->dma_mr; @@ -171,7 +191,7 @@ int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, off + sge->length > mr->length || (mr->access_flags & acc) != acc)) goto bail; - atomic_inc(&mr->refcount); + qib_get_mr(mr); spin_unlock_irqrestore(&rkt->lock, flags); off += mr->offset; @@ -245,7 +265,7 @@ int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, goto bail; if (!dev->dma_mr) goto bail; - atomic_inc(&dev->dma_mr->refcount); + qib_get_mr(dev->dma_mr); spin_unlock_irqrestore(&rkt->lock, flags); sge->mr = dev->dma_mr; @@ -265,7 +285,7 @@ int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, if (unlikely(vaddr < mr->iova || off + len > mr->length || (mr->access_flags & acc) == 0)) goto bail; - atomic_inc(&mr->refcount); + qib_get_mr(mr); spin_unlock_irqrestore(&rkt->lock, flags); off += mr->offset; diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c index 08944e2ee33..6a2028a56e3 100644 --- a/drivers/infiniband/hw/qib/qib_mr.c +++ b/drivers/infiniband/hw/qib/qib_mr.c @@ -47,6 +47,43 @@ static inline struct qib_fmr *to_ifmr(struct ib_fmr *ibfmr) return container_of(ibfmr, struct qib_fmr, ibfmr); } +static int init_qib_mregion(struct qib_mregion *mr, struct ib_pd *pd, + int count) +{ + int m, i = 0; + int rval = 0; + + m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ; + for (; i < m; i++) { + mr->map[i] = kzalloc(sizeof *mr->map[0], GFP_KERNEL); + if (!mr->map[i]) + goto bail; + } + mr->mapsz = m; + init_completion(&mr->comp); + /* count returning the ptr to user */ + atomic_set(&mr->refcount, 1); + mr->pd = pd; + mr->max_segs = count; +out: + return rval; +bail: + while (i) + kfree(mr->map[--i]); + rval = -ENOMEM; + goto out; +} + +static void deinit_qib_mregion(struct qib_mregion *mr) +{ + int i = mr->mapsz; + + mr->mapsz = 0; + while (i) + kfree(mr->map[--i]); +} + + /** * qib_get_dma_mr - get a DMA memory region * @pd: protection domain for this memory region @@ -58,10 +95,9 @@ static inline struct qib_fmr *to_ifmr(struct ib_fmr *ibfmr) */ struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc) { - struct qib_ibdev *dev = to_idev(pd->device); - struct qib_mr *mr; + struct qib_mr *mr = NULL; struct ib_mr *ret; - unsigned long flags; + int rval; if (to_ipd(pd)->user) { ret = ERR_PTR(-EPERM); @@ -74,61 +110,64 @@ struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc) goto bail; } - mr->mr.access_flags = acc; - atomic_set(&mr->mr.refcount, 0); + rval = init_qib_mregion(&mr->mr, pd, 0); + if (rval) { + ret = ERR_PTR(rval); + goto bail; + } - spin_lock_irqsave(&dev->lk_table.lock, flags); - if (!dev->dma_mr) - dev->dma_mr = &mr->mr; - spin_unlock_irqrestore(&dev->lk_table.lock, flags); + rval = qib_alloc_lkey(&mr->mr, 1); + if (rval) { + ret = ERR_PTR(rval); + goto bail_mregion; + } + + mr->mr.access_flags = acc; ret = &mr->ibmr; +done: + return ret; +bail_mregion: + deinit_qib_mregion(&mr->mr); bail: - return ret; + kfree(mr); + goto done; } -static struct qib_mr *alloc_mr(int count, struct qib_lkey_table *lk_table) +static struct qib_mr *alloc_mr(int count, struct ib_pd *pd) { struct qib_mr *mr; - int m, i = 0; + int rval = -ENOMEM; + int m; /* Allocate struct plus pointers to first level page tables. */ m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ; - mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL); + mr = kzalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL); if (!mr) - goto done; - - /* Allocate first level page tables. */ - for (; i < m; i++) { - mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL); - if (!mr->mr.map[i]) - goto bail; - } - mr->mr.mapsz = m; - mr->mr.page_shift = 0; - mr->mr.max_segs = count; + goto bail; + rval = init_qib_mregion(&mr->mr, pd, count); + if (rval) + goto bail; /* * ib_reg_phys_mr() will initialize mr->ibmr except for * lkey and rkey. */ - if (!qib_alloc_lkey(lk_table, &mr->mr)) - goto bail; + rval = qib_alloc_lkey(&mr->mr, 0); + if (rval) + goto bail_mregion; mr->ibmr.lkey = mr->mr.lkey; mr->ibmr.rkey = mr->mr.lkey; +done: + return mr; - atomic_set(&mr->mr.refcount, 0); - goto done; - +bail_mregion: + deinit_qib_mregion(&mr->mr); bail: - while (i) - kfree(mr->mr.map[--i]); kfree(mr); - mr = NULL; - -done: - return mr; + mr = ERR_PTR(rval); + goto done; } /** @@ -148,19 +187,15 @@ struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd, int n, m, i; struct ib_mr *ret; - mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table); - if (mr == NULL) { - ret = ERR_PTR(-ENOMEM); + mr = alloc_mr(num_phys_buf, pd); + if (IS_ERR(mr)) { + ret = (struct ib_mr *)mr; goto bail; } - mr->mr.pd = pd; mr->mr.user_base = *iova_start; mr->mr.iova = *iova_start; - mr->mr.length = 0; - mr->mr.offset = 0; mr->mr.access_flags = acc; - mr->umem = NULL; m = 0; n = 0; @@ -186,7 +221,6 @@ bail: * @pd: protection domain for this memory region * @start: starting userspace address * @length: length of region to register - * @virt_addr: virtual address to use (from HCA's point of view) * @mr_access_flags: access flags for this memory region * @udata: unused by the QLogic_IB driver * @@ -216,14 +250,13 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, list_for_each_entry(chunk, &umem->chunk_list, list) n += chunk->nents; - mr = alloc_mr(n, &to_idev(pd->device)->lk_table); - if (!mr) { - ret = ERR_PTR(-ENOMEM); + mr = alloc_mr(n, pd); + if (IS_ERR(mr)) { + ret = (struct ib_mr *)mr; ib_umem_release(umem); goto bail; } - mr->mr.pd = pd; mr->mr.user_base = start; mr->mr.iova = virt_addr; mr->mr.length = length; @@ -271,21 +304,25 @@ bail: int qib_dereg_mr(struct ib_mr *ibmr) { struct qib_mr *mr = to_imr(ibmr); - struct qib_ibdev *dev = to_idev(ibmr->device); - int ret; - int i; - - ret = qib_free_lkey(dev, &mr->mr); - if (ret) - return ret; - - i = mr->mr.mapsz; - while (i) - kfree(mr->mr.map[--i]); + int ret = 0; + unsigned long timeout; + + qib_free_lkey(&mr->mr); + + qib_put_mr(&mr->mr); /* will set completion if last */ + timeout = wait_for_completion_timeout(&mr->mr.comp, + 5 * HZ); + if (!timeout) { + qib_get_mr(&mr->mr); + ret = -EBUSY; + goto out; + } + deinit_qib_mregion(&mr->mr); if (mr->umem) ib_umem_release(mr->umem); kfree(mr); - return 0; +out: + return ret; } /* @@ -298,17 +335,9 @@ struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) { struct qib_mr *mr; - mr = alloc_mr(max_page_list_len, &to_idev(pd->device)->lk_table); - if (mr == NULL) - return ERR_PTR(-ENOMEM); - - mr->mr.pd = pd; - mr->mr.user_base = 0; - mr->mr.iova = 0; - mr->mr.length = 0; - mr->mr.offset = 0; - mr->mr.access_flags = 0; - mr->umem = NULL; + mr = alloc_mr(max_page_list_len, pd); + if (IS_ERR(mr)) + return (struct ib_mr *)mr; return &mr->ibmr; } @@ -322,11 +351,11 @@ qib_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len) if (size > PAGE_SIZE) return ERR_PTR(-EINVAL); - pl = kmalloc(sizeof *pl, GFP_KERNEL); + pl = kzalloc(sizeof *pl, GFP_KERNEL); if (!pl) return ERR_PTR(-ENOMEM); - pl->page_list = kmalloc(size, GFP_KERNEL); + pl->page_list = kzalloc(size, GFP_KERNEL); if (!pl->page_list) goto err_free; @@ -355,57 +384,47 @@ struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr) { struct qib_fmr *fmr; - int m, i = 0; + int m; struct ib_fmr *ret; + int rval = -ENOMEM; /* Allocate struct plus pointers to first level page tables. */ m = (fmr_attr->max_pages + QIB_SEGSZ - 1) / QIB_SEGSZ; - fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL); + fmr = kzalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL); if (!fmr) goto bail; - /* Allocate first level page tables. */ - for (; i < m; i++) { - fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0], - GFP_KERNEL); - if (!fmr->mr.map[i]) - goto bail; - } - fmr->mr.mapsz = m; + rval = init_qib_mregion(&fmr->mr, pd, fmr_attr->max_pages); + if (rval) + goto bail; /* * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & * rkey. */ - if (!qib_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr)) - goto bail; + rval = qib_alloc_lkey(&fmr->mr, 0); + if (rval) + goto bail_mregion; fmr->ibfmr.rkey = fmr->mr.lkey; fmr->ibfmr.lkey = fmr->mr.lkey; /* * Resources are allocated but no valid mapping (RKEY can't be * used). */ - fmr->mr.pd = pd; - fmr->mr.user_base = 0; - fmr->mr.iova = 0; - fmr->mr.length = 0; - fmr->mr.offset = 0; fmr->mr.access_flags = mr_access_flags; fmr->mr.max_segs = fmr_attr->max_pages; fmr->mr.page_shift = fmr_attr->page_shift; - atomic_set(&fmr->mr.refcount, 0); ret = &fmr->ibfmr; - goto done; +done: + return ret; +bail_mregion: + deinit_qib_mregion(&fmr->mr); bail: - while (i) - kfree(fmr->mr.map[--i]); kfree(fmr); - ret = ERR_PTR(-ENOMEM); - -done: - return ret; + ret = ERR_PTR(rval); + goto done; } /** @@ -428,7 +447,8 @@ int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, u32 ps; int ret; - if (atomic_read(&fmr->mr.refcount)) + i = atomic_read(&fmr->mr.refcount); + if (i > 2) return -EBUSY; if (list_len > fmr->mr.max_segs) { @@ -490,16 +510,20 @@ int qib_unmap_fmr(struct list_head *fmr_list) int qib_dealloc_fmr(struct ib_fmr *ibfmr) { struct qib_fmr *fmr = to_ifmr(ibfmr); - int ret; - int i; - - ret = qib_free_lkey(to_idev(ibfmr->device), &fmr->mr); - if (ret) - return ret; - - i = fmr->mr.mapsz; - while (i) - kfree(fmr->mr.map[--i]); + int ret = 0; + unsigned long timeout; + + qib_free_lkey(&fmr->mr); + qib_put_mr(&fmr->mr); /* will set completion if last */ + timeout = wait_for_completion_timeout(&fmr->mr.comp, + 5 * HZ); + if (!timeout) { + qib_get_mr(&fmr->mr); + ret = -EBUSY; + goto out; + } + deinit_qib_mregion(&fmr->mr); kfree(fmr); - return 0; +out: + return ret; } diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 1ce56b51ab1..693041b076f 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -406,18 +406,9 @@ static void clear_mr_refs(struct qib_qp *qp, int clr_sends) unsigned n; if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags)) - while (qp->s_rdma_read_sge.num_sge) { - atomic_dec(&qp->s_rdma_read_sge.sge.mr->refcount); - if (--qp->s_rdma_read_sge.num_sge) - qp->s_rdma_read_sge.sge = - *qp->s_rdma_read_sge.sg_list++; - } + qib_put_ss(&qp->s_rdma_read_sge); - while (qp->r_sge.num_sge) { - atomic_dec(&qp->r_sge.sge.mr->refcount); - if (--qp->r_sge.num_sge) - qp->r_sge.sge = *qp->r_sge.sg_list++; - } + qib_put_ss(&qp->r_sge); if (clr_sends) { while (qp->s_last != qp->s_head) { @@ -427,7 +418,7 @@ static void clear_mr_refs(struct qib_qp *qp, int clr_sends) for (i = 0; i < wqe->wr.num_sge; i++) { struct qib_sge *sge = &wqe->sg_list[i]; - atomic_dec(&sge->mr->refcount); + qib_put_mr(sge->mr); } if (qp->ibqp.qp_type == IB_QPT_UD || qp->ibqp.qp_type == IB_QPT_SMI || @@ -437,7 +428,7 @@ static void clear_mr_refs(struct qib_qp *qp, int clr_sends) qp->s_last = 0; } if (qp->s_rdma_mr) { - atomic_dec(&qp->s_rdma_mr->refcount); + qib_put_mr(qp->s_rdma_mr); qp->s_rdma_mr = NULL; } } @@ -450,7 +441,7 @@ static void clear_mr_refs(struct qib_qp *qp, int clr_sends) if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && e->rdma_sge.mr) { - atomic_dec(&e->rdma_sge.mr->refcount); + qib_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } } @@ -495,7 +486,7 @@ int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err) if (!(qp->s_flags & QIB_S_BUSY)) { qp->s_hdrwords = 0; if (qp->s_rdma_mr) { - atomic_dec(&qp->s_rdma_mr->refcount); + qib_put_mr(qp->s_rdma_mr); qp->s_rdma_mr = NULL; } if (qp->s_tx) { diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index b641416148e..3ab341320ea 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -95,7 +95,7 @@ static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp, case OP(RDMA_READ_RESPONSE_ONLY): e = &qp->s_ack_queue[qp->s_tail_ack_queue]; if (e->rdma_sge.mr) { - atomic_dec(&e->rdma_sge.mr->refcount); + qib_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } /* FALLTHROUGH */ @@ -133,7 +133,7 @@ static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp, /* Copy SGE state in case we need to resend */ qp->s_rdma_mr = e->rdma_sge.mr; if (qp->s_rdma_mr) - atomic_inc(&qp->s_rdma_mr->refcount); + qib_get_mr(qp->s_rdma_mr); qp->s_ack_rdma_sge.sge = e->rdma_sge; qp->s_ack_rdma_sge.num_sge = 1; qp->s_cur_sge = &qp->s_ack_rdma_sge; @@ -172,7 +172,7 @@ static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp, qp->s_cur_sge = &qp->s_ack_rdma_sge; qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr; if (qp->s_rdma_mr) - atomic_inc(&qp->s_rdma_mr->refcount); + qib_get_mr(qp->s_rdma_mr); len = qp->s_ack_rdma_sge.sge.sge_length; if (len > pmtu) len = pmtu; @@ -1012,7 +1012,7 @@ void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr) for (i = 0; i < wqe->wr.num_sge; i++) { struct qib_sge *sge = &wqe->sg_list[i]; - atomic_dec(&sge->mr->refcount); + qib_put_mr(sge->mr); } /* Post a send completion queue entry if requested. */ if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) || @@ -1068,7 +1068,7 @@ static struct qib_swqe *do_rc_completion(struct qib_qp *qp, for (i = 0; i < wqe->wr.num_sge; i++) { struct qib_sge *sge = &wqe->sg_list[i]; - atomic_dec(&sge->mr->refcount); + qib_put_mr(sge->mr); } /* Post a send completion queue entry if requested. */ if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) || @@ -1730,7 +1730,7 @@ static int qib_rc_rcv_error(struct qib_other_headers *ohdr, if (unlikely(offset + len != e->rdma_sge.sge_length)) goto unlock_done; if (e->rdma_sge.mr) { - atomic_dec(&e->rdma_sge.mr->refcount); + qib_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } if (len != 0) { @@ -2024,11 +2024,7 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; qib_copy_sge(&qp->r_sge, data, tlen, 1); - while (qp->r_sge.num_sge) { - atomic_dec(&qp->r_sge.sge.mr->refcount); - if (--qp->r_sge.num_sge) - qp->r_sge.sge = *qp->r_sge.sg_list++; - } + qib_put_ss(&qp->r_sge); qp->r_msn++; if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) break; @@ -2116,7 +2112,7 @@ send_last: } e = &qp->s_ack_queue[qp->r_head_ack_queue]; if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { - atomic_dec(&e->rdma_sge.mr->refcount); + qib_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } reth = &ohdr->u.rc.reth; @@ -2188,7 +2184,7 @@ send_last: } e = &qp->s_ack_queue[qp->r_head_ack_queue]; if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { - atomic_dec(&e->rdma_sge.mr->refcount); + qib_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } ateth = &ohdr->u.atomic_eth; @@ -2210,7 +2206,7 @@ send_last: (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, be64_to_cpu(ateth->compare_data), sdata); - atomic_dec(&qp->r_sge.sge.mr->refcount); + qib_put_mr(qp->r_sge.sge.mr); qp->r_sge.num_sge = 0; e->opcode = opcode; e->sent = 0; diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index c0ee7e095d8..357b6cfcd46 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -110,7 +110,7 @@ bad_lkey: while (j) { struct qib_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; - atomic_dec(&sge->mr->refcount); + qib_put_mr(sge->mr); } ss->num_sge = 0; memset(&wc, 0, sizeof(wc)); @@ -501,7 +501,7 @@ again: (u64) atomic64_add_return(sdata, maddr) - sdata : (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, sdata, wqe->wr.wr.atomic.swap); - atomic_dec(&qp->r_sge.sge.mr->refcount); + qib_put_mr(qp->r_sge.sge.mr); qp->r_sge.num_sge = 0; goto send_comp; @@ -525,7 +525,7 @@ again: sge->sge_length -= len; if (sge->sge_length == 0) { if (!release) - atomic_dec(&sge->mr->refcount); + qib_put_mr(sge->mr); if (--sqp->s_sge.num_sge) *sge = *sqp->s_sge.sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { @@ -542,11 +542,7 @@ again: sqp->s_len -= len; } if (release) - while (qp->r_sge.num_sge) { - atomic_dec(&qp->r_sge.sge.mr->refcount); - if (--qp->r_sge.num_sge) - qp->r_sge.sge = *qp->r_sge.sg_list++; - } + qib_put_ss(&qp->r_sge); if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) goto send_comp; @@ -782,7 +778,7 @@ void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe, for (i = 0; i < wqe->wr.num_sge; i++) { struct qib_sge *sge = &wqe->sg_list[i]; - atomic_dec(&sge->mr->refcount); + qib_put_mr(sge->mr); } if (qp->ibqp.qp_type == IB_QPT_UD || qp->ibqp.qp_type == IB_QPT_SMI || diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 70b4cb710f9..aa3a8035bb6 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -281,11 +281,7 @@ inv: set_bit(QIB_R_REWIND_SGE, &qp->r_aflags); qp->r_sge.num_sge = 0; } else - while (qp->r_sge.num_sge) { - atomic_dec(&qp->r_sge.sge.mr->refcount); - if (--qp->r_sge.num_sge) - qp->r_sge.sge = *qp->r_sge.sg_list++; - } + qib_put_ss(&qp->r_sge); qp->r_state = OP(SEND_LAST); switch (opcode) { case OP(SEND_FIRST): @@ -404,12 +400,7 @@ send_last: goto rewind; wc.opcode = IB_WC_RECV; qib_copy_sge(&qp->r_sge, data, tlen, 0); - while (qp->s_rdma_read_sge.num_sge) { - atomic_dec(&qp->s_rdma_read_sge.sge.mr->refcount); - if (--qp->s_rdma_read_sge.num_sge) - qp->s_rdma_read_sge.sge = - *qp->s_rdma_read_sge.sg_list++; - } + qib_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; @@ -493,13 +484,7 @@ rdma_last_imm: if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags)) - while (qp->s_rdma_read_sge.num_sge) { - atomic_dec(&qp->s_rdma_read_sge.sge.mr-> - refcount); - if (--qp->s_rdma_read_sge.num_sge) - qp->s_rdma_read_sge.sge = - *qp->s_rdma_read_sge.sg_list++; - } + qib_put_ss(&qp->s_rdma_read_sge); else { ret = qib_get_rwqe(qp, 1); if (ret < 0) @@ -510,11 +495,7 @@ rdma_last_imm: wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; qib_copy_sge(&qp->r_sge, data, tlen, 1); - while (qp->r_sge.num_sge) { - atomic_dec(&qp->r_sge.sge.mr->refcount); - if (--qp->r_sge.num_sge) - qp->r_sge.sge = *qp->r_sge.sg_list++; - } + qib_put_ss(&qp->r_sge); goto last_imm; case OP(RDMA_WRITE_LAST): @@ -530,11 +511,7 @@ rdma_last: if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; qib_copy_sge(&qp->r_sge, data, tlen, 1); - while (qp->r_sge.num_sge) { - atomic_dec(&qp->r_sge.sge.mr->refcount); - if (--qp->r_sge.num_sge) - qp->r_sge.sge = *qp->r_sge.sg_list++; - } + qib_put_ss(&qp->r_sge); break; default: diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index a468bf2d446..d6c7fe7f88d 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -194,11 +194,7 @@ static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) } length -= len; } - while (qp->r_sge.num_sge) { - atomic_dec(&qp->r_sge.sge.mr->refcount); - if (--qp->r_sge.num_sge) - qp->r_sge.sge = *qp->r_sge.sg_list++; - } + qib_put_ss(&qp->r_sge); if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) goto bail_unlock; wc.wr_id = qp->r_wr_id; @@ -556,11 +552,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, } else qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); - while (qp->r_sge.num_sge) { - atomic_dec(&qp->r_sge.sge.mr->refcount); - if (--qp->r_sge.num_sge) - qp->r_sge.sge = *qp->r_sge.sg_list++; - } + qib_put_ss(&qp->r_sge); if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) return; wc.wr_id = qp->r_wr_id; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 7b6c3bffa9d..76d7ce8a8c6 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -183,7 +183,7 @@ void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length, int release) sge->sge_length -= len; if (sge->sge_length == 0) { if (release) - atomic_dec(&sge->mr->refcount); + qib_put_mr(sge->mr); if (--ss->num_sge) *sge = *ss->sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { @@ -224,7 +224,7 @@ void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release) sge->sge_length -= len; if (sge->sge_length == 0) { if (release) - atomic_dec(&sge->mr->refcount); + qib_put_mr(sge->mr); if (--ss->num_sge) *sge = *ss->sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { @@ -435,7 +435,7 @@ bail_inval_free: while (j) { struct qib_sge *sge = &wqe->sg_list[--j]; - atomic_dec(&sge->mr->refcount); + qib_put_mr(sge->mr); } bail_inval: ret = -EINVAL; @@ -978,7 +978,7 @@ void qib_put_txreq(struct qib_verbs_txreq *tx) if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); if (tx->mr) { - atomic_dec(&tx->mr->refcount); + qib_put_mr(tx->mr); tx->mr = NULL; } if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF) { @@ -1336,7 +1336,7 @@ done: } qib_sendbuf_done(dd, pbufn); if (qp->s_rdma_mr) { - atomic_dec(&qp->s_rdma_mr->refcount); + qib_put_mr(qp->s_rdma_mr); qp->s_rdma_mr = NULL; } if (qp->s_wqe) { diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 48760602465..4a2277bc059 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -302,6 +303,8 @@ struct qib_mregion { u32 max_segs; /* number of qib_segs in all the arrays */ u32 mapsz; /* size of the map array */ u8 page_shift; /* 0 - non unform/non powerof2 sizes */ + u8 lkey_published; /* in global table */ + struct completion comp; /* complete when refcount goes to zero */ atomic_t refcount; struct qib_segarray *map[0]; /* the segments */ }; @@ -944,9 +947,9 @@ int qib_post_ud_send(struct qib_qp *qp, struct ib_send_wr *wr); void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, int has_grh, void *data, u32 tlen, struct qib_qp *qp); -int qib_alloc_lkey(struct qib_lkey_table *rkt, struct qib_mregion *mr); +int qib_alloc_lkey(struct qib_mregion *mr, int dma_region); -int qib_free_lkey(struct qib_ibdev *dev, struct qib_mregion *mr); +void qib_free_lkey(struct qib_mregion *mr); int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, struct qib_sge *isge, struct ib_sge *sge, int acc); @@ -1014,6 +1017,27 @@ int qib_unmap_fmr(struct list_head *fmr_list); int qib_dealloc_fmr(struct ib_fmr *ibfmr); +static inline void qib_get_mr(struct qib_mregion *mr) +{ + atomic_inc(&mr->refcount); +} + +static inline void qib_put_mr(struct qib_mregion *mr) +{ + if (unlikely(atomic_dec_and_test(&mr->refcount))) + complete(&mr->comp); +} + +static inline void qib_put_ss(struct qib_sge_state *ss) +{ + while (ss->num_sge) { + qib_put_mr(ss->sge.mr); + if (--ss->num_sge) + ss->sge = *ss->sg_list++; + } +} + + void qib_release_mmap_info(struct kref *ref); struct qib_mmap_info *qib_create_mmap_info(struct qib_ibdev *dev, u32 size, -- cgit v1.2.3-18-g5258 From 8aac4cc3a9d7d7c2f203b7a8db521b604cfb5dc9 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 27 Jun 2012 18:33:19 -0400 Subject: IB/qib: RCU locking for MR validation Profiling indicates that MR validation locking is expensive. The MR table is largely read-only and is a suitable candidate for RCU locking. The patch uses RCU locking during validation to eliminate one lock/unlock during that validation. Reviewed-by: Mike Heinz Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_keys.c | 98 ++++++++++++++++++----------------- drivers/infiniband/hw/qib/qib_mr.c | 7 +++ drivers/infiniband/hw/qib/qib_verbs.c | 4 +- drivers/infiniband/hw/qib/qib_verbs.h | 7 ++- 4 files changed, 66 insertions(+), 50 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c index 8b5ee3aa8e3..970165b027f 100644 --- a/drivers/infiniband/hw/qib/qib_keys.c +++ b/drivers/infiniband/hw/qib/qib_keys.c @@ -40,8 +40,7 @@ * * Returns 0 if successful, otherwise returns -errno. * - * Increments mr reference count and sets published - * as required. + * Increments mr reference count as required. * * Sets the lkey field mr for non-dma regions. * @@ -60,10 +59,12 @@ int qib_alloc_lkey(struct qib_mregion *mr, int dma_region) /* special case for dma_mr lkey == 0 */ if (dma_region) { - /* should the dma_mr be relative to the pd? */ - if (!dev->dma_mr) { + struct qib_mregion *tmr; + + tmr = rcu_dereference(dev->dma_mr); + if (!tmr) { qib_get_mr(mr); - dev->dma_mr = mr; + rcu_assign_pointer(dev->dma_mr, mr); mr->lkey_published = 1; } goto success; @@ -93,7 +94,7 @@ int qib_alloc_lkey(struct qib_mregion *mr, int dma_region) rkt->gen++; } qib_get_mr(mr); - rkt->table[r] = mr; + rcu_assign_pointer(rkt->table[r], mr); mr->lkey_published = 1; success: spin_unlock_irqrestore(&rkt->lock, flags); @@ -120,33 +121,30 @@ void qib_free_lkey(struct qib_mregion *mr) spin_lock_irqsave(&rkt->lock, flags); if (!mr->lkey_published) goto out; - mr->lkey_published = 0; - - - spin_lock_irqsave(&dev->lk_table.lock, flags); - if (lkey == 0) { - if (dev->dma_mr && dev->dma_mr == mr) { - qib_put_mr(dev->dma_mr); - dev->dma_mr = NULL; - } - } else { + if (lkey == 0) + rcu_assign_pointer(dev->dma_mr, NULL); + else { r = lkey >> (32 - ib_qib_lkey_table_size); - qib_put_mr(dev->dma_mr); - rkt->table[r] = NULL; + rcu_assign_pointer(rkt->table[r], NULL); } + qib_put_mr(mr); + mr->lkey_published = 0; out: - spin_unlock_irqrestore(&dev->lk_table.lock, flags); + spin_unlock_irqrestore(&rkt->lock, flags); } /** * qib_lkey_ok - check IB SGE for validity and initialize * @rkt: table containing lkey to check SGE against + * @pd: protection domain * @isge: outgoing internal SGE * @sge: SGE to check * @acc: access flags * * Return 1 if valid and successful, otherwise returns 0. * + * increments the reference count upon success + * * Check the IB SGE for validity and initialize our internal version * of it. */ @@ -156,24 +154,25 @@ int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, struct qib_mregion *mr; unsigned n, m; size_t off; - unsigned long flags; /* * We use LKEY == zero for kernel virtual addresses * (see qib_get_dma_mr and qib_dma.c). */ - spin_lock_irqsave(&rkt->lock, flags); + rcu_read_lock(); if (sge->lkey == 0) { struct qib_ibdev *dev = to_idev(pd->ibpd.device); if (pd->user) goto bail; - if (!dev->dma_mr) + mr = rcu_dereference(dev->dma_mr); + if (!mr) + goto bail; + if (unlikely(!atomic_inc_not_zero(&mr->refcount))) goto bail; - qib_get_mr(dev->dma_mr); - spin_unlock_irqrestore(&rkt->lock, flags); + rcu_read_unlock(); - isge->mr = dev->dma_mr; + isge->mr = mr; isge->vaddr = (void *) sge->addr; isge->length = sge->length; isge->sge_length = sge->length; @@ -181,18 +180,18 @@ int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, isge->n = 0; goto ok; } - mr = rkt->table[(sge->lkey >> (32 - ib_qib_lkey_table_size))]; - if (unlikely(mr == NULL || mr->lkey != sge->lkey || - mr->pd != &pd->ibpd)) + mr = rcu_dereference( + rkt->table[(sge->lkey >> (32 - ib_qib_lkey_table_size))]); + if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) goto bail; off = sge->addr - mr->user_base; - if (unlikely(sge->addr < mr->user_base || - off + sge->length > mr->length || - (mr->access_flags & acc) != acc)) + if (unlikely(sge->addr < mr->iova || off + sge->length > mr->length || + (mr->access_flags & acc) == 0)) goto bail; - qib_get_mr(mr); - spin_unlock_irqrestore(&rkt->lock, flags); + if (unlikely(!atomic_inc_not_zero(&mr->refcount))) + goto bail; + rcu_read_unlock(); off += mr->offset; if (mr->page_shift) { @@ -228,20 +227,22 @@ int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, ok: return 1; bail: - spin_unlock_irqrestore(&rkt->lock, flag