diff options
Diffstat (limited to 'drivers/infiniband')
211 files changed, 36247 insertions, 5862 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a0f29c1d03b..77089399359 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -3,6 +3,8 @@ menuconfig INFINIBAND depends on PCI || BROKEN depends on HAS_IOMEM depends on NET + depends on INET + depends on m || IPV6 != m ---help--- Core support for InfiniBand (IB). Make sure to also select any protocols you wish to use as well as drivers for your @@ -38,8 +40,7 @@ config INFINIBAND_USER_MEM config INFINIBAND_ADDR_TRANS bool - depends on INET - depends on !(INFINIBAND = y && IPV6 = m) + depends on INFINIBAND default y source "drivers/infiniband/hw/mthca/Kconfig" @@ -50,8 +51,10 @@ source "drivers/infiniband/hw/amso1100/Kconfig" source "drivers/infiniband/hw/cxgb3/Kconfig" source "drivers/infiniband/hw/cxgb4/Kconfig" source "drivers/infiniband/hw/mlx4/Kconfig" +source "drivers/infiniband/hw/mlx5/Kconfig" source "drivers/infiniband/hw/nes/Kconfig" source "drivers/infiniband/hw/ocrdma/Kconfig" +source "drivers/infiniband/hw/usnic/Kconfig" source "drivers/infiniband/ulp/ipoib/Kconfig" @@ -59,5 +62,6 @@ source "drivers/infiniband/ulp/srp/Kconfig" source "drivers/infiniband/ulp/srpt/Kconfig" source "drivers/infiniband/ulp/iser/Kconfig" +source "drivers/infiniband/ulp/isert/Kconfig" endif # INFINIBAND diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile index bf846a14b9d..dc21836b5a8 100644 --- a/drivers/infiniband/Makefile +++ b/drivers/infiniband/Makefile @@ -1,15 +1,3 @@ obj-$(CONFIG_INFINIBAND) += core/ -obj-$(CONFIG_INFINIBAND_MTHCA) += hw/mthca/ -obj-$(CONFIG_INFINIBAND_IPATH) += hw/ipath/ -obj-$(CONFIG_INFINIBAND_QIB) += hw/qib/ -obj-$(CONFIG_INFINIBAND_EHCA) += hw/ehca/ -obj-$(CONFIG_INFINIBAND_AMSO1100) += hw/amso1100/ -obj-$(CONFIG_INFINIBAND_CXGB3) += hw/cxgb3/ -obj-$(CONFIG_INFINIBAND_CXGB4) += hw/cxgb4/ -obj-$(CONFIG_MLX4_INFINIBAND) += hw/mlx4/ -obj-$(CONFIG_INFINIBAND_NES) += hw/nes/ -obj-$(CONFIG_INFINIBAND_OCRDMA) += hw/ocrdma/ -obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/ -obj-$(CONFIG_INFINIBAND_SRP) += ulp/srp/ -obj-$(CONFIG_INFINIBAND_SRPT) += ulp/srpt/ -obj-$(CONFIG_INFINIBAND_ISER) += ulp/iser/ +obj-$(CONFIG_INFINIBAND) += hw/ +obj-$(CONFIG_INFINIBAND) += ulp/ diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index c8bbaef1bec..ffd0af6734a 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -1,8 +1,9 @@ -infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o rdma_cm.o +infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \ - ib_cm.o iw_cm.o $(infiniband-y) + ib_cm.o iw_cm.o ib_addr.o \ + $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ $(user_access-y) @@ -17,7 +18,7 @@ ib_sa-y := sa_query.o multicast.o ib_cm-y := cm.o -iw_cm-y := iwcm.o +iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o rdma_cm-y := cma.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index eaec8d7a3b7..8172d37f9ad 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -45,6 +45,7 @@ #include <net/addrconf.h> #include <net/ip6_route.h> #include <rdma/ib_addr.h> +#include <rdma/ib.h> MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("IB Address Translation"); @@ -70,6 +71,23 @@ static LIST_HEAD(req_list); static DECLARE_DELAYED_WORK(work, process_req); static struct workqueue_struct *addr_wq; +int rdma_addr_size(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + case AF_INET6: + return sizeof(struct sockaddr_in6); + case AF_IB: + return sizeof(struct sockaddr_ib); + default: + return 0; + } +} +EXPORT_SYMBOL(rdma_addr_size); + +static struct rdma_addr_client self; + void rdma_addr_register_client(struct rdma_addr_client *client) { atomic_set(&client->refcount, 1); @@ -103,7 +121,8 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, } EXPORT_SYMBOL(rdma_copy_addr); -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, + u16 *vlan_id) { struct net_device *dev; int ret = -EADDRNOTAVAIL; @@ -126,6 +145,8 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) return ret; ret = rdma_copy_addr(dev_addr, dev, NULL); + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); break; @@ -137,6 +158,8 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) &((struct sockaddr_in6 *) addr)->sin6_addr, dev, 1)) { ret = rdma_copy_addr(dev_addr, dev, NULL); + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); break; } } @@ -222,7 +245,7 @@ static int addr4_resolve(struct sockaddr_in *src_in, src_in->sin_addr.s_addr = fl4.saddr; if (rt->dst.dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *) dst_in, addr); + ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); if (!ret) memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); goto put; @@ -270,7 +293,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, } if (dst->dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *) dst_in, addr); + ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); if (!ret) memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); goto put; @@ -369,12 +392,12 @@ int rdma_resolve_ip(struct rdma_addr_client *client, goto err; } - memcpy(src_in, src_addr, ip_addr_size(src_addr)); + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); } else { src_in->sa_family = dst_addr->sa_family; } - memcpy(dst_in, dst_addr, ip_addr_size(dst_addr)); + memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr)); req->addr = addr; req->callback = callback; req->context = context; @@ -421,6 +444,88 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) } EXPORT_SYMBOL(rdma_addr_cancel); +struct resolve_cb_context { + struct rdma_dev_addr *addr; + struct completion comp; +}; + +static void resolve_cb(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context) +{ + memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct + rdma_dev_addr)); + complete(&((struct resolve_cb_context *)context)->comp); +} + +int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac, + u16 *vlan_id) +{ + int ret = 0; + struct rdma_dev_addr dev_addr; + struct resolve_cb_context ctx; + struct net_device *dev; + + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + + ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid); + if (ret) + return ret; + + ret = rdma_gid2ip(&dgid_addr._sockaddr, dgid); + if (ret) + return ret; + + memset(&dev_addr, 0, sizeof(dev_addr)); + + ctx.addr = &dev_addr; + init_completion(&ctx.comp); + ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, + &dev_addr, 1000, resolve_cb, &ctx); + if (ret) + return ret; + + wait_for_completion(&ctx.comp); + + memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); + dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); + if (!dev) + return -ENODEV; + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); + dev_put(dev); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); + +int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) +{ + int ret = 0; + struct rdma_dev_addr dev_addr; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } gid_addr; + + ret = rdma_gid2ip(&gid_addr._sockaddr, sgid); + + if (ret) + return ret; + memset(&dev_addr, 0, sizeof(dev_addr)); + ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); + if (ret) + return ret; + + memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); + static int netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { @@ -445,11 +550,13 @@ static int __init addr_init(void) return -ENOMEM; register_netevent_notifier(&nb); + rdma_addr_register_client(&self); return 0; } static void __exit addr_cleanup(void) { + rdma_addr_unregister_client(&self); unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); } diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 394fea2ba1b..c3239170d8b 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -47,6 +47,7 @@ #include <linux/sysfs.h> #include <linux/workqueue.h> #include <linux/kdev_t.h> +#include <linux/etherdevice.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> @@ -177,6 +178,8 @@ struct cm_av { struct ib_ah_attr ah_attr; u16 pkey_index; u8 timeout; + u8 valid; + u8 smac[ETH_ALEN]; }; struct cm_work { @@ -376,26 +379,27 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, &av->ah_attr); av->timeout = path->packet_life_time + 1; + memcpy(av->smac, path->smac, sizeof(av->smac)); + + av->valid = 1; return 0; } static int cm_alloc_id(struct cm_id_private *cm_id_priv) { unsigned long flags; - int ret, id; - static int next_id; + int id; - do { - spin_lock_irqsave(&cm.lock, flags); - ret = idr_get_new_above(&cm.local_id_table, cm_id_priv, - next_id, &id); - if (!ret) - next_id = ((unsigned) id + 1) & MAX_IDR_MASK; - spin_unlock_irqrestore(&cm.lock, flags); - } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) ); + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&cm.lock, flags); + + id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); + + spin_unlock_irqrestore(&cm.lock, flags); + idr_preload_end(); cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; - return ret; + return id < 0 ? id : 0; } static void cm_free_id(__be32 local_id) @@ -1556,6 +1560,9 @@ static int cm_req_handler(struct cm_work *work) cm_process_routed_req(req_msg, work->mad_recv_wc->wc); cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); + + memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); + work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id; ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); if (ret) { ib_get_cached_gid(work->port->cm_dev->ib_device, @@ -3502,6 +3509,32 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN; qp_attr->ah_attr = cm_id_priv->av.ah_attr; + if (!cm_id_priv->av.valid) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return -EINVAL; + } + if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) { + qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id; + *qp_attr_mask |= IB_QP_VID; + } + if (!is_zero_ether_addr(cm_id_priv->av.smac)) { + memcpy(qp_attr->smac, cm_id_priv->av.smac, + sizeof(qp_attr->smac)); + *qp_attr_mask |= IB_QP_SMAC; + } + if (cm_id_priv->alt_av.valid) { + if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) { + qp_attr->alt_vlan_id = + cm_id_priv->alt_av.ah_attr.vlan_id; + *qp_attr_mask |= IB_QP_ALT_VID; + } + if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) { + memcpy(qp_attr->alt_smac, + cm_id_priv->alt_av.smac, + sizeof(qp_attr->alt_smac)); + *qp_attr_mask |= IB_QP_ALT_SMAC; + } + } qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); @@ -3844,7 +3877,6 @@ static int __init ib_cm_init(void) cm.remote_sidr_table = RB_ROOT; idr_init(&cm.local_id_table); get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); - idr_pre_get(&cm.local_id_table, GFP_KERNEL); INIT_LIST_HEAD(&cm.timewait_list); ret = class_register(&cm_class); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index a7568c34a1a..d570030d899 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -50,6 +50,7 @@ #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> #include <rdma/rdma_netlink.h> +#include <rdma/ib.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> #include <rdma/ib_sa.h> @@ -79,7 +80,6 @@ static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; -static DEFINE_IDR(sdp_ps); static DEFINE_IDR(tcp_ps); static DEFINE_IDR(udp_ps); static DEFINE_IDR(ipoib_ps); @@ -195,24 +195,7 @@ struct cma_hdr { union cma_ip_addr dst_addr; }; -struct sdp_hh { - u8 bsdh[16]; - u8 sdp_version; /* Major version: 7:4 */ - u8 ip_version; /* IP version: 7:4 */ - u8 sdp_specific1[10]; - __be16 port; - __be16 sdp_specific2; - union cma_ip_addr src_addr; - union cma_ip_addr dst_addr; -}; - -struct sdp_hah { - u8 bsdh[16]; - u8 sdp_version; -}; - #define CMA_VERSION 0x00 -#define SDP_MAJ_VERSION 0x2 static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) { @@ -261,21 +244,6 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } -static inline u8 sdp_get_majv(u8 sdp_version) -{ - return sdp_version >> 4; -} - -static inline u8 sdp_get_ip_ver(struct sdp_hh *hh) -{ - return hh->ip_version >> 4; -} - -static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) -{ - hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF); -} - static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { @@ -310,16 +278,40 @@ static void cma_release_dev(struct rdma_id_private *id_priv) mutex_unlock(&lock); } -static int cma_set_qkey(struct rdma_id_private *id_priv) +static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *) &id_priv->id.route.addr.src_addr; +} + +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; +} + +static inline unsigned short cma_family(struct rdma_id_private *id_priv) +{ + return id_priv->id.route.addr.src_addr.ss_family; +} + +static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) { struct ib_sa_mcmember_rec rec; int ret = 0; - if (id_priv->qkey) + if (id_priv->qkey) { + if (qkey && id_priv->qkey != qkey) + return -EINVAL; + return 0; + } + + if (qkey) { + id_priv->qkey = qkey; return 0; + } switch (id_priv->id.ps) { case RDMA_PS_UDP: + case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: @@ -336,35 +328,35 @@ static int cma_set_qkey(struct rdma_id_private *id_priv) return ret; } -static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) +static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { - int i; - int err; - struct ib_port_attr props; - union ib_gid tmp; + dev_addr->dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); + ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); +} - err = ib_query_port(device, port_num, &props); - if (err) - return 1; +static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +{ + int ret; - for (i = 0; i < props.gid_tbl_len; ++i) { - err = ib_query_gid(device, port_num, i, &tmp); - if (err) - return 1; - if (!memcmp(&tmp, gid, sizeof tmp)) - return 0; + if (addr->sa_family != AF_IB) { + ret = rdma_translate_ip(addr, dev_addr, NULL); + } else { + cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); + ret = 0; } - return -EAGAIN; + return ret; } -static int cma_acquire_dev(struct rdma_id_private *id_priv) +static int cma_acquire_dev(struct rdma_id_private *id_priv, + struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; union ib_gid gid, iboe_gid; int ret = -ENODEV; - u8 port; + u8 port, found_port; enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; @@ -373,23 +365,46 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) return -EINVAL; mutex_lock(&lock); - iboe_addr_get_sgid(dev_addr, &iboe_gid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &iboe_gid); + memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); + if (listen_id_priv && + rdma_port_get_link_layer(listen_id_priv->id.device, + listen_id_priv->id.port_num) == dev_ll) { + cma_dev = listen_id_priv->cma_dev; + port = listen_id_priv->id.port_num; + if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) + ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, + &found_port, NULL); + else + ret = ib_find_cached_gid(cma_dev->device, &gid, + &found_port, NULL); + + if (!ret && (port == found_port)) { + id_priv->id.port_num = found_port; + goto out; + } + } list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { + if (listen_id_priv && + listen_id_priv->cma_dev == cma_dev && + listen_id_priv->id.port_num == port) + continue; if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = find_gid_port(cma_dev->device, &iboe_gid, port); + ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL); else - ret = find_gid_port(cma_dev->device, &gid, port); + ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL); - if (!ret) { - id_priv->id.port_num = port; + if (!ret && (port == found_port)) { + id_priv->id.port_num = found_port; goto out; - } else if (ret == 1) - break; + } } } } @@ -402,6 +417,60 @@ out: return ret; } +/* + * Select the source IB device and address to reach the destination IB address. + */ +static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev, *cur_dev; + struct sockaddr_ib *addr; + union ib_gid gid, sgid, *dgid; + u16 pkey, index; + u8 p; + int i; + + cma_dev = NULL; + addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); + dgid = (union ib_gid *) &addr->sib_addr; + pkey = ntohs(addr->sib_pkey); + + list_for_each_entry(cur_dev, &dev_list, list) { + if (rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB) + continue; + + for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) + continue; + + for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, &gid); i++) { + if (!memcmp(&gid, dgid, sizeof(gid))) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + goto found; + } + + if (!cma_dev && (gid.global.subnet_prefix == + dgid->global.subnet_prefix)) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + } + } + } + } + + if (!cma_dev) + return -ENODEV; + +found: + cma_attach_to_dev(id_priv, cma_dev); + addr = (struct sockaddr_ib *) cma_src_addr(id_priv); + memcpy(&addr->sib_addr, &sgid, sizeof sgid); + cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); + return 0; +} + static void cma_deref_id(struct rdma_id_private *id_priv) { if (atomic_dec_and_test(&id_priv->refcount)) @@ -536,6 +605,7 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; + union ib_gid sgid; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { @@ -558,6 +628,20 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, if (ret) goto out; + ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, + qp_attr.ah_attr.grh.sgid_index, &sgid); + if (ret) + goto out; + + if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) + == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) + == IB_LINK_LAYER_ETHERNET) { + ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL); + + if (ret) + goto out; + } if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); @@ -631,7 +715,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, 0); if (ret) return ret; @@ -658,6 +742,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); + if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; break; @@ -680,26 +765,30 @@ EXPORT_SYMBOL(rdma_init_qp_attr); static inline int cma_zero_addr(struct sockaddr *addr) { - struct in6_addr *ip6; - - if (addr->sa_family == AF_INET) - return ipv4_is_zeronet( - ((struct sockaddr_in *)addr)->sin_addr.s_addr); - else { - ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; - return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | - ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0; + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr); + case AF_IB: + return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; } } static inline int cma_loopback_addr(struct sockaddr *addr) { - if (addr->sa_family == AF_INET) - return ipv4_is_loopback( - ((struct sockaddr_in *) addr)->sin_addr.s_addr); - else - return ipv6_addr_loopback( - &((struct sockaddr_in6 *) addr)->sin6_addr); + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr); + case AF_IB: + return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; + } } static inline int cma_any_addr(struct sockaddr *addr) @@ -716,18 +805,31 @@ static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) case AF_INET: return ((struct sockaddr_in *) src)->sin_addr.s_addr != ((struct sockaddr_in *) dst)->sin_addr.s_addr; - default: + case AF_INET6: return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, &((struct sockaddr_in6 *) dst)->sin6_addr); + default: + return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, + &((struct sockaddr_ib *) dst)->sib_addr); } } -static inline __be16 cma_port(struct sockaddr *addr) +static __be16 cma_port(struct sockaddr *addr) { - if (addr->sa_family == AF_INET) + struct sockaddr_ib *sib; + + switch (addr->sa_family) { + case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; - else + case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + return htons((u16) (be64_to_cpu(sib->sib_sid) & + be64_to_cpu(sib->sib_sid_mask))); + default: + return 0; + } } static inline int cma_any_port(struct sockaddr *addr) @@ -735,83 +837,93 @@ static inline int cma_any_port(struct sockaddr *addr) return !cma_port(addr); } -static int cma_get_net_info(void *hdr, enum rdma_port_space ps, - u8 *ip_ver, __be16 *port, - union cma_ip_addr **src, union cma_ip_addr **dst) +static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, + struct ib_sa_path_rec *path) { - switch (ps) { - case RDMA_PS_SDP: - if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) != - SDP_MAJ_VERSION) - return -EINVAL; + struct sockaddr_ib *listen_ib, *ib; - *ip_ver = sdp_get_ip_ver(hdr); - *port = ((struct sdp_hh *) hdr)->port; - *src = &((struct sdp_hh *) hdr)->src_addr; - *dst = &((struct sdp_hh *) hdr)->dst_addr; - break; - default: - if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION) - return -EINVAL; + listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; + ib = (struct sockaddr_ib *) &id->route.addr.src_addr; + ib->sib_family = listen_ib->sib_family; + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->sgid, 16); + ib->sib_sid = listen_ib->sib_sid; + ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); + ib->sib_scope_id = listen_ib->sib_scope_id; - *ip_ver = cma_get_ip_ver(hdr); - *port = ((struct cma_hdr *) hdr)->port; - *src = &((struct cma_hdr *) hdr)->src_addr; - *dst = &((struct cma_hdr *) hdr)->dst_addr; - break; - } - - if (*ip_ver != 4 && *ip_ver != 6) - return -EINVAL; - return 0; + ib = (struct sockaddr_ib *) &id->route.addr.dst_addr; + ib->sib_family = listen_ib->sib_family; + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->dgid, 16); } -static void cma_save_net_info(struct rdma_addr *addr, - struct rdma_addr *listen_addr, - u8 ip_ver, __be16 port, - union cma_ip_addr *src, union cma_ip_addr *dst) +static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, + struct cma_hdr *hdr) { struct sockaddr_in *listen4, *ip4; + + listen4 = (struct sockaddr_in *) &listen_id->route.addr.src_addr; + ip4 = (struct sockaddr_in *) &id->route.addr.src_addr; + ip4->sin_family = listen4->sin_family; + ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr; + ip4->sin_port = listen4->sin_port; + + ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr; + ip4->sin_family = listen4->sin_family; + ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr; + ip4->sin_port = hdr->port; +} + +static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, + struct cma_hdr *hdr) +{ struct sockaddr_in6 *listen6, *ip6; - switch (ip_ver) { + listen6 = (struct sockaddr_in6 *) &listen_id->route.addr.src_addr; + ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr; + ip6->sin6_family = listen6->sin6_family; + ip6->sin6_addr = hdr->dst_addr.ip6; + ip6->sin6_port = listen6->sin6_port; + + ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr; + ip6->sin6_family = listen6->sin6_family; + ip6->sin6_addr = hdr->src_addr.ip6; + ip6->sin6_port = hdr->port; +} + +static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event) +{ + struct cma_hdr *hdr; + + if ((listen_id->route.addr.src_addr.ss_family == AF_IB) && + (ib_event->event == IB_CM_REQ_RECEIVED)) { + cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path); + return 0; + } + + hdr = ib_event->private_data; + if (hdr->cma_version != CMA_VERSION) + return -EINVAL; + + switch (cma_get_ip_ver(hdr)) { case 4: - listen4 = (struct sockaddr_in *) &listen_addr->src_addr; - ip4 = (struct sockaddr_in *) &addr->src_addr; - ip4->sin_family = listen4->sin_family; - ip4->sin_addr.s_addr = dst->ip4.addr; - ip4->sin_port = listen4->sin_port; - - ip4 = (struct sockaddr_in *) &addr->dst_addr; - ip4->sin_family = listen4->sin_family; - ip4->sin_addr.s_addr = src->ip4.addr; - ip4->sin_port = port; + cma_save_ip4_info(id, listen_id, hdr); break; case 6: - listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr; - ip6 = (struct sockaddr_in6 *) &addr->src_addr; - ip6->sin6_family = listen6->sin6_family; - ip6->sin6_addr = dst->ip6; - ip6->sin6_port = listen6->sin6_port; - - ip6 = (struct sockaddr_in6 *) &addr->dst_addr; - ip6->sin6_family = listen6->sin6_family; - ip6->sin6_addr = src->ip6; - ip6->sin6_port = port; + cma_save_ip6_info(id, listen_id, hdr); break; default: - break; + return -EINVAL; } + return 0; } -static inline int cma_user_data_offset(enum rdma_port_space ps) +static inline int cma_user_data_offset(struct rdma_id_private *id_priv) { - switch (ps) { - case RDMA_PS_SDP: - return 0; - default: - return sizeof(struct cma_hdr); - } + return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); } static void cma_cancel_route(struct rdma_id_private *id_priv) @@ -862,8 +974,7 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv, cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: - if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr) - && !id_priv->cma_dev) + if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: @@ -978,16 +1089,6 @@ reject: return ret; } -static int cma_verify_rep(struct rdma_id_private *id_priv, void *data) -{ - if (id_priv->id.ps == RDMA_PS_SDP && - sdp_get_majv(((struct sdp_hah *) data)->sdp_version) != - SDP_MAJ_VERSION) - return -EINVAL; - - return 0; -} - static void cma_set_rep_event_data(struct rdma_cm_event *event, struct ib_cm_rep_event_param *rep_data, void *private_data) @@ -1022,15 +1123,13 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: - event.status = cma_verify_rep(id_priv, ib_event->private_data); - if (event.status) - event.event = RDMA_CM_EVENT_CONNECT_ERROR; - else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) { + if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; - } else + } else { event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; + } cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; @@ -1086,22 +1185,16 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; - union cma_ip_addr *src, *dst; - __be16 port; - u8 ip_ver; int ret; - if (cma_get_net_info(ib_event->private_data, listen_id->ps, - &ip_ver, &port, &src, &dst)) - return NULL; - id = rdma_create_id(listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type); if (IS_ERR(id)) return NULL; - cma_save_net_info(&id->route.addr, &listen_id->route.addr, - ip_ver, port, src, dst); + id_priv = container_of(id, struct rdma_id_private, id); + if (cma_save_net_info(id, listen_id, ib_event)) + goto err; rt = &id->route; rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; @@ -1114,19 +1207,17 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; - if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) { + if (cma_any_addr(cma_src_addr(id_priv))) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else { - ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr, - &rt->addr.dev_addr); + ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); if (ret) goto err; } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); - id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; @@ -1140,9 +1231,6 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, { struct rdma_id_private *id_priv; struct rdma_cm_id *id; - union cma_ip_addr *src, *dst; - __be16 port; - u8 ip_ver; int ret; id = rdma_create_id(listen_id->event_handler, listen_id->context, @@ -1150,22 +1238,16 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, if (IS_ERR(id)) return NULL; - - if (cma_get_net_info(ib_event->private_data, listen_id->ps, - &ip_ver, &port, &src, &dst)) + id_priv = container_of(id, struct rdma_id_private, id); + if (cma_save_net_info(id, listen_id, ib_event)) goto err; - cma_save_net_info(&id->route.addr, &listen_id->route.addr, - ip_ver, port, src, dst); - if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { - ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, - &id->route.addr.dev_addr); + ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); if (ret) goto err; } - id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: @@ -1211,7 +1293,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) return -ECONNABORTED; memset(&event, 0, sizeof event); - offset = cma_user_data_offset(listen_id->id.ps); + offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { conn_id = cma_new_udp_id(&listen_id->id, ib_event); @@ -1229,7 +1311,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); - ret = cma_acquire_dev(conn_id); + ret = cma_acquire_dev(conn_id, listen_id); if (ret) goto err2; @@ -1245,13 +1327,13 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) goto err3; - /* * Acquire mutex to prevent user executing rdma_destroy_id() * while we're accessing the cm_id. */ mutex_lock(&lock); - if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) + if (cma_comp(conn_id, RDMA_CM_CONNECT) && + (conn_id->id.qp_type != IB_QPT_UD)) ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); mutex_unlock(&lock); mutex_unlock(&conn_id->handler_mutex); @@ -1273,58 +1355,44 @@ err1: return ret; } -static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr) +__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { - return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr))); + if (addr->sa_family == AF_IB) + return ((struct sockaddr_ib *) addr)->sib_sid; + + return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } +EXPORT_SYMBOL(rdma_get_service_id); static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, struct ib_cm_compare_data *compare) { struct cma_hdr *cma_data, *cma_mask; - struct sdp_hh *sdp_data, *sdp_mask; __be32 ip4_addr; struct in6_addr ip6_addr; memset(compare, 0, sizeof *compare); cma_data = (void *) compare->data; cma_mask = (void *) compare->mask; - sdp_data = (void *) compare->data; - sdp_mask = (void *) compare->mask; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr; - if (ps == RDMA_PS_SDP) { - sdp_set_ip_ver(sdp_data, 4); - sdp_set_ip_ver(sdp_mask, 0xF); - sdp_data->dst_addr.ip4.addr = ip4_addr; - sdp_mask->dst_addr.ip4.addr = htonl(~0); - } else { - cma_set_ip_ver(cma_data, 4); - cma_set_ip_ver(cma_mask, 0xF); - if (!cma_any_addr(addr)) { - cma_data->dst_addr.ip4.addr = ip4_addr; - cma_mask->dst_addr.ip4.addr = htonl(~0); - } + cma_set_ip_ver(cma_data, 4); + cma_set_ip_ver(cma_mask, 0xF); + if (!cma_any_addr(addr)) { + cma_data->dst_addr.ip4.addr = ip4_addr; + cma_mask->dst_addr.ip4.addr = htonl(~0); } break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; - if (ps == RDMA_PS_SDP) { - sdp_set_ip_ver(sdp_data, 6); - sdp_set_ip_ver(sdp_mask, 0xF); - sdp_data->dst_addr.ip6 = ip6_addr; - memset(&sdp_mask->dst_addr.ip6, 0xFF, - sizeof sdp_mask->dst_addr.ip6); - } else { - cma_set_ip_ver(cma_data, 6); - cma_set_ip_ver(cma_mask, 0xF); - if (!cma_any_addr(addr)) { - cma_data->dst_addr.ip6 = ip6_addr; - memset(&cma_mask->dst_addr.ip6, 0xFF, - sizeof cma_mask->dst_addr.ip6); - } + cma_set_ip_ver(cma_data, 6); + cma_set_ip_ver(cma_mask, 0xF); + if (!cma_any_addr(addr)) { + cma_data->dst_addr.ip6 = ip6_addr; + memset(&cma_mask->dst_addr.ip6, 0xFF, + sizeof cma_mask->dst_addr.ip6); } break; default: @@ -1336,8 +1404,9 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event; - struct sockaddr_in *sin; int ret = 0; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) return 0; @@ -1348,10 +1417,10 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - *sin = iw_event->local_addr; - sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr; - *sin = iw_event->remote_addr; + memcpy(cma_src_addr(id_priv), laddr, + rdma_addr_size(laddr)); + memcpy(cma_dst_addr(id_priv), raddr, + rdma_addr_size(raddr)); switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; @@ -1401,11 +1470,11 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; - struct sockaddr_in *sin; - struct net_device *dev = NULL; struct rdma_cm_event event; int ret; struct ib_device_attr attr; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; listen_id = cm_id->context; if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) @@ -1423,21 +1492,14 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; - dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr); - if (!dev) { - ret = -EADDRNOTAVAIL; - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; - } - ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL); + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } - ret = cma_acquire_dev(conn_id); + ret = cma_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); @@ -1448,10 +1510,8 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; - sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr; - *sin = iw_event->local_addr; - sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr; - *sin = iw_event->remote_addr; + memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); + memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); ret = ib_query_device(conn_id->id.device, &attr); if (ret) { @@ -1487,8 +1547,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, cma_deref_id(conn_id); out: - if (dev) - dev_put(dev); mutex_unlock(&listen_id->handler_mutex); return ret; } @@ -1507,8 +1565,8 @@ static int cma_ib_listen(struct rdma_id_private *id_priv) id_priv->cm_id.ib = id; - addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - svc_id = cma_get_service_id(id_priv->id.ps, addr); + addr = cma_src_addr(id_priv); + svc_id = rdma_get_service_id(&id_priv->id, addr); if (cma_any_addr(addr) && !id_priv->afonly) ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); else { @@ -1527,7 +1585,6 @@ static int cma_ib_listen(struct rdma_id_private *id_priv) static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; - struct sockaddr_in *sin; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, @@ -1538,8 +1595,8 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) id_priv->cm_id.iw = id; - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - id_priv->cm_id.iw->local_addr = *sin; + memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); @@ -1568,6 +1625,10 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, struct rdma_cm_id *id; int ret; + if (cma_family(id_priv) == AF_IB && + rdma_node_get_transport(cma_dev->device->node_type) != RDMA_TRANSPORT_IB) + return; + id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type); if (IS_ERR(id)) @@ -1576,8 +1637,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, dev_id_priv = container_of(id, struct rdma_id_private, id); dev_id_priv->state = RDMA_CM_ADDR_BOUND; - memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, - ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); @@ -1635,31 +1696,39 @@ static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct cma_work *work) { - struct rdma_addr *addr = &id_priv->id.route.addr; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct ib_sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; + struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); - rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); - rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); - path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); + rdma_addr_get_sgid(dev_addr, &path_rec.sgid); + rdma_addr_get_dgid(dev_addr, &path_rec.dgid); + path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; - path_rec.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &addr->dst_addr); + path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; - if (addr->src_addr.ss_family == AF_INET) { + switch (cma_family(id_priv)) { + case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; - } else { - sin6 = (struct sockaddr_in6 *) &addr->src_addr; + break; + case AF_INET6: + sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; + case AF_IB: + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); + comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, @@ -1795,19 +1864,34 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) return 0; } +static int iboe_tos_to_sl(struct net_device *ndev, int tos) +{ + int prio; + struct net_device *dev; + + prio = rt_tos2priority(tos); + dev = ndev->priv_flags & IFF_802_1Q_VLAN ? + vlan_dev_real_dev(ndev) : ndev; + + if (dev->num_tc) + return netdev_get_prio_tc_map(dev, prio); + +#if IS_ENABLED(CONFIG_VLAN_8021Q) + if (ndev->priv_flags & IFF_802_1Q_VLAN) + return (vlan_dev_get_egress_qos_mask(ndev, prio) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +#endif + return 0; +} + static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; - struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; - struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; struct net_device *ndev = NULL; - u16 vid; - if (src_addr->sin_family != dst_addr->sin_family) - return -EINVAL; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) @@ -1831,20 +1915,20 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } - vid = rdma_vlan_dev_vlan_id(ndev); + route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev); + memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); + memcpy(route->path_rec->smac, ndev->dev_addr, ndev->addr_len); - iboe_mac_vlan_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr, vid); - iboe_mac_vlan_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr, vid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &route->path_rec->sgid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, + &route->path_rec->dgid); route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; - route->path_rec->sl = netdev_get_prio_tc_map( - ndev->priv_flags & IFF_802_1Q_VLAN ? - vlan_dev_real_dev(ndev) : ndev, - rt_tos2priority(id_priv->tos)); - + route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos); route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); @@ -1914,28 +1998,57 @@ err: } EXPORT_SYMBOL(rdma_resolve_route); +static void cma_set_loopback(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, + 0, 0, 0, htonl(1)); + break; + default: + ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, + 0, 0, 0, htonl(1)); + break; + } +} + static int cma_bind_loopback(struct rdma_id_private *id_priv) { - struct cma_device *cma_dev; + struct cma_device *cma_dev, *cur_dev; struct ib_port_attr port_attr; union ib_gid gid; u16 pkey; int ret; u8 p; + cma_dev = NULL; mutex_lock(&lock); - if (list_empty(&dev_list)) { + list_for_each_entry(cur_dev, &dev_list, list) { + if (cma_family(id_priv) == AF_IB && + rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB) + continue; + + if (!cma_dev) + cma_dev = cur_dev; + + for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (!ib_query_port(cur_dev->device, p, &port_attr) && + port_attr.state == IB_PORT_ACTIVE) { + cma_dev = cur_dev; + goto port_found; + } + } + } + + if (!cma_dev) { ret = -ENODEV; goto out; } - list_for_each_entry(cma_dev, &dev_list, list) - for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p) - if (!ib_query_port(cma_dev->device, p, &port_attr) && - port_attr.state == IB_PORT_ACTIVE) - goto port_found; p = 1; - cma_dev = list_entry(dev_list.next, struct cma_device, list); port_found: ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid); @@ -1954,6 +2067,7 @@ port_found: ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); + cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; @@ -1971,8 +2085,9 @@ static void addr_handler(int status, struct sockaddr *src_addr, RDMA_CM_ADDR_RESOLVED)) goto out; + memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) - status = cma_acquire_dev(id_priv); + status = cma_acquire_dev(id_priv, NULL); if (status) { if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, @@ -1980,11 +2095,8 @@ static void addr_handler(int status, struct sockaddr *src_addr, goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; - } else { - memcpy(&id_priv->id.route.addr.src_addr, src_addr, - ip_addr_size(src_addr)); + } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; - } if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); @@ -2001,7 +2113,6 @@ out: static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; - struct sockaddr *src, *dst; union ib_gid gid; int ret; @@ -2018,18 +2129,36 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - src = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - if (cma_zero_addr(src)) { - dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr; - if ((src->sa_family = dst->sa_family) == AF_INET) { - ((struct sockaddr_in *)src)->sin_addr = - ((struct sockaddr_in *)dst)->sin_addr; - } else { - ((struct sockaddr_in6 *)src)->sin6_addr = - ((struct sockaddr_in6 *)dst)->sin6_addr; - } + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDR_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + queue_work(cma_wq, &work->work); + return 0; +err: + kfree(work); + return ret; +} + +static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + if (!id_priv->cma_dev) { + ret = cma_resolve_ib_dev(id_priv); + if (ret) + goto err; } + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) + &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); + work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; @@ -2047,9 +2176,13 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, { if (!src_addr || !src_addr->sa_family) { src_addr = (struct sockaddr *) &id->route.addr.src_addr; - if ((src_addr->sa_family = dst_addr->sa_family) == AF_INET6) { + src_addr->sa_family = dst_addr->sa_family; + if (dst_addr->sa_family == AF_INET6) { ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *) src_addr)->sib_pkey = + ((struct sockaddr_ib *) dst_addr)->sib_pkey; } } return rdma_bind_addr(id, src_addr); @@ -2068,17 +2201,25 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, return ret; } + if (cma_family(id_priv) != dst_addr->sa_family) + return -EINVAL; + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); - memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr)); - if (cma_any_addr(dst_addr)) + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); + if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); - else - ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr, - dst_addr, &id->route.addr.dev_addr, - timeout_ms, addr_handler, id_priv); + } else { + if (dst_addr->sa_family == AF_IB) { + ret = cma_resolve_ib_addr(id_priv); + } else { + ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), + dst_addr, &id->route.addr.dev_addr, + timeout_ms, addr_handler, id_priv); + } + } if (ret) goto err; @@ -2098,7 +2239,7 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); - if (id_priv->state == RDMA_CM_IDLE) { + if (reuse || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { @@ -2132,10 +2273,29 @@ EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { - struct sockaddr_in *sin; + struct sockaddr *addr; + struct sockaddr_ib *sib; + u64 sid, mask; + __be16 port; + + addr = cma_src_addr(id_priv); + port = htons(bind_list->port); - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - sin->sin_port = htons(bind_list->port); + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_port = port; + break; + case AF_INET6: + ((struct sockaddr_in6 *) addr)->sin6_port = port; + break; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + sid = be64_to_cpu(sib->sib_sid); + mask = be64_to_cpu(sib->sib_sid_mask); + sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); + sib->sib_sid_mask = cpu_to_be64(~0ULL); + break; + } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } @@ -2144,33 +2304,23 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; - int port, ret; + int ret; bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; - do { - ret = idr_get_new_above(ps, bind_list, snum, &port); - } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); - - if (ret) - goto err1; - - if (port != snum) { - ret = -EADDRNOTAVAIL; - goto err2; - } + ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL); + if (ret < 0) + goto err; bind_list->ps = ps; - bind_list->port = (unsigned short) port; + bind_list->port = (unsigned short)ret; cma_bind_port(bind_list, id_priv); return 0; -err2: - idr_remove(ps, port); -err1: +err: kfree(bind_list); - return ret; + return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) @@ -2179,9 +2329,9 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) int low, high, remaining; unsigned int rover; - inet_get_local_port_range(&low, &high); + inet_get_local_port_range(&init_net, &low, &high); remaining = (high - low) + 1; - rover = net_random() % remaining + low; + rover = prandom_u32() % remaining + low; retry: if (last_used_port != rover && !idr_find(ps, (unsigned short) rover)) { @@ -2215,10 +2365,9 @@ static int cma_check_port(struct rdma_bind_list *bind_list, { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; - struct hlist_node *node; - addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { + addr = cma_src_addr(id_priv); + hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; @@ -2226,7 +2375,7 @@ static int cma_check_port(struct rdma_bind_list *bind_list, cur_id->reuseaddr) continue; - cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; + cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; @@ -2246,7 +2395,7 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) unsigned short snum; int ret; - snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; @@ -2273,33 +2422,67 @@ static int cma_bind_listen(struct rdma_id_private *id_priv) return ret; } -static int cma_get_port(struct rdma_id_private *id_priv) +static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv) { - struct idr *ps; - int ret; - switch (id_priv->id.ps) { - case RDMA_PS_SDP: - ps = &sdp_ps; - break; case RDMA_PS_TCP: - ps = &tcp_ps; - break; + return &tcp_ps; case RDMA_PS_UDP: - ps = &udp_ps; - break; + return &udp_ps; case RDMA_PS_IPOIB: - ps = &ipoib_ps; - break; + return &ipoib_ps; case RDMA_PS_IB: - ps = &ib_ps; - break; + return &ib_ps; default: - return -EPROTONOSUPPORT; + return NULL; } +} + +static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv) +{ + struct idr *ps = NULL; + struct sockaddr_ib *sib; + u64 sid_ps, mask, sid; + + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; + sid = be64_to_cpu(sib->sib_sid) & mask; + + if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { + sid_ps = RDMA_IB_IP_PS_IB; + ps = &ib_ps; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && + (sid == (RDMA_IB_IP_PS_TCP & mask))) { + sid_ps = RDMA_IB_IP_PS_TCP; + ps = &tcp_ps; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && + (sid == (RDMA_IB_IP_PS_UDP & mask))) { + sid_ps = RDMA_IB_IP_PS_UDP; + ps = &udp_ps; + } + + if (ps) { + sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); + sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | + be64_to_cpu(sib->sib_sid_mask)); + } + return ps; +} + +static int cma_get_port(struct rdma_id_private *id_priv) +{ + struct idr *ps; + int ret; + + if (cma_family(id_priv) != AF_IB) + ps = cma_select_inet_ps(id_priv); + else + ps = cma_select_ib_ps(id_priv); + if (!ps) + return -EPROTONOSUPPORT; mutex_lock(&lock); - if (cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)) + if (cma_any_port(cma_src_addr(id_priv))) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); @@ -2318,8 +2501,11 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, return 0; sin6 = (struct sockaddr_in6 *) addr; - if ((ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) && - !sin6->sin6_scope_id) + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return 0; + + if (!sin6->sin6_scope_id) return -EINVAL; dev_addr->bound_dev_if = sin6->sin6_scope_id; @@ -2334,8 +2520,8 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == RDMA_CM_IDLE) { - ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; - ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); + id->route.addr.src_addr.ss_family = AF_INET; + ret = rdma_bind_addr(id, cma_src_addr(id_priv)); if (ret) return ret; } @@ -2382,7 +2568,8 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) struct rdma_id_private *id_priv; int ret; - if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && + addr->sa_family != AF_IB) return -EAFNOSUPPORT; id_priv = container_of(id, struct rdma_id_private, id); @@ -2393,17 +2580,17 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err1; + memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { - ret = rdma_translate_ip(addr, &id->route.addr.dev_addr); + ret = cma_translate_addr(addr, &id->route.addr.dev_addr); if (ret) goto err1; - ret = cma_acquire_dev(id_priv); + ret = cma_acquire_dev(id_priv, NULL); if (ret) goto err1; } - memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; @@ -2426,62 +2613,32 @@ err1: } EXPORT_SYMBOL(rdma_bind_addr); -static int cma_format_hdr(void *hdr, enum rdma_port_space ps, - struct rdma_route *route) +static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) { struct cma_hdr *cma_hdr; - struct sdp_hh *sdp_hdr; - if (route->addr.src_addr.ss_family == AF_INET) { + cma_hdr = hdr; + cma_hdr->cma_version = CMA_VERSION; + if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; - src4 = (struct sockaddr_in *) &route->addr.src_addr; - dst4 = (struct sockaddr_in *) &route->addr.dst_addr; - - switch (ps) { - case RDMA_PS_SDP: - sdp_hdr = hdr; - if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) - return -EINVAL; - sdp_set_ip_ver(sdp_hdr, 4); - sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; - sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; - sdp_hdr->port = src4->sin_port; - break; - default: - cma_hdr = hdr; - cma_hdr->cma_version = CMA_VERSION; - cma_set_ip_ver(cma_hdr, 4); - cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; - cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; - cma_hdr->port = src4->sin_port; - break; - } - } else { + src4 = (struct sockaddr_in *) cma_src_addr(id_priv); + dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); + + cma_set_ip_ver(cma_hdr, 4); + cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + cma_hdr->port = src4->sin_port; + } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; - src6 = (struct sockaddr_in6 *) &route->addr.src_addr; - dst6 = (struct sockaddr_in6 *) &route->addr.dst_addr; - - switch (ps) { - case RDMA_PS_SDP: - sdp_hdr = hdr; - if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) - return -EINVAL; - sdp_set_ip_ver(sdp_hdr, 6); - sdp_hdr->src_addr.ip6 = src6->sin6_addr; - sdp_hdr->dst_addr.ip6 = dst6->sin6_addr; - sdp_hdr->port = src6->sin6_port; - break; - default: - cma_hdr = hdr; - cma_hdr->cma_version = CMA_VERSION; - cma_set_ip_ver(cma_hdr, 6); - cma_hdr->src_addr.ip6 = src6->sin6_addr; - cma_hdr->dst_addr.ip6 = dst6->sin6_addr; - cma_hdr->port = src6->sin6_port; - break; - } + src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); + dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); + + cma_set_ip_ver(cma_hdr, 6); + cma_hdr->src_addr.ip6 = src6->sin6_addr; + cma_hdr->dst_addr.ip6 = dst6->sin6_addr; + cma_hdr->port = src6->sin6_port; } return 0; } @@ -2511,15 +2668,10 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, event.status = ib_event->param.sidr_rep_rcvd.status; break; } - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { event.event = RDMA_CM_EVENT_ADDR_ERROR; - event.status = -EINVAL; - break; - } - if (id_priv->qkey != rep->qkey) { - event.event = RDMA_CM_EVENT_UNREACHABLE; - event.status = -EINVAL; + event.status = ret; break; } ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, @@ -2554,27 +2706,34 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; - struct rdma_route *route; struct ib_cm_id *id; - int ret; + void *private_data; + int offset, ret; - req.private_data_len = sizeof(struct cma_hdr) + - conn_param->private_data_len; + memset(&req, 0, sizeof req); + offset = cma_user_data_offset(id_priv); + req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; - req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC); - if (!req.private_data) - return -ENOMEM; + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } if (conn_param->private_data && conn_param->private_data_len) - memcpy((void *) req.private_data + sizeof(struct cma_hdr), - conn_param->private_data, conn_param->private_data_len); + memcpy(private_data + offset, conn_param->private_data, + conn_param->private_data_len); - route = &id_priv->id.route; - ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route); - if (ret) - goto out; + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); @@ -2584,9 +2743,8 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, } id_priv->cm_id.ib = id; - req.path = route->path_rec; - req.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &route->addr.dst_addr); + req.path = id_priv->id.route.path_rec; + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; @@ -2596,7 +2754,7 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, id_priv->cm_id.ib = NULL; } out: - kfree(req.private_data); + kfree(private_data); return ret; } @@ -2610,14 +2768,18 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, int offset, ret; memset(&req, 0, sizeof req); - offset = cma_user_data_offset(id_priv->id.ps); + offset = cma_user_data_offset(id_priv); req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; - private_data = kzalloc(req.private_data_len, GFP_ATOMIC); - if (!private_data) - return -ENOMEM; + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, @@ -2631,17 +2793,18 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, id_priv->cm_id.ib = id; route = &id_priv->id.route; - ret = cma_format_hdr(private_data, id_priv->id.ps, route); - if (ret) - goto out; - req.private_data = private_data; + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } req.primary_path = &route->path_rec[0]; if (route->num_paths == 2) req.alternate_path = &route->path_rec[1]; - req.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &route->addr.dst_addr); + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; @@ -2670,7 +2833,6 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; - struct sockaddr_in* sin; int ret; struct iw_cm_conn_param iw_param; @@ -2680,11 +2842,10 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, id_priv->cm_id.iw = cm_id; - sin = (struct sockaddr_in*) &id_priv->id.route.addr.src_addr; - cm_id->local_addr = *sin; - - sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr; - cm_id->remote_addr = *sin; + memcpy(&cm_id->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), + rdma_addr_size(cma_dst_addr(id_priv))); ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) @@ -2801,7 +2962,7 @@ static int cma_accept_iw(struct rdma_id_private *id_priv, } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, - enum ib_cm_sidr_status status, + enum ib_cm_sidr_status status, u32 qkey, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; @@ -2810,7 +2971,7 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, qkey); if (ret) return ret; rep.qp_num = id_priv->qp_num; @@ -2844,11 +3005,12 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) if (id->qp_type == IB_QPT_UD) { if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + conn_param->qkey, conn_param->private_data, conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, - NULL, 0); + 0, NULL, 0); } else { if (conn_param) ret = cma_accept_ib(id_priv, conn_param); @@ -2909,7 +3071,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: if (id->qp_type == IB_QPT_UD) - ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, + ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); else ret = ib_send_cm_rej(id_priv->cm_id.ib, @@ -2970,6 +3132,8 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED)) return 0; + if (!status) + status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); mutex_lock(&id_priv->qp_mutex); if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, @@ -3016,6 +3180,8 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else if (addr->sa_family == AF_IB) { + memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); } else if ((addr->sa_family == AF_INET6)) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) @@ -3043,9 +3209,12 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, if (ret) return ret; + ret = cma_set_qkey(id_priv, 0); + if (ret) + return ret; + cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); - if (id_priv->id.ps == RDMA_PS_UDP) - rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = 1; @@ -3067,7 +3236,7 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); - return PTR_RET(mc->multicast.ib); + return PTR_ERR_OR_ZERO(mc->multicast.ib); } static void iboe_mcast_work_handler(struct work_struct *work) @@ -3150,7 +3319,8 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, err = -EINVAL; goto out2; } - iboe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &mc->multicast.ib->rec.port_gid); work->id = id_priv; work->mc = mc; INIT_WORK(&work->work, iboe_mcast_work_handler); @@ -3182,7 +3352,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, if (!mc) return -ENOMEM; - memcpy(&mc->addr, addr, ip_addr_size(addr)); + memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; @@ -3227,7 +3397,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { - if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) { + if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) { list_del(&mc->list); spin_unlock_irq(&id_priv->lock); @@ -3281,9 +3451,9 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id } static int cma_netdev_callback(struct notifier_block *self, unsigned long event, - void *ctx) + void *ptr) { - struct net_device *ndev = (struct net_device *)ctx; + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct cma_device *cma_dev; struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; @@ -3437,7 +3607,8 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, sizeof *id_stats, RDMA_NL_RDMA_CM, - RDMA_NL_RDMA_CM_ID_STATS); + RDMA_NL_RDMA_CM_ID_STATS, + NLM_F_MULTI); if (!id_stats) goto out; @@ -3448,33 +3619,16 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) id_stats->bound_dev_if = id->route.addr.dev_addr.bound_dev_if; - if (id->route.addr.src_addr.ss_family == AF_INET) { - if (ibnl_put_attr(skb, nlh, - sizeof(struct sockaddr_in), - &id->route.addr.src_addr, - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) { - goto out; - } - if (ibnl_put_attr(skb, nlh, - sizeof(struct sockaddr_in), - &id->route.addr.dst_addr, - RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) { - goto out; - } - } else if (id->route.addr.src_addr.ss_family == AF_INET6) { - if (ibnl_put_attr(skb, nlh, - sizeof(struct sockaddr_in6), - &id->route.addr.src_addr, - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) { - goto out; - } - if (ibnl_put_attr(skb, nlh, - sizeof(struct sockaddr_in6), - &id->route.addr.dst_addr, - RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) { - goto out; - } - } + if (ibnl_put_attr(skb, nlh, + rdma_addr_size(cma_src_addr(id_priv)), + cma_src_addr(id_priv), + RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) + goto out; + if (ibnl_put_attr(skb, nlh, + rdma_addr_size(cma_src_addr(id_priv)), + cma_dst_addr(id_priv), + RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) + goto out; id_stats->pid = id_priv->owner; id_stats->port_space = id->ps; @@ -3539,7 +3693,6 @@ static void __exit cma_cleanup(void) rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(cma_wq); - idr_destroy(&sdp_ps); idr_destroy(&tcp_ps); idr_destroy(&udp_ps); idr_destroy(&ipoib_ps); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index a565af5c2d2..87d1936f5c1 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -49,4 +49,6 @@ void ib_sysfs_cleanup(void); int ib_cache_setup(void); void ib_cache_cleanup(void); +int ib_resolve_eth_l2_attrs(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 176c8f90f2b..9f5ad7cc33c 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -118,14 +118,13 @@ static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool, { struct hlist_head *bucket; struct ib_pool_fmr *fmr; - struct hlist_node *pos; if (!pool->cache_bucket) return NULL; bucket = pool->cache_bucket + ib_fmr_hash(*page_list); - hlist_for_each_entry(fmr, pos, bucket, cache_node) + hlist_for_each_entry(fmr, bucket, cache_node) if (io_virtual_address == fmr->io_virtual_address && page_list_len == fmr->page_list_len && !memcmp(page_list, fmr->page_list, diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 0bb99bb3880..3d2e489ab73 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -181,9 +181,16 @@ static void add_ref(struct iw_cm_id *cm_id) static void rem_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; + int cb_destroy; + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); - if (iwcm_deref_id(cm_id_priv) && - test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)) { + + /* + * Test bit before deref in case the cm_id gets freed on another + * thread. + */ + cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); + if (iwcm_deref_id(cm_id_priv) && cb_destroy) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); } @@ -327,7 +334,6 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; unsigned long flags; - int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* @@ -343,7 +349,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); /* destroy the listening endpoint */ - ret = cm_id->device->iwcm->destroy_listen(cm_id); + cm_id->device->iwcm->destroy_listen(cm_id); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_ESTABLISHED: @@ -878,6 +884,8 @@ static void cm_work_handler(struct work_struct *_work) } return; } + if (empty) + return; spin_lock_irqsave(&cm_id_priv->lock, flags); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c new file mode 100644 index 00000000000..b85ddbc979e --- /dev/null +++ b/drivers/infiniband/core/iwpm_msg.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +static const char iwpm_ulib_name[] = "iWarpPortMapperUser"; +static int iwpm_ulib_version = 3; +static int iwpm_user_pid = IWPM_PID_UNDEFINED; +static atomic_t echo_nlmsg_seq; + +int iwpm_valid_pid(void) +{ + return iwpm_user_pid > 0; +} +EXPORT_SYMBOL(iwpm_valid_pid); + +/* + * iwpm_register_pid - Send a netlink query to user space + * for the iwarp port mapper pid + * + * nlmsg attributes: + * [IWPM_NLA_REG_PID_SEQ] + * [IWPM_NLA_REG_IF_NAME] + * [IWPM_NLA_REG_IBDEV_NAME] + * [IWPM_NLA_REG_ULIB_NAME] + */ +int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto pid_query_error; + } + if (iwpm_registered_client(nl_client)) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto pid_query_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto pid_query_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the pid request message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_IFNAME_SIZE, + pm_msg->if_name, IWPM_NLA_REG_IF_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE, + pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE, + (char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME); + if (ret) + goto pid_query_error; + + pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", + __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); + + ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_set_registered(nl_client, 1); + iwpm_user_pid = IWPM_PID_UNAVAILABLE; + err_str = "Unable to send a nlmsg"; + goto pid_query_error; + } + nlmsg_request->req_buffer = pm_msg; + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +pid_query_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_register_pid); + +/* + * iwpm_add_mapping - Send a netlink add mapping message + * to the port mapper + * nlmsg attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + */ +int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto add_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto add_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto add_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto add_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + /* fill in the add mapping message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto add_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto add_mapping_error; + nlmsg_request->req_buffer = pm_msg; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto add_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +add_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_add_mapping); + +/* + * iwpm_add_and_query_mapping - Send a netlink add and query + * mapping message to the port mapper + * nlmsg attributes: + * [IWPM_NLA_QUERY_MAPPING_SEQ] + * [IWPM_NLA_QUERY_LOCAL_ADDR] + * [IWPM_NLA_QUERY_REMOTE_ADDR] + */ +int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto query_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto query_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + ret = -ENOMEM; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto query_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, + nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto query_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the query message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_QUERY_MAPPING_SEQ); + if (ret) + goto query_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR); + if (ret) + goto query_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR); + if (ret) + goto query_mapping_error; + nlmsg_request->req_buffer = pm_msg; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + err_str = "Unable to send a nlmsg"; + goto query_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +query_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_add_and_query_mapping); + +/* + * iwpm_remove_mapping - Send a netlink remove mapping message + * to the port mapper + * nlmsg attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + */ +int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto remove_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto remove_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to create a nlmsg"; + goto remove_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto remove_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + local_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto remove_mapping_error; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto remove_mapping_error; + } + iwpm_print_sockaddr(local_addr, + "remove_mapping: Local sockaddr:"); + return 0; +remove_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb_any(skb); + return ret; +} +EXPORT_SYMBOL(iwpm_remove_mapping); + +/* netlink attribute policy for the received response to register pid request */ +static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { + [IWPM_NLA_RREG_PID_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RREG_IBDEV_NAME] = { .type = NLA_STRING, + .len = IWPM_DEVNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_VER] = { .type = NLA_U16 }, + [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_register_pid_cb - Process a port mapper response to + * iwpm_register_pid() + */ +int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX]; + struct iwpm_dev_data *pm_msg; + char *dev_name, *iwpm_name; + u32 msg_seq; + u8 nl_client; + u16 iwpm_version; + const char *msg_type = "Register Pid response"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX, + resp_reg_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + nl_client = nlmsg_request->nl_client; + dev_name = (char *)nla_data(nltb[IWPM_NLA_RREG_IBDEV_NAME]); + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_RREG_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]); + + /* check device name, ulib name and version */ + if (strcmp(pm_msg->dev_name, dev_name) || + strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version != iwpm_ulib_version) { + + pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n", + __func__, dev_name, iwpm_name, iwpm_version); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto register_pid_response_exit; + } + iwpm_user_pid = cb->nlh->nlmsg_pid; + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_user_pid); + if (iwpm_valid_client(nl_client)) + iwpm_set_registered(nl_client, 1); +register_pid_response_exit: + nlmsg_request->request_done = 1; + /* always for found nlmsg_request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_register_pid_cb); + +/* netlink attribute policy for the received response to add mapping request */ +static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { + [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_add_mapping_cb - Process a port mapper response to + * iwpm_add_mapping() + */ +int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr; + struct sockaddr_storage *mapped_sockaddr; + const char *msg_type; + u32 msg_seq; + + msg_type = "Add Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX, + resp_add_policy, nltb, msg_type)) + return -EINVAL; + + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + mapped_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]); + + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr, + sizeof(*mapped_sockaddr)); + iwpm_print_sockaddr(&pm_msg->loc_addr, + "add_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "add_mapping: Mapped local sockaddr:"); + +add_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_add_mapping_cb); + +/* netlink attribute policy for the response to add and query mapping request */ +static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { + [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_add_and_query_mapping_cb - Process a port mapper response to + * iwpm_add_and_query_mapping() + */ +int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr, *remote_sockaddr; + struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr; + const char *msg_type; + u32 msg_seq; + u16 err_code; + + msg_type = "Query Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX, + resp_query_policy, nltb, msg_type)) + return -EINVAL; + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + remote_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + mapped_loc_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); + mapped_rem_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]); + + err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]); + if (err_code == IWPM_REMOTE_QUERY_REJECT) { + pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n", + __func__, cb->nlh->nlmsg_pid, msg_seq); + nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT; + } + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) || + iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) { + pr_info("%s: Incorrect local sockaddr\n", __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family || + mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr, + sizeof(*mapped_loc_sockaddr)); + memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr, + sizeof(*mapped_rem_sockaddr)); + + iwpm_print_sockaddr(&pm_msg->loc_addr, + "query_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "query_mapping: Mapped local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->rem_addr, + "query_mapping: Remote sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_rem_addr, + "query_mapping: Mapped remote sockaddr:"); +query_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb); + +/* netlink attribute policy for the received request for mapping info */ +static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { + [IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 } +}; + +/* + * iwpm_mapping_info_cb - Process a port mapper request for mapping info + */ +int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX]; + const char *msg_type = "Mapping Info response"; + int iwpm_pid; + u8 nl_client; + char *iwpm_name; + u16 iwpm_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX, + resp_mapinfo_policy, nltb, msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); + if (strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version != iwpm_ulib_version) { + pr_info("%s: Invalid port mapper name = %s version = %d\n", + __func__, iwpm_name, iwpm_version); + return ret; + } + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + if (!iwpm_valid_client(nl_client)) { + pr_info("%s: Invalid port mapper client = %d\n", + __func__, nl_client); + return ret; + } + iwpm_set_registered(nl_client, 0); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + if (!iwpm_mapinfo_available()) + return 0; + iwpm_pid = cb->nlh->nlmsg_pid; + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_pid); + ret = iwpm_send_mapinfo(nl_client, iwpm_pid); + return ret; +} +EXPORT_SYMBOL(iwpm_mapping_info_cb); + +/* netlink attribute policy for the received mapping info ack */ +static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { + [IWPM_NLA_MAPINFO_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 } +}; + +/* + * iwpm_ack_mapping_info_cb - Process a port mapper ack for + * the provided mapping info records + */ +int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_NUM_MAX]; + u32 mapinfo_send, mapinfo_ack; + const char *msg_type = "Mapping Info Ack"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX, + ack_mapinfo_policy, nltb, msg_type)) + return -EINVAL; + mapinfo_send = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]); + mapinfo_ack = nla_get_u32(nltb[IWPM_NLA_MAPINFO_ACK_NUM]); + if (mapinfo_ack != mapinfo_send) + pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n", + __func__, mapinfo_send, mapinfo_ack); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + return 0; +} +EXPORT_SYMBOL(iwpm_ack_mapping_info_cb); + +/* netlink attribute policy for the received port mapper error message */ +static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { + [IWPM_NLA_ERR_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 }, +}; + +/* + * iwpm_mapping_error_cb - Process a port mapper error message + */ +int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + struct nlattr *nltb[IWPM_NLA_ERR_MAX]; + u32 msg_seq; + u16 err_code; + const char *msg_type = "Mapping Error Msg"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX, + map_error_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]); + err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]); + pr_info("%s: Received msg seq = %u err code = %u client = %d\n", + __func__, msg_seq, err_code, nl_client); + /* look for nlmsg_request */ + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + /* not all errors have associated requests */ + pr_debug("Could not find matching req (seq = %u)\n", msg_seq); + return 0; + } + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + nlmsg_request->err_code = err_code; + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_mapping_error_cb); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c new file mode 100644 index 00000000000..69e9f84c160 --- /dev/null +++ b/drivers/infiniband/core/iwpm_util.c @@ -0,0 +1,607 @@ +/* + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +#define IWPM_HASH_BUCKET_SIZE 512 +#define IWPM_HASH_BUCKET_MASK (IWPM_HASH_BUCKET_SIZE - 1) + +static LIST_HEAD(iwpm_nlmsg_req_list); +static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock); + +static struct hlist_head *iwpm_hash_bucket; +static DEFINE_SPINLOCK(iwpm_mapinfo_lock); + +static DEFINE_MUTEX(iwpm_admin_lock); +static struct iwpm_admin_data iwpm_admin; + +int iwpm_init(u8 nl_client) +{ + if (iwpm_valid_client(nl_client)) + return -EINVAL; + mutex_lock(&iwpm_admin_lock); + if (atomic_read(&iwpm_admin.refcount) == 0) { + iwpm_hash_bucket = kzalloc(IWPM_HASH_BUCKET_SIZE * + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_hash_bucket) { + mutex_unlock(&iwpm_admin_lock); + pr_err("%s Unable to create mapinfo hash table\n", __func__); + return -ENOMEM; + } + } + atomic_inc(&iwpm_admin.refcount); + mutex_unlock(&iwpm_admin_lock); + iwpm_set_valid(nl_client, 1); + return 0; +} +EXPORT_SYMBOL(iwpm_init); + +static void free_hash_bucket(void); + +int iwpm_exit(u8 nl_client) +{ + + if (!iwpm_valid_client(nl_client)) + return -EINVAL; + mutex_lock(&iwpm_admin_lock); + if (atomic_read(&iwpm_admin.refcount) == 0) { + mutex_unlock(&iwpm_admin_lock); + pr_err("%s Incorrect usage - negative refcount\n", __func__); + return -EINVAL; + } + if (atomic_dec_and_test(&iwpm_admin.refcount)) { + free_hash_bucket(); + pr_debug("%s: Mapinfo hash table is destroyed\n", __func__); + } + mutex_unlock(&iwpm_admin_lock); + iwpm_set_valid(nl_client, 0); + return 0; +} +EXPORT_SYMBOL(iwpm_exit); + +static struct hlist_head *get_hash_bucket_head(struct sockaddr_storage *, + struct sockaddr_storage *); + +int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, + struct sockaddr_storage *mapped_sockaddr, + u8 nl_client) +{ + struct hlist_head *hash_bucket_head; + struct iwpm_mapping_info *map_info; + unsigned long flags; + + if (!iwpm_valid_client(nl_client)) + return -EINVAL; + map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); + if (!map_info) { + pr_err("%s: Unable to allocate a mapping info\n", __func__); + return -ENOMEM; + } + memcpy(&map_info->local_sockaddr, local_sockaddr, + sizeof(struct sockaddr_storage)); + memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, + sizeof(struct sockaddr_storage)); + map_info->nl_client = nl_client; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_hash_bucket_head( + &map_info->local_sockaddr, + &map_info->mapped_sockaddr); + hlist_add_head(&map_info->hlist_node, hash_bucket_head); + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return 0; +} +EXPORT_SYMBOL(iwpm_create_mapinfo); + +int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, + struct sockaddr_storage *mapped_local_addr) +{ + struct hlist_node *tmp_hlist_node; + struct hlist_head *hash_bucket_head; + struct iwpm_mapping_info *map_info = NULL; + unsigned long flags; + int ret = -EINVAL; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_hash_bucket_head( + local_sockaddr, + mapped_local_addr); + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + hash_bucket_head, hlist_node) { + + if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr, + mapped_local_addr)) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + ret = 0; + break; + } + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return ret; +} +EXPORT_SYMBOL(iwpm_remove_mapinfo); + +static void free_hash_bucket(void) +{ + struct hlist_node *tmp_hlist_node; + struct iwpm_mapping_info *map_info; + unsigned long flags; + int i; + + /* remove all the mapinfo data from the list */ + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + &iwpm_hash_bucket[i], hlist_node) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + } + } + /* free the hash list */ + kfree(iwpm_hash_bucket); + iwpm_hash_bucket = NULL; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +} + +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + unsigned long flags; + + nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp); + if (!nlmsg_request) { + pr_err("%s Unable to allocate a nlmsg_request\n", __func__); + return NULL; + } + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + kref_init(&nlmsg_request->kref); + kref_get(&nlmsg_request->kref); + nlmsg_request->nlmsg_seq = nlmsg_seq; + nlmsg_request->nl_client = nl_client; + nlmsg_request->request_done = 0; + nlmsg_request->err_code = 0; + return nlmsg_request; +} + +void iwpm_free_nlmsg_request(struct kref *kref) +{ + struct iwpm_nlmsg_request *nlmsg_request; + unsigned long flags; + + nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref); + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_del_init(&nlmsg_request->inprocess_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + if (!nlmsg_request->request_done) + pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n", + __func__, nlmsg_request->nlmsg_seq); + kfree(nlmsg_request); +} + +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq) +{ + struct iwpm_nlmsg_request *nlmsg_request; + struct iwpm_nlmsg_request *found_request = NULL; + unsigned long flags; + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list, + inprocess_list) { + if (nlmsg_request->nlmsg_seq == echo_seq) { + found_request = nlmsg_request; + kref_get(&nlmsg_request->kref); + break; + } + } + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + return found_request; +} + +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request) +{ + int ret; + init_waitqueue_head(&nlmsg_request->waitq); + + ret = wait_event_timeout(nlmsg_request->waitq, + (nlmsg_request->request_done != 0), IWPM_NL_TIMEOUT); + if (!ret) { + ret = -EINVAL; + pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n", + __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq); + } else { + ret = nlmsg_request->err_code; + } + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + return ret; +} + +int iwpm_get_nlmsg_seq(void) +{ + return atomic_inc_return(&iwpm_admin.nlmsg_seq); +} + +int iwpm_valid_client(u8 nl_client) +{ + if (nl_client >= RDMA_NL_NUM_CLIENTS) + return 0; + return iwpm_admin.client_list[nl_client]; +} + +void iwpm_set_valid(u8 nl_client, int valid) +{ + if (nl_client >= RDMA_NL_NUM_CLIENTS) + return; + iwpm_admin.client_list[nl_client] = valid; +} + +/* valid client */ +int iwpm_registered_client(u8 nl_client) +{ + return iwpm_admin.reg_list[nl_client]; +} + +/* valid client */ +void iwpm_set_registered(u8 nl_client, int reg) +{ + iwpm_admin.reg_list[nl_client] = reg; +} + +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr) +{ + if (a_sockaddr->ss_family != b_sockaddr->ss_family) + return 1; + if (a_sockaddr->ss_family == AF_INET) { + struct sockaddr_in *a4_sockaddr = + (struct sockaddr_in *)a_sockaddr; + struct sockaddr_in *b4_sockaddr = + (struct sockaddr_in *)b_sockaddr; + if (!memcmp(&a4_sockaddr->sin_addr, + &b4_sockaddr->sin_addr, sizeof(struct in_addr)) + && a4_sockaddr->sin_port == b4_sockaddr->sin_port) + return 0; + + } else if (a_sockaddr->ss_family == AF_INET6) { + struct sockaddr_in6 *a6_sockaddr = + (struct sockaddr_in6 *)a_sockaddr; + struct sockaddr_in6 *b6_sockaddr = + (struct sockaddr_in6 *)b_sockaddr; + if (!memcmp(&a6_sockaddr->sin6_addr, + &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) + && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) + return 0; + + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + } + return 1; +} + +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client) +{ + struct sk_buff *skb = NULL; + + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + pr_err("%s Unable to allocate skb\n", __func__); + goto create_nlmsg_exit; + } + if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, + NLM_F_REQUEST))) { + pr_warn("%s: Unable to put the nlmsg header\n", __func__); + dev_kfree_skb(skb); + skb = NULL; + } +create_nlmsg_exit: + return skb; +} + +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type) +{ + int nlh_len = 0; + int ret; + const char *err_str = ""; + + ret = nlmsg_validate(cb->nlh, nlh_len, policy_max-1, nlmsg_policy); + if (ret) { + err_str = "Invalid attribute"; + goto parse_nlmsg_error; + } + ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max-1, nlmsg_policy); + if (ret) { + err_str = "Unable to parse the nlmsg"; + goto parse_nlmsg_error; + } + ret = iwpm_validate_nlmsg_attr(nltb, policy_max); + if (ret) { + err_str = "Invalid NULL attribute"; + goto parse_nlmsg_error; + } + return 0; +parse_nlmsg_error: + pr_warn("%s: %s (msg type %s ret = %d)\n", + __func__, err_str, msg_type, ret); + return ret; +} + +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) +{ + struct sockaddr_in6 *sockaddr_v6; + struct sockaddr_in *sockaddr_v4; + + switch (sockaddr->ss_family) { + case AF_INET: + sockaddr_v4 = (struct sockaddr_in *)sockaddr; + pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", + msg, &sockaddr_v4->sin_addr, + ntohs(sockaddr_v4->sin_port), + ntohs(sockaddr_v4->sin_port)); + break; + case AF_INET6: + sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; + pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", + msg, &sockaddr_v6->sin6_addr, + ntohs(sockaddr_v6->sin6_port), + ntohs(sockaddr_v6->sin6_port)); + break; + default: + break; + } +} + +static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr) +{ + u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0); + u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0); + return hash; +} + +static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr) +{ + u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0); + u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0); + return hash; +} + +static struct hlist_head *get_hash_bucket_head(struct sockaddr_storage + *local_sockaddr, + struct sockaddr_storage + *mapped_sockaddr) +{ + u32 local_hash, mapped_hash, hash; + + if (local_sockaddr->ss_family == AF_INET) { + local_hash = iwpm_ipv4_jhash((struct sockaddr_in *) local_sockaddr); + mapped_hash = iwpm_ipv4_jhash((struct sockaddr_in *) mapped_sockaddr); + + } else if (local_sockaddr->ss_family == AF_INET6) { + local_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) local_sockaddr); + mapped_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) mapped_sockaddr); + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + return NULL; + } + + if (local_hash == mapped_hash) /* if port mapper isn't available */ + hash = local_hash; + else + hash = jhash_2words(local_hash, mapped_hash, 0); + + return &iwpm_hash_bucket[hash & IWPM_HASH_BUCKET_MASK]; +} + +static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto mapinfo_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + msg_seq = 0; + err_str = "Unable to put attribute of mapinfo number nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ); + if (ret) + goto mapinfo_num_error; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); + if (ret) + goto mapinfo_num_error; + ret = ibnl_unicast(skb, nlh, iwpm_pid); + if (ret) { + skb = NULL; + err_str = "Unable to send a nlmsg"; + goto mapinfo_num_error; + } + pr_debug("%s: Sent mapping number = %d\n", __func__, mapping_num); + return 0; +mapinfo_num_error: + pr_info("%s: %s\n", __func__, err_str); + if (skb) + dev_kfree_skb(skb); + return ret; +} + +static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) +{ + struct nlmsghdr *nlh = NULL; + int ret = 0; + + if (!skb) + return ret; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + pr_warn("%s Unable to put NLMSG_DONE\n", __func__); + return -ENOMEM; + } + nlh->nlmsg_type = NLMSG_DONE; + ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid); + if (ret) + pr_warn("%s Unable to send a nlmsg\n", __func__); + return ret; +} + +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) +{ + struct iwpm_mapping_info *map_info; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + int skb_num = 0, mapping_num = 0; + int i = 0, nlmsg_bytes = 0; + unsigned long flags; + const char *err_str = ""; + int ret; + + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + skb_num++; + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], + hlist_node) { + if (map_info->nl_client != nl_client) + continue; + nlh = NULL; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + ret = -ENOMEM; + err_str = "Unable to put the nlmsg header"; + goto send_mapping_info_unlock; + } + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->local_sockaddr, + IWPM_NLA_MAPINFO_LOCAL_ADDR); + if (ret) + goto send_mapping_info_unlock; + + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->mapped_sockaddr, + IWPM_NLA_MAPINFO_MAPPED_ADDR); + if (ret) + goto send_mapping_info_unlock; + + iwpm_print_sockaddr(&map_info->local_sockaddr, + "send_mapping_info: Local sockaddr:"); + iwpm_print_sockaddr(&map_info->mapped_sockaddr, + "send_mapping_info: Mapped local sockaddr:"); + mapping_num++; + nlmsg_bytes += nlh->nlmsg_len; + + /* check if all mappings can fit in one skb */ + if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) { + /* and leave room for NLMSG_DONE */ + nlmsg_bytes = 0; + skb_num++; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, + flags); + /* send the skb */ + ret = send_nlmsg_done(skb, nl_client, iwpm_pid); + skb = NULL; + if (ret) { + err_str = "Unable to send map info"; + goto send_mapping_info_exit; + } + if (skb_num == IWPM_MAPINFO_SKB_COUNT) { + ret = -ENOMEM; + err_str = "Insufficient skbs for map info"; + goto send_mapping_info_exit; + } + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + } + } + } +send_mapping_info_unlock: + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +send_mapping_info_exit: + if (ret) { + pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); + if (skb) + dev_kfree_skb(skb); + return ret; + } + send_nlmsg_done(skb, nl_client, iwpm_pid); + return send_mapinfo_num(mapping_num, nl_client, iwpm_pid); +} + +int iwpm_mapinfo_available(void) +{ + unsigned long flags; + int full_bucket = 0, i = 0; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + if (!hlist_empty(&iwpm_hash_bucket[i])) { + full_bucket = 1; + break; + } + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return full_bucket; +} diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h new file mode 100644 index 00000000000..9777c869a14 --- /dev/null +++ b/drivers/infiniband/core/iwpm_util.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IWPM_UTIL_H +#define _IWPM_UTIL_H + +#include <linux/module.h> +#include <linux/io.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/delay.h> +#include <linux/workqueue.h> +#include <linux/mutex.h> +#include <linux/jhash.h> +#include <linux/kref.h> +#include <net/netlink.h> +#include <linux/errno.h> +#include <rdma/iw_portmap.h> +#include <rdma/rdma_netlink.h> + + +#define IWPM_NL_RETRANS 3 +#define IWPM_NL_TIMEOUT (10*HZ) +#define IWPM_MAPINFO_SKB_COUNT 20 + +#define IWPM_PID_UNDEFINED -1 +#define IWPM_PID_UNAVAILABLE -2 + +struct iwpm_nlmsg_request { + struct list_head inprocess_list; + __u32 nlmsg_seq; + void *req_buffer; + u8 nl_client; + u8 request_done; + u16 err_code; + wait_queue_head_t waitq; + struct kref kref; +}; + +struct iwpm_mapping_info { + struct hlist_node hlist_node; + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + u8 nl_client; +}; + +struct iwpm_admin_data { + atomic_t refcount; + atomic_t nlmsg_seq; + int client_list[RDMA_NL_NUM_CLIENTS]; + int reg_list[RDMA_NL_NUM_CLIENTS]; +}; + +/** + * iwpm_get_nlmsg_request - Allocate and initialize netlink message request + * @nlmsg_seq: Sequence number of the netlink message + * @nl_client: The index of the netlink client + * @gfp: Indicates how the memory for the request should be allocated + * + * Returns the newly allocated netlink request object if successful, + * otherwise returns NULL + */ +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp); + +/** + * iwpm_free_nlmsg_request - Deallocate netlink message request + * @kref: Holds reference of netlink message request + */ +void iwpm_free_nlmsg_request(struct kref *kref); + +/** + * iwpm_find_nlmsg_request - Find netlink message request in the request list + * @echo_seq: Sequence number of the netlink request to find + * + * Returns the found netlink message request, + * if not found, returns NULL + */ +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq); + +/** + * iwpm_wait_complete_req - Block while servicing the netlink request + * @nlmsg_request: Netlink message request to service + * + * Wakes up, after the request is completed or expired + * Returns 0 if the request is complete without error + */ +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request); + +/** + * iwpm_get_nlmsg_seq - Get the sequence number for a netlink + * message to send to the port mapper + * + * Returns the sequence number for the netlink message. + */ +int iwpm_get_nlmsg_seq(void); + +/** + * iwpm_valid_client - Check if the port mapper client is valid + * @nl_client: The index of the netlink client + * + * Valid clients need to call iwpm_init() before using + * the port mapper + */ +int iwpm_valid_client(u8 nl_client); + +/** + * iwpm_set_valid - Set the port mapper client to valid or not + * @nl_client: The index of the netlink client + * @valid: 1 if valid or 0 if invalid + */ +void iwpm_set_valid(u8 nl_client, int valid); + +/** + * iwpm_registered_client - Check if the port mapper client is registered + * @nl_client: The index of the netlink client + * + * Call iwpm_register_pid() to register a client + */ +int iwpm_registered_client(u8 nl_client); + +/** + * iwpm_set_registered - Set the port mapper client to registered or not + * @nl_client: The index of the netlink client + * @reg: 1 if registered or 0 if not + */ +void iwpm_set_registered(u8 nl_client, int reg); + +/** + * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of + * a client to the user space port mapper + * @nl_client: The index of the netlink client + * @iwpm_pid: The pid of the user space port mapper + * + * If successful, returns the number of sent mapping info records + */ +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid); + +/** + * iwpm_mapinfo_available - Check if any mapping info records is available + * in the hash table + * + * Returns 1 if mapping information is available, otherwise returns 0 + */ +int iwpm_mapinfo_available(void); + +/** + * iwpm_compare_sockaddr - Compare two sockaddr storage structs + * + * Returns 0 if they are holding the same ip/tcp address info, + * otherwise returns 1 + */ +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr); + +/** + * iwpm_validate_nlmsg_attr - Check for NULL netlink attributes + * @nltb: Holds address of each netlink message attributes + * @nla_count: Number of netlink message attributes + * + * Returns error if any of the nla_count attributes is NULL + */ +static inline int iwpm_validate_nlmsg_attr(struct nlattr *nltb[], + int nla_count) +{ + int i; + for (i = 1; i < nla_count; i++) { + if (!nltb[i]) + return -EINVAL; + } + return 0; +} + +/** + * iwpm_create_nlmsg - Allocate skb and form a netlink message + * @nl_op: Netlink message opcode + * @nlh: Holds address of the netlink message header in skb + * @nl_client: The index of the netlink client + * + * Returns the newly allcated skb, or NULL if the tailroom of the skb + * is insufficient to store the message header and payload + */ +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client); + +/** + * iwpm_parse_nlmsg - Validate and parse the received netlink message + * @cb: Netlink callback structure + * @policy_max: Maximum attribute type to be expected + * @nlmsg_policy: Validation policy + * @nltb: Array to store policy_max parsed elements + * @msg_type: Type of netlink message + * + * Returns 0 on success or a negative error code + */ +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type); + +/** + * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port + * @sockaddr: Socket address to print + * @msg: Message to print + */ +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); +#endif diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index dc3fd1e8af0..ab31f136d04 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -1022,12 +1022,21 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->send_buf.mad, sge[0].length, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) + return -ENOMEM; + mad_send_wr->header_mapping = sge[0].addr; sge[1].addr = ib_dma_map_single(mad_agent->device, ib_get_payload(mad_send_wr), sge[1].length, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) { + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[0].length, DMA_TO_DEVICE); + return -ENOMEM; + } mad_send_wr->payload_mapping = sge[1].addr; spin_lock_irqsave(&qp_info->send_queue.lock, flags); @@ -2590,6 +2599,11 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, sizeof *mad_priv - sizeof mad_priv->header, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, + sg_list.addr))) { + ret = -ENOMEM; + break; + } mad_priv->header.mapping = sg_list.addr; recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; @@ -2663,6 +2677,7 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) int ret, i; struct ib_qp_attr *attr; struct ib_qp *qp; + u16 pkey_index; attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) { @@ -2670,6 +2685,11 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) return -ENOMEM; } + ret = ib_find_pkey(port_priv->device, port_priv->port_num, + IB_DEFAULT_PKEY_FULL, &pkey_index); + if (ret) + pkey_index = 0; + for (i = 0; i < IB_MAD_QPS_CORE; i++) { qp = port_priv->qp_info[i].qp; if (!qp) @@ -2680,7 +2700,7 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) * one is needed for the Reset to Init transition */ attr->qp_state = IB_QPS_INIT; - attr->pkey_index = 0; + attr->pkey_index = pkey_index; attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index da06abde9e0..23dd5a5c759 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -103,13 +103,13 @@ int ibnl_remove_client(int index) EXPORT_SYMBOL(ibnl_remove_client); void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, - int len, int client, int op) + int len, int client, int op, int flags) { unsigned char *prev_tail; prev_tail = skb_tail_pointer(skb); *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), - len, NLM_F_MULTI); + len, flags); if (!*nlh) goto out_nlmsg_trim; (*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail; @@ -148,7 +148,7 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) list_for_each_entry(client, &client_list, list) { if (client->index == index) { if (op < 0 || op >= client->nops || - !client->cb_table[RDMA_NL_GET_OP(op)].dump) + !client->cb_table[op].dump) return -EINVAL; { @@ -172,6 +172,20 @@ static void ibnl_rcv(struct sk_buff *skb) mutex_unlock(&ibnl_mutex); } +int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, + __u32 pid) +{ + return nlmsg_unicast(nls, skb, pid); +} +EXPORT_SYMBOL(ibnl_unicast); + +int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, + unsigned int group, gfp_t flags) +{ + return nlmsg_multicast(nls, skb, 0, group, flags); +} +EXPORT_SYMBOL(ibnl_multicast); + int __init ibnl_init(void) { struct netlink_kernel_cfg cfg = { diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a8905abc56e..233eaf541f5 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -42,7 +42,7 @@ #include <linux/kref.h> #include <linux/idr.h> #include <linux/workqueue.h> - +#include <uapi/linux/if_ether.h> #include <rdma/ib_pack.h> #include <rdma/ib_cache.h> #include "sa.h" @@ -556,6 +556,13 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->grh.hop_limit = rec->hop_limit; ah_attr->grh.traffic_class = rec->traffic_class; } + if (force_grh) { + memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); + ah_attr->vlan_id = rec->vlan_id; + } else { + ah_attr->vlan_id = 0xffff; + } + return 0; } EXPORT_SYMBOL(ib_init_ah_from_path); @@ -611,19 +618,21 @@ static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) { + bool preload = !!(gfp_mask & __GFP_WAIT); unsigned long flags; int ret, id; -retry: - if (!idr_pre_get(&query_idr, gfp_mask)) - return -ENOMEM; + if (preload) + idr_preload(gfp_mask); spin_lock_irqsave(&idr_lock, flags); - ret = idr_get_new(&query_idr, query, &id); + + id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT); + spin_unlock_irqrestore(&idr_lock, flags); - if (ret == -EAGAIN) - goto retry; - if (ret) - return ret; + if (preload) + idr_preload_end(); + if (id < 0) + return id; query->mad_buf->timeout_ms = timeout_ms; query->mad_buf->context[0] = query; @@ -650,6 +659,12 @@ void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec) } EXPORT_SYMBOL(ib_sa_unpack_path); +void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute) +{ + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute); +} +EXPORT_SYMBOL(ib_sa_pack_path); + static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) @@ -662,6 +677,9 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), mad->data, &rec); + rec.vlan_id = 0xffff; + memset(rec.dmac, 0, ETH_ALEN); + memset(rec.smac, 0, ETH_ALEN); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 246fdc15165..cbd0383f622 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -429,15 +429,19 @@ static void ib_port_release(struct kobject *kobj) struct attribute *a; int i; - for (i = 0; (a = p->gid_group.attrs[i]); ++i) - kfree(a); + if (p->gid_group.attrs) { + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); - kfree(p->gid_group.attrs); + kfree(p->gid_group.attrs); + } - for (i = 0; (a = p->pkey_group.attrs[i]); ++i) - kfree(a); + if (p->pkey_group.attrs) { + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); - kfree(p->pkey_group.attrs); + kfree(p->pkey_group.attrs); + } kfree(p); } @@ -534,10 +538,12 @@ static int add_port(struct ib_device *device, int port_num, p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - kobject_get(device->ports_parent), + device->ports_parent, "%d", port_num); - if (ret) - goto err_put; + if (ret) { + kfree(p); + return ret; + } ret = sysfs_create_group(&p->kobj, &pma_group); if (ret) @@ -545,8 +551,10 @@ static int add_port(struct ib_device *device, int port_num, p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); - if (!p->gid_group.attrs) + if (!p->gid_group.attrs) { + ret = -ENOMEM; goto err_remove_pma; + } ret = sysfs_create_group(&p->kobj, &p->gid_group); if (ret) @@ -555,8 +563,10 @@ static int add_port(struct ib_device *device, int port_num, p->pkey_group.name = "pkeys"; p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, attr.pkey_tbl_len); - if (!p->pkey_group.attrs) + if (!p->pkey_group.attrs) { + ret = -ENOMEM; goto err_remove_gid; + } ret = sysfs_create_group(&p->kobj, &p->pkey_group); if (ret) @@ -581,6 +591,7 @@ err_free_pkey: kfree(p->pkey_group.attrs[i]); kfree(p->pkey_group.attrs); + p->pkey_group.attrs = NULL; err_remove_gid: sysfs_remove_group(&p->kobj, &p->gid_group); @@ -590,13 +601,13 @@ err_free_gid: kfree(p->gid_group.attrs[i]); kfree(p->gid_group.attrs); + p->gid_group.attrs = NULL; err_remove_pma: sysfs_remove_group(&p->kobj, &pma_group); err_put: - kobject_put(device->ports_parent); - kfree(p); + kobject_put(&p->kobj); return ret; } @@ -608,6 +619,8 @@ static ssize_t show_node_type(struct device *device, switch (dev->node_type) { case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); + case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); + case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type); case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); default: return sprintf(buf, "%d: <unknown>\n", dev->node_type); @@ -803,6 +816,22 @@ static struct attribute_group iw_stats_group = { .attrs = iw_proto_stats_attrs, }; +static void free_port_list_attributes(struct ib_device *device) +{ + struct kobject *p, *t; + + list_for_each_entry_safe(p, t, &device->port_list, entry) { + struct ib_port *port = container_of(p, struct ib_port, kobj); + list_del(&p->entry); + sysfs_remove_group(p, &pma_group); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + kobject_put(p); + } + + kobject_put(device->ports_parent); +} + int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)) @@ -813,7 +842,7 @@ int ib_device_register_sysfs(struct ib_device *device, class_dev->class = &ib_class; class_dev->parent = device->dma_device; - dev_set_name(class_dev, device->name); + dev_set_name(class_dev, "%s", device->name); dev_set_drvdata(class_dev, device); INIT_LIST_HEAD(&device->port_list); @@ -829,7 +858,7 @@ int ib_device_register_sysfs(struct ib_device *device, } device->ports_parent = kobject_create_and_add("ports", - kobject_get(&class_dev->kobj)); + &class_dev->kobj); if (!device->ports_parent) { ret = -ENOMEM; goto err_put; @@ -856,21 +885,7 @@ int ib_device_register_sysfs(struct ib_device *device, return 0; err_put: - { - struct kobject *p, *t; - struct ib_port *port; - - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); - } - } - - kobject_put(&class_dev->kobj); + free_port_list_attributes(device); err_unregister: device_unregister(class_dev); @@ -881,22 +896,18 @@ err: void ib_device_unregister_sysfs(struct ib_device *device) { - struct kobject *p, *t; - struct ib_port *port; - /* Hold kobject until ib_dealloc_device() */ - kobject_get(&device->dev.kobj); + struct kobject *kobj_dev = kobject_get(&device->dev.kobj); + int i; - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); - } + if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) + sysfs_remove_group(kobj_dev, &iw_stats_group); + + free_port_list_attributes(device); + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) + device_remove_file(&device->dev, ib_class_attributes[i]); - kobject_put(device->ports_parent); device_unregister(&device->dev); } diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 49b15ac1987..f2f63933e8a 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -176,7 +176,6 @@ static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx) static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file) { struct ib_ucm_context *ctx; - int result; ctx = kzalloc(sizeof *ctx, GFP_KERNEL); if (!ctx) @@ -187,17 +186,10 @@ static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file) ctx->file = file; INIT_LIST_HEAD(&ctx->events); - do { - result = idr_pre_get(&ctx_id_table, GFP_KERNEL); - if (!result) - goto error; - - mutex_lock(&ctx_id_mutex); - result = idr_get_new(&ctx_id_table, ctx, &ctx->id); - mutex_unlock(&ctx_id_mutex); - } while (result == -EAGAIN); - - if (result) + mutex_lock(&ctx_id_mutex); + ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL); + mutex_unlock(&ctx_id_mutex); + if (ctx->id < 0) goto error; list_add_tail(&ctx->file_list, &file->ctxs); diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 2709ff58139..56a4b7ca7ee 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -47,6 +47,8 @@ #include <rdma/ib_marshall.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> +#include <rdma/ib_addr.h> +#include <rdma/ib.h> MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); @@ -55,7 +57,7 @@ MODULE_LICENSE("Dual BSD/GPL"); static unsigned int max_backlog = 1024; static struct ctl_table_header *ucma_ctl_table_hdr; -static ctl_table ucma_ctl_table[] = { +static struct ctl_table ucma_ctl_table[] = { { .procname = "max_backlog", .data = &max_backlog, @@ -145,7 +147,6 @@ static void ucma_put_ctx(struct ucma_context *ctx) static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) { struct ucma_context *ctx; - int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -156,17 +157,10 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) INIT_LIST_HEAD(&ctx->mc_list); ctx->file = file; - do { - ret = idr_pre_get(&ctx_idr, GFP_KERNEL); - if (!ret) - goto error; - - mutex_lock(&mut); - ret = idr_get_new(&ctx_idr, ctx, &ctx->id); - mutex_unlock(&mut); - } while (ret == -EAGAIN); - - if (ret) + mutex_lock(&mut); + ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL); + mutex_unlock(&mut); + if (ctx->id < 0) goto error; list_add_tail(&ctx->list, &file->ctx_list); @@ -180,23 +174,15 @@ error: static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc; - int ret; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return NULL; - do { - ret = idr_pre_get(&multicast_idr, GFP_KERNEL); - if (!ret) - goto error; - - mutex_lock(&mut); - ret = idr_get_new(&multicast_idr, mc, &mc->id); - mutex_unlock(&mut); - } while (ret == -EAGAIN); - - if (ret) + mutex_lock(&mut); + mc->id = idr_alloc(&multicast_idr, mc, 0, 0, GFP_KERNEL); + mutex_unlock(&mut); + if (mc->id < 0) goto error; mc->ctx = ctx; @@ -285,7 +271,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, goto out; } ctx->backlog--; - } else if (!ctx->uid) { + } else if (!ctx->uid || ctx->cm_id != cm_id) { /* * We ignore events for new connections until userspace has set * their context. This can only happen if an error occurs on a @@ -526,10 +512,10 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, return ret; } -static ssize_t ucma_bind_addr(struct ucma_file *file, const char __user *inbuf, +static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_bind_addr cmd; + struct rdma_ucm_bind_ip cmd; struct ucma_context *ctx; int ret; @@ -545,24 +531,75 @@ static ssize_t ucma_bind_addr(struct ucma_file *file, const char __user *inbuf, return ret; } +static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_bind cmd; + struct sockaddr *addr; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + addr = (struct sockaddr *) &cmd.addr; + if (cmd.reserved || !cmd.addr_size || (cmd.addr_size != rdma_addr_size(addr))) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_bind_addr(ctx->cm_id, addr); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_ip(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_ip cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, + (struct sockaddr *) &cmd.dst_addr, + cmd.timeout_ms); + ucma_put_ctx(ctx); + return ret; +} + static ssize_t ucma_resolve_addr(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_addr cmd; + struct sockaddr *src, *dst; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + src = (struct sockaddr *) &cmd.src_addr; + dst = (struct sockaddr *) &cmd.dst_addr; + if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size(src))) || + !cmd.dst_size || (cmd.dst_size != rdma_addr_size(dst))) + return -EINVAL; + ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, - (struct sockaddr *) &cmd.dst_addr, - cmd.timeout_ms); + ret = rdma_resolve_addr(ctx->cm_id, src, dst, cmd.timeout_ms); ucma_put_ctx(ctx); return ret; } @@ -618,24 +655,14 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { - struct rdma_dev_addr *dev_addr; - struct net_device *dev; - u16 vid = 0; resp->num_paths = route->num_paths; switch (route->num_paths) { case 0: - dev_addr = &route->addr.dev_addr; - dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (dev) { - vid = rdma_vlan_dev_vlan_id(dev); - dev_put(dev); - } - - iboe_mac_vlan_to_ll((union ib_gid *) &resp->ib_route[0].dgid, - dev_addr->dst_dev_addr, vid); - iboe_addr_get_sgid(dev_addr, - (union ib_gid *) &resp->ib_route[0].sgid); + rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, + (union ib_gid *)&resp->ib_route[0].dgid); + rdma_ip2gid((struct sockaddr *)&route->addr.src_addr, + (union ib_gid *)&resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(0xffff); break; case 2: @@ -665,7 +692,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_query_route cmd; + struct rdma_ucm_query cmd; struct rdma_ucm_query_route_resp resp; struct ucma_context *ctx; struct sockaddr *addr; @@ -725,7 +752,162 @@ out: return ret; } -static void ucma_copy_conn_param(struct rdma_conn_param *dst, +static void ucma_query_device_addr(struct rdma_cm_id *cm_id, + struct rdma_ucm_query_addr_resp *resp) +{ + if (!cm_id->device) + return; + + resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->port_num = cm_id->port_num; + resp->pkey = (__force __u16) cpu_to_be16( + ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); +} + +static ssize_t ucma_query_addr(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; + resp.src_size = rdma_addr_size(addr); + memcpy(&resp.src_addr, addr, resp.src_size); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; + resp.dst_size = rdma_addr_size(addr); + memcpy(&resp.dst_addr, addr, resp.dst_size); + + ucma_query_device_addr(ctx->cm_id, &resp); + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query_path(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_path_resp *resp; + int i, ret = 0; + + if (out_len < sizeof(*resp)) + return -ENOSPC; + + resp = kzalloc(out_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + resp->num_paths = ctx->cm_id->route.num_paths; + for (i = 0, out_len -= sizeof(*resp); + i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); + i++, out_len -= sizeof(struct ib_path_rec_data)) { + + resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL; + ib_sa_pack_path(&ctx->cm_id->route.path_rec[i], + &resp->path_data[i].path_rec); + } + + if (copy_to_user(response, resp, + sizeof(*resp) + (i * sizeof(struct ib_path_rec_data)))) + ret = -EFAULT; + + kfree(resp); + return ret; +} + +static ssize_t ucma_query_gid(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr_ib *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + ucma_query_device_addr(ctx->cm_id, &resp); + + addr = (struct sockaddr_ib *) &resp.src_addr; + resp.src_size = sizeof(*addr); + if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr, + (union ib_gid *) &addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.src_addr); + } + + addr = (struct sockaddr_ib *) &resp.dst_addr; + resp.dst_size = sizeof(*addr); + if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr, + (union ib_gid *) &addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.dst_addr); + } + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_query cmd; + struct ucma_context *ctx; + void __user *response; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + response = (void __user *)(unsigned long) cmd.response; + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + switch (cmd.option) { + case RDMA_USER_CM_QUERY_ADDR: + ret = ucma_query_addr(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_PATH: + ret = ucma_query_path(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_GID: + ret = ucma_query_gid(ctx, response, out_len); + break; + default: + ret = -ENOSYS; + break; + } + + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_copy_conn_param(struct rdma_cm_id *id, + struct rdma_conn_param *dst, struct rdma_ucm_conn_param *src) { dst->private_data = src->private_data; @@ -737,6 +919,7 @@ static void ucma_copy_conn_param(struct rdma_conn_param *dst, dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num; + dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; } static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, @@ -757,7 +940,7 @@ static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); - ucma_copy_conn_param(&conn_param, &cmd.conn_param); + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); ret = rdma_connect(ctx->cm_id, &conn_param); ucma_put_ctx(ctx); return ret; @@ -800,7 +983,7 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, return PTR_ERR(ctx); if (cmd.conn_param.valid) { - ucma_copy_conn_param(&conn_param, &cmd.conn_param); + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); mutex_lock(&file->mut); ret = rdma_accept(ctx->cm_id, &conn_param); if (!ret) @@ -1036,23 +1219,23 @@ static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, return ret; } -static ssize_t ucma_join_multicast(struct ucma_file *file, - const char __user *inbuf, - int in_len, int out_len) +static ssize_t ucma_process_join(struct ucma_file *file, + struct rdma_ucm_join_mcast *cmd, int out_len) { - struct rdma_ucm_join_mcast cmd; struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; struct ucma_multicast *mc; + struct sockaddr *addr; int ret; if (out_len < sizeof(resp)) return -ENOSPC; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; + addr = (struct sockaddr *) &cmd->addr; + if (cmd->reserved || !cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr))) + return -EINVAL; - ctx = ucma_get_ctx(file, cmd.id); + ctx = ucma_get_ctx(file, cmd->id); if (IS_ERR(ctx)) return PTR_ERR(ctx); @@ -1063,14 +1246,14 @@ static ssize_t ucma_join_multicast(struct ucma_file *file, goto err1; } - mc->uid = cmd.uid; - memcpy(&mc->addr, &cmd.addr, sizeof cmd.addr); + mc->uid = cmd->uid; + memcpy(&mc->addr, addr, cmd->addr_size); ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc); if (ret) goto err2; resp.id = mc->id; - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user((void __user *)(unsigned long) cmd->response, &resp, sizeof(resp))) { ret = -EFAULT; goto err3; @@ -1095,6 +1278,38 @@ err1: return ret; } +static ssize_t ucma_join_ip_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_ip_mcast cmd; + struct rdma_ucm_join_mcast join_cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + join_cmd.response = cmd.response; + join_cmd.uid = cmd.uid; + join_cmd.id = cmd.id; + join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr); + join_cmd.reserved = 0; + memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); + + return ucma_process_join(file, &join_cmd, out_len); +} + +static ssize_t ucma_join_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_mcast cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + return ucma_process_join(file, &cmd, out_len); +} + static ssize_t ucma_leave_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -1237,25 +1452,29 @@ file_put: static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) = { - [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, - [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, - [RDMA_USER_CM_CMD_BIND_ADDR] = ucma_bind_addr, - [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, - [RDMA_USER_CM_CMD_RESOLVE_ROUTE]= ucma_resolve_route, - [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, - [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, - [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, - [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, - [RDMA_USER_CM_CMD_REJECT] = ucma_reject, - [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, - [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, - [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, - [RDMA_USER_CM_CMD_GET_OPTION] = NULL, - [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, - [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, - [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast, - [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, - [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id + [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, + [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, + [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip, + [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip, + [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route, + [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, + [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, + [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, + [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, + [RDMA_USER_CM_CMD_REJECT] = ucma_reject, + [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, + [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, + [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, + [RDMA_USER_CM_CMD_GET_OPTION] = NULL, + [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, + [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, + [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast, + [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, + [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id, + [RDMA_USER_CM_CMD_QUERY] = ucma_query, + [RDMA_USER_CM_CMD_BIND] = ucma_bind, + [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, + [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast }; static ssize_t ucma_write(struct file *filp, const char __user *buf, diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index a8411232207..a3a2e9c1639 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -42,29 +42,29 @@ #include "uverbs.h" -#define IB_UMEM_MAX_PAGE_CHUNK \ - ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) / \ - ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] - \ - (void *) &((struct ib_umem_chunk *) 0)->page_list[0])) static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { - struct ib_umem_chunk *chunk, *tmp; + struct scatterlist *sg; + struct page *page; int i; - list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) { - ib_dma_unmap_sg(dev, chunk->page_list, - chunk->nents, DMA_BIDIRECTIONAL); - for (i = 0; i < chunk->nents; ++i) { - struct page *page = sg_page(&chunk->page_list[i]); + if (umem->nmap > 0) + ib_dma_unmap_sg(dev, umem->sg_head.sgl, + umem->nmap, + DMA_BIDIRECTIONAL); - if (umem->writable && dirty) - set_page_dirty_lock(page); - put_page(page); - } + for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { - kfree(chunk); + page = sg_page(sg); + if (umem->writable && dirty) + set_page_dirty_lock(page); + put_page(page); } + + sg_free_table(&umem->sg_head); + return; + } /** @@ -81,15 +81,15 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct ib_umem *umem; struct page **page_list; struct vm_area_struct **vma_list; - struct ib_umem_chunk *chunk; unsigned long locked; unsigned long lock_limit; unsigned long cur_base; unsigned long npages; int ret; - int off; int i; DEFINE_DMA_ATTRS(attrs); + struct scatterlist *sg, *sg_list_start; + int need_release = 0; if (dmasync) dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); @@ -97,7 +97,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (!can_do_mlock()) return ERR_PTR(-EPERM); - umem = kmalloc(sizeof *umem, GFP_KERNEL); + umem = kzalloc(sizeof *umem, GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); @@ -117,8 +117,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, /* We assume the memory is from hugetlb until proved otherwise */ umem->hugetlb = 1; - INIT_LIST_HEAD(&umem->chunk_list); - page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { kfree(umem); @@ -147,7 +145,18 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, cur_base = addr & PAGE_MASK; - ret = 0; + if (npages == 0) { + ret = -EINVAL; + goto out; + } + + ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); + if (ret) + goto out; + + need_release = 1; + sg_list_start = umem->sg_head.sgl; + while (npages) { ret = get_user_pages(current, current->mm, cur_base, min_t(unsigned long, npages, @@ -157,54 +166,38 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (ret < 0) goto out; + umem->npages += ret; cur_base += ret * PAGE_SIZE; npages -= ret; - off = 0; - - while (ret) { - chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) * - min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK), - GFP_KERNEL); - if (!chunk) { - ret = -ENOMEM; - goto out; - } - - chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK); - sg_init_table(chunk->page_list, chunk->nents); - for (i = 0; i < chunk->nents; ++i) { - if (vma_list && - !is_vm_hugetlb_page(vma_list[i + off])) - umem->hugetlb = 0; - sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0); - } - - chunk->nmap = ib_dma_map_sg_attrs(context->device, - &chunk->page_list[0], - chunk->nents, - DMA_BIDIRECTIONAL, - &attrs); - if (chunk->nmap <= 0) { - for (i = 0; i < chunk->nents; ++i) - put_page(sg_page(&chunk->page_list[i])); - kfree(chunk); - - ret = -ENOMEM; - goto out; - } - - ret -= chunk->nents; - off += chunk->nents; - list_add_tail(&chunk->list, &umem->chunk_list); + for_each_sg(sg_list_start, sg, ret, i) { + if (vma_list && !is_vm_hugetlb_page(vma_list[i])) + umem->hugetlb = 0; + + sg_set_page(sg, page_list[i], PAGE_SIZE, 0); } - ret = 0; + /* preparing for next loop */ + sg_list_start = sg; } + umem->nmap = ib_dma_map_sg_attrs(context->device, + umem->sg_head.sgl, + umem->npages, + DMA_BIDIRECTIONAL, + &attrs); + + if (umem->nmap <= 0) { + ret = -ENOMEM; + goto out; + } + + ret = 0; + out: if (ret < 0) { - __ib_umem_release(context->device, umem, 0); + if (need_release) + __ib_umem_release(context->device, umem, 0); kfree(umem); } else current->mm->pinned_vm = locked; @@ -278,17 +271,16 @@ EXPORT_SYMBOL(ib_umem_release); int ib_umem_page_count(struct ib_umem *umem) { - struct ib_umem_chunk *chunk; int shift; int i; int n; + struct scatterlist *sg; shift = ilog2(umem->page_size); n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - for (i = 0; i < chunk->nmap; ++i) - n += sg_dma_len(&chunk->page_list[i]) >> shift; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) + n += sg_dma_len(sg) >> shift; return n; } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index f0d588f8859..1acb9910055 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -98,7 +98,7 @@ struct ib_umad_port { struct ib_umad_device { int start_port, end_port; - struct kref ref; + struct kobject kobj; struct ib_umad_port port[0]; }; @@ -134,14 +134,18 @@ static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device); -static void ib_umad_release_dev(struct kref *ref) +static void ib_umad_release_dev(struct kobject *kobj) { struct ib_umad_device *dev = - container_of(ref, struct ib_umad_device, ref); + container_of(kobj, struct ib_umad_device, kobj); kfree(dev); } +static struct kobj_type ib_umad_dev_ktype = { + .release = ib_umad_release_dev, +}; + static int hdr_size(struct ib_umad_file *file) { return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) : @@ -780,27 +784,19 @@ static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_umad_file *file; - int ret; + int ret = -ENXIO; port = container_of(inode->i_cdev, struct ib_umad_port, cdev); - if (port) - kref_get(&port->umad_dev->ref); - else - return -ENXIO; mutex_lock(&port->file_mutex); - if (!port->ib_dev) { - ret = -ENXIO; + if (!port->ib_dev) goto out; - } + ret = -ENOMEM; file = kzalloc(sizeof *file, GFP_KERNEL); - if (!file) { - kref_put(&port->umad_dev->ref, ib_umad_release_dev); - ret = -ENOMEM; + if (!file) goto out; - } mutex_init(&file->mutex); spin_lock_init(&file->send_lock); @@ -814,6 +810,13 @@ static int ib_umad_open(struct inode *inode, struct file *filp) list_add_tail(&file->port_list, &port->file_list); ret = nonseekable_open(inode, filp); + if (ret) { + list_del(&file->port_list); + kfree(file); + goto out; + } + + kobject_get(&port->umad_dev->kobj); out: mutex_unlock(&port->file_mutex); @@ -852,7 +855,7 @@ static int ib_umad_close(struct inode *inode, struct file *filp) mutex_unlock(&file->port->file_mutex); kfree(file); - kref_put(&dev->ref, ib_umad_release_dev); + kobject_put(&dev->kobj); return 0; } @@ -880,10 +883,6 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) int ret; port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev); - if (port) - kref_get(&port->umad_dev->ref); - else - return -ENXIO; if (filp->f_flags & O_NONBLOCK) { if (down_trylock(&port->sm_sem)) { @@ -898,17 +897,27 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) } ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); - if (ret) { - up(&port->sm_sem); - goto fail; - } + if (ret) + goto err_up_sem; filp->private_data = port; - return nonseekable_open(inode, filp); + ret = nonseekable_open(inode, filp); + if (ret) + goto err_clr_sm_cap; + + kobject_get(&port->umad_dev->kobj); + + return 0; + +err_clr_sm_cap: + swap(props.set_port_cap_mask, props.clr_port_cap_mask); + ib_modify_port(port->ib_dev, port->port_num, 0, &props); + +err_up_sem: + up(&port->sm_sem); fail: - kref_put(&port->umad_dev->ref, ib_umad_release_dev); return ret; } @@ -927,7 +936,7 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp) up(&port->sm_sem); - kref_put(&port->umad_dev->ref, ib_umad_release_dev); + kobject_put(&port->umad_dev->kobj); return ret; } @@ -995,6 +1004,7 @@ static int find_overflow_devnum(void) } static int ib_umad_init_port(struct ib_device *device, int port_num, + struct ib_umad_device *umad_dev, struct ib_umad_port *port) { int devnum; @@ -1027,6 +1037,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, cdev_init(&port->cdev, &umad_fops); port->cdev.owner = THIS_MODULE; + port->cdev.kobj.parent = &umad_dev->kobj; kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); if (cdev_add(&port->cdev, base, 1)) goto err_cdev; @@ -1045,6 +1056,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, base += IB_UMAD_MAX_PORTS; cdev_init(&port->sm_cdev, &umad_sm_fops); port->sm_cdev.owner = THIS_MODULE; + port->sm_cdev.kobj.parent = &umad_dev->kobj; kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); if (cdev_add(&port->sm_cdev, base, 1)) goto err_sm_cdev; @@ -1138,7 +1150,7 @@ static void ib_umad_add_one(struct ib_device *device) if (!umad_dev) return; - kref_init(&umad_dev->ref); + kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype); umad_dev->start_port = s; umad_dev->end_port = e; @@ -1146,7 +1158,8 @@ static void ib_umad_add_one(struct ib_device *device) for (i = s; i <= e; ++i) { umad_dev->port[i - s].umad_dev = umad_dev; - if (ib_umad_init_port(device, i, &umad_dev->port[i - s])) + if (ib_umad_init_port(device, i, umad_dev, + &umad_dev->port[i - s])) goto err; } @@ -1158,7 +1171,7 @@ err: while (--i >= s) ib_umad_kill_port(&umad_dev->port[i - s]); - kref_put(&umad_dev->ref, ib_umad_release_dev); + kobject_put(&umad_dev->kobj); } static void ib_umad_remove_one(struct ib_device *device) @@ -1172,7 +1185,7 @@ static void ib_umad_remove_one(struct ib_device *device) for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) ib_umad_kill_port(&umad_dev->port[i]); - kref_put(&umad_dev->ref, ib_umad_release_dev); + kobject_put(&umad_dev->kobj); } static char *umad_devnode(struct device *dev, umode_t *mode) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 5bcb2afd3dc..a283274a5a0 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -47,6 +47,22 @@ #include <rdma/ib_umem.h> #include <rdma/ib_user_verbs.h> +#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (const void __user *) (ibuf); \ + (udata)->outbuf = (void __user *) (obuf); \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + +#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \ + (udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL; \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + /* * Our lifetime rules for these structs are the following: * @@ -135,6 +151,7 @@ struct ib_usrq_object { struct ib_uqp_object { struct ib_uevent_object uevent; struct list_head mcast_list; + struct ib_uxrcd_object *uxrcd; }; struct ib_ucq_object { @@ -155,6 +172,7 @@ extern struct idr ib_uverbs_cq_idr; extern struct idr ib_uverbs_qp_idr; extern struct idr ib_uverbs_srq_idr; extern struct idr ib_uverbs_xrcd_idr; +extern struct idr ib_uverbs_rule_idr; void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); @@ -176,6 +194,22 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +struct ib_uverbs_flow_spec { + union { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_eth eth; + struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + }; +}; + #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ const char __user *buf, int in_len, \ @@ -188,6 +222,8 @@ IB_UVERBS_DECLARE_CMD(alloc_pd); IB_UVERBS_DECLARE_CMD(dealloc_pd); IB_UVERBS_DECLARE_CMD(reg_mr); IB_UVERBS_DECLARE_CMD(dereg_mr); +IB_UVERBS_DECLARE_CMD(alloc_mw); +IB_UVERBS_DECLARE_CMD(dealloc_mw); IB_UVERBS_DECLARE_CMD(create_comp_channel); IB_UVERBS_DECLARE_CMD(create_cq); IB_UVERBS_DECLARE_CMD(resize_cq); @@ -214,4 +250,12 @@ IB_UVERBS_DECLARE_CMD(create_xsrq); IB_UVERBS_DECLARE_CMD(open_xrcd); IB_UVERBS_DECLARE_CMD(close_xrcd); +#define IB_UVERBS_DECLARE_EX_CMD(name) \ + int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ + struct ib_udata *ucore, \ + struct ib_udata *uhw) + +IB_UVERBS_DECLARE_EX_CMD(create_flow); +IB_UVERBS_DECLARE_EX_CMD(destroy_flow); + #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 0cb0007724a..ea6203ee7bc 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -40,6 +40,7 @@ #include <asm/uaccess.h> #include "uverbs.h" +#include "core_priv.h" struct uverbs_lock_class { struct lock_class_key key; @@ -48,19 +49,13 @@ struct uverbs_lock_class { static struct uverbs_lock_class pd_lock_class = { .name = "PD-uobj" }; static struct uverbs_lock_class mr_lock_class = { .name = "MR-uobj" }; +static struct uverbs_lock_class mw_lock_class = { .name = "MW-uobj" }; static struct uverbs_lock_class cq_lock_class = { .name = "CQ-uobj" }; static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" }; static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; - -#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ - do { \ - (udata)->inbuf = (void __user *) (ibuf); \ - (udata)->outbuf = (void __user *) (obuf); \ - (udata)->inlen = (ilen); \ - (udata)->outlen = (olen); \ - } while (0) +static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; /* * The ib_uobject locking scheme is as follows: @@ -124,18 +119,17 @@ static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj) { int ret; -retry: - if (!idr_pre_get(idr, GFP_KERNEL)) - return -ENOMEM; - + idr_preload(GFP_KERNEL); spin_lock(&ib_uverbs_idr_lock); - ret = idr_get_new(idr, uobj, &uobj->id); - spin_unlock(&ib_uverbs_idr_lock); - if (ret == -EAGAIN) - goto retry; + ret = idr_alloc(idr, uobj, 0, 0, GFP_NOWAIT); + if (ret >= 0) + uobj->id = ret; - return ret; + spin_unlock(&ib_uverbs_idr_lock); + idr_preload_end(); + + return ret < 0 ? ret : 0; } void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj) @@ -330,11 +324,12 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); INIT_LIST_HEAD(&ucontext->xrcd_list); + INIT_LIST_HEAD(&ucontext->rule_list); ucontext->closing = 0; resp.num_comp_vectors = file->device->num_comp_vectors; - ret = get_unused_fd(); + ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) goto err_free; resp.async_fd = ret; @@ -730,7 +725,7 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, goto err_tree_mutex_unlock; } - inode = f.file->f_path.dentry->d_inode; + inode = file_inode(f.file); xrcd = find_xrcd(file->device, inode); if (!xrcd && !(cmd.oflags & O_CREAT)) { /* no file descriptor. Need CREATE flag */ @@ -935,13 +930,9 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; - /* - * Local write permission is required if remote write or - * remote atomic permission is also requested. - */ - if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && - !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE)) - return -EINVAL; + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + return ret; uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) @@ -1049,6 +1040,126 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, return in_len; } +ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_alloc_mw cmd; + struct ib_uverbs_alloc_mw_resp resp; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_mw *mw; + int ret; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) + return -ENOMEM; + + init_uobj(uobj, 0, file->ucontext, &mw_lock_class); + down_write(&uobj->mutex); + + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_free; + } + + mw = pd->device->alloc_mw(pd, cmd.mw_type); + if (IS_ERR(mw)) { + ret = PTR_ERR(mw); + goto err_put; + } + + mw->device = pd->device; + mw->pd = pd; + mw->uobject = uobj; + atomic_inc(&pd->usecnt); + + uobj->object = mw; + ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj); + if (ret) + goto err_unalloc; + + memset(&resp, 0, sizeof(resp)); + resp.rkey = mw->rkey; + resp.mw_handle = uobj->id; + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) { + ret = -EFAULT; + goto err_copy; + } + + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->mw_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_mw_idr, uobj); + +err_unalloc: + ib_dealloc_mw(mw); + +err_put: + put_pd_read(pd); + +err_free: + put_uobj_write(uobj); + return ret; +} + +ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_dealloc_mw cmd; + struct ib_mw *mw; + struct ib_uobject *uobj; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext); + if (!uobj) + return -EINVAL; + + mw = uobj->object; + + ret = ib_dealloc_mw(mw); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_mw_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + + return in_len; +} + ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -1064,7 +1175,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = get_unused_fd(); + ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) return ret; resp.fd = ret; @@ -1406,7 +1517,7 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - obj = kmalloc(sizeof *obj, GFP_KERNEL); + obj = kzalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; @@ -1522,8 +1633,13 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, goto err_copy; } - if (xrcd) + if (xrcd) { + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); put_xrcd_read(xrcd_uobj); + } + if (pd) put_pd_read(pd); if (scq) @@ -1633,6 +1749,8 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, goto err_remove; } + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); put_xrcd_read(xrcd_uobj); mutex_lock(&file->mutex); @@ -1844,6 +1962,9 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; if (qp->real_qp == qp) { + ret = ib_resolve_eth_l2_attrs(qp, attr, &cmd.attr_mask); + if (ret) + goto out; ret = qp->device->modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata); } else { @@ -1899,6 +2020,9 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, if (ret) return ret; + if (obj->uxrcd) + atomic_dec(&obj->uxrcd->refcnt); + idr_remove_uobj(&ib_uverbs_qp_idr, uobj); mutex_lock(&file->mutex); @@ -1994,6 +2118,9 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, } next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; + if (next->opcode == IB_WR_SEND_WITH_IMM) + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; } else { switch (next->opcode) { case IB_WR_RDMA_WRITE_WITH_IMM: @@ -2467,6 +2594,251 @@ out_put: return ret ? ret : in_len; } +static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + if (kern_spec->reserved) + return -EINVAL; + + ib_spec->type = kern_spec->type; + + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + ib_spec->eth.size = sizeof(struct ib_flow_spec_eth); + if (ib_spec->eth.size != kern_spec->eth.size) + return -EINVAL; + memcpy(&ib_spec->eth.val, &kern_spec->eth.val, + sizeof(struct ib_flow_eth_filter)); + memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask, + sizeof(struct ib_flow_eth_filter)); + break; + case IB_FLOW_SPEC_IPV4: + ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4); + if (ib_spec->ipv4.size != kern_spec->ipv4.size) + return -EINVAL; + memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val, + sizeof(struct ib_flow_ipv4_filter)); + memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask, + sizeof(struct ib_flow_ipv4_filter)); + break; + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp); + if (ib_spec->tcp_udp.size != kern_spec->tcp_udp.size) + return -EINVAL; + memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val, + sizeof(struct ib_flow_tcp_udp_filter)); + memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask, + sizeof(struct ib_flow_tcp_udp_filter)); + break; + default: + return -EINVAL; + } + return 0; +} + +int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_create_flow cmd; + struct ib_uverbs_create_flow_resp resp; + struct ib_uobject *uobj; + struct ib_flow *flow_id; + struct ib_uverbs_flow_attr *kern_flow_attr; + struct ib_flow_attr *flow_attr; + struct ib_qp *qp; + int err = 0; + void *kern_spec; + void *ib_spec; + int i; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + if (ucore->outlen < sizeof(resp)) + return -ENOSPC; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + ucore->inbuf += sizeof(cmd); + ucore->inlen -= sizeof(cmd); + + if (cmd.comp_mask) + return -EINVAL; + + if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER && + !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW)) + return -EPERM; + + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) + return -EINVAL; + + if (cmd.flow_attr.size > ucore->inlen || + cmd.flow_attr.size > + (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) + return -EINVAL; + + if (cmd.flow_attr.reserved[0] || + cmd.flow_attr.reserved[1]) + return -EINVAL; + + if (cmd.flow_attr.num_of_specs) { + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); + if (!kern_flow_attr) + return -ENOMEM; + + memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); + err = ib_copy_from_udata(kern_flow_attr + 1, ucore, + cmd.flow_attr.size); + if (err) + goto err_free_attr; + } else { + kern_flow_attr = &cmd.flow_attr; + } + + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) { + err = -ENOMEM; + goto err_free_attr; + } + init_uobj(uobj, 0, file->ucontext, &rule_lock_class); + down_write(&uobj->mutex); + + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) { + err = -EINVAL; + goto err_uobj; + } + + flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL); + if (!flow_attr) { + err = -ENOMEM; + goto err_put; + } + + flow_attr->type = kern_flow_attr->type; + flow_attr->priority = kern_flow_attr->priority; + flow_attr->num_of_specs = kern_flow_attr->num_of_specs; + flow_attr->port = kern_flow_attr->port; + flow_attr->flags = kern_flow_attr->flags; + flow_attr->size = sizeof(*flow_attr); + + kern_spec = kern_flow_attr + 1; + ib_spec = flow_attr + 1; + for (i = 0; i < flow_attr->num_of_specs && + cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) && + cmd.flow_attr.size >= + ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { + err = kern_spec_to_ib_spec(kern_spec, ib_spec); + if (err) + goto err_free; + flow_attr->size += + ((union ib_flow_spec *) ib_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size; + kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size; + ib_spec += ((union ib_flow_spec *) ib_spec)->size; + } + if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { + pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", + i, cmd.flow_attr.size); + err = -EINVAL; + goto err_free; + } + flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); + if (IS_ERR(flow_id)) { + err = PTR_ERR(flow_id); + goto err_free; + } + flow_id->qp = qp; + flow_id->uobject = uobj; + uobj->object = flow_id; + + err = idr_add_uobj(&ib_uverbs_rule_idr, uobj); + if (err) + goto destroy_flow; + + memset(&resp, 0, sizeof(resp)); + resp.flow_handle = uobj->id; + + err = ib_copy_to_udata(ucore, + &resp, sizeof(resp)); + if (err) + goto err_copy; + + put_qp_read(qp); + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->rule_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + kfree(flow_attr); + if (cmd.flow_attr.num_of_specs) + kfree(kern_flow_attr); + return 0; +err_copy: + idr_remove_uobj(&ib_uverbs_rule_idr, uobj); +destroy_flow: + ib_destroy_flow(flow_id); +err_free: + kfree(flow_attr); +err_put: + put_qp_read(qp); +err_uobj: + put_uobj_write(uobj); +err_free_attr: + if (cmd.flow_attr.num_of_specs) + kfree(kern_flow_attr); + return err; +} + +int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_destroy_flow cmd; + struct ib_flow *flow_id; + struct ib_uobject *uobj; + int ret; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EINVAL; + + uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, + file->ucontext); + if (!uobj) + return -EINVAL; + flow_id = uobj->object; + + ret = ib_destroy_flow(flow_id); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + idr_remove_uobj(&ib_uverbs_rule_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + + return ret; +} + static int __uverbs_create_xsrq(struct ib_uverbs_file *file, struct ib_uverbs_create_xsrq *cmd, struct ib_udata *udata) @@ -2740,6 +3112,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, struct ib_srq *srq; struct ib_uevent_object *obj; int ret = -EINVAL; + struct ib_usrq_object *us; + enum ib_srq_type srq_type; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -2749,6 +3123,7 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, return -EINVAL; srq = uobj->object; obj = container_of(uobj, struct ib_uevent_object, uobject); + srq_type = srq->srq_type; ret = ib_destroy_srq(srq); if (!ret) @@ -2759,6 +3134,11 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, if (ret) return ret; + if (srq_type == IB_SRQT_XRC) { + us = container_of(obj, struct ib_usrq_object, uevent); + atomic_dec(&us->uxrcd->refcnt); + } + idr_remove_uobj(&ib_uverbs_srq_idr, uobj); mutex_lock(&file->mutex); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 6f2ce6fa98f..08219fb3338 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -73,6 +73,7 @@ DEFINE_IDR(ib_uverbs_cq_idr); DEFINE_IDR(ib_uverbs_qp_idr); DEFINE_IDR(ib_uverbs_srq_idr); DEFINE_IDR(ib_uverbs_xrcd_idr); +DEFINE_IDR(ib_uverbs_rule_idr); static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); @@ -87,6 +88,8 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, + [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw, + [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw, [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel, [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, @@ -111,7 +114,14 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd, [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, - [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp + [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, +}; + +static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) = { + [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, + [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow }; static void ib_uverbs_add_one(struct ib_device *device); @@ -201,6 +211,23 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uobj); } + /* Remove MWs before QPs, in order to support type 2A MWs. */ + list_for_each_entry_safe(uobj, tmp, &context->mw_list, list) { + struct ib_mw *mw = uobj->object; + + idr_remove_uobj(&ib_uverbs_mw_idr, uobj); + ib_dealloc_mw(mw); + kfree(uobj); + } + + list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) { + struct ib_flow *flow_id = uobj->object; + + idr_remove_uobj(&ib_uverbs_rule_idr, uobj); + ib_destroy_flow(flow_id); + kfree(uobj); + } + list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) { struct ib_qp *qp = uobj->object; struct ib_uqp_object *uqp = @@ -240,8 +267,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uevent); } - /* XXX Free MWs */ - list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) { struct ib_mr *mr = uobj->object; @@ -567,6 +592,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, { struct ib_uverbs_file *file = filp->private_data; struct ib_uverbs_cmd_hdr hdr; + __u32 flags; if (count < sizeof hdr) return -EINVAL; @@ -574,22 +600,110 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof hdr)) return -EFAULT; - if (hdr.in_words * 4 != count) - return -EINVAL; + flags = (hdr.command & + IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; - if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[hdr.command]) - return -EINVAL; + if (!flags) { + __u32 command; - if (!file->ucontext && - hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT) - return -EINVAL; + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; + + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + + if (command >= ARRAY_SIZE(uverbs_cmd_table) || + !uverbs_cmd_table[command]) + return -EINVAL; + + if (!file->ucontext && + command != IB_USER_VERBS_CMD_GET_CONTEXT) + return -EINVAL; + + if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command))) + return -ENOSYS; + + if (hdr.in_words * 4 != count) + return -EINVAL; + + return uverbs_cmd_table[command](file, + buf + sizeof(hdr), + hdr.in_words * 4, + hdr.out_words * 4); + + } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { + __u32 command; - if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) - return -ENOSYS; + struct ib_uverbs_ex_cmd_hdr ex_hdr; + struct ib_udata ucore; + struct ib_udata uhw; + int err; + size_t written_count = count; + + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; + + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + + if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || + !uverbs_ex_cmd_table[command]) + return -ENOSYS; + + if (!file->ucontext) + return -EINVAL; + + if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command))) + return -ENOSYS; + + if (count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) + return -EFAULT; + + count -= sizeof(hdr) + sizeof(ex_hdr); + buf += sizeof(hdr) + sizeof(ex_hdr); + + if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) + return -EINVAL; + + if (ex_hdr.cmd_hdr_reserved) + return -EINVAL; + + if (ex_hdr.response) { + if (!hdr.out_words && !ex_hdr.provider_out_words) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, + (void __user *) (unsigned long) ex_hdr.response, + (hdr.out_words + ex_hdr.provider_out_words) * 8)) + return -EFAULT; + } else { + if (hdr.out_words || ex_hdr.provider_out_words) + return -EINVAL; + } + + INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response, + hdr.in_words * 8, hdr.out_words * 8); + + INIT_UDATA_BUF_OR_NULL(&uhw, + buf + ucore.inlen, + (unsigned long) ex_hdr.response + ucore.outlen, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); + + err = uverbs_ex_cmd_table[command](file, + &ucore, + &uhw); + + if (err) + return err; + + return written_count; + } - return uverbs_cmd_table[hdr.command](file, buf + sizeof hdr, - hdr.in_words * 4, hdr.out_words * 4); + return -ENOSYS; } static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 30f199e8579..c2b89cc5dbc 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -44,8 +44,11 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> +#include <rdma/ib_addr.h> -int ib_rate_to_mult(enum ib_rate rate) +#include "core_priv.h" + +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 1; @@ -62,7 +65,7 @@ int ib_rate_to_mult(enum ib_rate rate) } EXPORT_SYMBOL(ib_rate_to_mult); -enum ib_rate mult_to_ib_rate(int mult) +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult) { switch (mult) { case 1: return IB_RATE_2_5_GBPS; @@ -79,7 +82,7 @@ enum ib_rate mult_to_ib_rate(int mult) } EXPORT_SYMBOL(mult_to_ib_rate); -int ib_rate_to_mbps(enum ib_rate rate) +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 2500; @@ -104,7 +107,7 @@ int ib_rate_to_mbps(enum ib_rate rate) } EXPORT_SYMBOL(ib_rate_to_mbps); -enum rdma_transport_type +__attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) { switch (node_type) { @@ -114,6 +117,10 @@ rdma_node_get_transport(enum rdma_node_type node_type) return RDMA_TRANSPORT_IB; case RDMA_NODE_RNIC: return RDMA_TRANSPORT_IWARP; + case RDMA_NODE_USNIC: + return RDMA_TRANSPORT_USNIC; + case RDMA_NODE_USNIC_UDP: + return RDMA_TRANSPORT_USNIC_UDP; default: BUG(); return 0; @@ -130,6 +137,8 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_ case RDMA_TRANSPORT_IB: return IB_LINK_LAYER_INFINIBAND; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_USNIC: + case RDMA_TRANSPORT_USNIC_UDP: return IB_LINK_LAYER_ETHERNET; default: return IB_LINK_LAYER_UNSPECIFIED; @@ -189,8 +198,28 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, u32 flow_class; u16 gid_index; int ret; + int is_eth = (rdma_port_get_link_layer(device, port_num) == + IB_LINK_LAYER_ETHERNET); memset(ah_attr, 0, sizeof *ah_attr); + if (is_eth) { + if (!(wc->wc_flags & IB_WC_GRH)) + return -EPROTOTYPE; + + if (wc->wc_flags & IB_WC_WITH_SMAC && + wc->wc_flags & IB_WC_WITH_VLAN) { + memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); + ah_attr->vlan_id = wc->vlan_id; + } else { + ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid, + ah_attr->dmac, &ah_attr->vlan_id); + if (ret) + return ret; + } + } else { + ah_attr->vlan_id = 0xffff; + } + ah_attr->dlid = wc->slid; ah_attr->sl = wc->sl; ah_attr->src_path_bits = wc->dlid_path_bits; @@ -346,9 +375,13 @@ EXPORT_SYMBOL(ib_destroy_srq); static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) { struct ib_qp *qp = context; + unsigned long flags; + spin_lock_irqsave(&qp->device->event_handler_lock, flags); list_for_each_entry(event->element.qp, &qp->open_list, open_list) - event->element.qp->event_handler(event, event->element.qp->qp_context); + if (event->element.qp->event_handler) + event->element.qp->event_handler(event, event->element.qp->qp_context); + spin_unlock_irqrestore(&qp->device->event_handler_lock, flags); } static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) @@ -469,7 +502,9 @@ EXPORT_SYMBOL(ib_create_qp); static const struct { int valid; enum ib_qp_attr_mask req_param[IB_QPT_MAX]; + enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX]; enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; + enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX]; } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, @@ -550,6 +585,12 @@ static const struct { IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), }, + .req_param_add_eth = { + [IB_QPT_RC] = (IB_QP_SMAC), + [IB_QPT_UC] = (IB_QP_SMAC), + [IB_QPT_XRC_INI] = (IB_QP_SMAC), + [IB_QPT_XRC_TGT] = (IB_QP_SMAC) + }, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), @@ -569,7 +610,21 @@ static const struct { IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), - } + }, + .opt_param_add_eth = { + [IB_QPT_RC] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_UC] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID) + } } }, [IB_QPS_RTR] = { @@ -772,7 +827,8 @@ static const struct { }; int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask) + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll) { enum ib_qp_attr_mask req_param, opt_param; @@ -791,6 +847,13 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, req_param = qp_state_table[cur_state][next_state].req_param[type]; opt_param = qp_state_table[cur_state][next_state].opt_param[type]; + if (ll == IB_LINK_LAYER_ETHERNET) { + req_param |= qp_state_table[cur_state][next_state]. + req_param_add_eth[type]; + opt_param |= qp_state_table[cur_state][next_state]. + opt_param_add_eth[type]; + } + if ((mask & req_param) != req_param) return 0; @@ -801,10 +864,51 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, } EXPORT_SYMBOL(ib_modify_qp_is_ok); +int ib_resolve_eth_l2_attrs(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask) +{ + int ret = 0; + union ib_gid sgid; + + if ((*qp_attr_mask & IB_QP_AV) && + (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == IB_LINK_LAYER_ETHERNET)) { + ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num, + qp_attr->ah_attr.grh.sgid_index, &sgid); + if (ret) + goto out; + if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) { + rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac); + rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac); + qp_attr->vlan_id = rdma_get_vlan_id(&sgid); + } else { + ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid, + qp_attr->ah_attr.dmac, &qp_attr->vlan_id); + if (ret) + goto out; + ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr->smac, NULL); + if (ret) + goto out; + } + *qp_attr_mask |= IB_QP_SMAC; + if (qp_attr->vlan_id < 0xFFFF) + *qp_attr_mask |= IB_QP_VID; + } +out: + return ret; +} +EXPORT_SYMBOL(ib_resolve_eth_l2_attrs); + + int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) { + int ret; + + ret = ib_resolve_eth_l2_attrs(qp, qp_attr, &qp_attr_mask); + if (ret) + return ret; + return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); @@ -954,6 +1058,11 @@ EXPORT_SYMBOL(ib_resize_cq); struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); mr = pd->device->get_dma_mr(pd, mr_access_flags); @@ -976,6 +1085,11 @@ struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, u64 *iova_start) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); if (!pd->device->reg_phys_mr) return ERR_PTR(-ENOSYS); @@ -1006,6 +1120,10 @@ int ib_rereg_phys_mr(struct ib_mr *mr, struct ib_pd *old_pd; int ret; + ret = ib_check_mr_access(mr_access_flags); + if (ret) + return ret; + if (!mr->device->rereg_phys_mr) return -ENOSYS; @@ -1051,6 +1169,45 @@ int ib_dereg_mr(struct ib_mr *mr) } EXPORT_SYMBOL(ib_dereg_mr); +struct ib_mr *ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr) +{ + struct ib_mr *mr; + + if (!pd->device->create_mr) + return ERR_PTR(-ENOSYS); + + mr = pd->device->create_mr(pd, mr_init_attr); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_set(&mr->usecnt, 0); + } + + return mr; +} +EXPORT_SYMBOL(ib_create_mr); + +int ib_destroy_mr(struct ib_mr *mr) +{ + struct ib_pd *pd; + int ret; + + if (atomic_read(&mr->usecnt)) + return -EBUSY; + + pd = mr->pd; + ret = mr->device->destroy_mr(mr); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_destroy_mr); + struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) { struct ib_mr *mr; @@ -1099,18 +1256,19 @@ EXPORT_SYMBOL(ib_free_fast_reg_page_list); /* Memory windows */ -struct ib_mw *ib_alloc_mw(struct ib_pd *pd) +struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) { struct ib_mw *mw; if (!pd->device->alloc_mw) return ERR_PTR(-ENOSYS); - mw = pd->device->alloc_mw(pd); + mw = pd->device->alloc_mw(pd, type); if (!IS_ERR(mw)) { mw->device = pd->device; mw->pd = pd; mw->uobject = NULL; + mw->type = type; atomic_inc(&pd->usecnt); } @@ -1252,3 +1410,38 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd) return xrcd->device->dealloc_xrcd(xrcd); } EXPORT_SYMBOL(ib_dealloc_xrcd); + +struct ib_flow *ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + struct ib_flow *flow_id; + if (!qp->device->create_flow) + return ERR_PTR(-ENOSYS); + + flow_id = qp->device->create_flow(qp, flow_attr, domain); + if (!IS_ERR(flow_id)) + atomic_inc(&qp->usecnt); + return flow_id; +} +EXPORT_SYMBOL(ib_create_flow); + +int ib_destroy_flow(struct ib_flow *flow_id) +{ + int err; + struct ib_qp *qp = flow_id->qp; + + err = qp->device->destroy_flow(flow_id); + if (!err) + atomic_dec(&qp->usecnt); + return err; +} +EXPORT_SYMBOL(ib_destroy_flow); + +int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + return mr->device->check_mr_status ? + mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; +} +EXPORT_SYMBOL(ib_check_mr_status); diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile new file mode 100644 index 00000000000..e900b03531a --- /dev/null +++ b/drivers/infiniband/hw/Makefile @@ -0,0 +1,12 @@ +obj-$(CONFIG_INFINIBAND_MTHCA) += mthca/ +obj-$(CONFIG_INFINIBAND_IPATH) += ipath/ +obj-$(CONFIG_INFINIBAND_QIB) += qib/ +obj-$(CONFIG_INFINIBAND_EHCA) += ehca/ +obj-$(CONFIG_INFINIBAND_AMSO1100) += amso1100/ +obj-$(CONFIG_INFINIBAND_CXGB3) += cxgb3/ +obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ +obj-$(CONFIG_INFINIBAND_NES) += nes/ +obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ +obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c index 5ce7b9e8bff..00400c352c1 100644 --- a/drivers/infiniband/hw/amso1100/c2.c +++ b/drivers/infiniband/hw/amso1100/c2.c @@ -920,8 +920,7 @@ static struct net_device *c2_devinit(struct c2_dev *c2dev, return netdev; } -static int __devinit c2_probe(struct pci_dev *pcidev, - const struct pci_device_id *ent) +static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) { int ret = 0, i; unsigned long reg0_start, reg0_flags, reg0_len; @@ -1083,6 +1082,7 @@ static int __devinit c2_probe(struct pci_dev *pcidev, /* Initialize network device */ if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) { + ret = -ENOMEM; iounmap(mmio_regs); goto bail4; } @@ -1152,7 +1152,8 @@ static int __devinit c2_probe(struct pci_dev *pcidev, goto bail10; } - if (c2_register_device(c2dev)) + ret = c2_register_device(c2dev); + if (ret) goto bail10; return 0; @@ -1191,7 +1192,7 @@ static int __devinit c2_probe(struct pci_dev *pcidev, return ret; } -static void __devexit c2_remove(struct pci_dev *pcidev) +static void c2_remove(struct pci_dev *pcidev) { struct c2_dev *c2dev = pci_get_drvdata(pcidev); struct net_device *netdev = c2dev->netdev; @@ -1236,18 +1237,7 @@ static struct pci_driver c2_pci_driver = { .name = DRV_NAME, .id_table = c2_pci_table, .probe = c2_probe, - .remove = __devexit_p(c2_remove), + .remove = c2_remove, }; -static int __init c2_init_module(void) -{ - return pci_register_driver(&c2_pci_driver); -} - -static void __exit c2_exit_module(void) -{ - pci_unregister_driver(&c2_pci_driver); -} - -module_init(c2_init_module); -module_exit(c2_exit_module); +module_pci_driver(c2_pci_driver); diff --git a/drivers/infiniband/hw/amso1100/c2.h b/drivers/infiniband/hw/amso1100/c2.h index 6ae698e6877..d619d735838 100644 --- a/drivers/infiniband/hw/amso1100/c2.h +++ b/drivers/infiniband/hw/amso1100/c2.h @@ -265,7 +265,6 @@ struct c2_pd_table { struct c2_qp_table { struct idr idr; spinlock_t lock; - int last; }; struct c2_element { @@ -498,16 +497,16 @@ extern int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, struct ib_send_wr **bad_wr); extern int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, struct ib_recv_wr **bad_wr); -extern void __devinit c2_init_qp_table(struct c2_dev *c2dev); -extern void __devexit c2_cleanup_qp_table(struct c2_dev *c2dev); +extern void c2_init_qp_table(struct c2_dev *c2dev); +extern void c2_cleanup_qp_table(struct c2_dev *c2dev); extern void c2_set_qp_state(struct c2_qp *, int); extern struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn); /* PDs */ extern int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd); extern void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd); -extern int __devinit c2_init_pd_table(struct c2_dev *c2dev); -extern void __devexit c2_cleanup_pd_table(struct c2_dev *c2dev); +extern int c2_init_pd_table(struct c2_dev *c2dev); +extern void c2_cleanup_pd_table(struct c2_dev *c2dev); /* CQs */ extern int c2_init_cq(struct c2_dev *c2dev, int entries, diff --git a/drivers/infiniband/hw/amso1100/c2_ae.c b/drivers/infiniband/hw/amso1100/c2_ae.c index 32d34e88d5c..cedda25232b 100644 --- a/drivers/infiniband/hw/amso1100/c2_ae.c +++ b/drivers/infiniband/hw/amso1100/c2_ae.c @@ -141,7 +141,7 @@ static const char *to_qp_state_str(int state) return "C2_QP_STATE_ERROR"; default: return "<invalid QP state>"; - }; + } } void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) @@ -155,6 +155,8 @@ void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) enum c2_event_id event_id; unsigned long flags; int status; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr; /* * retrieve the message @@ -206,10 +208,10 @@ void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) case CCAE_ACTIVE_CONNECT_RESULTS: res = &wr->ae.ae_active_connect_results; cm_event.event = IW_CM_EVENT_CONNECT_REPLY; - cm_event.local_addr.sin_addr.s_addr = res->laddr; - cm_event.remote_addr.sin_addr.s_addr = res->raddr; - cm_event.local_addr.sin_port = res->lport; - cm_event.remote_addr.sin_port = res->rport; + laddr->sin_addr.s_addr = res->laddr; + raddr->sin_addr.s_addr = res->raddr; + laddr->sin_port = res->lport; + raddr->sin_port = res->rport; if (status == 0) { cm_event.private_data_len = be32_to_cpu(res->private_data_length); @@ -281,10 +283,10 @@ void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) } cm_event.event = IW_CM_EVENT_CONNECT_REQUEST; cm_event.provider_data = (void*)(unsigned long)req->cr_handle; - cm_event.local_addr.sin_addr.s_addr = req->laddr; - cm_event.remote_addr.sin_addr.s_addr = req->raddr; - cm_event.local_addr.sin_port = req->lport; - cm_event.remote_addr.sin_port = req->rport; + laddr->sin_addr.s_addr = req->laddr; + raddr->sin_addr.s_addr = req->raddr; + laddr->sin_port = req->lport; + raddr->sin_port = req->rport; cm_event.private_data_len = be32_to_cpu(req->private_data_length); cm_event.private_data = req->private_data; @@ -311,6 +313,7 @@ void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) if (cq->ibcq.event_handler) cq->ibcq.event_handler(&ib_event, cq->ibcq.cq_context); + break; } default: diff --git a/drivers/infiniband/hw/amso1100/c2_cm.c b/drivers/infiniband/hw/amso1100/c2_cm.c index 95f58ab1e0b..23bfa94fbd4 100644 --- a/drivers/infiniband/hw/amso1100/c2_cm.c +++ b/drivers/infiniband/hw/amso1100/c2_cm.c @@ -46,6 +46,10 @@ int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) struct c2wr_qp_connect_req *wr; /* variable size needs a malloc. */ struct c2_vq_req *vq_req; int err; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + if (cm_id->remote_addr.ss_family != AF_INET) + return -ENOSYS; ibqp = c2_get_qp(cm_id->device, iw_param->qpn); if (!ibqp) @@ -91,8 +95,8 @@ int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) wr->rnic_handle = c2dev->adapter_handle; wr->qp_handle = qp->adapter_handle; - wr->remote_addr = cm_id->remote_addr.sin_addr.s_addr; - wr->remote_port = cm_id->remote_addr.sin_port; + wr->remote_addr = raddr->sin_addr.s_addr; + wr->remote_port = raddr->sin_port; /* * Move any private data from the callers's buf into @@ -135,6 +139,10 @@ int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog) struct c2wr_ep_listen_create_rep *reply; struct c2_vq_req *vq_req; int err; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + + if (cm_id->local_addr.ss_family != AF_INET) + return -ENOSYS; c2dev = to_c2dev(cm_id->device); if (c2dev == NULL) @@ -153,8 +161,8 @@ int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog) c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE); wr.hdr.context = (u64) (unsigned long) vq_req; wr.rnic_handle = c2dev->adapter_handle; - wr.local_addr = cm_id->local_addr.sin_addr.s_addr; - wr.local_port = cm_id->local_addr.sin_port; + wr.local_addr = laddr->sin_addr.s_addr; + wr.local_port = laddr->sin_port; wr.backlog = cpu_to_be32(backlog); wr.user_context = (u64) (unsigned long) cm_id; diff --git a/drivers/infiniband/hw/amso1100/c2_intr.c b/drivers/infiniband/hw/amso1100/c2_intr.c index 8951db4ae29..3a17d9b36db 100644 --- a/drivers/infiniband/hw/amso1100/c2_intr.c +++ b/drivers/infiniband/hw/amso1100/c2_intr.c @@ -169,7 +169,8 @@ static void handle_vq(struct c2_dev *c2dev, u32 mq_index) * We should never get here, as the adapter should * never send us a reply that we're not expecting. */ - vq_repbuf_free(c2dev, host_msg); + if (reply_msg != NULL) + vq_repbuf_free(c2dev, host_msg); pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n"); return; } diff --git a/drivers/infiniband/hw/amso1100/c2_pd.c b/drivers/infiniband/hw/amso1100/c2_pd.c index 161f2a28535..f3e81dc357b 100644 --- a/drivers/infiniband/hw/amso1100/c2_pd.c +++ b/drivers/infiniband/hw/amso1100/c2_pd.c @@ -70,7 +70,7 @@ void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd) spin_unlock(&c2dev->pd_table.lock); } -int __devinit c2_init_pd_table(struct c2_dev *c2dev) +int c2_init_pd_table(struct c2_dev *c2dev) { c2dev->pd_table.last = 0; @@ -84,7 +84,7 @@ int __devinit c2_init_pd_table(struct c2_dev *c2dev) return 0; } -void __devexit c2_cleanup_pd_table(struct c2_dev *c2dev) +void c2_cleanup_pd_table(struct c2_dev *c2dev) { kfree(c2dev->pd_table.table); } diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index 07eb3a8067d..8af33cf1fc4 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -431,9 +431,9 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 *pages; u64 kva = 0; int shift, n, len; - int i, j, k; + int i, k, entry; int err = 0; - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct c2_pd *c2pd = to_c2pd(pd); struct c2_mr *c2mr; @@ -452,10 +452,7 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } shift = ffs(c2mr->umem->page_size) - 1; - - n = 0; - list_for_each_entry(chunk, &c2mr->umem->chunk_list, list) - n += chunk->nents; + n = c2mr->umem->nmap; pages = kmalloc(n * sizeof(u64), GFP_KERNEL); if (!pages) { @@ -464,14 +461,12 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } i = 0; - list_for_each_entry(chunk, &c2mr->umem->chunk_list, list) { - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = - sg_dma_address(&chunk->page_list[j]) + - (c2mr->umem->page_size * k); - } + for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = + sg_dma_address(sg) + + (c2mr->umem->page_size * k); } } diff --git a/drivers/infiniband/hw/amso1100/c2_qp.c b/drivers/infiniband/hw/amso1100/c2_qp.c index 0d7b6f23caf..86708dee58b 100644 --- a/drivers/infiniband/hw/amso1100/c2_qp.c +++ b/drivers/infiniband/hw/amso1100/c2_qp.c @@ -382,14 +382,16 @@ static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp) { int ret; - do { - spin_lock_irq(&c2dev->qp_table.lock); - ret = idr_get_new_above(&c2dev->qp_table.idr, qp, - c2dev->qp_table.last++, &qp->qpn); - spin_unlock_irq(&c2dev->qp_table.lock); - } while ((ret == -EAGAIN) && - idr_pre_get(&c2dev->qp_table.idr, GFP_KERNEL)); - return ret; + idr_preload(GFP_KERNEL); + spin_lock_irq(&c2dev->qp_table.lock); + + ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT); + if (ret >= 0) + qp->qpn = ret; + + spin_unlock_irq(&c2dev->qp_table.lock); + idr_preload_end(); + return ret < 0 ? ret : 0; } static void c2_free_qpn(struct c2_dev *c2dev, int qpn) @@ -1010,13 +1012,13 @@ out: return err; } -void __devinit c2_init_qp_table(struct c2_dev *c2dev) +void c2_init_qp_table(struct c2_dev *c2dev) { spin_lock_init(&c2dev->qp_table.lock); idr_init(&c2dev->qp_table.idr); } -void __devexit c2_cleanup_qp_table(struct c2_dev *c2dev) +void c2_cleanup_qp_table(struct c2_dev *c2dev) { idr_destroy(&c2dev->qp_table.idr); } diff --git a/drivers/infiniband/hw/amso1100/c2_rnic.c b/drivers/infiniband/hw/amso1100/c2_rnic.c index e4a73158fc7..d2a6d961344 100644 --- a/drivers/infiniband/hw/amso1100/c2_rnic.c +++ b/drivers/infiniband/hw/amso1100/c2_rnic.c @@ -442,7 +442,7 @@ static int c2_rnic_close(struct c2_dev *c2dev) * involves initializing the various limits and resource pools that * comprise the RNIC instance. */ -int __devinit c2_rnic_init(struct c2_dev *c2dev) +int c2_rnic_init(struct c2_dev *c2dev) { int err; u32 qsize, msgsize; @@ -576,7 +576,8 @@ int __devinit c2_rnic_init(struct c2_dev *c2dev) goto bail4; /* Initialize cached the adapter limits */ - if (c2_rnic_query(c2dev, &c2dev->props)) + err = c2_rnic_query(c2dev, &c2dev->props); + if (err) goto bail5; /* Initialize the PD pool */ @@ -611,7 +612,7 @@ int __devinit c2_rnic_init(struct c2_dev *c2dev) /* * Called by c2_remove to cleanup the RNIC resources. */ -void __devexit c2_rnic_term(struct c2_dev *c2dev) +void c2_rnic_term(struct c2_dev *c2dev) { /* Close the open adapter instance */ diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index c3f5aca4ef0..de1c61b417d 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -735,14 +735,12 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, ((perm & TPT_MW_BIND) ? F_TPT_MW_BIND_ENABLE : 0) | V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) | V_TPT_PAGE_SIZE(page_size)); - tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : - cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); + tpt.rsvd_pbl_addr = cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); tpt.len = cpu_to_be32(len); tpt.va_hi = cpu_to_be32((u32) (to >> 32)); tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL)); tpt.rsvd_bind_cnt_or_pstag = 0; - tpt.rsvd_pbl_size = reset_tpt_entry ? 0 : - cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); + tpt.rsvd_pbl_size = cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); } err = cxio_hal_ctrl_qp_write_mem(rdev_p, stag_idx + diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c index 31f9201b298..c40088ecf9f 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_resource.c +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -62,13 +62,13 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo, kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32)); if (random) { j = 0; - random_bytes = random32(); + random_bytes = prandom_u32(); for (i = 0; i < RANDOM_SIZE; i++) rarray[i] = i + skip_low; for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) { if (j >= RANDOM_SIZE) { j = 0; - random_bytes = random32(); + random_bytes = prandom_u32(); } idx = (random_bytes >> (j * 2)) & 0xF; kfifo_in(fifo, diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h index a1c44578e03..837862287a2 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.h +++ b/drivers/infiniband/hw/cxgb3/iwch.h @@ -153,19 +153,17 @@ static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr, void *handle, u32 id) { int ret; - int newid; - - do { - if (!idr_pre_get(idr, GFP_KERNEL)) { - return -ENOMEM; - } - spin_lock_irq(&rhp->lock); - ret = idr_get_new_above(idr, handle, id, &newid); - BUG_ON(newid != id); - spin_unlock_irq(&rhp->lock); - } while (ret == -EAGAIN); - - return ret; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&rhp->lock); + + ret = idr_alloc(idr, handle, id, id + 1, GFP_NOWAIT); + + spin_unlock_irq(&rhp->lock); + idr_preload_end(); + + BUG_ON(ret == -ENOSPC); + return ret < 0 ? ret : 0; } static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id) diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index aaf88ef9409..cb78b1e9bcd 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -128,9 +128,8 @@ static void stop_ep_timer(struct iwch_ep *ep) { PDBG("%s ep %p\n", __func__, ep); if (!timer_pending(&ep->timer)) { - printk(KERN_ERR "%s timer stopped when its not running! ep %p state %u\n", + WARN(1, "%s timer stopped when its not running! ep %p state %u\n", __func__, ep, ep->com.state); - WARN_ON(1); return; } del_timer_sync(&ep->timer); @@ -419,6 +418,7 @@ static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) skb->priority = CPL_PRIORITY_DATA; set_arp_failure_handler(skb, abort_arp_failure); req = (struct cpl_abort_req *) skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); @@ -722,8 +722,10 @@ static void connect_reply_upcall(struct iwch_ep *ep, int status) memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REPLY; event.status = status; - event.local_addr = ep->com.local_addr; - event.remote_addr = ep->com.remote_addr; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); if ((status == 0) || (status == -ECONNREFUSED)) { event.private_data_len = ep->plen; @@ -748,8 +750,10 @@ static void connect_request_upcall(struct iwch_ep *ep) PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REQUEST; - event.local_addr = ep->com.local_addr; - event.remote_addr = ep->com.remote_addr; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.local_addr)); event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); event.provider_data = ep; @@ -1756,9 +1760,8 @@ static void ep_timeout(unsigned long arg) __state_set(&ep->com, ABORTING); break; default: - printk(KERN_ERR "%s unexpected state ep %p state %u\n", + WARN(1, "%s unexpected state ep %p state %u\n", __func__, ep, ep->com.state); - WARN_ON(1); abort = 0; } spin_unlock_irqrestore(&ep->com.lock, flags); @@ -1874,8 +1877,9 @@ err: static int is_loopback_dst(struct iw_cm_id *cm_id) { struct net_device *dev; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; - dev = ip_dev_find(&init_net, cm_id->remote_addr.sin_addr.s_addr); + dev = ip_dev_find(&init_net, raddr->sin_addr.s_addr); if (!dev) return 0; dev_put(dev); @@ -1888,6 +1892,13 @@ int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct iwch_ep *ep; struct rtable *rt; int err = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + if (cm_id->remote_addr.ss_family != PF_INET) { + err = -ENOSYS; + goto out; + } if (is_loopback_dst(cm_id)) { err = -ENOSYS; @@ -1931,11 +1942,9 @@ int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) } /* find a route */ - rt = find_route(h->rdev.t3cdev_p, - cm_id->local_addr.sin_addr.s_addr, - cm_id->remote_addr.sin_addr.s_addr, - cm_id->local_addr.sin_port, - cm_id->remote_addr.sin_port, IPTOS_LOWDELAY); + rt = find_route(h->rdev.t3cdev_p, laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, laddr->sin_port, + raddr->sin_port, IPTOS_LOWDELAY); if (!rt) { printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); err = -EHOSTUNREACH; @@ -1943,7 +1952,7 @@ int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) } ep->dst = &rt->dst; ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst, NULL, - &cm_id->remote_addr.sin_addr.s_addr); + &raddr->sin_addr.s_addr); if (!ep->l2t) { printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); err = -ENOMEM; @@ -1952,8 +1961,10 @@ int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) state_set(&ep->com, CONNECTING); ep->tos = IPTOS_LOWDELAY; - ep->com.local_addr = cm_id->local_addr; - ep->com.remote_addr = cm_id->remote_addr; + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + memcpy(&ep->com.remote_addr, &cm_id->remote_addr, + sizeof(ep->com.remote_addr)); /* send connect request to rnic */ err = send_connect(ep); @@ -1981,6 +1992,11 @@ int iwch_create_listen(struct iw_cm_id *cm_id, int backlog) might_sleep(); + if (cm_id->local_addr.ss_family != PF_INET) { + err = -ENOSYS; + goto fail1; + } + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); if (!ep) { printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__); @@ -1992,7 +2008,8 @@ int iwch_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; ep->backlog = backlog; - ep->com.local_addr = cm_id->local_addr; + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); /* * Allocate a server TID. diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 0bdf09aa6f4..811b24a539c 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -226,6 +226,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries, int ve mm->len = PAGE_ALIGN(((1UL << uresp.size_log2) + 1) * sizeof(struct t3_cqe)); uresp.memsize = mm->len; + uresp.reserved = 0; resplen = sizeof uresp; } if (ib_copy_to_udata(udata, &uresp, resplen)) { @@ -559,7 +560,7 @@ static int iwch_reregister_phys_mem(struct ib_mr *mr, __be64 *page_list = NULL; int shift = 0; u64 total_size; - int npages; + int npages = 0; int ret; PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd); @@ -617,14 +618,13 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { __be64 *pages; int shift, n, len; - int i, j, k; + int i, k, entry; int err = 0; - struct ib_umem_chunk *chunk; struct iwch_dev *rhp; struct iwch_pd *php; struct iwch_mr *mhp; struct iwch_reg_user_mr_resp uresp; - + struct scatterlist *sg; PDBG("%s ib_pd %p\n", __func__, pd); php = to_iwch_pd(pd); @@ -644,9 +644,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, shift = ffs(mhp->umem->page_size) - 1; - n = 0; - list_for_each_entry(chunk, &mhp->umem->chunk_list, list) - n += chunk->nents; + n = mhp->umem->nmap; err = iwch_alloc_pbl(mhp, n); if (err) @@ -660,12 +658,10 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, i = n = 0; - list_for_each_entry(chunk, &mhp->umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> shift; + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; for (k = 0; k < len; ++k) { - pages[i++] = cpu_to_be64(sg_dma_address( - &chunk->page_list[j]) + + pages[i++] = cpu_to_be64(sg_dma_address(sg) + mhp->umem->page_size * k); if (i == PAGE_SIZE / sizeof *pages) { err = iwch_write_pbl(mhp, pages, i, n); @@ -675,7 +671,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, i = 0; } } - } + } if (i) err = iwch_write_pbl(mhp, pages, i, n); @@ -738,7 +734,7 @@ static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) return ibmr; } -static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd) +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) { struct iwch_dev *rhp; struct iwch_pd *php; @@ -747,6 +743,9 @@ static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd) u32 stag = 0; int ret; + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + php = to_iwch_pd(pd); rhp = php->rhp; mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); @@ -783,8 +782,8 @@ static int iwch_dealloc_mw(struct ib_mw *mw) mmid = (mw->rkey) >> 8; cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); remove_handle(rhp, &rhp->mmidr, mmid); - kfree(mhp); PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp); + kfree(mhp); return 0; } diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index 6de8463f453..b57c0befd96 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -567,18 +567,19 @@ int iwch_bind_mw(struct ib_qp *qp, if (mw_bind->send_flags & IB_SEND_SIGNALED) t3_wr_flags = T3_COMPLETION_FLAG; - sgl.addr = mw_bind->addr; - sgl.lkey = mw_bind->mr->lkey; - sgl.length = mw_bind->length; + sgl.addr = mw_bind->bind_info.addr; + sgl.lkey = mw_bind->bind_info.mr->lkey; + sgl.length = mw_bind->bind_info.length; wqe->bind.reserved = 0; wqe->bind.type = TPT_VATO; /* TBD: check perms */ - wqe->bind.perms = iwch_ib_to_tpt_bind_access(mw_bind->mw_access_flags); - wqe->bind.mr_stag = cpu_to_be32(mw_bind->mr->lkey); + wqe->bind.perms = iwch_ib_to_tpt_bind_access( + mw_bind->bind_info.mw_access_flags); + wqe->bind.mr_stag = cpu_to_be32(mw_bind->bind_info.mr->lkey); wqe->bind.mw_stag = cpu_to_be32(mw->rkey); - wqe->bind.mw_len = cpu_to_be32(mw_bind->length); - wqe->bind.mw_va = cpu_to_be64(mw_bind->addr); + wqe->bind.mw_len = cpu_to_be32(mw_bind->bind_info.length); + wqe->bind.mw_va = cpu_to_be64(mw_bind->bind_info.addr); err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size); if (err) { spin_unlock_irqrestore(&qhp->lock, flag); @@ -882,7 +883,8 @@ u16 iwch_rqes_posted(struct iwch_qp *qhp) { union t3_wr *wqe = qhp->wq.queue; u16 count = 0; - while ((count+1) != 0 && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) { + + while (count < USHRT_MAX && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) { count++; wqe++; } diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig index 6b7e6c54353..23f38cf2c5c 100644 --- a/drivers/infiniband/hw/cxgb4/Kconfig +++ b/drivers/infiniband/hw/cxgb4/Kconfig @@ -1,10 +1,10 @@ config INFINIBAND_CXGB4 - tristate "Chelsio T4 RDMA Driver" - depends on CHELSIO_T4 && INET + tristate "Chelsio T4/T5 RDMA Driver" + depends on CHELSIO_T4 && INET && (IPV6 || IPV6=n) select GENERIC_ALLOCATOR ---help--- - This is an iWARP/RDMA driver for the Chelsio T4 1GbE and - 10GbE adapters. + This is an iWARP/RDMA driver for the Chelsio T4 and T5 + 1GbE, 10GbE adapters and T5 40GbE adapter. For general information about Chelsio and our products, visit our website at <http://www.chelsio.com>. diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 6cfd4d8fd0b..768a0fb67dd 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * Copyright (c) 2009-2014 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -38,10 +38,16 @@ #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/tcp.h> +#include <linux/if_vlan.h> #include <net/neighbour.h> #include <net/netevent.h> #include <net/route.h> +#include <net/tcp.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> + +#include <rdma/ib_addr.h> #include "iw_cxgb4.h" @@ -61,6 +67,14 @@ static char *states[] = { NULL, }; +static int nocong; +module_param(nocong, int, 0644); +MODULE_PARM_DESC(nocong, "Turn of congestion control (default=0)"); + +static int enable_ecn; +module_param(enable_ecn, int, 0644); +MODULE_PARM_DESC(enable_ecn, "Enable ECN (default=0/disabled)"); + static int dack_mode = 1; module_param(dack_mode, int, 0644); MODULE_PARM_DESC(dack_mode, "Delayed ack mode (default=1)"); @@ -86,9 +100,9 @@ int c4iw_debug; module_param(c4iw_debug, int, 0644); MODULE_PARM_DESC(c4iw_debug, "Enable debug logging (default=0)"); -static int peer2peer; +static int peer2peer = 1; module_param(peer2peer, int, 0644); -MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)"); +MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=1)"); static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ; module_param(p2p_type, int, 0644); @@ -133,31 +147,43 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status); static LIST_HEAD(timeout_list); static spinlock_t timeout_lock; +static void deref_qp(struct c4iw_ep *ep) +{ + c4iw_qp_rem_ref(&ep->com.qp->ibqp); + clear_bit(QP_REFERENCED, &ep->com.flags); +} + +static void ref_qp(struct c4iw_ep *ep) +{ + set_bit(QP_REFERENCED, &ep->com.flags); + c4iw_qp_add_ref(&ep->com.qp->ibqp); +} + static void start_ep_timer(struct c4iw_ep *ep) { PDBG("%s ep %p\n", __func__, ep); if (timer_pending(&ep->timer)) { - PDBG("%s stopped / restarted timer ep %p\n", __func__, ep); - del_timer_sync(&ep->timer); - } else - c4iw_get_ep(&ep->com); + pr_err("%s timer already started! ep %p\n", + __func__, ep); + return; + } + clear_bit(TIMEOUT, &ep->com.flags); + c4iw_get_ep(&ep->com); ep->timer.expires = jiffies + ep_timeout_secs * HZ; ep->timer.data = (unsigned long)ep; ep->timer.function = ep_timeout; add_timer(&ep->timer); } -static void stop_ep_timer(struct c4iw_ep *ep) +static int stop_ep_timer(struct c4iw_ep *ep) { - PDBG("%s ep %p\n", __func__, ep); - if (!timer_pending(&ep->timer)) { - printk(KERN_ERR "%s timer stopped when its not running! " - "ep %p state %u\n", __func__, ep, ep->com.state); - WARN_ON(1); - return; - } + PDBG("%s ep %p stopping\n", __func__, ep); del_timer_sync(&ep->timer); - c4iw_put_ep(&ep->com); + if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { + c4iw_put_ep(&ep->com); + return 0; + } + return 1; } static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb, @@ -208,12 +234,16 @@ static void release_tid(struct c4iw_rdev *rdev, u32 hwtid, struct sk_buff *skb) static void set_emss(struct c4iw_ep *ep, u16 opt) { - ep->emss = ep->com.dev->rdev.lldi.mtus[GET_TCPOPT_MSS(opt)] - 40; + ep->emss = ep->com.dev->rdev.lldi.mtus[GET_TCPOPT_MSS(opt)] - + sizeof(struct iphdr) - sizeof(struct tcphdr); ep->mss = ep->emss; if (GET_TCPOPT_TSTAMP(opt)) ep->emss -= 12; if (ep->emss < 128) ep->emss = 128; + if (ep->emss & 7) + PDBG("Warning: misaligned mtu idx %u mss %u emss=%u\n", + GET_TCPOPT_MSS(opt), ep->mss, ep->emss); PDBG("%s mss_idx %u mss %u emss=%u\n", __func__, GET_TCPOPT_MSS(opt), ep->mss, ep->emss); } @@ -262,11 +292,20 @@ void _c4iw_free_ep(struct kref *kref) ep = container_of(kref, struct c4iw_ep, com.kref); PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]); + if (test_bit(QP_REFERENCED, &ep->com.flags)) + deref_qp(ep); if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) { + remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid); cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid); dst_release(ep->dst); cxgb4_l2t_release(ep->l2t); } + if (test_bit(RELEASE_MAPINFO, &ep->com.flags)) { + print_addr(&ep->com, __func__, "remove_mapinfo/mapping"); + iwpm_remove_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr); + iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); + } kfree(ep); } @@ -308,22 +347,78 @@ static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) } else { skb = alloc_skb(len, gfp); } + t4_set_arp_err_handler(skb, NULL, NULL); return skb; } -static struct rtable *find_route(struct c4iw_dev *dev, __be32 local_ip, +static struct net_device *get_real_dev(struct net_device *egress_dev) +{ + return rdma_vlan_dev_real_dev(egress_dev) ? : egress_dev; +} + +static int our_interface(struct c4iw_dev *dev, struct net_device *egress_dev) +{ + int i; + + egress_dev = get_real_dev(egress_dev); + for (i = 0; i < dev->rdev.lldi.nports; i++) + if (dev->rdev.lldi.ports[i] == egress_dev) + return 1; + return 0; +} + +static struct dst_entry *find_route6(struct c4iw_dev *dev, __u8 *local_ip, + __u8 *peer_ip, __be16 local_port, + __be16 peer_port, u8 tos, + __u32 sin6_scope_id) +{ + struct dst_entry *dst = NULL; + + if (IS_ENABLED(CONFIG_IPV6)) { + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + memcpy(&fl6.daddr, peer_ip, 16); + memcpy(&fl6.saddr, local_ip, 16); + if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = sin6_scope_id; + dst = ip6_route_output(&init_net, NULL, &fl6); + if (!dst) + goto out; + if (!our_interface(dev, ip6_dst_idev(dst)->dev) && + !(ip6_dst_idev(dst)->dev->flags & IFF_LOOPBACK)) { + dst_release(dst); + dst = NULL; + } + } + +out: + return dst; +} + +static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip, __be32 peer_ip, __be16 local_port, __be16 peer_port, u8 tos) { struct rtable *rt; struct flowi4 fl4; + struct neighbour *n; rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip, peer_port, local_port, IPPROTO_TCP, tos, 0); if (IS_ERR(rt)) return NULL; - return rt; + n = dst_neigh_lookup(&rt->dst, &peer_ip); + if (!n) + return NULL; + if (!our_interface(dev, n->dev) && + !(n->dev->flags & IFF_LOOPBACK)) { + dst_release(&rt->dst); + return NULL; + } + neigh_release(n); + return &rt->dst; } static void arp_failure_discard(void *handle, struct sk_buff *skb) @@ -337,8 +432,17 @@ static void arp_failure_discard(void *handle, struct sk_buff *skb) */ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb) { + struct c4iw_ep *ep = handle; + printk(KERN_ERR MOD "ARP failure duing connect\n"); kfree_skb(skb); + connect_reply_upcall(ep, -EHOSTUNREACH); + state_set(&ep->com, DEAD); + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_put_ep(&ep->com); } /* @@ -382,7 +486,7 @@ static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT; flowc->mnemval[5].val = cpu_to_be32(ep->rcv_seq); flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF; - flowc->mnemval[6].val = cpu_to_be32(snd_win); + flowc->mnemval[6].val = cpu_to_be32(ep->snd_win); flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[7].val = cpu_to_be32(ep->emss); /* Pad WR to 16 byte boundary */ @@ -442,15 +546,80 @@ static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } +/* + * c4iw_form_pm_msg - Form a port mapper message with mapping info + */ +static void c4iw_form_pm_msg(struct c4iw_ep *ep, + struct iwpm_sa_data *pm_msg) +{ + memcpy(&pm_msg->loc_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&pm_msg->rem_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); +} + +/* + * c4iw_form_reg_msg - Form a port mapper message with dev info + */ +static void c4iw_form_reg_msg(struct c4iw_dev *dev, + struct iwpm_dev_data *pm_msg) +{ + memcpy(pm_msg->dev_name, dev->ibdev.name, IWPM_DEVNAME_SIZE); + memcpy(pm_msg->if_name, dev->rdev.lldi.ports[0]->name, + IWPM_IFNAME_SIZE); +} + +static void c4iw_record_pm_msg(struct c4iw_ep *ep, + struct iwpm_sa_data *pm_msg) +{ + memcpy(&ep->com.mapped_local_addr, &pm_msg->mapped_loc_addr, + sizeof(ep->com.mapped_local_addr)); + memcpy(&ep->com.mapped_remote_addr, &pm_msg->mapped_rem_addr, + sizeof(ep->com.mapped_remote_addr)); +} + +static void best_mtu(const unsigned short *mtus, unsigned short mtu, + unsigned int *idx, int use_ts) +{ + unsigned short hdr_size = sizeof(struct iphdr) + + sizeof(struct tcphdr) + + (use_ts ? 12 : 0); + unsigned short data_size = mtu - hdr_size; + + cxgb4_best_aligned_mtu(mtus, hdr_size, data_size, 8, idx); +} + static int send_connect(struct c4iw_ep *ep) { struct cpl_act_open_req *req; + struct cpl_t5_act_open_req *t5_req; + struct cpl_act_open_req6 *req6; + struct cpl_t5_act_open_req6 *t5_req6; struct sk_buff *skb; u64 opt0; u32 opt2; unsigned int mtu_idx; int wscale; - int wrlen = roundup(sizeof *req, 16); + int wrlen; + int sizev4 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ? + sizeof(struct cpl_act_open_req) : + sizeof(struct cpl_t5_act_open_req); + int sizev6 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ? + sizeof(struct cpl_act_open_req6) : + sizeof(struct cpl_t5_act_open_req6); + struct sockaddr_in *la = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + struct sockaddr_in *ra = (struct sockaddr_in *) + &ep->com.mapped_remote_addr; + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *) + &ep->com.mapped_remote_addr; + int win; + + wrlen = (ep->com.remote_addr.ss_family == AF_INET) ? + roundup(sizev4, 16) : + roundup(sizev6, 16); PDBG("%s ep %p atid %u\n", __func__, ep, ep->atid); @@ -462,9 +631,20 @@ static int send_connect(struct c4iw_ep *ep) } set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); - cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx); + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps); wscale = compute_wscale(rcv_win); - opt0 = KEEP_ALIVE(1) | + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_MASK) + win = RCV_BUFSIZ_MASK; + + opt0 = (nocong ? NO_CONG(1) : 0) | + KEEP_ALIVE(1) | DELACK(1) | WND_SCALE(wscale) | MSS_IDX(mtu_idx) | @@ -473,8 +653,9 @@ static int send_connect(struct c4iw_ep *ep) SMAC_SEL(ep->smac_idx) | DSCP(ep->tos) | ULP_MODE(ULP_MODE_TCPDDP) | - RCV_BUFSIZ(rcv_win>>10); + RCV_BUFSIZ(win); opt2 = RX_CHANNEL(0) | + CCTRL_ECN(enable_ecn) | RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid); if (enable_tcp_timestamps) opt2 |= TSTAMPS_EN(1); @@ -482,19 +663,109 @@ static int send_connect(struct c4iw_ep *ep) opt2 |= SACK_EN(1); if (wscale && enable_tcp_window_scaling) opt2 |= WND_SCALE_EN(1); - t4_set_arp_err_handler(skb, NULL, act_open_req_arp_failure); - - req = (struct cpl_act_open_req *) skb_put(skb, wrlen); - INIT_TP_WR(req, 0); - OPCODE_TID(req) = cpu_to_be32( - MK_OPCODE_TID(CPL_ACT_OPEN_REQ, ((ep->rss_qid<<14)|ep->atid))); - req->local_port = ep->com.local_addr.sin_port; - req->peer_port = ep->com.remote_addr.sin_port; - req->local_ip = ep->com.local_addr.sin_addr.s_addr; - req->peer_ip = ep->com.remote_addr.sin_addr.s_addr; - req->opt0 = cpu_to_be64(opt0); - req->params = 0; - req->opt2 = cpu_to_be32(opt2); + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + opt2 |= T5_OPT_2_VALID; + opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); + } + t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure); + + if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) { + if (ep->com.remote_addr.ss_family == AF_INET) { + req = (struct cpl_act_open_req *) skb_put(skb, wrlen); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + ((ep->rss_qid << 14) | ep->atid))); + req->local_port = la->sin_port; + req->peer_port = ra->sin_port; + req->local_ip = la->sin_addr.s_addr; + req->peer_ip = ra->sin_addr.s_addr; + req->opt0 = cpu_to_be64(opt0); + req->params = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + req->opt2 = cpu_to_be32(opt2); + } else { + req6 = (struct cpl_act_open_req6 *)skb_put(skb, wrlen); + + INIT_TP_WR(req6, 0); + OPCODE_TID(req6) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + ((ep->rss_qid<<14)|ep->atid))); + req6->local_port = la6->sin6_port; + req6->peer_port = ra6->sin6_port; + req6->local_ip_hi = *((__be64 *) + (la6->sin6_addr.s6_addr)); + req6->local_ip_lo = *((__be64 *) + (la6->sin6_addr.s6_addr + 8)); + req6->peer_ip_hi = *((__be64 *) + (ra6->sin6_addr.s6_addr)); + req6->peer_ip_lo = *((__be64 *) + (ra6->sin6_addr.s6_addr + 8)); + req6->opt0 = cpu_to_be64(opt0); + req6->params = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + req6->opt2 = cpu_to_be32(opt2); + } + } else { + u32 isn = (prandom_u32() & ~7UL) - 1; + + opt2 |= T5_OPT_2_VALID; + opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */ + if (peer2peer) + isn += 4; + + if (ep->com.remote_addr.ss_family == AF_INET) { + t5_req = (struct cpl_t5_act_open_req *) + skb_put(skb, wrlen); + INIT_TP_WR(t5_req, 0); + OPCODE_TID(t5_req) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + ((ep->rss_qid << 14) | ep->atid))); + t5_req->local_port = la->sin_port; + t5_req->peer_port = ra->sin_port; + t5_req->local_ip = la->sin_addr.s_addr; + t5_req->peer_ip = ra->sin_addr.s_addr; + t5_req->opt0 = cpu_to_be64(opt0); + t5_req->params = cpu_to_be64(V_FILTER_TUPLE( + cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t))); + t5_req->rsvd = cpu_to_be32(isn); + PDBG("%s snd_isn %u\n", __func__, + be32_to_cpu(t5_req->rsvd)); + t5_req->opt2 = cpu_to_be32(opt2); + } else { + t5_req6 = (struct cpl_t5_act_open_req6 *) + skb_put(skb, wrlen); + INIT_TP_WR(t5_req6, 0); + OPCODE_TID(t5_req6) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + ((ep->rss_qid<<14)|ep->atid))); + t5_req6->local_port = la6->sin6_port; + t5_req6->peer_port = ra6->sin6_port; + t5_req6->local_ip_hi = *((__be64 *) + (la6->sin6_addr.s6_addr)); + t5_req6->local_ip_lo = *((__be64 *) + (la6->sin6_addr.s6_addr + 8)); + t5_req6->peer_ip_hi = *((__be64 *) + (ra6->sin6_addr.s6_addr)); + t5_req6->peer_ip_lo = *((__be64 *) + (ra6->sin6_addr.s6_addr + 8)); + t5_req6->opt0 = cpu_to_be64(opt0); + t5_req6->params = (__force __be64)cpu_to_be32( + cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + t5_req6->rsvd = cpu_to_be32(isn); + PDBG("%s snd_isn %u\n", __func__, + be32_to_cpu(t5_req6->rsvd)); + t5_req6->opt2 = cpu_to_be32(opt2); + } + } + + set_bit(ACT_OPEN_REQ, &ep->com.history); return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } @@ -585,8 +856,9 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, ep->mpa_skb = skb; c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); start_ep_timer(ep); - state_set(&ep->com, MPA_REQ_SENT); + __state_set(&ep->com, MPA_REQ_SENT); ep->mpa_attr.initiator = 1; + ep->snd_seq += mpalen; return; } @@ -630,7 +902,7 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) memset(mpa, 0, sizeof(*mpa)); memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); mpa->flags = MPA_REJECT; - mpa->revision = mpa_rev; + mpa->revision = ep->mpa_attr.version; mpa->private_data_size = htons(plen); if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { @@ -666,6 +938,7 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) t4_set_arp_err_handler(skb, NULL, arp_failure_discard); BUG_ON(ep->mpa_skb); ep->mpa_skb = skb; + ep->snd_seq += mpalen; return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } @@ -749,7 +1022,8 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) skb_get(skb); t4_set_arp_err_handler(skb, NULL, arp_failure_discard); ep->mpa_skb = skb; - state_set(&ep->com, MPA_REP_SENT); + __state_set(&ep->com, MPA_REP_SENT); + ep->snd_seq += mpalen; return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } @@ -766,11 +1040,13 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) PDBG("%s ep %p tid %u snd_isn %u rcv_isn %u\n", __func__, ep, tid, be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn)); + mutex_lock(&ep->com.mutex); dst_confirm(ep->dst); /* setup the hwtid for this connection */ ep->hwtid = tid; cxgb4_insert_tid(t, ep, tid); + insert_handle(dev, &dev->hwtid_idr, ep, ep->hwtid); ep->snd_seq = be32_to_cpu(req->snd_isn); ep->rcv_seq = be32_to_cpu(req->rcv_isn); @@ -778,7 +1054,9 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) set_emss(ep, ntohs(req->tcp_opt)); /* dealloc the atid */ + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid); cxgb4_free_atid(t, atid); + set_bit(ACT_ESTAB, &ep->com.history); /* start MPA negotiation */ send_flowc(ep, NULL); @@ -786,32 +1064,33 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) send_mpa_req(ep, skb, 1); else send_mpa_req(ep, skb, mpa_rev); - + mutex_unlock(&ep->com.mutex); return 0; } -static void close_complete_upcall(struct c4iw_ep *ep) +static void close_complete_upcall(struct c4iw_ep *ep, int status) { struct iw_cm_event event; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CLOSE; + event.status = status; if (ep->com.cm_id) { PDBG("close complete delivered ep %p cm_id %p tid %u\n", ep, ep->com.cm_id, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; - ep->com.qp = NULL; + set_bit(CLOSE_UPCALL, &ep->com.history); } } static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) { PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - close_complete_upcall(ep); - state_set(&ep->com, ABORTING); + __state_set(&ep->com, ABORTING); + set_bit(ABORT_CONN, &ep->com.history); return send_abort(ep, skb, gfp); } @@ -826,6 +1105,7 @@ static void peer_close_upcall(struct c4iw_ep *ep) PDBG("peer close delivered ep %p cm_id %p tid %u\n", ep, ep->com.cm_id, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); + set_bit(DISCONN_UPCALL, &ep->com.history); } } @@ -843,7 +1123,7 @@ static void peer_abort_upcall(struct c4iw_ep *ep) ep->com.cm_id->event_handler(ep->com.cm_id, &event); ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; - ep->com.qp = NULL; + set_bit(ABORT_UPCALL, &ep->com.history); } } @@ -855,8 +1135,10 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status) memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REPLY; event.status = status; - event.local_addr = ep->com.local_addr; - event.remote_addr = ep->com.remote_addr; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); if ((status == 0) || (status == -ECONNREFUSED)) { if (!ep->tried_with_mpa_v1) { @@ -876,24 +1158,27 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status) PDBG("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid, status); + set_bit(CONN_RPL_UPCALL, &ep->com.history); ep->com.cm_id->event_handler(ep->com.cm_id, &event); if (status < 0) { ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; - ep->com.qp = NULL; } } -static void connect_request_upcall(struct c4iw_ep *ep) +static int connect_request_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; + int ret; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REQUEST; - event.local_addr = ep->com.local_addr; - event.remote_addr = ep->com.remote_addr; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); event.provider_data = ep; if (!ep->tried_with_mpa_v1) { /* this means MPA_v2 is used */ @@ -910,14 +1195,14 @@ static void connect_request_upcall(struct c4iw_ep *ep) event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); } - if (state_read(&ep->parent_ep->com) != DEAD) { - c4iw_get_ep(&ep->com); - ep->parent_ep->com.cm_id->event_handler( - ep->parent_ep->com.cm_id, - &event); - } + c4iw_get_ep(&ep->com); + ret = ep->parent_ep->com.cm_id->event_handler(ep->parent_ep->com.cm_id, + &event); + if (ret) + c4iw_put_ep(&ep->com); + set_bit(CONNREQ_UPCALL, &ep->com.history); c4iw_put_ep(&ep->parent_ep->com); - ep->parent_ep = NULL; + return ret; } static void established_upcall(struct c4iw_ep *ep) @@ -932,6 +1217,7 @@ static void established_upcall(struct c4iw_ep *ep) if (ep->com.cm_id) { PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); + set_bit(ESTAB_UPCALL, &ep->com.history); } } @@ -948,6 +1234,14 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits) return 0; } + /* + * If we couldn't specify the entire rcv window at connection setup + * due to the limit in the number of bits in the RCV_BUFSIZ field, + * then add the overage in to the credits returned. + */ + if (ep->rcv_win > RCV_BUFSIZ_MASK * 1024) + credits += ep->rcv_win - RCV_BUFSIZ_MASK * 1024; + req = (struct cpl_rx_data_ack *) skb_put(skb, wrlen); memset(req, 0, wrlen); INIT_TP_WR(req, ep->hwtid); @@ -961,7 +1255,7 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits) return credits; } -static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) +static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) { struct mpa_message *mpa; struct mpa_v2_conn_params *mpa_v2_params; @@ -971,17 +1265,17 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) struct c4iw_qp_attributes attrs; enum c4iw_qp_attr_mask mask; int err; + int disconnect = 0; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); /* - * Stop mpa timer. If it expired, then the state has - * changed and we bail since ep_timeout already aborted - * the connection. + * Stop mpa timer. If it expired, then + * we ignore the MPA reply. process_timeout() + * will abort the connection. */ - stop_ep_timer(ep); - if (state_read(&ep->com) != MPA_REQ_SENT) - return; + if (stop_ep_timer(ep)) + return 0; /* * If we get more than the supported amount of private data @@ -1003,7 +1297,7 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) * if we don't even have the mpa message, then bail. */ if (ep->mpa_pkt_len < sizeof(*mpa)) - return; + return 0; mpa = (struct mpa_message *) ep->mpa_pkt; /* Validate MPA header. */ @@ -1043,7 +1337,7 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) * We'll continue process when more data arrives. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) - return; + return 0; if (mpa->flags & MPA_REJECT) { err = -ECONNREFUSED; @@ -1055,7 +1349,7 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) * start reply message including private data. And * the MPA header is valid. */ - state_set(&ep->com, FPDU_MODE); + __state_set(&ep->com, FPDU_MODE); ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; @@ -1145,9 +1439,11 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) attrs.layer_etype = LAYER_MPA | DDP_LLP; attrs.ecode = MPA_NOMATCH_RTR; attrs.next_state = C4IW_QP_STATE_TERMINATE; + attrs.send_term = 1; err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, - C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); err = -ENOMEM; + disconnect = 1; goto out; } @@ -1163,18 +1459,20 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) attrs.layer_etype = LAYER_MPA | DDP_LLP; attrs.ecode = MPA_INSUFF_IRD; attrs.next_state = C4IW_QP_STATE_TERMINATE; + attrs.send_term = 1; err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, - C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); err = -ENOMEM; + disconnect = 1; goto out; } goto out; err: - state_set(&ep->com, ABORTING); + __state_set(&ep->com, ABORTING); send_abort(ep, skb, GFP_KERNEL); out: connect_reply_upcall(ep, err); - return; + return disconnect; } static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) @@ -1185,15 +1483,12 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - if (state_read(&ep->com) != MPA_REQ_WAIT) - return; - /* * If we get more than the supported amount of private data * then we must fail this connection. */ if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { - stop_ep_timer(ep); + (void)stop_ep_timer(ep); abort_connection(ep, skb, GFP_KERNEL); return; } @@ -1215,7 +1510,6 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) return; PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); - stop_ep_timer(ep); mpa = (struct mpa_message *) ep->mpa_pkt; /* @@ -1224,11 +1518,13 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) if (mpa->revision > mpa_rev) { printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d," " Received = %d\n", __func__, mpa_rev, mpa->revision); + (void)stop_ep_timer(ep); abort_connection(ep, skb, GFP_KERNEL); return; } if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { + (void)stop_ep_timer(ep); abort_connection(ep, skb, GFP_KERNEL); return; } @@ -1239,6 +1535,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) * Fail if there's too much private data. */ if (plen > MPA_MAX_PRIVATE_DATA) { + (void)stop_ep_timer(ep); abort_connection(ep, skb, GFP_KERNEL); return; } @@ -1247,6 +1544,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) * If plen does not account for pkt size */ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + (void)stop_ep_timer(ep); abort_connection(ep, skb, GFP_KERNEL); return; } @@ -1303,10 +1601,24 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, ep->mpa_attr.p2p_type); - state_set(&ep->com, MPA_REQ_RCVD); - - /* drive upcall */ - connect_request_upcall(ep); + /* + * If the endpoint timer already expired, then we ignore + * the start request. process_timeout() will abort + * the connection. + */ + if (!stop_ep_timer(ep)) { + __state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + mutex_lock(&ep->parent_ep->com.mutex); + if (ep->parent_ep->com.state != DEAD) { + if (connect_request_upcall(ep)) + abort_connection(ep, skb, GFP_KERNEL); + } else { + abort_connection(ep, skb, GFP_KERNEL); + } + mutex_unlock(&ep->parent_ep->com.mutex); + } return; } @@ -1317,38 +1629,49 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) unsigned int dlen = ntohs(hdr->len); unsigned int tid = GET_TID(hdr); struct tid_info *t = dev->rdev.lldi.tids; + __u8 status = hdr->status; + int disconnect = 0; ep = lookup_tid(t, tid); + if (!ep) + return 0; PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen); skb_pull(skb, sizeof(*hdr)); skb_trim(skb, dlen); - - ep->rcv_seq += dlen; - BUG_ON(ep->rcv_seq != (ntohl(hdr->seq) + dlen)); + mutex_lock(&ep->com.mutex); /* update RX credits */ update_rx_credits(ep, dlen); - switch (state_read(&ep->com)) { + switch (ep->com.state) { case MPA_REQ_SENT: - process_mpa_reply(ep, skb); + ep->rcv_seq += dlen; + disconnect = process_mpa_reply(ep, skb); break; case MPA_REQ_WAIT: + ep->rcv_seq += dlen; process_mpa_request(ep, skb); break; - case MPA_REP_SENT: + case FPDU_MODE: { + struct c4iw_qp_attributes attrs; + BUG_ON(!ep->com.qp); + if (status) + pr_err("%s Unexpected streaming data." \ + " qpid %u ep %p state %d tid %u status %d\n", + __func__, ep->com.qp->wq.sq.qid, ep, + ep->com.state, ep->hwtid, status); + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + disconnect = 1; break; + } default: - printk(KERN_ERR MOD "%s Unexpected streaming data." - " ep %p state %d tid %u\n", - __func__, ep, state_read(&ep->com), ep->hwtid); - - /* - * The ep will timeout and inform the ULP of the failure. - * See ep_timeout(). - */ break; } + mutex_unlock(&ep->com.mutex); + if (disconnect) + c4iw_ep_disconnect(ep, 0, GFP_KERNEL); return 0; } @@ -1369,6 +1692,7 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) mutex_lock(&ep->com.mutex); switch (ep->com.state) { case ABORTING: + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); __state_set(&ep->com, DEAD); release = 1; break; @@ -1384,6 +1708,78 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } +static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) +{ + struct sk_buff *skb; + struct fw_ofld_connection_wr *req; + unsigned int mtu_idx; + int wscale; + struct sockaddr_in *sin; + int win; + + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + req = (struct fw_ofld_connection_wr *)__skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR)); + req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16))); + req->le.filter = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + sin = (struct sockaddr_in *)&ep->com.mapped_local_addr; + req->le.lport = sin->sin_port; + req->le.u.ipv4.lip = sin->sin_addr.s_addr; + sin = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + req->le.pport = sin->sin_port; + req->le.u.ipv4.pip = sin->sin_addr.s_addr; + req->tcb.t_state_to_astid = + htonl(V_FW_OFLD_CONNECTION_WR_T_STATE(TCP_SYN_SENT) | + V_FW_OFLD_CONNECTION_WR_ASTID(atid)); + req->tcb.cplrxdataack_cplpassacceptrpl = + htons(F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK); + req->tcb.tx_max = (__force __be32) jiffies; + req->tcb.rcv_adv = htons(1); + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps); + wscale = compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_MASK) + win = RCV_BUFSIZ_MASK; + + req->tcb.opt0 = (__force __be64) (TCAM_BYPASS(1) | + (nocong ? NO_CONG(1) : 0) | + KEEP_ALIVE(1) | + DELACK(1) | + WND_SCALE(wscale) | + MSS_IDX(mtu_idx) | + L2T_IDX(ep->l2t->idx) | + TX_CHAN(ep->tx_chan) | + SMAC_SEL(ep->smac_idx) | + DSCP(ep->tos) | + ULP_MODE(ULP_MODE_TCPDDP) | + RCV_BUFSIZ(win)); + req->tcb.opt2 = (__force __be32) (PACE(1) | + TX_QUEUE(ep->com.dev->rdev.lldi.tx_modq[ep->tx_chan]) | + RX_CHANNEL(0) | + CCTRL_ECN(enable_ecn) | + RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid)); + if (enable_tcp_timestamps) + req->tcb.opt2 |= (__force __be32) TSTAMPS_EN(1); + if (enable_tcp_sack) + req->tcb.opt2 |= (__force __be32) SACK_EN(1); + if (wscale && enable_tcp_window_scaling) + req->tcb.opt2 |= (__force __be32) WND_SCALE_EN(1); + req->tcb.opt0 = cpu_to_be64((__force u64) req->tcb.opt0); + req->tcb.opt2 = cpu_to_be32((__force u32) req->tcb.opt2); + set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx); + set_bit(ACT_OFLD_CONN, &ep->com.history); + c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + /* * Return whether a failed active open has allocated a TID */ @@ -1393,6 +1789,190 @@ static inline int act_open_has_tid(int status) status != CPL_ERR_ARP_MISS; } +/* Returns whether a CPL status conveys negative advice. + */ +static int is_neg_adv(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE || + status == CPL_ERR_KEEPALV_NEG_ADVICE; +} + +static void set_tcp_window(struct c4iw_ep *ep, struct port_info *pi) +{ + ep->snd_win = snd_win; + ep->rcv_win = rcv_win; + PDBG("%s snd_win %d rcv_win %d\n", __func__, ep->snd_win, ep->rcv_win); +} + +#define ACT_OPEN_RETRY_COUNT 2 + +static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, + struct dst_entry *dst, struct c4iw_dev *cdev, + bool clear_mpa_v1) +{ + struct neighbour *n; + int err, step; + struct net_device *pdev; + + n = dst_neigh_lookup(dst, peer_ip); + if (!n) + return -ENODEV; + + rcu_read_lock(); + err = -ENOMEM; + if (n->dev->flags & IFF_LOOPBACK) { + if (iptype == 4) + pdev = ip_dev_find(&init_net, *(__be32 *)peer_ip); + else if (IS_ENABLED(CONFIG_IPV6)) + for_each_netdev(&init_net, pdev) { + if (ipv6_chk_addr(&init_net, + (struct in6_addr *)peer_ip, + pdev, 1)) + break; + } + else + pdev = NULL; + + if (!pdev) { + err = -ENODEV; + goto out; + } + ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, + n, pdev, 0); + if (!ep->l2t) + goto out; + ep->mtu = pdev->mtu; + ep->tx_chan = cxgb4_port_chan(pdev); + ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1; + step = cdev->rdev.lldi.ntxq / + cdev->rdev.lldi.nchan; + ep->txq_idx = cxgb4_port_idx(pdev) * step; + step = cdev->rdev.lldi.nrxq / + cdev->rdev.lldi.nchan; + ep->ctrlq_idx = cxgb4_port_idx(pdev); + ep->rss_qid = cdev->rdev.lldi.rxq_ids[ + cxgb4_port_idx(pdev) * step]; + set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); + dev_put(pdev); + } else { + pdev = get_real_dev(n->dev); + ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, + n, pdev, 0); + if (!ep->l2t) + goto out; + ep->mtu = dst_mtu(dst); + ep->tx_chan = cxgb4_port_chan(pdev); + ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1; + step = cdev->rdev.lldi.ntxq / + cdev->rdev.lldi.nchan; + ep->txq_idx = cxgb4_port_idx(pdev) * step; + ep->ctrlq_idx = cxgb4_port_idx(pdev); + step = cdev->rdev.lldi.nrxq / + cdev->rdev.lldi.nchan; + ep->rss_qid = cdev->rdev.lldi.rxq_ids[ + cxgb4_port_idx(pdev) * step]; + set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); + + if (clear_mpa_v1) { + ep->retry_with_mpa_v1 = 0; + ep->tried_with_mpa_v1 = 0; + } + } + err = 0; +out: + rcu_read_unlock(); + + neigh_release(n); + + return err; +} + +static int c4iw_reconnect(struct c4iw_ep *ep) +{ + int err = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *) + &ep->com.cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *) + &ep->com.cm_id->remote_addr; + struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *) + &ep->com.cm_id->local_addr; + struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *) + &ep->com.cm_id->remote_addr; + int iptype; + __u8 *ra; + + PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id); + init_timer(&ep->timer); + + /* + * Allocate an active TID to initiate a TCP connection. + */ + ep->atid = cxgb4_alloc_atid(ep->com.dev->rdev.lldi.tids, ep); + if (ep->atid == -1) { + pr_err("%s - cannot alloc atid.\n", __func__); + err = -ENOMEM; + goto fail2; + } + insert_handle(ep->com.dev, &ep->com.dev->atid_idr, ep, ep->atid); + + /* find a route */ + if (ep->com.cm_id->local_addr.ss_family == AF_INET) { + ep->dst = find_route(ep->com.dev, laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, laddr->sin_port, + raddr->sin_port, 0); + iptype = 4; + ra = (__u8 *)&raddr->sin_addr; + } else { + ep->dst = find_route6(ep->com.dev, laddr6->sin6_addr.s6_addr, + raddr6->sin6_addr.s6_addr, + laddr6->sin6_port, raddr6->sin6_port, 0, + raddr6->sin6_scope_id); + iptype = 6; + ra = (__u8 *)&raddr6->sin6_addr; + } + if (!ep->dst) { + pr_err("%s - cannot find route.\n", __func__); + err = -EHOSTUNREACH; + goto fail3; + } + err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, false); + if (err) { + pr_err("%s - cannot alloc l2e.\n", __func__); + goto fail4; + } + + PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", + __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid, + ep->l2t->idx); + + state_set(&ep->com, CONNECTING); + ep->tos = 0; + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + cxgb4_l2t_release(ep->l2t); +fail4: + dst_release(ep->dst); +fail3: + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); +fail2: + /* + * remember to send notification to upper layer. + * We are in here so the upper layer is not aware that this is + * re-connect attempt and so, upper layer is still waiting for + * response of 1st connect request. + */ + connect_reply_upcall(ep, -ECONNRESET); + c4iw_put_ep(&ep->com); +out: + return err; +} + static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) { struct c4iw_ep *ep; @@ -1401,18 +1981,28 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) ntohl(rpl->atid_status))); struct tid_info *t = dev->rdev.lldi.tids; int status = GET_AOPEN_STATUS(ntohl(rpl->atid_status)); + struct sockaddr_in *la; + struct sockaddr_in *ra; + struct sockaddr_in6 *la6; + struct sockaddr_in6 *ra6; ep = lookup_atid(t, atid); + la = (struct sockaddr_in *)&ep->com.mapped_local_addr; + ra = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + la6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + ra6 = (struct sockaddr_in6 *)&ep->com.mapped_remote_addr; PDBG("%s ep %p atid %u status %u errno %d\n", __func__, ep, atid, status, status2errno(status)); - if (status == CPL_ERR_RTX_NEG_ADVICE) { + if (is_neg_adv(status)) { printk(KERN_WARNING MOD "Connection problems for atid %u\n", atid); return 0; } + set_bit(ACT_OPEN_RPL, &ep->com.history); + /* * Log interesting failures. */ @@ -1420,14 +2010,42 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) case CPL_ERR_CONN_RESET: case CPL_ERR_CONN_TIMEDOUT: break; + case CPL_ERR_TCAM_FULL: + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.tcam_full++; + mutex_unlock(&dev->rdev.stats.lock); + if (ep->com.local_addr.ss_family == AF_INET && + dev->rdev.lldi.enable_fw_ofld_conn) { + send_fw_act_open_req(ep, + GET_TID_TID(GET_AOPEN_ATID( + ntohl(rpl->atid_status)))); + return 0; + } + break; + case CPL_ERR_CONN_EXIST: + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + set_bit(ACT_RETRY_INUSE, &ep->com.history); + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, + atid); + cxgb4_free_atid(t, atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_reconnect(ep); + return 0; + } + break; default: - printk(KERN_INFO MOD "Active open failure - " - "atid %u status %u errno %d %pI4:%u->%pI4:%u\n", - atid, status, status2errno(status), - &ep->com.local_addr.sin_addr.s_addr, - ntohs(ep->com.local_addr.sin_port), - &ep->com.remote_addr.sin_addr.s_addr, - ntohs(ep->com.remote_addr.sin_port)); + if (ep->com.local_addr.ss_family == AF_INET) { + pr_info("Active open failure - atid %u status %u errno %d %pI4:%u->%pI4:%u\n", + atid, status, status2errno(status), + &la->sin_addr.s_addr, ntohs(la->sin_port), + &ra->sin_addr.s_addr, ntohs(ra->sin_port)); + } else { + pr_info("Active open failure - atid %u status %u errno %d %pI6:%u->%pI6:%u\n", + atid, status, status2errno(status), + la6->sin6_addr.s6_addr, ntohs(la6->sin6_port), + ra6->sin6_addr.s6_addr, ntohs(ra6->sin6_port)); + } break; } @@ -1437,6 +2055,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) if (status && act_open_has_tid(status)) cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl)); + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid); cxgb4_free_atid(t, atid); dst_release(ep->dst); cxgb4_l2t_release(ep->l2t); @@ -1453,37 +2072,17 @@ static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct c4iw_listen_ep *ep = lookup_stid(t, stid); if (!ep) { - printk(KERN_ERR MOD "stid %d lookup failure!\n", stid); - return 0; + PDBG("%s stid %d lookup failure!\n", __func__, stid); + goto out; } PDBG("%s ep %p status %d error %d\n", __func__, ep, rpl->status, status2errno(rpl->status)); c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); +out: return 0; } -static int listen_stop(struct c4iw_listen_ep *ep) -{ - struct sk_buff *skb; - struct cpl_close_listsvr_req *req; - - PDBG("%s ep %p\n", __func__, ep); - skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); - if (!skb) { - printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__); - return -ENOMEM; - } - req = (struct cpl_close_listsvr_req *) skb_put(skb, sizeof(*req)); - INIT_TP_WR(req, 0); - OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, - ep->stid)); - req->reply_ctrl = cpu_to_be16( - QUEUENO(ep->com.dev->rdev.lldi.rxq_ids[0])); - set_wr_txq(skb, CPL_PRIORITY_SETUP, 0); - return c4iw_ofld_send(&ep->com.dev->rdev, skb); -} - static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_close_listsvr_rpl *rpl = cplhdr(skb); @@ -1496,7 +2095,7 @@ static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } -static void accept_cr(struct c4iw_ep *ep, __be32 peer_ip, struct sk_buff *skb, +static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, struct cpl_pass_accept_req *req) { struct cpl_pass_accept_rpl *rpl; @@ -1504,23 +2103,47 @@ static void accept_cr(struct c4iw_ep *ep, __be32 peer_ip, struct sk_buff *skb, u64 opt0; u32 opt2; int wscale; + struct cpl_t5_pass_accept_rpl *rpl5 = NULL; + int win; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); BUG_ON(skb_cloned(skb)); - skb_trim(skb, sizeof(*rpl)); + skb_get(skb); - cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx); + rpl = cplhdr(skb); + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + skb_trim(skb, roundup(sizeof(*rpl5), 16)); + rpl5 = (void *)rpl; + INIT_TP_WR(rpl5, ep->hwtid); + } else { + skb_trim(skb, sizeof(*rpl)); + INIT_TP_WR(rpl, ep->hwtid); + } + OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, + ep->hwtid)); + + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps && req->tcpopt.tstamp); wscale = compute_wscale(rcv_win); - opt0 = KEEP_ALIVE(1) | + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_MASK) + win = RCV_BUFSIZ_MASK; + opt0 = (nocong ? NO_CONG(1) : 0) | + KEEP_ALIVE(1) | DELACK(1) | WND_SCALE(wscale) | MSS_IDX(mtu_idx) | L2T_IDX(ep->l2t->idx) | TX_CHAN(ep->tx_chan) | SMAC_SEL(ep->smac_idx) | - DSCP(ep->tos) | + DSCP(ep->tos >> 2) | ULP_MODE(ULP_MODE_TCPDDP) | - RCV_BUFSIZ(rcv_win>>10); + RCV_BUFSIZ(win); opt2 = RX_CHANNEL(0) | RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid); @@ -1530,137 +2153,98 @@ static void accept_cr(struct c4iw_ep *ep, __be32 peer_ip, struct sk_buff *skb, opt2 |= SACK_EN(1); if (wscale && enable_tcp_window_scaling) opt2 |= WND_SCALE_EN(1); + if (enable_ecn) { + const struct tcphdr *tcph; + u32 hlen = ntohl(req->hdr_len); + + tcph = (const void *)(req + 1) + G_ETH_HDR_LEN(hlen) + + G_IP_HDR_LEN(hlen); + if (tcph->ece && tcph->cwr) + opt2 |= CCTRL_ECN(1); + } + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + u32 isn = (prandom_u32() & ~7UL) - 1; + opt2 |= T5_OPT_2_VALID; + opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); + opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */ + rpl5 = (void *)rpl; + memset(&rpl5->iss, 0, roundup(sizeof(*rpl5)-sizeof(*rpl), 16)); + if (peer2peer) + isn += 4; + rpl5->iss = cpu_to_be32(isn); + PDBG("%s iss %u\n", __func__, be32_to_cpu(rpl5->iss)); + } - rpl = cplhdr(skb); - INIT_TP_WR(rpl, ep->hwtid); - OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, - ep->hwtid)); rpl->opt0 = cpu_to_be64(opt0); rpl->opt2 = cpu_to_be32(opt2); set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); return; } -static void reject_cr(struct c4iw_dev *dev, u32 hwtid, __be32 peer_ip, - struct sk_buff *skb) +static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb) { - PDBG("%s c4iw_dev %p tid %u peer_ip %x\n", __func__, dev, hwtid, - peer_ip); + PDBG("%s c4iw_dev %p tid %u\n", __func__, dev, hwtid); BUG_ON(skb_cloned(skb)); skb_trim(skb, sizeof(struct cpl_tid_release)); - skb_get(skb); release_tid(&dev->rdev, hwtid, skb); return; } -static void get_4tuple(struct cpl_pass_accept_req *req, - __be32 *local_ip, __be32 *peer_ip, +static void get_4tuple(struct cpl_pass_accept_req *req, int *iptype, + __u8 *local_ip, __u8 *peer_ip, __be16 *local_port, __be16 *peer_port) { int eth_len = G_ETH_HDR_LEN(be32_to_cpu(req->hdr_len)); int ip_len = G_IP_HDR_LEN(be32_to_cpu(req->hdr_len)); struct iphdr *ip = (struct iphdr *)((u8 *)(req + 1) + eth_len); + struct ipv6hdr *ip6 = (struct ipv6hdr *)((u8 *)(req + 1) + eth_len); struct tcphdr *tcp = (struct tcphdr *) ((u8 *)(req + 1) + eth_len + ip_len); - PDBG("%s saddr 0x%x daddr 0x%x sport %u dport %u\n", __func__, - ntohl(ip->saddr), ntohl(ip->daddr), ntohs(tcp->source), - ntohs(tcp->dest)); - - *peer_ip = ip->saddr; - *local_ip = ip->daddr; + if (ip->version == 4) { + PDBG("%s saddr 0x%x daddr 0x%x sport %u dport %u\n", __func__, + ntohl(ip->saddr), ntohl(ip->daddr), ntohs(tcp->source), + ntohs(tcp->dest)); + *iptype = 4; + memcpy(peer_ip, &ip->saddr, 4); + memcpy(local_ip, &ip->daddr, 4); + } else { + PDBG("%s saddr %pI6 daddr %pI6 sport %u dport %u\n", __func__, + ip6->saddr.s6_addr, ip6->daddr.s6_addr, ntohs(tcp->source), + ntohs(tcp->dest)); + *iptype = 6; + memcpy(peer_ip, ip6->saddr.s6_addr, 16); + memcpy(local_ip, ip6->daddr.s6_addr, 16); + } *peer_port = tcp->source; *local_port = tcp->dest; return; } -static int import_ep(struct c4iw_ep *ep, __be32 peer_ip, struct dst_entry *dst, - struct c4iw_dev *cdev, bool clear_mpa_v1) -{ - struct neighbour *n; - int err, step; - - n = dst_neigh_lookup(dst, &peer_ip); - if (!n) - return -ENODEV; - - rcu_read_lock(); - err = -ENOMEM; - if (n->dev->flags & IFF_LOOPBACK) { - struct net_device *pdev; - - pdev = ip_dev_find(&init_net, peer_ip); - if (!pdev) { - err = -ENODEV; - goto out; - } - ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, - n, pdev, 0); - if (!ep->l2t) - goto out; - ep->mtu = pdev->mtu; - ep->tx_chan = cxgb4_port_chan(pdev); - ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1; - step = cdev->rdev.lldi.ntxq / - cdev->rdev.lldi.nchan; - ep->txq_idx = cxgb4_port_idx(pdev) * step; - step = cdev->rdev.lldi.nrxq / - cdev->rdev.lldi.nchan; - ep->ctrlq_idx = cxgb4_port_idx(pdev); - ep->rss_qid = cdev->rdev.lldi.rxq_ids[ - cxgb4_port_idx(pdev) * step]; - dev_put(pdev); - } else { - ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, - n, n->dev, 0); - if (!ep->l2t) - goto out; - ep->mtu = dst_mtu(dst); - ep->tx_chan = cxgb4_port_chan(n->dev); - ep->smac_idx = (cxgb4_port_viid(n->dev) & 0x7F) << 1; - step = cdev->rdev.lldi.ntxq / - cdev->rdev.lldi.nchan; - ep->txq_idx = cxgb4_port_idx(n->dev) * step; - ep->ctrlq_idx = cxgb4_port_idx(n->dev); - step = cdev->rdev.lldi.nrxq / - cdev->rdev.lldi.nchan; - ep->rss_qid = cdev->rdev.lldi.rxq_ids[ - cxgb4_port_idx(n->dev) * step]; - - if (clear_mpa_v1) { - ep->retry_with_mpa_v1 = 0; - ep->tried_with_mpa_v1 = 0; - } - } - err = 0; -out: - rcu_read_unlock(); - - neigh_release(n); - - return err; -} - static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) { - struct c4iw_ep *child_ep, *parent_ep; + struct c4iw_ep *child_ep = NULL, *parent_ep; struct cpl_pass_accept_req *req = cplhdr(skb); unsigned int stid = GET_POPEN_TID(ntohl(req->tos_stid)); struct tid_info *t = dev->rdev.lldi.tids; unsigned int hwtid = GET_TID(req); struct dst_entry *dst; - struct rtable *rt; - __be32 local_ip, peer_ip; + __u8 local_ip[16], peer_ip[16]; __be16 local_port, peer_port; int err; + u16 peer_mss = ntohs(req->tcpopt.mss); + int iptype; + unsigned short hdrs; parent_ep = lookup_stid(t, stid); - PDBG("%s parent ep %p tid %u\n", __func__, parent_ep, hwtid); - - get_4tuple(req, &local_ip, &peer_ip, &local_port, &peer_port); + if (!parent_ep) { + PDBG("%s connect request on invalid stid %d\n", __func__, stid); + goto reject; + } if (state_read(&parent_ep->com) != LISTEN) { printk(KERN_ERR "%s - listening ep not in LISTEN\n", @@ -1668,15 +2252,32 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) goto reject; } + get_4tuple(req, &iptype, local_ip, peer_ip, &local_port, &peer_port); + /* Find output route */ - rt = find_route(dev, local_ip, peer_ip, local_port, peer_port, - GET_POPEN_TOS(ntohl(req->tos_stid))); - if (!rt) { + if (iptype == 4) { + PDBG("%s parent ep %p hwtid %u laddr %pI4 raddr %pI4 lport %d rport %d peer_mss %d\n" + , __func__, parent_ep, hwtid, + local_ip, peer_ip, ntohs(local_port), + ntohs(peer_port), peer_mss); + dst = find_route(dev, *(__be32 *)local_ip, *(__be32 *)peer_ip, + local_port, peer_port, + GET_POPEN_TOS(ntohl(req->tos_stid))); + } else { + PDBG("%s parent ep %p hwtid %u laddr %pI6 raddr %pI6 lport %d rport %d peer_mss %d\n" + , __func__, parent_ep, hwtid, + local_ip, peer_ip, ntohs(local_port), + ntohs(peer_port), peer_mss); + dst = find_route6(dev, local_ip, peer_ip, local_port, peer_port, + PASS_OPEN_TOS(ntohl(req->tos_stid)), + ((struct sockaddr_in6 *) + &parent_ep->com.local_addr)->sin6_scope_id); + } + if (!dst) { printk(KERN_ERR MOD "%s - failed to find dst entry!\n", __func__); goto reject; } - dst = &rt->dst; child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL); if (!child_ep) { @@ -1686,7 +2287,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) goto reject; } - err = import_ep(child_ep, peer_ip, dst, dev, false); + err = import_ep(child_ep, iptype, peer_ip, dst, dev, false); if (err) { printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n", __func__); @@ -1695,15 +2296,35 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) goto reject; } + hdrs = sizeof(struct iphdr) + sizeof(struct tcphdr) + + ((enable_tcp_timestamps && req->tcpopt.tstamp) ? 12 : 0); + if (peer_mss && child_ep->mtu > (peer_mss + hdrs)) + child_ep->mtu = peer_mss + hdrs; + state_set(&child_ep->com, CONNECTING); child_ep->com.dev = dev; child_ep->com.cm_id = NULL; - child_ep->com.local_addr.sin_family = PF_INET; - child_ep->com.local_addr.sin_port = local_port; - child_ep->com.local_addr.sin_addr.s_addr = local_ip; - child_ep->com.remote_addr.sin_family = PF_INET; - child_ep->com.remote_addr.sin_port = peer_port; - child_ep->com.remote_addr.sin_addr.s_addr = peer_ip; + if (iptype == 4) { + struct sockaddr_in *sin = (struct sockaddr_in *) + &child_ep->com.local_addr; + sin->sin_family = PF_INET; + sin->sin_port = local_port; + sin->sin_addr.s_addr = *(__be32 *)local_ip; + sin = (struct sockaddr_in *)&child_ep->com.remote_addr; + sin->sin_family = PF_INET; + sin->sin_port = peer_port; + sin->sin_addr.s_addr = *(__be32 *)peer_ip; + } else { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) + &child_ep->com.local_addr; + sin6->sin6_family = PF_INET6; + sin6->sin6_port = local_port; + memcpy(sin6->sin6_addr.s6_addr, local_ip, 16); + sin6 = (struct sockaddr_in6 *)&child_ep->com.remote_addr; + sin6->sin6_family = PF_INET6; + sin6->sin6_port = peer_port; + memcpy(sin6->sin6_addr.s6_addr, peer_ip, 16); + } c4iw_get_ep(&parent_ep->com); child_ep->parent_ep = parent_ep; child_ep->tos = GET_POPEN_TOS(ntohl(req->tos_stid)); @@ -1715,10 +2336,12 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) init_timer(&child_ep->timer); cxgb4_insert_tid(t, child_ep, hwtid); - accept_cr(child_ep, peer_ip, skb, req); + insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid); + accept_cr(child_ep, skb, req); + set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); goto out; reject: - reject_cr(dev, hwtid, peer_ip, skb); + reject_cr(dev, hwtid, skb); out: return 0; } @@ -1735,12 +2358,16 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) ep->snd_seq = be32_to_cpu(req->snd_isn); ep->rcv_seq = be32_to_cpu(req->rcv_isn); + PDBG("%s ep %p hwtid %u tcp_opt 0x%02x\n", __func__, ep, tid, + ntohs(req->tcp_opt)); + set_emss(ep, ntohs(req->tcp_opt)); dst_confirm(ep->dst); state_set(&ep->com, MPA_REQ_WAIT); start_ep_timer(ep); send_flowc(ep, skb); + set_bit(PASS_ESTAB, &ep->com.history); return 0; } @@ -1760,6 +2387,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); dst_confirm(ep->dst); + set_bit(PEER_CLOSE, &ep->com.history); mutex_lock(&ep->com.mutex); switch (ep->com.state) { case MPA_REQ_WAIT: @@ -1805,13 +2433,13 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) disconnect = 0; break; case MORIBUND: - stop_ep_timer(ep); + (void)stop_ep_timer(ep); if (ep->com.cm_id && ep->com.qp) { attrs.next_state = C4IW_QP_STATE_IDLE; c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } - close_complete_upcall(ep); + close_complete_upcall(ep, 0); __state_set(&ep->com, DEAD); release = 1; disconnect = 0; @@ -1830,83 +2458,6 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } -/* - * Returns whether an ABORT_REQ_RSS message is a negative advice. - */ -static int is_neg_adv_abort(unsigned int status) -{ - return status == CPL_ERR_RTX_NEG_ADVICE || - status == CPL_ERR_PERSIST_NEG_ADVICE; -} - -static int c4iw_reconnect(struct c4iw_ep *ep) -{ - struct rtable *rt; - int err = 0; - - PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id); - init_timer(&ep->timer); - - /* - * Allocate an active TID to initiate a TCP connection. - */ - ep->atid = cxgb4_alloc_atid(ep->com.dev->rdev.lldi.tids, ep); - if (ep->atid == -1) { - printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); - err = -ENOMEM; - goto fail2; - } - - /* find a route */ - rt = find_route(ep->com.dev, - ep->com.cm_id->local_addr.sin_addr.s_addr, - ep->com.cm_id->remote_addr.sin_addr.s_addr, - ep->com.cm_id->local_addr.sin_port, - ep->com.cm_id->remote_addr.sin_port, 0); - if (!rt) { - printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); - err = -EHOSTUNREACH; - goto fail3; - } - ep->dst = &rt->dst; - - err = import_ep(ep, ep->com.cm_id->remote_addr.sin_addr.s_addr, - ep->dst, ep->com.dev, false); - if (err) { - printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); - goto fail4; - } - - PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", - __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid, - ep->l2t->idx); - - state_set(&ep->com, CONNECTING); - ep->tos = 0; - - /* send connect request to rnic */ - err = send_connect(ep); - if (!err) - goto out; - - cxgb4_l2t_release(ep->l2t); -fail4: - dst_release(ep->dst); -fail3: - cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); -fail2: - /* - * remember to send notification to upper layer. - * We are in here so the upper layer is not aware that this is - * re-connect attempt and so, upper layer is still waiting for - * response of 1st connect request. - */ - connect_reply_upcall(ep, -ECONNRESET); - c4iw_put_ep(&ep->com); -out: - return err; -} - static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_abort_req_rss *req = cplhdr(skb); @@ -1920,13 +2471,14 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) unsigned int tid = GET_TID(req); ep = lookup_tid(t, tid); - if (is_neg_adv_abort(req->status)) { + if (is_neg_adv(req->status)) { PDBG("%s neg_adv_abort ep %p tid %u\n", __func__, ep, ep->hwtid); return 0; } PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid, ep->com.state); + set_bit(PEER_ABORT, &ep->com.history); /* * Wake up any threads in rdma_init() or rdma_fini(). @@ -1941,11 +2493,11 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) case CONNECTING: break; case MPA_REQ_WAIT: - stop_ep_timer(ep); + (void)stop_ep_timer(ep); break; case MPA_REQ_SENT: - stop_ep_timer(ep); - if (mpa_rev == 2 && ep->tried_with_mpa_v1) + (void)stop_ep_timer(ep); + if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1)) connect_reply_upcall(ep, -ECONNRESET); else { /* @@ -2017,9 +2569,8 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) out: if (release) release_ep_resources(ep); - - /* retry with mpa-v1 */ - if (ep && ep->retry_with_mpa_v1) { + else if (ep->retry_with_mpa_v1) { + remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid); cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid); dst_release(ep->dst); cxgb4_l2t_release(ep->l2t); @@ -2050,7 +2601,7 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) __state_set(&ep->com, MORIBUND); break; case MORIBUND: - stop_ep_timer(ep); + (void)stop_ep_timer(ep); if ((ep->com.cm_id) && (ep->com.qp)) { attrs.next_state = C4IW_QP_STATE_IDLE; c4iw_modify_qp(ep->com.qp->rhp, @@ -2058,7 +2609,7 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } - close_complete_upcall(ep); + close_complete_upcall(ep, 0); __state_set(&ep->com, DEAD); release = 1; break; @@ -2133,21 +2684,28 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb) int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) { - int err; + int err = 0; + int disconnect = 0; struct c4iw_ep *ep = to_ep(cm_id); PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - if (state_read(&ep->com) == DEAD) { + mutex_lock(&ep->com.mutex); + if (ep->com.state == DEAD) { + mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return -ECONNRESET; } - BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + set_bit(ULP_REJECT, &ep->com.history); + BUG_ON(ep->com.state != MPA_REQ_RCVD); if (mpa_rev == 0) abort_connection(ep, NULL, GFP_KERNEL); else { err = send_mpa_reject(ep, pdata, pdata_len); - err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + disconnect = 1; } + mutex_unlock(&ep->com.mutex); + if (disconnect) + err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL); c4iw_put_ep(&ep->com); return 0; } @@ -2162,14 +2720,17 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct c4iw_qp *qp = get_qhp(h, conn_param->qpn); PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - if (state_read(&ep->com) == DEAD) { + + mutex_lock(&ep->com.mutex); + if (ep->com.state == DEAD) { err = -ECONNRESET; goto err; } - BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + BUG_ON(ep->com.state != MPA_REQ_RCVD); BUG_ON(!qp); + set_bit(ULP_ACCEPT, &ep->com.history); if ((conn_param->ord > c4iw_max_read_depth) || (conn_param->ird > c4iw_max_read_depth)) { abort_connection(ep, NULL, GFP_KERNEL); @@ -2210,6 +2771,7 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; ep->com.qp = qp; + ref_qp(ep); /* bind QP to EP and move to RTS */ attrs.mpa_attr = ep->mpa_attr; @@ -2234,25 +2796,95 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (err) goto err1; - state_set(&ep->com, FPDU_MODE); + __state_set(&ep->com, FPDU_MODE); established_upcall(ep); + mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return 0; err1: ep->com.cm_id = NULL; - ep->com.qp = NULL; cm_id->rem_ref(cm_id); err: + mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return err; } +static int pick_local_ipaddrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id) +{ + struct in_device *ind; + int found = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + ind = in_dev_get(dev->rdev.lldi.ports[0]); + if (!ind) + return -EADDRNOTAVAIL; + for_primary_ifa(ind) { + laddr->sin_addr.s_addr = ifa->ifa_address; + raddr->sin_addr.s_addr = ifa->ifa_address; + found = 1; + break; + } + endfor_ifa(ind); + in_dev_put(ind); + return found ? 0 : -EADDRNOTAVAIL; +} + +static int get_lladdr(struct net_device *dev, struct in6_addr *addr, + unsigned char banned_flags) +{ + struct inet6_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev != NULL) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & banned_flags)) { + memcpy(addr, &ifp->addr, 16); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return err; +} + +static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id) +{ + struct in6_addr uninitialized_var(addr); + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->remote_addr; + + if (get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) { + memcpy(la6->sin6_addr.s6_addr, &addr, 16); + memcpy(ra6->sin6_addr.s6_addr, &addr, 16); + return 0; + } + return -EADDRNOTAVAIL; +} + int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_ep *ep; - struct rtable *rt; int err = 0; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct sockaddr_in6 *laddr6; + struct sockaddr_in6 *raddr6; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + __u8 *ra; + int iptype; + int iwpm_err = 0; if ((conn_param->ord > c4iw_max_read_depth) || (conn_param->ird > c4iw_max_read_depth)) { @@ -2280,7 +2912,12 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->com.dev = dev; ep->com.cm_id = cm_id; ep->com.qp = get_qhp(dev, conn_param->qpn); - BUG_ON(!ep->com.qp); + if (!ep->com.qp) { + PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn); + err = -EINVAL; + goto fail1; + } + ref_qp(ep); PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn, ep->com.qp, cm_id); @@ -2291,33 +2928,103 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (ep->atid == -1) { printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); err = -ENOMEM; - goto fail2; + goto fail1; } + insert_handle(dev, &dev->atid_idr, ep, ep->atid); + + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + memcpy(&ep->com.remote_addr, &cm_id->remote_addr, + sizeof(ep->com.remote_addr)); + + /* No port mapper available, go with the specified peer information */ + memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, + sizeof(ep->com.mapped_local_addr)); + memcpy(&ep->com.mapped_remote_addr, &cm_id->remote_addr, + sizeof(ep->com.mapped_remote_addr)); + + c4iw_form_reg_msg(dev, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); + if (iwpm_err) { + PDBG("%s: Port Mapper reg pid fail (err = %d).\n", + __func__, iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + c4iw_form_pm_msg(ep, &pm_msg); + iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_C4IW); + if (iwpm_err) + PDBG("%s: Port Mapper query fail (err = %d).\n", + __func__, iwpm_err); + else + c4iw_record_pm_msg(ep, &pm_msg); + } + if (iwpm_create_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { + iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); + err = -ENOMEM; + goto fail1; + } + print_addr(&ep->com, __func__, "add_query/create_mapinfo"); + set_bit(RELEASE_MAPINFO, &ep->com.flags); - PDBG("%s saddr 0x%x sport 0x%x raddr 0x%x rport 0x%x\n", __func__, - ntohl(cm_id->local_addr.sin_addr.s_addr), - ntohs(cm_id->local_addr.sin_port), - ntohl(cm_id->remote_addr.sin_addr.s_addr), - ntohs(cm_id->remote_addr.sin_port)); + laddr = (struct sockaddr_in *)&ep->com.mapped_local_addr; + raddr = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + laddr6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + raddr6 = (struct sockaddr_in6 *) &ep->com.mapped_remote_addr; - /* find a route */ - rt = find_route(dev, - cm_id->local_addr.sin_addr.s_addr, - cm_id->remote_addr.sin_addr.s_addr, - cm_id->local_addr.sin_port, - cm_id->remote_addr.sin_port, 0); - if (!rt) { + if (cm_id->remote_addr.ss_family == AF_INET) { + iptype = 4; + ra = (__u8 *)&raddr->sin_addr; + + /* + * Handle loopback requests to INADDR_ANY. + */ + if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) { + err = pick_local_ipaddrs(dev, cm_id); + if (err) + goto fail1; + } + + /* find a route */ + PDBG("%s saddr %pI4 sport 0x%x raddr %pI4 rport 0x%x\n", + __func__, &laddr->sin_addr, ntohs(laddr->sin_port), + ra, ntohs(raddr->sin_port)); + ep->dst = find_route(dev, laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, laddr->sin_port, + raddr->sin_port, 0); + } else { + iptype = 6; + ra = (__u8 *)&raddr6->sin6_addr; + + /* + * Handle loopback requests to INADDR_ANY. + */ + if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) { + err = pick_local_ip6addrs(dev, cm_id); + if (err) + goto fail1; + } + + /* find a route */ + PDBG("%s saddr %pI6 sport 0x%x raddr %pI6 rport 0x%x\n", + __func__, laddr6->sin6_addr.s6_addr, + ntohs(laddr6->sin6_port), + raddr6->sin6_addr.s6_addr, ntohs(raddr6->sin6_port)); + ep->dst = find_route6(dev, laddr6->sin6_addr.s6_addr, + raddr6->sin6_addr.s6_addr, + laddr6->sin6_port, raddr6->sin6_port, 0, + raddr6->sin6_scope_id); + } + if (!ep->dst) { printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); err = -EHOSTUNREACH; - goto fail3; + goto fail2; } - ep->dst = &rt->dst; - err = import_ep(ep, cm_id->remote_addr.sin_addr.s_addr, - ep->dst, ep->com.dev, true); + err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true); if (err) { printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); - goto fail4; + goto fail3; } PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", @@ -2326,8 +3033,6 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) state_set(&ep->com, CONNECTING); ep->tos = 0; - ep->com.local_addr = cm_id->local_addr; - ep->com.remote_addr = cm_id->remote_addr; /* send connect request to rnic */ err = send_connect(ep); @@ -2335,23 +3040,82 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) goto out; cxgb4_l2t_release(ep->l2t); -fail4: - dst_release(ep->dst); fail3: - cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); + dst_release(ep->dst); fail2: + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); +fail1: cm_id->rem_ref(cm_id); c4iw_put_ep(&ep->com); out: return err; } +static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) +{ + int err; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + + c4iw_init_wr_wait(&ep->com.wr_wait); + err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0], + ep->stid, &sin6->sin6_addr, + sin6->sin6_port, + ep->com.dev->rdev.lldi.rxq_ids[0]); + if (!err) + err = c4iw_wait_for_reply(&ep->com.dev->rdev, + &ep->com.wr_wait, + 0, 0, __func__); + if (err) + pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", + err, ep->stid, + sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port)); + return err; +} + +static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) +{ + int err; + struct sockaddr_in *sin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + + if (dev->rdev.lldi.enable_fw_ofld_conn) { + do { + err = cxgb4_create_server_filter( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + sin->sin_addr.s_addr, sin->sin_port, 0, + ep->com.dev->rdev.lldi.rxq_ids[0], 0, 0); + if (err == -EBUSY) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(100)); + } + } while (err == -EBUSY); + } else { + c4iw_init_wr_wait(&ep->com.wr_wait); + err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0], + ep->stid, sin->sin_addr.s_addr, sin->sin_port, + 0, ep->com.dev->rdev.lldi.rxq_ids[0]); + if (!err) + err = c4iw_wait_for_reply(&ep->com.dev->rdev, + &ep->com.wr_wait, + 0, 0, __func__); + } + if (err) + pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n" + , err, ep->stid, + &sin->sin_addr, ntohs(sin->sin_port)); + return err; +} + int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) { int err = 0; struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_listen_ep *ep; - + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int iwpm_err = 0; might_sleep(); @@ -2366,36 +3130,70 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) ep->com.cm_id = cm_id; ep->com.dev = dev; ep->backlog = backlog; - ep->com.local_addr = cm_id->local_addr; + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); /* * Allocate a server TID. */ - ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids, PF_INET, ep); + if (dev->rdev.lldi.enable_fw_ofld_conn && + ep->com.local_addr.ss_family == AF_INET) + ep->stid = cxgb4_alloc_sftid(dev->rdev.lldi.tids, + cm_id->local_addr.ss_family, ep); + else + ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids, + cm_id->local_addr.ss_family, ep); + if (ep->stid == -1) { printk(KERN_ERR MOD "%s - cannot alloc stid.\n", __func__); err = -ENOMEM; goto fail2; } - - state_set(&ep->com, LISTEN); - c4iw_init_wr_wait(&ep->com.wr_wait); - err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0], ep->stid, - ep->com.local_addr.sin_addr.s_addr, - ep->com.local_addr.sin_port, - ep->com.dev->rdev.lldi.rxq_ids[0]); - if (err) + insert_handle(dev, &dev->stid_idr, ep, ep->stid); + + /* No port mapper available, go with the specified info */ + memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, + sizeof(ep->com.mapped_local_addr)); + + c4iw_form_reg_msg(dev, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); + if (iwpm_err) { + PDBG("%s: Port Mapper reg pid fail (err = %d).\n", + __func__, iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + memcpy(&pm_msg.loc_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_C4IW); + if (iwpm_err) + PDBG("%s: Port Mapper query fail (err = %d).\n", + __func__, iwpm_err); + else + memcpy(&ep->com.mapped_local_addr, + &pm_msg.mapped_loc_addr, + sizeof(ep->com.mapped_local_addr)); + } + if (iwpm_create_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { + err = -ENOMEM; goto fail3; + } + print_addr(&ep->com, __func__, "add_mapping/create_mapinfo"); - /* wait for pass_open_rpl */ - err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, 0, 0, - __func__); + set_bit(RELEASE_MAPINFO, &ep->com.flags); + state_set(&ep->com, LISTEN); + if (ep->com.local_addr.ss_family == AF_INET) + err = create_server4(dev, ep); + else + err = create_server6(dev, ep); if (!err) { cm_id->provider_data = ep; goto out; } + fail3: - cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, PF_INET); + cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, + ep->com.local_addr.ss_family); fail2: cm_id->rem_ref(cm_id); c4iw_put_ep(&ep->com); @@ -2413,13 +3211,24 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id) might_sleep(); state_set(&ep->com, DEAD); - c4iw_init_wr_wait(&ep->com.wr_wait); - err = listen_stop(ep); - if (err) - goto done; - err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, 0, 0, - __func__); - cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, PF_INET); + if (ep->com.dev->rdev.lldi.enable_fw_ofld_conn && + ep->com.local_addr.ss_family == AF_INET) { + err = cxgb4_remove_server_filter( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + ep->com.dev->rdev.lldi.rxq_ids[0], 0); + } else { + c4iw_init_wr_wait(&ep->com.wr_wait); + err = cxgb4_remove_server( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + ep->com.dev->rdev.lldi.rxq_ids[0], 0); + if (err) + goto done; + err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, + 0, 0, __func__); + } + remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid); + cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, + ep->com.local_addr.ss_family); done: cm_id->rem_ref(cm_id); c4iw_put_ep(&ep->com); @@ -2441,7 +3250,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) rdev = &ep->com.dev->rdev; if (c4iw_fatal_error(rdev)) { fatal = 1; - close_complete_upcall(ep); + close_complete_upcall(ep, -EIO); ep->com.state = DEAD; } switch (ep->com.state) { @@ -2463,7 +3272,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) { close = 1; if (abrupt) { - stop_ep_timer(ep); + (void)stop_ep_timer(ep); ep->com.state = ABORTING; } else ep->com.state = MORIBUND; @@ -2482,10 +3291,13 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) if (close) { if (abrupt) { - close_complete_upcall(ep); + set_bit(EP_DISC_ABORT, &ep->com.history); + close_complete_upcall(ep, -ECONNRESET); ret = send_abort(ep, NULL, gfp); - } else + } else { + set_bit(EP_DISC_CLOSE, &ep->com.history); ret = send_halfclose(ep, gfp); + } if (ret) fatal = 1; } @@ -2495,10 +3307,351 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) return ret; } -static int async_event(struct c4iw_dev *dev, struct sk_buff *skb) +static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb, + struct cpl_fw6_msg_ofld_connection_wr_rpl *req) +{ + struct c4iw_ep *ep; + int atid = be32_to_cpu(req->tid); + + ep = (struct c4iw_ep *)lookup_atid(dev->rdev.lldi.tids, + (__force u32) req->tid); + if (!ep) + return; + + switch (req->retval) { + case FW_ENOMEM: + set_bit(ACT_RETRY_NOMEM, &ep->com.history); + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + send_fw_act_open_req(ep, atid); + return; + } + case FW_EADDRINUSE: + set_bit(ACT_RETRY_INUSE, &ep->com.history); + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + send_fw_act_open_req(ep, atid); + return; + } + break; + default: + pr_info("%s unexpected ofld conn wr retval %d\n", + __func__, req->retval); + break; + } + pr_err("active ofld_connect_wr failure %d atid %d\n", + req->retval, atid); + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.act_ofld_conn_fails++; + mutex_unlock(&dev->rdev.stats.lock); + connect_reply_upcall(ep, status2errno(req->retval)); + state_set(&ep->com, DEAD); + remove_handle(dev, &dev->atid_idr, atid); + cxgb4_free_atid(dev->rdev.lldi.tids, atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_put_ep(&ep->com); +} + +static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb, + struct cpl_fw6_msg_ofld_connection_wr_rpl *req) +{ + struct sk_buff *rpl_skb; + struct cpl_pass_accept_req *cpl; + int ret; + + rpl_skb = (struct sk_buff *)(unsigned long)req->cookie; + BUG_ON(!rpl_skb); + if (req->retval) { + PDBG("%s passive open failure %d\n", __func__, req->retval); + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.pas_ofld_conn_fails++; + mutex_unlock(&dev->rdev.stats.lock); + kfree_skb(rpl_skb); + } else { + cpl = (struct cpl_pass_accept_req *)cplhdr(rpl_skb); + OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, + (__force u32) htonl( + (__force u32) req->tid))); + ret = pass_accept_req(dev, rpl_skb); + if (!ret) + kfree_skb(rpl_skb); + } + return; +} + +static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_fw6_msg *rpl = cplhdr(skb); - c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]); + struct cpl_fw6_msg_ofld_connection_wr_rpl *req; + + switch (rpl->type) { + case FW6_TYPE_CQE: + c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]); + break; + case FW6_TYPE_OFLD_CONNECTION_WR_RPL: + req = (struct cpl_fw6_msg_ofld_connection_wr_rpl *)rpl->data; + switch (req->t_state) { + case TCP_SYN_SENT: + active_ofld_conn_reply(dev, skb, req); + break; + case TCP_SYN_RECV: + passive_ofld_conn_reply(dev, skb, req); + break; + default: + pr_err("%s unexpected ofld conn wr state %d\n", + __func__, req->t_state); + break; + } + break; + } + return 0; +} + +static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos) +{ + u32 l2info; + u16 vlantag, len, hdr_len, eth_hdr_len; + u8 intf; + struct cpl_rx_pkt *cpl = cplhdr(skb); + struct cpl_pass_accept_req *req; + struct tcp_options_received tmp_opt; + struct c4iw_dev *dev; + + dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); + /* Store values from cpl_rx_pkt in temporary location. */ + vlantag = (__force u16) cpl->vlan; + len = (__force u16) cpl->len; + l2info = (__force u32) cpl->l2info; + hdr_len = (__force u16) cpl->hdr_len; + intf = cpl->iff; + + __skb_pull(skb, sizeof(*req) + sizeof(struct rss_header)); + + /* + * We need to parse the TCP options from SYN packet. + * to generate cpl_pass_accept_req. + */ + memset(&tmp_opt, 0, sizeof(tmp_opt)); + tcp_clear_options(&tmp_opt); + tcp_parse_options(skb, &tmp_opt, 0, NULL); + + req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->l2info = cpu_to_be16(V_SYN_INTF(intf) | + V_SYN_MAC_IDX(G_RX_MACIDX( + (__force int) htonl(l2info))) | + F_SYN_XACT_MATCH); + eth_hdr_len = is_t4(dev->rdev.lldi.adapter_type) ? + G_RX_ETHHDR_LEN((__force int) htonl(l2info)) : + G_RX_T5_ETHHDR_LEN((__force int) htonl(l2info)); + req->hdr_len = cpu_to_be32(V_SYN_RX_CHAN(G_RX_CHAN( + (__force int) htonl(l2info))) | + V_TCP_HDR_LEN(G_RX_TCPHDR_LEN( + (__force int) htons(hdr_len))) | + V_IP_HDR_LEN(G_RX_IPHDR_LEN( + (__force int) htons(hdr_len))) | + V_ETH_HDR_LEN(G_RX_ETHHDR_LEN(eth_hdr_len))); + req->vlan = (__force __be16) vlantag; + req->len = (__force __be16) len; + req->tos_stid = cpu_to_be32(PASS_OPEN_TID(stid) | + PASS_OPEN_TOS(tos)); + req->tcpopt.mss = htons(tmp_opt.mss_clamp); + if (tmp_opt.wscale_ok) + req->tcpopt.wsf = tmp_opt.snd_wscale; + req->tcpopt.tstamp = tmp_opt.saw_tstamp; + if (tmp_opt.sack_ok) + req->tcpopt.sack = 1; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, 0)); + return; +} + +static void send_fw_pass_open_req(struct c4iw_dev *dev, struct sk_buff *skb, + __be32 laddr, __be16 lport, + __be32 raddr, __be16 rport, + u32 rcv_isn, u32 filter, u16 window, + u32 rss_qid, u8 port_id) +{ + struct sk_buff *req_skb; + struct fw_ofld_connection_wr *req; + struct cpl_pass_accept_req *cpl = cplhdr(skb); + int ret; + + req_skb = alloc_skb(sizeof(struct fw_ofld_connection_wr), GFP_KERNEL); + req = (struct fw_ofld_connection_wr *)__skb_put(req_skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR) | FW_WR_COMPL(1)); + req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16))); + req->le.version_cpl = htonl(F_FW_OFLD_CONNECTION_WR_CPL); + req->le.filter = (__force __be32) filter; + req->le.lport = lport; + req->le.pport = rport; + req->le.u.ipv4.lip = laddr; + req->le.u.ipv4.pip = raddr; + req->tcb.rcv_nxt = htonl(rcv_isn + 1); + req->tcb.rcv_adv = htons(window); + req->tcb.t_state_to_astid = + htonl(V_FW_OFLD_CONNECTION_WR_T_STATE(TCP_SYN_RECV) | + V_FW_OFLD_CONNECTION_WR_RCV_SCALE(cpl->tcpopt.wsf) | + V_FW_OFLD_CONNECTION_WR_ASTID( + GET_PASS_OPEN_TID(ntohl(cpl->tos_stid)))); + + /* + * We store the qid in opt2 which will be used by the firmware + * to send us the wr response. + */ + req->tcb.opt2 = htonl(V_RSS_QUEUE(rss_qid)); + + /* + * We initialize the MSS index in TCB to 0xF. + * So that when driver sends cpl_pass_accept_rpl + * TCB picks up the correct value. If this was 0 + * TP will ignore any value > 0 for MSS index. + */ + req->tcb.opt0 = cpu_to_be64(V_MSS_IDX(0xF)); + req->cookie = (unsigned long)skb; + + set_wr_txq(req_skb, CPL_PRIORITY_CONTROL, port_id); + ret = cxgb4_ofld_send(dev->rdev.lldi.ports[0], req_skb); + if (ret < 0) { + pr_err("%s - cxgb4_ofld_send error %d - dropping\n", __func__, + ret); + kfree_skb(skb); + kfree_skb(req_skb); + } +} + +/* + * Handler for CPL_RX_PKT message. Need to handle cpl_rx_pkt + * messages when a filter is being used instead of server to + * redirect a syn packet. When packets hit filter they are redirected + * to the offload queue and driver tries to establish the connection + * using firmware work request. + */ +static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) +{ + int stid; + unsigned int filter; + struct ethhdr *eh = NULL; + struct vlan_ethhdr *vlan_eh = NULL; + struct iphdr *iph; + struct tcphdr *tcph; + struct rss_header *rss = (void *)skb->data; + struct cpl_rx_pkt *cpl = (void *)skb->data; + struct cpl_pass_accept_req *req = (void *)(rss + 1); + struct l2t_entry *e; + struct dst_entry *dst; + struct c4iw_ep *lep; + u16 window; + struct port_info *pi; + struct net_device *pdev; + u16 rss_qid, eth_hdr_len; + int step; + u32 tx_chan; + struct neighbour *neigh; + + /* Drop all non-SYN packets */ + if (!(cpl->l2info & cpu_to_be32(F_RXF_SYN))) + goto reject; + + /* + * Drop all packets which did not hit the filter. + * Unlikely to happen. + */ + if (!(rss->filter_hit && rss->filter_tid)) + goto reject; + + /* + * Calculate the server tid from filter hit index from cpl_rx_pkt. + */ + stid = (__force int) cpu_to_be32((__force u32) rss->hash_val); + + lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid); + if (!lep) { + PDBG("%s connect request on invalid stid %d\n", __func__, stid); + goto reject; + } + + eth_hdr_len = is_t4(dev->rdev.lldi.adapter_type) ? + G_RX_ETHHDR_LEN(htonl(cpl->l2info)) : + G_RX_T5_ETHHDR_LEN(htonl(cpl->l2info)); + if (eth_hdr_len == ETH_HLEN) { + eh = (struct ethhdr *)(req + 1); + iph = (struct iphdr *)(eh + 1); + } else { + vlan_eh = (struct vlan_ethhdr *)(req + 1); + iph = (struct iphdr *)(vlan_eh + 1); + skb->vlan_tci = ntohs(cpl->vlan); + } + + if (iph->version != 0x4) + goto reject; + + tcph = (struct tcphdr *)(iph + 1); + skb_set_network_header(skb, (void *)iph - (void *)rss); + skb_set_transport_header(skb, (void *)tcph - (void *)rss); + skb_get(skb); + + PDBG("%s lip 0x%x lport %u pip 0x%x pport %u tos %d\n", __func__, + ntohl(iph->daddr), ntohs(tcph->dest), ntohl(iph->saddr), + ntohs(tcph->source), iph->tos); + + dst = find_route(dev, iph->daddr, iph->saddr, tcph->dest, tcph->source, + iph->tos); + if (!dst) { + pr_err("%s - failed to find dst entry!\n", + __func__); + goto reject; + } + neigh = dst_neigh_lookup_skb(dst, skb); + + if (!neigh) { + pr_err("%s - failed to allocate neigh!\n", + __func__); + goto free_dst; + } + + if (neigh->dev->flags & IFF_LOOPBACK) { + pdev = ip_dev_find(&init_net, iph->daddr); + e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh, + pdev, 0); + pi = (struct port_info *)netdev_priv(pdev); + tx_chan = cxgb4_port_chan(pdev); + dev_put(pdev); + } else { + pdev = get_real_dev(neigh->dev); + e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh, + pdev, 0); + pi = (struct port_info *)netdev_priv(pdev); + tx_chan = cxgb4_port_chan(pdev); + } + neigh_release(neigh); + if (!e) { + pr_err("%s - failed to allocate l2t entry!\n", + __func__); + goto free_dst; + } + + step = dev->rdev.lldi.nrxq / dev->rdev.lldi.nchan; + rss_qid = dev->rdev.lldi.rxq_ids[pi->port_id * step]; + window = (__force u16) htons((__force u16)tcph->window); + + /* Calcuate filter portion for LE region. */ + filter = (__force unsigned int) cpu_to_be32(cxgb4_select_ntuple( + dev->rdev.lldi.ports[0], + e)); + + /* + * Synthesize the cpl_pass_accept_req. We have everything except the + * TID. Once firmware sends a reply with TID we update the TID field + * in cpl and pass it through the regular cpl_pass_accept_req path. + */ + build_cpl_pass_accept_req(skb, stid, iph->tos); + send_fw_pass_open_req(dev, skb, iph->daddr, tcph->dest, iph->saddr, + tcph->source, ntohl(tcph->seq), filter, window, + rss_qid, pi->port_id); + cxgb4_l2t_release(e); +free_dst: + dst_release(dst); +reject: return 0; } @@ -2521,7 +3674,8 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = { [CPL_CLOSE_CON_RPL] = close_con_rpl, [CPL_RDMA_TERMINATE] = terminate, [CPL_FW4_ACK] = fw4_ack, - [CPL_FW6_MSG] = async_event + [CPL_FW6_MSG] = deferred_fw6_msg, + [CPL_RX_PKT] = rx_pkt }; static void process_timeout(struct c4iw_ep *ep) @@ -2532,6 +3686,7 @@ static void process_timeout(struct c4iw_ep *ep) mutex_lock(&ep->com.mutex); PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid, ep->com.state); + set_bit(TIMEDOUT, &ep->com.history); switch (ep->com.state) { case MPA_REQ_SENT: __state_set(&ep->com, ABORTING); @@ -2549,16 +3704,26 @@ static void process_timeout(struct c4iw_ep *ep) &attrs, 1); } __state_set(&ep->com, ABORTING); + close_complete_upcall(ep, -ETIMEDOUT); + break; + case ABORTING: + case DEAD: + + /* + * These states are expected if the ep timed out at the same + * time as another thread was calling stop_ep_timer(). + * So we silently do nothing for these states. + */ + abort = 0; break; default: - printk(KERN_ERR "%s unexpected state ep %p tid %u state %u\n", + WARN(1, "%s unexpected state ep %p tid %u state %u\n", __func__, ep, ep->hwtid, ep->com.state); - WARN_ON(1); abort = 0; } - mutex_unlock(&ep->com.mutex); if (abort) abort_connection(ep, NULL, GFP_KERNEL); + mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); } @@ -2572,6 +3737,8 @@ static void process_timedout_eps(void) tmp = timeout_list.next; list_del(tmp); + tmp->next = NULL; + tmp->prev = NULL; spin_unlock_irq(&timeout_lock); ep = list_entry(tmp, struct c4iw_ep, entry); process_timeout(ep); @@ -2588,6 +3755,7 @@ static void process_work(struct work_struct *work) unsigned int opcode; int ret; + process_timedout_eps(); while ((skb = skb_dequeue(&rxq))) { rpl = cplhdr(skb); dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); @@ -2597,8 +3765,8 @@ static void process_work(struct work_struct *work) ret = work_handlers[opcode](dev, skb); if (!ret) kfree_skb(skb); + process_timedout_eps(); } - process_timedout_eps(); } static DECLARE_WORK(skb_work, process_work); @@ -2606,11 +3774,21 @@ static DECLARE_WORK(skb_work, process_work); static void ep_timeout(unsigned long arg) { struct c4iw_ep *ep = (struct c4iw_ep *)arg; + int kickit = 0; spin_lock(&timeout_lock); - list_add_tail(&ep->entry, &timeout_list); + if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { + /* + * Only insert if it is not already on the list. + */ + if (!ep->entry.next) { + list_add_tail(&ep->entry, &timeout_list); + kickit = 1; + } + } spin_unlock(&timeout_lock); - queue_work(workq, &skb_work); + if (kickit) + queue_work(workq, &skb_work); } /* @@ -2653,7 +3831,7 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) PDBG("%s type %u\n", __func__, rpl->type); switch (rpl->type) { - case 1: + case FW6_TYPE_WR_RPL: ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff); wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1]; PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret); @@ -2661,7 +3839,8 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) c4iw_wake_up(wr_waitp, ret ? -ret : 0); kfree_skb(skb); break; - case 2: + case FW6_TYPE_CQE: + case FW6_TYPE_OFLD_CONNECTION_WR_RPL: sched(dev, skb); break; default: @@ -2687,7 +3866,7 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) kfree_skb(skb); return 0; } - if (is_neg_adv_abort(req->status)) { + if (is_neg_adv(req->status)) { PDBG("%s neg_adv_abort ep %p tid %u\n", __func__, ep, ep->hwtid); kfree_skb(skb); @@ -2698,8 +3877,14 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) /* * Wake up any threads in rdma_init() or rdma_fini(). + * However, if we are on MPAv2 and want to retry with MPAv1 + * then, don't wake up yet. */ - c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + if (mpa_rev == 2 && !ep->tried_with_mpa_v1) { + if (ep->com.state != MPA_REQ_SENT) + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + } else + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); sched(dev, skb); return 0; } @@ -2724,7 +3909,8 @@ c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = { [CPL_RDMA_TERMINATE] = sched, [CPL_FW4_ACK] = sched, [CPL_SET_TCB_RPL] = set_tcb_rpl, - [CPL_FW6_MSG] = fw6_msg + [CPL_FW6_MSG] = fw6_msg, + [CPL_RX_PKT] = sched }; int __init c4iw_cm_init(void) @@ -2739,7 +3925,7 @@ int __init c4iw_cm_init(void) return 0; } -void __exit c4iw_cm_term(void) +void c4iw_cm_term(void) { WARN_ON(!list_empty(&timeout_list)); flush_workqueue(workq); diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 0f1607c8325..c04292c950f 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -134,7 +134,8 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, V_FW_RI_RES_WR_IQANUS(0) | V_FW_RI_RES_WR_IQANUD(1) | F_FW_RI_RES_WR_IQANDST | - V_FW_RI_RES_WR_IQANDSTINDEX(*rdev->lldi.rxq_ids)); + V_FW_RI_RES_WR_IQANDSTINDEX( + rdev->lldi.ciq_ids[cq->vector])); res->u.cq.iqdroprss_to_iqesize = cpu_to_be16( F_FW_RI_RES_WR_IQDROPRSS | V_FW_RI_RES_WR_IQPCIECH(2) | @@ -225,43 +226,186 @@ static void insert_sq_cqe(struct t4_wq *wq, struct t4_cq *cq, t4_swcq_produce(cq); } -int c4iw_flush_sq(struct t4_wq *wq, struct t4_cq *cq, int count) +static void advance_oldest_read(struct t4_wq *wq); + +int c4iw_flush_sq(struct c4iw_qp *qhp) { int flushed = 0; - struct t4_swsqe *swsqe = &wq->sq.sw_sq[wq->sq.cidx + count]; - int in_use = wq->sq.in_use - count; + struct t4_wq *wq = &qhp->wq; + struct c4iw_cq *chp = to_c4iw_cq(qhp->ibqp.send_cq); + struct t4_cq *cq = &chp->cq; + int idx; + struct t4_swsqe *swsqe; - BUG_ON(in_use < 0); - while (in_use--) { - swsqe->signaled = 0; + if (wq->sq.flush_cidx == -1) + wq->sq.flush_cidx = wq->sq.cidx; + idx = wq->sq.flush_cidx; + BUG_ON(idx >= wq->sq.size); + while (idx != wq->sq.pidx) { + swsqe = &wq->sq.sw_sq[idx]; + BUG_ON(swsqe->flushed); + swsqe->flushed = 1; insert_sq_cqe(wq, cq, swsqe); - swsqe++; - if (swsqe == (wq->sq.sw_sq + wq->sq.size)) - swsqe = wq->sq.sw_sq; + if (wq->sq.oldest_read == swsqe) { + BUG_ON(swsqe->opcode != FW_RI_READ_REQ); + advance_oldest_read(wq); + } flushed++; + if (++idx == wq->sq.size) + idx = 0; } + wq->sq.flush_cidx += flushed; + if (wq->sq.flush_cidx >= wq->sq.size) + wq->sq.flush_cidx -= wq->sq.size; return flushed; } +static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq) +{ + struct t4_swsqe *swsqe; + int cidx; + + if (wq->sq.flush_cidx == -1) + wq->sq.flush_cidx = wq->sq.cidx; + cidx = wq->sq.flush_cidx; + BUG_ON(cidx > wq->sq.size); + + while (cidx != wq->sq.pidx) { + swsqe = &wq->sq.sw_sq[cidx]; + if (!swsqe->signaled) { + if (++cidx == wq->sq.size) + cidx = 0; + } else if (swsqe->complete) { + + BUG_ON(swsqe->flushed); + + /* + * Insert this completed cqe into the swcq. + */ + PDBG("%s moving cqe into swcq sq idx %u cq idx %u\n", + __func__, cidx, cq->sw_pidx); + swsqe->cqe.header |= htonl(V_CQE_SWCQE(1)); + cq->sw_queue[cq->sw_pidx] = swsqe->cqe; + t4_swcq_produce(cq); + swsqe->flushed = 1; + if (++cidx == wq->sq.size) + cidx = 0; + wq->sq.flush_cidx = cidx; + } else + break; + } +} + +static void create_read_req_cqe(struct t4_wq *wq, struct t4_cqe *hw_cqe, + struct t4_cqe *read_cqe) +{ + read_cqe->u.scqe.cidx = wq->sq.oldest_read->idx; + read_cqe->len = htonl(wq->sq.oldest_read->read_len); + read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(hw_cqe)) | + V_CQE_SWCQE(SW_CQE(hw_cqe)) | + V_CQE_OPCODE(FW_RI_READ_REQ) | + V_CQE_TYPE(1)); + read_cqe->bits_type_ts = hw_cqe->bits_type_ts; +} + +static void advance_oldest_read(struct t4_wq *wq) +{ + + u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1; + + if (rptr == wq->sq.size) + rptr = 0; + while (rptr != wq->sq.pidx) { + wq->sq.oldest_read = &wq->sq.sw_sq[rptr]; + + if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ) + return; + if (++rptr == wq->sq.size) + rptr = 0; + } + wq->sq.oldest_read = NULL; +} + /* * Move all CQEs from the HWCQ into the SWCQ. + * Deal with out-of-order and/or completions that complete + * prior unsignalled WRs. */ -void c4iw_flush_hw_cq(struct t4_cq *cq) +void c4iw_flush_hw_cq(struct c4iw_cq *chp) { - struct t4_cqe *cqe = NULL, *swcqe; + struct t4_cqe *hw_cqe, *swcqe, read_cqe; + struct c4iw_qp *qhp; + struct t4_swsqe *swsqe; int ret; - PDBG("%s cq %p cqid 0x%x\n", __func__, cq, cq->cqid); - ret = t4_next_hw_cqe(cq, &cqe); + PDBG("%s cqid 0x%x\n", __func__, chp->cq.cqid); + ret = t4_next_hw_cqe(&chp->cq, &hw_cqe); + + /* + * This logic is similar to poll_cq(), but not quite the same + * unfortunately. Need to move pertinent HW CQEs to the SW CQ but + * also do any translation magic that poll_cq() normally does. + */ while (!ret) { - PDBG("%s flushing hwcq cidx 0x%x swcq pidx 0x%x\n", - __func__, cq->cidx, cq->sw_pidx); - swcqe = &cq->sw_queue[cq->sw_pidx]; - *swcqe = *cqe; - swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1)); - t4_swcq_produce(cq); - t4_hwcq_consume(cq); - ret = t4_next_hw_cqe(cq, &cqe); + qhp = get_qhp(chp->rhp, CQE_QPID(hw_cqe)); + + /* + * drop CQEs with no associated QP + */ + if (qhp == NULL) + goto next_cqe; + + if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) + goto next_cqe; + + if (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP) { + + /* If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(hw_cqe) == 1) + goto next_cqe; + + /* drop peer2peer RTR reads. + */ + if (CQE_WRID_STAG(hw_cqe) == 1) + goto next_cqe; + + /* + * Eat completions for unsignaled read WRs. + */ + if (!qhp->wq.sq.oldest_read->signaled) { + advance_oldest_read(&qhp->wq); + goto next_cqe; + } + + /* + * Don't write to the HWCQ, create a new read req CQE + * in local memory and move it into the swcq. + */ + create_read_req_cqe(&qhp->wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(&qhp->wq); + } + + /* if its a SQ completion, then do the magic to move all the + * unsignaled and now in-order completions into the swcq. + */ + if (SQ_TYPE(hw_cqe)) { + swsqe = &qhp->wq.sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)]; + swsqe->cqe = *hw_cqe; + swsqe->complete = 1; + flush_completed_wrs(&qhp->wq, &chp->cq); + } else { + swcqe = &chp->cq.sw_queue[chp->cq.sw_pidx]; + *swcqe = *hw_cqe; + swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1)); + t4_swcq_produce(&chp->cq); + } +next_cqe: + t4_hwcq_consume(&chp->cq); + ret = t4_next_hw_cqe(&chp->cq, &hw_cqe); } } @@ -281,25 +425,6 @@ static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq) return 1; } -void c4iw_count_scqes(struct t4_cq *cq, struct t4_wq *wq, int *count) -{ - struct t4_cqe *cqe; - u32 ptr; - - *count = 0; - ptr = cq->sw_cidx; - while (ptr != cq->sw_pidx) { - cqe = &cq->sw_queue[ptr]; - if ((SQ_TYPE(cqe) || ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) && - wq->sq.oldest_read)) && - (CQE_QPID(cqe) == wq->sq.qid)) - (*count)++; - if (++ptr == cq->size) - ptr = 0; - } - PDBG("%s cq %p count %d\n", __func__, cq, *count); -} - void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count) { struct t4_cqe *cqe; @@ -319,70 +444,6 @@ void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count) PDBG("%s cq %p count %d\n", __func__, cq, *count); } -static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq) -{ - struct t4_swsqe *swsqe; - u16 ptr = wq->sq.cidx; - int count = wq->sq.in_use; - int unsignaled = 0; - - swsqe = &wq->sq.sw_sq[ptr]; - while (count--) - if (!swsqe->signaled) { - if (++ptr == wq->sq.size) - ptr = 0; - swsqe = &wq->sq.sw_sq[ptr]; - unsignaled++; - } else if (swsqe->complete) { - - /* - * Insert this completed cqe into the swcq. - */ - PDBG("%s moving cqe into swcq sq idx %u cq idx %u\n", - __func__, ptr, cq->sw_pidx); - swsqe->cqe.header |= htonl(V_CQE_SWCQE(1)); - cq->sw_queue[cq->sw_pidx] = swsqe->cqe; - t4_swcq_produce(cq); - swsqe->signaled = 0; - wq->sq.in_use -= unsignaled; - break; - } else - break; -} - -static void create_read_req_cqe(struct t4_wq *wq, struct t4_cqe *hw_cqe, - struct t4_cqe *read_cqe) -{ - read_cqe->u.scqe.cidx = wq->sq.oldest_read->idx; - read_cqe->len = cpu_to_be32(wq->sq.oldest_read->read_len); - read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(hw_cqe)) | - V_CQE_SWCQE(SW_CQE(hw_cqe)) | - V_CQE_OPCODE(FW_RI_READ_REQ) | - V_CQE_TYPE(1)); - read_cqe->bits_type_ts = hw_cqe->bits_type_ts; -} - -/* - * Return a ptr to the next read wr in the SWSQ or NULL. - */ -static void advance_oldest_read(struct t4_wq *wq) -{ - - u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1; - - if (rptr == wq->sq.size) - rptr = 0; - while (rptr != wq->sq.pidx) { - wq->sq.oldest_read = &wq->sq.sw_sq[rptr]; - - if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ) - return; - if (++rptr == wq->sq.size) - rptr = 0; - } - wq->sq.oldest_read = NULL; -} - /* * poll_cq * @@ -427,6 +488,22 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, } /* + * skip hw cqe's if the wq is flushed. + */ + if (wq->flushed && !SW_CQE(hw_cqe)) { + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * skip TERMINATE cqes... + */ + if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) { + ret = -EAGAIN; + goto skip_cqe; + } + + /* * Gotta tweak READ completions: * 1) the cqe doesn't contain the sq_wptr from the wr. * 2) opcode not reflected from the wr. @@ -435,12 +512,22 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, */ if (RQ_TYPE(hw_cqe) && (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP)) { - /* - * If this is an unsolicited read response, then the read + /* If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(hw_cqe) == 1) { + if (CQE_STATUS(hw_cqe)) + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* If this is an unsolicited read response, then the read * was generated by the kernel driver as part of peer-2-peer * connection setup. So ignore the completion. */ - if (!wq->sq.oldest_read) { + if (CQE_WRID_STAG(hw_cqe) == 1) { if (CQE_STATUS(hw_cqe)) t4_set_wq_in_error(wq); ret = -EAGAIN; @@ -448,6 +535,15 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, } /* + * Eat completions for unsignaled read WRs. + */ + if (!wq->sq.oldest_read->signaled) { + advance_oldest_read(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* * Don't write to the HWCQ, so create a new read req CQE * in local memory. */ @@ -457,14 +553,8 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, } if (CQE_STATUS(hw_cqe) || t4_wq_in_error(wq)) { - *cqe_flushed = t4_wq_in_error(wq); + *cqe_flushed = (CQE_STATUS(hw_cqe) == T4_ERR_SWFLUSH); t4_set_wq_in_error(wq); - goto proc_cqe; - } - - if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) { - ret = -EAGAIN; - goto skip_cqe; } /* @@ -523,7 +613,24 @@ proc_cqe: * completion. */ if (SQ_TYPE(hw_cqe)) { - wq->sq.cidx = CQE_WRID_SQ_IDX(hw_cqe); + int idx = CQE_WRID_SQ_IDX(hw_cqe); + BUG_ON(idx >= wq->sq.size); + + /* + * Account for any unsignaled completions completed by + * this signaled completion. In this case, cidx points + * to the first unsignaled one, and idx points to the + * signaled one. So adjust in_use based on this delta. + * if this is not completing any unsigned wrs, then the + * delta will be 0. Handle wrapping also! + */ + if (idx < wq->sq.cidx) + wq->sq.in_use -= wq->sq.size + idx - wq->sq.cidx; + else + wq->sq.in_use -= idx - wq->sq.cidx; + BUG_ON(wq->sq.in_use <= 0 && wq->sq.in_use >= wq->sq.size); + + wq->sq.cidx = (uint16_t)idx; PDBG("%s completing sq idx %u\n", __func__, wq->sq.cidx); *cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id; t4_sq_consume(wq); @@ -532,6 +639,7 @@ proc_cqe: *cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id; BUG_ON(t4_rq_empty(wq)); t4_rq_consume(wq); + goto skip_cqe; } flush_wq: @@ -565,7 +673,7 @@ skip_cqe: static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) { struct c4iw_qp *qhp = NULL; - struct t4_cqe cqe = {0, 0}, *rd_cqe; + struct t4_cqe uninitialized_var(cqe), *rd_cqe; struct t4_wq *wq; u32 credit = 0; u8 cqe_flushed; @@ -763,6 +871,9 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, rhp = to_c4iw_dev(ibdev); + if (vector >= rhp->rdev.lldi.nciq) + return ERR_PTR(-EINVAL); + chp = kzalloc(sizeof(*chp), GFP_KERNEL); if (!chp) return ERR_PTR(-ENOMEM); @@ -784,7 +895,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, /* * Make actual HW queue 2x to avoid cdix_inc overflows. */ - hwentries = entries * 2; + hwentries = min(entries * 2, T4_MAX_IQ_SIZE); /* * Make HW queue at least 64 entries so GTS updates aren't too @@ -808,6 +919,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, } chp->cq.size = hwentries; chp->cq.memsize = memsize; + chp->cq.vector = vector; ret = create_cq(&rhp->rdev, &chp->cq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx); @@ -843,7 +955,8 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, uresp.gts_key = ucontext->key; ucontext->key += PAGE_SIZE; spin_unlock(&ucontext->mmap_lock); - ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); if (ret) goto err5; diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index cb4ecd78370..7db82b24302 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -41,10 +41,20 @@ #define DRV_VERSION "0.1" MODULE_AUTHOR("Steve Wise"); -MODULE_DESCRIPTION("Chelsio T4 RDMA Driver"); +MODULE_DESCRIPTION("Chelsio T4/T5 RDMA Driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); +static int allow_db_fc_on_t5; +module_param(allow_db_fc_on_t5, int, 0644); +MODULE_PARM_DESC(allow_db_fc_on_t5, + "Allow DB Flow Control on T5 (default = 0)"); + +static int allow_db_coalescing_on_t5; +module_param(allow_db_coalescing_on_t5, int, 0644); +MODULE_PARM_DESC(allow_db_coalescing_on_t5, + "Allow DB Coalescing on T5 (default = 0)"); + struct uld_ctx { struct list_head entry; struct cxgb4_lld_info lldi; @@ -54,6 +64,10 @@ struct uld_ctx { static LIST_HEAD(uld_ctx_list); static DEFINE_MUTEX(dev_mutex); +#define DB_FC_RESUME_SIZE 64 +#define DB_FC_RESUME_DELAY 1 +#define DB_FC_DRAIN_THRESH 0 + static struct dentry *c4iw_debugfs_root; struct c4iw_debugfs_data { @@ -63,6 +77,16 @@ struct c4iw_debugfs_data { int pos; }; +/* registered cxgb4 netlink callbacks */ +static struct ibnl_client_cbs c4iw_nl_cb_table[] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; + static int count_idrs(int id, void *p, void *data) { int *countp = data; @@ -93,18 +117,57 @@ static int dump_qp(int id, void *p, void *data) if (space == 0) return 1; - if (qp->ep) - cc = snprintf(qpd->buf + qpd->pos, space, - "qp sq id %u rq id %u state %u onchip %u " - "ep tid %u state %u %pI4:%u->%pI4:%u\n", - qp->wq.sq.qid, qp->wq.rq.qid, (int)qp->attr.state, - qp->wq.sq.flags & T4_SQ_ONCHIP, - qp->ep->hwtid, (int)qp->ep->com.state, - &qp->ep->com.local_addr.sin_addr.s_addr, - ntohs(qp->ep->com.local_addr.sin_port), - &qp->ep->com.remote_addr.sin_addr.s_addr, - ntohs(qp->ep->com.remote_addr.sin_port)); - else + if (qp->ep) { + if (qp->ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin = (struct sockaddr_in *) + &qp->ep->com.local_addr; + struct sockaddr_in *rsin = (struct sockaddr_in *) + &qp->ep->com.remote_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &qp->ep->com.mapped_local_addr; + struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) + &qp->ep->com.mapped_remote_addr; + + cc = snprintf(qpd->buf + qpd->pos, space, + "rc qp sq id %u rq id %u state %u " + "onchip %u ep tid %u state %u " + "%pI4:%u/%u->%pI4:%u/%u\n", + qp->wq.sq.qid, qp->wq.rq.qid, + (int)qp->attr.state, + qp->wq.sq.flags & T4_SQ_ONCHIP, + qp->ep->hwtid, (int)qp->ep->com.state, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(mapped_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(mapped_rsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) + &qp->ep->com.local_addr; + struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) + &qp->ep->com.remote_addr; + struct sockaddr_in6 *mapped_lsin6 = + (struct sockaddr_in6 *) + &qp->ep->com.mapped_local_addr; + struct sockaddr_in6 *mapped_rsin6 = + (struct sockaddr_in6 *) + &qp->ep->com.mapped_remote_addr; + + cc = snprintf(qpd->buf + qpd->pos, space, + "rc qp sq id %u rq id %u state %u " + "onchip %u ep tid %u state %u " + "%pI6:%u/%u->%pI6:%u/%u\n", + qp->wq.sq.qid, qp->wq.rq.qid, + (int)qp->attr.state, + qp->wq.sq.flags & T4_SQ_ONCHIP, + qp->ep->hwtid, (int)qp->ep->com.state, + &lsin6->sin6_addr, + ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port), + &rsin6->sin6_addr, + ntohs(rsin6->sin6_port), + ntohs(mapped_rsin6->sin6_port)); + } + } else cc = snprintf(qpd->buf + qpd->pos, space, "qp sq id %u rq id %u state %u onchip %u\n", qp->wq.sq.qid, qp->wq.rq.qid, @@ -247,7 +310,7 @@ static const struct file_operations stag_debugfs_fops = { .llseek = default_llseek, }; -static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY"}; +static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY", "STOPPED"}; static int stats_show(struct seq_file *seq, void *v) { @@ -276,9 +339,15 @@ static int stats_show(struct seq_file *seq, void *v) seq_printf(seq, " DB FULL: %10llu\n", dev->rdev.stats.db_full); seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty); seq_printf(seq, " DB DROP: %10llu\n", dev->rdev.stats.db_drop); - seq_printf(seq, " DB State: %s Transitions %llu\n", + seq_printf(seq, " DB State: %s Transitions %llu FC Interruptions %llu\n", db_state_str[dev->db_state], - dev->rdev.stats.db_state_transitions); + dev->rdev.stats.db_state_transitions, + dev->rdev.stats.db_fc_interruptions); + seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full); + seq_printf(seq, "ACT_OFLD_CONN_FAILS: %10llu\n", + dev->rdev.stats.act_ofld_conn_fails); + seq_printf(seq, "PAS_OFLD_CONN_FAILS: %10llu\n", + dev->rdev.stats.pas_ofld_conn_fails); return 0; } @@ -309,6 +378,9 @@ static ssize_t stats_clear(struct file *file, const char __user *buf, dev->rdev.stats.db_empty = 0; dev->rdev.stats.db_drop = 0; dev->rdev.stats.db_state_transitions = 0; + dev->rdev.stats.tcam_full = 0; + dev->rdev.stats.act_ofld_conn_fails = 0; + dev->rdev.stats.pas_ofld_conn_fails = 0; mutex_unlock(&dev->rdev.stats.lock); return count; } @@ -322,6 +394,168 @@ static const struct file_operations stats_debugfs_fops = { .write = stats_clear, }; +static int dump_ep(int id, void *p, void *data) +{ + struct c4iw_ep *ep = p; + struct c4iw_debugfs_data *epd = data; + int space; + int cc; + + space = epd->bufsize - epd->pos - 1; + if (space == 0) + return 1; + + if (ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin = (struct sockaddr_in *) + &ep->com.local_addr; + struct sockaddr_in *rsin = (struct sockaddr_in *) + &ep->com.remote_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) + &ep->com.mapped_remote_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p qp %p state %d flags 0x%lx " + "history 0x%lx hwtid %d atid %d " + "%pI4:%d/%d <-> %pI4:%d/%d\n", + ep, ep->com.cm_id, ep->com.qp, + (int)ep->com.state, ep->com.flags, + ep->com.history, ep->hwtid, ep->atid, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(mapped_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(mapped_rsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) + &ep->com.local_addr; + struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) + &ep->com.remote_addr; + struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + struct sockaddr_in6 *mapped_rsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_remote_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p qp %p state %d flags 0x%lx " + "history 0x%lx hwtid %d atid %d " + "%pI6:%d/%d <-> %pI6:%d/%d\n", + ep, ep->com.cm_id, ep->com.qp, + (int)ep->com.state, ep->com.flags, + ep->com.history, ep->hwtid, ep->atid, + &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port), + &rsin6->sin6_addr, ntohs(rsin6->sin6_port), + ntohs(mapped_rsin6->sin6_port)); + } + if (cc < space) + epd->pos += cc; + return 0; +} + +static int dump_listen_ep(int id, void *p, void *data) +{ + struct c4iw_listen_ep *ep = p; + struct c4iw_debugfs_data *epd = data; + int space; + int cc; + + space = epd->bufsize - epd->pos - 1; + if (space == 0) + return 1; + + if (ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin = (struct sockaddr_in *) + &ep->com.local_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p state %d flags 0x%lx stid %d " + "backlog %d %pI4:%d/%d\n", + ep, ep->com.cm_id, (int)ep->com.state, + ep->com.flags, ep->stid, ep->backlog, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(mapped_lsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) + &ep->com.local_addr; + struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p state %d flags 0x%lx stid %d " + "backlog %d %pI6:%d/%d\n", + ep, ep->com.cm_id, (int)ep->com.state, + ep->com.flags, ep->stid, ep->backlog, + &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port)); + } + if (cc < space) + epd->pos += cc; + return 0; +} + +static int ep_release(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *epd = file->private_data; + if (!epd) { + pr_info("%s null qpd?\n", __func__); + return 0; + } + vfree(epd->buf); + kfree(epd); + return 0; +} + +static int ep_open(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *epd; + int ret = 0; + int count = 1; + + epd = kmalloc(sizeof(*epd), GFP_KERNEL); + if (!epd) { + ret = -ENOMEM; + goto out; + } + epd->devp = inode->i_private; + epd->pos = 0; + + spin_lock_irq(&epd->devp->lock); + idr_for_each(&epd->devp->hwtid_idr, count_idrs, &count); + idr_for_each(&epd->devp->atid_idr, count_idrs, &count); + idr_for_each(&epd->devp->stid_idr, count_idrs, &count); + spin_unlock_irq(&epd->devp->lock); + + epd->bufsize = count * 160; + epd->buf = vmalloc(epd->bufsize); + if (!epd->buf) { + ret = -ENOMEM; + goto err1; + } + + spin_lock_irq(&epd->devp->lock); + idr_for_each(&epd->devp->hwtid_idr, dump_ep, epd); + idr_for_each(&epd->devp->atid_idr, dump_ep, epd); + idr_for_each(&epd->devp->stid_idr, dump_listen_ep, epd); + spin_unlock_irq(&epd->devp->lock); + + file->private_data = epd; + goto out; +err1: + kfree(epd); +out: + return ret; +} + +static const struct file_operations ep_debugfs_fops = { + .owner = THIS_MODULE, + .open = ep_open, + .release = ep_release, + .read = debugfs_read, +}; + static int setup_debugfs(struct c4iw_dev *devp) { struct dentry *de; @@ -344,6 +578,11 @@ static int setup_debugfs(struct c4iw_dev *devp) if (de && de->d_inode) de->d_inode->i_size = 4096; + de = debugfs_create_file("eps", S_IWUSR, devp->debugfs_root, + (void *)devp, &ep_debugfs_fops); + if (de && de->d_inode) + de->d_inode->i_size = 4096; + return 0; } @@ -410,10 +649,10 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.start, rdev->lldi.vr->cq.size); - PDBG("udb len 0x%x udb base %p db_reg %p gts_reg %p qpshift %lu " + PDBG("udb len 0x%x udb base %llx db_reg %p gts_reg %p qpshift %lu " "qpmask 0x%x cqshift %lu cqmask 0x%x\n", (unsigned)pci_resource_len(rdev->lldi.pdev, 2), - (void *)pci_resource_start(rdev->lldi.pdev, 2), + (u64)pci_resource_start(rdev->lldi.pdev, 2), rdev->lldi.db_reg, rdev->lldi.gts_reg, rdev->qpshift, rdev->qpmask, @@ -451,6 +690,13 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err); goto err4; } + rdev->status_page = (struct t4_dev_status_page *) + __get_free_page(GFP_KERNEL); + if (!rdev->status_page) { + pr_err(MOD "error allocating status page\n"); + goto err4; + } + rdev->status_page->db_off = 0; return 0; err4: c4iw_rqtpool_destroy(rdev); @@ -464,6 +710,7 @@ err1: static void c4iw_rdev_close(struct c4iw_rdev *rdev) { + free_page((unsigned long)rdev->status_page); c4iw_pblpool_destroy(rdev); c4iw_rqtpool_destroy(rdev); c4iw_destroy_resource(&rdev->resource); @@ -475,7 +722,13 @@ static void c4iw_dealloc(struct uld_ctx *ctx) idr_destroy(&ctx->dev->cqidr); idr_destroy(&ctx->dev->qpidr); idr_destroy(&ctx->dev->mmidr); - iounmap(ctx->dev->rdev.oc_mw_kva); + idr_destroy(&ctx->dev->hwtid_idr); + idr_destroy(&ctx->dev->stid_idr); + idr_destroy(&ctx->dev->atid_idr); + if (ctx->dev->rdev.bar2_kva) + iounmap(ctx->dev->rdev.bar2_kva); + if (ctx->dev->rdev.oc_mw_kva) + iounmap(ctx->dev->rdev.oc_mw_kva); ib_dealloc_device(&ctx->dev->ibdev); ctx->dev = NULL; } @@ -491,7 +744,7 @@ static int rdma_supported(const struct cxgb4_lld_info *infop) { return infop->vr->stag.size > 0 && infop->vr->pbl.size > 0 && infop->vr->rq.size > 0 && infop->vr->qp.size > 0 && - infop->vr->cq.size > 0 && infop->vr->ocq.size > 0; + infop->vr->cq.size > 0; } static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) @@ -504,6 +757,10 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) pci_name(infop->pdev)); return ERR_PTR(-ENOSYS); } + if (!ocqp_supported(infop)) + pr_info("%s: On-Chip Queues not supported on this device.\n", + pci_name(infop->pdev)); + devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp)); if (!devp) { printk(KERN_ERR MOD "Cannot allocate ib device\n"); @@ -511,11 +768,33 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) } devp->rdev.lldi = *infop; - devp->rdev.oc_mw_pa = pci_resource_start(devp->rdev.lldi.pdev, 2) + - (pci_resource_len(devp->rdev.lldi.pdev, 2) - - roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size)); - devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa, - devp->rdev.lldi.vr->ocq.size); + /* + * For T5 devices, we map all of BAR2 with WC. + * For T4 devices with onchip qp mem, we map only that part + * of BAR2 with WC. + */ + devp->rdev.bar2_pa = pci_resource_start(devp->rdev.lldi.pdev, 2); + if (is_t5(devp->rdev.lldi.adapter_type)) { + devp->rdev.bar2_kva = ioremap_wc(devp->rdev.bar2_pa, + pci_resource_len(devp->rdev.lldi.pdev, 2)); + if (!devp->rdev.bar2_kva) { + pr_err(MOD "Unable to ioremap BAR2\n"); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(-EINVAL); + } + } else if (ocqp_supported(infop)) { + devp->rdev.oc_mw_pa = + pci_resource_start(devp->rdev.lldi.pdev, 2) + + pci_resource_len(devp->rdev.lldi.pdev, 2) - + roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size); + devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa, + devp->rdev.lldi.vr->ocq.size); + if (!devp->rdev.oc_mw_kva) { + pr_err(MOD "Unable to ioremap onchip mem\n"); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(-EINVAL); + } + } PDBG(KERN_INFO MOD "ocq memory: " "hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n", @@ -532,9 +811,13 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) idr_init(&devp->cqidr); idr_init(&devp->qpidr); idr_init(&devp->mmidr); + idr_init(&devp->hwtid_idr); + idr_init(&devp->stid_idr); + idr_init(&devp->atid_idr); spin_lock_init(&devp->lock); mutex_init(&devp->rdev.stats.lock); mutex_init(&devp->db_mutex); + INIT_LIST_HEAD(&devp->db_fc_list); if (c4iw_debugfs_root) { devp->debugfs_root = debugfs_create_dir( @@ -542,6 +825,8 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) c4iw_debugfs_root); setup_debugfs(devp); } + + return devp; } @@ -552,8 +837,8 @@ static void *c4iw_uld_add(const struct cxgb4_lld_info *infop) int i; if (!vers_printed++) - printk(KERN_INFO MOD "Chelsio T4 RDMA Driver - version %s\n", - DRV_VERSION); + pr_info("Chelsio T4/T5 RDMA Driver - version %s\n", + DRV_VERSION); ctx = kzalloc(sizeof *ctx, GFP_KERNEL); if (!ctx) { @@ -577,14 +862,76 @@ out: return ctx; } +static inline struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl, + const __be64 *rsp, + u32 pktshift) +{ + struct sk_buff *skb; + + /* + * Allocate space for cpl_pass_accept_req which will be synthesized by + * driver. Once the driver synthesizes the request the skb will go + * through the regular cpl_pass_accept_req processing. + * The math here assumes sizeof cpl_pass_accept_req >= sizeof + * cpl_rx_pkt. + */ + skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header) - pktshift, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + __skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header) - pktshift); + + /* + * This skb will contain: + * rss_header from the rspq descriptor (1 flit) + * cpl_rx_pkt struct from the rspq descriptor (2 flits) + * space for the difference between the size of an + * rx_pkt and pass_accept_req cpl (1 flit) + * the packet data from the gl + */ + skb_copy_to_linear_data(skb, rsp, sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header)); + skb_copy_to_linear_data_offset(skb, sizeof(struct rss_header) + + sizeof(struct cpl_pass_accept_req), + gl->va + pktshift, + gl->tot_len - pktshift); + return skb; +} + +static inline int recv_rx_pkt(struct c4iw_dev *dev, const struct pkt_gl *gl, + const __be64 *rsp) +{ + unsigned int opcode = *(u8 *)rsp; + struct sk_buff *skb; + + if (opcode != CPL_RX_PKT) + goto out; + + skb = copy_gl_to_skb_pkt(gl , rsp, dev->rdev.lldi.sge_pktshift); + if (skb == NULL) + goto out; + + if (c4iw_handlers[opcode] == NULL) { + pr_info("%s no handler opcode 0x%x...\n", __func__, + opcode); + kfree_skb(skb); + goto out; + } + c4iw_handlers[opcode](dev, skb); + return 1; +out: + return 0; +} + static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp, const struct pkt_gl *gl) { struct uld_ctx *ctx = handle; struct c4iw_dev *dev = ctx->dev; struct sk_buff *skb; - const struct cpl_act_establish *rpl; - unsigned int opcode; + u8 opcode; if (gl == NULL) { /* omit RSS and rsp_ctrl at end of descriptor */ @@ -601,20 +948,33 @@ static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp, u32 qid = be32_to_cpu(rc->pldbuflen_qid); c4iw_ev_handler(dev, qid); return 0; + } else if (unlikely(*(u8 *)rsp != *(u8 *)gl->va)) { + if (recv_rx_pkt(dev, gl, rsp)) + return 0; + + pr_info("%s: unexpected FL contents at %p, " \ + "RSS %#llx, FL %#llx, len %u\n", + pci_name(ctx->lldi.pdev), gl->va, + (unsigned long long)be64_to_cpu(*rsp), + (unsigned long long)be64_to_cpu( + *(__force __be64 *)gl->va), + gl->tot_len); + + return 0; } else { skb = cxgb4_pktgl_to_skb(gl, 128, 128); if (unlikely(!skb)) goto nomem; } - rpl = cplhdr(skb); - opcode = rpl->ot.opcode; - - if (c4iw_handlers[opcode]) + opcode = *(u8 *)rsp; + if (c4iw_handlers[opcode]) { c4iw_handlers[opcode](dev, skb); - else - printk(KERN_INFO "%s no handler opcode 0x%x...\n", __func__, + } else { + pr_info("%s no handler opcode 0x%x...\n", __func__, opcode); + kfree_skb(skb); + } return 0; nomem: @@ -690,13 +1050,16 @@ static int disable_qp_db(int id, void *p, void *data) static void stop_queues(struct uld_ctx *ctx) { - spin_lock_irq(&ctx->dev->lock); - if (ctx->dev->db_state == NORMAL) { - ctx->dev->rdev.stats.db_state_transitions++; - ctx->dev->db_state = FLOW_CONTROL; + unsigned long flags; + + spin_lock_irqsave(&ctx->dev->lock, flags); + ctx->dev->rdev.stats.db_state_transitions++; + ctx->dev->db_state = STOPPED; + if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL); - } - spin_unlock_irq(&ctx->dev->lock); + else + ctx->dev->rdev.status_page->db_off = 1; + spin_unlock_irqrestore(&ctx->dev->lock, flags); } static int enable_qp_db(int id, void *p, void *data) @@ -707,15 +1070,72 @@ static int enable_qp_db(int id, void *p, void *data) return 0; } +static void resume_rc_qp(struct c4iw_qp *qp) +{ + spin_lock(&qp->lock); + t4_ring_sq_db(&qp->wq, qp->wq.sq.wq_pidx_inc, + is_t5(qp->rhp->rdev.lldi.adapter_type), NULL); + qp->wq.sq.wq_pidx_inc = 0; + t4_ring_rq_db(&qp->wq, qp->wq.rq.wq_pidx_inc, + is_t5(qp->rhp->rdev.lldi.adapter_type), NULL); + qp->wq.rq.wq_pidx_inc = 0; + spin_unlock(&qp->lock); +} + +static void resume_a_chunk(struct uld_ctx *ctx) +{ + int i; + struct c4iw_qp *qp; + + for (i = 0; i < DB_FC_RESUME_SIZE; i++) { + qp = list_first_entry(&ctx->dev->db_fc_list, struct c4iw_qp, + db_fc_entry); + list_del_init(&qp->db_fc_entry); + resume_rc_qp(qp); + if (list_empty(&ctx->dev->db_fc_list)) + break; + } +} + static void resume_queues(struct uld_ctx *ctx) { spin_lock_irq(&ctx->dev->lock); - if (ctx->dev->qpcnt <= db_fc_threshold && - ctx->dev->db_state == FLOW_CONTROL) { - ctx->dev->db_state = NORMAL; - ctx->dev->rdev.stats.db_state_transitions++; - idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL); + if (ctx->dev->db_state != STOPPED) + goto out; + ctx->dev->db_state = FLOW_CONTROL; + while (1) { + if (list_empty(&ctx->dev->db_fc_list)) { + WARN_ON(ctx->dev->db_state != FLOW_CONTROL); + ctx->dev->db_state = NORMAL; + ctx->dev->rdev.stats.db_state_transitions++; + if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) { + idr_for_each(&ctx->dev->qpidr, enable_qp_db, + NULL); + } else { + ctx->dev->rdev.status_page->db_off = 0; + } + break; + } else { + if (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) + < (ctx->dev->rdev.lldi.dbfifo_int_thresh << + DB_FC_DRAIN_THRESH)) { + resume_a_chunk(ctx); + } + if (!list_empty(&ctx->dev->db_fc_list)) { + spin_unlock_irq(&ctx->dev->lock); + if (DB_FC_RESUME_DELAY) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(DB_FC_RESUME_DELAY); + } + spin_lock_irq(&ctx->dev->lock); + if (ctx->dev->db_state != FLOW_CONTROL) + break; + } + } } +out: + if (ctx->dev->db_state != NORMAL) + ctx->dev->rdev.stats.db_fc_interruptions++; spin_unlock_irq(&ctx->dev->lock); } @@ -741,12 +1161,12 @@ static int count_qps(int id, void *p, void *data) return 0; } -static void deref_qps(struct qp_list qp_list) +static void deref_qps(struct qp_list *qp_list) { int idx; - for (idx = 0; idx < qp_list.idx; idx++) - c4iw_qp_rem_ref(&qp_list.qps[idx]->ibqp); + for (idx = 0; idx < qp_list->idx; idx++) + c4iw_qp_rem_ref(&qp_list->qps[idx]->ibqp); } static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list) @@ -757,17 +1177,22 @@ static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list) for (idx = 0; idx < qp_list->idx; idx++) { struct c4iw_qp *qp = qp_list->qps[idx]; + spin_lock_irq(&qp->rhp->lock); + spin_lock(&qp->lock); ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0], qp->wq.sq.qid, t4_sq_host_wq_pidx(&qp->wq), t4_sq_wq_size(&qp->wq)); if (ret) { - printk(KERN_ERR MOD "%s: Fatal error - " + pr_err(KERN_ERR MOD "%s: Fatal error - " "DB overflow recovery failed - " "error syncing SQ qid %u\n", pci_name(ctx->lldi.pdev), qp->wq.sq.qid); + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); return; } + qp->wq.sq.wq_pidx_inc = 0; ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0], qp->wq.rq.qid, @@ -775,12 +1200,17 @@ static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list) t4_rq_wq_size(&qp->wq)); if (ret) { - printk(KERN_ERR MOD "%s: Fatal error - " + pr_err(KERN_ERR MOD "%s: Fatal error - " "DB overflow recovery failed - " "error syncing RQ qid %u\n", pci_name(ctx->lldi.pdev), qp->wq.rq.qid); + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); return; } + qp->wq.rq.wq_pidx_inc = 0; + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); /* Wait for the dbfifo to drain */ while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) { @@ -796,36 +1226,22 @@ static void recover_queues(struct uld_ctx *ctx) struct qp_list qp_list; int ret; - /* lock out kernel db ringers */ - mutex_lock(&ctx->dev->db_mutex); - - /* put all queues in to recovery mode */ - spin_lock_irq(&ctx->dev->lock); - ctx->dev->db_state = RECOVERY; - ctx->dev->rdev.stats.db_state_transitions++; - idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL); - spin_unlock_irq(&ctx->dev->lock); - /* slow everybody down */ set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(usecs_to_jiffies(1000)); - /* Wait for the dbfifo to completely drain. */ - while (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) > 0) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(usecs_to_jiffies(10)); - } - /* flush the SGE contexts */ ret = cxgb4_flush_eq_cache(ctx->dev->rdev.lldi.ports[0]); if (ret) { printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n", pci_name(ctx->lldi.pdev)); - goto out; + return; } /* Count active queues so we can build a list of queues to recover */ spin_lock_irq(&ctx->dev->lock); + WARN_ON(ctx->dev->db_state != STOPPED); + ctx->dev->db_state = RECOVERY; idr_for_each(&ctx->dev->qpidr, count_qps, &count); qp_list.qps = kzalloc(count * sizeof *qp_list.qps, GFP_ATOMIC); @@ -833,7 +1249,7 @@ static void recover_queues(struct uld_ctx *ctx) printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n", pci_name(ctx->lldi.pdev)); spin_unlock_irq(&ctx->dev->lock); - goto out; + return; } qp_list.idx = 0; @@ -846,29 +1262,13 @@ static void recover_queues(struct uld_ctx *ctx) recover_lost_dbs(ctx, &qp_list); /* we're almost done! deref the qps and clean up */ - deref_qps(qp_list); + deref_qps(&qp_list); kfree(qp_list.qps); - /* Wait for the dbfifo to completely drain again */ - while (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) > 0) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(usecs_to_jiffies(10)); - } - - /* resume the queues */ spin_lock_irq(&ctx->dev->lock); - if (ctx->dev->qpcnt > db_fc_threshold) - ctx->dev->db_state = FLOW_CONTROL; - else { - ctx->dev->db_state = NORMAL; - idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL); - } - ctx->dev->rdev.stats.db_state_transitions++; + WARN_ON(ctx->dev->db_state != RECOVERY); + ctx->dev->db_state = STOPPED; spin_unlock_irq(&ctx->dev->lock); - -out: - /* start up kernel db ringers again */ - mutex_unlock(&ctx->dev->db_mutex); } static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...) @@ -878,9 +1278,7 @@ static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...) switch (control) { case CXGB4_CONTROL_DB_FULL: stop_queues(ctx); - mutex_lock(&ctx->dev->rdev.stats.lock); ctx->dev->rdev.stats.db_full++; - mutex_unlock(&ctx->dev->rdev.stats.lock); break; case CXGB4_CONTROL_DB_EMPTY: resume_queues(ctx); @@ -923,6 +1321,20 @@ static int __init c4iw_init_module(void) printk(KERN_WARNING MOD "could not create debugfs entry, continuing\n"); + if (ibnl_add_client(RDMA_NL_C4IW, RDMA_NL_IWPM_NUM_OPS, + c4iw_nl_cb_table)) + pr_err("%s[%u]: Failed to add netlink callback\n" + , __func__, __LINE__); + + err = iwpm_init(RDMA_NL_C4IW); + if (err) { + pr_err("port mapper initialization failed with %d\n", err); + ibnl_remove_client(RDMA_NL_C4IW); + c4iw_cm_term(); + debugfs_remove_recursive(c4iw_debugfs_root); + return err; + } + cxgb4_register_uld(CXGB4_ULD_RDMA, &c4iw_uld_info); return 0; @@ -940,6 +1352,8 @@ static void __exit c4iw_exit_module(void) } mutex_unlock(&dev_mutex); cxgb4_unregister_uld(CXGB4_ULD_RDMA); + iwpm_exit(RDMA_NL_C4IW); + ibnl_remove_client(RDMA_NL_C4IW); c4iw_cm_term(); debugfs_remove_recursive(c4iw_debugfs_root); } diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c index cf2f6b47617..d61d0a18f78 100644 --- a/drivers/infiniband/hw/cxgb4/ev.c +++ b/drivers/infiniband/hw/cxgb4/ev.c @@ -44,14 +44,6 @@ static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp, struct c4iw_qp_attributes attrs; unsigned long flag; - if ((qhp->attr.state == C4IW_QP_STATE_ERROR) || - (qhp->attr.state == C4IW_QP_STATE_TERMINATE)) { - PDBG("%s AE received after RTS - " - "qp state %d qpid 0x%x status 0x%x\n", __func__, - qhp->attr.state, qhp->wq.sq.qid, CQE_STATUS(err_cqe)); - return; - } - printk(KERN_ERR MOD "AE qpid 0x%x opcode %d status 0x%x " "type %d wrid.hi 0x%x wrid.lo 0x%x\n", CQE_QPID(err_cqe), CQE_OPCODE(err_cqe), diff --git a/drivers/infiniband/hw/cxgb4/id_table.c b/drivers/infiniband/hw/cxgb4/id_table.c index f95e5df30db..0161ae6ad62 100644 --- a/drivers/infiniband/hw/cxgb4/id_table.c +++ b/drivers/infiniband/hw/cxgb4/id_table.c @@ -54,7 +54,7 @@ u32 c4iw_id_alloc(struct c4iw_id_table *alloc) if (obj < alloc->max) { if (alloc->flags & C4IW_ID_TABLE_F_RANDOM) - alloc->last += random32() % RANDOM_SKIP; + alloc->last += prandom_u32() % RANDOM_SKIP; else alloc->last = obj + 1; if (alloc->last >= alloc->max) @@ -88,7 +88,7 @@ int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num, alloc->start = start; alloc->flags = flags; if (flags & C4IW_ID_TABLE_F_RANDOM) - alloc->last = random32() % RANDOM_SKIP; + alloc->last = prandom_u32() % RANDOM_SKIP; else alloc->last = 0; alloc->max = num; diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 9beb3a9f033..361fff7a074 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -52,6 +52,8 @@ #include <rdma/ib_verbs.h> #include <rdma/iw_cm.h> +#include <rdma/rdma_netlink.h> +#include <rdma/iw_portmap.h> #include "cxgb4.h" #include "cxgb4_uld.h" @@ -109,6 +111,7 @@ struct c4iw_dev_ucontext { enum c4iw_rdev_flags { T4_FATAL_ERROR = (1<<0), + T4_STATUS_PAGE_DISABLED = (1<<1), }; struct c4iw_stat { @@ -130,6 +133,10 @@ struct c4iw_stats { u64 db_empty; u64 db_drop; u64 db_state_transitions; + u64 db_fc_interruptions; + u64 tcam_full; + u64 act_ofld_conn_fails; + u64 pas_ofld_conn_fails; }; struct c4iw_rdev { @@ -144,9 +151,12 @@ struct c4iw_rdev { struct gen_pool *ocqp_pool; u32 flags; struct cxgb4_lld_info lldi; + unsigned long bar2_pa; + void __iomem *bar2_kva; unsigned long oc_mw_pa; void __iomem *oc_mw_kva; struct c4iw_stats stats; + struct t4_dev_status_page *status_page; }; static inline int c4iw_fatal_error(struct c4iw_rdev *rdev) @@ -159,7 +169,7 @@ static inline int c4iw_num_stags(struct c4iw_rdev *rdev) return min((int)T4_MAX_NUM_STAG, (int)(rdev->lldi.vr->stag.size >> 5)); } -#define C4IW_WR_TO (10*HZ) +#define C4IW_WR_TO (30*HZ) struct c4iw_wr_wait { struct completion completion; @@ -208,7 +218,8 @@ static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev, enum db_state { NORMAL = 0, FLOW_CONTROL = 1, - RECOVERY = 2 + RECOVERY = 2, + STOPPED = 3 }; struct c4iw_dev { @@ -222,7 +233,10 @@ struct c4iw_dev { struct mutex db_mutex; struct dentry *debugfs_root; enum db_state db_state; - int qpcnt; + struct idr hwtid_idr; + struct idr atid_idr; + struct idr stid_idr; + struct list_head db_fc_list; }; static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev) @@ -254,20 +268,21 @@ static inline int _insert_handle(struct c4iw_dev *rhp, struct idr *idr, void *handle, u32 id, int lock) { int ret; - int newid; - do { - if (!idr_pre_get(idr, lock ? GFP_KERNEL : GFP_ATOMIC)) - return -ENOMEM; - if (lock) - spin_lock_irq(&rhp->lock); - ret = idr_get_new_above(idr, handle, id, &newid); - BUG_ON(!ret && newid != id); - if (lock) - spin_unlock_irq(&rhp->lock); - } while (ret == -EAGAIN); + if (lock) { + idr_preload(GFP_KERNEL); + spin_lock_irq(&rhp->lock); + } + + ret = idr_alloc(idr, handle, id, id + 1, GFP_ATOMIC); - return ret; + if (lock) { + spin_unlock_irq(&rhp->lock); + idr_preload_end(); + } + + BUG_ON(ret == -ENOSPC); + return ret < 0 ? ret : 0; } static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr, @@ -362,7 +377,7 @@ struct c4iw_fr_page_list { DEFINE_DMA_UNMAP_ADDR(mapping); dma_addr_t dma_addr; struct c4iw_dev *dev; - int size; + int pll_len; }; static inline struct c4iw_fr_page_list *to_c4iw_fr_page_list( @@ -422,10 +437,12 @@ struct c4iw_qp_attributes { u8 ecode; u16 sq_db_inc; u16 rq_db_inc; + u8 send_term; }; struct c4iw_qp { struct ib_qp ibqp; + struct list_head db_fc_entry; struct c4iw_dev *rhp; struct c4iw_ep *ep; struct c4iw_qp_attributes attr; @@ -435,6 +452,7 @@ struct c4iw_qp { atomic_t refcnt; wait_queue_head_t wait; struct timer_list timer; + int sq_sig_all; }; static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) @@ -710,6 +728,34 @@ enum c4iw_ep_flags { ABORT_REQ_IN_PROGRESS = 1, RELEASE_RESOURCES = 2, CLOSE_SENT = 3, + TIMEOUT = 4, + QP_REFERENCED = 5, + RELEASE_MAPINFO = 6, +}; + +enum c4iw_ep_history { + ACT_OPEN_REQ = 0, + ACT_OFLD_CONN = 1, + ACT_OPEN_RPL = 2, + ACT_ESTAB = 3, + PASS_ACCEPT_REQ = 4, + PASS_ESTAB = 5, + ABORT_UPCALL = 6, + ESTAB_UPCALL = 7, + CLOSE_UPCALL = 8, + ULP_ACCEPT = 9, + ULP_REJECT = 10, + TIMEDOUT = 11, + PEER_ABORT = 12, + PEER_CLOSE = 13, + CONNREQ_UPCALL = 14, + ABORT_CONN = 15, + DISCONN_UPCALL = 16, + EP_DISC_CLOSE = 17, + EP_DISC_ABORT = 18, + CONN_RPL_UPCALL = 19, + ACT_RETRY_NOMEM = 20, + ACT_RETRY_INUSE = 21 }; struct c4iw_ep_common { @@ -719,10 +765,13 @@ struct c4iw_ep_common { enum c4iw_ep_state state; struct kref kref; struct mutex mutex; - struct sockaddr_in local_addr; - struct sockaddr_in remote_addr; + struct sockaddr_storage local_addr; + struct sockaddr_storage remote_addr; + struct sockaddr_storage mapped_local_addr; + struct sockaddr_storage mapped_remote_addr; struct c4iw_wr_wait wr_wait; unsigned long flags; + unsigned long history; }; struct c4iw_listen_ep { @@ -760,7 +809,49 @@ struct c4iw_ep { u8 tos; u8 retry_with_mpa_v1; u8 tried_with_mpa_v1; -}; + unsigned int retry_count; + int snd_win; + int rcv_win; +}; + +static inline void print_addr(struct c4iw_ep_common *epc, const char *func, + const char *msg) +{ + +#define SINA(a) (&(((struct sockaddr_in *)(a))->sin_addr.s_addr)) +#define SINP(a) ntohs(((struct sockaddr_in *)(a))->sin_port) +#define SIN6A(a) (&(((struct sockaddr_in6 *)(a))->sin6_addr)) +#define SIN6P(a) ntohs(((struct sockaddr_in6 *)(a))->sin6_port) + + if (c4iw_debug) { + switch (epc->local_addr.ss_family) { + case AF_INET: + PDBG("%s %s %pI4:%u/%u <-> %pI4:%u/%u\n", + func, msg, SINA(&epc->local_addr), + SINP(&epc->local_addr), + SINP(&epc->mapped_local_addr), + SINA(&epc->remote_addr), + SINP(&epc->remote_addr), + SINP(&epc->mapped_remote_addr)); + break; + case AF_INET6: + PDBG("%s %s %pI6:%u/%u <-> %pI6:%u/%u\n", + func, msg, SIN6A(&epc->local_addr), + SIN6P(&epc->local_addr), + SIN6P(&epc->mapped_local_addr), + SIN6A(&epc->remote_addr), + SIN6P(&epc->remote_addr), + SIN6P(&epc->mapped_remote_addr)); + break; + default: + break; + } + } +#undef SINA +#undef SINP +#undef SIN6A +#undef SIN6P +} static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) { @@ -781,6 +872,15 @@ static inline int compute_wscale(int win) return wscale; } +static inline int ocqp_supported(const struct cxgb4_lld_info *infop) +{ +#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64) + return infop->vr->ocq.size > 0; +#else + return 0; +#endif +} + u32 c4iw_id_alloc(struct c4iw_id_table *alloc); void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj); int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num, @@ -808,7 +908,7 @@ int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev); int c4iw_register_device(struct c4iw_dev *dev); void c4iw_unregister_device(struct c4iw_dev *dev); int __init c4iw_cm_init(void); -void __exit c4iw_cm_term(void); +void c4iw_cm_term(void); void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, @@ -833,7 +933,7 @@ struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl( int page_list_len); struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth); int c4iw_dealloc_mw(struct ib_mw *mw); -struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd); +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata); @@ -873,12 +973,11 @@ void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size); u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size); void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size); int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb); -void c4iw_flush_hw_cq(struct t4_cq *cq); +void c4iw_flush_hw_cq(struct c4iw_cq *chp); void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count); -void c4iw_count_scqes(struct t4_cq *cq, struct t4_wq *wq, int *count); int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp); int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count); -int c4iw_flush_sq(struct t4_wq *wq, struct t4_cq *cq, int count); +int c4iw_flush_sq(struct c4iw_qp *qhp); int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid); u16 c4iw_rqes_posted(struct c4iw_qp *qhp); int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe); @@ -894,6 +993,8 @@ extern struct cxgb4_client t4c_client; extern c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS]; extern int c4iw_max_read_depth; extern int db_fc_threshold; +extern int db_coalescing_threshold; +extern int use_dsgl; #endif diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 57e07c61ace..ec7a2988a70 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -30,16 +30,76 @@ * SOFTWARE. */ +#include <linux/module.h> +#include <linux/moduleparam.h> #include <rdma/ib_umem.h> #include <linux/atomic.h> #include "iw_cxgb4.h" +int use_dsgl = 0; +module_param(use_dsgl, int, 0644); +MODULE_PARM_DESC(use_dsgl, "Use DSGL for PBL/FastReg (default=0)"); + #define T4_ULPTX_MIN_IO 32 #define C4IW_MAX_INLINE_SIZE 96 +#define T4_ULPTX_MAX_DMA 1024 +#define C4IW_INLINE_THRESHOLD 128 -static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, - void *data) +static int inline_threshold = C4IW_INLINE_THRESHOLD; +module_param(inline_threshold, int, 0644); +MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)"); + +static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, + u32 len, dma_addr_t data, int wait) +{ + struct sk_buff *skb; + struct ulp_mem_io *req; + struct ulptx_sgl *sgl; + u8 wr_len; + int ret = 0; + struct c4iw_wr_wait wr_wait; + + addr &= 0x7FFFFFF; + + if (wait) + c4iw_init_wr_wait(&wr_wait); + wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16); + + skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL); + if (!skb) + return -ENOMEM; + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + req = (struct ulp_mem_io *)__skb_put(skb, wr_len); + memset(req, 0, wr_len); + INIT_ULPTX_WR(req, wr_len, 0, 0); + req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR) | + (wait ? FW_WR_COMPL(1) : 0)); + req->wr.wr_lo = wait ? (__force __be64)(unsigned long) &wr_wait : 0L; + req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16(DIV_ROUND_UP(wr_len, 16))); + req->cmd = cpu_to_be32(ULPTX_CMD(ULP_TX_MEM_WRITE)); + req->cmd |= cpu_to_be32(V_T5_ULP_MEMIO_ORDER(1)); + req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN(len>>5)); + req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), 16)); + req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR(addr)); + + sgl = (struct ulptx_sgl *)(req + 1); + sgl->cmd_nsge = cpu_to_be32(ULPTX_CMD(ULP_TX_SC_DSGL) | + ULPTX_NSGE(1)); + sgl->len0 = cpu_to_be32(len); + sgl->addr0 = cpu_to_be64(data); + + ret = c4iw_ofld_send(rdev, skb); + if (ret) + return ret; + if (wait) + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + return ret; +} + +static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len, + void *data) { struct sk_buff *skb; struct ulp_mem_io *req; @@ -47,6 +107,12 @@ static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, u8 wr_len, *to_dp, *from_dp; int copy_len, num_wqe, i, ret = 0; struct c4iw_wr_wait wr_wait; + __be32 cmd = cpu_to_be32(ULPTX_CMD(ULP_TX_MEM_WRITE)); + + if (is_t4(rdev->lldi.adapter_type)) + cmd |= cpu_to_be32(ULP_MEMIO_ORDER(1)); + else + cmd |= cpu_to_be32(V_T5_ULP_MEMIO_IMM(1)); addr &= 0x7FFFFFF; PDBG("%s addr 0x%x len %u\n", __func__, addr, len); @@ -77,7 +143,7 @@ static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, req->wr.wr_mid = cpu_to_be32( FW_WR_LEN16(DIV_ROUND_UP(wr_len, 16))); - req->cmd = cpu_to_be32(ULPTX_CMD(ULP_TX_MEM_WRITE) | (1<<23)); + req->cmd = cmd; req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN( DIV_ROUND_UP(copy_len, T4_ULPTX_MIN_IO))); req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), @@ -107,6 +173,67 @@ static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, return ret; } +static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data) +{ + u32 remain = len; + u32 dmalen; + int ret = 0; + dma_addr_t daddr; + dma_addr_t save; + + daddr = dma_map_single(&rdev->lldi.pdev->dev, data, len, DMA_TO_DEVICE); + if (dma_mapping_error(&rdev->lldi.pdev->dev, daddr)) + return -1; + save = daddr; + + while (remain > inline_threshold) { + if (remain < T4_ULPTX_MAX_DMA) { + if (remain & ~T4_ULPTX_MIN_IO) + dmalen = remain & ~(T4_ULPTX_MIN_IO-1); + else + dmalen = remain; + } else + dmalen = T4_ULPTX_MAX_DMA; + remain -= dmalen; + ret = _c4iw_write_mem_dma_aligned(rdev, addr, dmalen, daddr, + !remain); + if (ret) + goto out; + addr += dmalen >> 5; + data += dmalen; + daddr += dmalen; + } + if (remain) + ret = _c4iw_write_mem_inline(rdev, addr, remain, data); +out: + dma_unmap_single(&rdev->lldi.pdev->dev, save, len, DMA_TO_DEVICE); + return ret; +} + +/* + * write len bytes of data into addr (32B aligned address) + * If data is NULL, clear len byte of memory to zero. + */ +static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, + void *data) +{ + if (is_t5(rdev->lldi.adapter_type) && use_dsgl) { + if (len > inline_threshold) { + if (_c4iw_write_mem_dma(rdev, addr, len, data)) { + printk_ratelimited(KERN_WARNING + "%s: dma map" + " failure (non fatal)\n", + pci_name(rdev->lldi.pdev)); + return _c4iw_write_mem_inline(rdev, addr, len, + data); + } else + return 0; + } else + return _c4iw_write_mem_inline(rdev, addr, len, data); + } else + return _c4iw_write_mem_inline(rdev, addr, len, data); +} + /* * Build and write a TPT entry. * IN: stag key, pdid, perm, bind_enabled, zbva, to, len, page_size, @@ -132,8 +259,12 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry, if ((!reset_tpt_entry) && (*stag == T4_STAG_UNSET)) { stag_idx = c4iw_get_resource(&rdev->resource.tpt_table); - if (!stag_idx) + if (!stag_idx) { + mutex_lock(&rdev->stats.lock); + rdev->stats.stag.fail++; + mutex_unlock(&rdev->stats.lock); return -ENOMEM; + } mutex_lock(&rdev->stats.lock); rdev->stats.stag.cur += 32; if (rdev->stats.stag.cur > rdev->stats.stag.max) @@ -468,7 +599,7 @@ struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, ret = alloc_pbl(mhp, npages); if (ret) { kfree(page_list); - goto err_pbl; + goto err; } ret = write_pbl(&mhp->rhp->rdev, page_list, mhp->attr.pbl_addr, @@ -551,9 +682,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { __be64 *pages; int shift, n, len; - int i, j, k; + int i, k, entry; int err = 0; - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct c4iw_dev *rhp; struct c4iw_pd *php; struct c4iw_mr *mhp; @@ -583,10 +714,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, shift = ffs(mhp->umem->page_size) - 1; - n = 0; - list_for_each_entry(chunk, &mhp->umem->chunk_list, list) - n += chunk->nents; - + n = mhp->umem->nmap; err = alloc_pbl(mhp, n); if (err) goto err; @@ -599,24 +727,22 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, i = n = 0; - list_for_each_entry(chunk, &mhp->umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = cpu_to_be64(sg_dma_address( - &chunk->page_list[j]) + - mhp->umem->page_size * k); - if (i == PAGE_SIZE / sizeof *pages) { - err = write_pbl(&mhp->rhp->rdev, - pages, - mhp->attr.pbl_addr + (n << 3), i); - if (err) - goto pbl_done; - n += i; - i = 0; - } + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address(sg) + + mhp->umem->page_size * k); + if (i == PAGE_SIZE / sizeof *pages) { + err = write_pbl(&mhp->rhp->rdev, + pages, + mhp->attr.pbl_addr + (n << 3), i); + if (err) + goto pbl_done; + n += i; + i = 0; } } + } if (i) err = write_pbl(&mhp->rhp->rdev, pages, @@ -650,7 +776,7 @@ err: return ERR_PTR(err); } -struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd) +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) { struct c4iw_dev *rhp; struct c4iw_pd *php; @@ -659,6 +785,9 @@ struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd) u32 stag = 0; int ret; + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + php = to_c4iw_pd(pd); rhp = php->rhp; mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); @@ -757,19 +886,27 @@ struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(struct ib_device *device, struct c4iw_fr_page_list *c4pl; struct c4iw_dev *dev = to_c4iw_dev(device); dma_addr_t dma_addr; - int size = sizeof *c4pl + page_list_len * sizeof(u64); + int pll_len = roundup(page_list_len * sizeof(u64), 32); - c4pl = dma_alloc_coherent(&dev->rdev.lldi.pdev->dev, size, - &dma_addr, GFP_KERNEL); + c4pl = kmalloc(sizeof(*c4pl), GFP_KERNEL); if (!c4pl) return ERR_PTR(-ENOMEM); + c4pl->ibpl.page_list = dma_alloc_coherent(&dev->rdev.lldi.pdev->dev, + pll_len, &dma_addr, + GFP_KERNEL); + if (!c4pl->ibpl.page_list) { + kfree(c4pl); + return ERR_PTR(-ENOMEM); + } dma_unmap_addr_set(c4pl, mapping, dma_addr); c4pl->dma_addr = dma_addr; c4pl->dev = dev; - c4pl->size = size; - c4pl->ibpl.page_list = (u64 *)(c4pl + 1); - c4pl->ibpl.max_page_list_len = page_list_len; + c4pl->pll_len = pll_len; + + PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n", + __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list, + &c4pl->dma_addr); return &c4pl->ibpl; } @@ -778,8 +915,14 @@ void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *ibpl) { struct c4iw_fr_page_list *c4pl = to_c4iw_fr_page_list(ibpl); - dma_free_coherent(&c4pl->dev->rdev.lldi.pdev->dev, c4pl->size, - c4pl, dma_unmap_addr(c4pl, mapping)); + PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n", + __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list, + &c4pl->dma_addr); + + dma_free_coherent(&c4pl->dev->rdev.lldi.pdev->dev, + c4pl->pll_len, + c4pl->ibpl.page_list, dma_unmap_addr(c4pl, mapping)); + kfree(c4pl); } int c4iw_dereg_mr(struct ib_mr *ib_mr) diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index e084fdc6da7..b1d305338de 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -106,15 +106,57 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, { struct c4iw_ucontext *context; struct c4iw_dev *rhp = to_c4iw_dev(ibdev); + static int warned; + struct c4iw_alloc_ucontext_resp uresp; + int ret = 0; + struct c4iw_mm_entry *mm = NULL; PDBG("%s ibdev %p\n", __func__, ibdev); context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); + if (!context) { + ret = -ENOMEM; + goto err; + } + c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx); INIT_LIST_HEAD(&context->mmaps); spin_lock_init(&context->mmap_lock); + + if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) { + if (!warned++) + pr_err(MOD "Warning - downlevel libcxgb4 (non-fatal), device status page disabled."); + rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED; + } else { + mm = kmalloc(sizeof(*mm), GFP_KERNEL); + if (!mm) { + ret = -ENOMEM; + goto err_free; + } + + uresp.status_page_size = PAGE_SIZE; + + spin_lock(&context->mmap_lock); + uresp.status_page_key = context->key; + context->key += PAGE_SIZE; + spin_unlock(&context->mmap_lock); + + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); + if (ret) + goto err_mm; + + mm->key = uresp.status_page_key; + mm->addr = virt_to_phys(rhp->rdev.status_page); + mm->len = PAGE_SIZE; + insert_mmap(context, mm); + } return &context->ibucontext; +err_mm: + kfree(mm); +err_free: + kfree(context); +err: + return ERR_PTR(ret); } static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) @@ -162,8 +204,14 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) */ if (addr >= rdev->oc_mw_pa) vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); - else - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + else { + if (is_t5(rdev->lldi.adapter_type)) + vma->vm_page_prot = + t4_pgprot_wc(vma->vm_page_prot); + else + vma->vm_page_prot = + pgprot_noncached(vma->vm_page_prot); + } ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, len, vma->vm_page_prot); @@ -263,7 +311,7 @@ static int c4iw_query_device(struct ib_device *ibdev, dev = to_c4iw_dev(ibdev); memset(props, 0, sizeof *props); memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); - props->hw_ver = dev->rdev.lldi.adapter_type; + props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type); props->fw_ver = dev->rdev.lldi.fw_vers; props->device_cap_flags = dev->device_cap_flags; props->page_size_cap = T4_PAGESIZE_MASK; @@ -281,7 +329,7 @@ static int c4iw_query_device(struct ib_device *ibdev, props->max_mr = c4iw_num_stags(&dev->rdev); props->max_pd = T4_MAX_NUM_PD; props->local_ca_ack_delay = 0; - props->max_fast_reg_page_list_len = T4_MAX_FR_DEPTH; + props->max_fast_reg_page_list_len = t4_max_fr_depth(use_dsgl); return 0; } @@ -346,7 +394,8 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr, struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev.dev); PDBG("%s dev 0x%p\n", __func__, dev); - return sprintf(buf, "%d\n", c4iw_dev->rdev.lldi.adapter_type); + return sprintf(buf, "%d\n", + CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type)); } static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, @@ -451,7 +500,7 @@ int c4iw_register_device(struct c4iw_dev *dev) dev->ibdev.node_type = RDMA_NODE_RNIC; memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC)); dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports; - dev->ibdev.num_comp_vectors = 1; + dev->ibdev.num_comp_vectors = dev->rdev.lldi.nciq; dev->ibdev.dma_device = &(dev->rdev.lldi.pdev->dev); dev->ibdev.query_device = c4iw_query_device; dev->ibdev.query_port = c4iw_query_port; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 05bfe53bff6..086f62f5dc9 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -42,10 +42,21 @@ static int ocqp_support = 1; module_param(ocqp_support, int, 0644); MODULE_PARM_DESC(ocqp_support, "Support on-chip SQs (default=1)"); -int db_fc_threshold = 2000; +int db_fc_threshold = 1000; module_param(db_fc_threshold, int, 0644); -MODULE_PARM_DESC(db_fc_threshold, "QP count/threshold that triggers automatic " - "db flow control mode (default = 2000)"); +MODULE_PARM_DESC(db_fc_threshold, + "QP count/threshold that triggers" + " automatic db flow control mode (default = 1000)"); + +int db_coalescing_threshold; +module_param(db_coalescing_threshold, int, 0644); +MODULE_PARM_DESC(db_coalescing_threshold, + "QP count/threshold that triggers" + " disabling db coalescing (default = 0)"); + +static int max_fr_immd = T4_MAX_FR_IMMD; +module_param(max_fr_immd, int, 0644); +MODULE_PARM_DESC(max_fr_immd, "fastreg threshold for using DSGL instead of immedate"); static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state) { @@ -76,7 +87,7 @@ static void dealloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) static int alloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) { - if (!ocqp_support || !t4_ocqp_supported()) + if (!ocqp_support || !ocqp_supported(&rdev->lldi)) return -ENOSYS; sq->dma_addr = c4iw_ocqp_pool_alloc(rdev, sq->memsize); if (!sq->dma_addr) @@ -100,6 +111,16 @@ static int alloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) return 0; } +static int alloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq, int user) +{ + int ret = -ENOSYS; + if (user) + ret = alloc_oc_sq(rdev, sq); + if (ret) + ret = alloc_host_sq(rdev, sq); + return ret; +} + static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, struct c4iw_dev_ucontext *uctx) { @@ -129,7 +150,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, int wr_len; struct c4iw_wr_wait wr_wait; struct sk_buff *skb; - int ret; + int ret = 0; int eqsize; wq->sq.qid = c4iw_get_qpid(rdev, uctx); @@ -168,26 +189,19 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, goto free_sw_rq; } - if (user) { - ret = alloc_oc_sq(rdev, &wq->sq); - if (ret) - goto free_hwaddr; - - ret = alloc_host_sq(rdev, &wq->sq); - if (ret) - goto free_sq; - } else - ret = alloc_host_sq(rdev, &wq->sq); - if (ret) - goto free_hwaddr; + ret = alloc_sq(rdev, &wq->sq, user); + if (ret) + goto free_hwaddr; memset(wq->sq.queue, 0, wq->sq.memsize); dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr); wq->rq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), wq->rq.memsize, &(wq->rq.dma_addr), GFP_KERNEL); - if (!wq->rq.queue) + if (!wq->rq.queue) { + ret = -ENOMEM; goto free_sq; + } PDBG("%s sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx\n", __func__, wq->sq.queue, (unsigned long long)virt_to_phys(wq->sq.queue), @@ -198,13 +212,23 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, wq->db = rdev->lldi.db_reg; wq->gts = rdev->lldi.gts_reg; - if (user) { - wq->sq.udb = (u64)pci_resource_start(rdev->lldi.pdev, 2) + - (wq->sq.qid << rdev->qpshift); - wq->sq.udb &= PAGE_MASK; - wq->rq.udb = (u64)pci_resource_start(rdev->lldi.pdev, 2) + - (wq->rq.qid << rdev->qpshift); - wq->rq.udb &= PAGE_MASK; + if (user || is_t5(rdev->lldi.adapter_type)) { + u32 off; + + off = (wq->sq.qid << rdev->qpshift) & PAGE_MASK; + if (user) { + wq->sq.udb = (u64 __iomem *)(rdev->bar2_pa + off); + } else { + off += 128 * (wq->sq.qid & rdev->qpmask) + 8; + wq->sq.udb = (u64 __iomem *)(rdev->bar2_kva + off); + } + off = (wq->rq.qid << rdev->qpshift) & PAGE_MASK; + if (user) { + wq->rq.udb = (u64 __iomem *)(rdev->bar2_pa + off); + } else { + off += 128 * (wq->rq.qid & rdev->qpmask) + 8; + wq->rq.udb = (u64 __iomem *)(rdev->bar2_kva + off); + } } wq->rdev = rdev; wq->rq.msn = 1; @@ -285,9 +309,10 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, if (ret) goto free_dma; - PDBG("%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%llx rqudb 0x%llx\n", + PDBG("%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%lx rqudb 0x%lx\n", __func__, wq->sq.qid, wq->rq.qid, wq->db, - (unsigned long long)wq->sq.udb, (unsigned long long)wq->rq.udb); + (__force unsigned long) wq->sq.udb, + (__force unsigned long) wq->rq.udb); return 0; free_dma: @@ -411,6 +436,8 @@ static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, default: return -EINVAL; } + wqe->send.r3 = 0; + wqe->send.r4 = 0; plen = 0; if (wr->num_sge) { @@ -532,7 +559,7 @@ static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, } static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe, - struct ib_send_wr *wr, u8 *len16) + struct ib_send_wr *wr, u8 *len16, u8 t5dev) { struct fw_ri_immd *imdp; @@ -541,7 +568,8 @@ static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe, int pbllen = roundup(wr->wr.fast_reg.page_list_len * sizeof(u64), 32); int rem; - if (wr->wr.fast_reg.page_list_len > T4_MAX_FR_DEPTH) + if (wr->wr.fast_reg.page_list_len > + t4_max_fr_depth(use_dsgl)) return -EINVAL; wqe->fr.qpbinde_to_dcacpu = 0; @@ -554,28 +582,51 @@ static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe, wqe->fr.va_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32); wqe->fr.va_lo_fbo = cpu_to_be32(wr->wr.fast_reg.iova_start & 0xffffffff); - WARN_ON(pbllen > T4_MAX_FR_IMMD); - imdp = (struct fw_ri_immd *)(&wqe->fr + 1); - imdp->op = FW_RI_DATA_IMMD; - imdp->r1 = 0; - imdp->r2 = 0; - imdp->immdlen = cpu_to_be32(pbllen); - p = (__be64 *)(imdp + 1); - rem = pbllen; - for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { - *p = cpu_to_be64((u64)wr->wr.fast_reg.page_list->page_list[i]); - rem -= sizeof *p; - if (++p == (__be64 *)&sq->queue[sq->size]) - p = (__be64 *)sq->queue; - } - BUG_ON(rem < 0); - while (rem) { - *p = 0; - rem -= sizeof *p; - if (++p == (__be64 *)&sq->queue[sq->size]) - p = (__be64 *)sq->queue; + + if (t5dev && use_dsgl && (pbllen > max_fr_immd)) { + struct c4iw_fr_page_list *c4pl = + to_c4iw_fr_page_list(wr->wr.fast_reg.page_list); + struct fw_ri_dsgl *sglp; + + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + wr->wr.fast_reg.page_list->page_list[i] = (__force u64) + cpu_to_be64((u64) + wr->wr.fast_reg.page_list->page_list[i]); + } + + sglp = (struct fw_ri_dsgl *)(&wqe->fr + 1); + sglp->op = FW_RI_DATA_DSGL; + sglp->r1 = 0; + sglp->nsge = cpu_to_be16(1); + sglp->addr0 = cpu_to_be64(c4pl->dma_addr); + sglp->len0 = cpu_to_be32(pbllen); + + *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*sglp), 16); + } else { + imdp = (struct fw_ri_immd *)(&wqe->fr + 1); + imdp->op = FW_RI_DATA_IMMD; + imdp->r1 = 0; + imdp->r2 = 0; + imdp->immdlen = cpu_to_be32(pbllen); + p = (__be64 *)(imdp + 1); + rem = pbllen; + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + *p = cpu_to_be64( + (u64)wr->wr.fast_reg.page_list->page_list[i]); + rem -= sizeof(*p); + if (++p == (__be64 *)&sq->queue[sq->size]) + p = (__be64 *)sq->queue; + } + BUG_ON(rem < 0); + while (rem) { + *p = 0; + rem -= sizeof(*p); + if (++p == (__be64 *)&sq->queue[sq->size]) + p = (__be64 *)sq->queue; + } + *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*imdp) + + pbllen, 16); } - *len16 = DIV_ROUND_UP(sizeof wqe->fr + sizeof *imdp + pbllen, 16); return 0; } @@ -601,6 +652,48 @@ void c4iw_qp_rem_ref(struct ib_qp *qp) wake_up(&(to_c4iw_qp(qp)->wait)); } +static void add_to_fc_list(struct list_head *head, struct list_head *entry) +{ + if (list_empty(entry)) + list_add_tail(entry, head); +} + +static int ring_kernel_sq_db(struct c4iw_qp *qhp, u16 inc) +{ + unsigned long flags; + + spin_lock_irqsave(&qhp->rhp->lock, flags); + spin_lock(&qhp->lock); + if (qhp->rhp->db_state == NORMAL) + t4_ring_sq_db(&qhp->wq, inc, + is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL); + else { + add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); + qhp->wq.sq.wq_pidx_inc += inc; + } + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&qhp->rhp->lock, flags); + return 0; +} + +static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc) +{ + unsigned long flags; + + spin_lock_irqsave(&qhp->rhp->lock, flags); + spin_lock(&qhp->lock); + if (qhp->rhp->db_state == NORMAL) + t4_ring_rq_db(&qhp->wq, inc, + is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL); + else { + add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); + qhp->wq.rq.wq_pidx_inc += inc; + } + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&qhp->rhp->lock, flags); + return 0; +} + int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { @@ -609,7 +702,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, enum fw_wr_opcodes fw_opcode = 0; enum fw_ri_wr_flags fw_flags; struct c4iw_qp *qhp; - union t4_wr *wqe; + union t4_wr *wqe = NULL; u32 num_wrs; struct t4_swsqe *swsqe; unsigned long flag; @@ -638,7 +731,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fw_flags = 0; if (wr->send_flags & IB_SEND_SOLICITED) fw_flags |= FW_RI_SOLICITED_EVENT_FLAG; - if (wr->send_flags & IB_SEND_SIGNALED) + if (wr->send_flags & IB_SEND_SIGNALED || qhp->sq_sig_all) fw_flags |= FW_RI_COMPLETION_FLAG; swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; switch (wr->opcode) { @@ -676,7 +769,10 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, case IB_WR_FAST_REG_MR: fw_opcode = FW_RI_FR_NSMR_WR; swsqe->opcode = FW_RI_FAST_REGISTER; - err = build_fastreg(&qhp->wq.sq, wqe, wr, &len16); + err = build_fastreg(&qhp->wq.sq, wqe, wr, &len16, + is_t5( + qhp->rhp->rdev.lldi.adapter_type) ? + 1 : 0); break; case IB_WR_LOCAL_INV: if (wr->send_flags & IB_SEND_FENCE) @@ -696,7 +792,9 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } swsqe->idx = qhp->wq.sq.pidx; swsqe->complete = 0; - swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED); + swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED) || + qhp->sq_sig_all; + swsqe->flushed = 0; swsqe->wr_id = wr->wr_id; init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16); @@ -709,9 +807,14 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, t4_sq_produce(&qhp->wq, len16); idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); } - if (t4_wq_db_enabled(&qhp->wq)) - t4_ring_sq_db(&qhp->wq, idx); - spin_unlock_irqrestore(&qhp->lock, flag); + if (!qhp->rhp->rdev.status_page->db_off) { + t4_ring_sq_db(&qhp->wq, idx, + is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe); + spin_unlock_irqrestore(&qhp->lock, flag); + } else { + spin_unlock_irqrestore(&qhp->lock, flag); + ring_kernel_sq_db(qhp, idx); + } return err; } @@ -720,7 +823,7 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, { int err = 0; struct c4iw_qp *qhp; - union t4_recv_wr *wqe; + union t4_recv_wr *wqe = NULL; u32 num_wrs; u8 len16 = 0; unsigned long flag; @@ -771,9 +874,14 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, wr = wr->next; num_wrs--; } - if (t4_wq_db_enabled(&qhp->wq)) - t4_ring_rq_db(&qhp->wq, idx); - spin_unlock_irqrestore(&qhp->lock, flag); + if (!qhp->rhp->rdev.status_page->db_off) { + t4_ring_rq_db(&qhp->wq, idx, + is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe); + spin_unlock_irqrestore(&qhp->lock, flag); + } else { + spin_unlock_irqrestore(&qhp->lock, flag); + ring_kernel_rq_db(qhp, idx); + } return err; } @@ -966,7 +1074,15 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, /* locking hierarchy: cq lock first, then qp lock. */ spin_lock_irqsave(&rchp->lock, flag); spin_lock(&qhp->lock); - c4iw_flush_hw_cq(&rchp->cq); + + if (qhp->wq.flushed) { + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&rchp->lock, flag); + return; + } + qhp->wq.flushed = 1; + + c4iw_flush_hw_cq(rchp); c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); spin_unlock(&qhp->lock); @@ -980,9 +1096,9 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, /* locking hierarchy: cq lock first, then qp lock. */ spin_lock_irqsave(&schp->lock, flag); spin_lock(&qhp->lock); - c4iw_flush_hw_cq(&schp->cq); - c4iw_count_scqes(&schp->cq, &qhp->wq, &count); - flushed = c4iw_flush_sq(&qhp->wq, &schp->cq, count); + if (schp != rchp) + c4iw_flush_hw_cq(schp); + flushed = c4iw_flush_sq(qhp); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&schp->lock, flag); if (flushed) { @@ -997,11 +1113,11 @@ static void flush_qp(struct c4iw_qp *qhp) struct c4iw_cq *rchp, *schp; unsigned long flag; - rchp = get_chp(qhp->rhp, qhp->attr.rcq); - schp = get_chp(qhp->rhp, qhp->attr.scq); + rchp = to_c4iw_cq(qhp->ibqp.recv_cq); + schp = to_c4iw_cq(qhp->ibqp.send_cq); + t4_set_wq_in_error(&qhp->wq); if (qhp->ibqp.uobject) { - t4_set_wq_in_error(&qhp->wq); t4_set_cq_in_error(&rchp->cq); spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); @@ -1151,35 +1267,6 @@ out: return ret; } -/* - * Called by the library when the qp has user dbs disabled due to - * a DB_FULL condition. This function will single-thread all user - * DB rings to avoid overflowing the hw db-fifo. - */ -static int ring_kernel_db(struct c4iw_qp *qhp, u32 qid, u16 inc) -{ - int delay = db_delay_usecs; - - mutex_lock(&qhp->rhp->db_mutex); - do { - - /* - * The interrupt threshold is dbfifo_int_thresh << 6. So - * make sure we don't cross that and generate an interrupt. - */ - if (cxgb4_dbfifo_count(qhp->rhp->rdev.lldi.ports[0], 1) < - (qhp->rhp->rdev.lldi.dbfifo_int_thresh << 5)) { - writel(QID(qid) | PIDX(inc), qhp->wq.db); - break; - } - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(usecs_to_jiffies(delay)); - delay = min(delay << 1, 2000); - } while (1); - mutex_unlock(&qhp->rhp->db_mutex); - return 0; -} - int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, enum c4iw_qp_attr_mask mask, struct c4iw_qp_attributes *attrs, @@ -1229,11 +1316,11 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, } if (mask & C4IW_QP_ATTR_SQ_DB) { - ret = ring_kernel_db(qhp, qhp->wq.sq.qid, attrs->sq_db_inc); + ret = ring_kernel_sq_db(qhp, attrs->sq_db_inc); goto out; } if (mask & C4IW_QP_ATTR_RQ_DB) { - ret = ring_kernel_db(qhp, qhp->wq.rq.qid, attrs->rq_db_inc); + ret = ring_kernel_rq_db(qhp, attrs->rq_db_inc); goto out; } @@ -1283,6 +1370,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, switch (attrs->next_state) { case C4IW_QP_STATE_CLOSING: BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); + t4_set_wq_in_error(&qhp->wq); set_state(qhp, C4IW_QP_STATE_CLOSING); ep = qhp->ep; if (!internal) { @@ -1290,28 +1378,30 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, disconnect = 1; c4iw_get_ep(&qhp->ep->com); } - if (qhp->ibqp.uobject) - t4_set_wq_in_error(&qhp->wq); ret = rdma_fini(rhp, qhp, ep); if (ret) goto err; break; case C4IW_QP_STATE_TERMINATE: + t4_set_wq_in_error(&qhp->wq); set_state(qhp, C4IW_QP_STATE_TERMINATE); qhp->attr.layer_etype = attrs->layer_etype; qhp->attr.ecode = attrs->ecode; - if (qhp->ibqp.uobject) - t4_set_wq_in_error(&qhp->wq); ep = qhp->ep; - if (!internal) + if (!internal) { + c4iw_get_ep(&qhp->ep->com); terminate = 1; - disconnect = 1; - c4iw_get_ep(&qhp->ep->com); + disconnect = 1; + } else { + terminate = qhp->attr.send_term; + ret = rdma_fini(rhp, qhp, ep); + if (ret) + goto err; + } break; case C4IW_QP_STATE_ERROR: + t4_set_wq_in_error(&qhp->wq); set_state(qhp, C4IW_QP_STATE_ERROR); - if (qhp->ibqp.uobject) - t4_set_wq_in_error(&qhp->wq); if (!internal) { abort = 1; disconnect = 1; @@ -1383,6 +1473,7 @@ err: qhp->ep = NULL; set_state(qhp, C4IW_QP_STATE_ERROR); free = 1; + abort = 1; wake_up(&qhp->wait); BUG_ON(!ep); flush_qp(qhp); @@ -1413,14 +1504,6 @@ out: return ret; } -static int enable_qp_db(int id, void *p, void *data) -{ - struct c4iw_qp *qp = p; - - t4_enable_wq_db(&qp->wq); - return 0; -} - int c4iw_destroy_qp(struct ib_qp *ib_qp) { struct c4iw_dev *rhp; @@ -1438,19 +1521,15 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp) c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); wait_event(qhp->wait, !qhp->ep); - spin_lock_irq(&rhp->lock); - remove_handle_nolock(rhp, &rhp->qpidr, qhp->wq.sq.qid); - rhp->qpcnt--; - BUG_ON(rhp->qpcnt < 0); - if (rhp->qpcnt <= db_fc_threshold && rhp->db_state == FLOW_CONTROL) { - rhp->rdev.stats.db_state_transitions++; - rhp->db_state = NORMAL; - idr_for_each(&rhp->qpidr, enable_qp_db, NULL); - } - spin_unlock_irq(&rhp->lock); + remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); atomic_dec(&qhp->refcnt); wait_event(qhp->wait, !atomic_read(&qhp->refcnt)); + spin_lock_irq(&rhp->lock); + if (!list_empty(&qhp->db_fc_entry)) + list_del_init(&qhp->db_fc_entry); + spin_unlock_irq(&rhp->lock); + ucontext = ib_qp->uobject ? to_c4iw_ucontext(ib_qp->uobject->context) : NULL; destroy_qp(&rhp->rdev, &qhp->wq, @@ -1461,14 +1540,6 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp) return 0; } -static int disable_qp_db(int id, void *p, void *data) -{ - struct c4iw_qp *qp = p; - - t4_disable_wq_db(&qp->wq); - return 0; -} - struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, struct ib_udata *udata) { @@ -1478,7 +1549,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, struct c4iw_cq *schp; struct c4iw_cq *rchp; struct c4iw_create_qp_resp uresp; - int sqsize, rqsize; + unsigned int sqsize, rqsize; struct c4iw_ucontext *ucontext; int ret; struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4, *mm5 = NULL; @@ -1508,12 +1579,12 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL; - qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); if (!qhp) return ERR_PTR(-ENOMEM); qhp->wq.sq.size = sqsize; qhp->wq.sq.memsize = (sqsize + 1) * sizeof *qhp->wq.sq.queue; + qhp->wq.sq.flush_cidx = -1; qhp->wq.rq.size = rqsize; qhp->wq.rq.memsize = (rqsize + 1) * sizeof *qhp->wq.rq.queue; @@ -1550,21 +1621,13 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, qhp->attr.enable_bind = 1; qhp->attr.max_ord = 1; qhp->attr.max_ird = 1; + qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR; spin_lock_init(&qhp->lock); mutex_init(&qhp->mutex); init_waitqueue_head(&qhp->wait); atomic_set(&qhp->refcnt, 1); - spin_lock_irq(&rhp->lock); - if (rhp->db_state != NORMAL) - t4_disable_wq_db(&qhp->wq); - if (++rhp->qpcnt > db_fc_threshold && rhp->db_state == NORMAL) { - rhp->rdev.stats.db_state_transitions++; - rhp->db_state = FLOW_CONTROL; - idr_for_each(&rhp->qpidr, disable_qp_db, NULL); - } - ret = insert_handle_nolock(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid); - spin_unlock_irq(&rhp->lock); + ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid); if (ret) goto err2; @@ -1609,6 +1672,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, if (mm5) { uresp.ma_sync_key = ucontext->key; ucontext->key += PAGE_SIZE; + } else { + uresp.ma_sync_key = 0; } uresp.sq_key = ucontext->key; ucontext->key += PAGE_SIZE; @@ -1631,11 +1696,11 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, mm2->len = PAGE_ALIGN(qhp->wq.rq.memsize); insert_mmap(ucontext, mm2); mm3->key = uresp.sq_db_gts_key; - mm3->addr = qhp->wq.sq.udb; + mm3->addr = (__force unsigned long) qhp->wq.sq.udb; mm3->len = PAGE_SIZE; insert_mmap(ucontext, mm3); mm4->key = uresp.rq_db_gts_key; - mm4->addr = qhp->wq.rq.udb; + mm4->addr = (__force unsigned long) qhp->wq.rq.udb; mm4->len = PAGE_SIZE; insert_mmap(ucontext, mm4); if (mm5) { @@ -1648,6 +1713,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, } qhp->ibqp.qp_num = qhp->wq.sq.qid; init_timer(&(qhp->timer)); + INIT_LIST_HEAD(&qhp->db_fc_entry); PDBG("%s qhp %p sq_num_entries %d, rq_num_entries %d qpid 0x%0x\n", __func__, qhp, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, qhp->wq.sq.qid); @@ -1711,11 +1777,15 @@ int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, /* * Use SQ_PSN and RQ_PSN to pass in IDX_INC values for * ringing the queue db when we're in DB_FULL mode. + * Only allow this on T4 devices. */ attrs.sq_db_inc = attr->sq_psn; attrs.rq_db_inc = attr->rq_psn; mask |= (attr_mask & IB_QP_SQ_PSN) ? C4IW_QP_ATTR_SQ_DB : 0; mask |= (attr_mask & IB_QP_RQ_PSN) ? C4IW_QP_ATTR_RQ_DB : 0; + if (is_t5(to_c4iw_qp(ibqp)->rhp->rdev.lldi.adapter_type) && + (mask & (C4IW_QP_ATTR_SQ_DB|C4IW_QP_ATTR_RQ_DB))) + return -EINVAL; return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0); } diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c index cdef4d7fb6d..67df71a7012 100644 --- a/drivers/infiniband/hw/cxgb4/resource.c +++ b/drivers/infiniband/hw/cxgb4/resource.c @@ -179,8 +179,12 @@ u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) kfree(entry); } else { qid = c4iw_get_resource(&rdev->resource.qid_table); - if (!qid) + if (!qid) { + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.fail++; + mutex_unlock(&rdev->stats.lock); goto out; + } mutex_lock(&rdev->stats.lock); rdev->stats.qid.cur += rdev->qpmask + 1; mutex_unlock(&rdev->stats.lock); @@ -322,8 +326,8 @@ u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size) unsigned long addr = gen_pool_alloc(rdev->rqt_pool, size << 6); PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6); if (!addr) - printk_ratelimited(KERN_WARNING MOD "%s: Out of RQT memory\n", - pci_name(rdev->lldi.pdev)); + pr_warn_ratelimited(MOD "%s: Out of RQT memory\n", + pci_name(rdev->lldi.pdev)); mutex_lock(&rdev->stats.lock); if (addr) { rdev->stats.rqt.cur += roundup(size << 6, 1 << MIN_RQT_SHIFT); diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index 16f26ab2930..68b0a6bf4eb 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -36,9 +36,9 @@ #include "t4_msg.h" #include "t4fw_ri_api.h" -#define T4_MAX_NUM_QP (1<<16) -#define T4_MAX_NUM_CQ (1<<15) -#define T4_MAX_NUM_PD (1<<15) +#define T4_MAX_NUM_QP 65536 +#define T4_MAX_NUM_CQ 65536 +#define T4_MAX_NUM_PD 65536 #define T4_EQ_STATUS_ENTRIES (L1_CACHE_BYTES > 64 ? 2 : 1) #define T4_MAX_EQ_SIZE (65520 - T4_EQ_STATUS_ENTRIES) #define T4_MAX_IQ_SIZE (65520 - 1) @@ -47,7 +47,7 @@ #define T4_MAX_QP_DEPTH (T4_MAX_RQ_SIZE - 1) #define T4_MAX_CQ_DEPTH (T4_MAX_IQ_SIZE - 1) #define T4_MAX_NUM_STAG (1<<15) -#define T4_MAX_MR_SIZE (~0ULL - 1) +#define T4_MAX_MR_SIZE (~0ULL) #define T4_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */ #define T4_STAG_UNSET 0xffffffff #define T4_FW_MAJ 0 @@ -84,7 +84,14 @@ struct t4_status_page { sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge)) #define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - \ sizeof(struct fw_ri_immd)) & ~31UL) -#define T4_MAX_FR_DEPTH (T4_MAX_FR_IMMD / sizeof(u64)) +#define T4_MAX_FR_IMMD_DEPTH (T4_MAX_FR_IMMD / sizeof(u64)) +#define T4_MAX_FR_DSGL 1024 +#define T4_MAX_FR_DSGL_DEPTH (T4_MAX_FR_DSGL / sizeof(u64)) + +static inline int t4_max_fr_depth(int use_dsgl) +{ + return use_dsgl ? T4_MAX_FR_DSGL_DEPTH : T4_MAX_FR_IMMD_DEPTH; +} #define T4_RQ_NUM_SLOTS 2 #define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS) @@ -269,6 +276,7 @@ struct t4_swsqe { int complete; int signaled; u16 idx; + int flushed; }; static inline pgprot_t t4_pgprot_wc(pgprot_t prot) @@ -280,15 +288,6 @@ static inline pgprot_t t4_pgprot_wc(pgprot_t prot) #endif } -static inline int t4_ocqp_supported(void) -{ -#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64) - return 1; -#else - return 0; -#endif -} - enum { T4_SQ_ONCHIP = (1<<0), }; @@ -300,7 +299,7 @@ struct t4_sq { unsigned long phys_addr; struct t4_swsqe *sw_sq; struct t4_swsqe *oldest_read; - u64 udb; + u64 __iomem *udb; size_t memsize; u32 qid; u16 in_use; @@ -308,7 +307,9 @@ struct t4_sq { u16 cidx; u16 pidx; u16 wq_pidx; + u16 wq_pidx_inc; u16 flags; + short flush_cidx; }; struct t4_swrqe { @@ -320,7 +321,7 @@ struct t4_rq { dma_addr_t dma_addr; DEFINE_DMA_UNMAP_ADDR(mapping); struct t4_swrqe *sw_rq; - u64 udb; + u64 __iomem *udb; size_t memsize; u32 qid; u32 msn; @@ -331,6 +332,7 @@ struct t4_rq { u16 cidx; u16 pidx; u16 wq_pidx; + u16 wq_pidx_inc; }; struct t4_wq { @@ -339,6 +341,7 @@ struct t4_wq { void __iomem *db; void __iomem *gts; struct c4iw_rdev *rdev; + int flushed; }; static inline int t4_rqes_posted(struct t4_wq *wq) @@ -421,6 +424,9 @@ static inline void t4_sq_produce(struct t4_wq *wq, u8 len16) static inline void t4_sq_consume(struct t4_wq *wq) { + BUG_ON(wq->sq.in_use < 1); + if (wq->sq.cidx == wq->sq.flush_cidx) + wq->sq.flush_cidx = -1; wq->sq.in_use--; if (++wq->sq.cidx == wq->sq.size) wq->sq.cidx = 0; @@ -436,15 +442,67 @@ static inline u16 t4_sq_wq_size(struct t4_wq *wq) return wq->sq.size * T4_SQ_NUM_SLOTS; } -static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc) +/* This function copies 64 byte coalesced work request to memory + * mapped BAR2 space. For coalesced WRs, the SGE fetches data + * from the FIFO instead of from Host. + */ +static inline void pio_copy(u64 __iomem *dst, u64 *src) +{ + int count = 8; + + while (count) { + writeq(*src, dst); + src++; + dst++; + count--; + } +} + +static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, u8 t5, + union t4_wr *wqe) { + + /* Flush host queue memory writes. */ wmb(); + if (t5) { + if (inc == 1 && wqe) { + PDBG("%s: WC wq->sq.pidx = %d\n", + __func__, wq->sq.pidx); + pio_copy(wq->sq.udb + 7, (void *)wqe); + } else { + PDBG("%s: DB wq->sq.pidx = %d\n", + __func__, wq->sq.pidx); + writel(PIDX_T5(inc), wq->sq.udb); + } + + /* Flush user doorbell area writes. */ + wmb(); + return; + } writel(QID(wq->sq.qid) | PIDX(inc), wq->db); } -static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc) +static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, u8 t5, + union t4_recv_wr *wqe) { + + /* Flush host queue memory writes. */ wmb(); + if (t5) { + if (inc == 1 && wqe) { + PDBG("%s: WC wq->rq.pidx = %d\n", + __func__, wq->rq.pidx); + pio_copy(wq->rq.udb + 7, (void *)wqe); + } else { + PDBG("%s: DB wq->rq.pidx = %d\n", + __func__, wq->rq.pidx); + writel(PIDX_T5(inc), wq->rq.udb); + } + + /* Flush user doorbell area writes. */ + wmb(); + return; + } writel(QID(wq->rq.qid) | PIDX(inc), wq->db); } @@ -484,6 +542,7 @@ struct t4_cq { size_t memsize; __be64 bits_type_ts; u32 cqid; + int vector; u16 size; /* including status page */ u16 cidx; u16 sw_pidx; @@ -514,12 +573,18 @@ static inline int t4_arm_cq(struct t4_cq *cq, int se) static inline void t4_swcq_produce(struct t4_cq *cq) { cq->sw_in_use++; + if (cq->sw_in_use == cq->size) { + PDBG("%s cxgb4 sw cq overflow cqid %u\n", __func__, cq->cqid); + cq->error = 1; + BUG_ON(1); + } if (++cq->sw_pidx == cq->size) cq->sw_pidx = 0; } static inline void t4_swcq_consume(struct t4_cq *cq) { + BUG_ON(cq->sw_in_use < 1); cq->sw_in_use--; if (++cq->sw_cidx == cq->size) cq->sw_cidx = 0; @@ -528,7 +593,7 @@ static inline void t4_swcq_consume(struct t4_cq *cq) static inline void t4_hwcq_consume(struct t4_cq *cq) { cq->bits_type_ts = cq->queue[cq->cidx].bits_type_ts; - if (++cq->cidx_inc == (cq->size >> 4)) { + if (++cq->cidx_inc == (cq->size >> 4) || cq->cidx_inc == CIDXINC_MASK) { u32 val; val = SEINTARM(0) | CIDXINC(cq->cidx_inc) | TIMERREG(7) | @@ -561,7 +626,11 @@ static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe) ret = -EOVERFLOW; cq->error = 1; printk(KERN_ERR MOD "cq overflow cqid %u\n", cq->cqid); + BUG_ON(1); } else if (t4_valid_cqe(cq, &cq->queue[cq->cidx])) { + + /* Ensure CQE is flushed to memory */ + rmb(); *cqe = &cq->queue[cq->cidx]; ret = 0; } else @@ -571,6 +640,12 @@ static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe) static inline struct t4_cqe *t4_next_sw_cqe(struct t4_cq *cq) { + if (cq->sw_in_use == cq->size) { + PDBG("%s cxgb4 sw cq overflow cqid %u\n", __func__, cq->cqid); + cq->error = 1; + BUG_ON(1); + return NULL; + } if (cq->sw_in_use) return &cq->sw_queue[cq->sw_cidx]; return NULL; @@ -599,3 +674,7 @@ static inline void t4_set_cq_in_error(struct t4_cq *cq) ((struct t4_status_page *)&cq->queue[cq->size])->qp_err = 1; } #endif + +struct t4_dev_status_page { + u8 db_off; +}; diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h index dc193c29267..91289a051af 100644 --- a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h +++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h @@ -836,4 +836,19 @@ struct ulptx_idata { #define V_RX_DACK_CHANGE(x) ((x) << S_RX_DACK_CHANGE) #define F_RX_DACK_CHANGE V_RX_DACK_CHANGE(1U) +enum { /* TCP congestion control algorithms */ + CONG_ALG_RENO, + CONG_ALG_TAHOE, + CONG_ALG_NEWRENO, + CONG_ALG_HIGHSPEED +}; + +#define S_CONG_CNTRL 14 +#define M_CONG_CNTRL 0x3 +#define V_CONG_CNTRL(x) ((x) << S_CONG_CNTRL) +#define G_CONG_CNTRL(x) (((x) >> S_CONG_CNTRL) & M_CONG_CNTRL) + +#define CONG_CNTRL_VALID (1 << 18) +#define T5_OPT_2_VALID (1 << 31) + #endif /* _T4FW_RI_API_H_ */ diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h index 32b754c35ab..cbd0ce17072 100644 --- a/drivers/infiniband/hw/cxgb4/user.h +++ b/drivers/infiniband/hw/cxgb4/user.h @@ -48,6 +48,7 @@ struct c4iw_create_cq_resp { __u32 cqid; __u32 size; __u32 qid_mask; + __u32 reserved; /* explicit padding (optional for i386) */ }; @@ -70,4 +71,10 @@ struct c4iw_create_qp_resp { __u32 qid_mask; __u32 flags; }; + +struct c4iw_alloc_ucontext_resp { + __u64 status_page_key; + __u32 status_page_size; + __u32 reserved; /* explicit padding (optional for i386) */ +}; #endif diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index f08f6eaf3fa..bd45e0f3923 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -322,7 +322,7 @@ struct ehca_mr_pginfo { } phy; struct { /* type EHCA_MR_PGI_USER section */ struct ib_umem *region; - struct ib_umem_chunk *next_chunk; + struct scatterlist *next_sg; u64 next_nmap; } usr; struct { /* type EHCA_MR_PGI_FMR section */ diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c index 8f5290147e8..8cc83753776 100644 --- a/drivers/infiniband/hw/ehca/ehca_cq.c +++ b/drivers/infiniband/hw/ehca/ehca_cq.c @@ -128,7 +128,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, void *vpage; u32 counter; u64 rpage, cqx_fec, h_ret; - int ipz_rc, ret, i; + int ipz_rc, i; unsigned long flags; if (cqe >= 0xFFFFFFFF - 64 - additional_cqe) @@ -163,32 +163,19 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, adapter_handle = shca->ipz_hca_handle; param.eq_handle = shca->eq.ipz_eq_handle; - do { - if (!idr_pre_get(&ehca_cq_idr, GFP_KERNEL)) { - cq = ERR_PTR(-ENOMEM); - ehca_err(device, "Can't reserve idr nr. device=%p", - device); - goto create_cq_exit1; - } - - write_lock_irqsave(&ehca_cq_idr_lock, flags); - ret = idr_get_new(&ehca_cq_idr, my_cq, &my_cq->token); - write_unlock_irqrestore(&ehca_cq_idr_lock, flags); - } while (ret == -EAGAIN); + idr_preload(GFP_KERNEL); + write_lock_irqsave(&ehca_cq_idr_lock, flags); + my_cq->token = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT); + write_unlock_irqrestore(&ehca_cq_idr_lock, flags); + idr_preload_end(); - if (ret) { + if (my_cq->token < 0) { cq = ERR_PTR(-ENOMEM); ehca_err(device, "Can't allocate new idr entry. device=%p", device); goto create_cq_exit1; } - if (my_cq->token > 0x1FFFFFF) { - cq = ERR_PTR(-ENOMEM); - ehca_err(device, "Invalid number of cq. device=%p", device); - goto create_cq_exit2; - } - /* * CQs maximum depth is 4GB-64, but we need additional 20 as buffer * for receiving errors CQEs. @@ -296,6 +283,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1)); if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { ehca_err(device, "Copy to udata failed."); + cq = ERR_PTR(-EFAULT); goto create_cq_exit4; } } diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h index 8f7f282ead6..22f79afa7fc 100644 --- a/drivers/infiniband/hw/ehca/ehca_iverbs.h +++ b/drivers/infiniband/hw/ehca/ehca_iverbs.h @@ -95,7 +95,7 @@ int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); int ehca_dereg_mr(struct ib_mr *mr); -struct ib_mw *ehca_alloc_mw(struct ib_pd *pd); +struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind); diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 832e7a7d0ae..cd8d290a09f 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -211,6 +211,7 @@ static int ehca_create_slab_caches(void) if (!ctblk_cache) { ehca_gen_err("Cannot create ctblk SLAB cache."); ehca_cleanup_small_qp_cache(); + ret = -ENOMEM; goto create_slab_caches6; } #endif @@ -713,8 +714,7 @@ static struct attribute_group ehca_dev_attr_grp = { .attrs = ehca_dev_attrs }; -static int __devinit ehca_probe(struct platform_device *dev, - const struct of_device_id *id) +static int ehca_probe(struct platform_device *dev) { struct ehca_shca *shca; const u64 *handle; @@ -879,7 +879,7 @@ probe1: return -EINVAL; } -static int __devexit ehca_remove(struct platform_device *dev) +static int ehca_remove(struct platform_device *dev) { struct ehca_shca *shca = dev_get_drvdata(&dev->dev); unsigned long flags; @@ -937,7 +937,7 @@ static struct of_device_id ehca_device_table[] = }; MODULE_DEVICE_TABLE(of, ehca_device_table); -static struct of_platform_driver ehca_driver = { +static struct platform_driver ehca_driver = { .probe = ehca_probe, .remove = ehca_remove, .driver = { diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index 87844869dcc..3488e8c9fcb 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -400,10 +400,7 @@ reg_user_mr_fallback: pginfo.num_hwpages = num_hwpages; pginfo.u.usr.region = e_mr->umem; pginfo.next_hwpage = e_mr->umem->offset / hwpage_size; - pginfo.u.usr.next_chunk = list_prepare_entry(pginfo.u.usr.next_chunk, - (&e_mr->umem->chunk_list), - list); - + pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl; ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags, e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, &e_mr->ib.ib_mr.rkey, EHCA_REG_MR); @@ -688,7 +685,7 @@ dereg_mr_exit0: /*----------------------------------------------------------------------*/ -struct ib_mw *ehca_alloc_mw(struct ib_pd *pd) +struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) { struct ib_mw *ib_mw; u64 h_ret; @@ -698,6 +695,9 @@ struct ib_mw *ehca_alloc_mw(struct ib_pd *pd) container_of(pd->device, struct ehca_shca, ib_device); struct ehca_mw_hipzout_parms hipzout; + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + e_mw = ehca_mw_new(); if (!e_mw) { ib_mw = ERR_PTR(-ENOMEM); @@ -1855,61 +1855,39 @@ static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo, u64 *kpage) { int ret = 0; - struct ib_umem_chunk *prev_chunk; - struct ib_umem_chunk *chunk; u64 pgaddr; - u32 i = 0; u32 j = 0; int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size; - - /* loop over desired chunk entries */ - chunk = pginfo->u.usr.next_chunk; - prev_chunk = pginfo->u.usr.next_chunk; - list_for_each_entry_continue( - chunk, (&(pginfo->u.usr.region->chunk_list)), list) { - for (i = pginfo->u.usr.next_nmap; i < chunk->nmap; ) { - pgaddr = page_to_pfn(sg_page(&chunk->page_list[i])) - << PAGE_SHIFT ; - *kpage = pgaddr + (pginfo->next_hwpage * - pginfo->hwpage_size); - if ( !(*kpage) ) { - ehca_gen_err("pgaddr=%llx " - "chunk->page_list[i]=%llx " - "i=%x next_hwpage=%llx", - pgaddr, (u64)sg_dma_address( - &chunk->page_list[i]), - i, pginfo->next_hwpage); - return -EFAULT; - } - (pginfo->hwpage_cnt)++; - (pginfo->next_hwpage)++; - kpage++; - if (pginfo->next_hwpage % hwpages_per_kpage == 0) { - (pginfo->kpage_cnt)++; - (pginfo->u.usr.next_nmap)++; - pginfo->next_hwpage = 0; - i++; - } - j++; - if (j >= number) break; + struct scatterlist **sg = &pginfo->u.usr.next_sg; + + while (*sg != NULL) { + pgaddr = page_to_pfn(sg_page(*sg)) + << PAGE_SHIFT; + *kpage = pgaddr + (pginfo->next_hwpage * + pginfo->hwpage_size); + if (!(*kpage)) { + ehca_gen_err("pgaddr=%llx " + "sg_dma_address=%llx " + "entry=%llx next_hwpage=%llx", + pgaddr, (u64)sg_dma_address(*sg), + pginfo->u.usr.next_nmap, + pginfo->next_hwpage); + return -EFAULT; } - if ((pginfo->u.usr.next_nmap >= chunk->nmap) && - (j >= number)) { - pginfo->u.usr.next_nmap = 0; - prev_chunk = chunk; - break; - } else if (pginfo->u.usr.next_nmap >= chunk->nmap) { - pginfo->u.usr.next_nmap = 0; - prev_chunk = chunk; - } else if (j >= number) + (pginfo->hwpage_cnt)++; + (pginfo->next_hwpage)++; + kpage++; + if (pginfo->next_hwpage % hwpages_per_kpage == 0) { + (pginfo->kpage_cnt)++; + (pginfo->u.usr.next_nmap)++; + pginfo->next_hwpage = 0; + *sg = sg_next(*sg); + } + j++; + if (j >= number) break; - else - prev_chunk = chunk; } - pginfo->u.usr.next_chunk = - list_prepare_entry(prev_chunk, - (&(pginfo->u.usr.region->chunk_list)), - list); + return ret; } @@ -1917,20 +1895,19 @@ static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo, * check given pages for contiguous layout * last page addr is returned in prev_pgaddr for further check */ -static int ehca_check_kpages_per_ate(struct scatterlist *page_list, - int start_idx, int end_idx, +static int ehca_check_kpages_per_ate(struct scatterlist **sg, + int num_pages, u64 *prev_pgaddr) { - int t; - for (t = start_idx; t <= end_idx; t++) { - u64 pgaddr = page_to_pfn(sg_page(&page_list[t])) << PAGE_SHIFT; + for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) { + u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT; if (ehca_debug_level >= 3) ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr, *(u64 *)__va(pgaddr)); if (pgaddr - PAGE_SIZE != *prev_pgaddr) { ehca_gen_err("uncontiguous page found pgaddr=%llx " - "prev_pgaddr=%llx page_list_i=%x", - pgaddr, *prev_pgaddr, t); + "prev_pgaddr=%llx entries_left_in_hwpage=%x", + pgaddr, *prev_pgaddr, num_pages); return -EINVAL; } *prev_pgaddr = pgaddr; @@ -1944,111 +1921,80 @@ static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo, u64 *kpage) { int ret = 0; - struct ib_umem_chunk *prev_chunk; - struct ib_umem_chunk *chunk; u64 pgaddr, prev_pgaddr; - u32 i = 0; u32 j = 0; int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE; int nr_kpages = kpages_per_hwpage; + struct scatterlist **sg = &pginfo->u.usr.next_sg; + + while (*sg != NULL) { - /* loop over desired chunk entries */ - chunk = pginfo->u.usr.next_chunk; - prev_chunk = pginfo->u.usr.next_chunk; - list_for_each_entry_continue( - chunk, (&(pginfo->u.usr.region->chunk_list)), list) { - for (i = pginfo->u.usr.next_nmap; i < chunk->nmap; ) { - if (nr_kpages == kpages_per_hwpage) { - pgaddr = ( page_to_pfn(sg_page(&chunk->page_list[i])) - << PAGE_SHIFT ); - *kpage = pgaddr; - if ( !(*kpage) ) { - ehca_gen_err("pgaddr=%llx i=%x", - pgaddr, i); + if (nr_kpages == kpages_per_hwpage) { + pgaddr = (page_to_pfn(sg_page(*sg)) + << PAGE_SHIFT); + *kpage = pgaddr; + if (!(*kpage)) { + ehca_gen_err("pgaddr=%llx entry=%llx", + pgaddr, pginfo->u.usr.next_nmap); + ret = -EFAULT; + return ret; + } + /* + * The first page in a hwpage must be aligned; + * the first MR page is exempt from this rule. + */ + if (pgaddr & (pginfo->hwpage_size - 1)) { + if (pginfo->hwpage_cnt) { + ehca_gen_err( + "invalid alignment " + "pgaddr=%llx entry=%llx " + "mr_pgsize=%llx", + pgaddr, pginfo->u.usr.next_nmap, + pginfo->hwpage_size); ret = -EFAULT; return ret; } - /* - * The first page in a hwpage must be aligned; - * the first MR page is exempt from this rule. - */ - if (pgaddr & (pginfo->hwpage_size - 1)) { - if (pginfo->hwpage_cnt) { - ehca_gen_err( - "invalid alignment " - "pgaddr=%llx i=%x " - "mr_pgsize=%llx", - pgaddr, i, - pginfo->hwpage_size); - ret = -EFAULT; - return ret; - } - /* first MR page */ - pginfo->kpage_cnt = - (pgaddr & - (pginfo->hwpage_size - 1)) >> - PAGE_SHIFT; - nr_kpages -= pginfo->kpage_cnt; - *kpage = pgaddr & - ~(pginfo->hwpage_size - 1); - } - if (ehca_debug_level >= 3) { - u64 val = *(u64 *)__va(pgaddr); - ehca_gen_dbg("kpage=%llx chunk_page=%llx " - "value=%016llx", - *kpage, pgaddr, val); - } - prev_pgaddr = pgaddr; - i++; - pginfo->kpage_cnt++; - pginfo->u.usr.next_nmap++; - nr_kpages--; - if (!nr_kpages) - goto next_kpage; - continue; + /* first MR page */ + pginfo->kpage_cnt = + (pgaddr & + (pginfo->hwpage_size - 1)) >> + PAGE_SHIFT; + nr_kpages -= pginfo->kpage_cnt; + *kpage = pgaddr & + ~(pginfo->hwpage_size - 1); } - if (i + nr_kpages > chunk->nmap) { - ret = ehca_check_kpages_per_ate( - chunk->page_list, i, - chunk->nmap - 1, &prev_pgaddr); - if (ret) return ret; - pginfo->kpage_cnt += chunk->nmap - i; - pginfo->u.usr.next_nmap += chunk->nmap - i; - nr_kpages -= chunk->nmap - i; - break; + if (ehca_debug_level >= 3) { + u64 val = *(u64 *)__va(pgaddr); + ehca_gen_dbg("kpage=%llx page=%llx " + "value=%016llx", + *kpage, pgaddr, val); } + prev_pgaddr = pgaddr; + *sg = sg_next(*sg); + pginfo->kpage_cnt++; + pginfo->u.usr.next_nmap++; + nr_kpages--; + if (!nr_kpages) + goto next_kpage; + continue; + } + + ret = ehca_check_kpages_per_ate(sg, nr_kpages, + &prev_pgaddr); + if (ret) + return ret; + pginfo->kpage_cnt += nr_kpages; + pginfo->u.usr.next_nmap += nr_kpages; - ret = ehca_check_kpages_per_ate(chunk->page_list, i, - i + nr_kpages - 1, - &prev_pgaddr); - if (ret) return ret; - i += nr_kpages; - pginfo->kpage_cnt += nr_kpages; - pginfo->u.usr.next_nmap += nr_kpages; next_kpage: - nr_kpages = kpages_per_hwpage; - (pginfo->hwpage_cnt)++; - kpage++; - j++; - if (j >= number) break; - } - if ((pginfo->u.usr.next_nmap >= chunk->nmap) && - (j >= number)) { - pginfo->u.usr.next_nmap = 0; - prev_chunk = chunk; - break; - } else if (pginfo->u.usr.next_nmap >= chunk->nmap) { - pginfo->u.usr.next_nmap = 0; - prev_chunk = chunk; - } else if (j >= number) + nr_kpages = kpages_per_hwpage; + (pginfo->hwpage_cnt)++; + kpage++; + j++; + if (j >= number) break; - else - prev_chunk = chunk; } - pginfo->u.usr.next_chunk = - list_prepare_entry(prev_chunk, - (&(pginfo->u.usr.region->chunk_list)), - list); + return ret; } @@ -2588,16 +2534,6 @@ static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, /* This is only a stub; nothing to be done here */ } -static u64 ehca_dma_address(struct ib_device *dev, struct scatterlist *sg) -{ - return sg->dma_address; -} - -static unsigned int ehca_dma_len(struct ib_device *dev, struct scatterlist *sg) -{ - return sg->length; -} - static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) @@ -2650,8 +2586,6 @@ struct ib_dma_mapping_ops ehca_dma_mapping_ops = { .unmap_page = ehca_dma_unmap_page, .map_sg = ehca_dma_map_sg, .unmap_sg = ehca_dma_unmap_sg, - .dma_address = ehca_dma_address, - .dma_len = ehca_dma_len, .sync_single_for_cpu = ehca_dma_sync_single_for_cpu, .sync_single_for_device = ehca_dma_sync_single_for_device, .alloc_coherent = ehca_dma_alloc_coherent, diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 149393915ae..2e89356c46f 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -636,30 +636,26 @@ static struct ehca_qp *internal_create_qp( my_qp->send_cq = container_of(init_attr->send_cq, struct ehca_cq, ib_cq); - do { - if (!idr_pre_get(&ehca_qp_idr, GFP_KERNEL)) { - ret = -ENOMEM; - ehca_err(pd->device, "Can't reserve idr resources."); - goto create_qp_exit0; - } + idr_preload(GFP_KERNEL); + write_lock_irqsave(&ehca_qp_idr_lock, flags); - write_lock_irqsave(&ehca_qp_idr_lock, flags); - ret = idr_get_new(&ehca_qp_idr, my_qp, &my_qp->token); - write_unlock_irqrestore(&ehca_qp_idr_lock, flags); - } while (ret == -EAGAIN); + ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT); + if (ret >= 0) + my_qp->token = ret; - if (ret) { - ret = -ENOMEM; - ehca_err(pd->device, "Can't allocate new idr entry."); + write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + idr_preload_end(); + if (ret < 0) { + if (ret == -ENOSPC) { + ret = -EINVAL; + ehca_err(pd->device, "Invalid number of qp"); + } else { + ret = -ENOMEM; + ehca_err(pd->device, "Can't allocate new idr entry."); + } goto create_qp_exit0; } - if (my_qp->token > 0x1FFFFFF) { - ret = -EINVAL; - ehca_err(pd->device, "Invalid number of qp"); - goto create_qp_exit1; - } - if (has_srq) parms.srq_token = my_qp->token; @@ -1333,7 +1329,7 @@ static int internal_modify_qp(struct ib_qp *ibqp, qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state; if (!smi_reset2init && !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type, - attr_mask)) { + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) { ret = -EINVAL; ehca_err(ibqp->device, "Invalid qp transition new_state=%x cur_state=%x " diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c index 2d41d04fd95..89517ffb438 100644 --- a/drivers/infiniband/hw/ehca/hcp_if.c +++ b/drivers/infiniband/hw/ehca/hcp_if.c @@ -90,26 +90,6 @@ static DEFINE_SPINLOCK(hcall_lock); -static u32 get_longbusy_msecs(int longbusy_rc) -{ - switch (longbusy_rc) { - case H_LONG_BUSY_ORDER_1_MSEC: - return 1; - case H_LONG_BUSY_ORDER_10_MSEC: - return 10; - case H_LONG_BUSY_ORDER_100_MSEC: - return 100; - case H_LONG_BUSY_ORDER_1_SEC: - return 1000; - case H_LONG_BUSY_ORDER_10_SEC: - return 10000; - case H_LONG_BUSY_ORDER_100_SEC: - return 100000; - default: - return 1; - } -} - static long ehca_plpar_hcall_norets(unsigned long opcode, unsigned long arg1, unsigned long arg2, diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c index 62c71fadb4d..8d594517cd2 100644 --- a/drivers/infiniband/hw/ehca/ipz_pt_fn.c +++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c @@ -222,7 +222,8 @@ int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue, queue->small_page = NULL; /* allocate queue page pointers */ - queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *), GFP_KERNEL); + queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *), + GFP_KERNEL | __GFP_NOWARN); if (!queue->queue_pages) { queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *)); if (!queue->queue_pages) { diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c index 714293b7851..45802e97332 100644 --- a/drivers/infiniband/hw/ipath/ipath_diag.c +++ b/drivers/infiniband/hw/ipath/ipath_diag.c @@ -326,7 +326,7 @@ static ssize_t ipath_diagpkt_write(struct file *fp, size_t count, loff_t *off) { u32 __iomem *piobuf; - u32 plen, clen, pbufn; + u32 plen, pbufn, maxlen_reserve; struct ipath_diag_pkt odp; struct ipath_diag_xpkt dp; u32 *tmpbuf = NULL; @@ -335,42 +335,24 @@ static ssize_t ipath_diagpkt_write(struct file *fp, u64 val; u32 l_state, lt_state; /* LinkState, LinkTrainingState */ - if (count < sizeof(odp)) { - ret = -EINVAL; - goto bail; - } if (count == sizeof(dp)) { if (copy_from_user(&dp, data, sizeof(dp))) { ret = -EFAULT; goto bail; } - } else if (copy_from_user(&odp, data, sizeof(odp))) { - ret = -EFAULT; - goto bail; - } - - /* - * Due to padding/alignment issues (lessened with new struct) - * the old and new structs are the same length. We need to - * disambiguate them, which we can do because odp.len has never - * been less than the total of LRH+BTH+DETH so far, while - * dp.unit (same offset) unit is unlikely to get that high. - * Similarly, dp.data, the pointer to user at the same offset - * as odp.unit, is almost certainly at least one (512byte)page - * "above" NULL. The if-block below can be omitted if compatibility - * between a new driver and older diagnostic code is unimportant. - * compatibility the other direction (new diags, old driver) is - * handled in the diagnostic code, with a warning. - */ - if (dp.unit >= 20 && dp.data < 512) { - /* very probable version mismatch. Fix it up */ - memcpy(&odp, &dp, sizeof(odp)); - /* We got a legacy dp, copy elements to dp */ + } else if (count == sizeof(odp)) { + if (copy_from_user(&odp, data, sizeof(odp))) { + ret = -EFAULT; + goto bail; + } + dp.len = odp.len; dp.unit = odp.unit; dp.data = odp.data; - dp.len = odp.len; - dp.pbc_wd = 0; /* Indicate we need to compute PBC wd */ + dp.pbc_wd = 0; + } else { + ret = -EINVAL; + goto bail; } /* send count must be an exact number of dwords */ @@ -379,7 +361,7 @@ static ssize_t ipath_diagpkt_write(struct file *fp, goto bail; } - clen = dp.len >> 2; + plen = dp.len >> 2; dd = ipath_lookup(dp.unit); if (!dd || !(dd->ipath_flags & IPATH_PRESENT) || @@ -422,16 +404,22 @@ static ssize_t ipath_diagpkt_write(struct file *fp, goto bail; } - /* need total length before first word written */ - /* +1 word is for the qword padding */ - plen = sizeof(u32) + dp.len; - - if ((plen + 4) > dd->ipath_ibmaxlen) { + /* + * need total length before first word written, plus 2 Dwords. One Dword + * is for padding so we get the full user data when not aligned on + * a word boundary. The other Dword is to make sure we have room for the + * ICRC which gets tacked on later. + */ + maxlen_reserve = 2 * sizeof(u32); + if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) { ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n", - plen - 4, dd->ipath_ibmaxlen); + dp.len, dd->ipath_ibmaxlen); ret = -EINVAL; - goto bail; /* before writing pbc */ + goto bail; } + + plen = sizeof(u32) + dp.len; + tmpbuf = vmalloc(plen); if (!tmpbuf) { dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, " @@ -473,11 +461,11 @@ static ssize_t ipath_diagpkt_write(struct file *fp, */ if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) { ipath_flush_wc(); - __iowrite32_copy(piobuf + 2, tmpbuf, clen - 1); + __iowrite32_copy(piobuf + 2, tmpbuf, plen - 1); ipath_flush_wc(); - __raw_writel(tmpbuf[clen - 1], piobuf + clen + 1); + __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1); } else - __iowrite32_copy(piobuf + 2, tmpbuf, clen); + __iowrite32_copy(piobuf + 2, tmpbuf, plen); ipath_flush_wc(); diff --git a/drivers/infiniband/hw/ipath/ipath_dma.c b/drivers/infiniband/hw/ipath/ipath_dma.c index 644c2c74e05..123a8c05353 100644 --- a/drivers/infiniband/hw/ipath/ipath_dma.c +++ b/drivers/infiniband/hw/ipath/ipath_dma.c @@ -115,6 +115,10 @@ static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl, ret = 0; break; } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif } return ret; } @@ -126,21 +130,6 @@ static void ipath_unmap_sg(struct ib_device *dev, BUG_ON(!valid_dma_direction(direction)); } -static u64 ipath_sg_dma_address(struct ib_device *dev, struct scatterlist *sg) -{ - u64 addr = (u64) page_address(sg_page(sg)); - - if (addr) - addr += sg->offset; - return addr; -} - -static unsigned int ipath_sg_dma_len(struct ib_device *dev, - struct scatterlist *sg) -{ - return sg->length; -} - static void ipath_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, @@ -176,17 +165,15 @@ static void ipath_dma_free_coherent(struct ib_device *dev, size_t size, } struct ib_dma_mapping_ops ipath_dma_mapping_ops = { - ipath_mapping_error, - ipath_dma_map_single, - ipath_dma_unmap_single, - ipath_dma_map_page, - ipath_dma_unmap_page, - ipath_map_sg, - ipath_unmap_sg, - ipath_sg_dma_address, - ipath_sg_dma_len, - ipath_sync_single_for_cpu, - ipath_sync_single_for_device, - ipath_dma_alloc_coherent, - ipath_dma_free_coherent + .mapping_error = ipath_mapping_error, + .map_single = ipath_dma_map_single, + .unmap_single = ipath_dma_unmap_single, + .map_page = ipath_dma_map_page, + .unmap_page = ipath_dma_unmap_page, + .map_sg = ipath_map_sg, + .unmap_sg = ipath_unmap_sg, + .sync_single_for_cpu = ipath_sync_single_for_cpu, + .sync_single_for_device = ipath_sync_single_for_device, + .alloc_coherent = ipath_dma_alloc_coherent, + .free_coherent = ipath_dma_free_coherent }; diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index bfca37b2432..bd0caedafe9 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -127,9 +127,8 @@ const char *ipath_ibcstatus_str[] = { "LTState1C", "LTState1D", "LTState1E", "LTState1F" }; -static void __devexit ipath_remove_one(struct pci_dev *); -static int __devinit ipath_init_one(struct pci_dev *, - const struct pci_device_id *); +static void ipath_remove_one(struct pci_dev *); +static int ipath_init_one(struct pci_dev *, const struct pci_device_id *); /* Only needed for registration, nothing else needs this info */ #define PCI_VENDOR_ID_PATHSCALE 0x1fc1 @@ -148,7 +147,7 @@ MODULE_DEVICE_TABLE(pci, ipath_pci_tbl); static struct pci_driver ipath_driver = { .name = IPATH_DRV_NAME, .probe = ipath_init_one, - .remove = __devexit_p(ipath_remove_one), + .remove = ipath_remove_one, .id_table = ipath_pci_tbl, .driver = { .groups = ipath_driver_attr_groups, @@ -195,11 +194,6 @@ static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) struct ipath_devdata *dd; int ret; - if (!idr_pre_get(&unit_table, GFP_KERNEL)) { - dd = ERR_PTR(-ENOMEM); - goto bail; - } - dd = vzalloc(sizeof(*dd)); if (!dd) { dd = ERR_PTR(-ENOMEM); @@ -207,9 +201,10 @@ static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) } dd->ipath_unit = -1; + idr_preload(GFP_KERNEL); spin_lock_irqsave(&ipath_devs_lock, flags); - ret = idr_get_new(&unit_table, dd, &dd->ipath_unit); + ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT); if (ret < 0) { printk(KERN_ERR IPATH_DRV_NAME ": Could not allocate unit ID: error %d\n", -ret); @@ -217,6 +212,7 @@ static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) dd = ERR_PTR(ret); goto bail_unlock; } + dd->ipath_unit = ret; dd->pcidev = pdev; pci_set_drvdata(pdev, dd); @@ -225,7 +221,7 @@ static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) bail_unlock: spin_unlock_irqrestore(&ipath_devs_lock, flags); - + idr_preload_end(); bail: return dd; } @@ -392,8 +388,7 @@ done: static void cleanup_device(struct ipath_devdata *dd); -static int __devinit ipath_init_one(struct pci_dev *pdev, - const struct pci_device_id *ent) +static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { int ret, len, j; struct ipath_devdata *dd; @@ -737,7 +732,7 @@ static void cleanup_device(struct ipath_devdata *dd) kfree(tmp); } -static void __devexit ipath_remove_one(struct pci_dev *pdev) +static void ipath_remove_one(struct pci_dev *pdev) { struct ipath_devdata *dd = pci_get_drvdata(pdev); @@ -2505,11 +2500,6 @@ static int __init infinipath_init(void) * the PCI subsystem. */ idr_init(&unit_table); - if (!idr_pre_get(&unit_table, GFP_KERNEL)) { - printk(KERN_ERR IPATH_DRV_NAME ": idr_pre_get() failed\n"); - ret = -ENOMEM; - goto bail; - } ret = pci_register_driver(&ipath_driver); if (ret < 0) { diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 3eb7e454849..6d7f453b4d0 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -40,6 +40,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/io.h> +#include <linux/aio.h> #include <linux/jiffies.h> #include <linux/cpu.h> #include <asm/pgtable.h> @@ -1864,9 +1865,9 @@ static int ipath_assign_port(struct file *fp, goto done_chk_sdma; } - i_minor = iminor(fp->f_path.dentry->d_inode) - IPATH_USER_MINOR_BASE; + i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE; ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n", - (long)fp->f_path.dentry->d_inode->i_rdev, i_minor); + (long)file_inode(fp)->i_rdev, i_minor); if (i_minor) ret = find_free_port(i_minor - 1, fp, uinfo); diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index a4de9d58e9b..e0c404bdc4a 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -113,7 +113,7 @@ static ssize_t atomic_counters_read(struct file *file, char __user *buf, struct infinipath_counters counters; struct ipath_devdata *dd; - dd = file->f_path.dentry->d_inode->i_private; + dd = file_inode(file)->i_private; dd->ipath_f_read_counters(dd, &counters); return simple_read_from_buffer(buf, count, ppos, &counters, @@ -154,7 +154,7 @@ static ssize_t flash_read(struct file *file, char __user *buf, goto bail; } - dd = file->f_path.dentry->d_inode->i_private; + dd = file_inode(file)->i_private; if (ipath_eeprom_read(dd, pos, tmp, count)) { ipath_dev_err(dd, "failed to read from flash\n"); ret = -ENXIO; @@ -207,7 +207,7 @@ static ssize_t flash_write(struct file *file, const char __user *buf, goto bail_tmp; } - dd = file->f_path.dentry->d_inode->i_private; + dd = file_inode(file)->i_private; if (ipath_eeprom_write(dd, pos, tmp, count)) { ret = -ENXIO; ipath_dev_err(dd, "failed to write to flash\n"); @@ -410,6 +410,7 @@ static struct file_system_type ipathfs_fs_type = { .mount = ipathfs_mount, .kill_sb = ipathfs_kill_super, }; +MODULE_ALIAS_FS("ipathfs"); int __init ipath_init_ipathfs(void) { diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c index 49b09c697c7..be2a60e142b 100644 --- a/drivers/infiniband/hw/ipath/ipath_init_chip.c +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -719,16 +719,6 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) goto done; /* - * we ignore most issues after reporting them, but have to specially - * handle hardware-disabled chips. - */ - if (ret == 2) { - /* unique error, known to ipath_init_one */ - ret = -EPERM; - goto done; - } - - /* * We could bump this to allow for full rcvegrcnt + rcvtidcnt, * but then it no longer nicely fits power of two, and since * we now use routines that backend onto __get_free_pages, the diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 26dfbc8ee0f..01ba792791a 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -70,7 +70,7 @@ void ipath_disarm_senderrbufs(struct ipath_devdata *dd) if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { int i; if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) && - dd->ipath_lastcancel > jiffies) { + time_after(dd->ipath_lastcancel, jiffies)) { __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG, "SendbufErrs %lx %lx", sbuf[0], sbuf[1]); @@ -755,7 +755,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) /* likely due to cancel; so suppress message unless verbose */ if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) && - dd->ipath_lastcancel > jiffies) { + time_after(dd->ipath_lastcancel, jiffies)) { /* armlaunch takes precedence; it often causes both. */ ipath_cdbg(VERBOSE, "Suppressed %s error (%llx) after sendbuf cancel\n", diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c index e346d3890a0..5e61e9bff69 100644 --- a/drivers/infiniband/hw/ipath/ipath_mr.c +++ b/drivers/infiniband/hw/ipath/ipath_mr.c @@ -188,8 +188,8 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { struct ipath_mr *mr; struct ib_umem *umem; - struct ib_umem_chunk *chunk; - int n, m, i; + int n, m, entry; + struct scatterlist *sg; struct ib_mr *ret; if (length == 0) { @@ -202,10 +202,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ERR(umem)) return (void *) umem; - n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - n += chunk->nents; - + n = umem->nmap; mr = alloc_mr(n, &to_idev(pd->device)->lk_table); if (!mr) { ret = ERR_PTR(-ENOMEM); @@ -224,22 +221,20 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, m = 0; n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) { - for (i = 0; i < chunk->nents; i++) { - void *vaddr; - - vaddr = page_address(sg_page(&chunk->page_list[i])); - if (!vaddr) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - mr->mr.map[m]->segs[n].vaddr = vaddr; - mr->mr.map[m]->segs[n].length = umem->page_size; - n++; - if (n == IPATH_SEGSZ) { - m++; - n = 0; - } + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + void *vaddr; + + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; + mr->mr.map[m]->segs[n].length = umem->page_size; + n++; + if (n == IPATH_SEGSZ) { + m++; + n = 0; } } ret = &mr->ibmr; diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index 0857a9c3cd3..face87602dc 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -463,7 +463,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask)) + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) goto inval; if (attr_mask & IB_QP_AV) { diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c index 98ac18ec977..17a517766ad 100644 --- a/drivers/infiniband/hw/ipath/ipath_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -247,7 +247,7 @@ static void sdma_abort_task(unsigned long opaque) /* ipath_sdma_abort() is done, waiting for interrupt */ if (status == IPATH_SDMA_ABORT_DISARMED) { - if (jiffies < dd->ipath_sdma_abort_intr_timeout) + if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout)) goto resched_noprint; /* give up, intr got lost somewhere */ ipath_dbg("give up waiting for SDMADISABLED intr\n"); @@ -341,7 +341,7 @@ resched: * JAG - this is bad to just have default be a loop without * state change */ - if (jiffies > dd->ipath_sdma_abort_jiffies) { + if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) { ipath_dbg("looping with status 0x%08lx\n", dd->ipath_sdma_status); dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ; diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c index f5cb13b2144..cc04b7ba348 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c @@ -280,9 +280,7 @@ static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd, int j; int ret; - ret = get_user_pages(current, current->mm, addr, - npages, 0, 1, pages, NULL); - + ret = get_user_pages_fast(addr, npages, 0, pages); if (ret != npages) { int i; @@ -811,10 +809,7 @@ int ipath_user_sdma_writev(struct ipath_devdata *dd, while (dim) { const int mxp = 8; - down_write(¤t->mm->mmap_sem); ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp); - up_write(¤t->mm->mmap_sem); - if (ret <= 0) goto done_unlock; else { diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 439c35d4a66..44ea9390417 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -620,7 +620,7 @@ void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, goto bail; } - opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f; dev->opstats[opcode].n_bytes += tlen; dev->opstats[opcode].n_packets++; @@ -2187,7 +2187,8 @@ int ipath_register_ib_device(struct ipath_devdata *dd) if (ret) goto err_reg; - if (ipath_verbs_register_sysfs(dev)) + ret = ipath_verbs_register_sysfs(dev); + if (ret) goto err_class; enable_timer(dd); @@ -2327,15 +2328,15 @@ static int ipath_verbs_register_sysfs(struct ib_device *dev) int i; int ret; - for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) - if (device_create_file(&dev->dev, - ipath_class_attributes[i])) { - ret = 1; + for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) { + ret = device_create_file(&dev->dev, + ipath_class_attributes[i]); + if (ret) goto bail; - } - - ret = 0; - + } + return 0; bail: + for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) + device_remove_file(&dev->dev, ipath_class_attributes[i]); return ret; } diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig index 24ab11a9ad1..fc01deac1d3 100644 --- a/drivers/infiniband/hw/mlx4/Kconfig +++ b/drivers/infiniband/hw/mlx4/Kconfig @@ -1,6 +1,6 @@ config MLX4_INFINIBAND tristate "Mellanox ConnectX HCA support" - depends on NETDEVICES && ETHERNET && PCI + depends on NETDEVICES && ETHERNET && PCI && INET select NET_VENDOR_MELLANOX select MLX4_CORE ---help--- diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index a251becdaa9..2d8c3397774 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -39,25 +39,6 @@ #include "mlx4_ib.h" -int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, - u8 *mac, int *is_mcast, u8 port) -{ - struct in6_addr in6; - - *is_mcast = 0; - - memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6); - if (rdma_link_local_addr(&in6)) - rdma_get_ll_mac(&in6, mac); - else if (rdma_is_multicast_addr(&in6)) { - rdma_get_mcast_mac(&in6, mac); - *is_mcast = 1; - } else - return -EINVAL; - - return 0; -} - static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, struct mlx4_ib_ah *ah) { @@ -92,21 +73,18 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr { struct mlx4_ib_dev *ibdev = to_mdev(pd->device); struct mlx4_dev *dev = ibdev->dev; - union ib_gid sgid; - u8 mac[6]; - int err; - int is_mcast; + int is_mcast = 0; + struct in6_addr in6; u16 vlan_tag; - err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num); - if (err) - return ERR_PTR(err); - - memcpy(ah->av.eth.mac, mac, 6); - err = ib_get_cached_gid(pd->device, ah_attr->port_num, ah_attr->grh.sgid_index, &sgid); - if (err) - return ERR_PTR(err); - vlan_tag = rdma_get_vlan_id(&sgid); + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) { + is_mcast = 1; + rdma_get_mcast_mac(&in6, ah->av.eth.mac); + } else { + memcpy(ah->av.eth.mac, ah_attr->dmac, ETH_ALEN); + } + vlan_tag = ah_attr->vlan_id; if (vlan_tag < 0x1000) vlan_tag |= (ah_attr->sl & 7) << 13; ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index d2fb38d4357..0eb141c4141 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -107,7 +107,7 @@ static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index { if (index >= NUM_ALIAS_GUID_PER_PORT) { pr_err("%s: ERROR: asked for index:%d\n", __func__, index); - return (__force __be64) ((u64) 0xFFFFFFFFFFFFFFFFUL); + return (__force __be64) -1; } return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index]; } @@ -154,7 +154,7 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, continue; slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; - if (slave_id >= dev->dev->num_slaves) + if (slave_id >= dev->dev->num_vfs + 1) return; tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE]; form_cache_ag = get_cached_alias_guid(dev, port_num, diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c index 80079e5a2e3..56a593e0ae5 100644 --- a/drivers/infiniband/hw/mlx4/cm.c +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -61,6 +61,11 @@ struct cm_generic_msg { __be32 remote_comm_id; }; +struct cm_sidr_generic_msg { + struct ib_mad_hdr hdr; + __be32 request_id; +}; + struct cm_req_msg { unsigned char unused[0x60]; union ib_gid primary_path_sgid; @@ -69,28 +74,62 @@ struct cm_req_msg { static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - msg->local_comm_id = cpu_to_be32(cm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->local_comm_id = cpu_to_be32(cm_id); + } } static u32 get_local_comm_id(struct ib_mad *mad) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - - return be32_to_cpu(msg->local_comm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->local_comm_id); + } } static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - msg->remote_comm_id = cpu_to_be32(cm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->remote_comm_id = cpu_to_be32(cm_id); + } } static u32 get_remote_comm_id(struct ib_mad *mad) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - - return be32_to_cpu(msg->remote_comm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->remote_comm_id); + } } static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) @@ -203,8 +242,7 @@ static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new) static struct id_map_entry * id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id) { - int ret, id; - static int next_id; + int ret; struct id_map_entry *ent; struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; @@ -220,25 +258,22 @@ id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id) ent->dev = to_mdev(ibdev); INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout); - do { - spin_lock(&to_mdev(ibdev)->sriov.id_map_lock); - ret = idr_get_new_above(&sriov->pv_id_table, ent, - next_id, &id); - if (!ret) { - next_id = ((unsigned) id + 1) & MAX_IDR_MASK; - ent->pv_cm_id = (u32)id; - sl_id_map_add(ibdev, ent); - } + idr_preload(GFP_KERNEL); + spin_lock(&to_mdev(ibdev)->sriov.id_map_lock); - spin_unlock(&sriov->id_map_lock); - } while (ret == -EAGAIN && idr_pre_get(&sriov->pv_id_table, GFP_KERNEL)); - /*the function idr_get_new_above can return -ENOSPC, so don't insert in that case.*/ - if (!ret) { - spin_lock(&sriov->id_map_lock); + ret = idr_alloc_cyclic(&sriov->pv_id_table, ent, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + ent->pv_cm_id = (u32)ret; + sl_id_map_add(ibdev, ent); list_add_tail(&ent->list, &sriov->cm_list); - spin_unlock(&sriov->id_map_lock); - return ent; } + + spin_unlock(&sriov->id_map_lock); + idr_preload_end(); + + if (ret >= 0) + return ent; + /*error flow*/ kfree(ent); mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret); @@ -268,15 +303,15 @@ static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id) struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; unsigned long flags; - spin_lock_irqsave(&sriov->going_down_lock, flags); spin_lock(&sriov->id_map_lock); + spin_lock_irqsave(&sriov->going_down_lock, flags); /*make sure that there is no schedule inside the scheduled work.*/ if (!sriov->is_going_down) { id->scheduled_delete = 1; schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT); } - spin_unlock(&sriov->id_map_lock); spin_unlock_irqrestore(&sriov->going_down_lock, flags); + spin_unlock(&sriov->id_map_lock); } int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, @@ -286,19 +321,21 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id u32 sl_cm_id; int pv_cm_id = -1; - sl_cm_id = get_local_comm_id(mad); - if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || - mad->mad_hdr.attr_id == CM_REP_ATTR_ID) { + mad->mad_hdr.attr_id == CM_REP_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + sl_cm_id = get_local_comm_id(mad); id = id_map_alloc(ibdev, slave_id, sl_cm_id); if (IS_ERR(id)) { mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", __func__, slave_id, sl_cm_id); return PTR_ERR(id); } - } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID) { + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { return 0; } else { + sl_cm_id = get_local_comm_id(mad); id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); } @@ -319,14 +356,18 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id } int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, - struct ib_mad *mad) + struct ib_mad *mad) { u32 pv_cm_id; struct id_map_entry *id; - if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID) { + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { union ib_gid gid; + if (!slave) + return 0; + gid = gid_from_req_msg(ibdev, mad); *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); if (*slave < 0) { @@ -345,7 +386,8 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, return -ENOENT; } - *slave = id->slave_id; + if (slave) + *slave = id->slave_id; set_remote_comm_id(mad, id->sl_cm_id); if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) @@ -364,7 +406,6 @@ void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev) INIT_LIST_HEAD(&dev->sriov.cm_list); dev->sriov.sl_id_map = RB_ROOT; idr_init(&dev->sriov.pv_id_table); - idr_pre_get(&dev->sriov.pv_id_table, GFP_KERNEL); } /* slave = -1 ==> all slaves */ diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index c9eb6a6815c..1066eec854a 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -33,6 +33,7 @@ #include <linux/mlx4/cq.h> #include <linux/mlx4/qp.h> +#include <linux/mlx4/srq.h> #include <linux/slab.h> #include "mlx4_ib.h" @@ -66,7 +67,7 @@ static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type) static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n) { - return mlx4_buf_offset(&buf->buf, n * sizeof (struct mlx4_cqe)); + return mlx4_buf_offset(&buf->buf, n * buf->entry_size); } static void *get_cqe(struct mlx4_ib_cq *cq, int n) @@ -77,8 +78,9 @@ static void *get_cqe(struct mlx4_ib_cq *cq, int n) static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n) { struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe); - return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; } @@ -99,18 +101,19 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * { int err; - err = mlx4_buf_alloc(dev->dev, nent * sizeof(struct mlx4_cqe), - PAGE_SIZE * 2, &buf->buf); + err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, + PAGE_SIZE * 2, &buf->buf, GFP_KERNEL); if (err) goto out; + buf->entry_size = dev->dev->caps.cqe_size; err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift, &buf->mtt); if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf); + err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL); if (err) goto err_mtt; @@ -120,8 +123,7 @@ err_mtt: mlx4_mtt_cleanup(dev->dev, &buf->mtt); err_buf: - mlx4_buf_free(dev->dev, nent * sizeof(struct mlx4_cqe), - &buf->buf); + mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf); out: return err; @@ -129,7 +131,7 @@ out: static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe) { - mlx4_buf_free(dev->dev, (cqe + 1) * sizeof(struct mlx4_cqe), &buf->buf); + mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf); } static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context, @@ -137,8 +139,9 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *cont u64 buf_addr, int cqe) { int err; + int cqe_size = dev->dev->caps.cqe_size; - *umem = ib_umem_get(context, buf_addr, cqe * sizeof (struct mlx4_cqe), + *umem = ib_umem_get(context, buf_addr, cqe * cqe_size, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(*umem)) return PTR_ERR(*umem); @@ -206,7 +209,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector uar = &to_mucontext(context)->uar; } else { - err = mlx4_db_alloc(dev->dev, &cq->db, 1); + err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL); if (err) goto err_cq; @@ -226,7 +229,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector vector = dev->eq_table[vector % ibdev->num_comp_vectors]; err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, - cq->db.dma, &cq->mcq, vector, 0); + cq->db.dma, &cq->mcq, vector, 0, 0); if (err) goto err_dbmap; @@ -321,7 +324,7 @@ static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) u32 i; i = cq->mcq.cons_index; - while (get_sw_cqe(cq, i & cq->ibcq.cqe)) + while (get_sw_cqe(cq, i)) ++i; return i - cq->mcq.cons_index; @@ -331,16 +334,23 @@ static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) { struct mlx4_cqe *cqe, *new_cqe; int i; + int cqe_size = cq->buf.entry_size; + int cqe_inc = cqe_size == 64 ? 1 : 0; i = cq->mcq.cons_index; cqe = get_cqe(cq, i & cq->ibcq.cqe); + cqe += cqe_inc; + while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { new_cqe = get_cqe_from_buf(&cq->resize_buf->buf, (i + 1) & cq->resize_buf->cqe); - memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), sizeof(struct mlx4_cqe)); + memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size); + new_cqe += cqe_inc; + new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); cqe = get_cqe(cq, ++i & cq->ibcq.cqe); + cqe += cqe_inc; } ++cq->mcq.cons_index; } @@ -355,7 +365,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) mutex_lock(&cq->resize_mutex); - if (entries < 1 || entries > dev->dev->caps.max_cqes) { + if (entries < 1) { err = -EINVAL; goto out; } @@ -366,6 +376,11 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) goto out; } + if (entries > dev->dev->caps.max_cqes) { + err = -EINVAL; + goto out; + } + if (ibcq->uobject) { err = mlx4_alloc_resize_umem(dev, cq, entries, udata); if (err) @@ -438,6 +453,7 @@ err_buf: out: mutex_unlock(&cq->resize_mutex); + return err; } @@ -548,7 +564,7 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) } static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, - unsigned tail, struct mlx4_cqe *cqe) + unsigned tail, struct mlx4_cqe *cqe, int is_eth) { struct mlx4_ib_proxy_sqp_hdr *hdr; @@ -558,12 +574,20 @@ static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct DMA_FROM_DEVICE); hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index); - wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); - wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF; wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0; wc->dlid_path_bits = 0; + if (is_eth) { + wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid); + memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4); + memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); + wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); + } + return 0; } @@ -575,8 +599,10 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_qp *mqp; struct mlx4_ib_wq *wq; struct mlx4_ib_srq *srq; + struct mlx4_srq *msrq = NULL; int is_send; int is_error; + int is_eth; u32 g_mlpath_rqpn; u16 wqe_ctr; unsigned tail = 0; @@ -586,6 +612,9 @@ repoll: if (!cqe) return -EAGAIN; + if (cq->buf.entry_size == 64) + cqe++; + ++cq->mcq.cons_index; /* @@ -640,6 +669,20 @@ repoll: wc->qp = &(*cur_qp)->ibqp; + if (wc->qp->qp_type == IB_QPT_XRC_TGT) { + u32 srq_num; + g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); + srq_num = g_mlpath_rqpn & 0xffffff; + /* SRQ is also in the radix tree */ + msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev, + srq_num); + if (unlikely(!msrq)) { + pr_warn("CQ %06x with entry for unknown SRQN %06x\n", + cq->mcq.cqn, srq_num); + return -EINVAL; + } + } + if (is_send) { wq = &(*cur_qp)->sq; if (!(*cur_qp)->sq_signal_bits) { @@ -653,6 +696,11 @@ repoll: wqe_ctr = be16_to_cpu(cqe->wqe_index); wc->wr_id = srq->wrid[wqe_ctr]; mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else if (msrq) { + srq = to_mibsrq(msrq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); } else { wq = &(*cur_qp)->rq; tail = wq->tail & (wq->wqe_cnt - 1); @@ -739,11 +787,15 @@ repoll: break; } + is_eth = (rdma_port_get_link_layer(wc->qp->device, + (*cur_qp)->port) == + IB_LINK_LAYER_ETHERNET); if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) { if ((*cur_qp)->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) - return use_tunnel_data(*cur_qp, cq, wc, tail, cqe); + return use_tunnel_data(*cur_qp, cq, wc, tail, + cqe, is_eth); } wc->slid = be16_to_cpu(cqe->rlid); @@ -754,11 +806,21 @@ repoll: wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum) ? IB_WC_IP_CSUM_OK : 0; - if (rdma_port_get_link_layer(wc->qp->device, - (*cur_qp)->port) == IB_LINK_LAYER_ETHERNET) + if (is_eth) { wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; - else + if (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK) { + wc->vlan_id = be16_to_cpu(cqe->sl_vid) & + MLX4_CQE_VID_MASK; + } else { + wc->vlan_id = 0xffff; + } + memcpy(wc->smac, cqe->smac, ETH_ALEN); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; + wc->vlan_id = 0xffff; + } } return 0; @@ -807,6 +869,7 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) int nfreed = 0; struct mlx4_cqe *cqe, *dest; u8 owner_bit; + int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0; /* * First we need to find the current producer index, so we @@ -825,12 +888,16 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) */ while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe += cqe_inc; + if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index)); ++nfreed; } else if (nfreed) { dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest += cqe_inc; + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; memcpy(dest, cqe, sizeof *cqe); dest->owner_sr_opcode = owner_bit | diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c index 8aee4233b38..c5174098636 100644 --- a/drivers/infiniband/hw/mlx4/doorbell.c +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -45,7 +45,6 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, struct mlx4_db *db) { struct mlx4_ib_user_db_page *page; - struct ib_umem_chunk *chunk; int err = 0; mutex_lock(&context->db_page_mutex); @@ -73,8 +72,7 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, list_add(&page->list, &context->db_page_list); found: - chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list); - db->dma = sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK); + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); db->u.user_page = page; ++page->refcnt; diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 21a794152d1..287ad0564ac 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -93,7 +93,7 @@ static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, __be64 mlx4_ib_gen_node_guid(void) { #define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) - return cpu_to_be64(NODE_GUID_HI | random32()); + return cpu_to_be64(NODE_GUID_HI | prandom_u32()); } __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) @@ -409,38 +409,45 @@ int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid) } -static int get_pkey_phys_indices(struct mlx4_ib_dev *ibdev, u8 port, u8 ph_pkey_ix, - u8 *full_pk_ix, u8 *partial_pk_ix, - int *is_full_member) +static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave, + u8 port, u16 pkey, u16 *ix) { - u16 search_pkey; - int fm; - int err = 0; - u16 pk; + int i, ret; + u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF; + u16 slot_pkey; - err = ib_get_cached_pkey(&ibdev->ib_dev, port, ph_pkey_ix, &search_pkey); - if (err) - return err; + if (slave == mlx4_master_func_num(dev->dev)) + return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix); - fm = (search_pkey & 0x8000) ? 1 : 0; - if (fm) { - *full_pk_ix = ph_pkey_ix; - search_pkey &= 0x7FFF; - } else { - *partial_pk_ix = ph_pkey_ix; - search_pkey |= 0x8000; - } + unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1; - if (ib_find_exact_cached_pkey(&ibdev->ib_dev, port, search_pkey, &pk)) - pk = 0xFFFF; + for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) { + if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix) + continue; - if (fm) - *partial_pk_ix = (pk & 0xFF); - else - *full_pk_ix = (pk & 0xFF); + pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i]; - *is_full_member = fm; - return err; + ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey); + if (ret) + continue; + if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) { + if (slot_pkey & 0x8000) { + *ix = (u16) pkey_ix; + return 0; + } else { + /* take first partial pkey index found */ + if (partial_ix == 0xFF) + partial_ix = pkey_ix; + } + } + } + + if (partial_ix < 0xFF) { + *ix = (u16) partial_ix; + return 0; + } + + return -EINVAL; } int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, @@ -458,10 +465,9 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, unsigned tun_tx_ix = 0; int dqpn; int ret = 0; - int i; - int is_full_member = 0; u16 tun_pkey_ix; - u8 ph_pkey_ix, full_pk_ix = 0, partial_pk_ix = 0; + u16 cached_pkey; + u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; if (dest_qpt > IB_QPT_GSI) return -EINVAL; @@ -472,36 +478,22 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; - /* QP0 forwarding only for Dom0 */ - if (!dest_qpt && (mlx4_master_func_num(dev->dev) != slave)) - return -EINVAL; - if (!dest_qpt) tun_qp = &tun_ctx->qp[0]; else tun_qp = &tun_ctx->qp[1]; - /* compute pkey index for slave */ - /* get physical pkey -- virtualized Dom0 pkey to phys*/ + /* compute P_Key index to put in tunnel header for slave */ if (dest_qpt) { - ph_pkey_ix = - dev->pkeys.virt2phys_pkey[mlx4_master_func_num(dev->dev)][port - 1][wc->pkey_index]; - - /* now, translate this to the slave pkey index */ - ret = get_pkey_phys_indices(dev, port, ph_pkey_ix, &full_pk_ix, - &partial_pk_ix, &is_full_member); + u16 pkey_ix; + ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey); if (ret) return -EINVAL; - for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) { - if ((dev->pkeys.virt2phys_pkey[slave][port - 1][i] == full_pk_ix) || - (is_full_member && - (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == partial_pk_ix))) - break; - } - if (i == dev->dev->caps.pkey_table_len[port]) + ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix); + if (ret) return -EINVAL; - tun_pkey_ix = i; + tun_pkey_ix = pkey_ix; } else tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; @@ -514,6 +506,10 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, * The driver will set the force loopback bit in post_send */ memset(&attr, 0, sizeof attr); attr.port_num = port; + if (is_eth) { + memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16); + attr.ah_flags = IB_AH_GRH; + } ah = ib_create_ah(tun_ctx->pd, &attr); if (IS_ERR(ah)) return -ENOMEM; @@ -545,11 +541,36 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, /* adjust tunnel data */ tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); - tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); - tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; + if (is_eth) { + u16 vlan = 0; + if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, + NULL)) { + /* VST mode */ + if (vlan != wc->vlan_id) + /* Packet vlan is not the VST-assigned vlan. + * Drop the packet. + */ + goto out; + else + /* Remove the vlan tag before forwarding + * the packet to the VF. + */ + vlan = 0xffff; + } else { + vlan = wc->vlan_id; + } + + tun_mad->hdr.sl_vid = cpu_to_be16(vlan); + memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); + memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); + } else { + tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); + tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + } + ib_dma_sync_single_for_device(&dev->ib_dev, tun_qp->tx_ring[tun_tx_ix].buf.map, sizeof (struct mlx4_rcv_tunnel_mad), @@ -585,6 +606,41 @@ static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, int err; int slave; u8 *slave_id; + int is_eth = 0; + + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + is_eth = 0; + else + is_eth = 1; + + if (is_eth) { + if (!(wc->wc_flags & IB_WC_GRH)) { + mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); + return -EINVAL; + } + if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { + mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); + return -EINVAL; + } + if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad)) + return 0; + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; + } /* Initially assume that this mad is for us */ slave = mlx4_master_func_num(dev->dev); @@ -607,6 +663,21 @@ static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, } /* Class-specific handling */ switch (mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + /* 255 indicates the dom0 */ + if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) { + if (!mlx4_vf_smi_enabled(dev->dev, slave, port)) + return -EPERM; + /* for a VF. drop unsolicited MADs */ + if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) { + mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n", + slave, mad->mad_hdr.mgmt_class, + mad->mad_hdr.method); + return -EINVAL; + } + } + break; case IB_MGMT_CLASS_SUBN_ADM: if (mlx4_ib_demux_sa_handler(ibdev, port, slave, (struct ib_sa_mad *) mad)) @@ -1081,8 +1152,9 @@ static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, - enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, - u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad) + enum ib_qp_type dest_qpt, u16 pkey_index, + u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, + u8 *s_mac, struct ib_mad *mad) { struct ib_sge list; struct ib_send_wr wr, *bad_wr; @@ -1104,10 +1176,6 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; - /* QP0 forwarding only for Dom0 */ - if (dest_qpt == IB_QPT_SMI && (mlx4_master_func_num(dev->dev) != slave)) - return -EINVAL; - if (dest_qpt == IB_QPT_SMI) { src_qpnum = 0; sqp = &sqp_ctx->qp[0]; @@ -1171,6 +1239,9 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, wr.num_sge = 1; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; + if (s_mac) + memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); + ret = ib_post_send(send_qp, &wr, &bad_wr); out: @@ -1179,6 +1250,22 @@ out: return ret; } +static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + return slave; + return mlx4_get_base_gid_ix(dev->dev, slave, port); +} + +static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, + struct ib_ah_attr *ah_attr) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + ah_attr->grh.sgid_index = slave; + else + ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port); +} + static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) { struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); @@ -1189,6 +1276,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc struct ib_ah_attr ah_attr; u8 *slave_id; int slave; + int port; /* Get slave that sent this packet */ if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn || @@ -1204,11 +1292,6 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc "belongs to another slave\n", wc->src_qp); return; } - if (slave != mlx4_master_func_num(dev->dev) && !(wc->src_qp & 0x2)) { - mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " - "non-master trying to send QP0 packets\n", wc->src_qp); - return; - } /* Map transaction ID */ ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, @@ -1236,6 +1319,12 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc /* Class-specific handling */ switch (tunnel->mad.mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + if (slave != mlx4_master_func_num(dev->dev) && + !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port)) + return; + break; case IB_MGMT_CLASS_SUBN_ADM: if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, (struct ib_sa_mad *) &tunnel->mad)) @@ -1265,12 +1354,18 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av)); ah.ibah.device = ctx->ib_dev; mlx4_ib_query_ah(&ah.ibah, &ah_attr); - if ((ah_attr.ah_flags & IB_AH_GRH) && - (ah_attr.grh.sgid_index != slave)) { - mlx4_ib_warn(ctx->ib_dev, "slave:%d accessed invalid sgid_index:%d\n", - slave, ah_attr.grh.sgid_index); + if (ah_attr.ah_flags & IB_AH_GRH) + fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr); + + port = mlx4_slave_convert_port(dev->dev, slave, ah_attr.port_num); + if (port < 0) return; - } + ah_attr.port_num = port; + memcpy(ah_attr.dmac, tunnel->hdr.mac, 6); + ah_attr.vlan_id = be16_to_cpu(tunnel->hdr.vlan); + /* if slave have default vlan use it */ + mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, + &ah_attr.vlan_id, &ah_attr.sl); mlx4_ib_send_to_wire(dev, slave, ctx->port, is_proxy_qp0(dev, wc->src_qp, slave) ? @@ -1278,7 +1373,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc be16_to_cpu(tunnel->hdr.pkey_index), be32_to_cpu(tunnel->hdr.remote_qpn), be32_to_cpu(tunnel->hdr.qkey), - &ah_attr, &tunnel->mad); + &ah_attr, wc->smac, &tunnel->mad); } static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, @@ -1516,8 +1611,14 @@ static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, memset(&attr, 0, sizeof attr); attr.qp_state = IB_QPS_INIT; - attr.pkey_index = - to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; + ret = 0; + if (create_tun) + ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave, + ctx->port, IB_DEFAULT_PKEY_FULL, + &attr.pkey_index); + if (ret || !create_tun) + attr.pkey_index = + to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; attr.qkey = IB_QP1_QKEY; attr.port_num = ctx->port; ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT); @@ -1656,9 +1757,9 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port, return -EEXIST; ctx->state = DEMUX_PV_STATE_STARTING; - /* have QP0 only on port owner, and only if link layer is IB */ - if (ctx->slave == mlx4_master_func_num(to_mdev(ctx->ib_dev)->dev) && - rdma_port_get_link_layer(ibdev, ctx->port) == IB_LINK_LAYER_INFINIBAND) + /* have QP0 only if link layer is IB */ + if (rdma_port_get_link_layer(ibdev, ctx->port) == + IB_LINK_LAYER_INFINIBAND) ctx->has_smi = 1; if (ctx->has_smi) { @@ -1849,7 +1950,15 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, ctx->port = port; ctx->ib_dev = &dev->ib_dev; - for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + for (i = 0; + i < min(dev->dev->caps.sqp_demux, (u16)(dev->dev->num_vfs + 1)); + i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev->dev, i); + + if (!test_bit(port - 1, actv_ports.ports)) + continue; + ret = alloc_pv_object(dev, i, port, &ctx->tun[i]); if (ret) { ret = -ENOMEM; @@ -2004,16 +2113,17 @@ int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev) goto demux_err; err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1); if (err) - goto demux_err; + goto free_pv; } mlx4_ib_master_tunnels(dev, 1); return 0; +free_pv: + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); demux_err: - while (i > 0) { + while (--i >= 0) { free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); - --i; } mlx4_ib_device_unregister_sysfs(dev); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 718ec6b2bad..0f7027e7db1 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -39,6 +39,8 @@ #include <linux/inetdevice.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> +#include <net/ipv6.h> +#include <net/addrconf.h> #include <rdma/ib_smi.h> #include <rdma/ib_user_verbs.h> @@ -46,13 +48,17 @@ #include <linux/mlx4/driver.h> #include <linux/mlx4/cmd.h> +#include <linux/mlx4/qp.h> #include "mlx4_ib.h" #include "user.h" #define DRV_NAME MLX4_IB_DRV_NAME -#define DRV_VERSION "1.0" -#define DRV_RELDATE "April 4, 2008" +#define DRV_VERSION "2.2-1" +#define DRV_RELDATE "Feb 2014" + +#define MLX4_IB_FLOW_MAX_PRIO 0xFFF +#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); @@ -88,6 +94,31 @@ static void init_query_mad(struct ib_smp *mad) static union ib_gid zgid; +static int check_flow_steering_support(struct mlx4_dev *dev) +{ + int eth_num_ports = 0; + int ib_num_ports = 0; + + int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; + + if (dmfs) { + int i; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + eth_num_ports++; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + dmfs &= (!ib_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && + (!eth_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); + if (ib_num_ports && mlx4_is_mfunc(dev)) { + pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n"); + dmfs = 0; + } + } + return dmfs; +} + static int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { @@ -137,6 +168,16 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; + else + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; + if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + } props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; @@ -146,18 +187,18 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; - props->max_qp = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps; + props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); - props->max_cq = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs; + props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; - props->max_mr = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws; + props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; - props->max_srq = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs; + props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; @@ -307,7 +348,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = IB_SPEED_QDR; - props->port_cap_flags = IB_PORT_CM_SUP; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; @@ -495,7 +536,6 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, if (IS_ERR(mailbox)) return 0; - memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); @@ -505,19 +545,16 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, return 0; } -static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, - u32 cap_mask) +static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, + u32 cap_mask) { struct mlx4_cmd_mailbox *mailbox; int err; - u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - memset(mailbox->buf, 0, 256); - if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); @@ -526,8 +563,8 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } - err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + err = mlx4_cmd(dev->dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; @@ -536,11 +573,20 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; struct ib_port_attr attr; u32 cap_mask; int err; - mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); + /* return OK if this is RoCE. CM calls ib_modify_port() regardless + * of whether port link layer is ETH or IB. For ETH ports, qkey + * violations and port capabilities are not meaningful. + */ + if (is_eth) + return 0; + + mutex_lock(&mdev->cap_mask_mutex); err = mlx4_ib_query_port(ibdev, port, &attr); if (err) @@ -549,9 +595,9 @@ static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; - err = mlx4_SET_PORT(to_mdev(ibdev), port, - !!(mask & IB_PORT_RESET_QKEY_CNTR), - cap_mask); + err = mlx4_ib_SET_PORT(mdev, port, + !!(mask & IB_PORT_RESET_QKEY_CNTR), + cap_mask); out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); @@ -563,15 +609,24 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_ucontext *context; + struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) return ERR_PTR(-EAGAIN); - resp.qp_tab_size = dev->dev->caps.num_qps; - resp.bf_reg_size = dev->dev->caps.bf_reg_size; - resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + resp_v3.qp_tab_size = dev->dev->caps.num_qps; + resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; + resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + } else { + resp.dev_caps = dev->dev->caps.userspace_caps; + resp.qp_tab_size = dev->dev->caps.num_qps; + resp.bf_reg_size = dev->dev->caps.bf_reg_size; + resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + resp.cqe_size = dev->dev->caps.cqe_size; + } context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) @@ -586,7 +641,11 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); - err = ib_copy_to_udata(udata, &resp, sizeof resp); + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) + err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); + else + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); kfree(context); @@ -746,7 +805,6 @@ static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid) { - u8 mac[6]; struct net_device *ndev; int ret = 0; @@ -760,11 +818,7 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, spin_unlock(&mdev->iboe.lock); if (ndev) { - rdma_get_mcast_mac((struct in6_addr *)gid, mac); - rtnl_lock(); - dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac); ret = 1; - rtnl_unlock(); dev_put(ndev); } @@ -777,6 +831,338 @@ struct mlx4_ib_steering { union ib_gid gid; }; +static int parse_flow_attr(struct mlx4_dev *dev, + u32 qp_num, + union ib_flow_spec *ib_spec, + struct _rule_hw *mlx4_spec) +{ + enum mlx4_net_trans_rule_id type; + + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + type = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, + ETH_ALEN); + memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, + ETH_ALEN); + mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; + mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; + break; + case IB_FLOW_SPEC_IB: + type = MLX4_NET_TRANS_RULE_ID_IB; + mlx4_spec->ib.l3_qpn = + cpu_to_be32(qp_num); + mlx4_spec->ib.qpn_mask = + cpu_to_be32(MLX4_IB_FLOW_QPN_MASK); + break; + + + case IB_FLOW_SPEC_IPV4: + type = MLX4_NET_TRANS_RULE_ID_IPV4; + mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; + mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; + mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; + mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; + break; + + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + type = ib_spec->type == IB_FLOW_SPEC_TCP ? + MLX4_NET_TRANS_RULE_ID_TCP : + MLX4_NET_TRANS_RULE_ID_UDP; + mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; + mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port; + mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; + mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port; + break; + + default: + return -EINVAL; + } + if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 || + mlx4_hw_rule_sz(dev, type) < 0) + return -EINVAL; + mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type)); + mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2; + return mlx4_hw_rule_sz(dev, type); +} + +struct default_rules { + __u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u8 link_layer; +}; +static const struct default_rules default_table[] = { + { + .mandatory_fields = {IB_FLOW_SPEC_IPV4}, + .mandatory_not_fields = {IB_FLOW_SPEC_ETH}, + .rules_create_list = {IB_FLOW_SPEC_IB}, + .link_layer = IB_LINK_LAYER_INFINIBAND + } +}; + +static int __mlx4_ib_default_rules_match(struct ib_qp *qp, + struct ib_flow_attr *flow_attr) +{ + int i, j, k; + void *ib_flow; + const struct default_rules *pdefault_rules = default_table; + u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port); + + for (i = 0; i < sizeof(default_table)/sizeof(default_table[0]); i++, + pdefault_rules++) { + __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS]; + memset(&field_types, 0, sizeof(field_types)); + + if (link_layer != pdefault_rules->link_layer) + continue; + + ib_flow = flow_attr + 1; + /* we assume the specs are sorted */ + for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS && + j < flow_attr->num_of_specs; k++) { + union ib_flow_spec *current_flow = + (union ib_flow_spec *)ib_flow; + + /* same layer but different type */ + if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) == + (pdefault_rules->mandatory_fields[k] & + IB_FLOW_SPEC_LAYER_MASK)) && + (current_flow->type != + pdefault_rules->mandatory_fields[k])) + goto out; + + /* same layer, try match next one */ + if (current_flow->type == + pdefault_rules->mandatory_fields[k]) { + j++; + ib_flow += + ((union ib_flow_spec *)ib_flow)->size; + } + } + + ib_flow = flow_attr + 1; + for (j = 0; j < flow_attr->num_of_specs; + j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size) + for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++) + /* same layer and same type */ + if (((union ib_flow_spec *)ib_flow)->type == + pdefault_rules->mandatory_not_fields[k]) + goto out; + + return i; + } +out: + return -1; +} + +static int __mlx4_ib_create_default_rules( + struct mlx4_ib_dev *mdev, + struct ib_qp *qp, + const struct default_rules *pdefault_rules, + struct _rule_hw *mlx4_spec) { + int size = 0; + int i; + + for (i = 0; i < sizeof(pdefault_rules->rules_create_list)/ + sizeof(pdefault_rules->rules_create_list[0]); i++) { + int ret; + union ib_flow_spec ib_spec; + switch (pdefault_rules->rules_create_list[i]) { + case 0: + /* no rule */ + continue; + case IB_FLOW_SPEC_IB: + ib_spec.type = IB_FLOW_SPEC_IB; + ib_spec.size = sizeof(struct ib_flow_spec_ib); + + break; + default: + /* invalid rule */ + return -EINVAL; + } + /* We must put empty rule, qpn is being ignored */ + ret = parse_flow_attr(mdev->dev, 0, &ib_spec, + mlx4_spec); + if (ret < 0) { + pr_info("invalid parsing\n"); + return -EINVAL; + } + + mlx4_spec = (void *)mlx4_spec + ret; + size += ret; + } + return size; +} + +static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, + int domain, + enum mlx4_net_trans_promisc_mode flow_type, + u64 *reg_id) +{ + int ret, i; + int size = 0; + void *ib_flow; + struct mlx4_ib_dev *mdev = to_mdev(qp->device); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + int default_flow; + + static const u16 __mlx4_domain[] = { + [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, + [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, + [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, + [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, + }; + + if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { + pr_err("Invalid priority value %d\n", flow_attr->priority); + return -EINVAL; + } + + if (domain >= IB_FLOW_DOMAIN_NUM) { + pr_err("Invalid domain value %d\n", domain); + return -EINVAL; + } + + if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + ctrl = mailbox->buf; + + ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | + flow_attr->priority); + ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type); + ctrl->port = flow_attr->port; + ctrl->qpn = cpu_to_be32(qp->qp_num); + + ib_flow = flow_attr + 1; + size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); + /* Add default flows */ + default_flow = __mlx4_ib_default_rules_match(qp, flow_attr); + if (default_flow >= 0) { + ret = __mlx4_ib_create_default_rules( + mdev, qp, default_table + default_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + size += ret; + } + for (i = 0; i < flow_attr->num_of_specs; i++) { + ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + ib_flow += ((union ib_flow_spec *) ib_flow)->size; + size += ret; + } + + ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (ret == -ENOMEM) + pr_err("mcg table is full. Fail to register network rule.\n"); + else if (ret == -ENXIO) + pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); + else if (ret) + pr_err("Invalid argumant. Fail to register network rule.\n"); + + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return ret; +} + +static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) +{ + int err; + err = mlx4_cmd(dev, reg_id, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + pr_err("Fail to detach network rule. registration id = 0x%llx\n", + reg_id); + return err; +} + +static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + int err = 0, i = 0; + struct mlx4_ib_flow *mflow; + enum mlx4_net_trans_promisc_mode type[2]; + + memset(type, 0, sizeof(type)); + + mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); + if (!mflow) { + err = -ENOMEM; + goto err_free; + } + + switch (flow_attr->type) { + case IB_FLOW_ATTR_NORMAL: + type[0] = MLX4_FS_REGULAR; + break; + + case IB_FLOW_ATTR_ALL_DEFAULT: + type[0] = MLX4_FS_ALL_DEFAULT; + break; + + case IB_FLOW_ATTR_MC_DEFAULT: + type[0] = MLX4_FS_MC_DEFAULT; + break; + + case IB_FLOW_ATTR_SNIFFER: + type[0] = MLX4_FS_UC_SNIFFER; + type[1] = MLX4_FS_MC_SNIFFER; + break; + + default: + err = -EINVAL; + goto err_free; + } + + while (i < ARRAY_SIZE(type) && type[i]) { + err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], + &mflow->reg_id[i]); + if (err) + goto err_free; + i++; + } + + return &mflow->ibflow; + +err_free: + kfree(mflow); + return ERR_PTR(err); +} + +static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) +{ + int err, ret = 0; + int i = 0; + struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); + struct mlx4_ib_flow *mflow = to_mflow(flow_id); + + while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) { + err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]); + if (err) + ret = err; + i++; + } + + kfree(mflow); + return ret; +} + static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err; @@ -784,6 +1170,8 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct mlx4_ib_qp *mqp = to_mqp(ibqp); u64 reg_id; struct mlx4_ib_steering *ib_steering = NULL; + enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? + MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { @@ -795,7 +1183,7 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), - MLX4_PROT_IB_IPV6, ®_id); + prot, ®_id); if (err) goto err_malloc; @@ -814,7 +1202,7 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) err_add: mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, - MLX4_PROT_IB_IPV6, reg_id); + prot, reg_id); err_malloc: kfree(ib_steering); @@ -842,10 +1230,11 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); - u8 mac[6]; struct net_device *ndev; struct mlx4_ib_gid_entry *ge; u64 reg_id = 0; + enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? + MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { @@ -868,7 +1257,7 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) } err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, - MLX4_PROT_IB_IPV6, reg_id); + prot, reg_id); if (err) return err; @@ -880,13 +1269,8 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); - rdma_get_mcast_mac((struct in6_addr *)gid, mac); - if (ndev) { - rtnl_lock(); - dev_mc_del(mdev->iboe.netdevs[ge->port - 1], mac); - rtnl_unlock(); + if (ndev) dev_put(ndev); - } list_del(&ge->list); kfree(ge); } else @@ -982,7 +1366,8 @@ static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_board_id }; -static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev) +static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, + struct net_device *dev) { memcpy(eui, dev->dev_addr, 3); memcpy(eui + 5, dev->dev_addr + 3, 3); @@ -1018,162 +1403,437 @@ static void update_gids_task(struct work_struct *work) MLX4_CMD_WRAPPED); if (err) pr_warn("set port command failed\n"); - else { - memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids); + else mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); + + mlx4_free_cmd_mailbox(dev, mailbox); + kfree(gw); +} + +static void reset_gids_task(struct work_struct *work) +{ + struct update_gid_work *gw = + container_of(work, struct update_gid_work, work); + struct mlx4_cmd_mailbox *mailbox; + union ib_gid *gids; + int err; + struct mlx4_dev *dev = gw->dev->dev; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + pr_warn("reset gid table failed\n"); + goto free; + } + + gids = mailbox->buf; + memcpy(gids, gw->gids, sizeof(gw->gids)); + + if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) == + IB_LINK_LAYER_ETHERNET) { + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | gw->port, + 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + pr_warn(KERN_WARNING + "set port %d command failed\n", gw->port); } mlx4_free_cmd_mailbox(dev, mailbox); +free: kfree(gw); } -static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) +static int update_gid_table(struct mlx4_ib_dev *dev, int port, + union ib_gid *gid, int clear, + int default_gid) { - struct net_device *ndev = dev->iboe.netdevs[port - 1]; struct update_gid_work *work; - struct net_device *tmp; int i; - u8 *hits; - int ret; - union ib_gid gid; - int free; - int found; int need_update = 0; - u16 vid; - - work = kzalloc(sizeof *work, GFP_ATOMIC); - if (!work) - return -ENOMEM; + int free = -1; + int found = -1; + int max_gids; - hits = kzalloc(128, GFP_ATOMIC); - if (!hits) { - ret = -ENOMEM; - goto out; - } - - rcu_read_lock(); - for_each_netdev_rcu(&init_net, tmp) { - if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) { - gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); - vid = rdma_vlan_dev_vlan_id(tmp); - mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev); - found = 0; - free = -1; - for (i = 0; i < 128; ++i) { - if (free < 0 && - !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) - free = i; - if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) { - hits[i] = 1; - found = 1; + if (default_gid) { + free = 0; + } else { + max_gids = dev->dev->caps.gid_table_len[port]; + for (i = 1; i < max_gids; ++i) { + if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid, + sizeof(*gid))) + found = i; + + if (clear) { + if (found >= 0) { + need_update = 1; + dev->iboe.gid_table[port - 1][found] = + zgid; break; } - } + } else { + if (found >= 0) + break; - if (!found) { - if (tmp == ndev && - (memcmp(&dev->iboe.gid_table[port - 1][0], - &gid, sizeof gid) || - !memcmp(&dev->iboe.gid_table[port - 1][0], - &zgid, sizeof gid))) { - dev->iboe.gid_table[port - 1][0] = gid; - ++need_update; - hits[0] = 1; - } else if (free >= 0) { - dev->iboe.gid_table[port - 1][free] = gid; - hits[free] = 1; - ++need_update; - } + if (free < 0 && + !memcmp(&dev->iboe.gid_table[port - 1][i], + &zgid, sizeof(*gid))) + free = i; } } } - rcu_read_unlock(); - for (i = 0; i < 128; ++i) - if (!hits[i]) { - if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) - ++need_update; - dev->iboe.gid_table[port - 1][i] = zgid; - } + if (found == -1 && !clear && free >= 0) { + dev->iboe.gid_table[port - 1][free] = *gid; + need_update = 1; + } - if (need_update) { - memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids); - INIT_WORK(&work->work, update_gids_task); - work->port = port; - work->dev = dev; - queue_work(wq, &work->work); - } else - kfree(work); + if (!need_update) + return 0; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids)); + INIT_WORK(&work->work, update_gids_task); + work->port = port; + work->dev = dev; + queue_work(wq, &work->work); - kfree(hits); return 0; +} -out: - kfree(work); - return ret; +static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev); } -static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event) + +static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port) { - switch (event) { - case NETDEV_UP: - case NETDEV_CHANGEADDR: - update_ipv6_gids(dev, port, 0); - break; + struct update_gid_work *work; - case NETDEV_DOWN: - update_ipv6_gids(dev, port, 1); - dev->iboe.netdevs[port - 1] = NULL; - } + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids)); + memset(work->gids, 0, sizeof(work->gids)); + INIT_WORK(&work->work, reset_gids_task); + work->dev = dev; + work->port = port; + queue_work(wq, &work->work); + return 0; } -static void netdev_added(struct mlx4_ib_dev *dev, int port) +static int mlx4_ib_addr_event(int event, struct net_device *event_netdev, + struct mlx4_ib_dev *ibdev, union ib_gid *gid) { - update_ipv6_gids(dev, port, 0); + struct mlx4_ib_iboe *iboe; + int port = 0; + struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ? + rdma_vlan_dev_real_dev(event_netdev) : + event_netdev; + union ib_gid default_gid; + + mlx4_make_default_gid(real_dev, &default_gid); + + if (!memcmp(gid, &default_gid, sizeof(*gid))) + return 0; + + if (event != NETDEV_DOWN && event != NETDEV_UP) + return 0; + + if ((real_dev != event_netdev) && + (event == NETDEV_DOWN) && + rdma_link_local_addr((struct in6_addr *)gid)) + return 0; + + iboe = &ibdev->iboe; + spin_lock(&iboe->lock); + + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) + if ((netif_is_bond_master(real_dev) && + (real_dev == iboe->masters[port - 1])) || + (!netif_is_bond_master(real_dev) && + (real_dev == iboe->netdevs[port - 1]))) + update_gid_table(ibdev, port, gid, + event == NETDEV_DOWN, 0); + + spin_unlock(&iboe->lock); + return 0; + } -static void netdev_removed(struct mlx4_ib_dev *dev, int port) +static u8 mlx4_ib_get_dev_port(struct net_device *dev, + struct mlx4_ib_dev *ibdev) { - update_ipv6_gids(dev, port, 1); + u8 port = 0; + struct mlx4_ib_iboe *iboe; + struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ? + rdma_vlan_dev_real_dev(dev) : dev; + + iboe = &ibdev->iboe; + + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) + if ((netif_is_bond_master(real_dev) && + (real_dev == iboe->masters[port - 1])) || + (!netif_is_bond_master(real_dev) && + (real_dev == iboe->netdevs[port - 1]))) + break; + + if ((port == 0) || (port > ibdev->dev->caps.num_ports)) + return 0; + else + return port; +} + +static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct mlx4_ib_dev *ibdev; + struct in_ifaddr *ifa = ptr; + union ib_gid gid; + struct net_device *event_netdev = ifa->ifa_dev->dev; + + ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid); + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet); + + mlx4_ib_addr_event(event, event_netdev, ibdev, &gid); + return NOTIFY_DONE; } -static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, +#if IS_ENABLED(CONFIG_IPV6) +static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; struct mlx4_ib_dev *ibdev; - struct net_device *oldnd; + struct inet6_ifaddr *ifa = ptr; + union ib_gid *gid = (union ib_gid *)&ifa->addr; + struct net_device *event_netdev = ifa->idev->dev; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6); + + mlx4_ib_addr_event(event, event_netdev, ibdev, gid); + return NOTIFY_DONE; +} +#endif + +#define MLX4_IB_INVALID_MAC ((u64)-1) +static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + int port) +{ + u64 new_smac = 0; + u64 release_mac = MLX4_IB_INVALID_MAC; + struct mlx4_ib_qp *qp; + + read_lock(&dev_base_lock); + new_smac = mlx4_mac_to_u64(dev->dev_addr); + read_unlock(&dev_base_lock); + + mutex_lock(&ibdev->qp1_proxy_lock[port - 1]); + qp = ibdev->qp1_proxy[port - 1]; + if (qp) { + int new_smac_index; + u64 old_smac = qp->pri.smac; + struct mlx4_update_qp_params update_params; + + if (new_smac == old_smac) + goto unlock; + + new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac); + + if (new_smac_index < 0) + goto unlock; + + update_params.smac_index = new_smac_index; + if (mlx4_update_qp(ibdev->dev, &qp->mqp, MLX4_UPDATE_QP_SMAC, + &update_params)) { + release_mac = new_smac; + goto unlock; + } + + qp->pri.smac = new_smac; + qp->pri.smac_index = new_smac_index; + + release_mac = old_smac; + } + +unlock: + mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); + if (release_mac != MLX4_IB_INVALID_MAC) + mlx4_unregister_mac(ibdev->dev, port, release_mac); +} + +static void mlx4_ib_get_dev_addr(struct net_device *dev, + struct mlx4_ib_dev *ibdev, u8 port) +{ + struct in_device *in_dev; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev; + union ib_gid *pgid; + struct inet6_ifaddr *ifp; +#endif + union ib_gid gid; + + + if ((port == 0) || (port > ibdev->dev->caps.num_ports)) + return; + + /* IPv4 gids */ + in_dev = in_dev_get(dev); + if (in_dev) { + for_ifa(in_dev) { + /*ifa->ifa_address;*/ + ipv6_addr_set_v4mapped(ifa->ifa_address, + (struct in6_addr *)&gid); + update_gid_table(ibdev, port, &gid, 0, 0); + } + endfor_ifa(in_dev); + in_dev_put(in_dev); + } +#if IS_ENABLED(CONFIG_IPV6) + /* IPv6 gids */ + in6_dev = in6_dev_get(dev); + if (in6_dev) { + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + pgid = (union ib_gid *)&ifp->addr; + update_gid_table(ibdev, port, pgid, 0, 0); + } + read_unlock_bh(&in6_dev->lock); + in6_dev_put(in6_dev); + } +#endif +} + +static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev, + struct net_device *dev, u8 port) +{ + union ib_gid gid; + mlx4_make_default_gid(dev, &gid); + update_gid_table(ibdev, port, &gid, 0, 1); +} + +static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) +{ + struct net_device *dev; + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + int i; + + for (i = 1; i <= ibdev->num_ports; ++i) + if (reset_gid_table(ibdev, i)) + return -1; + + read_lock(&dev_base_lock); + spin_lock(&iboe->lock); + + for_each_netdev(&init_net, dev) { + u8 port = mlx4_ib_get_dev_port(dev, ibdev); + if (port) + mlx4_ib_get_dev_addr(dev, ibdev, port); + } + + spin_unlock(&iboe->lock); + read_unlock(&dev_base_lock); + + return 0; +} + +static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + unsigned long event) + +{ struct mlx4_ib_iboe *iboe; + int update_qps_port = -1; int port; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); iboe = &ibdev->iboe; spin_lock(&iboe->lock); mlx4_foreach_ib_transport_port(port, ibdev->dev) { - oldnd = iboe->netdevs[port - 1]; + enum ib_port_state port_state = IB_PORT_NOP; + struct net_device *old_master = iboe->masters[port - 1]; + struct net_device *curr_netdev; + struct net_device *curr_master; + iboe->netdevs[port - 1] = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); - if (oldnd != iboe->netdevs[port - 1]) { - if (iboe->netdevs[port - 1]) - netdev_added(ibdev, port); - else - netdev_removed(ibdev, port); + if (iboe->netdevs[port - 1]) + mlx4_ib_set_default_gid(ibdev, + iboe->netdevs[port - 1], port); + curr_netdev = iboe->netdevs[port - 1]; + + if (iboe->netdevs[port - 1] && + netif_is_bond_slave(iboe->netdevs[port - 1])) { + iboe->masters[port - 1] = netdev_master_upper_dev_get( + iboe->netdevs[port - 1]); + } else { + iboe->masters[port - 1] = NULL; + } + curr_master = iboe->masters[port - 1]; + + if (dev == iboe->netdevs[port - 1] && + (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || + event == NETDEV_UP || event == NETDEV_CHANGE)) + update_qps_port = port; + + if (curr_netdev) { + port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + } else { + reset_gid_table(ibdev, port); + } + /* if using bonding/team and a slave port is down, we don't the bond IP + * based gids in the table since flows that select port by gid may get + * the down port. + */ + if (curr_master && (port_state == IB_PORT_DOWN)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + } + /* if bonding is used it is possible that we add it to masters + * only after IP address is assigned to the net bonding + * interface. + */ + if (curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + mlx4_ib_get_dev_addr(curr_master, ibdev, port); } - } - if (dev == iboe->netdevs[0] || - (iboe->netdevs[0] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[0])) - handle_en_event(ibdev, 1, event); - else if (dev == iboe->netdevs[1] - || (iboe->netdevs[1] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[1])) - handle_en_event(ibdev, 2, event); + if (!curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + mlx4_ib_get_dev_addr(curr_netdev, ibdev, port); + } + } spin_unlock(&iboe->lock); + if (update_qps_port > 0) + mlx4_ib_update_qps(ibdev, dev, update_qps_port); +} + +static int mlx4_ib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mlx4_ib_dev *ibdev; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); + mlx4_ib_scan_netdevs(ibdev, dev, event); + return NOTIFY_DONE; } @@ -1211,7 +1871,7 @@ static void init_pkeys(struct mlx4_ib_dev *ibdev) static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { - char name[32]; + char name[80]; int eq_per_port = 0; int added_eqs = 0; int total_eqs = 0; @@ -1241,8 +1901,8 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) eq = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { - sprintf(name, "mlx4-ib-%d-%d@%s", - i, j, dev->pdev->bus->name); + snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s", + i, j, dev->pdev->bus->name); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, NULL, &ibdev->eq_table[eq])) { @@ -1292,17 +1952,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) int i, j; int err; struct mlx4_ib_iboe *iboe; + int ib_num_ports = 0; pr_info_once("%s", mlx4_ib_version); - mlx4_foreach_non_ib_transport_port(i, dev) - num_ports++; - - if (mlx4_is_mfunc(dev) && num_ports) { - dev_err(&dev->pdev->dev, "RoCE is not supported over SRIOV as yet\n"); - return NULL; - } - num_ports = 0; mlx4_foreach_ib_transport_port(i, dev) num_ports++; @@ -1342,7 +1995,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->pdev->dev; - ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + if (dev->caps.userspace_caps) + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + else + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; + ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | @@ -1417,6 +2074,17 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; } + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || + dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; + ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw; + ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; + + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; @@ -1425,6 +2093,16 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } + if (check_flow_steering_support(dev)) { + ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED; + ibdev->ib_dev.create_flow = mlx4_ib_create_flow; + ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; + + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + } + mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); @@ -1433,20 +2111,53 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_map; for (i = 0; i < ibdev->num_ports; ++i) { + mutex_init(&ibdev->qp1_proxy_lock[i]); if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[i]); if (err) ibdev->counters[i] = -1; - } else - ibdev->counters[i] = -1; + } else { + ibdev->counters[i] = -1; + } } + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && + ib_num_ports) { + ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; + err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, + MLX4_IB_UC_STEER_QPN_ALIGN, + &ibdev->steer_qpn_base); + if (err) + goto err_counter; + + ibdev->ib_uc_qpns_bitmap = + kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * + sizeof(long), + GFP_KERNEL); + if (!ibdev->ib_uc_qpns_bitmap) { + dev_err(&dev->pdev->dev, "bit map alloc failed\n"); + goto err_steer_qp_release; + } + + bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); + + err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE( + dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_base + + ibdev->steer_qpn_count - 1); + if (err) + goto err_steer_free_bitmap; + } + if (ib_register_device(&ibdev->ib_dev, NULL)) - goto err_counter; + goto err_steer_free_bitmap; if (mlx4_ib_mad_init(ibdev)) goto err_reg; @@ -1454,11 +2165,39 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_init_sriov(ibdev)) goto err_mad; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) { - iboe->nb.notifier_call = mlx4_ib_netdev_event; - err = register_netdevice_notifier(&iboe->nb); - if (err) - goto err_sriov; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { + if (!iboe->nb.notifier_call) { + iboe->nb.notifier_call = mlx4_ib_netdev_event; + err = register_netdevice_notifier(&iboe->nb); + if (err) { + iboe->nb.notifier_call = NULL; + goto err_notif; + } + } + if (!iboe->nb_inet.notifier_call) { + iboe->nb_inet.notifier_call = mlx4_ib_inet_event; + err = register_inetaddr_notifier(&iboe->nb_inet); + if (err) { + iboe->nb_inet.notifier_call = NULL; + goto err_notif; + } + } +#if IS_ENABLED(CONFIG_IPV6) + if (!iboe->nb_inet6.notifier_call) { + iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event; + err = register_inet6addr_notifier(&iboe->nb_inet6); + if (err) { + iboe->nb_inet6.notifier_call = NULL; + goto err_notif; + } + } +#endif + for (i = 1 ; i <= ibdev->num_ports ; ++i) + reset_gid_table(ibdev, i); + rtnl_lock(); + mlx4_ib_scan_netdevs(ibdev, NULL, 0); + rtnl_unlock(); + mlx4_ib_init_gid_table(ibdev); } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { @@ -1484,11 +2223,25 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) return ibdev; err_notif: - if (unregister_netdevice_notifier(&ibdev->iboe.nb)) - pr_warn("failure unregistering notifier\n"); + if (ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } +#if IS_ENABLED(CONFIG_IPV6) + if (ibdev->iboe.nb_inet6.notifier_call) { + if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet6.notifier_call = NULL; + } +#endif flush_workqueue(wq); -err_sriov: mlx4_ib_close_sriov(ibdev); err_mad: @@ -1497,6 +2250,13 @@ err_mad: err_reg: ib_unregister_device(&ibdev->ib_dev); +err_steer_free_bitmap: + kfree(ibdev->ib_uc_qpns_bitmap); + +err_steer_qp_release: + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); err_counter: for (; i; --i) if (ibdev->counters[i - 1] != -1) @@ -1517,6 +2277,69 @@ err_dealloc: return NULL; } +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) +{ + int offset; + + WARN_ON(!dev->ib_uc_qpns_bitmap); + + offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, + dev->steer_qpn_count, + get_count_order(count)); + if (offset < 0) + return offset; + + *qpn = dev->steer_qpn_base + offset; + return 0; +} + +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) +{ + if (!qpn || + dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED) + return; + + BUG_ON(qpn < dev->steer_qpn_base); + + bitmap_release_region(dev->ib_uc_qpns_bitmap, + qpn - dev->steer_qpn_base, + get_count_order(count)); +} + +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach) +{ + int err; + size_t flow_size; + struct ib_flow_attr *flow = NULL; + struct ib_flow_spec_ib *ib_spec; + + if (is_attach) { + flow_size = sizeof(struct ib_flow_attr) + + sizeof(struct ib_flow_spec_ib); + flow = kzalloc(flow_size, GFP_KERNEL); + if (!flow) + return -ENOMEM; + flow->port = mqp->port; + flow->num_of_specs = 1; + flow->size = flow_size; + ib_spec = (struct ib_flow_spec_ib *)(flow + 1); + ib_spec->type = IB_FLOW_SPEC_IB; + ib_spec->size = sizeof(struct ib_flow_spec_ib); + /* Add an empty rule for IB L2 */ + memset(&ib_spec->mask, 0, sizeof(ib_spec->mask)); + + err = __mlx4_ib_create_flow(&mqp->ibqp, flow, + IB_FLOW_DOMAIN_NIC, + MLX4_FS_REGULAR, + &mqp->reg_id); + } else { + err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); + } + kfree(flow); + return err; +} + static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; @@ -1530,6 +2353,26 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } + + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); + kfree(ibdev->ib_uc_qpns_bitmap); + } + + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } +#if IS_ENABLED(CONFIG_IPV6) + if (ibdev->iboe.nb_inet6.notifier_call) { + if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet6.notifier_call = NULL; + } +#endif + iounmap(ibdev->uar_map); for (p = 0; p < ibdev->num_ports; ++p) if (ibdev->counters[p] != -1) @@ -1550,17 +2393,24 @@ static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) struct mlx4_dev *dev = ibdev->dev; int i; unsigned long flags; + struct mlx4_active_ports actv_ports; + unsigned int ports; + unsigned int first_port; if (!mlx4_is_master(dev)) return; - dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC); + actv_ports = mlx4_get_active_ports(dev, slave); + ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports); + first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); + + dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC); if (!dm) { pr_err("failed to allocate memory for tunneling qp update\n"); goto out; } - for (i = 0; i < dev->caps.num_ports; i++) { + for (i = 0; i < ports; i++) { dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); if (!dm[i]) { pr_err("failed to allocate memory for tunneling qp update work struct\n"); @@ -1572,9 +2422,9 @@ static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) } } /* initialize or tear down tunnel QPs for the slave */ - for (i = 0; i < dev->caps.num_ports; i++) { + for (i = 0; i < ports; i++) { INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); - dm[i]->port = i + 1; + dm[i]->port = first_port + i + 1; dm[i]->slave = slave; dm[i]->do_init = do_init; dm[i]->dev = ibdev; @@ -1584,8 +2434,7 @@ static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); } out: - if (dm) - kfree(dm); + kfree(dm); return; } diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c index 3c3b54c3fdd..ed327e6c8fd 100644 --- a/drivers/infiniband/hw/mlx4/mcg.c +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -215,8 +215,9 @@ static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) } mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); spin_unlock(&dev->sm_lock); - return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port, - IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, mad); + return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), + ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY, + &ah_attr, NULL, mad); } static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, @@ -233,7 +234,8 @@ static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); - wc.pkey_index = 0; + if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index)) + return -EINVAL; wc.sl = 0; wc.dlid_path_bits = 0; wc.port_num = ctx->port; @@ -1074,10 +1076,6 @@ static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy unsigned long end; int count; - if (ctx->flushing) - return; - - ctx->flushing = 1; for (i = 0; i < MAX_VFS; ++i) clean_vf_mcast(ctx, i); @@ -1107,9 +1105,6 @@ static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy force_clean_group(group); } mutex_unlock(&ctx->mcg_table_lock); - - if (!destroy_wq) - ctx->flushing = 0; } struct clean_work { @@ -1123,6 +1118,7 @@ static void mcg_clean_task(struct work_struct *work) struct clean_work *cw = container_of(work, struct clean_work, work); _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq); + cw->ctx->flushing = 0; kfree(cw); } @@ -1130,13 +1126,20 @@ void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) { struct clean_work *work; + if (ctx->flushing) + return; + + ctx->flushing = 1; + if (destroy_wq) { _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq); + ctx->flushing = 0; return; } work = kmalloc(sizeof *work, GFP_KERNEL); if (!work) { + ctx->flushing = 0; mcg_warn("failed allocating work for cleanup\n"); return; } diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index e04cbc9a54a..369da3ca5d6 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -68,6 +68,8 @@ enum { /*module param to indicate if SM assigns the alias_GUID*/ extern int mlx4_ib_sm_guid_assign; +#define MLX4_IB_UC_STEER_QPN_ALIGN 1 +#define MLX4_IB_UC_MAX_NUM_QPS 256 struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; struct mlx4_uar uar; @@ -90,6 +92,7 @@ struct mlx4_ib_xrcd { struct mlx4_ib_cq_buf { struct mlx4_buf buf; struct mlx4_mtt mtt; + int entry_size; }; struct mlx4_ib_cq_resize { @@ -115,6 +118,11 @@ struct mlx4_ib_mr { struct ib_umem *umem; }; +struct mlx4_ib_mw { + struct ib_mw ibmw; + struct mlx4_mw mmw; +}; + struct mlx4_ib_fast_reg_page_list { struct ib_fast_reg_page_list ibfrpl; __be64 *mapped_page_list; @@ -126,6 +134,12 @@ struct mlx4_ib_fmr { struct mlx4_fmr mfmr; }; +struct mlx4_ib_flow { + struct ib_flow ibflow; + /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */ + u64 reg_id[2]; +}; + struct mlx4_ib_wq { u64 *wrid; spinlock_t lock; @@ -141,6 +155,8 @@ struct mlx4_ib_wq { enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, + MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, MLX4_IB_SRIOV_SQP = 1 << 31, }; @@ -226,6 +242,22 @@ struct mlx4_ib_proxy_sqp_hdr { struct mlx4_rcv_tunnel_hdr tun; } __packed; +struct mlx4_roce_smac_vlan_info { + u64 smac; + int smac_index; + int smac_port; + u64 candidate_smac; + int candidate_smac_index; + int candidate_smac_port; + u16 vid; + int vlan_index; + int vlan_port; + u16 candidate_vid; + int candidate_vlan_index; + int candidate_vlan_port; + int update_vid; +}; + struct mlx4_ib_qp { struct ib_qp ibqp; struct mlx4_qp mqp; @@ -258,7 +290,9 @@ struct mlx4_ib_qp { struct list_head gid_list; struct list_head steering_rules; struct mlx4_ib_buf *sqp_proxy_rcv; - + struct mlx4_roce_smac_vlan_info pri; + struct mlx4_roce_smac_vlan_info alt; + u64 reg_id; }; struct mlx4_ib_srq { @@ -416,7 +450,10 @@ struct mlx4_ib_sriov { struct mlx4_ib_iboe { spinlock_t lock; struct net_device *netdevs[MLX4_MAX_PORTS]; + struct net_device *masters[MLX4_MAX_PORTS]; struct notifier_block nb; + struct notifier_block nb_inet; + struct notifier_block nb_inet6; union ib_gid gid_table[MLX4_MAX_PORTS][128]; }; @@ -482,6 +519,13 @@ struct mlx4_ib_dev { struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; struct pkey_mgt pkeys; + unsigned long *ib_uc_qpns_bitmap; + int steer_qpn_count; + int steer_qpn_base; + int steering_support; + struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS]; + /* lock when destroying qp1_proxy and getting netdev events */ + struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; }; struct ib_event_work { @@ -532,6 +576,11 @@ static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) return container_of(ibmr, struct mlx4_ib_mr, ibmr); } +static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx4_ib_mw, ibmw); +} + static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) { return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl); @@ -541,6 +590,12 @@ static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) { return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); } + +static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) +{ + return container_of(ibflow, struct mlx4_ib_flow, ibflow); +} + static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) { return container_of(ibqp, struct mlx4_ib_qp, ibqp); @@ -580,6 +635,10 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); int mlx4_ib_dereg_mr(struct ib_mr *mr); +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, @@ -648,15 +707,12 @@ int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view); -int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, - u8 *mac, int *is_mcast, u8 port); - -static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) +static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) { u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET) - return 1; + return true; return !!(ah->av.ib.g_slid & 0x80); } @@ -685,9 +741,12 @@ void mlx4_ib_tunnels_update_work(struct work_struct *work); int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type qpt, struct ib_wc *wc, struct ib_grh *grh, struct ib_mad *mad); + int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, - u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad); + u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, + struct ib_mad *mad); + __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, @@ -725,5 +784,9 @@ void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); __be64 mlx4_ib_gen_node_guid(void); +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach); #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index bbaf6176f20..cb2a8727f3f 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -41,9 +41,19 @@ static u32 convert_access(int acc) (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) | (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) | MLX4_PERM_LOCAL_READ; } +static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type) +{ + switch (type) { + case IB_MW_TYPE_1: return MLX4_MW_TYPE_1; + case IB_MW_TYPE_2: return MLX4_MW_TYPE_2; + default: return -1; + } +} + struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) { struct mlx4_ib_mr *mr; @@ -68,7 +78,7 @@ struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) return &mr->ibmr; err_mr: - mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); err_free: kfree(mr); @@ -80,11 +90,11 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem) { u64 *pages; - struct ib_umem_chunk *chunk; - int i, j, k; + int i, k, entry; int n; int len; int err = 0; + struct scatterlist *sg; pages = (u64 *) __get_free_page(GFP_KERNEL); if (!pages) @@ -92,26 +102,25 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, i = n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> mtt->page_shift; - for (k = 0; k < len; ++k) { - pages[i++] = sg_dma_address(&chunk->page_list[j]) + - umem->page_size * k; - /* - * Be friendly to mlx4_write_mtt() and - * pass it chunks of appropriate size. - */ - if (i == PAGE_SIZE / sizeof (u64)) { - err = mlx4_write_mtt(dev->dev, mtt, n, - i, pages); - if (err) - goto out; - n += i; - i = 0; - } + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> mtt->page_shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(sg) + + umem->page_size * k; + /* + * Be friendly to mlx4_write_mtt() and + * pass it chunks of appropriate size. + */ + if (i == PAGE_SIZE / sizeof (u64)) { + err = mlx4_write_mtt(dev->dev, mtt, n, + i, pages); + if (err) + goto out; + n += i; + i = 0; } } + } if (i) err = mlx4_write_mtt(dev->dev, mtt, n, i, pages); @@ -163,7 +172,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return &mr->ibmr; err_mr: - mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); err_umem: ib_umem_release(mr->umem); @@ -177,8 +186,11 @@ err_free: int mlx4_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx4_ib_mr *mr = to_mmr(ibmr); + int ret; - mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); + ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); + if (ret) + return ret; if (mr->umem) ib_umem_release(mr->umem); kfree(mr); @@ -186,6 +198,70 @@ int mlx4_ib_dereg_mr(struct ib_mr *ibmr) return 0; } +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mw *mw; + int err; + + mw = kmalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + return ERR_PTR(-ENOMEM); + + err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, + to_mlx4_type(type), &mw->mmw); + if (err) + goto err_free; + + err = mlx4_mw_enable(dev->dev, &mw->mmw); + if (err) + goto err_mw; + + mw->ibmw.rkey = mw->mmw.key; + + return &mw->ibmw; + +err_mw: + mlx4_mw_free(dev->dev, &mw->mmw); + +err_free: + kfree(mw); + + return ERR_PTR(err); +} + +int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + struct ib_send_wr wr; + struct ib_send_wr *bad_wr; + int ret; + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_BIND_MW; + wr.wr_id = mw_bind->wr_id; + wr.send_flags = mw_bind->send_flags; + wr.wr.bind_mw.mw = mw; + wr.wr.bind_mw.bind_info = mw_bind->bind_info; + wr.wr.bind_mw.rkey = ib_inc_rkey(mw->rkey); + + ret = mlx4_ib_post_send(qp, &wr, &bad_wr); + if (!ret) + mw->rkey = wr.wr.bind_mw.rkey; + + return ret; +} + +int mlx4_ib_dealloc_mw(struct ib_mw *ibmw) +{ + struct mlx4_ib_mw *mw = to_mmw(ibmw); + + mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw); + kfree(mw); + + return 0; +} + struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) { @@ -212,7 +288,7 @@ struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, return &mr->ibmr; err_mr: - mlx4_mr_free(dev->dev, &mr->mmr); + (void) mlx4_mr_free(dev->dev, &mr->mmr); err_free: kfree(mr); @@ -291,7 +367,7 @@ struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, return &fmr->ibfmr; err_mr: - mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); err_free: kfree(fmr); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 19e0637220b..67780452f0c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -90,6 +90,21 @@ enum { MLX4_RAW_QP_MSGMAX = 31, }; +#ifndef ETH_ALEN +#define ETH_ALEN 6 +#endif +static inline u64 mlx4_mac_to_u64(u8 *addr) +{ + u64 mac = 0; + int i; + + for (i = 0; i < ETH_ALEN; i++) { + mac <<= 8; + mac |= addr[i]; + } + return mac; +} + static const __be32 mlx4_ib_opcode[] = { [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), @@ -104,6 +119,7 @@ static const __be32 mlx4_ib_opcode[] = { [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), + [IB_WR_BIND_MW] = cpu_to_be32(MLX4_OPCODE_BIND_MW), }; static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) @@ -592,9 +608,20 @@ static int qp_has_rq(struct ib_qp_init_attr *attr) return !attr->srq; } +static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i]) + return !!dev->caps.qp0_qkey[i]; + } + return 0; +} + static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp, + gfp_t gfp) { int qpn; int err; @@ -609,10 +636,13 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { if (init_attr->qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_PROXY_GSI; - else if (mlx4_is_master(dev->dev)) - qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; - else - qp_type = MLX4_IB_QPT_PROXY_SMI; + else { + if (mlx4_is_master(dev->dev) || + qp0_enabled_vf(dev->dev, sqpn)) + qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_PROXY_SMI; + } } qpn = sqpn; /* add extra sg entry for tunneling */ @@ -627,7 +657,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, return -EINVAL; if (tnl_init->proxy_qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_TUN_GSI; - else if (tnl_init->slave == mlx4_master_func_num(dev->dev)) + else if (tnl_init->slave == mlx4_master_func_num(dev->dev) || + mlx4_vf_smi_enabled(dev->dev, tnl_init->slave, + tnl_init->port)) qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; else qp_type = MLX4_IB_QPT_TUN_SMI; @@ -642,14 +674,18 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { - sqp = kzalloc(sizeof (struct mlx4_ib_sqp), GFP_KERNEL); + sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp); if (!sqp) return -ENOMEM; qp = &sqp->qp; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; } else { - qp = kzalloc(sizeof (struct mlx4_ib_qp), GFP_KERNEL); + qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp); if (!qp) return -ENOMEM; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; } } else qp = *caller_qp; @@ -715,19 +751,27 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) qp->flags |= MLX4_IB_QP_LSO; + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (dev->steering_support == + MLX4_STEERING_MODE_DEVICE_MANAGED) + qp->flags |= MLX4_IB_QP_NETIF; + else + goto err; + } + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); if (err) goto err; if (qp_has_rq(init_attr)) { - err = mlx4_db_alloc(dev->dev, &qp->db, 0); + err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp); if (err) goto err; *qp->db.db = 0; } - if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) { err = -ENOMEM; goto err_db; } @@ -737,13 +781,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp); if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL); - + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -764,12 +807,16 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->qp_type == IB_QPT_RAW_PACKET) err = mlx4_qp_reserve_range(dev->dev, 1, 1 << 8, &qpn); else - err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn); + if (qp->flags & MLX4_IB_QP_NETIF) + err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); + else + err = mlx4_qp_reserve_range(dev->dev, 1, 1, + &qpn); if (err) goto err_proxy; } - err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp); if (err) goto err_qpn; @@ -789,8 +836,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, return 0; err_qpn: - if (!sqpn) - mlx4_qp_release_range(dev->dev, qpn, 1); + if (!sqpn) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qpn, 1); + else + mlx4_qp_release_range(dev->dev, qpn, 1); + } err_proxy: if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) free_proxy_bufs(pd->device, qp); @@ -908,11 +959,32 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, { struct mlx4_ib_cq *send_cq, *recv_cq; - if (qp->state != IB_QPS_RESET) + if (qp->state != IB_QPS_RESET) { if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) pr_warn("modify QP %06x to RESET failed.\n", qp->mqp.qpn); + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } get_cqs(qp, &send_cq, &recv_cq); @@ -931,8 +1003,12 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_qp_free(dev->dev, &qp->mqp); - if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) - mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); + else + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + } mlx4_mtt_cleanup(dev->dev, &qp->mtt); @@ -979,19 +1055,30 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct mlx4_ib_qp *qp = NULL; int err; u16 xrcdn = 0; + gfp_t gfp; + gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ? + GFP_NOIO : GFP_KERNEL; /* * We only support LSO, vendor flag1, and multicast loopback blocking, * and only for kernel UD QPs. */ if (init_attr->create_flags & ~(MLX4_IB_QP_LSO | MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | - MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP)) + MLX4_IB_SRIOV_TUNNEL_QP | + MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_NETIF | + MLX4_IB_QP_CREATE_USE_GFP_NOIO)) return ERR_PTR(-EINVAL); + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (init_attr->qp_type != IB_QPT_UD) + return ERR_PTR(-EINVAL); + } + if (init_attr->create_flags && (udata || - ((init_attr->create_flags & ~MLX4_IB_SRIOV_SQP) && + ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) && init_attr->qp_type != IB_QPT_UD) || ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) && init_attr->qp_type > IB_QPT_GSI))) @@ -1011,14 +1098,16 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_RAW_PACKET: - qp = kzalloc(sizeof *qp, GFP_KERNEL); + qp = kzalloc(sizeof *qp, gfp); if (!qp) return ERR_PTR(-ENOMEM); + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; /* fall through */ case IB_QPT_UD: { err = create_qp_common(to_mdev(pd->device), pd, init_attr, - udata, 0, &qp); + udata, 0, &qp, gfp); if (err) return ERR_PTR(err); @@ -1036,7 +1125,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, get_sqp_num(to_mdev(pd->device), init_attr), - &qp); + &qp, gfp); if (err) return ERR_PTR(err); @@ -1062,6 +1151,12 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp) if (is_qp0(dev, mqp)) mlx4_CLOSE_PORT(dev->dev, mqp->port); + if (dev->qp1_proxy[mqp->port - 1] == mqp) { + mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]); + dev->qp1_proxy[mqp->port - 1] = NULL; + mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]); + } + pd = get_pd(mqp); destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); @@ -1143,16 +1238,16 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6); } -static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, - struct mlx4_qp_path *path, u8 port) +static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, + u64 smac, u16 vlan_tag, struct mlx4_qp_path *path, + struct mlx4_roce_smac_vlan_info *smac_info, u8 port) { - int err; int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_ETHERNET; - u8 mac[6]; - int is_mcast; - u16 vlan_tag; int vidx; + int smac_index; + int err; + path->grh_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); @@ -1181,36 +1276,105 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, } if (is_eth) { - path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | - ((port - 1) << 6) | ((ah->sl & 7) << 3); - if (!(ah->ah_flags & IB_AH_GRH)) return -1; - err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port); - if (err) - return err; - - memcpy(path->dmac, mac, 6); - path->ackto = MLX4_IB_LINK_TYPE_ETH; - /* use index 0 into MAC table for IBoE */ - path->grh_mylmc &= 0x80; + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 7) << 3); - vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]); + path->feup |= MLX4_FEUP_FORCE_ETH_UP; if (vlan_tag < 0x1000) { - if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx)) - return -ENOENT; - - path->vlan_index = vidx; + if (smac_info->vid < 0x1000) { + /* both valid vlan ids */ + if (smac_info->vid != vlan_tag) { + /* different VIDs. unreg old and reg new */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } else { + path->vlan_index = smac_info->vlan_index; + } + } else { + /* no current vlan tag in qp */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } + path->feup |= MLX4_FVL_FORCE_ETH_VLAN; path->fl = 1 << 6; + } else { + /* have current vlan tag. unregister it at modify-qp success */ + if (smac_info->vid < 0x1000) { + smac_info->candidate_vid = 0xFFFF; + smac_info->update_vid = 1; + } } - } else + + /* get smac_index for RoCE use. + * If no smac was yet assigned, register one. + * If one was already assigned, but the new mac differs, + * unregister the old one and register the new one. + */ + if (!smac_info->smac || smac_info->smac != smac) { + /* register candidate now, unreg if needed, after success */ + smac_index = mlx4_register_mac(dev->dev, port, smac); + if (smac_index >= 0) { + smac_info->candidate_smac_index = smac_index; + smac_info->candidate_smac = smac; + smac_info->candidate_smac_port = port; + } else { + return -EINVAL; + } + } else { + smac_index = smac_info->smac_index; + } + + memcpy(path->dmac, ah->dmac, 6); + path->ackto = MLX4_IB_LINK_TYPE_ETH; + /* put MAC table smac index for IBoE */ + path->grh_mylmc = (u8) (smac_index) | 0x80; + } else { path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6) | ((ah->sl & 0xf) << 2); + } return 0; } +static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->ah_attr, + mlx4_mac_to_u64((u8 *)qp->smac), + (qp_attr_mask & IB_QP_VID) ? qp->vlan_id : 0xffff, + path, &mqp->pri, port); +} + +static int mlx4_set_alt_path(struct mlx4_ib_dev *dev, + const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->alt_ah_attr, + mlx4_mac_to_u64((u8 *)qp->alt_smac), + (qp_attr_mask & IB_QP_ALT_VID) ? + qp->alt_vlan_id : 0xffff, + path, &mqp->alt, port); +} + static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { struct mlx4_ib_gid_entry *ge, *tmp; @@ -1223,6 +1387,37 @@ static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) } } +static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac, + struct mlx4_qp_context *context) +{ + struct net_device *ndev; + u64 u64_mac; + int smac_index; + + + ndev = dev->iboe.netdevs[qp->port - 1]; + if (ndev) { + smac = ndev->dev_addr; + u64_mac = mlx4_mac_to_u64(smac); + } else { + u64_mac = dev->dev->caps.def_mac[qp->port]; + } + + context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); + if (!qp->pri.smac) { + smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); + if (smac_index >= 0) { + qp->pri.candidate_smac_index = smac_index; + qp->pri.candidate_smac = u64_mac; + qp->pri.candidate_smac_port = qp->port; + context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; + } else { + return -ENOENT; + } + } + return 0; +} + static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) @@ -1234,6 +1429,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, struct mlx4_qp_context *context; enum mlx4_qp_optpar optpar = 0; int sqd_event; + int steer_qp = 0; int err = -EINVAL; context = kzalloc(sizeof *context, GFP_KERNEL); @@ -1291,6 +1487,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { context->sq_size_stride |= !!qp->sq_no_prefetch << 7; context->xrcd = cpu_to_be32((u32) qp->xrcdn); + if (ibqp->qp_type == IB_QPT_RAW_PACKET) + context->param3 |= cpu_to_be32(1 << 30); } if (qp->ibqp.uobject) @@ -1316,6 +1514,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; } else context->pri_path.counter_index = 0xff; + + if (qp->flags & MLX4_IB_QP_NETIF) { + mlx4_ib_steer_qp_reg(dev, qp, 1); + steer_qp = 1; + } } if (attr_mask & IB_QP_PKEY_INDEX) { @@ -1326,7 +1529,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } if (attr_mask & IB_QP_AV) { - if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path, + if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) goto out; @@ -1349,8 +1552,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, dev->dev->caps.pkey_table_len[attr->alt_port_num]) goto out; - if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path, - attr->alt_port_num)) + if (mlx4_set_alt_path(dev, attr, attr_mask, qp, + &context->alt_path, + attr->alt_port_num)) goto out; context->alt_path.pkey_index = attr->alt_pkey_index; @@ -1455,8 +1659,39 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.fl = 0x80; context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; } + if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) { + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) + context->pri_path.feup = 1 << 7; /* don't fsm */ + /* handle smac_index */ + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { + err = handle_eth_ud_smac_index(dev, qp, (u8 *)attr->smac, context); + if (err) + return -EINVAL; + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + dev->qp1_proxy[qp->port - 1] = qp; + } + } } + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) + context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | + MLX4_IB_LINK_TYPE_ETH; + + if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { + int is_eth = rdma_port_get_link_layer( + &dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) { + context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH; + optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH; + } + } + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) sqd_event = 1; @@ -1527,23 +1762,113 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ - if (new_state == IB_QPS_RESET && !ibqp->uobject) { - mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, - ibqp->srq ? to_msrq(ibqp->srq): NULL); - if (send_cq != recv_cq) - mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + if (new_state == IB_QPS_RESET) { + if (!ibqp->uobject) { + mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq_next_wqe = 0; + if (qp->rq.wqe_cnt) + *qp->db.db = 0; + + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_reg(dev, qp, 0); + } + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } - qp->rq.head = 0; - qp->rq.tail = 0; - qp->sq.head = 0; - qp->sq.tail = 0; - qp->sq_next_wqe = 0; - if (qp->rq.wqe_cnt) - *qp->db.db = 0; + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } } - out: + if (err && steer_qp) + mlx4_ib_steer_qp_reg(dev, qp, 0); kfree(context); + if (qp->pri.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); + } else { + if (qp->pri.smac) + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = qp->pri.candidate_smac; + qp->pri.smac_index = qp->pri.candidate_smac_index; + qp->pri.smac_port = qp->pri.candidate_smac_port; + } + qp->pri.candidate_smac = 0; + qp->pri.candidate_smac_index = 0; + qp->pri.candidate_smac_port = 0; + } + if (qp->alt.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac); + } else { + if (qp->alt.smac) + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = qp->alt.candidate_smac; + qp->alt.smac_index = qp->alt.candidate_smac_index; + qp->alt.smac_port = qp->alt.candidate_smac_port; + } + qp->alt.candidate_smac = 0; + qp->alt.candidate_smac_index = 0; + qp->alt.candidate_smac_port = 0; + } + + if (qp->pri.update_vid) { + if (err) { + if (qp->pri.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, + qp->pri.candidate_vid); + } else { + if (qp->pri.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, + qp->pri.vid); + qp->pri.vid = qp->pri.candidate_vid; + qp->pri.vlan_port = qp->pri.candidate_vlan_port; + qp->pri.vlan_index = qp->pri.candidate_vlan_index; + } + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.update_vid) { + if (err) { + if (qp->alt.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, + qp->alt.candidate_vid); + } else { + if (qp->alt.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, + qp->alt.vid); + qp->alt.vid = qp->alt.candidate_vid; + qp->alt.vlan_port = qp->alt.candidate_vlan_port; + qp->alt.vlan_index = qp->alt.candidate_vlan_index; + } + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + return err; } @@ -1554,13 +1879,21 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct mlx4_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; int err = -EINVAL; - + int ll; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + ll = IB_LINK_LAYER_UNSPECIFIED; + } else { + int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + ll = rdma_port_get_link_layer(&dev->ib_dev, port); + } + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, ll)) { pr_debug("qpn 0x%x: invalid attribute mask specified " "for transition %d to %d. qp_type %d," " attr_mask 0x%x\n", @@ -1624,6 +1957,19 @@ out: return err; } +static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i] || + qpn == dev->caps.qp0_tunnel[i]) { + *qkey = dev->caps.qp0_qkey[i]; + return 0; + } + } + return -EINVAL; +} + static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) @@ -1681,8 +2027,13 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]); sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); - if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) - return -EINVAL; + if (mlx4_is_master(mdev->dev)) { + if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } else { + if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } sqp->ud_header.deth.qkey = cpu_to_be32(qkey); sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); @@ -1737,20 +2088,20 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, { struct ib_device *ib_dev = sqp->qp.ibqp.device; struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_ctrl_seg *ctrl = wqe; struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); - struct net_device *ndev; union ib_gid sgid; u16 pkey; int send_size; int header_size; int spc; int i; - int is_eth; - int is_vlan = 0; - int is_grh; - u16 vlan; int err = 0; + u16 vlan = 0xffff; + bool is_eth; + bool is_vlan = false; + bool is_grh; send_size = 0; for (i = 0; i < wr->num_sge; ++i) @@ -1763,12 +2114,11 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so * we must use our own cache */ - sgid.global.subnet_prefix = - to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. - subnet_prefix; - sgid.global.interface_id = - to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. - guid_cache[ah->av.ib.gid_index]; + err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid.raw[0]); + if (err) + return err; } else { err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, @@ -1777,8 +2127,10 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, return err; } - vlan = rdma_get_vlan_id(&sgid); - is_vlan = vlan < 0x1000; + if (ah->av.eth.vlan != cpu_to_be16(0xffff)) { + vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff; + is_vlan = 1; + } } ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); @@ -1795,6 +2147,9 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, sqp->ud_header.grh.flow_label = ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + if (is_eth) + memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); + else { if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so @@ -1810,6 +2165,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); + } memcpy(sqp->ud_header.grh.destination_gid.raw, ah->av.ib.dgid, 16); } @@ -1842,16 +2198,23 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, if (is_eth) { u8 *smac; + struct in6_addr in6; + u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; mlx->sched_prio = cpu_to_be16(pcp); memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); /* FIXME: cache smac value? */ - ndev = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]; - if (!ndev) - return -ENODEV; - smac = ndev->dev_addr; + memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); + memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); + memcpy(&in6, sgid.raw, sizeof(in6)); + + if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) + smac = to_mdev(sqp->qp.ibqp.device)-> + iboe.netdevs[sqp->qp.port - 1]->dev_addr; + else /* use the src mac of the tunnel */ + smac = ah->av.eth.s_mac; memcpy(sqp->ud_header.eth.smac_h, smac, 6); if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); @@ -1953,9 +2316,12 @@ static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq static __be32 convert_access(int acc) { - return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_PERM_ATOMIC) : 0) | - (acc & IB_ACCESS_REMOTE_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_WRITE) : 0) | - (acc & IB_ACCESS_REMOTE_READ ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_READ) : 0) | + return (acc & IB_ACCESS_REMOTE_ATOMIC ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) | + (acc & IB_ACCESS_REMOTE_READ ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) | cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ); } @@ -1981,12 +2347,28 @@ static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr) fseg->reserved[1] = 0; } +static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ib_send_wr *wr) +{ + bseg->flags1 = + convert_access(wr->wr.bind_mw.bind_info.mw_access_flags) & + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ | + MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE | + MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC); + bseg->flags2 = 0; + if (wr->wr.bind_mw.mw->type == IB_MW_TYPE_2) + bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2); + if (wr->wr.bind_mw.bind_info.mw_access_flags & IB_ZERO_BASED) + bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED); + bseg->new_rkey = cpu_to_be32(wr->wr.bind_mw.rkey); + bseg->lkey = cpu_to_be32(wr->wr.bind_mw.bind_info.mr->lkey); + bseg->addr = cpu_to_be64(wr->wr.bind_mw.bind_info.addr); + bseg->length = cpu_to_be64(wr->wr.bind_mw.bind_info.length); +} + static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) { - iseg->flags = 0; - iseg->mem_key = cpu_to_be32(rkey); - iseg->guest_id = 0; - iseg->pa = 0; + memset(iseg, 0, sizeof(*iseg)); + iseg->mem_key = cpu_to_be32(rkey); } static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, @@ -2033,7 +2415,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, struct mlx4_wqe_datagram_seg *dseg, - struct ib_send_wr *wr, enum ib_qp_type qpt) + struct ib_send_wr *wr, + enum mlx4_ib_qp_type qpt) { union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av; struct mlx4_av sqp_av = {0}; @@ -2046,8 +2429,10 @@ static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, cpu_to_be32(0xf0000000); memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); - /* This function used only for sending on QP1 proxies */ - dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + if (qpt == MLX4_IB_QPT_PROXY_GSI) + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + else + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]); /* Use QKEY from the QP context, which is set by master */ dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); } @@ -2064,6 +2449,8 @@ static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_ hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index); hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + memcpy(hdr.mac, ah->av.eth.mac, 6); + hdr.vlan = ah->av.eth.vlan; spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); @@ -2291,6 +2678,13 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof (struct mlx4_wqe_fmr_seg) / 16; break; + case IB_WR_BIND_MW: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_bind_seg(wqe, wr); + wqe += sizeof(struct mlx4_wqe_bind_seg); + size += sizeof(struct mlx4_wqe_bind_seg) / 16; + break; default: /* No extra segments required for sends */ break; @@ -2333,11 +2727,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, break; case MLX4_IB_QPT_PROXY_SMI_OWNER: - if (unlikely(!mlx4_is_master(to_mdev(ibqp->device)->dev))) { - err = -ENOSYS; - *bad_wr = wr; - goto out; - } err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; @@ -2354,16 +2743,13 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += seglen / 16; break; case MLX4_IB_QPT_PROXY_SMI: - /* don't allow QP0 sends on guests */ - err = -ENOSYS; - *bad_wr = wr; - goto out; case MLX4_IB_QPT_PROXY_GSI: /* If we are tunneling special qps, this is a UD qp. * In this case we first add a UD segment targeting * the tunnel qp, and then add a header with address * information */ - set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, ibqp->qp_type); + set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, + qp->mlx4_ib_qp_type); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; build_tunnel_header(wr, wqe, &seglen); @@ -2729,6 +3115,9 @@ done: if (qp->flags & MLX4_IB_QP_LSO) qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + if (qp->flags & MLX4_IB_QP_NETIF) + qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP; + qp_init_attr->sq_sig_type = qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 60c5fb025fc..62d9285300a 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -134,13 +134,14 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; } else { - err = mlx4_db_alloc(dev->dev, &srq->db, 0); + err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL); if (err) goto err_srq; *srq->db.db = 0; - if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf, + GFP_KERNEL)) { err = -ENOMEM; goto err_db; } @@ -165,7 +166,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf); + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL); if (err) goto err_mtt; diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index 5b2a01dfb90..cb4c66e723b 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -389,8 +389,10 @@ struct mlx4_port { struct mlx4_ib_dev *dev; struct attribute_group pkey_group; struct attribute_group gid_group; - u8 port_num; + struct device_attribute enable_smi_admin; + struct device_attribute smi_enabled; int slave; + u8 port_num; }; @@ -558,6 +560,101 @@ err: return NULL; } +static ssize_t sysfs_show_smi_enabled(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, smi_enabled); + ssize_t len = 0; + + if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_show_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + ssize_t len = 0; + + if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_store_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + int enable; + + if (sscanf(buf, "%i", &enable) != 1 || + enable < 0 || enable > 1) + return -EINVAL; + + if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable)) + return -EINVAL; + return count; +} + +static int add_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + int ret; + + /* do not display entries if eth transport, or if master */ + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return 0; + + sysfs_attr_init(&p->smi_enabled.attr); + p->smi_enabled.show = sysfs_show_smi_enabled; + p->smi_enabled.store = NULL; + p->smi_enabled.attr.name = "smi_enabled"; + p->smi_enabled.attr.mode = 0444; + ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr); + if (ret) { + pr_err("failed to create smi_enabled\n"); + return ret; + } + + sysfs_attr_init(&p->enable_smi_admin.attr); + p->enable_smi_admin.show = sysfs_show_enable_smi_admin; + p->enable_smi_admin.store = sysfs_store_enable_smi_admin; + p->enable_smi_admin.attr.name = "enable_smi_admin"; + p->enable_smi_admin.attr.mode = 0644; + ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr); + if (ret) { + pr_err("failed to create enable_smi_admin\n"); + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + return ret; + } + return 0; +} + +static void remove_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return; + + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr); +} + static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) { struct mlx4_port *p; @@ -582,8 +679,10 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, store_port_pkey, dev->dev->caps.pkey_table_len[port_num]); - if (!p->pkey_group.attrs) + if (!p->pkey_group.attrs) { + ret = -ENOMEM; goto err_alloc; + } ret = sysfs_create_group(&p->kobj, &p->pkey_group); if (ret) @@ -591,13 +690,19 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) p->gid_group.name = "gid_idx"; p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1); - if (!p->gid_group.attrs) + if (!p->gid_group.attrs) { + ret = -ENOMEM; goto err_free_pkey; + } ret = sysfs_create_group(&p->kobj, &p->gid_group); if (ret) goto err_free_gid; + ret = add_vf_smi_entries(p); + if (ret) + goto err_free_gid; + list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]); return 0; @@ -623,6 +728,7 @@ static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave) int port; struct kobject *p, *t; struct mlx4_port *mport; + struct mlx4_active_ports actv_ports; get_name(dev, name, slave, sizeof name); @@ -645,7 +751,11 @@ static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave) goto err_ports; } + actv_ports = mlx4_get_active_ports(dev->dev, slave); + for (port = 1; port <= dev->dev->caps.num_ports; ++port) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; err = add_port(dev, port, slave); if (err) goto err_add; @@ -660,6 +770,7 @@ err_add: mport = container_of(p, struct mlx4_port, kobj); sysfs_remove_group(p, &mport->pkey_group); sysfs_remove_group(p, &mport->gid_group); + remove_vf_smi_entries(mport); kobject_put(p); } kobject_put(dev->dev_ports_parent[slave]); @@ -704,6 +815,7 @@ static void unregister_pkey_tree(struct mlx4_ib_dev *device) port = container_of(p, struct mlx4_port, kobj); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); + remove_vf_smi_entries(port); kobject_put(p); kobject_put(device->dev_ports_parent[slave]); } @@ -732,7 +844,7 @@ int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev) dev->ports_parent = kobject_create_and_add("ports", kobject_get(dev->iov_parent)); - if (!dev->iov_parent) { + if (!dev->ports_parent) { ret = -ENOMEM; goto err_ports; } diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h index 13beedeeef9..07e6769ef43 100644 --- a/drivers/infiniband/hw/mlx4/user.h +++ b/drivers/infiniband/hw/mlx4/user.h @@ -40,7 +40,9 @@ * Increment this value if any changes that break userspace ABI * compatibility are made. */ -#define MLX4_IB_UVERBS_ABI_VERSION 3 + +#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 +#define MLX4_IB_UVERBS_ABI_VERSION 4 /* * Make sure that all structs defined in this file remain laid out so @@ -50,10 +52,18 @@ * instead. */ +struct mlx4_ib_alloc_ucontext_resp_v3 { + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + struct mlx4_ib_alloc_ucontext_resp { + __u32 dev_caps; __u32 qp_tab_size; __u16 bf_reg_size; __u16 bf_regs_per_page; + __u32 cqe_size; }; struct mlx4_ib_alloc_pd_resp { diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig new file mode 100644 index 00000000000..10df386c634 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/Kconfig @@ -0,0 +1,10 @@ +config MLX5_INFINIBAND + tristate "Mellanox Connect-IB HCA support" + depends on NETDEVICES && ETHERNET && PCI + select NET_VENDOR_MELLANOX + select MLX5_CORE + ---help--- + This driver provides low-level InfiniBand support for + Mellanox Connect-IB PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile new file mode 100644 index 00000000000..4ea0135af48 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o + +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c new file mode 100644 index 00000000000..39ab0caefdf --- /dev/null +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx5_ib.h" + +struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah) +{ + if (ah_attr->ah_flags & IB_AH_GRH) { + memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16); + ah->av.grh_gid_fl = cpu_to_be32(ah_attr->grh.flow_label | + (1 << 30) | + ah_attr->grh.sgid_index << 20); + ah->av.hop_limit = ah_attr->grh.hop_limit; + ah->av.tclass = ah_attr->grh.traffic_class; + } + + ah->av.rlid = cpu_to_be16(ah_attr->dlid); + ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; + ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf); + + return &ah->ibah; +} + +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah; + + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + return create_ib_ah(ah_attr, ah); /* never fails */ +} + +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah = to_mah(ibah); + u32 tmp; + + memset(ah_attr, 0, sizeof(*ah_attr)); + + tmp = be32_to_cpu(ah->av.grh_gid_fl); + if (tmp & (1 << 30)) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.sgid_index = (tmp >> 20) & 0xff; + ah_attr->grh.flow_label = tmp & 0xfffff; + memcpy(&ah_attr->grh.dgid, ah->av.rgid, 16); + ah_attr->grh.hop_limit = ah->av.hop_limit; + ah_attr->grh.traffic_class = ah->av.tclass; + } + ah_attr->dlid = be16_to_cpu(ah->av.rlid); + ah_attr->static_rate = ah->av.stat_rate_sl >> 4; + ah_attr->sl = ah->av.stat_rate_sl & 0xf; + + return 0; +} + +int mlx5_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c new file mode 100644 index 00000000000..8ae4f896cb4 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -0,0 +1,1187 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/kref.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> +#include "mlx5_ib.h" +#include "user.h" + +static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type) +{ + struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq); + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct ib_cq *ibcq = &cq->ibcq; + struct ib_event event; + + if (type != MLX5_EVENT_TYPE_CQ_ERROR) { + mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n", + type, mcq->cqn); + return; + } + + if (ibcq->event_handler) { + event.device = &dev->ib_dev; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe_from_buf(struct mlx5_ib_cq_buf *buf, int n, int size) +{ + return mlx5_buf_offset(&buf->buf, n * size); +} + +static void *get_cqe(struct mlx5_ib_cq *cq, int n) +{ + return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz); +} + +static u8 sw_ownership_bit(int n, int nent) +{ + return (n & nent) ? 1 : 0; +} + +static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n) +{ + void *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx5_cqe64 *cqe64; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } +} + +static void *next_cqe_sw(struct mlx5_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx) +{ + switch (wq->wr_data[idx]) { + case MLX5_IB_WR_UMR: + return 0; + + case IB_WR_LOCAL_INV: + return IB_WC_LOCAL_INV; + + case IB_WR_FAST_REG_MR: + return IB_WC_FAST_REG_MR; + + default: + pr_warn("unknown completion status\n"); + return 0; + } +} + +static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_wq *wq, int idx) +{ + wc->wc_flags = 0; + switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) { + case MLX5_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX5_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX5_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX5_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + case MLX5_OPCODE_UMR: + wc->opcode = get_umr_comp(wq, idx); + break; + } +} + +enum { + MLX5_GRH_IN_BUFFER = 1, + MLX5_GRH_IN_CQE = 2, +}; + +static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_ib_srq *srq; + struct mlx5_ib_wq *wq; + u16 wqe_ctr; + u8 g; + + if (qp->ibqp.srq || qp->ibqp.xrcd) { + struct mlx5_core_srq *msrq = NULL; + + if (qp->ibqp.xrcd) { + msrq = mlx5_core_get_srq(&dev->mdev, + be32_to_cpu(cqe->srqn)); + srq = to_mibsrq(msrq); + } else { + srq = to_msrq(qp->ibqp.srq); + } + if (srq) { + wqe_ctr = be16_to_cpu(cqe->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + if (msrq && atomic_dec_and_test(&msrq->refcount)) + complete(&msrq->free); + } + } else { + wq = &qp->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->op_own >> 4) { + case MLX5_CQE_RESP_WR_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case MLX5_CQE_RESP_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND_INV: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey); + break; + } + wc->slid = be16_to_cpu(cqe->slid); + wc->sl = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf; + wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 0xffffff; + wc->dlid_path_bits = cqe->ml_path; + g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; + wc->wc_flags |= g ? IB_WC_GRH : 0; + wc->pkey_index = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; +} + +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) +{ + __be32 *p = (__be32 *)cqe; + int i; + + mlx5_ib_warn(dev, "dump error cqe\n"); + for (i = 0; i < sizeof(*cqe) / 16; i++, p += 4) + pr_info("%08x %08x %08x %08x\n", be32_to_cpu(p[0]), + be32_to_cpu(p[1]), be32_to_cpu(p[2]), + be32_to_cpu(p[3])); +} + +static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, + struct mlx5_err_cqe *cqe, + struct ib_wc *wc) +{ + int dump = 1; + + switch (cqe->syndrome) { + case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: + dump = 0; + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX5_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX5_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_synd; + if (dump) + dump_cqe(dev, cqe); +} + +static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx) +{ + /* TBD: waiting decision + */ + return 0; +} + +static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx) +{ + struct mlx5_wqe_data_seg *dpseg; + void *addr; + + dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg); + addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr); + return addr; +} + +static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + uint16_t idx) +{ + void *addr; + int byte_count; + int i; + + if (!is_atomic_response(qp, idx)) + return; + + byte_count = be32_to_cpu(cqe64->byte_cnt); + addr = mlx5_get_atomic_laddr(qp, idx); + + if (byte_count == 4) { + *(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr)); + } else { + for (i = 0; i < byte_count; i += 8) { + *(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr)); + addr += 8; + } + } + + return; +} + +static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + u16 tail, u16 head) +{ + int idx; + + do { + idx = tail & (qp->sq.wqe_cnt - 1); + handle_atomic(qp, cqe64, idx); + if (idx == head) + break; + + tail = qp->sq.w_list[idx].next; + } while (1); + tail = qp->sq.w_list[idx].next; + qp->sq.last_poll = tail; +} + +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ + mlx5_buf_free(&dev->mdev, &buf->buf); +} + +static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, + struct ib_sig_err *item) +{ + u16 syndrome = be16_to_cpu(cqe->syndrome); + +#define GUARD_ERR (1 << 13) +#define APPTAG_ERR (1 << 12) +#define REFTAG_ERR (1 << 11) + + if (syndrome & GUARD_ERR) { + item->err_type = IB_SIG_BAD_GUARD; + item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16; + item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16; + } else + if (syndrome & REFTAG_ERR) { + item->err_type = IB_SIG_BAD_REFTAG; + item->expected = be32_to_cpu(cqe->expected_reftag); + item->actual = be32_to_cpu(cqe->actual_reftag); + } else + if (syndrome & APPTAG_ERR) { + item->err_type = IB_SIG_BAD_APPTAG; + item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff; + item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff; + } else { + pr_err("Got signature completion error with bad syndrome %04x\n", + syndrome); + } + + item->sig_err_offset = be64_to_cpu(cqe->err_offset); + item->key = be32_to_cpu(cqe->mkey); +} + +static int mlx5_poll_one(struct mlx5_ib_cq *cq, + struct mlx5_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_err_cqe *err_cqe; + struct mlx5_cqe64 *cqe64; + struct mlx5_core_qp *mqp; + struct mlx5_ib_wq *wq; + struct mlx5_sig_err_cqe *sig_err_cqe; + struct mlx5_core_mr *mmr; + struct mlx5_ib_mr *mr; + uint8_t opcode; + uint32_t qpn; + u16 wqe_ctr; + void *cqe; + int idx; + +repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + ++cq->mcq.cons_index; + + /* Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + opcode = cqe64->op_own >> 4; + if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { + if (likely(cq->resize_buf)) { + free_cq_buf(dev, &cq->buf); + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + goto repoll; + } else { + mlx5_ib_warn(dev, "unexpected resize cqe\n"); + } + } + + qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; + if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { + /* We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + mqp = __mlx5_qp_lookup(&dev->mdev, qpn); + if (unlikely(!mqp)) { + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown QPN %6x\n", + cq->mcq.cqn, qpn); + return -EINVAL; + } + + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + switch (opcode) { + case MLX5_CQE_REQ: + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + handle_good_req(wc, cqe64, wq, idx); + handle_atomics(*cur_qp, cqe64, wq->last_poll, idx); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESP_WR_IMM: + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + handle_responder(wc, cqe64, *cur_qp); + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESIZE_CQ: + break; + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + err_cqe = (struct mlx5_err_cqe *)cqe64; + mlx5_handle_error_cqe(dev, err_cqe, wc); + mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n", + opcode == MLX5_CQE_REQ_ERR ? + "Requestor" : "Responder", cq->mcq.cqn); + mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", + err_cqe->syndrome, err_cqe->vendor_err_synd); + if (opcode == MLX5_CQE_REQ_ERR) { + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + } else { + struct mlx5_ib_srq *srq; + + if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + } + break; + case MLX5_CQE_SIG_ERR: + sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; + + read_lock(&dev->mdev.priv.mr_table.lock); + mmr = __mlx5_mr_lookup(&dev->mdev, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + if (unlikely(!mmr)) { + read_unlock(&dev->mdev.priv.mr_table.lock); + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", + cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); + return -EINVAL; + } + + mr = to_mibmr(mmr); + get_sig_err_item(sig_err_cqe, &mr->sig->err_item); + mr->sig->sig_err_exists = true; + mr->sig->sigerr_count++; + + mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", + cq->mcq.cqn, mr->sig->err_item.key, + mr->sig->err_item.err_type, + mr->sig->err_item.sig_err_offset, + mr->sig->err_item.expected, + mr->sig->err_item.actual); + + read_unlock(&dev->mdev.priv.mr_table.lock); + goto repoll; + } + + return 0; +} + +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_ib_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + int err = 0; + + spin_lock_irqsave(&cq->lock, flags); + + for (npolled = 0; npolled < num_entries; npolled++) { + err = mlx5_poll_one(cq, &cur_qp, wc + npolled); + if (err) + break; + } + + if (npolled) + mlx5_cq_set_ci(&cq->mcq); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (err == 0 || err == -EAGAIN) + return npolled; + else + return err; +} + +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + mlx5_cq_arm(&to_mcq(ibcq)->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, + to_mdev(ibcq->device)->mdev.priv.uuari.uars[0].map, + MLX5_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->mdev.priv.cq_uar_lock)); + + return 0; +} + +static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, + int nent, int cqe_size) +{ + int err; + + err = mlx5_buf_alloc(&dev->mdev, nent * cqe_size, + PAGE_SIZE * 2, &buf->buf); + if (err) + return err; + + buf->cqe_size = cqe_size; + buf->nent = nent; + + return 0; +} + +static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, + struct ib_ucontext *context, struct mlx5_ib_cq *cq, + int entries, struct mlx5_create_cq_mbox_in **cqb, + int *cqe_size, int *index, int *inlen) +{ + struct mlx5_ib_create_cq ucmd; + size_t ucmdlen; + int page_shift; + int npages; + int ncont; + int err; + + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) + return -EFAULT; + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + + if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) + return -EINVAL; + + *cqe_size = ucmd.cqe_size; + + cq->buf.umem = ib_umem_get(context, ucmd.buf_addr, + entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(cq->buf.umem)) { + err = PTR_ERR(cq->buf.umem); + return err; + } + + err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) + goto err_umem; + + mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, &npages, &page_shift, + &ncont, NULL); + mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n", + ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * ncont; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_db; + } + mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); + (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + *index = to_mucontext(context)->uuari.uars[0].index; + + return 0; + +err_db: + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_umem: + ib_umem_release(cq->buf.umem); + return err; +} + +static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) +{ + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + ib_umem_release(cq->buf.umem); +} + +static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf) +{ + int i; + void *cqe; + struct mlx5_cqe64 *cqe64; + + for (i = 0; i < buf->nent; i++) { + cqe = get_cqe_from_buf(buf, i, buf->cqe_size); + cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size, + struct mlx5_create_cq_mbox_in **cqb, + int *index, int *inlen) +{ + int err; + + err = mlx5_db_alloc(&dev->mdev, &cq->db); + if (err) + return err; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + *cq->mcq.set_ci_db = 0; + *cq->mcq.arm_db = 0; + cq->mcq.cqe_sz = cqe_size; + + err = alloc_cq_buf(dev, &cq->buf, entries, cqe_size); + if (err) + goto err_db; + + init_cq_buf(cq, &cq->buf); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); + + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + *index = dev->mdev.priv.uuari.uars[0].index; + + return 0; + +err_buf: + free_cq_buf(dev, &cq->buf); + +err_db: + mlx5_db_free(&dev->mdev, &cq->db); + return err; +} + +static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, &cq->buf); + mlx5_db_free(&dev->mdev, &cq->db); +} + +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_create_cq_mbox_in *cqb = NULL; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_cq *cq; + int uninitialized_var(index); + int uninitialized_var(inlen); + int cqe_size; + int irqn; + int eqn; + int err; + + if (entries < 0) + return ERR_PTR(-EINVAL); + + entries = roundup_pow_of_two(entries + 1); + if (entries > dev->mdev.caps.max_cqes) + return ERR_PTR(-EINVAL); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + + if (context) { + err = create_cq_user(dev, udata, context, cq, entries, + &cqb, &cqe_size, &index, &inlen); + if (err) + goto err_create; + } else { + /* for now choose 64 bytes till we have a proper interface */ + cqe_size = 64; + err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, + &index, &inlen); + if (err) + goto err_create; + } + + cq->cqe_size = cqe_size; + cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index); + err = mlx5_vector2eqn(dev, vector, &eqn, &irqn); + if (err) + goto err_cqb; + + cqb->ctx.c_eqn = cpu_to_be16(eqn); + cqb->ctx.db_record_addr = cpu_to_be64(cq->db.dma); + + err = mlx5_core_create_cq(&dev->mdev, &cq->mcq, cqb, inlen); + if (err) + goto err_cqb; + + mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); + cq->mcq.irqn = irqn; + cq->mcq.comp = mlx5_ib_cq_comp; + cq->mcq.event = mlx5_ib_cq_event; + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { + err = -EFAULT; + goto err_cmd; + } + + + mlx5_vfree(cqb); + return &cq->ibcq; + +err_cmd: + mlx5_core_destroy_cq(&dev->mdev, &cq->mcq); + +err_cqb: + mlx5_vfree(cqb); + if (context) + destroy_cq_user(cq, context); + else + destroy_cq_kernel(dev, cq); + +err_create: + kfree(cq); + + return ERR_PTR(err); +} + + +int mlx5_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + struct ib_ucontext *context = NULL; + + if (cq->uobject) + context = cq->uobject->context; + + mlx5_core_destroy_cq(&dev->mdev, &mcq->mcq); + if (context) + destroy_cq_user(mcq, context); + else + destroy_cq_kernel(dev, mcq); + + kfree(mcq); + + return 0; +} + +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) +{ + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); +} + +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) +{ + struct mlx5_cqe64 *cqe64, *dest64; + void *cqe, *dest; + u32 prod_index; + int nfreed = 0; + u8 owner_bit; + + if (!cq) + return; + + /* First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) + mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64; + owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK; + memcpy(dest, cqe, cq->mcq.cqe_sz); + dest64->op_own = owner_bit | + (dest64->op_own & ~MLX5_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + mlx5_cq_set_ci(&cq->mcq); + } +} + +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) +{ + if (!cq) + return; + + spin_lock_irq(&cq->lock); + __mlx5_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} + +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct mlx5_modify_cq_mbox_in *in; + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + int err; + u32 fsel; + + if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_CQ_MODER)) + return -ENOSYS; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->cqn = cpu_to_be32(mcq->mcq.cqn); + fsel = (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); + in->ctx.cq_period = cpu_to_be16(cq_period); + in->ctx.cq_max_count = cpu_to_be16(cq_count); + in->field_select = cpu_to_be32(fsel); + err = mlx5_core_modify_cq(&dev->mdev, &mcq->mcq, in, sizeof(*in)); + kfree(in); + + if (err) + mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); + + return err; +} + +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, struct ib_udata *udata, int *npas, + int *page_shift, int *cqe_size) +{ + struct mlx5_ib_resize_cq ucmd; + struct ib_umem *umem; + int err; + int npages; + struct ib_ucontext *context = cq->buf.umem->context; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; + } + + mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift, + npas, NULL); + + cq->resize_umem = umem; + *cqe_size = ucmd.cqe_size; + + return 0; +} + +static void un_resize_user(struct mlx5_ib_cq *cq) +{ + ib_umem_release(cq->resize_umem); +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size) +{ + int err; + + cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size); + if (err) + goto ex; + + init_cq_buf(cq, cq->resize_buf); + + return 0; + +ex: + kfree(cq->resize_buf); + return err; +} + +static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + u8 sw_own; + + ssize = cq->buf.cqe_size; + dsize = cq->resize_buf->cqe_size; + if (ssize != dsize) { + mlx5_ib_warn(dev, "resize from different cqe size is not supported\n"); + return -EINVAL; + } + + i = cq->mcq.cons_index; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { + dcqe = get_cqe_from_buf(cq->resize_buf, + (i + 1) & (cq->resize_buf->nent), + dsize); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); + memcpy(dcqe, scqe, dsize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + if (scqe == start_cqe) { + pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", + cq->mcq.cqn); + return -ENOMEM; + } + } + ++cq->mcq.cons_index; + return 0; +} + +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_modify_cq_mbox_in *in; + int err; + int npas; + int page_shift; + int inlen; + int uninitialized_var(cqe_size); + unsigned long flags; + + if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { + pr_info("Firmware does not support resize CQ\n"); + return -ENOSYS; + } + + if (entries < 1) + return -EINVAL; + + entries = roundup_pow_of_two(entries + 1); + if (entries > dev->mdev.caps.max_cqes + 1) + return -EINVAL; + + if (entries == ibcq->cqe + 1) + return 0; + + mutex_lock(&cq->resize_mutex); + if (udata) { + err = resize_user(dev, cq, entries, udata, &npas, &page_shift, + &cqe_size); + } else { + cqe_size = 64; + err = resize_kernel(dev, cq, entries, cqe_size); + if (!err) { + npas = cq->resize_buf->buf.npages; + page_shift = cq->resize_buf->buf.page_shift; + } + } + + if (err) + goto ex; + + inlen = sizeof(*in) + npas * sizeof(in->pas[0]); + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto ex_resize; + } + + if (udata) + mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, + in->pas, 0); + else + mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); + + in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + in->ctx.page_offset = 0; + in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); + in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); + in->cqn = cpu_to_be32(cq->mcq.cqn); + + err = mlx5_core_modify_cq(&dev->mdev, &cq->mcq, in, inlen); + if (err) + goto ex_alloc; + + if (udata) { + cq->ibcq.cqe = entries - 1; + ib_umem_release(cq->buf.umem); + cq->buf.umem = cq->resize_umem; + cq->resize_umem = NULL; + } else { + struct mlx5_ib_cq_buf tbuf; + int resized = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (cq->resize_buf) { + err = copy_resize_cqes(cq); + if (!err) { + tbuf = cq->buf; + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + resized = 1; + } + } + cq->ibcq.cqe = entries - 1; + spin_unlock_irqrestore(&cq->lock, flags); + if (resized) + free_cq_buf(dev, &tbuf); + } + mutex_unlock(&cq->resize_mutex); + + mlx5_vfree(in); + return 0; + +ex_alloc: + mlx5_vfree(in); + +ex_resize: + if (udata) + un_resize_user(cq); + else + un_resize_kernel(dev, cq); +ex: + mutex_unlock(&cq->resize_mutex); + return err; +} + +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) +{ + struct mlx5_ib_cq *cq; + + if (!ibcq) + return 128; + + cq = to_mcq(ibcq); + return cq->cqe_size; +} diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c new file mode 100644 index 00000000000..ece028fc47d --- /dev/null +++ b/drivers/infiniband/hw/mlx5/doorbell.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/kref.h> +#include <linux/slab.h> +#include <rdma/ib_umem.h> + +#include "mlx5_ib.h" + +struct mlx5_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + int refcnt; +}; + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db) +{ + struct mlx5_ib_user_db_page *page; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c new file mode 100644 index 00000000000..5c8938be0e0 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mlx5/cmd.h> +#include <rdma/ib_mad.h> +#include <rdma/ib_smi.h> +#include "mlx5_ib.h" + +enum { + MLX5_IB_VENDOR_CLASS1 = 0x9, + MLX5_IB_VENDOR_CLASS2 = 0xa +}; + +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + u8 op_modifier = 0; + + /* Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + return mlx5_core_mad_ifc(&dev->mdev, in_mad, response_mad, op_modifier, port); +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid; + int err; + + slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* Don't process SMInfo queries -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else { + return IB_MAD_RESULT_SUCCESS; + } + + err = mlx5_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + u16 packet_error; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + + packet_error = be16_to_cpu(out_mad->status); + + dev->mdev.caps.ext_port_cap[port - 1] = (!err && !packet_error) ? + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c new file mode 100644 index 00000000000..364d4b6937f --- /dev/null +++ b/drivers/infiniband/hw/mlx5/main.c @@ -0,0 +1,1543 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <asm-generic/kmap_types.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/slab.h> +#include <linux/io-mapping.h> +#include <linux/sched.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_umem.h> +#include "user.h" +#include "mlx5_ib.h" + +#define DRIVER_NAME "mlx5_ib" +#define DRIVER_VERSION "2.2-1" +#define DRIVER_RELDATE "Feb 2014" + +MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>"); +MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRIVER_VERSION); + +static int prof_sel = 2; +module_param_named(prof_sel, prof_sel, int, 0444); +MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2"); + +static char mlx5_version[] = + DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" + DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; + +static struct mlx5_profile profile[] = { + [0] = { + .mask = 0, + }, + [1] = { + .mask = MLX5_PROF_MASK_QP_SIZE, + .log_max_qp = 12, + }, + [2] = { + .mask = MLX5_PROF_MASK_QP_SIZE | + MLX5_PROF_MASK_MR_CACHE, + .log_max_qp = 17, + .mr_cache[0] = { + .size = 500, + .limit = 250 + }, + .mr_cache[1] = { + .size = 500, + .limit = 250 + }, + .mr_cache[2] = { + .size = 500, + .limit = 250 + }, + .mr_cache[3] = { + .size = 500, + .limit = 250 + }, + .mr_cache[4] = { + .size = 500, + .limit = 250 + }, + .mr_cache[5] = { + .size = 500, + .limit = 250 + }, + .mr_cache[6] = { + .size = 500, + .limit = 250 + }, + .mr_cache[7] = { + .size = 500, + .limit = 250 + }, + .mr_cache[8] = { + .size = 500, + .limit = 250 + }, + .mr_cache[9] = { + .size = 500, + .limit = 250 + }, + .mr_cache[10] = { + .size = 500, + .limit = 250 + }, + .mr_cache[11] = { + .size = 500, + .limit = 250 + }, + .mr_cache[12] = { + .size = 64, + .limit = 32 + }, + .mr_cache[13] = { + .size = 32, + .limit = 16 + }, + .mr_cache[14] = { + .size = 16, + .limit = 8 + }, + .mr_cache[15] = { + .size = 8, + .limit = 4 + }, + }, +}; + +int mlx5_vector2eqn(struct mlx5_ib_dev *dev, int vector, int *eqn, int *irqn) +{ + struct mlx5_eq_table *table = &dev->mdev.priv.eq_table; + struct mlx5_eq *eq, *n; + int err = -ENOENT; + + spin_lock(&table->lock); + list_for_each_entry_safe(eq, n, &dev->eqs_list, list) { + if (eq->index == vector) { + *eqn = eq->eqn; + *irqn = eq->irqn; + err = 0; + break; + } + } + spin_unlock(&table->lock); + + return err; +} + +static int alloc_comp_eqs(struct mlx5_ib_dev *dev) +{ + struct mlx5_eq_table *table = &dev->mdev.priv.eq_table; + char name[MLX5_MAX_EQ_NAME]; + struct mlx5_eq *eq, *n; + int ncomp_vec; + int nent; + int err; + int i; + + INIT_LIST_HEAD(&dev->eqs_list); + ncomp_vec = table->num_comp_vectors; + nent = MLX5_COMP_EQ_SIZE; + for (i = 0; i < ncomp_vec; i++) { + eq = kzalloc(sizeof(*eq), GFP_KERNEL); + if (!eq) { + err = -ENOMEM; + goto clean; + } + + snprintf(name, MLX5_MAX_EQ_NAME, "mlx5_comp%d", i); + err = mlx5_create_map_eq(&dev->mdev, eq, + i + MLX5_EQ_VEC_COMP_BASE, nent, 0, + name, &dev->mdev.priv.uuari.uars[0]); + if (err) { + kfree(eq); + goto clean; + } + mlx5_ib_dbg(dev, "allocated completion EQN %d\n", eq->eqn); + eq->index = i; + spin_lock(&table->lock); + list_add_tail(&eq->list, &dev->eqs_list); + spin_unlock(&table->lock); + } + + dev->num_comp_vectors = ncomp_vec; + return 0; + +clean: + spin_lock(&table->lock); + list_for_each_entry_safe(eq, n, &dev->eqs_list, list) { + list_del(&eq->list); + spin_unlock(&table->lock); + if (mlx5_destroy_unmap_eq(&dev->mdev, eq)) + mlx5_ib_warn(dev, "failed to destroy EQ 0x%x\n", eq->eqn); + kfree(eq); + spin_lock(&table->lock); + } + spin_unlock(&table->lock); + return err; +} + +static void free_comp_eqs(struct mlx5_ib_dev *dev) +{ + struct mlx5_eq_table *table = &dev->mdev.priv.eq_table; + struct mlx5_eq *eq, *n; + + spin_lock(&table->lock); + list_for_each_entry_safe(eq, n, &dev->eqs_list, list) { + list_del(&eq->list); + spin_unlock(&table->lock); + if (mlx5_destroy_unmap_eq(&dev->mdev, eq)) + mlx5_ib_warn(dev, "failed to destroy EQ 0x%x\n", eq->eqn); + kfree(eq); + spin_lock(&table->lock); + } + spin_unlock(&table->lock); +} + +static int mlx5_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + int max_rq_sg; + int max_sq_sg; + u64 flags; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memset(props, 0, sizeof(*props)); + + props->fw_ver = ((u64)fw_rev_maj(&dev->mdev) << 32) | + (fw_rev_min(&dev->mdev) << 16) | + fw_rev_sub(&dev->mdev); + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + flags = dev->mdev.caps.flags; + if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (flags & MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (flags & MLX5_DEV_CAP_FLAG_APM) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + if (flags & MLX5_DEV_CAP_FLAG_XRC) + props->device_cap_flags |= IB_DEVICE_XRC; + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (flags & MLX5_DEV_CAP_FLAG_SIG_HAND_OVER) { + props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + /* At this stage no support for signature handover */ + props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | + IB_PROT_T10DIF_TYPE_2 | + IB_PROT_T10DIF_TYPE_3; + props->sig_guard_cap = IB_GUARD_T10DIF_CRC | + IB_GUARD_T10DIF_CSUM; + } + if (flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST) + props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + + props->vendor_id = be32_to_cpup((__be32 *)(out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = be16_to_cpup((__be16 *)(out_mad->data + 30)); + props->hw_ver = be32_to_cpup((__be32 *)(out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = dev->mdev.caps.min_page_sz; + props->max_qp = 1 << dev->mdev.caps.log_max_qp; + props->max_qp_wr = dev->mdev.caps.max_wqes; + max_rq_sg = dev->mdev.caps.max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg); + max_sq_sg = (dev->mdev.caps.max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) / + sizeof(struct mlx5_wqe_data_seg); + props->max_sge = min(max_rq_sg, max_sq_sg); + props->max_cq = 1 << dev->mdev.caps.log_max_cq; + props->max_cqe = dev->mdev.caps.max_cqes - 1; + props->max_mr = 1 << dev->mdev.caps.log_max_mkey; + props->max_pd = 1 << dev->mdev.caps.log_max_pd; + props->max_qp_rd_atom = dev->mdev.caps.max_ra_req_qp; + props->max_qp_init_rd_atom = dev->mdev.caps.max_ra_res_qp; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq = 1 << dev->mdev.caps.log_max_srq; + props->max_srq_wr = dev->mdev.caps.max_srq_wqes - 1; + props->max_srq_sge = max_rq_sg - 1; + props->max_fast_reg_page_list_len = (unsigned int)-1; + props->local_ca_ack_delay = dev->mdev.caps.local_ca_ack_delay; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = IB_ATOMIC_NONE; + props->max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); + props->max_mcast_grp = 1 << dev->mdev.caps.log_max_mcg; + props->max_mcast_qp_attach = dev->mdev.caps.max_qp_mcg; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int ext_active_speed; + int err = -ENOMEM; + + if (port < 1 || port > dev->mdev.caps.num_ports) { + mlx5_ib_warn(dev, "invalid port number %d\n", port); + return -EINVAL; + } + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof(*props)); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto out; + } + + + props->lid = be16_to_cpup((__be16 *)(out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *)(out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20)); + props->gid_tbl_len = out_mad->data[50]; + props->max_msg_sz = 1 << to_mdev(ibdev)->mdev.caps.log_max_msg; + props->pkey_tbl_len = to_mdev(ibdev)->mdev.caps.port[port - 1].pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = 16; /* FDR */ + break; + case 2: + props->active_speed = 32; /* EDR */ + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == 4) { + if (dev->mdev.caps.ext_port_cap[port - 1] & + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = 8; + } + } + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *)out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +struct mlx5_reg_node_desc { + u8 desc[64]; +}; + +static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_reg_node_desc in; + struct mlx5_reg_node_desc out; + int err; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + memcpy(&in, props->node_desc, 64); + err = mlx5_core_access_reg(&dev->mdev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_NODE_DESC, 0, 1); + if (err) + return err; + + memcpy(ibdev->node_desc, props->node_desc, 64); + + return err; +} + +static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_port_attr attr; + u32 tmp; + int err; + + mutex_lock(&dev->cap_mask_mutex); + + err = mlx5_ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + tmp = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx5_set_port_caps(&dev->mdev, port, tmp); + +out: + mutex_unlock(&dev->cap_mask_mutex); + return err; +} + +static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_alloc_ucontext_req_v2 req; + struct mlx5_ib_alloc_ucontext_resp resp; + struct mlx5_ib_ucontext *context; + struct mlx5_uuar_info *uuari; + struct mlx5_uar *uars; + int gross_uuars; + int num_uars; + int ver; + int uuarn; + int err; + int i; + int reqlen; + + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + memset(&req, 0, sizeof(req)); + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); + if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + ver = 2; + else + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&req, udata, reqlen); + if (err) + return ERR_PTR(err); + + if (req.flags || req.reserved) + return ERR_PTR(-EINVAL); + + if (req.total_num_uuars > MLX5_MAX_UUARS) + return ERR_PTR(-ENOMEM); + + if (req.total_num_uuars == 0) + return ERR_PTR(-EINVAL); + + req.total_num_uuars = ALIGN(req.total_num_uuars, + MLX5_NON_FP_BF_REGS_PER_PAGE); + if (req.num_low_latency_uuars > req.total_num_uuars - 1) + return ERR_PTR(-EINVAL); + + num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; + gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; + resp.qp_tab_size = 1 << dev->mdev.caps.log_max_qp; + resp.bf_reg_size = dev->mdev.caps.bf_reg_size; + resp.cache_line_size = L1_CACHE_BYTES; + resp.max_sq_desc_sz = dev->mdev.caps.max_sq_desc_sz; + resp.max_rq_desc_sz = dev->mdev.caps.max_rq_desc_sz; + resp.max_send_wqebb = dev->mdev.caps.max_wqes; + resp.max_recv_wr = dev->mdev.caps.max_wqes; + resp.max_srq_recv_wr = dev->mdev.caps.max_srq_wqes; + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + uuari = &context->uuari; + mutex_init(&uuari->lock); + uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL); + if (!uars) { + err = -ENOMEM; + goto out_ctx; + } + + uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), + sizeof(*uuari->bitmap), + GFP_KERNEL); + if (!uuari->bitmap) { + err = -ENOMEM; + goto out_uar_ctx; + } + /* + * clear all fast path uuars + */ + for (i = 0; i < gross_uuars; i++) { + uuarn = i & 3; + if (uuarn == 2 || uuarn == 3) + set_bit(i, uuari->bitmap); + } + + uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); + if (!uuari->count) { + err = -ENOMEM; + goto out_bitmap; + } + + for (i = 0; i < num_uars; i++) { + err = mlx5_cmd_alloc_uar(&dev->mdev, &uars[i].index); + if (err) + goto out_count; + } + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + resp.tot_uuars = req.total_num_uuars; + resp.num_ports = dev->mdev.caps.num_ports; + err = ib_copy_to_udata(udata, &resp, + sizeof(resp) - sizeof(resp.reserved)); + if (err) + goto out_uars; + + uuari->ver = ver; + uuari->num_low_latency_uuars = req.num_low_latency_uuars; + uuari->uars = uars; + uuari->num_uars = num_uars; + return &context->ibucontext; + +out_uars: + for (i--; i >= 0; i--) + mlx5_cmd_free_uar(&dev->mdev, uars[i].index); +out_count: + kfree(uuari->count); + +out_bitmap: + kfree(uuari->bitmap); + +out_uar_ctx: + kfree(uars); + +out_ctx: + kfree(context); + return ERR_PTR(err); +} + +static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + int i; + + for (i = 0; i < uuari->num_uars; i++) { + if (mlx5_cmd_free_uar(&dev->mdev, uuari->uars[i].index)) + mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); + } + + kfree(uuari->count); + kfree(uuari->bitmap); + kfree(uuari->uars); + kfree(context); + + return 0; +} + +static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index) +{ + return (pci_resource_start(dev->mdev.pdev, 0) >> PAGE_SHIFT) + index; +} + +static int get_command(unsigned long offset) +{ + return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; +} + +static int get_arg(unsigned long offset) +{ + return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); +} + +static int get_index(unsigned long offset) +{ + return get_arg(offset); +} + +static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + unsigned long command; + unsigned long idx; + phys_addr_t pfn; + + command = get_command(vma->vm_pgoff); + switch (command) { + case MLX5_IB_MMAP_REGULAR_PAGE: + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + idx = get_index(vma->vm_pgoff); + pfn = uar_index2pfn(dev, uuari->uars[idx].index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, + (unsigned long long)pfn); + + if (idx >= uuari->num_uars) + return -EINVAL; + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); + break; + + case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: + return -ENOSYS; + + default: + return -EINVAL; + } + + return 0; +} + +static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) +{ + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct mlx5_core_mr mr; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + seg = &in->seg; + seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); + goto err_in; + } + + kfree(in); + *key = mr.key; + + return 0; + +err_in: + kfree(in); + + return err; +} + +static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key) +{ + struct mlx5_core_mr mr; + int err; + + memset(&mr, 0, sizeof(mr)); + mr.key = key; + err = mlx5_core_destroy_mkey(&dev->mdev, &mr); + if (err) + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key); +} + +static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_alloc_pd_resp resp; + struct mlx5_ib_pd *pd; + int err; + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_alloc_pd(&to_mdev(ibdev)->mdev, &pd->pdn); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + resp.pdn = pd->pdn; + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + mlx5_core_dealloc_pd(&to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } else { + err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn); + if (err) { + mlx5_core_dealloc_pd(&to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(err); + } + } + + return &pd->ibpd; +} + +static int mlx5_ib_dealloc_pd(struct ib_pd *pd) +{ + struct mlx5_ib_dev *mdev = to_mdev(pd->device); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + if (!pd->uobject) + free_pa_mkey(mdev, mpd->pa_lkey); + + mlx5_core_dealloc_pd(&mdev->mdev, mpd->pdn); + kfree(mpd); + + return 0; +} + +static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + err = mlx5_core_attach_mcg(&dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + err = mlx5_core_detach_mcg(&dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int init_node_data(struct mlx5_ib_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, 64); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + dev->mdev.rev_id = be32_to_cpup((__be32 *)(out_mad->data + 32)); + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", dev->mdev.priv.fw_pages); +} + +static ssize_t show_reg_pages(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", dev->mdev.priv.reg_pages); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->mdev.pdev->device); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%d.%d.%d\n", fw_rev_maj(&dev->mdev), + fw_rev_min(&dev->mdev), fw_rev_sub(&dev->mdev)); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->mdev.rev_id); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, + dev->mdev.board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); +static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); + +static struct device_attribute *mlx5_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_fw_pages, + &dev_attr_reg_pages, +}; + +static void mlx5_ib_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, + void *data) +{ + struct mlx5_ib_dev *ibdev = container_of(dev, struct mlx5_ib_dev, mdev); + struct ib_event ibev; + u8 port = 0; + + switch (event) { + case MLX5_DEV_EVENT_SYS_ERROR: + ibdev->ib_active = false; + ibev.event = IB_EVENT_DEVICE_FATAL; + break; + + case MLX5_DEV_EVENT_PORT_UP: + ibev.event = IB_EVENT_PORT_ACTIVE; + port = *(u8 *)data; + break; + + case MLX5_DEV_EVENT_PORT_DOWN: + ibev.event = IB_EVENT_PORT_ERR; + port = *(u8 *)data; + break; + + case MLX5_DEV_EVENT_PORT_INITIALIZED: + /* not used by ULPs */ + return; + + case MLX5_DEV_EVENT_LID_CHANGE: + ibev.event = IB_EVENT_LID_CHANGE; + port = *(u8 *)data; + break; + + case MLX5_DEV_EVENT_PKEY_CHANGE: + ibev.event = IB_EVENT_PKEY_CHANGE; + port = *(u8 *)data; + break; + + case MLX5_DEV_EVENT_GUID_CHANGE: + ibev.event = IB_EVENT_GID_CHANGE; + port = *(u8 *)data; + break; + + case MLX5_DEV_EVENT_CLIENT_REREG: + ibev.event = IB_EVENT_CLIENT_REREGISTER; + port = *(u8 *)data; + break; + } + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = port; + + if (port < 1 || port > ibdev->num_ports) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); + return; + } + + if (ibdev->ib_active) + ib_dispatch_event(&ibev); +} + +static void get_ext_port_caps(struct mlx5_ib_dev *dev) +{ + int port; + + for (port = 1; port <= dev->mdev.caps.num_ports; port++) + mlx5_query_ext_port_caps(dev, port); +} + +static int get_port_caps(struct mlx5_ib_dev *dev) +{ + struct ib_device_attr *dprops = NULL; + struct ib_port_attr *pprops = NULL; + int err = 0; + int port; + + pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); + if (!pprops) + goto out; + + dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); + if (!dprops) + goto out; + + err = mlx5_ib_query_device(&dev->ib_dev, dprops); + if (err) { + mlx5_ib_warn(dev, "query_device failed %d\n", err); + goto out; + } + + for (port = 1; port <= dev->mdev.caps.num_ports; port++) { + err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); + if (err) { + mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err); + break; + } + dev->mdev.caps.port[port - 1].pkey_table_len = dprops->max_pkeys; + dev->mdev.caps.port[port - 1].gid_table_len = pprops->gid_tbl_len; + mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", + dprops->max_pkeys, pprops->gid_tbl_len); + } + +out: + kfree(pprops); + kfree(dprops); + + return err; +} + +static void destroy_umrc_res(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_mr_cache_cleanup(dev); + if (err) + mlx5_ib_warn(dev, "mr cache cleanup failed\n"); + + mlx5_ib_destroy_qp(dev->umrc.qp); + ib_destroy_cq(dev->umrc.cq); + ib_dereg_mr(dev->umrc.mr); + ib_dealloc_pd(dev->umrc.pd); +} + +enum { + MAX_UMR_WR = 128, +}; + +static int create_umr_res(struct mlx5_ib_dev *dev) +{ + struct ib_qp_init_attr *init_attr = NULL; + struct ib_qp_attr *attr = NULL; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_mr *mr; + int ret; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto error_0; + } + + pd = ib_alloc_pd(&dev->ib_dev); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + ret = PTR_ERR(pd); + goto error_0; + } + + mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(mr)) { + mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n"); + ret = PTR_ERR(mr); + goto error_1; + } + + cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, 128, + 0); + if (IS_ERR(cq)) { + mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); + ret = PTR_ERR(cq); + goto error_2; + } + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + + init_attr->send_cq = cq; + init_attr->recv_cq = cq; + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->cap.max_send_wr = MAX_UMR_WR; + init_attr->cap.max_send_sge = 1; + init_attr->qp_type = MLX5_IB_QPT_REG_UMR; + init_attr->port_num = 1; + qp = mlx5_ib_create_qp(pd, init_attr, NULL); + if (IS_ERR(qp)) { + mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); + ret = PTR_ERR(qp); + goto error_3; + } + qp->device = &dev->ib_dev; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = MLX5_IB_QPT_REG_UMR; + + attr->qp_state = IB_QPS_INIT; + attr->port_num = 1; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_PORT, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTR; + attr->path_mtu = IB_MTU_256; + + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTS; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); + goto error_4; + } + + dev->umrc.qp = qp; + dev->umrc.cq = cq; + dev->umrc.mr = mr; + dev->umrc.pd = pd; + + sema_init(&dev->umrc.sem, MAX_UMR_WR); + ret = mlx5_mr_cache_init(dev); + if (ret) { + mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); + goto error_4; + } + + kfree(attr); + kfree(init_attr); + + return 0; + +error_4: + mlx5_ib_destroy_qp(qp); + +error_3: + ib_destroy_cq(cq); + +error_2: + ib_dereg_mr(mr); + +error_1: + ib_dealloc_pd(pd); + +error_0: + kfree(attr); + kfree(init_attr); + return ret; +} + +static int create_dev_resources(struct mlx5_ib_resources *devr) +{ + struct ib_srq_init_attr attr; + struct mlx5_ib_dev *dev; + int ret = 0; + + dev = container_of(devr, struct mlx5_ib_dev, devr); + + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->p0)) { + ret = PTR_ERR(devr->p0); + goto error0; + } + devr->p0->device = &dev->ib_dev; + devr->p0->uobject = NULL; + atomic_set(&devr->p0->usecnt, 0); + + devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, 1, 0, NULL, NULL); + if (IS_ERR(devr->c0)) { + ret = PTR_ERR(devr->c0); + goto error1; + } + devr->c0->device = &dev->ib_dev; + devr->c0->uobject = NULL; + devr->c0->comp_handler = NULL; + devr->c0->event_handler = NULL; + devr->c0->cq_context = NULL; + atomic_set(&devr->c0->usecnt, 0); + + devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x0)) { + ret = PTR_ERR(devr->x0); + goto error2; + } + devr->x0->device = &dev->ib_dev; + devr->x0->inode = NULL; + atomic_set(&devr->x0->usecnt, 0); + mutex_init(&devr->x0->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x0->tgt_qp_list); + + devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x1)) { + ret = PTR_ERR(devr->x1); + goto error3; + } + devr->x1->device = &dev->ib_dev; + devr->x1->inode = NULL; + atomic_set(&devr->x1->usecnt, 0); + mutex_init(&devr->x1->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x1->tgt_qp_list); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_XRC; + attr.ext.xrc.cq = devr->c0; + attr.ext.xrc.xrcd = devr->x0; + + devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s0)) { + ret = PTR_ERR(devr->s0); + goto error4; + } + devr->s0->device = &dev->ib_dev; + devr->s0->pd = devr->p0; + devr->s0->uobject = NULL; + devr->s0->event_handler = NULL; + devr->s0->srq_context = NULL; + devr->s0->srq_type = IB_SRQT_XRC; + devr->s0->ext.xrc.xrcd = devr->x0; + devr->s0->ext.xrc.cq = devr->c0; + atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); + atomic_inc(&devr->s0->ext.xrc.cq->usecnt); + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s0->usecnt, 0); + + return 0; + +error4: + mlx5_ib_dealloc_xrcd(devr->x1); +error3: + mlx5_ib_dealloc_xrcd(devr->x0); +error2: + mlx5_ib_destroy_cq(devr->c0); +error1: + mlx5_ib_dealloc_pd(devr->p0); +error0: + return ret; +} + +static void destroy_dev_resources(struct mlx5_ib_resources *devr) +{ + mlx5_ib_destroy_srq(devr->s0); + mlx5_ib_dealloc_xrcd(devr->x0); + mlx5_ib_dealloc_xrcd(devr->x1); + mlx5_ib_destroy_cq(devr->c0); + mlx5_ib_dealloc_pd(devr->p0); +} + +static int init_one(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *dev; + int err; + int i; + + printk_once(KERN_INFO "%s", mlx5_version); + + dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) + return -ENOMEM; + + mdev = &dev->mdev; + mdev->event = mlx5_ib_event; + if (prof_sel >= ARRAY_SIZE(profile)) { + pr_warn("selected pofile out of range, selceting default\n"); + prof_sel = 0; + } + mdev->profile = &profile[prof_sel]; + err = mlx5_dev_init(mdev, pdev); + if (err) + goto err_free; + + err = get_port_caps(dev); + if (err) + goto err_cleanup; + + get_ext_port_caps(dev); + + err = alloc_comp_eqs(dev); + if (err) + goto err_cleanup; + + MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); + + strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); + dev->ib_dev.owner = THIS_MODULE; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = mdev->caps.reserved_lkey; + dev->num_ports = mdev->caps.num_ports; + dev->ib_dev.phys_port_cnt = dev->num_ports; + dev->ib_dev.num_comp_vectors = dev->num_comp_vectors; + dev->ib_dev.dma_device = &mdev->pdev->dev; + + dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + dev->ib_dev.query_device = mlx5_ib_query_device; + dev->ib_dev.query_port = mlx5_ib_query_port; + dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.query_pkey = mlx5_ib_query_pkey; + dev->ib_dev.modify_device = mlx5_ib_modify_device; + dev->ib_dev.modify_port = mlx5_ib_modify_port; + dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; + dev->ib_dev.mmap = mlx5_ib_mmap; + dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; + dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; + dev->ib_dev.create_ah = mlx5_ib_create_ah; + dev->ib_dev.query_ah = mlx5_ib_query_ah; + dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; + dev->ib_dev.create_srq = mlx5_ib_create_srq; + dev->ib_dev.modify_srq = mlx5_ib_modify_srq; + dev->ib_dev.query_srq = mlx5_ib_query_srq; + dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; + dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; + dev->ib_dev.create_qp = mlx5_ib_create_qp; + dev->ib_dev.modify_qp = mlx5_ib_modify_qp; + dev->ib_dev.query_qp = mlx5_ib_query_qp; + dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; + dev->ib_dev.post_send = mlx5_ib_post_send; + dev->ib_dev.post_recv = mlx5_ib_post_recv; + dev->ib_dev.create_cq = mlx5_ib_create_cq; + dev->ib_dev.modify_cq = mlx5_ib_modify_cq; + dev->ib_dev.resize_cq = mlx5_ib_resize_cq; + dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; + dev->ib_dev.poll_cq = mlx5_ib_poll_cq; + dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; + dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; + dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; + dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; + dev->ib_dev.destroy_mr = mlx5_ib_destroy_mr; + dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; + dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; + dev->ib_dev.process_mad = mlx5_ib_process_mad; + dev->ib_dev.create_mr = mlx5_ib_create_mr; + dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr; + dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list; + dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; + dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; + + if (mdev->caps.flags & MLX5_DEV_CAP_FLAG_XRC) { + dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; + dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + err = init_node_data(dev); + if (err) + goto err_eqs; + + mutex_init(&dev->cap_mask_mutex); + spin_lock_init(&dev->mr_lock); + + err = create_dev_resources(&dev->devr); + if (err) + goto err_eqs; + + err = ib_register_device(&dev->ib_dev, NULL); + if (err) + goto err_rsrc; + + err = create_umr_res(dev); + if (err) + goto err_dev; + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { + err = device_create_file(&dev->ib_dev.dev, + mlx5_class_attributes[i]); + if (err) + goto err_umrc; + } + + dev->ib_active = true; + + return 0; + +err_umrc: + destroy_umrc_res(dev); + +err_dev: + ib_unregister_device(&dev->ib_dev); + +err_rsrc: + destroy_dev_resources(&dev->devr); + +err_eqs: + free_comp_eqs(dev); + +err_cleanup: + mlx5_dev_cleanup(mdev); + +err_free: + ib_dealloc_device((struct ib_device *)dev); + + return err; +} + +static void remove_one(struct pci_dev *pdev) +{ + struct mlx5_ib_dev *dev = mlx5_pci2ibdev(pdev); + + destroy_umrc_res(dev); + ib_unregister_device(&dev->ib_dev); + destroy_dev_resources(&dev->devr); + free_comp_eqs(dev); + mlx5_dev_cleanup(&dev->mdev); + ib_dealloc_device(&dev->ib_dev); +} + +static DEFINE_PCI_DEVICE_TABLE(mlx5_ib_pci_table) = { + { PCI_VDEVICE(MELLANOX, 4113) }, /* MT4113 Connect-IB */ + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, mlx5_ib_pci_table); + +static struct pci_driver mlx5_ib_driver = { + .name = DRIVER_NAME, + .id_table = mlx5_ib_pci_table, + .probe = init_one, + .remove = remove_one +}; + +static int __init mlx5_ib_init(void) +{ + return pci_register_driver(&mlx5_ib_driver); +} + +static void __exit mlx5_ib_cleanup(void) +{ + pci_unregister_driver(&mlx5_ib_driver); +} + +module_init(mlx5_ib_init); +module_exit(mlx5_ib_cleanup); diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c new file mode 100644 index 00000000000..8499aec94db --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <rdma/ib_umem.h> +#include "mlx5_ib.h" + +/* @umem: umem object to scan + * @addr: ib virtual address requested by the user + * @count: number of PAGE_SIZE pages covered by umem + * @shift: page shift for the compound pages found in the region + * @ncont: number of compund pages + * @order: log2 of the number of compound pages + */ +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order) +{ + unsigned long tmp; + unsigned long m; + int i, k; + u64 base = 0; + int p = 0; + int skip; + int mask; + u64 len; + u64 pfn; + struct scatterlist *sg; + int entry; + + addr = addr >> PAGE_SHIFT; + tmp = (unsigned long)addr; + m = find_first_bit(&tmp, sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> PAGE_SHIFT; + pfn = sg_dma_address(sg) >> PAGE_SHIFT; + for (k = 0; k < len; k++) { + if (!(i & mask)) { + tmp = (unsigned long)pfn; + m = min(m, find_first_bit(&tmp, sizeof(tmp))); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } else { + if (base + p != pfn) { + tmp = (unsigned long)p; + m = find_first_bit(&tmp, sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } + } + p++; + i++; + } + } + + if (i) { + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); + + if (order) + *order = ilog2(roundup_pow_of_two(i) >> m); + + *ncont = DIV_ROUND_UP(i, (1 << m)); + } else { + m = 0; + + if (order) + *order = 0; + + *ncont = 0; + } + *shift = PAGE_SHIFT + m; + *count = i; +} + +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int umr) +{ + int shift = page_shift - PAGE_SHIFT; + int mask = (1 << shift) - 1; + int i, k; + u64 cur = 0; + u64 base; + int len; + struct scatterlist *sg; + int entry; + + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> PAGE_SHIFT; + base = sg_dma_address(sg); + for (k = 0; k < len; k++) { + if (!(i & mask)) { + cur = base + (k << PAGE_SHIFT); + if (umr) + cur |= 3; + + pas[i >> shift] = cpu_to_be64(cur); + mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", + i >> shift, be64_to_cpu(pas[i >> shift])); + } else + mlx5_ib_dbg(dev, "=====> 0x%llx\n", + base + (k << PAGE_SHIFT)); + i++; + } + } +} + +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) +{ + u64 page_size; + u64 page_mask; + u64 off_size; + u64 off_mask; + u64 buf_off; + + page_size = 1 << page_shift; + page_mask = page_size - 1; + buf_off = addr & page_mask; + off_size = page_size >> 6; + off_mask = off_size - 1; + + if (buf_off & off_mask) + return -EINVAL; + + *offset = buf_off >> ilog2(off_size); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h new file mode 100644 index 00000000000..f2ccf1a5a29 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -0,0 +1,574 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_H +#define MLX5_IB_H + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_smi.h> +#include <linux/mlx5/driver.h> +#include <linux/mlx5/cq.h> +#include <linux/mlx5/qp.h> +#include <linux/mlx5/srq.h> +#include <linux/types.h> + +#define mlx5_ib_dbg(dev, format, arg...) \ +pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_err(dev, format, arg...) \ +pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_warn(dev, format, arg...) \ +pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +enum { + MLX5_IB_MMAP_CMD_SHIFT = 8, + MLX5_IB_MMAP_CMD_MASK = 0xff, +}; + +enum mlx5_ib_mmap_cmd { + MLX5_IB_MMAP_REGULAR_PAGE = 0, + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, /* always last */ +}; + +enum { + MLX5_RES_SCAT_DATA32_CQE = 0x1, + MLX5_RES_SCAT_DATA64_CQE = 0x2, + MLX5_REQ_SCAT_DATA32_CQE = 0x11, + MLX5_REQ_SCAT_DATA64_CQE = 0x22, +}; + +enum mlx5_ib_latency_class { + MLX5_IB_LATENCY_CLASS_LOW, + MLX5_IB_LATENCY_CLASS_MEDIUM, + MLX5_IB_LATENCY_CLASS_HIGH, + MLX5_IB_LATENCY_CLASS_FAST_PATH +}; + +enum mlx5_ib_mad_ifc_flags { + MLX5_MAD_IFC_IGNORE_MKEY = 1, + MLX5_MAD_IFC_IGNORE_BKEY = 2, + MLX5_MAD_IFC_NET_VIEW = 4, +}; + +struct mlx5_ib_ucontext { + struct ib_ucontext ibucontext; + struct list_head db_page_list; + + /* protect doorbell record alloc/free + */ + struct mutex db_page_mutex; + struct mlx5_uuar_info uuari; +}; + +static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext); +} + +struct mlx5_ib_pd { + struct ib_pd ibpd; + u32 pdn; + u32 pa_lkey; +}; + +/* Use macros here so that don't have to duplicate + * enum ib_send_flags and enum ib_qp_type for low-level driver + */ + +#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START +#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +#define MLX5_IB_WR_UMR IB_WR_RESERVED1 + +struct wr_list { + u16 opcode; + u16 next; +}; + +struct mlx5_ib_wq { + u64 *wrid; + u32 *wr_data; + struct wr_list *w_list; + unsigned *wqe_head; + u16 unsig_count; + + /* serialize post to the work queue + */ + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; + u16 cur_post; + u16 last_poll; + void *qend; +}; + +enum { + MLX5_QP_USER, + MLX5_QP_KERNEL, + MLX5_QP_EMPTY +}; + +struct mlx5_ib_qp { + struct ib_qp ibqp; + struct mlx5_core_qp mqp; + struct mlx5_buf buf; + + struct mlx5_db db; + struct mlx5_ib_wq rq; + + u32 doorbell_qpn; + u8 sq_signal_bits; + u8 fm_cache; + int sq_max_wqes_per_wr; + int sq_spare_wqes; + struct mlx5_ib_wq sq; + + struct ib_umem *umem; + int buf_size; + + /* serialize qp state modifications + */ + struct mutex mutex; + u16 xrcdn; + u32 flags; + u8 port; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; + u8 state; + int mlx_type; + int wq_sig; + int scat_cqe; + int max_inline_data; + struct mlx5_bf *bf; + int has_rq; + + /* only for user space QPs. For kernel + * we have it from the bf object + */ + int uuarn; + + int create_type; + u32 pa_lkey; + + /* Store signature errors */ + bool signature_en; +}; + +struct mlx5_ib_cq_buf { + struct mlx5_buf buf; + struct ib_umem *umem; + int cqe_size; + int nent; +}; + +enum mlx5_ib_qp_flags { + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, +}; + +struct mlx5_shared_mr_info { + int mr_id; + struct ib_umem *umem; +}; + +struct mlx5_ib_cq { + struct ib_cq ibcq; + struct mlx5_core_cq mcq; + struct mlx5_ib_cq_buf buf; + struct mlx5_db db; + + /* serialize access to the CQ + */ + spinlock_t lock; + + /* protect resize cq + */ + struct mutex resize_mutex; + struct mlx5_ib_cq_buf *resize_buf; + struct ib_umem *resize_umem; + int cqe_size; +}; + +struct mlx5_ib_srq { + struct ib_srq ibsrq; + struct mlx5_core_srq msrq; + struct mlx5_buf buf; + struct mlx5_db db; + u64 *wrid; + /* protect SRQ hanlding + */ + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + /* serialize arming a SRQ + */ + struct mutex mutex; + int wq_sig; +}; + +struct mlx5_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; +}; + +struct mlx5_ib_mr { + struct ib_mr ibmr; + struct mlx5_core_mr mmr; + struct ib_umem *umem; + struct mlx5_shared_mr_info *smr_info; + struct list_head list; + int order; + int umred; + __be64 *pas; + dma_addr_t dma; + int npages; + struct mlx5_ib_dev *dev; + struct mlx5_create_mkey_mbox_out out; + struct mlx5_core_sig_ctx *sig; +}; + +struct mlx5_ib_fast_reg_page_list { + struct ib_fast_reg_page_list ibfrpl; + __be64 *mapped_page_list; + dma_addr_t map; +}; + +struct mlx5_ib_umr_context { + enum ib_wc_status status; + struct completion done; +}; + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->status = -1; + init_completion(&context->done); +} + +struct umr_common { + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_mr *mr; + /* control access to UMR QP + */ + struct semaphore sem; +}; + +enum { + MLX5_FMR_INVALID, + MLX5_FMR_VALID, + MLX5_FMR_BUSY, +}; + +struct mlx5_ib_fmr { + struct ib_fmr ibfmr; + struct mlx5_core_mr mr; + int access_flags; + int state; + /* protect fmr state + */ + spinlock_t lock; + u64 wrid; + struct ib_send_wr wr[2]; + u8 page_shift; + struct ib_fast_reg_page_list page_list; +}; + +struct mlx5_cache_ent { + struct list_head head; + /* sync access to the cahce entry + */ + spinlock_t lock; + + + struct dentry *dir; + char name[4]; + u32 order; + u32 size; + u32 cur; + u32 miss; + u32 limit; + + struct dentry *fsize; + struct dentry *fcur; + struct dentry *fmiss; + struct dentry *flimit; + + struct mlx5_ib_dev *dev; + struct work_struct work; + struct delayed_work dwork; + int pending; +}; + +struct mlx5_mr_cache { + struct workqueue_struct *wq; + struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; + int stopped; + struct dentry *root; + unsigned long last_add; +}; + +struct mlx5_ib_resources { + struct ib_cq *c0; + struct ib_xrcd *x0; + struct ib_xrcd *x1; + struct ib_pd *p0; + struct ib_srq *s0; +}; + +struct mlx5_ib_dev { + struct ib_device ib_dev; + struct mlx5_core_dev mdev; + MLX5_DECLARE_DOORBELL_LOCK(uar_lock); + struct list_head eqs_list; + int num_ports; + int num_comp_vectors; + /* serialize update of capability mask + */ + struct mutex cap_mask_mutex; + bool ib_active; + struct umr_common umrc; + /* sync used page count stats + */ + spinlock_t mr_lock; + struct mlx5_ib_resources devr; + struct mlx5_mr_cache cache; + struct timer_list delay_timer; + int fill_delay; +}; + +static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) +{ + return container_of(mcq, struct mlx5_ib_cq, mcq); +} + +static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd); +} + +static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx5_ib_dev, ib_dev); +} + +static inline struct mlx5_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct mlx5_ib_fmr, ibfmr); +} + +static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx5_ib_cq, ibcq); +} + +static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) +{ + return container_of(mqp, struct mlx5_ib_qp, mqp); +} + +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) +{ + return container_of(mmr, struct mlx5_ib_mr, mmr); +} + +static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx5_ib_pd, ibpd); +} + +static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx5_ib_srq, ibsrq); +} + +static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx5_ib_qp, ibqp); +} + +static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) +{ + return container_of(msrq, struct mlx5_ib_srq, msrq); +} + +static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx5_ib_mr, ibmr); +} + +static inline struct mlx5_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) +{ + return container_of(ibfrpl, struct mlx5_ib_fast_reg_page_list, ibfrpl); +} + +struct mlx5_ib_ah { + struct ib_ah ibah; + struct mlx5_av av; +}; + +static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx5_ib_ah, ibah); +} + +static inline struct mlx5_ib_dev *mlx5_core2ibdev(struct mlx5_core_dev *dev) +{ + return container_of(dev, struct mlx5_ib_dev, mdev); +} + +static inline struct mlx5_ib_dev *mlx5_pci2ibdev(struct pci_dev *pdev) +{ + return mlx5_core2ibdev(pci2mlx5_core_dev(pdev)); +} + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db); +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad); +struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah); +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int mlx5_ib_destroy_ah(struct ib_ah *ah); +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); +int mlx5_ib_destroy_srq(struct ib_srq *srq); +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_destroy_qp(struct ib_qp *qp); +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_destroy_cq(struct ib_cq *cq); +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int mlx5_ib_dereg_mr(struct ib_mr *ibmr); +int mlx5_ib_destroy_mr(struct ib_mr *ibmr); +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr); +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len); +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len); +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); +struct ib_fmr *mlx5_ib_fmr_alloc(struct ib_pd *pd, int acc, + struct ib_fmr_attr *fmr_attr); +int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int npages, u64 iova); +int mlx5_ib_unmap_fmr(struct list_head *fmr_list); +int mlx5_ib_fmr_dealloc(struct ib_fmr *ibfmr); +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd); +int mlx5_vector2eqn(struct mlx5_ib_dev *dev, int vector, int *eqn, int *irqn); +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset); +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port); +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order); +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int umr); +void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); +int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status); + +static inline void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static inline u8 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ; +} + +#endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c new file mode 100644 index 00000000000..afa873bd028 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -0,0 +1,1250 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include <linux/kref.h> +#include <linux/random.h> +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/delay.h> +#include <rdma/ib_umem.h> +#include "mlx5_ib.h" + +enum { + MAX_PENDING_REG_MR = 8, +}; + +enum { + MLX5_UMR_ALIGN = 2048 +}; + +static __be64 *mr_align(__be64 *ptr, int align) +{ + unsigned long mask = align - 1; + + return (__be64 *)(((unsigned long)ptr + mask) & ~mask); +} + +static int order2idx(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + + if (order < cache->ent[0].order) + return 0; + else + return order - cache->ent[0].order; +} + +static void reg_mr_callback(int status, void *context) +{ + struct mlx5_ib_mr *mr = context; + struct mlx5_ib_dev *dev = mr->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int c = order2idx(dev, mr->order); + struct mlx5_cache_ent *ent = &cache->ent[c]; + u8 key; + unsigned long flags; + struct mlx5_mr_table *table = &dev->mdev.priv.mr_table; + int err; + + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); + if (status) { + mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + if (mr->out.hdr.status) { + mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", + mr->out.hdr.status, + be32_to_cpu(mr->out.hdr.syndrome)); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + spin_lock_irqsave(&dev->mdev.priv.mkey_lock, flags); + key = dev->mdev.priv.mkey_key++; + spin_unlock_irqrestore(&dev->mdev.priv.mkey_lock, flags); + mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + + cache->last_add = jiffies; + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + ent->size++; + spin_unlock_irqrestore(&ent->lock, flags); + + write_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key), + &mr->mmr); + if (err) + pr_err("Error inserting to mr tree. 0x%x\n", -err); + write_unlock_irqrestore(&table->lock, flags); +} + +static int add_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int npages = 1 << ent->order; + int err = 0; + int i; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + for (i = 0; i < num; i++) { + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + break; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + break; + } + mr->order = ent->order; + mr->umred = 1; + mr->dev = dev; + in->seg.status = 1 << 6; + in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; + in->seg.log2_page_size = 12; + + spin_lock_irq(&ent->lock); + ent->pending++; + spin_unlock_irq(&ent->lock); + err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, + sizeof(*in), reg_mr_callback, + mr, &mr->out); + if (err) { + mlx5_ib_warn(dev, "create mkey failed %d\n", err); + kfree(mr); + break; + } + } + + kfree(in); + return err; +} + +static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + int i; + + for (i = 0; i < num; i++) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey\n"); + else + kfree(mr); + } +} + +static ssize_t size_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20]; + u32 var; + int err; + int c; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + c = order2idx(dev, ent->order); + lbuf[sizeof(lbuf) - 1] = 0; + + if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var < ent->limit) + return -EINVAL; + + if (var > ent->size) { + do { + err = add_keys(dev, c, var - ent->size); + if (err && err != -EAGAIN) + return err; + + usleep_range(3000, 5000); + } while (err); + } else if (var < ent->size) { + remove_keys(dev, c, ent->size - var); + } + + return count; +} + +static ssize_t size_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + if (*pos) + return 0; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size); + if (err < 0) + return err; + + if (copy_to_user(buf, lbuf, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static const struct file_operations size_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = size_write, + .read = size_read, +}; + +static ssize_t limit_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20]; + u32 var; + int err; + int c; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + c = order2idx(dev, ent->order); + lbuf[sizeof(lbuf) - 1] = 0; + + if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var > ent->size) + return -EINVAL; + + ent->limit = var; + + if (ent->cur < ent->limit) { + err = add_keys(dev, c, 2 * ent->limit - ent->cur); + if (err) + return err; + } + + return count; +} + +static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + if (*pos) + return 0; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); + if (err < 0) + return err; + + if (copy_to_user(buf, lbuf, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static const struct file_operations limit_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = limit_write, + .read = limit_read, +}; + +static int someone_adding(struct mlx5_mr_cache *cache) +{ + int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur < cache->ent[i].limit) + return 1; + } + + return 0; +} + +static void __cache_work_func(struct mlx5_cache_ent *ent) +{ + struct mlx5_ib_dev *dev = ent->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int i = order2idx(dev, ent->order); + int err; + + if (cache->stopped) + return; + + ent = &dev->cache.ent[i]; + if (ent->cur < 2 * ent->limit && !dev->fill_delay) { + err = add_keys(dev, i, 1); + if (ent->cur < 2 * ent->limit) { + if (err == -EAGAIN) { + mlx5_ib_dbg(dev, "returned eagain, order %d\n", + i + 2); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3)); + } else if (err) { + mlx5_ib_warn(dev, "command failed order %d, err %d\n", + i + 2, err); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); + } else { + queue_work(cache->wq, &ent->work); + } + } + } else if (ent->cur > 2 * ent->limit) { + if (!someone_adding(cache) && + time_after(jiffies, cache->last_add + 300 * HZ)) { + remove_keys(dev, i, 1); + if (ent->cur > ent->limit) + queue_work(cache->wq, &ent->work); + } else { + queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); + } + } +} + +static void delayed_cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, dwork.work); + __cache_work_func(ent); +} + +static void cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, work); + __cache_work_func(ent); +} + +static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_ib_mr *mr = NULL; + struct mlx5_cache_ent *ent; + int c; + int i; + + c = order2idx(dev, order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); + return NULL; + } + + for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + + mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); + + spin_lock_irq(&ent->lock); + if (!list_empty(&ent->head)) { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + break; + } + spin_unlock_irq(&ent->lock); + + queue_work(cache->wq, &ent->work); + + if (mr) + break; + } + + if (!mr) + cache->ent[c].miss++; + + return mr; +} + +static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int shrink = 0; + int c; + + c = order2idx(dev, mr->order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c); + return; + } + ent = &cache->ent[c]; + spin_lock_irq(&ent->lock); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + if (ent->cur > 2 * ent->limit) + shrink = 1; + spin_unlock_irq(&ent->lock); + + if (shrink) + queue_work(cache->wq, &ent->work); +} + +static void clean_keys(struct mlx5_ib_dev *dev, int c) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + + cancel_delayed_work(&ent->dwork); + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey\n"); + else + kfree(mr); + } +} + +static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int i; + + if (!mlx5_debugfs_root) + return 0; + + cache->root = debugfs_create_dir("mr_cache", dev->mdev.priv.dbg_root); + if (!cache->root) + return -ENOMEM; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + sprintf(ent->name, "%d", ent->order); + ent->dir = debugfs_create_dir(ent->name, cache->root); + if (!ent->dir) + return -ENOMEM; + + ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent, + &size_fops); + if (!ent->fsize) + return -ENOMEM; + + ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent, + &limit_fops); + if (!ent->flimit) + return -ENOMEM; + + ent->fcur = debugfs_create_u32("cur", 0400, ent->dir, + &ent->cur); + if (!ent->fcur) + return -ENOMEM; + + ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir, + &ent->miss); + if (!ent->fmiss) + return -ENOMEM; + } + + return 0; +} + +static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->cache.root); +} + +static void delay_time_func(unsigned long ctx) +{ + struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; + + dev->fill_delay = 0; +} + +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int limit; + int err; + int i; + + cache->wq = create_singlethread_workqueue("mkey_cache"); + if (!cache->wq) { + mlx5_ib_warn(dev, "failed to create work queue\n"); + return -ENOMEM; + } + + setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + INIT_LIST_HEAD(&cache->ent[i].head); + spin_lock_init(&cache->ent[i].lock); + + ent = &cache->ent[i]; + INIT_LIST_HEAD(&ent->head); + spin_lock_init(&ent->lock); + ent->order = i + 2; + ent->dev = dev; + + if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) + limit = dev->mdev.profile->mr_cache[i].limit; + else + limit = 0; + + INIT_WORK(&ent->work, cache_work_func); + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + ent->limit = limit; + queue_work(cache->wq, &ent->work); + } + + err = mlx5_mr_cache_debugfs_init(dev); + if (err) + mlx5_ib_warn(dev, "cache debugfs failure\n"); + + return 0; +} + +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) +{ + int i; + + dev->cache.stopped = 1; + flush_workqueue(dev->cache.wq); + + mlx5_mr_cache_debugfs_cleanup(dev); + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) + clean_keys(dev, i); + + destroy_workqueue(dev->cache.wq); + del_timer_sync(&dev->delay_timer); + + return 0; +} + +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_core_dev *mdev = &dev->mdev; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + seg = &in->seg; + seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + NULL); + if (err) + goto err_in; + + kfree(in); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +static int get_octo_len(u64 addr, u64 len, int page_size) +{ + u64 offset; + int npages; + + offset = addr & (page_size - 1); + npages = ALIGN(len + offset, page_size) >> ilog2(page_size); + return (npages + 1) / 2; +} + +static int use_umr(int order) +{ + return order <= 17; +} + +static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift, u64 virt_addr, u64 len, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_mr *mr = dev->umrc.mr; + + sg->addr = dma; + sg->length = ALIGN(sizeof(u64) * n, 64); + sg->lkey = mr->lkey; + + wr->next = NULL; + wr->send_flags = 0; + wr->sg_list = sg; + if (n) + wr->num_sge = 1; + else + wr->num_sge = 0; + + wr->opcode = MLX5_IB_WR_UMR; + wr->wr.fast_reg.page_list_len = n; + wr->wr.fast_reg.page_shift = page_shift; + wr->wr.fast_reg.rkey = key; + wr->wr.fast_reg.iova_start = virt_addr; + wr->wr.fast_reg.length = len; + wr->wr.fast_reg.access_flags = access_flags; + wr->wr.fast_reg.page_list = (struct ib_fast_reg_page_list *)pd; +} + +static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, + struct ib_send_wr *wr, u32 key) +{ + wr->send_flags = MLX5_IB_SEND_UMR_UNREG; + wr->opcode = MLX5_IB_WR_UMR; + wr->wr.fast_reg.rkey = key; +} + +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) +{ + struct mlx5_ib_umr_context *context; + struct ib_wc wc; + int err; + + while (1) { + err = ib_poll_cq(cq, 1, &wc); + if (err < 0) { + pr_warn("poll cq error %d\n", err); + return; + } + if (err == 0) + break; + + context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id; + context->status = wc.status; + complete(&context->done); + } + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +} + +static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, + u64 virt_addr, u64 len, int npages, + int page_shift, int order, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct device *ddev = dev->ib_dev.dma_device; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr wr, *bad; + struct mlx5_ib_mr *mr; + struct ib_sge sg; + int size = sizeof(u64) * npages; + int err = 0; + int i; + + for (i = 0; i < 1; i++) { + mr = alloc_cached_mr(dev, order); + if (mr) + break; + + err = add_keys(dev, order2idx(dev, order), 1); + if (err && err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); + break; + } + } + + if (!mr) + return ERR_PTR(-EAGAIN); + + mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); + if (!mr->pas) { + err = -ENOMEM; + goto free_mr; + } + + mlx5_ib_populate_pas(dev, umem, page_shift, + mr_align(mr->pas, MLX5_UMR_ALIGN), 1); + + mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size, + DMA_TO_DEVICE); + if (dma_mapping_error(ddev, mr->dma)) { + err = -ENOMEM; + goto free_pas; + } + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags); + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + goto unmap_dma; + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed\n"); + err = -EFAULT; + } + } + + mr->mmr.iova = virt_addr; + mr->mmr.size = len; + mr->mmr.pd = to_mpd(pd)->pdn; + +unmap_dma: + up(&umrc->sem); + dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); + +free_pas: + kfree(mr->pas); + +free_mr: + if (err) { + free_cached_mr(dev, mr); + return ERR_PTR(err); + } + + return mr; +} + +static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, + u64 length, struct ib_umem *umem, + int npages, int page_shift, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int inlen; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err_1; + } + mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0); + + in->seg.flags = convert_access(access_flags) | + MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + in->seg.start_addr = cpu_to_be64(virt_addr); + in->seg.len = cpu_to_be64(length); + in->seg.bsfs_octo_size = 0; + in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); + in->seg.log2_page_size = page_shift; + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, + 1 << page_shift)); + err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen, NULL, + NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "create mkey failed\n"); + goto err_2; + } + mr->umem = umem; + mlx5_vfree(in); + + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); + + return mr; + +err_2: + mlx5_vfree(in); + +err_1: + kfree(mr); + + return ERR_PTR(err); +} + +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + struct ib_umem *umem; + int page_shift; + int npages; + int ncont; + int order; + int err; + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx\n", + start, virt_addr, length); + umem = ib_umem_get(pd->uobject->context, start, length, access_flags, + 0); + if (IS_ERR(umem)) { + mlx5_ib_dbg(dev, "umem get failed\n"); + return (void *)umem; + } + + mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order); + if (!npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + err = -EINVAL; + goto error; + } + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + npages, ncont, order, page_shift); + + if (use_umr(order)) { + mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, + order, access_flags); + if (PTR_ERR(mr) == -EAGAIN) { + mlx5_ib_dbg(dev, "cache empty for order %d", order); + mr = NULL; + } + } + + if (!mr) + mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift, + access_flags); + + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto error; + } + + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key); + + mr->umem = umem; + mr->npages = npages; + spin_lock(&dev->mr_lock); + dev->mdev.priv.reg_pages += npages; + spin_unlock(&dev->mr_lock); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + + return &mr->ibmr; + +error: + ib_umem_release(umem); + return ERR_PTR(err); +} + +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr wr, *bad; + int err; + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + prep_umr_unreg_wqe(dev, &wr, mr->mmr.key); + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + up(&umrc->sem); + mlx5_ib_dbg(dev, "err %d\n", err); + goto error; + } else { + wait_for_completion(&umr_context.done); + up(&umrc->sem); + } + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "unreg umr failed\n"); + err = -EFAULT; + goto error; + } + return 0; + +error: + return err; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct ib_umem *umem = mr->umem; + int npages = mr->npages; + int umred = mr->umred; + int err; + + if (!umred) { + err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + } else { + err = unreg_umr(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed unregister\n"); + return err; + } + free_cached_mr(dev, mr); + } + + if (umem) { + ib_umem_release(umem); + spin_lock(&dev->mr_lock); + dev->mdev.priv.reg_pages -= npages; + spin_unlock(&dev->mr_lock); + } + + if (!umred) + kfree(mr); + + return 0; +} + +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int access_mode, err; + int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = 1 << 6; /* free */ + in->seg.xlt_oct_size = cpu_to_be32(ndescs); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + access_mode = MLX5_ACCESS_MODE_MTT; + + if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) { + u32 psv_index[2]; + + in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) | + MLX5_MKEY_BSF_EN); + in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) { + err = -ENOMEM; + goto err_free_in; + } + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(&dev->mdev, to_mpd(pd)->pdn, + 2, psv_index); + if (err) + goto err_free_sig; + + access_mode = MLX5_ACCESS_MODE_KLM; + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + } + + in->seg.flags = MLX5_PERM_UMR_EN | access_mode; + err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) + goto err_destroy_psv; + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + kfree(in); + + return &mr->ibmr; + +err_destroy_psv: + if (mr->sig) { + if (mlx5_core_destroy_psv(&dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(&dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + } +err_free_sig: + kfree(mr->sig); +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int err; + + if (mr->sig) { + if (mlx5_core_destroy_psv(&dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(&dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + kfree(mr->sig); + } + + err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + + kfree(mr); + + return err; +} + +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = 1 << 6; /* free */ + in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + /* + * TBD not needed - issue 197292 */ + in->seg.log2_page_size = PAGE_SHIFT; + + err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in), NULL, + NULL, NULL); + kfree(in); + if (err) + goto err_free; + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl; + int size = page_list_len * sizeof(u64); + + mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL); + if (!mfrpl) + return ERR_PTR(-ENOMEM); + + mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL); + if (!mfrpl->ibfrpl.page_list) + goto err_free; + + mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device, + size, &mfrpl->map, + GFP_KERNEL); + if (!mfrpl->mapped_page_list) + goto err_free; + + WARN_ON(mfrpl->map & 0x3f); + + return &mfrpl->ibfrpl; + +err_free: + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); + return ERR_PTR(-ENOMEM); +} + +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); + struct mlx5_ib_dev *dev = to_mdev(page_list->device); + int size = page_list->max_page_list_len * sizeof(u64); + + dma_free_coherent(&dev->mdev.pdev->dev, size, mfrpl->mapped_page_list, + mfrpl->map); + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); +} + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + int ret = 0; + + if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { + pr_err("Invalid status check mask\n"); + ret = -EINVAL; + goto done; + } + + mr_status->fail_status = 0; + if (check_mask & IB_MR_CHECK_SIG_STATUS) { + if (!mmr->sig) { + ret = -EINVAL; + pr_err("signature status check requested on a non-signature enabled MR\n"); + goto done; + } + + mmr->sig->sig_status_checked = true; + if (!mmr->sig->sig_err_exists) + goto done; + + if (ibmr->lkey == mmr->sig->err_item.key) + memcpy(&mr_status->sig_err, &mmr->sig->err_item, + sizeof(mr_status->sig_err)); + else { + mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; + mr_status->sig_err.sig_err_offset = 0; + mr_status->sig_err.key = mmr->sig->err_item.key; + } + + mmr->sig->sig_err_exists = false; + mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; + } + +done: + return ret; +} diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c new file mode 100644 index 00000000000..bbbcf389272 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -0,0 +1,3048 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <rdma/ib_umem.h> +#include "mlx5_ib.h" +#include "user.h" + +/* not supported currently */ +static int wq_signature; + +enum { + MLX5_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX5_IB_LINK_TYPE_IB = 0, + MLX5_IB_LINK_TYPE_ETH = 1 +}; + +enum { + MLX5_IB_SQ_STRIDE = 6, + MLX5_IB_CACHE_LINE_SIZE = 64, +}; + +static const u32 mlx5_ib_opcode[] = { + [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, + [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, + [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, + [IB_WR_FAST_REG_MR] = MLX5_OPCODE_UMR, + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, + [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, +}; + +struct umr_wr { + u64 virt_addr; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int npages; + u32 length; + int access_flags; + u32 mkey; +}; + +static int is_qp0(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_SMI; +} + +static int is_qp1(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_GSI; +} + +static int is_sqp(enum ib_qp_type qp_type) +{ + return is_qp0(qp_type) || is_qp1(qp_type); +} + +static void *get_wqe(struct mlx5_ib_qp *qp, int offset) +{ + return mlx5_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); +} + +static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) +{ + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + struct ib_event event; + + if (type == MLX5_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX5_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX5_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX5_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, + int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) +{ + int wqe_size; + int wq_size; + + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > dev->mdev.caps.max_wqes) + return -EINVAL; + + if (!has_rq) { + qp->rq.max_gs = 0; + qp->rq.wqe_cnt = 0; + qp->rq.wqe_shift = 0; + } else { + if (ucmd) { + qp->rq.wqe_cnt = ucmd->rq_wqe_count; + qp->rq.wqe_shift = ucmd->rq_wqe_shift; + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } else { + wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0; + wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); + wqe_size = roundup_pow_of_two(wqe_size); + wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; + wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); + qp->rq.wqe_cnt = wq_size / wqe_size; + if (wqe_size > dev->mdev.caps.max_rq_desc_sz) { + mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", + wqe_size, + dev->mdev.caps.max_rq_desc_sz); + return -EINVAL; + } + qp->rq.wqe_shift = ilog2(wqe_size); + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } + } + + return 0; +} + +static int sq_overhead(enum ib_qp_type qp_type) +{ + int size = 0; + + switch (qp_type) { + case IB_QPT_XRC_INI: + size += sizeof(struct mlx5_wqe_xrc_seg); + /* fall through */ + case IB_QPT_RC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg); + break; + + case IB_QPT_XRC_TGT: + return 0; + + case IB_QPT_UC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_datagram_seg); + break; + + case MLX5_IB_QPT_REG_UMR: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + default: + return -EINVAL; + } + + return size; +} + +static int calc_send_wqe(struct ib_qp_init_attr *attr) +{ + int inl_size = 0; + int size; + + size = sq_overhead(attr->qp_type); + if (size < 0) + return size; + + if (attr->cap.max_inline_data) { + inl_size = size + sizeof(struct mlx5_wqe_inline_seg) + + attr->cap.max_inline_data; + } + + size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && + ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) + return MLX5_SIG_WQE_SIZE; + else + return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); +} + +static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, + struct mlx5_ib_qp *qp) +{ + int wqe_size; + int wq_size; + + if (!attr->cap.max_send_wr) + return 0; + + wqe_size = calc_send_wqe(attr); + mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size); + if (wqe_size < 0) + return wqe_size; + + if (wqe_size > dev->mdev.caps.max_sq_desc_sz) { + mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", + wqe_size, dev->mdev.caps.max_sq_desc_sz); + return -EINVAL; + } + + qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - + sizeof(struct mlx5_wqe_inline_seg); + attr->cap.max_inline_data = qp->max_inline_data; + + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + qp->signature_en = true; + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); + qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + if (qp->sq.wqe_cnt > dev->mdev.caps.max_wqes) { + mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n", + qp->sq.wqe_cnt, dev->mdev.caps.max_wqes); + return -ENOMEM; + } + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.max_gs = attr->cap.max_send_sge; + qp->sq.max_post = wq_size / wqe_size; + attr->cap.max_send_wr = qp->sq.max_post; + + return wq_size; +} + +static int set_user_buf_size(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_create_qp *ucmd) +{ + int desc_sz = 1 << qp->sq.wqe_shift; + + if (desc_sz > dev->mdev.caps.max_sq_desc_sz) { + mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", + desc_sz, dev->mdev.caps.max_sq_desc_sz); + return -EINVAL; + } + + if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) { + mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n", + ucmd->sq_wqe_count, ucmd->sq_wqe_count); + return -EINVAL; + } + + qp->sq.wqe_cnt = ucmd->sq_wqe_count; + + if (qp->sq.wqe_cnt > dev->mdev.caps.max_wqes) { + mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", + qp->sq.wqe_cnt, dev->mdev.caps.max_wqes); + return -EINVAL; + } + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + + return 0; +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || + attr->qp_type == IB_QPT_XRC_TGT || attr->srq || + attr->qp_type == MLX5_IB_QPT_REG_UMR || + !attr->cap.max_recv_wr) + return 0; + + return 1; +} + +static int first_med_uuar(void) +{ + return 1; +} + +static int next_uuar(int n) +{ + n++; + + while (((n % 4) & 2)) + n++; + + return n; +} + +static int num_med_uuar(struct mlx5_uuar_info *uuari) +{ + int n; + + n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE - + uuari->num_low_latency_uuars - 1; + + return n >= 0 ? n : 0; +} + +static int max_uuari(struct mlx5_uuar_info *uuari) +{ + return uuari->num_uars * 4; +} + +static int first_hi_uuar(struct mlx5_uuar_info *uuari) +{ + int med; + int i; + int t; + + med = num_med_uuar(uuari); + for (t = 0, i = first_med_uuar();; i = next_uuar(i)) { + t++; + if (t == med) + return next_uuar(i); + } + + return 0; +} + +static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) +{ + int i; + + for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) { + if (!test_bit(i, uuari->bitmap)) { + set_bit(i, uuari->bitmap); + uuari->count[i]++; + return i; + } + } + + return -ENOMEM; +} + +static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari) +{ + int minidx = first_med_uuar(); + int i; + + for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) { + if (uuari->count[i] < uuari->count[minidx]) + minidx = i; + } + + uuari->count[minidx]++; + return minidx; +} + +static int alloc_uuar(struct mlx5_uuar_info *uuari, + enum mlx5_ib_latency_class lat) +{ + int uuarn = -EINVAL; + + mutex_lock(&uuari->lock); + switch (lat) { + case MLX5_IB_LATENCY_CLASS_LOW: + uuarn = 0; + uuari->count[uuarn]++; + break; + + case MLX5_IB_LATENCY_CLASS_MEDIUM: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_med_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_HIGH: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_high_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_FAST_PATH: + uuarn = 2; + break; + } + mutex_unlock(&uuari->lock); + + return uuarn; +} + +static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; + int high_uuar = nuuars - uuari->num_low_latency_uuars; + + mutex_lock(&uuari->lock); + if (uuarn == 0) { + --uuari->count[uuarn]; + goto out; + } + + if (uuarn < high_uuar) { + free_med_class_uuar(uuari, uuarn); + goto out; + } + + free_high_class_uuar(uuari, uuarn); + +out: + mutex_unlock(&uuari->lock); +} + +static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX5_QP_STATE_RST; + case IB_QPS_INIT: return MLX5_QP_STATE_INIT; + case IB_QPS_RTR: return MLX5_QP_STATE_RTR; + case IB_QPS_RTS: return MLX5_QP_STATE_RTS; + case IB_QPS_SQD: return MLX5_QP_STATE_SQD; + case IB_QPS_SQE: return MLX5_QP_STATE_SQER; + case IB_QPS_ERR: return MLX5_QP_STATE_ERR; + default: return -1; + } +} + +static int to_mlx5_st(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: return MLX5_QP_ST_RC; + case IB_QPT_UC: return MLX5_QP_ST_UC; + case IB_QPT_UD: return MLX5_QP_ST_UD; + case MLX5_IB_QPT_REG_UMR: return MLX5_QP_ST_REG_UMR; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; + case IB_QPT_SMI: return MLX5_QP_ST_QP0; + case IB_QPT_GSI: return MLX5_QP_ST_QP1; + case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; + case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: return -EINVAL; + } +} + +static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) +{ + return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; +} + +static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct mlx5_create_qp_mbox_in **in, + struct mlx5_ib_create_qp_resp *resp, int *inlen) +{ + struct mlx5_ib_ucontext *context; + struct mlx5_ib_create_qp ucmd; + int page_shift = 0; + int uar_index; + int npages; + u32 offset = 0; + int uuarn; + int ncont = 0; + int err; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + return err; + } + + context = to_mucontext(pd->uobject->context); + /* + * TBD: should come from the verbs when we have the API + */ + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to medium latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to high latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "uuar allocation failed\n"); + return uuarn; + } + } + } + + uar_index = uuarn_to_uar_index(&context->uuari, uuarn); + mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); + + qp->rq.offset = 0; + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + + err = set_user_buf_size(dev, qp, &ucmd); + if (err) + goto err_uuar; + + if (ucmd.buf_addr && qp->buf_size) { + qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + qp->buf_size, 0, 0); + if (IS_ERR(qp->umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + err = PTR_ERR(qp->umem); + goto err_uuar; + } + } else { + qp->umem = NULL; + } + + if (qp->umem) { + mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", + ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset); + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_umem; + } + if (qp->umem) + mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + (*in)->ctx.params2 = cpu_to_be32(offset << 6); + + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + resp->uuar_index = uuarn; + qp->uuarn = uuarn; + + err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_free; + } + + err = ib_copy_to_udata(udata, resp, sizeof(*resp)); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + goto err_unmap; + } + qp->create_type = MLX5_QP_USER; + + return 0; + +err_unmap: + mlx5_ib_db_unmap_user(context, &qp->db); + +err_free: + mlx5_vfree(*in); + +err_umem: + if (qp->umem) + ib_umem_release(qp->umem); + +err_uuar: + free_uuar(&context->uuari, uuarn); + return err; +} + +static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_ucontext *context; + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &qp->db); + if (qp->umem) + ib_umem_release(qp->umem); + free_uuar(&context->uuari, qp->uuarn); +} + +static int create_kernel_qp(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_mbox_in **in, int *inlen) +{ + enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; + struct mlx5_uuar_info *uuari; + int uar_index; + int uuarn; + int err; + + uuari = &dev->mdev.priv.uuari; + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + return -EINVAL; + + if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) + lc = MLX5_IB_LATENCY_CLASS_FAST_PATH; + + uuarn = alloc_uuar(uuari, lc); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "\n"); + return -ENOMEM; + } + + qp->bf = &uuari->bfs[uuarn]; + uar_index = qp->bf->uar->index; + + err = calc_sq_size(dev, init_attr, qp); + if (err < 0) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_uuar; + } + + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + + err = mlx5_buf_alloc(&dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_uuar; + } + + qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + /* Set "fast registration enabled" for all kernel QPs */ + (*in)->ctx.params1 |= cpu_to_be32(1 << 11); + (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); + + mlx5_fill_page_array(&qp->buf, (*in)->pas); + + err = mlx5_db_alloc(&dev->mdev, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_free; + } + + qp->db.db[0] = 0; + qp->db.db[1] = 0; + + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL); + qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL); + qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL); + qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL); + + if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid || + !qp->sq.w_list || !qp->sq.wqe_head) { + err = -ENOMEM; + goto err_wrid; + } + qp->create_type = MLX5_QP_KERNEL; + + return 0; + +err_wrid: + mlx5_db_free(&dev->mdev, &qp->db); + kfree(qp->sq.wqe_head); + kfree(qp->sq.w_list); + kfree(qp->sq.wrid); + kfree(qp->sq.wr_data); + kfree(qp->rq.wrid); + +err_free: + mlx5_vfree(*in); + +err_buf: + mlx5_buf_free(&dev->mdev, &qp->buf); + +err_uuar: + free_uuar(&dev->mdev.priv.uuari, uuarn); + return err; +} + +static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_db_free(&dev->mdev, &qp->db); + kfree(qp->sq.wqe_head); + kfree(qp->sq.w_list); + kfree(qp->sq.wrid); + kfree(qp->sq.wr_data); + kfree(qp->rq.wrid); + mlx5_buf_free(&dev->mdev, &qp->buf); + free_uuar(&dev->mdev.priv.uuari, qp->bf->uuarn); +} + +static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) +{ + if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || + (attr->qp_type == IB_QPT_XRC_INI)) + return cpu_to_be32(MLX5_SRQ_RQ); + else if (!qp->has_rq) + return cpu_to_be32(MLX5_ZERO_LEN_RQ); + else + return cpu_to_be32(MLX5_NON_ZERO_RQ); +} + +static int is_connected(enum ib_qp_type qp_type) +{ + if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) + return 1; + + return 0; +} + +static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct mlx5_ib_create_qp_resp resp; + struct mlx5_create_qp_mbox_in *in; + struct mlx5_ib_create_qp ucmd; + int inlen = sizeof(*in); + int err; + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) { + mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); + return -EINVAL; + } else { + qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; + } + } + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + if (pd && pd->uobject) { + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); + qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); + } else { + qp->wq_sig = !!wq_signature; + } + + qp->has_rq = qp_has_rq(init_attr); + err = set_rq_size(dev, &init_attr->cap, qp->has_rq, + qp, (pd && pd->uobject) ? &ucmd : NULL); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + if (pd) { + if (pd->uobject) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); + if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || + ucmd.rq_wqe_count != qp->rq.wqe_cnt) { + mlx5_ib_dbg(dev, "invalid rq params\n"); + return -EINVAL; + } + if (ucmd.sq_wqe_count > dev->mdev.caps.max_wqes) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", + ucmd.sq_wqe_count, dev->mdev.caps.max_wqes); + return -EINVAL; + } + err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + } else { + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + else + qp->pa_lkey = to_mpd(pd)->pa_lkey; + } + + if (err) + return err; + } else { + in = mlx5_vzalloc(sizeof(*in)); + if (!in) + return -ENOMEM; + + qp->create_type = MLX5_QP_EMPTY; + } + + if (is_sqp(init_attr->qp_type)) + qp->port = init_attr->port_num; + + in->ctx.flags = cpu_to_be32(to_mlx5_st(init_attr->qp_type) << 16 | + MLX5_QP_PM_MIGRATED << 11); + + if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR) + in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? pd : devr->p0)->pdn); + else + in->ctx.flags_pd = cpu_to_be32(MLX5_QP_LAT_SENSITIVE); + + if (qp->wq_sig) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG); + + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { + int rcqe_sz; + int scqe_sz; + + rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); + scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); + + if (rcqe_sz == 128) + in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE; + else + in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE; + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) { + if (scqe_sz == 128) + in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE; + else + in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE; + } + } + + if (qp->rq.wqe_cnt) { + in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4); + in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3; + } + + in->ctx.rq_type_srqn = get_rx_type(qp, init_attr); + + if (qp->sq.wqe_cnt) + in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11); + else + in->ctx.sq_crq_size |= cpu_to_be16(0x8000); + + /* Set default resources */ + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn); + break; + case IB_QPT_XRC_INI: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + break; + default: + if (init_attr->srq) { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn); + } else { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + } + } + + if (init_attr->send_cq) + in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn); + + if (init_attr->recv_cq) + in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn); + + in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); + + err = mlx5_core_create_qp(&dev->mdev, &qp->mqp, in, inlen); + if (err) { + mlx5_ib_dbg(dev, "create qp failed\n"); + goto err_create; + } + + mlx5_vfree(in); + /* Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + qp->mqp.event = mlx5_ib_qp_event; + + return 0; + +err_create: + if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(pd, qp); + else if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + + mlx5_vfree(in); + return err; +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, + SINGLE_DEPTH_NESTING); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + spin_lock_irq(&send_cq->lock); + __acquire(&recv_cq->lock); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, + SINGLE_DEPTH_NESTING); + } + } else { + spin_lock_irq(&send_cq->lock); + } + } else if (recv_cq) { + spin_lock_irq(&recv_cq->lock); + } +} + +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + __release(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } + } else { + spin_unlock_irq(&send_cq->lock); + } + } else if (recv_cq) { + spin_unlock_irq(&recv_cq->lock); + } +} + +static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp) +{ + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(struct mlx5_ib_qp *qp, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) +{ + switch (qp->ibqp.qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = NULL; + *recv_cq = NULL; + break; + case MLX5_IB_QPT_REG_UMR: + case IB_QPT_XRC_INI: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = NULL; + break; + + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = to_mcq(qp->ibqp.recv_cq); + break; + + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: + *send_cq = NULL; + *recv_cq = NULL; + break; + } +} + +static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_modify_qp_mbox_in *in; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return; + if (qp->state != IB_QPS_RESET) + if (mlx5_core_qp_modify(&dev->mdev, to_mlx5_state(qp->state), + MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp)) + mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", + qp->mqp.qpn); + + get_cqs(qp, &send_cq, &recv_cq); + + if (qp->create_type == MLX5_QP_KERNEL) { + mlx5_ib_lock_cqs(send_cq, recv_cq); + __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + } + + err = mlx5_core_destroy_qp(&dev->mdev, &qp->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn); + kfree(in); + + + if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + else if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(&get_pd(qp)->ibpd, qp); +} + +static const char *ib_qp_type_str(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_SMI: + return "IB_QPT_SMI"; + case IB_QPT_GSI: + return "IB_QPT_GSI"; + case IB_QPT_RC: + return "IB_QPT_RC"; + case IB_QPT_UC: + return "IB_QPT_UC"; + case IB_QPT_UD: + return "IB_QPT_UD"; + case IB_QPT_RAW_IPV6: + return "IB_QPT_RAW_IPV6"; + case IB_QPT_RAW_ETHERTYPE: + return "IB_QPT_RAW_ETHERTYPE"; + case IB_QPT_XRC_INI: + return "IB_QPT_XRC_INI"; + case IB_QPT_XRC_TGT: + return "IB_QPT_XRC_TGT"; + case IB_QPT_RAW_PACKET: + return "IB_QPT_RAW_PACKET"; + case MLX5_IB_QPT_REG_UMR: + return "MLX5_IB_QPT_REG_UMR"; + case IB_QPT_MAX: + default: + return "Invalid QP type"; + } +} + +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_qp *qp; + u16 xrcdn = 0; + int err; + + if (pd) { + dev = to_mdev(pd->device); + } else { + /* being cautious here */ + if (init_attr->qp_type != IB_QPT_XRC_TGT && + init_attr->qp_type != MLX5_IB_QPT_REG_UMR) { + pr_warn("%s: no PD for transport %s\n", __func__, + ib_qp_type_str(init_attr->qp_type)); + return ERR_PTR(-EINVAL); + } + dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + } + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + case IB_QPT_XRC_INI: + if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_XRC)) { + mlx5_ib_dbg(dev, "XRC not supported\n"); + return ERR_PTR(-ENOSYS); + } + init_attr->recv_cq = NULL; + if (init_attr->qp_type == IB_QPT_XRC_TGT) { + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = NULL; + } + + /* fall through */ + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + case MLX5_IB_QPT_REG_UMR: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + err = create_qp_common(dev, pd, init_attr, udata, qp); + if (err) { + mlx5_ib_dbg(dev, "create_qp_common failed\n"); + kfree(qp); + return ERR_PTR(err); + } + + if (is_qp0(init_attr->qp_type)) + qp->ibqp.qp_num = 0; + else if (is_qp1(init_attr->qp_type)) + qp->ibqp.qp_num = 1; + else + qp->ibqp.qp_num = qp->mqp.qpn; + + mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", + qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn, + to_mcq(init_attr->send_cq)->mcq.cqn); + + qp->xrcdn = xrcdn; + + break; + + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: + mlx5_ib_dbg(dev, "unsupported qp type %d\n", + init_attr->qp_type); + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + return &qp->ibqp; +} + +int mlx5_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + + destroy_qp_common(dev, mqp); + + kfree(mqp); + + return 0; +} + +static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u32 hw_access_flags = 0; + u8 dest_rd_atomic; + u32 access_flags; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX5_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX); + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX5_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +enum { + MLX5_PATH_FLAG_FL = 1 << 0, + MLX5_PATH_FLAG_FREE_AR = 1 << 1, + MLX5_PATH_FLAG_COUNTER = 1 << 2, +}; + +static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +{ + if (rate == IB_RATE_PORT_CURRENT) { + return 0; + } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) { + return -EINVAL; + } else { + while (rate != IB_RATE_2_5_GBPS && + !(1 << (rate + MLX5_STAT_RATE_OFFSET) & + dev->mdev.caps.stat_rate_support)) + --rate; + } + + return rate + MLX5_STAT_RATE_OFFSET; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, + struct mlx5_qp_path *path, u8 port, int attr_mask, + u32 path_flags, const struct ib_qp_attr *attr) +{ + int err; + + path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0; + + if (attr_mask & IB_QP_PKEY_INDEX) + path->pkey_index = attr->pkey_index; + + path->grh_mlid = ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + + if (ah->ah_flags & IB_AH_GRH) { + path->grh_mlid |= 1 << 7; + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->tclass_flowlabel = + cpu_to_be32((ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } + + err = ib_rate_to_mlx5(dev, ah->static_rate); + if (err < 0) + return err; + path->static_rate = err; + path->port = port; + + if (ah->ah_flags & IB_AH_GRH) { + if (ah->grh.sgid_index >= dev->mdev.caps.port[port - 1].gid_table_len) { + pr_err(KERN_ERR "sgid_index (%u) too large. max is %d\n", + ah->grh.sgid_index, dev->mdev.caps.port[port - 1].gid_table_len); + return -EINVAL; + } + + path->grh_mlid |= 1 << 7; + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->tclass_flowlabel = + cpu_to_be32((ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } + + if (attr_mask & IB_QP_TIMEOUT) + path->ackto_lt = attr->timeout << 3; + + path->sl = ah->sl & 0xf; + + return 0; +} + +static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_PRI_PORT, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + }, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RNR_TIMEOUT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + }, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_SRQN | + MLX5_QP_OPTPAR_CQN_RCV, + }, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + }, + }, +}; + +static int ib_nr_to_mlx5_nr(int ib_mask) +{ + switch (ib_mask) { + case IB_QP_STATE: + return 0; + case IB_QP_CUR_STATE: + return 0; + case IB_QP_EN_SQD_ASYNC_NOTIFY: + return 0; + case IB_QP_ACCESS_FLAGS: + return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE; + case IB_QP_PKEY_INDEX: + return MLX5_QP_OPTPAR_PKEY_INDEX; + case IB_QP_PORT: + return MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_QKEY: + return MLX5_QP_OPTPAR_Q_KEY; + case IB_QP_AV: + return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_PATH_MTU: + return 0; + case IB_QP_TIMEOUT: + return MLX5_QP_OPTPAR_ACK_TIMEOUT; + case IB_QP_RETRY_CNT: + return MLX5_QP_OPTPAR_RETRY_COUNT; + case IB_QP_RNR_RETRY: + return MLX5_QP_OPTPAR_RNR_RETRY; + case IB_QP_RQ_PSN: + return 0; + case IB_QP_MAX_QP_RD_ATOMIC: + return MLX5_QP_OPTPAR_SRA_MAX; + case IB_QP_ALT_PATH: + return MLX5_QP_OPTPAR_ALT_ADDR_PATH; + case IB_QP_MIN_RNR_TIMER: + return MLX5_QP_OPTPAR_RNR_TIMEOUT; + case IB_QP_SQ_PSN: + return 0; + case IB_QP_MAX_DEST_RD_ATOMIC: + return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; + case IB_QP_PATH_MIG_STATE: + return MLX5_QP_OPTPAR_PM_STATE; + case IB_QP_CAP: + return 0; + case IB_QP_DEST_QPN: + return 0; + } + return 0; +} + +static int ib_mask_to_mlx5_opt(int ib_mask) +{ + int result = 0; + int i; + + for (i = 0; i < 8 * sizeof(int); i++) { + if ((1 << i) & ib_mask) + result |= ib_nr_to_mlx5_nr(1 << i); + } + + return result; +} + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_qp_context *context; + struct mlx5_modify_qp_mbox_in *in; + struct mlx5_ib_pd *pd; + enum mlx5_qp_state mlx5_cur, mlx5_new; + enum mlx5_qp_optpar optpar; + int sqd_event; + int mlx5_st; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + context = &in->ctx; + err = to_mlx5_st(ibqp->qp_type); + if (err < 0) + goto out; + + context->flags = cpu_to_be32(err << 16); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + } else { + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); + break; + } + } + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + context->mtu_msgmax = (IB_MTU_256 << 5) | 8; + } else if (ibqp->qp_type == IB_QPT_UD || + ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { + context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || + attr->path_mtu > IB_MTU_4096) { + mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu); + err = -EINVAL; + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | dev->mdev.caps.log_max_msg; + } + + if (attr_mask & IB_QP_DEST_QPN) + context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PKEY_INDEX) + context->pri_path.pkey_index = attr->pkey_index; + + /* todo implement counter_index functionality */ + + if (is_sqp(ibqp->qp_type)) + context->pri_path.port = qp->port; + + if (attr_mask & IB_QP_PORT) + context->pri_path.port = attr->port_num; + + if (attr_mask & IB_QP_AV) { + err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port, + attr_mask, 0, attr); + if (err) + goto out; + } + + if (attr_mask & IB_QP_TIMEOUT) + context->pri_path.ackto_lt |= attr->timeout << 3; + + if (attr_mask & IB_QP_ALT_PATH) { + err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path, + attr->alt_port_num, attr_mask, 0, attr); + if (err) + goto out; + } + + pd = get_pd(qp); + get_cqs(qp, &send_cq, &recv_cq); + + context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); + context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; + context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0; + context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); + + if (attr_mask & IB_QP_RNR_RETRY) + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + + if (attr_mask & IB_QP_RETRY_CNT) + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) + context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + if (attr_mask & IB_QP_QKEY) + context->qkey = cpu_to_be32(attr->qkey); + + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) + sqd_event = 1; + else + sqd_event = 0; + + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->sq_crq_size |= cpu_to_be16(1 << 4); + + + mlx5_cur = to_mlx5_state(cur_state); + mlx5_new = to_mlx5_state(new_state); + mlx5_st = to_mlx5_st(ibqp->qp_type); + if (mlx5_st < 0) + goto out; + + optpar = ib_mask_to_mlx5_opt(attr_mask); + optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; + in->optparam = cpu_to_be32(optpar); + err = mlx5_core_qp_modify(&dev->mdev, to_mlx5_state(cur_state), + to_mlx5_state(new_state), in, sqd_event, + &qp->mqp); + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && !ibqp->uobject) { + mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq.cur_post = 0; + qp->sq.last_poll = 0; + qp->db.db[MLX5_RCV_DBR] = 0; + qp->db.db[MLX5_SND_DBR] = 0; + } + +out: + kfree(in); + return err; +} + +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + int port; + + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && + !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_UNSPECIFIED)) + goto out; + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > dev->mdev.caps.num_ports)) + goto out; + + if (attr_mask & IB_QP_PKEY_INDEX) { + port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= dev->mdev.caps.port[port - 1].pkey_table_len) + goto out; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > dev->mdev.caps.max_ra_res_qp) + goto out; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > dev->mdev.caps.max_ra_req_qp) + goto out; + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + struct mlx5_ib_cq *cq; + unsigned cur; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + struct ib_send_wr *wr) +{ + memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); +} + +static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static __be16 get_klm_octo(int npages) +{ + return cpu_to_be16(ALIGN(npages, 8) / 2); +} + +static __be64 frwr_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 sig_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_SIGERR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE | + MLX5_MKEY_MASK_BSF_EN; + + return cpu_to_be64(result); +} + +static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, int li) +{ + memset(umr, 0, sizeof(*umr)); + + if (li) { + umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr->flags = 1 << 7; + return; + } + + umr->flags = (1 << 5); /* fail if not free */ + umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len); + umr->mkey_mask = frwr_mkey_mask(); +} + +static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr) +{ + struct umr_wr *umrwr = (struct umr_wr *)&wr->wr.fast_reg; + u64 mask; + + memset(umr, 0, sizeof(*umr)); + + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { + umr->flags = 1 << 5; /* fail if not free */ + umr->klm_octowords = get_klm_octo(umrwr->npages); + mask = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_PD | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_FREE; + umr->mkey_mask = cpu_to_be64(mask); + } else { + umr->flags = 2 << 5; /* fail if free */ + mask = MLX5_MKEY_MASK_FREE; + umr->mkey_mask = cpu_to_be64(mask); + } + + if (!wr->num_sge) + umr->flags |= (1 << 7); /* inline */ +} + +static u8 get_umr_flags(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; +} + +static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, + int li, int *writ) +{ + memset(seg, 0, sizeof(*seg)); + if (li) { + seg->status = 1 << 6; + return; + } + + seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) | + MLX5_ACCESS_MODE_MTT; + *writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE); + seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); + seg->len = cpu_to_be64(wr->wr.fast_reg.length); + seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2); + seg->log2_page_size = wr->wr.fast_reg.page_shift; +} + +static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) +{ + memset(seg, 0, sizeof(*seg)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { + seg->status = 1 << 6; + return; + } + + seg->flags = convert_access(wr->wr.fast_reg.access_flags); + seg->flags_pd = cpu_to_be32(to_mpd((struct ib_pd *)wr->wr.fast_reg.page_list)->pdn); + seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); + seg->len = cpu_to_be64(wr->wr.fast_reg.length); + seg->log2_page_size = wr->wr.fast_reg.page_shift; + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(wr->wr.fast_reg.rkey)); +} + +static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, + struct ib_send_wr *wr, + struct mlx5_core_dev *mdev, + struct mlx5_ib_pd *pd, + int writ) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list); + u64 *page_list = wr->wr.fast_reg.page_list->page_list; + u64 perm = MLX5_EN_RD | (writ ? MLX5_EN_WR : 0); + int i; + + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) + mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm); + dseg->addr = cpu_to_be64(mfrpl->map); + dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64)); + dseg->lkey = cpu_to_be32(pd->pa_lkey); +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static u8 calc_sig(void *wqe, int size) +{ + u8 *p = wqe; + u8 res = 0; + int i; + + for (i = 0; i < size; i++) + res ^= p[i]; + + return ~res; +} + +static u8 wq_sig(void *wqe) +{ + return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); +} + +static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, + void *wqe, int *sz) +{ + struct mlx5_wqe_inline_seg *seg; + void *qend = qp->sq.qend; + void *addr; + int inl = 0; + int copy; + int len; + int i; + + seg = wqe; + wqe += sizeof(*seg); + for (i = 0; i < wr->num_sge; i++) { + addr = (void *)(unsigned long)(wr->sg_list[i].addr); + len = wr->sg_list[i].length; + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return -ENOMEM; + + if (unlikely(wqe + len > qend)) { + copy = qend - wqe; + memcpy(wqe, addr, copy); + addr += copy; + len -= copy; + wqe = mlx5_get_send_wqe(qp, 0); + } + memcpy(wqe, addr, len); + wqe += len; + } + + seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); + + *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16; + + return 0; +} + +static u16 prot_field_size(enum ib_signature_type type) +{ + switch (type) { + case IB_SIG_TYPE_T10_DIF: + return MLX5_DIF_SIZE; + default: + return 0; + } +} + +static u8 bs_selector(int block_size) +{ + switch (block_size) { + case 512: return 0x1; + case 520: return 0x2; + case 4096: return 0x3; + case 4160: return 0x4; + case 1073741824: return 0x5; + default: return 0; + } +} + +static int format_selector(struct ib_sig_attrs *attr, + struct ib_sig_domain *domain, + int *selector) +{ + +#define FORMAT_DIF_NONE 0 +#define FORMAT_DIF_CRC_INC 8 +#define FORMAT_DIF_CRC_NO_INC 12 +#define FORMAT_DIF_CSUM_INC 13 +#define FORMAT_DIF_CSUM_NO_INC 14 + + switch (domain->sig.dif.type) { + case IB_T10DIF_NONE: + /* No DIF */ + *selector = FORMAT_DIF_NONE; + break; + case IB_T10DIF_TYPE1: /* Fall through */ + case IB_T10DIF_TYPE2: + switch (domain->sig.dif.bg_type) { + case IB_T10DIF_CRC: + *selector = FORMAT_DIF_CRC_INC; + break; + case IB_T10DIF_CSUM: + *selector = FORMAT_DIF_CSUM_INC; + break; + default: + return 1; + } + break; + case IB_T10DIF_TYPE3: + switch (domain->sig.dif.bg_type) { + case IB_T10DIF_CRC: + *selector = domain->sig.dif.type3_inc_reftag ? + FORMAT_DIF_CRC_INC : + FORMAT_DIF_CRC_NO_INC; + break; + case IB_T10DIF_CSUM: + *selector = domain->sig.dif.type3_inc_reftag ? + FORMAT_DIF_CSUM_INC : + FORMAT_DIF_CSUM_NO_INC; + break; + default: + return 1; + } + break; + default: + return 1; + } + + return 0; +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_bsf *bsf, u32 data_size) +{ + struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; + struct mlx5_bsf_basic *basic = &bsf->basic; + struct ib_sig_domain *mem = &sig_attrs->mem; + struct ib_sig_domain *wire = &sig_attrs->wire; + int ret, selector; + + memset(bsf, 0, sizeof(*bsf)); + switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_T10_DIF: + if (sig_attrs->wire.sig_type != IB_SIG_TYPE_T10_DIF) + return -EINVAL; + + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && + mem->sig.dif.type == wire->sig.dif.type) { + /* Same block structure */ + basic->bsf_size_sbs = 1 << 4; + if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) + basic->wire.copy_byte_mask |= 0xc0; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= 0x30; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= 0x0f; + } else + basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); + + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->raw_data_size = cpu_to_be32(data_size); + + ret = format_selector(sig_attrs, mem, &selector); + if (ret) + return -EINVAL; + basic->m_bfs_psv = cpu_to_be32(selector << 24 | + msig->psv_memory.psv_idx); + + ret = format_selector(sig_attrs, wire, &selector); + if (ret) + return -EINVAL; + basic->w_bfs_psv = cpu_to_be32(selector << 24 | + msig->psv_wire.psv_idx); + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int set_sig_data_segment(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct ib_sig_attrs *sig_attrs = wr->wr.sig_handover.sig_attrs; + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + struct mlx5_bsf *bsf; + u32 data_len = wr->sg_list->length; + u32 data_key = wr->sg_list->lkey; + u64 data_va = wr->sg_list->addr; + int ret; + int wqe_size; + + if (!wr->wr.sig_handover.prot || + (data_key == wr->wr.sig_handover.prot->lkey && + data_va == wr->wr.sig_handover.prot->addr && + data_len == wr->wr.sig_handover.prot->length)) { + /** + * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. + * So need construct: + * ------------------ + * | data_klm | + * ------------------ + * | BSF | + * ------------------ + **/ + struct mlx5_klm *data_klm = *seg; + + data_klm->bcount = cpu_to_be32(data_len); + data_klm->key = cpu_to_be32(data_key); + data_klm->va = cpu_to_be64(data_va); + wqe_size = ALIGN(sizeof(*data_klm), 64); + } else { + /** + * Source domain contains signature information + * So need construct a strided block format: + * --------------------------- + * | stride_block_ctrl | + * --------------------------- + * | data_klm | + * --------------------------- + * | prot_klm | + * --------------------------- + * | BSF | + * --------------------------- + **/ + struct mlx5_stride_block_ctrl_seg *sblock_ctrl; + struct mlx5_stride_block_entry *data_sentry; + struct mlx5_stride_block_entry *prot_sentry; + u32 prot_key = wr->wr.sig_handover.prot->lkey; + u64 prot_va = wr->wr.sig_handover.prot->addr; + u16 block_size = sig_attrs->mem.sig.dif.pi_interval; + int prot_size; + + sblock_ctrl = *seg; + data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); + prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + + prot_size = prot_field_size(sig_attrs->mem.sig_type); + if (!prot_size) { + pr_err("Bad block size given: %u\n", block_size); + return -EINVAL; + } + sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + + prot_size); + sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); + sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); + sblock_ctrl->num_entries = cpu_to_be16(2); + + data_sentry->bcount = cpu_to_be16(block_size); + data_sentry->key = cpu_to_be32(data_key); + data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + + prot_sentry->bcount = cpu_to_be16(prot_size); + prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); + + wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + + sizeof(*prot_sentry), 64); + } + + *seg += wqe_size; + *size += wqe_size / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + bsf = *seg; + ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); + if (ret) + return -EINVAL; + + *seg += sizeof(*bsf); + *size += sizeof(*bsf) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, + struct ib_send_wr *wr, u32 nelements, + u32 length, u32 pdn) +{ + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + u32 sig_key = sig_mr->rkey; + u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(wr->wr.sig_handover.access_flags) | + MLX5_ACCESS_MODE_KLM; + seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | + MLX5_MKEY_BSF_EN | pdn); + seg->len = cpu_to_be64(length); + seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements))); + seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, u32 nelements) +{ + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; + umr->klm_octowords = get_klm_octo(nelements); + umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); + umr->mkey_mask = sig_mkey_mask(); +} + + +static int set_sig_umr_wr(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct mlx5_ib_mr *sig_mr = to_mmr(wr->wr.sig_handover.sig_mr); + u32 pdn = get_pd(qp)->pdn; + u32 klm_oct_size; + int region_len, ret; + + if (unlikely(wr->num_sge != 1) || + unlikely(wr->wr.sig_handover.access_flags & + IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = wr->sg_list->length; + if (wr->wr.sig_handover.prot && + (wr->wr.sig_handover.prot->lkey != wr->sg_list->lkey || + wr->wr.sig_handover.prot->addr != wr->sg_list->addr || + wr->wr.sig_handover.prot->length != wr->sg_list->length)) + region_len += wr->wr.sig_handover.prot->length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + klm_oct_size = wr->wr.sig_handover.prot ? 3 : 1; + + set_sig_umr_segment(*seg, wr, klm_oct_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + ret = set_sig_data_segment(wr, qp, seg, size); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} + +static int set_psv_wr(struct ib_sig_domain *domain, + u32 psv_idx, void **seg, int *size) +{ + struct mlx5_seg_set_psv *psv_seg = *seg; + + memset(psv_seg, 0, sizeof(*psv_seg)); + psv_seg->psv_num = cpu_to_be32(psv_idx); + switch (domain->sig_type) { + case IB_SIG_TYPE_T10_DIF: + psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | + domain->sig.dif.app_tag); + psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + break; + + default: + pr_err("Bad signature type given.\n"); + return 1; + } + + return 0; +} + +static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size, + struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp) +{ + int writ = 0; + int li; + + li = wr->opcode == IB_WR_LOCAL_INV ? 1 : 0; + if (unlikely(wr->send_flags & IB_SEND_INLINE)) + return -EINVAL; + + set_frwr_umr_segment(*seg, wr, li); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + set_mkey_segment(*seg, wr, li, &writ); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + if (!li) { + if (unlikely(wr->wr.fast_reg.page_list_len > + wr->wr.fast_reg.page_list->max_page_list_len)) + return -ENOMEM; + + set_frwr_pages(*seg, wr, mdev, pd, writ); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } + return 0; +} + +static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) +{ + __be32 *p = NULL; + int tidx = idx; + int i, j; + + pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx)); + for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { + if ((i & 0xf) == 0) { + void *buf = mlx5_get_send_wqe(qp, tidx); + tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1); + p = buf; + j = 0; + } + pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), + be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), + be32_to_cpu(p[j + 3])); + } +} + +static void mlx5_bf_copy(u64 __iomem *dst, u64 *src, + unsigned bytecnt, struct mlx5_ib_qp *qp) +{ + while (bytecnt > 0) { + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + bytecnt -= 64; + if (unlikely(src == qp->sq.qend)) + src = mlx5_get_send_wqe(qp, 0); + } +} + +static u8 get_fence(u8 fence, struct ib_send_wr *wr) +{ + if (unlikely(wr->opcode == IB_WR_LOCAL_INV && + wr->send_flags & IB_SEND_FENCE)) + return MLX5_FENCE_MODE_STRONG_ORDERING; + + if (unlikely(fence)) { + if (wr->send_flags & IB_SEND_FENCE) + return MLX5_FENCE_MODE_SMALL_AND_FENCE; + else + return fence; + + } else { + return 0; + } +} + +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + struct ib_send_wr *wr, int *idx, + int *size, int nreq) +{ + int err = 0; + + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) { + err = -ENOMEM; + return err; + } + + *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + *seg = mlx5_get_send_wqe(qp, *idx); + *ctrl = *seg; + *(uint32_t *)(*seg + 8) = 0; + (*ctrl)->imm = send_ieth(wr); + (*ctrl)->fm_ce_se = qp->sq_signal_bits | + (wr->send_flags & IB_SEND_SIGNALED ? + MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + MLX5_WQE_CTRL_SOLICITED : 0); + + *seg += sizeof(**ctrl); + *size = sizeof(**ctrl) / 16; + + return err; +} + +static void finish_wqe(struct mlx5_ib_qp *qp, + struct mlx5_wqe_ctrl_seg *ctrl, + u8 size, unsigned idx, u64 wr_id, + int nreq, u8 fence, u8 next_fence, + u32 mlx5_opcode) +{ + u8 opmod = 0; + + ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | + mlx5_opcode | ((u32)opmod << 24)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); + ctrl->fm_ce_se |= fence; + qp->fm_cache = next_fence; + if (unlikely(qp->wq_sig)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.wrid[idx] = wr_id; + qp->sq.w_list[idx].opcode = mlx5_opcode; + qp->sq.wqe_head[idx] = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + qp->sq.w_list[idx].next = qp->sq.cur_post; +} + + +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = &dev->mdev; + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_mr *mr; + struct mlx5_wqe_data_seg *dpseg; + struct mlx5_wqe_xrc_seg *xrc; + struct mlx5_bf *bf = qp->bf; + int uninitialized_var(size); + void *qend = qp->sq.qend; + unsigned long flags; + unsigned idx; + int err = 0; + int inl = 0; + int num_sge; + void *seg; + int nreq; + int i; + u8 next_fence = 0; + u8 fence; + + spin_lock_irqsave(&qp->sq.lock, flags); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->opcode >= sizeof(mlx5_ib_opcode) / sizeof(mlx5_ib_opcode[0]))) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + fence = qp->fm_cache; + num_sge = wr->num_sge; + if (unlikely(num_sge > qp->sq.max_gs)) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + switch (ibqp->qp_type) { + case IB_QPT_XRC_INI: + xrc = seg; + xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num); + seg += sizeof(*xrc); + size += sizeof(*xrc) / 16; + /* fall through */ + case IB_QPT_RC: + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -ENOSYS; + *bad_wr = wr; + goto out; + + case IB_WR_LOCAL_INV: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; + ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_FAST_REG_MR: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.wr_data[idx] = IB_WR_FAST_REG_MR; + ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_REG_SIG_MR: + qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; + mr = to_mmr(wr->wr.sig_handover.sig_mr); + + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + err = set_sig_umr_wr(wr, qp, &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_UMR); + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + wr->send_flags &= ~IB_SEND_SIGNALED; + wr->send_flags |= IB_SEND_SOLICITED; + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->mem, + mr->sig->psv_memory.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->wire, + mr->sig->psv_wire.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + num_sge = 0; + goto skip_psv; + + default: + break; + } + break; + + case IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + default: + break; + } + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + case MLX5_IB_QPT_REG_UMR: + if (wr->opcode != MLX5_IB_WR_UMR) { + err = -EINVAL; + mlx5_ib_warn(dev, "bad opcode\n"); + goto out; + } + qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; + ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); + set_reg_umr_segment(seg, wr); + seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + set_reg_mkey_segment(seg, wr); + seg += sizeof(struct mlx5_mkey_seg); + size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + default: + break; + } + + if (wr->send_flags & IB_SEND_INLINE && num_sge) { + int uninitialized_var(sz); + + err = set_data_inl_seg(qp, wr, seg, &sz); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + inl = 1; + size += sz; + } else { + dpseg = seg; + for (i = 0; i < num_sge; i++) { + if (unlikely(dpseg == qend)) { + seg = mlx5_get_send_wqe(qp, 0); + dpseg = seg; + } + if (likely(wr->sg_list[i].length)) { + set_data_ptr_seg(dpseg, wr->sg_list + i); + size += sizeof(struct mlx5_wqe_data_seg) / 16; + dpseg++; + } + } + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + get_fence(fence, wr), next_fence, + mlx5_ib_opcode[wr->opcode]); +skip_psv: + if (0) + dump_wqe(qp, idx, size); + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell */ + wmb(); + + if (bf->need_lock) + spin_lock(&bf->lock); + + /* TBD enable WC */ + if (0 && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) { + mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp); + /* wc_wmb(); */ + } else { + mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset, + MLX5_GET_DOORBELL_LOCK(&bf->lock32)); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + mmiowb(); + } + bf->offset ^= bf->buf_size; + if (bf->need_lock) + spin_unlock(&bf->lock); + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) +{ + sig->signature = calc_sig(sig, size); +} + +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *scat; + struct mlx5_rwqe_sig *sig; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + if (qp->wq_sig) + scat++; + + for (i = 0; i < wr->num_sge; i++) + set_data_ptr_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + + if (qp->wq_sig) { + sig = (struct mlx5_rwqe_sig *)scat; + set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) +{ + switch (mlx5_state) { + case MLX5_QP_STATE_RST: return IB_QPS_RESET; + case MLX5_QP_STATE_INIT: return IB_QPS_INIT; + case MLX5_QP_STATE_RTR: return IB_QPS_RTR; + case MLX5_QP_STATE_RTS: return IB_QPS_RTS; + case MLX5_QP_STATE_SQ_DRAINING: + case MLX5_QP_STATE_SQD: return IB_QPS_SQD; + case MLX5_QP_STATE_SQER: return IB_QPS_SQE; + case MLX5_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) +{ + switch (mlx5_mig_state) { + case MLX5_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX5_QP_PM_REARM: return IB_MIG_REARM; + case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx5_flags) +{ + int ib_flags = 0; + + if (mlx5_flags & MLX5_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx5_flags & MLX5_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx5_flags & MLX5_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, + struct mlx5_qp_path *path) +{ + struct mlx5_core_dev *dev = &ibdev->mdev; + + memset(ib_ah_attr, 0, sizeof(*ib_ah_attr)); + ib_ah_attr->port_num = path->port; + + if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) + return; + + ib_ah_attr->sl = path->sl & 0xf; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f; + ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; + ib_ah_attr->ah_flags = (path->grh_mlid & (1 << 7)) ? IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index; + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof(ib_ah_attr->grh.dgid.raw)); + } +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_query_qp_mbox_out *outb; + struct mlx5_qp_context *context; + int mlx5_state; + int err = 0; + + mutex_lock(&qp->mutex); + outb = kzalloc(sizeof(*outb), GFP_KERNEL); + if (!outb) { + err = -ENOMEM; + goto out; + } + context = &outb->ctx; + err = mlx5_core_qp_query(&dev->mdev, &qp->mqp, outb, sizeof(*outb)); + if (err) + goto out_free; + + mlx5_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mlx5_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = context->alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = context->pri_path.pkey_index & 0x7f; + qp_attr->port_num = context->pri_path.port; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto_lt >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; + qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + /* We don't support inline sends for kernel QPs (yet), and we + * don't know what userspace's value should be. + */ + qp_attr->cap.max_inline_data = 0; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out_free: + kfree(outb); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_xrcd *xrcd; + int err; + + if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_xrcd_alloc(&dev->mdev, &xrcd->xrcdn); + if (err) { + kfree(xrcd); + return ERR_PTR(-ENOMEM); + } + + return &xrcd->ibxrcd; +} + +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct mlx5_ib_dev *dev = to_mdev(xrcd->device); + u32 xrcdn = to_mxrcd(xrcd)->xrcdn; + int err; + + err = mlx5_core_xrcd_dealloc(&dev->mdev, xrcdn); + if (err) { + mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); + return err; + } + + kfree(xrcd); + + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c new file mode 100644 index 00000000000..384af6dec5e --- /dev/null +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/mlx5/qp.h> +#include <linux/mlx5/srq.h> +#include <linux/slab.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> + +#include "mlx5_ib.h" +#include "user.h" + +/* not supported currently */ +static int srq_signature; + +static void *get_wqe(struct mlx5_ib_srq *srq, int n) +{ + return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n", + type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, + struct ib_udata *udata, int buf_size, int *inlen) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_srq ucmd; + size_t ucmdlen; + int err; + int npages; + int page_shift; + int ncont; + u32 offset; + + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { + mlx5_ib_dbg(dev, "failed copy udata\n"); + return -EFAULT; + } + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, + 0, 0); + if (IS_ERR(srq->umem)) { + mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); + err = PTR_ERR(srq->umem); + return err; + } + + mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, &npages, + &page_shift, &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, + &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!(*in)) { + err = -ENOMEM; + goto err_umem; + } + + mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0); + + err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) { + mlx5_ib_dbg(dev, "map doorbell failed\n"); + goto err_in; + } + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); + + return 0; + +err_in: + mlx5_vfree(*in); + +err_umem: + ib_umem_release(srq->umem); + + return err; +} + +static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, int buf_size, + int *inlen) +{ + int err; + int i; + struct mlx5_wqe_srq_next_seg *next; + int page_shift; + int npages; + + err = mlx5_db_alloc(&dev->mdev, &srq->db); + if (err) { + mlx5_ib_warn(dev, "alloc dbell rec failed\n"); + return err; + } + + *srq->db.db = 0; + + if (mlx5_buf_alloc(&dev->mdev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + mlx5_ib_dbg(dev, "buf alloc failed\n"); + err = -ENOMEM; + goto err_db; + } + page_shift = srq->buf.page_shift; + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; i++) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + } + + npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT)); + mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n", + buf_size, page_shift, srq->buf.npages, npages); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&srq->buf, (*in)->pas); + + srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL); + if (!srq->wrid) { + mlx5_ib_dbg(dev, "kmalloc failed %lu\n", + (unsigned long)(srq->msrq.max * sizeof(u64))); + err = -ENOMEM; + goto err_in; + } + srq->wq_sig = !!srq_signature; + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + return 0; + +err_in: + mlx5_vfree(*in); + +err_buf: + mlx5_buf_free(&dev->mdev, &srq->buf); + +err_db: + mlx5_db_free(&dev->mdev, &srq->db); + return err; +} + +static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq) +{ + mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + ib_umem_release(srq->umem); +} + + +static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq) +{ + kfree(srq->wrid); + mlx5_buf_free(&dev->mdev, &srq->buf); + mlx5_db_free(&dev->mdev, &srq->db); +} + +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_srq *srq; + int desc_size; + int buf_size; + int err; + struct mlx5_create_srq_mbox_in *uninitialized_var(in); + int uninitialized_var(inlen); + int is_xrc; + u32 flgs, xrcdn; + + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= dev->mdev.caps.max_srq_wqes) { + mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", + init_attr->attr.max_wr, + dev->mdev.caps.max_srq_wqes); + return ERR_PTR(-EINVAL); + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = sizeof(struct mlx5_wqe_srq_next_seg) + + srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg); + desc_size = roundup_pow_of_two(desc_size); + desc_size = max_t(int, 32, desc_size); + srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) / + sizeof(struct mlx5_wqe_data_seg); + srq->msrq.wqe_shift = ilog2(desc_size); + buf_size = srq->msrq.max * desc_size; + mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n", + desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, + srq->msrq.max_avail_gather); + + if (pd->uobject) + err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen); + else + err = create_srq_kernel(dev, srq, &in, buf_size, &inlen); + + if (err) { + mlx5_ib_warn(dev, "create srq %s failed, err %d\n", + pd->uobject ? "user" : "kernel", err); + goto err_srq; + } + + is_xrc = (init_attr->srq_type == IB_SRQT_XRC); + in->ctx.state_log_sz = ilog2(srq->msrq.max); + flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24; + xrcdn = 0; + if (is_xrc) { + xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn); + } else if (init_attr->srq_type == IB_SRQT_BASIC) { + xrcdn = to_mxrcd(dev->devr.x0)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn); + } + + in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF)); + + in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn); + in->ctx.db_record = cpu_to_be64(srq->db.dma); + err = mlx5_core_create_srq(&dev->mdev, &srq->msrq, in, inlen); + mlx5_vfree(in); + if (err) { + mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err); + goto err_usr_kern_srq; + } + + mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn); + + srq->msrq.event = mlx5_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) { + mlx5_ib_dbg(dev, "copy to user failed\n"); + err = -EFAULT; + goto err_core; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_core: + mlx5_core_destroy_srq(&dev->mdev, &srq->msrq); + +err_usr_kern_srq: + if (pd->uobject) + destroy_srq_user(pd, srq); + else + destroy_srq_kernel(dev, srq); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs yet */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx5_core_arm_srq(&dev->mdev, &srq->msrq, attr->srq_limit, 1); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + struct mlx5_query_srq_mbox_out *out; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_core_query_srq(&dev->mdev, &srq->msrq, out); + if (ret) + goto out_box; + + srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm); + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + +out_box: + kfree(out); + return ret; +} + +int mlx5_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx5_ib_dev *dev = to_mdev(srq->device); + struct mlx5_ib_srq *msrq = to_msrq(srq); + + mlx5_core_destroy_srq(&dev->mdev, &msrq->msrq); + + if (srq->uobject) { + mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + destroy_srq_kernel(dev, msrq); + } + + kfree(srq); + return 0; +} + +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index) +{ + struct mlx5_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. */ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_srq_next_seg *next; + struct mlx5_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx5_wqe_data_seg *)(next + 1); + + for (i = 0; i < wr->num_sge; i++) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_avail_gather) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } + + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h new file mode 100644 index 00000000000..d0ba264ac1e --- /dev/null +++ b/drivers/infiniband/hw/mlx5/user.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_USER_H +#define MLX5_IB_USER_H + +#include <linux/types.h> + +enum { + MLX5_QP_FLAG_SIGNATURE = 1 << 0, + MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, +}; + +enum { + MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, +}; + + +/* Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MLX5_IB_UVERBS_ABI_VERSION 1 + +/* Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx5_ib_alloc_ucontext_req { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; +}; + +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; + __u32 flags; + __u32 reserved; +}; + +struct mlx5_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_uuars; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 reserved; +}; + +struct mlx5_ib_alloc_pd_resp { + __u32 pdn; +}; + +struct mlx5_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; + __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +struct mlx5_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx5_ib_resize_cq { + __u64 buf_addr; + __u16 cqe_size; + __u16 reserved0; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; + __u32 flags; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +struct mlx5_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx5_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u32 sq_wqe_count; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 flags; +}; + +struct mlx5_ib_create_qp_resp { + __u32 uuar_index; +}; +#endif /* MLX5_IB_USER_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c index 7c9d35f39d7..69020173899 100644 --- a/drivers/infiniband/hw/mthca/mthca_eq.c +++ b/drivers/infiniband/hw/mthca/mthca_eq.c @@ -357,7 +357,7 @@ static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq) mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n", eqe->type, eqe->subtype, eq->eqn); break; - }; + } set_eqe_hw(eqe); ++eq->cons_index; diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index aa12a533ae9..ded76c101dd 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -130,7 +130,7 @@ static int log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)"); -static char mthca_version[] __devinitdata = +static char mthca_version[] = DRV_NAME ": Mellanox InfiniBand HCA driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; @@ -858,13 +858,9 @@ static int mthca_enable_msi_x(struct mthca_dev *mdev) entries[1].entry = 1; entries[2].entry = 2; - err = pci_enable_msix(mdev->pdev, entries, ARRAY_SIZE(entries)); - if (err) { - if (err > 0) - mthca_info(mdev, "Only %d MSI-X vectors available, " - "not using MSI-X\n", err); + err = pci_enable_msix_exact(mdev->pdev, entries, ARRAY_SIZE(entries)); + if (err) return err; - } mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector; mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector; @@ -1139,8 +1135,7 @@ int __mthca_restart_one(struct pci_dev *pdev) return __mthca_init_one(pdev, hca_type); } -static int __devinit mthca_init_one(struct pci_dev *pdev, - const struct pci_device_id *id) +static int mthca_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { int ret; @@ -1162,7 +1157,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev, return ret; } -static void __devexit mthca_remove_one(struct pci_dev *pdev) +static void mthca_remove_one(struct pci_dev *pdev) { mutex_lock(&mthca_device_mutex); __mthca_remove_one(pdev); @@ -1199,7 +1194,7 @@ static struct pci_driver mthca_driver = { .name = DRV_NAME, .id_table = mthca_pci_table, .probe = mthca_init_one, - .remove = __devexit_p(mthca_remove_one) + .remove = mthca_remove_one, }; static void __init __mthca_check_profile_val(const char *name, int *pval, diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 5b71d43bd89..415f8e1a54d 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -695,6 +695,7 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries, if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) { mthca_free_cq(to_mdev(ibdev), cq); + err = -EFAULT; goto err_free; } @@ -976,12 +977,12 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { struct mthca_dev *dev = to_mdev(pd->device); - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct mthca_mr *mr; struct mthca_reg_mr ucmd; u64 *pages; int shift, n, len; - int i, j, k; + int i, k, entry; int err = 0; int write_mtt_size; @@ -1009,10 +1010,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } shift = ffs(mr->umem->page_size) - 1; - - n = 0; - list_for_each_entry(chunk, &mr->umem->chunk_list, list) - n += chunk->nents; + n = mr->umem->nmap; mr->mtt = mthca_alloc_mtt(dev, n); if (IS_ERR(mr->mtt)) { @@ -1030,25 +1028,24 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); - list_for_each_entry(chunk, &mr->umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = sg_dma_address(&chunk->page_list[j]) + - mr->umem->page_size * k; - /* - * Be friendly to write_mtt and pass it chunks - * of appropriate size. - */ - if (i == write_mtt_size) { - err = mthca_write_mtt(dev, mr->mtt, n, pages, i); - if (err) - goto mtt_done; - n += i; - i = 0; - } + for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(sg) + + mr->umem->page_size * k; + /* + * Be friendly to write_mtt and pass it chunks + * of appropriate size. + */ + if (i == write_mtt_size) { + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); + if (err) + goto mtt_done; + n += i; + i = 0; } } + } if (i) err = mthca_write_mtt(dev, mr->mtt, n, pages, i); diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 26a68453610..e354b2f04ad 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -860,7 +860,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_UNSPECIFIED)) { mthca_dbg(dev, "Bad QP transition (transport %d) " "%d->%d with attr 0x%08x\n", qp->transport, cur_state, new_state, diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 748db2d3e46..3b2a6dc8ea9 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -68,7 +68,6 @@ MODULE_VERSION(DRV_VERSION); int max_mtu = 9000; int interrupt_mod_interval = 0; - /* Interoperability */ int mpa_version = 1; module_param(mpa_version, int, 0644); @@ -112,6 +111,16 @@ static struct pci_device_id nes_pci_table[] = { MODULE_DEVICE_TABLE(pci, nes_pci_table); +/* registered nes netlink callbacks */ +static struct ibnl_client_cbs nes_nl_cb_table[] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; + static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *); static int nes_net_event(struct notifier_block *, unsigned long, void *); static int nes_notifiers_registered; @@ -135,6 +144,7 @@ static int nes_inetaddr_event(struct notifier_block *notifier, struct net_device *event_netdev = ifa->ifa_dev->dev; struct nes_device *nesdev; struct net_device *netdev; + struct net_device *upper_dev; struct nes_vnic *nesvnic; unsigned int is_bonded; @@ -145,8 +155,9 @@ static int nes_inetaddr_event(struct notifier_block *notifier, nesdev, nesdev->netdev[0]->name); netdev = nesdev->netdev[0]; nesvnic = netdev_priv(netdev); + upper_dev = netdev_master_upper_dev_get(netdev); is_bonded = netif_is_bond_slave(netdev) && - (netdev->master == event_netdev); + (upper_dev == event_netdev); if ((netdev == event_netdev) || is_bonded) { if (nesvnic->rdma_enabled == 0) { nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since" @@ -179,9 +190,9 @@ static int nes_inetaddr_event(struct notifier_block *notifier, /* fall through */ case NETDEV_CHANGEADDR: /* Add the address to the IP table */ - if (netdev->master) + if (upper_dev) nesvnic->local_ipaddr = - ((struct in_device *)netdev->master->ip_ptr)->ifa_list->ifa_address; + ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address; else nesvnic->local_ipaddr = ifa->ifa_address; @@ -444,7 +455,7 @@ static irqreturn_t nes_interrupt(int irq, void *dev_id) /** * nes_probe - Device initialization */ -static int __devinit nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) +static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) { struct net_device *netdev = NULL; struct nes_device *nesdev = NULL; @@ -670,11 +681,25 @@ static int __devinit nes_probe(struct pci_dev *pcidev, const struct pci_device_i } nes_notifiers_registered++; + if (ibnl_add_client(RDMA_NL_NES, RDMA_NL_IWPM_NUM_OPS, nes_nl_cb_table)) + printk(KERN_ERR PFX "%s[%u]: Failed to add netlink callback\n", + __func__, __LINE__); + + ret = iwpm_init(RDMA_NL_NES); + if (ret) { + printk(KERN_ERR PFX "%s: port mapper initialization failed\n", + pci_name(pcidev)); + goto bail7; + } + INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status); /* Initialize network devices */ - if ((netdev = nes_netdev_init(nesdev, mmio_regs)) == NULL) + netdev = nes_netdev_init(nesdev, mmio_regs); + if (netdev == NULL) { + ret = -ENOMEM; goto bail7; + } /* Register network device */ ret = register_netdev(netdev); @@ -705,6 +730,7 @@ static int __devinit nes_probe(struct pci_dev *pcidev, const struct pci_device_i nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n", nesdev->netdev_count, nesdev->nesadapter->netdev_count); + ibnl_remove_client(RDMA_NL_NES); nes_notifiers_registered--; if (nes_notifiers_registered == 0) { @@ -749,7 +775,7 @@ static int __devinit nes_probe(struct pci_dev *pcidev, const struct pci_device_i /** * nes_remove - unload from kernel */ -static void __devexit nes_remove(struct pci_dev *pcidev) +static void nes_remove(struct pci_dev *pcidev) { struct nes_device *nesdev = pci_get_drvdata(pcidev); struct net_device *netdev; @@ -768,6 +794,8 @@ static void __devexit nes_remove(struct pci_dev *pcidev) nesdev->nesadapter->netdev_count--; } } + ibnl_remove_client(RDMA_NL_NES); + iwpm_exit(RDMA_NL_NES); nes_notifiers_registered--; if (nes_notifiers_registered == 0) { @@ -810,7 +838,7 @@ static struct pci_driver nes_pci_driver = { .name = DRV_NAME, .id_table = nes_pci_table, .probe = nes_probe, - .remove = __devexit_p(nes_remove), + .remove = nes_remove, }; static ssize_t nes_show_adapter(struct device_driver *ddp, char *buf) diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index 5cac29e6bc1..bd9d132f11c 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -51,6 +51,8 @@ #include <rdma/ib_pack.h> #include <rdma/rdma_cm.h> #include <rdma/iw_cm.h> +#include <rdma/rdma_netlink.h> +#include <rdma/iw_portmap.h> #define NES_SEND_FIRST_WRITE @@ -130,6 +132,7 @@ #define NES_DBG_IW_TX 0x00040000 #define NES_DBG_SHUTDOWN 0x00080000 #define NES_DBG_PAU 0x00100000 +#define NES_DBG_NLMSG 0x00200000 #define NES_DBG_RSVD1 0x10000000 #define NES_DBG_RSVD2 0x20000000 #define NES_DBG_RSVD3 0x40000000 @@ -532,6 +535,7 @@ void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *); int nes_destroy_cqp(struct nes_device *); int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); void nes_recheck_link_status(struct work_struct *work); +void nes_terminate_timeout(unsigned long context); /* nes_nic.c */ struct net_device *nes_netdev_init(struct nes_device *, void __iomem *); diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index cfaacaf6bf5..6f09a72e78d 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -59,6 +59,7 @@ #include <net/route.h> #include <net/ip_fib.h> #include <net/tcp.h> +#include <linux/fcntl.h> #include "nes.h" @@ -128,6 +129,7 @@ static void build_mpa_v1(struct nes_cm_node *, void *, u8); static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **); static void print_core(struct nes_cm_core *core); +static void record_ird_ord(struct nes_cm_node *, u16, u16); /* External CM API Interface */ /* instance of function pointers for client API */ @@ -165,7 +167,6 @@ int nes_rem_ref_cm_node(struct nes_cm_node *cm_node) { return rem_ref_cm_node(cm_node->cm_core, cm_node); } - /** * create_event */ @@ -317,7 +318,6 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, } } - if (priv_data_len + mpa_hdr_len != len) { nes_debug(NES_DBG_CM, "The received ietf buffer was not right" " complete (%x + %x != %x)\n", @@ -356,25 +356,57 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, /* send reset */ return -EINVAL; } + if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD) + cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD; - if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + if (cm_node->mpav2_ird_ord != IETF_NO_IRD_ORD) { /* responder */ - if (cm_node->ord_size > ird_size) - cm_node->ord_size = ird_size; - } else { - /* initiator */ - if (cm_node->ord_size > ird_size) - cm_node->ord_size = ird_size; - - if (cm_node->ird_size < ord_size) { - /* no resources available */ - /* send terminate message */ - return -EINVAL; + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + /* we are still negotiating */ + if (ord_size > NES_MAX_IRD) { + cm_node->ird_size = NES_MAX_IRD; + } else { + cm_node->ird_size = ord_size; + if (ord_size == 0 && + (rtr_ctrl_ord & IETF_RDMA0_READ)) { + cm_node->ird_size = 1; + nes_debug(NES_DBG_CM, + "%s: Remote peer doesn't support RDMA0_READ (ord=%u)\n", + __func__, ord_size); + } + } + if (ird_size > NES_MAX_ORD) + cm_node->ord_size = NES_MAX_ORD; + else + cm_node->ord_size = ird_size; + } else { /* initiator */ + if (ord_size > NES_MAX_IRD) { + nes_debug(NES_DBG_CM, + "%s: Unable to support the requested (ord =%u)\n", + __func__, ord_size); + return -EINVAL; + } + cm_node->ird_size = ord_size; + + if (ird_size > NES_MAX_ORD) { + cm_node->ord_size = NES_MAX_ORD; + } else { + if (ird_size == 0 && + (rtr_ctrl_ord & IETF_RDMA0_READ)) { + nes_debug(NES_DBG_CM, + "%s: Remote peer doesn't support RDMA0_READ (ird=%u)\n", + __func__, ird_size); + return -EINVAL; + } else { + cm_node->ord_size = ird_size; + } + } } } if (rtr_ctrl_ord & IETF_RDMA0_READ) { cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + } else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) { cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO; } else { /* Not supported RDMA0 operation */ @@ -450,11 +482,11 @@ static void form_cm_frame(struct sk_buff *skb, iph->ttl = 0x40; iph->protocol = 0x06; /* IPPROTO_TCP */ - iph->saddr = htonl(cm_node->loc_addr); - iph->daddr = htonl(cm_node->rem_addr); + iph->saddr = htonl(cm_node->mapped_loc_addr); + iph->daddr = htonl(cm_node->mapped_rem_addr); - tcph->source = htons(cm_node->loc_port); - tcph->dest = htons(cm_node->rem_port); + tcph->source = htons(cm_node->mapped_loc_port); + tcph->dest = htons(cm_node->mapped_rem_port); tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num); if (flags & SET_ACK) { @@ -493,6 +525,100 @@ static void form_cm_frame(struct sk_buff *skb, cm_packets_created++; } +/* + * nes_create_sockaddr - Record ip addr and tcp port in a sockaddr struct + */ +static void nes_create_sockaddr(__be32 ip_addr, __be16 port, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *nes_sockaddr = (struct sockaddr_in *)addr; + nes_sockaddr->sin_family = AF_INET; + memcpy(&nes_sockaddr->sin_addr.s_addr, &ip_addr, sizeof(__be32)); + nes_sockaddr->sin_port = port; +} + +/* + * nes_create_mapinfo - Create a mapinfo object in the port mapper data base + */ +static int nes_create_mapinfo(struct nes_cm_info *cm_info) +{ + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + + nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), + &local_sockaddr); + nes_create_sockaddr(htonl(cm_info->mapped_loc_addr), + htons(cm_info->mapped_loc_port), &mapped_sockaddr); + + return iwpm_create_mapinfo(&local_sockaddr, + &mapped_sockaddr, RDMA_NL_NES); +} + +/* + * nes_remove_mapinfo - Remove a mapinfo object from the port mapper data base + * and send a remove mapping op message to + * the userspace port mapper + */ +static int nes_remove_mapinfo(u32 loc_addr, u16 loc_port, + u32 mapped_loc_addr, u16 mapped_loc_port) +{ + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + + nes_create_sockaddr(htonl(loc_addr), htons(loc_port), &local_sockaddr); + nes_create_sockaddr(htonl(mapped_loc_addr), htons(mapped_loc_port), + &mapped_sockaddr); + + iwpm_remove_mapinfo(&local_sockaddr, &mapped_sockaddr); + return iwpm_remove_mapping(&local_sockaddr, RDMA_NL_NES); +} + +/* + * nes_form_pm_msg - Form a port mapper message with mapping info + */ +static void nes_form_pm_msg(struct nes_cm_info *cm_info, + struct iwpm_sa_data *pm_msg) +{ + nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), + &pm_msg->loc_addr); + nes_create_sockaddr(htonl(cm_info->rem_addr), htons(cm_info->rem_port), + &pm_msg->rem_addr); +} + +/* + * nes_form_reg_msg - Form a port mapper message with dev info + */ +static void nes_form_reg_msg(struct nes_vnic *nesvnic, + struct iwpm_dev_data *pm_msg) +{ + memcpy(pm_msg->dev_name, nesvnic->nesibdev->ibdev.name, + IWPM_DEVNAME_SIZE); + memcpy(pm_msg->if_name, nesvnic->netdev->name, IWPM_IFNAME_SIZE); +} + +/* + * nes_record_pm_msg - Save the received mapping info + */ +static void nes_record_pm_msg(struct nes_cm_info *cm_info, + struct iwpm_sa_data *pm_msg) +{ + struct sockaddr_in *mapped_loc_addr = + (struct sockaddr_in *)&pm_msg->mapped_loc_addr; + struct sockaddr_in *mapped_rem_addr = + (struct sockaddr_in *)&pm_msg->mapped_rem_addr; + + if (mapped_loc_addr->sin_family == AF_INET) { + cm_info->mapped_loc_addr = + ntohl(mapped_loc_addr->sin_addr.s_addr); + cm_info->mapped_loc_port = ntohs(mapped_loc_addr->sin_port); + } + if (mapped_rem_addr->sin_family == AF_INET) { + cm_info->mapped_rem_addr = + ntohl(mapped_rem_addr->sin_addr.s_addr); + cm_info->mapped_rem_port = ntohs(mapped_rem_addr->sin_port); + } +} + /** * print_core - dump a cm core */ @@ -514,6 +640,19 @@ static void print_core(struct nes_cm_core *core) nes_debug(NES_DBG_CM, "-------------- end core ---------------\n"); } +static void record_ird_ord(struct nes_cm_node *cm_node, + u16 conn_ird, u16 conn_ord) +{ + if (conn_ird > NES_MAX_IRD) + conn_ird = NES_MAX_IRD; + + if (conn_ord > NES_MAX_ORD) + conn_ord = NES_MAX_ORD; + + cm_node->ird_size = conn_ird; + cm_node->ord_size = conn_ord; +} + /** * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame */ @@ -557,11 +696,13 @@ static void build_mpa_v2(struct nes_cm_node *cm_node, mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE); /* initialize RTR msg */ - ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ? - IETF_NO_IRD_ORD : cm_node->ird_size; - ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ? - IETF_NO_IRD_ORD : cm_node->ord_size; - + if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { + ctrl_ird = IETF_NO_IRD_ORD; + ctrl_ord = IETF_NO_IRD_ORD; + } else { + ctrl_ird = cm_node->ird_size & IETF_NO_IRD_ORD; + ctrl_ord = cm_node->ord_size & IETF_NO_IRD_ORD; + } ctrl_ird |= IETF_PEER_TO_PEER; ctrl_ird |= IETF_FLPDU_ZERO_LEN; @@ -610,7 +751,7 @@ static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_a struct nes_qp *nesqp = *nesqp_addr; struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0]; - u64temp = (unsigned long)nesqp; + u64temp = (unsigned long)nesqp->nesuqp_addr; u64temp |= NES_SW_CONTEXT_ALIGN >> 1; set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp); @@ -629,11 +770,9 @@ static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_a case SEND_RDMA_READ_ZERO: default: - if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO) { - printk(KERN_ERR "%s[%u]: Unsupported RDMA0 len operation=%u\n", - __func__, __LINE__, cm_node->send_rdma0_op); - WARN_ON(1); - } + if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO) + WARN(1, "Unsupported RDMA0 len operation=%u\n", + cm_node->send_rdma0_op); nes_debug(NES_DBG_CM, "Sending first rdma operation.\n"); wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(NES_IWARP_SQ_OP_RDMAR); @@ -671,7 +810,6 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, struct nes_cm_core *cm_core = cm_node->cm_core; struct nes_timer_entry *new_send; int ret = 0; - u32 was_timer_set; new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC); if (!new_send) @@ -723,12 +861,8 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, } } - was_timer_set = timer_pending(&cm_core->tcp_timer); - - if (!was_timer_set) { - cm_core->tcp_timer.expires = new_send->timetosend; - add_timer(&cm_core->tcp_timer); - } + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, new_send->timetosend); return ret; } @@ -946,10 +1080,8 @@ static void nes_cm_timer_tick(unsigned long pass) } if (settimer) { - if (!timer_pending(&cm_core->tcp_timer)) { - cm_core->tcp_timer.expires = nexttimeout; - add_timer(&cm_core->tcp_timer); - } + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, nexttimeout); } } @@ -1109,8 +1241,11 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, loc_addr, loc_port, cm_node->rem_addr, cm_node->rem_port, rem_addr, rem_port); - if ((cm_node->loc_addr == loc_addr) && (cm_node->loc_port == loc_port) && - (cm_node->rem_addr == rem_addr) && (cm_node->rem_port == rem_port)) { + if ((cm_node->mapped_loc_addr == loc_addr) && + (cm_node->mapped_loc_port == loc_port) && + (cm_node->mapped_rem_addr == rem_addr) && + (cm_node->mapped_rem_port == rem_port)) { + add_ref_cm_node(cm_node); spin_unlock_irqrestore(&cm_core->ht_lock, flags); return cm_node; @@ -1127,18 +1262,28 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, * find_listener - find a cm node listening on this addr-port pair */ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, - nes_addr_t dst_addr, u16 dst_port, enum nes_cm_listener_state listener_state) + nes_addr_t dst_addr, u16 dst_port, + enum nes_cm_listener_state listener_state, int local) { unsigned long flags; struct nes_cm_listener *listen_node; + nes_addr_t listen_addr; + u16 listen_port; /* walk list and find cm_node associated with this session ID */ spin_lock_irqsave(&cm_core->listen_list_lock, flags); list_for_each_entry(listen_node, &cm_core->listen_list.list, list) { + if (local) { + listen_addr = listen_node->loc_addr; + listen_port = listen_node->loc_port; + } else { + listen_addr = listen_node->mapped_loc_addr; + listen_port = listen_node->mapped_loc_port; + } /* compare node pair, return node handle if a match */ - if (((listen_node->loc_addr == dst_addr) || - listen_node->loc_addr == 0x00000000) && - (listen_node->loc_port == dst_port) && + if (((listen_addr == dst_addr) || + listen_addr == 0x00000000) && + (listen_port == dst_port) && (listener_state & listen_node->listener_state)) { atomic_inc(&listen_node->ref_count); spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); @@ -1151,7 +1296,6 @@ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, return NULL; } - /** * add_hte_node - add a cm node to the hash table */ @@ -1272,9 +1416,20 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - if (listener->nesvnic) - nes_manage_apbvt(listener->nesvnic, listener->loc_port, - PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); + if (listener->nesvnic) { + nes_manage_apbvt(listener->nesvnic, + listener->mapped_loc_port, + PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + + nes_remove_mapinfo(listener->loc_addr, + listener->loc_port, + listener->mapped_loc_addr, + listener->mapped_loc_port); + nes_debug(NES_DBG_NLMSG, + "Delete APBVT mapped_loc_port = %04X\n", + listener->mapped_loc_port); + } nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener); @@ -1314,8 +1469,6 @@ static int mini_cm_del_listen(struct nes_cm_core *cm_core, static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) { - u32 was_timer_set; - cm_node->accelerated = 1; if (cm_node->accept_pend) { @@ -1325,11 +1478,8 @@ static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); } - was_timer_set = timer_pending(&cm_core->tcp_timer); - if (!was_timer_set) { - cm_core->tcp_timer.expires = jiffies + NES_SHORT_TIME; - add_timer(&cm_core->tcp_timer); - } + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, (jiffies + NES_SHORT_TIME)); return 0; } @@ -1354,7 +1504,7 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi } if (netif_is_bond_slave(nesvnic->netdev)) - netdev = nesvnic->netdev->master; + netdev = netdev_master_upper_dev_get(nesvnic->netdev); else netdev = nesvnic->netdev; @@ -1368,8 +1518,7 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi neigh->ha, ntohl(rt->rt_gateway)); if (arpindex >= 0) { - if (!memcmp(nesadapter->arp_table[arpindex].mac_addr, - neigh->ha, ETH_ALEN)) { + if (ether_addr_equal(nesadapter->arp_table[arpindex].mac_addr, neigh->ha)) { /* Mac address same as in nes_arp_table */ goto out; } @@ -1422,10 +1571,16 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->loc_port = cm_info->loc_port; cm_node->rem_port = cm_info->rem_port; + cm_node->mapped_loc_addr = cm_info->mapped_loc_addr; + cm_node->mapped_rem_addr = cm_info->mapped_rem_addr; + cm_node->mapped_loc_port = cm_info->mapped_loc_port; + cm_node->mapped_rem_port = cm_info->mapped_rem_port; + cm_node->mpa_frame_rev = mpa_version; cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; - cm_node->ird_size = IETF_NO_IRD_ORD; - cm_node->ord_size = IETF_NO_IRD_ORD; + cm_node->mpav2_ird_ord = 0; + cm_node->ird_size = 0; + cm_node->ord_size = 0; nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n", &cm_node->loc_addr, cm_node->loc_port, @@ -1467,8 +1622,10 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->loopbackpartner = NULL; /* get the mac addr for the remote node */ - oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); - arpindex = nes_addr_resolve_neigh(nesvnic, cm_info->rem_addr, oldarpindex); + oldarpindex = nes_arp_table(nesdev, cm_node->mapped_rem_addr, + NULL, NES_ARP_RESOLVE); + arpindex = nes_addr_resolve_neigh(nesvnic, + cm_node->mapped_rem_addr, oldarpindex); if (arpindex < 0) { kfree(cm_node); return NULL; @@ -1530,11 +1687,14 @@ static int rem_ref_cm_node(struct nes_cm_core *cm_core, mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0); } else { if (cm_node->apbvt_set && cm_node->nesvnic) { - nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port, - PCI_FUNC( - cm_node->nesvnic->nesdev->pcidev->devfn), + nes_manage_apbvt(cm_node->nesvnic, cm_node->mapped_loc_port, + PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); } + nes_debug(NES_DBG_NLMSG, "Delete APBVT mapped_loc_port = %04X\n", + cm_node->mapped_loc_port); + nes_remove_mapinfo(cm_node->loc_addr, cm_node->loc_port, + cm_node->mapped_loc_addr, cm_node->mapped_loc_port); } atomic_dec(&cm_core->node_cnt); @@ -2202,17 +2362,21 @@ static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb, * mini_cm_listen - create a listen node with params */ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, - struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) { struct nes_cm_listener *listener; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; unsigned long flags; + int iwpm_err = 0; nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n", cm_info->loc_addr, cm_info->loc_port); /* cannot have multiple matching listeners */ - listener = find_listener(cm_core, htonl(cm_info->loc_addr), - htons(cm_info->loc_port), NES_CM_LISTENER_EITHER_STATE); + listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port, + NES_CM_LISTENER_EITHER_STATE, 1); + if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) { /* find automatically incs ref count ??? */ atomic_dec(&listener->ref_count); @@ -2221,6 +2385,22 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, } if (!listener) { + nes_form_reg_msg(nesvnic, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); + if (iwpm_err) { + nes_debug(NES_DBG_NLMSG, + "Port Mapper reg pid fail (err = %d).\n", iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + nes_form_pm_msg(cm_info, &pm_msg); + iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_NES); + if (iwpm_err) + nes_debug(NES_DBG_NLMSG, + "Port Mapper query fail (err = %d).\n", iwpm_err); + else + nes_record_pm_msg(cm_info, &pm_msg); + } + /* create a CM listen node (1/2 node to compare incoming traffic to) */ listener = kzalloc(sizeof(*listener), GFP_ATOMIC); if (!listener) { @@ -2228,8 +2408,10 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, return NULL; } - listener->loc_addr = htonl(cm_info->loc_addr); - listener->loc_port = htons(cm_info->loc_port); + listener->loc_addr = cm_info->loc_addr; + listener->loc_port = cm_info->loc_port; + listener->mapped_loc_addr = cm_info->mapped_loc_addr; + listener->mapped_loc_port = cm_info->mapped_loc_port; listener->reused_node = 0; atomic_set(&listener->ref_count, 1); @@ -2291,14 +2473,18 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, if (cm_info->loc_addr == cm_info->rem_addr) { loopbackremotelistener = find_listener(cm_core, - ntohl(nesvnic->local_ipaddr), cm_node->rem_port, - NES_CM_LISTENER_ACTIVE_STATE); + cm_node->mapped_loc_addr, cm_node->mapped_rem_port, + NES_CM_LISTENER_ACTIVE_STATE, 0); if (loopbackremotelistener == NULL) { create_event(cm_node, NES_CM_EVENT_ABORTED); } else { loopback_cm_info = *cm_info; loopback_cm_info.loc_port = cm_info->rem_port; loopback_cm_info.rem_port = cm_info->loc_port; + loopback_cm_info.mapped_loc_port = + cm_info->mapped_rem_port; + loopback_cm_info.mapped_rem_port = + cm_info->mapped_loc_port; loopback_cm_info.cm_id = loopbackremotelistener->cm_id; loopbackremotenode = make_cm_node(cm_core, nesvnic, &loopback_cm_info, loopbackremotelistener); @@ -2527,6 +2713,12 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, nfo.rem_addr = ntohl(iph->saddr); nfo.rem_port = ntohs(tcph->source); + /* If port mapper is available these should be mapped address info */ + nfo.mapped_loc_addr = ntohl(iph->daddr); + nfo.mapped_loc_port = ntohs(tcph->dest); + nfo.mapped_rem_addr = ntohl(iph->saddr); + nfo.mapped_rem_port = ntohs(tcph->source); + tmp_daddr = cpu_to_be32(iph->daddr); tmp_saddr = cpu_to_be32(iph->saddr); @@ -2535,8 +2727,8 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, do { cm_node = find_node(cm_core, - nfo.rem_port, nfo.rem_addr, - nfo.loc_port, nfo.loc_addr); + nfo.mapped_rem_port, nfo.mapped_rem_addr, + nfo.mapped_loc_port, nfo.mapped_loc_addr); if (!cm_node) { /* Only type of packet accepted are for */ @@ -2545,9 +2737,9 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, skb_handled = 0; break; } - listener = find_listener(cm_core, nfo.loc_addr, - nfo.loc_port, - NES_CM_LISTENER_ACTIVE_STATE); + listener = find_listener(cm_core, nfo.mapped_loc_addr, + nfo.mapped_loc_port, + NES_CM_LISTENER_ACTIVE_STATE, 0); if (!listener) { nfo.cm_id = NULL; nfo.conn_type = 0; @@ -3012,6 +3204,8 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) u8 *start_ptr = &start_addr; u8 **start_buff = &start_ptr; u16 buff_len = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; ibqp = nes_get_qp(cm_id->device, conn_param->qpn); if (!ibqp) @@ -3040,11 +3234,11 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) rem_ref_cm_node(cm_node->cm_core, cm_node); return -ECONNRESET; } - /* associate the node with the QP */ nesqp->cm_node = (void *)cm_node; cm_node->nesqp = nesqp; + nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu listener = %p\n", nesqp->hwqp.qp_id, cm_node, jiffies, cm_node->listener); atomic_inc(&cm_accepts); @@ -3067,6 +3261,11 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (cm_node->mpa_frame_rev == IETF_MPA_V1) mpa_frame_offset = 4; + if (cm_node->mpa_frame_rev == IETF_MPA_V1 || + cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { + record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); + } + memcpy(mpa_v2_frame->priv_data, conn_param->private_data, conn_param->private_data_len); @@ -3076,8 +3275,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) /* setup our first outgoing iWarp send WQE (the IETF frame response) */ wqe = &nesqp->hwqp.sq_vbase[0]; - if (cm_id->remote_addr.sin_addr.s_addr != - cm_id->local_addr.sin_addr.s_addr) { + if (raddr->sin_addr.s_addr != laddr->sin_addr.s_addr) { u64temp = (unsigned long)nesqp; nesibdev = nesvnic->nesibdev; nespd = nesqp->nespd; @@ -3131,7 +3329,6 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) } nesqp->skip_lsmm = 1; - /* Cache the cm_id in the qp */ nesqp->cm_id = cm_id; cm_node->cm_id = cm_id; @@ -3147,12 +3344,11 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nes_cm_init_tsa_conn(nesqp, cm_node); nesqp->nesqp_context->tcpPorts[0] = - cpu_to_le16(ntohs(cm_id->local_addr.sin_port)); + cpu_to_le16(cm_node->mapped_loc_port); nesqp->nesqp_context->tcpPorts[1] = - cpu_to_le16(ntohs(cm_id->remote_addr.sin_port)); + cpu_to_le16(cm_node->mapped_rem_port); - nesqp->nesqp_context->ip0 = - cpu_to_le32(ntohl(cm_id->remote_addr.sin_addr.s_addr)); + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); nesqp->nesqp_context->misc2 |= cpu_to_le32( (u32)PCI_FUNC(nesdev->pcidev->devfn) << @@ -3171,14 +3367,14 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32( ((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT)); nesqp->nesqp_context->ird_ord_sizes |= - cpu_to_le32((u32)conn_param->ord); + cpu_to_le32((u32)cm_node->ord_size); memset(&nes_quad, 0, sizeof(nes_quad)); nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); - nes_quad.SrcIpadr = cm_id->remote_addr.sin_addr.s_addr; - nes_quad.TcpPorts[0] = cm_id->remote_addr.sin_port; - nes_quad.TcpPorts[1] = cm_id->local_addr.sin_port; + nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); /* Produce hash key */ crc_value = get_crc_value(&nes_quad); @@ -3194,10 +3390,8 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = " "0x%08X:0x%04X, rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + " "private data length=%u.\n", nesqp->hwqp.qp_id, - ntohl(cm_id->remote_addr.sin_addr.s_addr), - ntohs(cm_id->remote_addr.sin_port), - ntohl(cm_id->local_addr.sin_addr.s_addr), - ntohs(cm_id->local_addr.sin_port), + ntohl(raddr->sin_addr.s_addr), ntohs(raddr->sin_port), + ntohl(laddr->sin_addr.s_addr), ntohs(laddr->sin_port), le32_to_cpu(nesqp->nesqp_context->rcv_nxt), le32_to_cpu(nesqp->nesqp_context->snd_nxt), buff_len); @@ -3213,6 +3407,9 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_event.remote_addr = cm_id->remote_addr; cm_event.private_data = NULL; cm_event.private_data_len = 0; + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; + ret = cm_id->event_handler(cm_id, &cm_event); attr.qp_state = IB_QPS_RTS; nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); @@ -3277,7 +3474,14 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct nes_cm_node *cm_node; struct nes_cm_info cm_info; int apbvt_set = 0; - + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int iwpm_err = 0; + + if (cm_id->remote_addr.ss_family != AF_INET) + return -ENOSYS; ibqp = nes_get_qp(cm_id->device, conn_param->qpn); if (!ibqp) return -EINVAL; @@ -3291,50 +3495,66 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (!nesdev) return -EINVAL; - if (!(cm_id->local_addr.sin_port) || !(cm_id->remote_addr.sin_port)) + if (!laddr->sin_port || !raddr->sin_port) return -EINVAL; nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = " "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id, - ntohl(nesvnic->local_ipaddr), - ntohl(cm_id->remote_addr.sin_addr.s_addr), - ntohs(cm_id->remote_addr.sin_port), - ntohl(cm_id->local_addr.sin_addr.s_addr), - ntohs(cm_id->local_addr.sin_port)); + ntohl(nesvnic->local_ipaddr), ntohl(raddr->sin_addr.s_addr), + ntohs(raddr->sin_port), ntohl(laddr->sin_addr.s_addr), + ntohs(laddr->sin_port)); atomic_inc(&cm_connects); nesqp->active_conn = 1; /* cache the cm_id in the qp */ nesqp->cm_id = cm_id; - cm_id->provider_data = nesqp; - nesqp->private_data_len = conn_param->private_data_len; - nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((u32)conn_param->ord); - /* space for rdma0 read msg */ - if (conn_param->ord == 0) - nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(1); nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord); nes_debug(NES_DBG_CM, "mpa private data len =%u\n", conn_param->private_data_len); - if (cm_id->local_addr.sin_addr.s_addr != - cm_id->remote_addr.sin_addr.s_addr) { - nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port), - PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); - apbvt_set = 1; - } - /* set up the connection params for the node */ - cm_info.loc_addr = htonl(cm_id->local_addr.sin_addr.s_addr); - cm_info.loc_port = htons(cm_id->local_addr.sin_port); - cm_info.rem_addr = htonl(cm_id->remote_addr.sin_addr.s_addr); - cm_info.rem_port = htons(cm_id->remote_addr.sin_port); + cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr); + cm_info.loc_port = ntohs(laddr->sin_port); + cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr); + cm_info.rem_port = ntohs(raddr->sin_port); cm_info.cm_id = cm_id; cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + /* No port mapper available, go with the specified peer information */ + cm_info.mapped_loc_addr = cm_info.loc_addr; + cm_info.mapped_loc_port = cm_info.loc_port; + cm_info.mapped_rem_addr = cm_info.rem_addr; + cm_info.mapped_rem_port = cm_info.rem_port; + + nes_form_reg_msg(nesvnic, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); + if (iwpm_err) { + nes_debug(NES_DBG_NLMSG, + "Port Mapper reg pid fail (err = %d).\n", iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + nes_form_pm_msg(&cm_info, &pm_msg); + iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_NES); + if (iwpm_err) + nes_debug(NES_DBG_NLMSG, + "Port Mapper query fail (err = %d).\n", iwpm_err); + else + nes_record_pm_msg(&cm_info, &pm_msg); + } + + if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) { + nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, + PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); + apbvt_set = 1; + } + + if (nes_create_mapinfo(&cm_info)) + return -ENOMEM; + cm_id->add_ref(cm_id); /* create a connect CM node connection */ @@ -3343,14 +3563,23 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) &cm_info); if (!cm_node) { if (apbvt_set) - nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port), + nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); + nes_debug(NES_DBG_NLMSG, "Delete mapped_loc_port = %04X\n", + cm_info.mapped_loc_port); + nes_remove_mapinfo(cm_info.loc_addr, cm_info.loc_port, + cm_info.mapped_loc_addr, cm_info.mapped_loc_port); cm_id->rem_ref(cm_id); return -ENOMEM; } + record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); + if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && + cm_node->ord_size == 0) + cm_node->ord_size = 1; + cm_node->apbvt_set = apbvt_set; nesqp->cm_node = cm_node; cm_node->nesqp = nesqp; @@ -3369,10 +3598,13 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) struct nes_cm_listener *cm_node; struct nes_cm_info cm_info; int err; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n", - cm_id, ntohs(cm_id->local_addr.sin_port)); + cm_id, ntohs(laddr->sin_port)); + if (cm_id->local_addr.ss_family != AF_INET) + return -ENOSYS; nesvnic = to_nesvnic(cm_id->device); if (!nesvnic) return -EINVAL; @@ -3381,16 +3613,19 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) nesvnic, nesvnic->netdev, nesvnic->netdev->name); nes_debug(NES_DBG_CM, "nesvnic->local_ipaddr=0x%08x, sin_addr.s_addr=0x%08x\n", - nesvnic->local_ipaddr, cm_id->local_addr.sin_addr.s_addr); + nesvnic->local_ipaddr, laddr->sin_addr.s_addr); /* setup listen params in our api call struct */ - cm_info.loc_addr = nesvnic->local_ipaddr; - cm_info.loc_port = cm_id->local_addr.sin_port; + cm_info.loc_addr = ntohl(nesvnic->local_ipaddr); + cm_info.loc_port = ntohs(laddr->sin_port); cm_info.backlog = backlog; cm_info.cm_id = cm_id; cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + /* No port mapper available, go with the specified info */ + cm_info.mapped_loc_addr = cm_info.loc_addr; + cm_info.mapped_loc_port = cm_info.loc_port; cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info); if (!cm_node) { @@ -3402,8 +3637,10 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->provider_data = cm_node; if (!cm_node->reused_node) { - err = nes_manage_apbvt(nesvnic, - ntohs(cm_id->local_addr.sin_port), + if (nes_create_mapinfo(&cm_info)) + return -ENOMEM; + + err = nes_manage_apbvt(nesvnic, cm_node->mapped_loc_port, PCI_FUNC(nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); if (err) { @@ -3501,6 +3738,9 @@ static void cm_event_connected(struct nes_cm_event *event) struct nes_v4_quad nes_quad; u32 crc_value; int ret; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct sockaddr_in *cm_event_laddr; /* get all our handles */ cm_node = event->cm_node; @@ -3510,27 +3750,26 @@ static void cm_event_connected(struct nes_cm_event *event) nesvnic = to_nesvnic(nesqp->ibqp.device); nesdev = nesvnic->nesdev; nesadapter = nesdev->nesadapter; + laddr = (struct sockaddr_in *)&cm_id->local_addr; + raddr = (struct sockaddr_in *)&cm_id->remote_addr; + cm_event_laddr = (struct sockaddr_in *)&cm_event.local_addr; if (nesqp->destroyed) return; atomic_inc(&cm_connecteds); nes_debug(NES_DBG_CM, "QP%u attempting to connect to 0x%08X:0x%04X on" " local port 0x%04X. jiffies = %lu.\n", - nesqp->hwqp.qp_id, - ntohl(cm_id->remote_addr.sin_addr.s_addr), - ntohs(cm_id->remote_addr.sin_port), - ntohs(cm_id->local_addr.sin_port), - jiffies); + nesqp->hwqp.qp_id, ntohl(raddr->sin_addr.s_addr), + ntohs(raddr->sin_port), ntohs(laddr->sin_port), jiffies); nes_cm_init_tsa_conn(nesqp, cm_node); /* set the QP tsa context */ nesqp->nesqp_context->tcpPorts[0] = - cpu_to_le16(ntohs(cm_id->local_addr.sin_port)); + cpu_to_le16(cm_node->mapped_loc_port); nesqp->nesqp_context->tcpPorts[1] = - cpu_to_le16(ntohs(cm_id->remote_addr.sin_port)); - nesqp->nesqp_context->ip0 = - cpu_to_le32(ntohl(cm_id->remote_addr.sin_addr.s_addr)); + cpu_to_le16(cm_node->mapped_rem_port); + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); nesqp->nesqp_context->misc2 |= cpu_to_le32( (u32)PCI_FUNC(nesdev->pcidev->devfn) << @@ -3545,6 +3784,8 @@ static void cm_event_connected(struct nes_cm_event *event) nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)cm_node->ord_size); /* Adjust tail for not having a LSMM */ /*nesqp->hwqp.sq_tail = 1;*/ @@ -3558,9 +3799,9 @@ static void cm_event_connected(struct nes_cm_event *event) nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); - nes_quad.SrcIpadr = cm_id->remote_addr.sin_addr.s_addr; - nes_quad.TcpPorts[0] = cm_id->remote_addr.sin_port; - nes_quad.TcpPorts[1] = cm_id->local_addr.sin_port; + nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); /* Produce hash key */ crc_value = get_crc_value(&nes_quad); @@ -3579,8 +3820,8 @@ static void cm_event_connected(struct nes_cm_event *event) cm_event.event = IW_CM_EVENT_CONNECT_REPLY; cm_event.status = 0; cm_event.provider_data = cm_id->provider_data; - cm_event.local_addr.sin_family = AF_INET; - cm_event.local_addr.sin_port = cm_id->local_addr.sin_port; + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = laddr->sin_port; cm_event.remote_addr = cm_id->remote_addr; cm_event.private_data = (void *)event->cm_node->mpa_frame_buf; @@ -3588,7 +3829,7 @@ static void cm_event_connected(struct nes_cm_event *event) cm_event.ird = cm_node->ird_size; cm_event.ord = cm_node->ord_size; - cm_event.local_addr.sin_addr.s_addr = event->cm_info.rem_addr; + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); ret = cm_id->event_handler(cm_id, &cm_event); nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); @@ -3641,9 +3882,16 @@ static void cm_event_connect_error(struct nes_cm_event *event) cm_event.private_data = NULL; cm_event.private_data_len = 0; - nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, " - "remove_addr=%08x\n", cm_event.local_addr.sin_addr.s_addr, - cm_event.remote_addr.sin_addr.s_addr); +#ifdef CONFIG_INFINIBAND_NES_DEBUG + { + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; + nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, remote_addr=%08x\n", + cm_event_laddr->sin_addr.s_addr, cm_event_raddr->sin_addr.s_addr); + } +#endif ret = cm_id->event_handler(cm_id, &cm_event); nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); @@ -3723,6 +3971,10 @@ static void cm_event_mpa_req(struct nes_cm_event *event) struct iw_cm_event cm_event; int ret; struct nes_cm_node *cm_node; + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; cm_node = event->cm_node; if (!cm_node) @@ -3737,17 +3989,22 @@ static void cm_event_mpa_req(struct nes_cm_event *event) cm_event.status = 0; cm_event.provider_data = (void *)cm_node; - cm_event.local_addr.sin_family = AF_INET; - cm_event.local_addr.sin_port = htons(event->cm_info.loc_port); - cm_event.local_addr.sin_addr.s_addr = htonl(event->cm_info.loc_addr); + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = htons(event->cm_info.loc_port); + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); - cm_event.remote_addr.sin_family = AF_INET; - cm_event.remote_addr.sin_port = htons(event->cm_info.rem_port); - cm_event.remote_addr.sin_addr.s_addr = htonl(event->cm_info.rem_addr); + cm_event_raddr->sin_family = AF_INET; + cm_event_raddr->sin_port = htons(event->cm_info.rem_port); + cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); cm_event.private_data = cm_node->mpa_frame_buf; cm_event.private_data_len = (u8)cm_node->mpa_frame_size; + if (cm_node->mpa_frame_rev == IETF_MPA_V1) { + cm_event.ird = NES_MAX_IRD; + cm_event.ord = NES_MAX_ORD; + } else { cm_event.ird = cm_node->ird_size; cm_event.ord = cm_node->ord_size; + } ret = cm_id->event_handler(cm_id, &cm_event); if (ret) @@ -3763,6 +4020,10 @@ static void cm_event_mpa_reject(struct nes_cm_event *event) struct iw_cm_event cm_event; struct nes_cm_node *cm_node; int ret; + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; cm_node = event->cm_node; if (!cm_node) @@ -3777,21 +4038,21 @@ static void cm_event_mpa_reject(struct nes_cm_event *event) cm_event.status = -ECONNREFUSED; cm_event.provider_data = cm_id->provider_data; - cm_event.local_addr.sin_family = AF_INET; - cm_event.local_addr.sin_port = htons(event->cm_info.loc_port); - cm_event.local_addr.sin_addr.s_addr = htonl(event->cm_info.loc_addr); + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = htons(event->cm_info.loc_port); + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); - cm_event.remote_addr.sin_family = AF_INET; - cm_event.remote_addr.sin_port = htons(event->cm_info.rem_port); - cm_event.remote_addr.sin_addr.s_addr = htonl(event->cm_info.rem_addr); + cm_event_raddr->sin_family = AF_INET; + cm_event_raddr->sin_port = htons(event->cm_info.rem_port); + cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); cm_event.private_data = cm_node->mpa_frame_buf; cm_event.private_data_len = (u8)cm_node->mpa_frame_size; nes_debug(NES_DBG_CM, "call CM_EVENT_MPA_REJECTED, local_addr=%08x, " "remove_addr=%08x\n", - cm_event.local_addr.sin_addr.s_addr, - cm_event.remote_addr.sin_addr.s_addr); + cm_event_laddr->sin_addr.s_addr, + cm_event_raddr->sin_addr.s_addr); ret = cm_id->event_handler(cm_id, &cm_event); if (ret) diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 4646e666608..f522cf63978 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -58,6 +58,8 @@ #define IETF_RDMA0_WRITE 0x8000 #define IETF_RDMA0_READ 0x4000 #define IETF_NO_IRD_ORD 0x3FFF +#define NES_MAX_IRD 0x40 +#define NES_MAX_ORD 0x7F enum ietf_mpa_flags { IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ @@ -291,8 +293,8 @@ struct nes_cm_listener { struct list_head list; struct nes_cm_core *cm_core; u8 loc_mac[ETH_ALEN]; - nes_addr_t loc_addr; - u16 loc_port; + nes_addr_t loc_addr, mapped_loc_addr; + u16 loc_port, mapped_loc_port; struct iw_cm_id *cm_id; enum nes_cm_conn_type conn_type; atomic_t ref_count; @@ -306,7 +308,9 @@ struct nes_cm_listener { /* per connection node and node state information */ struct nes_cm_node { nes_addr_t loc_addr, rem_addr; + nes_addr_t mapped_loc_addr, mapped_rem_addr; u16 loc_port, rem_port; + u16 mapped_loc_port, mapped_rem_port; u8 loc_mac[ETH_ALEN]; u8 rem_mac[ETH_ALEN]; @@ -333,6 +337,7 @@ struct nes_cm_node { enum mpa_frame_version mpa_frame_rev; u16 ird_size; u16 ord_size; + u16 mpav2_ird_ord; u16 mpa_frame_size; struct iw_cm_id *cm_id; @@ -361,6 +366,10 @@ struct nes_cm_info { u16 rem_port; nes_addr_t loc_addr; nes_addr_t rem_addr; + u16 mapped_loc_port; + u16 mapped_rem_port; + nes_addr_t mapped_loc_addr; + nes_addr_t mapped_rem_addr; enum nes_cm_conn_type conn_type; int backlog; diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index fe7965ee409..90200245c5e 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -75,7 +75,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, static void process_critical_error(struct nes_device *nesdev); static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number); static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode); -static void nes_terminate_timeout(unsigned long context); static void nes_terminate_start_timer(struct nes_qp *nesqp); #ifdef CONFIG_INFINIBAND_NES_DEBUG @@ -2949,7 +2948,7 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n", nesvnic->netdev->name, vlan_tag); - __vlan_hwaccel_put_tag(rx_skb, vlan_tag); + __vlan_hwaccel_put_tag(rx_skb, htons(ETH_P_8021Q), vlan_tag); } if (nes_use_lro) lro_receive_skb(&nesvnic->lro_mgr, rx_skb, NULL); @@ -3520,7 +3519,7 @@ static void nes_terminate_received(struct nes_device *nesdev, } /* Timeout routine in case terminate fails to complete */ -static void nes_terminate_timeout(unsigned long context) +void nes_terminate_timeout(unsigned long context) { struct nes_qp *nesqp = (struct nes_qp *)(unsigned long)context; @@ -3530,11 +3529,7 @@ static void nes_terminate_timeout(unsigned long context) /* Set a timer in case hw cannot complete the terminate sequence */ static void nes_terminate_start_timer(struct nes_qp *nesqp) { - init_timer(&nesqp->terminate_timer); - nesqp->terminate_timer.function = nes_terminate_timeout; - nesqp->terminate_timer.expires = jiffies + HZ; - nesqp->terminate_timer.data = (unsigned long)nesqp; - add_timer(&nesqp->terminate_timer); + mod_timer(&nesqp->terminate_timer, (jiffies + HZ)); } /** @@ -3575,10 +3570,10 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; nes_debug(NES_DBG_AEQ, "aeid = 0x%04X, qp-cq id = %d, aeqe = %p," - " Tcp state = %d, iWARP state = %d\n", + " Tcp state = %s, iWARP state = %s\n", async_event_id, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe, - tcp_state, iwarp_state); + nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]); aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]); if (aeq_info & NES_AEQE_QP) { diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c index 3ba7be36945..416645259b0 100644 --- a/drivers/infiniband/hw/nes/nes_mgt.c +++ b/drivers/infiniband/hw/nes/nes_mgt.c @@ -210,6 +210,9 @@ static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp } while (1) { + if (skb_queue_empty(&nesqp->pau_list)) + goto out; + seq = nes_get_seq(skb, ack, wnd, fin_rcvd, rst_rcvd); if (seq == nextseq) { if (skb->len || processacks) @@ -218,14 +221,13 @@ static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp goto out; } - if (skb->next == (struct sk_buff *)&nesqp->pau_list) - goto out; - old_skb = skb; skb = skb->next; skb_unlink(old_skb, &nesqp->pau_list); nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE); nes_rem_ref_cm_node(nesqp->cm_node); + if (skb == (struct sk_buff *)&nesqp->pau_list) + goto out; } return skb; @@ -245,7 +247,6 @@ static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, struct nes_rskb_cb *cb; struct pau_fpdu_info *fpdu_info = NULL; struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; - unsigned long flags; u32 fpdu_len = 0; u32 tmp_len; int frag_cnt = 0; @@ -260,12 +261,10 @@ static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, *pau_fpdu_info = NULL; - spin_lock_irqsave(&nesqp->pau_lock, flags); skb = nes_get_next_skb(nesdev, nesqp, NULL, nesqp->pau_rcv_nxt, &ack, &wnd, &fin_rcvd, &rst_rcvd); - if (!skb) { - spin_unlock_irqrestore(&nesqp->pau_lock, flags); + if (!skb) goto out; - } + cb = (struct nes_rskb_cb *)&skb->cb[0]; if (skb->len) { fpdu_len = be16_to_cpu(*(__be16 *) skb->data) + MPA_FRAMING; @@ -290,10 +289,9 @@ static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, skb = nes_get_next_skb(nesdev, nesqp, skb, nesqp->pau_rcv_nxt + frag_tot, &ack, &wnd, &fin_rcvd, &rst_rcvd); - if (!skb) { - spin_unlock_irqrestore(&nesqp->pau_lock, flags); + if (!skb) goto out; - } else if (rst_rcvd) { + if (rst_rcvd) { /* rst received in the middle of fpdu */ for (; i >= 0; i--) { skb_unlink(frags[i].skb, &nesqp->pau_list); @@ -320,8 +318,6 @@ static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, frag_cnt = 1; } - spin_unlock_irqrestore(&nesqp->pau_lock, flags); - /* Found one */ fpdu_info = kzalloc(sizeof(*fpdu_info), GFP_ATOMIC); if (fpdu_info == NULL) { @@ -383,9 +379,8 @@ static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, if (frags[i].skb->len == 0) { /* Pull skb off the list - it will be freed in the callback */ - spin_lock_irqsave(&nesqp->pau_lock, flags); - skb_unlink(frags[i].skb, &nesqp->pau_list); - spin_unlock_irqrestore(&nesqp->pau_lock, flags); + if (!skb_queue_empty(&nesqp->pau_list)) + skb_unlink(frags[i].skb, &nesqp->pau_list); } else { /* Last skb still has data so update the seq */ iph = (struct iphdr *)(cb->data_start + ETH_HLEN); @@ -414,14 +409,18 @@ static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) struct pau_fpdu_info *fpdu_info; struct nes_hw_cqp_wqe *cqp_wqe; struct nes_cqp_request *cqp_request; + unsigned long flags; u64 u64tmp; u32 u32tmp; int rc; while (1) { + spin_lock_irqsave(&nesqp->pau_lock, flags); rc = get_fpdu_info(nesdev, nesqp, &fpdu_info); - if (fpdu_info == NULL) + if (rc || (fpdu_info == NULL)) { + spin_unlock_irqrestore(&nesqp->pau_lock, flags); return rc; + } cqp_request = fpdu_info->cqp_request; cqp_wqe = &cqp_request->cqp_wqe; @@ -447,7 +446,7 @@ static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX, lower_32_bits(u64tmp)); set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_HIGH_IDX, - upper_32_bits(u64tmp >> 32)); + upper_32_bits(u64tmp)); set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, lower_32_bits(fpdu_info->frags[0].physaddr)); @@ -475,6 +474,7 @@ static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) atomic_set(&cqp_request->refcount, 1); nes_post_cqp_request(nesdev, cqp_request); + spin_unlock_irqrestore(&nesqp->pau_lock, flags); } return 0; @@ -649,11 +649,9 @@ static void nes_chg_qh_handler(struct nes_device *nesdev, struct nes_cqp_request nesqp = qh_chg->nesqp; /* Should we handle the bad completion */ - if (cqp_request->major_code) { - printk(KERN_ERR PFX "Invalid cqp_request major_code=0x%x\n", + if (cqp_request->major_code) + WARN(1, PFX "Invalid cqp_request major_code=0x%x\n", cqp_request->major_code); - WARN_ON(1); - } switch (nesqp->pau_state) { case PAU_DEL_QH: diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 0564be757d8..49eb5111d2c 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -944,12 +944,13 @@ static void nes_netdev_set_multicast_list(struct net_device *netdev) addr, perfect_filter_register_address+(mc_index * 8), mc_nic_index); - macaddr_high = ((u16) addr[0]) << 8; - macaddr_high += (u16) addr[1]; - macaddr_low = ((u32) addr[2]) << 24; - macaddr_low += ((u32) addr[3]) << 16; - macaddr_low += ((u32) addr[4]) << 8; - macaddr_low += (u32) addr[5]; + macaddr_high = ((u8) addr[0]) << 8; + macaddr_high += (u8) addr[1]; + macaddr_low = ((u8) addr[2]) << 24; + macaddr_low += ((u8) addr[3]) << 16; + macaddr_low += ((u8) addr[4]) << 8; + macaddr_low += (u8) addr[5]; + nes_write_indexed(nesdev, perfect_filter_register_address+(mc_index * 8), macaddr_low); @@ -1316,11 +1317,13 @@ static void nes_netdev_get_drvinfo(struct net_device *netdev, struct nes_vnic *nesvnic = netdev_priv(netdev); struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; - strcpy(drvinfo->driver, DRV_NAME); - strcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev)); - sprintf(drvinfo->fw_version, "%u.%u", nesadapter->firmware_version>>16, - nesadapter->firmware_version & 0x000000ff); - strcpy(drvinfo->version, DRV_VERSION); + strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); + strlcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev), + sizeof(drvinfo->bus_info)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%u.%u", nesadapter->firmware_version >> 16, + nesadapter->firmware_version & 0x000000ff); + strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version)); drvinfo->testinfo_len = 0; drvinfo->eedump_len = 0; drvinfo->regdump_len = 0; @@ -1596,7 +1599,7 @@ static void nes_vlan_mode(struct net_device *netdev, struct nes_device *nesdev, /* Enable/Disable VLAN Stripping */ u32temp = nes_read_indexed(nesdev, NES_IDX_PCIX_DIAG); - if (features & NETIF_F_HW_VLAN_RX) + if (features & NETIF_F_HW_VLAN_CTAG_RX) u32temp &= 0xfdffffff; else u32temp |= 0x02000000; @@ -1611,10 +1614,10 @@ static netdev_features_t nes_fix_features(struct net_device *netdev, netdev_feat * Since there is no support for separate rx/tx vlan accel * enable/disable make sure tx flag is always in same state as rx. */ - if (features & NETIF_F_HW_VLAN_RX) - features |= NETIF_F_HW_VLAN_TX; + if (features & NETIF_F_HW_VLAN_CTAG_RX) + features |= NETIF_F_HW_VLAN_CTAG_TX; else - features &= ~NETIF_F_HW_VLAN_TX; + features &= ~NETIF_F_HW_VLAN_CTAG_TX; return features; } @@ -1625,7 +1628,7 @@ static int nes_set_features(struct net_device *netdev, netdev_features_t feature struct nes_device *nesdev = nesvnic->nesdev; u32 changed = netdev->features ^ features; - if (changed & NETIF_F_HW_VLAN_RX) + if (changed & NETIF_F_HW_VLAN_CTAG_RX) nes_vlan_mode(netdev, nesdev, features); return 0; @@ -1702,13 +1705,12 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev, netdev->dev_addr[3] = (u8)(u64temp>>16); netdev->dev_addr[4] = (u8)(u64temp>>8); netdev->dev_addr[5] = (u8)u64temp; - memcpy(netdev->perm_addr, netdev->dev_addr, 6); - netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_RX; + netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX; if ((nesvnic->logical_port < 2) || (nesdev->nesadapter->hw_rev != NE020_REV)) netdev->hw_features |= NETIF_F_TSO; - netdev->features = netdev->hw_features | NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_TX; + netdev->features = netdev->hw_features | NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX; netdev->hw_features |= NETIF_F_LRO; nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d," diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h index 4926de74448..529c421bb15 100644 --- a/drivers/infiniband/hw/nes/nes_user.h +++ b/drivers/infiniband/hw/nes/nes_user.h @@ -39,8 +39,8 @@ #include <linux/types.h> -#define NES_ABI_USERSPACE_VER 1 -#define NES_ABI_KERNEL_VER 1 +#define NES_ABI_USERSPACE_VER 2 +#define NES_ABI_KERNEL_VER 2 /* * Make sure that all structs defined in this file remain laid out so @@ -78,6 +78,7 @@ struct nes_create_cq_req { struct nes_create_qp_req { __u64 user_wqe_buffers; + __u64 user_qp_buffer; }; enum iwnes_memreg_type { diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index cd0ecb215cc..218dd357428 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -55,7 +55,8 @@ static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev); /** * nes_alloc_mw */ -static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) { +static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type) +{ struct nes_pd *nespd = to_nespd(ibpd); struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); struct nes_device *nesdev = nesvnic->nesdev; @@ -71,6 +72,9 @@ static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) { u32 driver_key = 0; u8 stag_key = 0; + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); stag_key = (u8)next_stag_index; @@ -244,20 +248,19 @@ static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw, if (ibmw_bind->send_flags & IB_SEND_SIGNALED) wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; - if (ibmw_bind->mw_access_flags & IB_ACCESS_REMOTE_WRITE) { + if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_WRITE) wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE; - } - if (ibmw_bind->mw_access_flags & IB_ACCESS_REMOTE_READ) { + if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_READ) wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_READ; - } set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_MISC_IDX, wqe_misc); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX, ibmw_bind->mr->lkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX, + ibmw_bind->bind_info.mr->lkey); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MW_IDX, ibmw->rkey); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX, - ibmw_bind->length); + ibmw_bind->bind_info.length); wqe->wqe_words[NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX] = 0; - u64temp = (u64)ibmw_bind->addr; + u64temp = (u64)ibmw_bind->bind_info.addr; set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX, u64temp); head++; @@ -1183,11 +1186,13 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); kfree(nesqp->allocated_buffer); nes_debug(NES_DBG_QP, "ib_copy_from_udata() Failed \n"); - return NULL; + return ERR_PTR(-EFAULT); } if (req.user_wqe_buffers) { virt_wqs = 1; } + if (req.user_qp_buffer) + nesqp->nesuqp_addr = req.user_qp_buffer; if ((ibpd->uobject) && (ibpd->uobject->context)) { nesqp->user_mode = 1; nes_ucontext = to_nesucontext(ibpd->uobject->context); @@ -1381,6 +1386,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, if (ibpd->uobject) { uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index; + uresp.mmap_rq_db_index = 0; uresp.actual_sq_size = sq_size; uresp.actual_rq_size = rq_size; uresp.qp_id = nesqp->hwqp.qp_id; @@ -1404,6 +1410,9 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, } nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR); + init_timer(&nesqp->terminate_timer); + nesqp->terminate_timer.function = nes_terminate_timeout; + nesqp->terminate_timer.data = (unsigned long)nesqp; /* update the QP table */ nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; @@ -1413,7 +1422,6 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, return &nesqp->ibqp; } - /** * nes_clean_cq */ @@ -1762,7 +1770,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries, resp.cq_id = nescq->hw_cq.cq_number; resp.cq_size = nescq->hw_cq.cq_size; resp.mmap_db_index = 0; - if (ib_copy_to_udata(udata, &resp, sizeof resp)) { + if (ib_copy_to_udata(udata, &resp, sizeof resp - sizeof resp.reserved)) { nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); kfree(nescq); return ERR_PTR(-EFAULT); @@ -2301,7 +2309,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; struct ib_mr *ibmr = ERR_PTR(-EINVAL); - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct nes_ucontext *nes_ucontext; struct nes_pbl *nespbl; struct nes_mr *nesmr; @@ -2309,7 +2317,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct nes_mem_reg_req req; struct nes_vpbl vpbl; struct nes_root_vpbl root_vpbl; - int nmap_index, page_index; + int entry, page_index; int page_count = 0; int err, pbl_depth = 0; int chunk_pages; @@ -2324,6 +2332,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u16 pbl_count; u8 single_page = 1; u8 stag_key; + int first_page = 1; region = ib_umem_get(pd->uobject->context, start, length, acc, 0); if (IS_ERR(region)) { @@ -2374,128 +2383,125 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } nesmr->region = region; - list_for_each_entry(chunk, ®ion->chunk_list, list) { - nes_debug(NES_DBG_MR, "Chunk: nents = %u, nmap = %u .\n", - chunk->nents, chunk->nmap); - for (nmap_index = 0; nmap_index < chunk->nmap; ++nmap_index) { - if (sg_dma_address(&chunk->page_list[nmap_index]) & ~PAGE_MASK) { - ib_umem_release(region); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n", - (unsigned int) sg_dma_address(&chunk->page_list[nmap_index])); - ibmr = ERR_PTR(-EINVAL); - kfree(nesmr); - goto reg_user_mr_err; - } + for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { + if (sg_dma_address(sg) & ~PAGE_MASK) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n", + (unsigned int) sg_dma_address(sg)); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } - if (!sg_dma_len(&chunk->page_list[nmap_index])) { - ib_umem_release(region); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, - stag_index); - nes_debug(NES_DBG_MR, "Invalid Buffer Size\n"); - ibmr = ERR_PTR(-EINVAL); - kfree(nesmr); - goto reg_user_mr_err; - } + if (!sg_dma_len(sg)) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + nes_debug(NES_DBG_MR, "Invalid Buffer Size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } - region_length += sg_dma_len(&chunk->page_list[nmap_index]); - chunk_pages = sg_dma_len(&chunk->page_list[nmap_index]) >> 12; - region_length -= skip_pages << 12; - for (page_index=skip_pages; page_index < chunk_pages; page_index++) { - skip_pages = 0; - if ((page_count!=0)&&(page_count<<12)-(region->offset&(4096-1))>=region->length) - goto enough_pages; - if ((page_count&0x01FF) == 0) { - if (page_count >= 1024 * 512) { + region_length += sg_dma_len(sg); + chunk_pages = sg_dma_len(sg) >> 12; + region_length -= skip_pages << 12; + for (page_index = skip_pages; page_index < chunk_pages; page_index++) { + skip_pages = 0; + if ((page_count != 0) && (page_count<<12)-(region->offset&(4096-1)) >= region->length) + goto enough_pages; + if ((page_count&0x01FF) == 0) { + if (page_count >= 1024 * 512) { + ib_umem_release(region); + nes_free_resource(nesadapter, + nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-E2BIG); + goto reg_user_mr_err; + } + if (root_pbl_index == 1) { + root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, + 8192, &root_vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", + root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); + if (!root_vpbl.pbl_vbase) { ib_umem_release(region); - nes_free_resource(nesadapter, - nesadapter->allocated_mrs, stag_index); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); kfree(nesmr); - ibmr = ERR_PTR(-E2BIG); + ibmr = ERR_PTR(-ENOMEM); goto reg_user_mr_err; } - if (root_pbl_index == 1) { - root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, - 8192, &root_vpbl.pbl_pbase); - nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", - root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); - if (!root_vpbl.pbl_vbase) { - ib_umem_release(region); - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, - vpbl.pbl_pbase); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, - stag_index); - kfree(nesmr); - ibmr = ERR_PTR(-ENOMEM); - goto reg_user_mr_err; - } - root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, - GFP_KERNEL); - if (!root_vpbl.leaf_vpbl) { - ib_umem_release(region); - pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, - root_vpbl.pbl_pbase); - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, - vpbl.pbl_pbase); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, - stag_index); - kfree(nesmr); - ibmr = ERR_PTR(-ENOMEM); - goto reg_user_mr_err; - } - root_vpbl.pbl_vbase[0].pa_low = - cpu_to_le32((u32)vpbl.pbl_pbase); - root_vpbl.pbl_vbase[0].pa_high = - cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); - root_vpbl.leaf_vpbl[0] = vpbl; - } - vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, - &vpbl.pbl_pbase); - nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n", - vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase); - if (!vpbl.pbl_vbase) { + root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, + GFP_KERNEL); + if (!root_vpbl.leaf_vpbl) { ib_umem_release(region); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - ibmr = ERR_PTR(-ENOMEM); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); goto reg_user_mr_err; } - if (1 <= root_pbl_index) { - root_vpbl.pbl_vbase[root_pbl_index].pa_low = - cpu_to_le32((u32)vpbl.pbl_pbase); - root_vpbl.pbl_vbase[root_pbl_index].pa_high = - cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); - root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; - } - root_pbl_index++; - cur_pbl_index = 0; + root_vpbl.pbl_vbase[0].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[0].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); + root_vpbl.leaf_vpbl[0] = vpbl; } - if (single_page) { - if (page_count != 0) { - if ((last_dma_addr+4096) != - (sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096))) - single_page = 0; - last_dma_addr = sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096); - } else { - first_dma_addr = sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096); - last_dma_addr = first_dma_addr; - } + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n", + vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_user_mr_err; + } + if (1 <= root_pbl_index) { + root_vpbl.pbl_vbase[root_pbl_index].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[root_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); + root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; + } + root_pbl_index++; + cur_pbl_index = 0; + } + if (single_page) { + if (page_count != 0) { + if ((last_dma_addr+4096) != + (sg_dma_address(sg)+ + (page_index*4096))) + single_page = 0; + last_dma_addr = sg_dma_address(sg)+ + (page_index*4096); + } else { + first_dma_addr = sg_dma_address(sg)+ + (page_index*4096); + last_dma_addr = first_dma_addr; } - - vpbl.pbl_vbase[cur_pbl_index].pa_low = - cpu_to_le32((u32)(sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096))); - vpbl.pbl_vbase[cur_pbl_index].pa_high = - cpu_to_le32((u32)((((u64)(sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096))) >> 32))); - cur_pbl_index++; - page_count++; } + + vpbl.pbl_vbase[cur_pbl_index].pa_low = + cpu_to_le32((u32)(sg_dma_address(sg)+ + (page_index*4096))); + vpbl.pbl_vbase[cur_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)(sg_dma_address(sg)+ + (page_index*4096))) >> 32))); + cur_pbl_index++; + page_count++; } } + enough_pages: nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x," " stag_key=0x%08x\n", @@ -2559,6 +2565,11 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ibmr; case IWNES_MEMREG_TYPE_QP: case IWNES_MEMREG_TYPE_CQ: + if (!region->length) { + nes_debug(NES_DBG_MR, "Unable to register zero length region for CQ\n"); + ib_umem_release(region); + return ERR_PTR(-EINVAL); + } nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL); if (!nespbl) { nes_debug(NES_DBG_MR, "Unable to allocate PBL\n"); @@ -2602,25 +2613,28 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase, (void *) nespbl->pbl_vbase, nespbl->user_base); - list_for_each_entry(chunk, ®ion->chunk_list, list) { - for (nmap_index = 0; nmap_index < chunk->nmap; ++nmap_index) { - chunk_pages = sg_dma_len(&chunk->page_list[nmap_index]) >> 12; - chunk_pages += (sg_dma_len(&chunk->page_list[nmap_index]) & (4096-1)) ? 1 : 0; - nespbl->page = sg_page(&chunk->page_list[0]); - for (page_index=0; page_index<chunk_pages; page_index++) { - ((__le32 *)pbl)[0] = cpu_to_le32((u32) - (sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096))); - ((__le32 *)pbl)[1] = cpu_to_le32(((u64) - (sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096)))>>32); - nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl, - (unsigned long long)*pbl, - le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0])); - pbl++; - } + for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { + chunk_pages = sg_dma_len(sg) >> 12; + chunk_pages += (sg_dma_len(sg) & (4096-1)) ? 1 : 0; + if (first_page) { + nespbl->page = sg_page(sg); + first_page = 0; + } + + for (page_index = 0; page_index < chunk_pages; page_index++) { + ((__le32 *)pbl)[0] = cpu_to_le32((u32) + (sg_dma_address(sg)+ + (page_index*4096))); + ((__le32 *)pbl)[1] = cpu_to_le32(((u64) + (sg_dma_address(sg)+ + (page_index*4096)))>>32); + nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl, + (unsigned long long)*pbl, + le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0])); + pbl++; } } + if (req.reg_type == IWNES_MEMREG_TYPE_QP) { list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list); } else { @@ -2823,7 +2837,7 @@ static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->qp_context = nesqp->ibqp.qp_context; init_attr->send_cq = nesqp->ibqp.send_cq; init_attr->recv_cq = nesqp->ibqp.recv_cq; - init_attr->srq = nesqp->ibqp.srq = nesqp->ibqp.srq; + init_attr->srq = nesqp->ibqp.srq; init_attr->cap = attr->cap; return 0; @@ -3123,9 +3137,7 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), original_last_aeq, nesqp->last_aeq); - if ((!ret) || - ((original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) && - (ret))) { + if (!ret || original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { if (dont_wait) { if (nesqp->cm_id && nesqp->hw_tcp_state != 0) { nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d)," diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 0eff7c44d76..309b31c31ae 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -184,5 +184,6 @@ struct nes_qp { u8 pau_busy; u8 pau_pending; u8 pau_state; + __u64 nesuqp_addr; }; #endif /* NES_VERBS_H */ diff --git a/drivers/infiniband/hw/ocrdma/Kconfig b/drivers/infiniband/hw/ocrdma/Kconfig index b5b6056c851..c0cddc0192d 100644 --- a/drivers/infiniband/hw/ocrdma/Kconfig +++ b/drivers/infiniband/hw/ocrdma/Kconfig @@ -1,6 +1,6 @@ config INFINIBAND_OCRDMA tristate "Emulex One Connect HCA support" - depends on ETHERNET && NETDEVICES && PCI && (IPV6 || IPV6=n) + depends on ETHERNET && NETDEVICES && PCI && INET && (IPV6 || IPV6=n) select NET_VENDOR_EMULEX select BE2NET ---help--- diff --git a/drivers/infiniband/hw/ocrdma/Makefile b/drivers/infiniband/hw/ocrdma/Makefile index 06a5bed12e4..d1bfd4f4cdd 100644 --- a/drivers/infiniband/hw/ocrdma/Makefile +++ b/drivers/infiniband/hw/ocrdma/Makefile @@ -2,4 +2,4 @@ ccflags-y := -Idrivers/net/ethernet/emulex/benet obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma.o -ocrdma-y := ocrdma_main.o ocrdma_verbs.o ocrdma_hw.o ocrdma_ah.o +ocrdma-y := ocrdma_main.o ocrdma_verbs.o ocrdma_hw.o ocrdma_ah.o ocrdma_stats.o diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 48970af2367..19011dbb930 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -35,19 +35,27 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> #include <be_roce.h> #include "ocrdma_sli.h" -#define OCRDMA_ROCE_DEV_VERSION "1.0.0" +#define OCRDMA_ROCE_DRV_VERSION "10.2.145.0u" + +#define OCRDMA_ROCE_DRV_DESC "Emulex OneConnect RoCE Driver" #define OCRDMA_NODE_DESC "Emulex OneConnect RoCE HCA" -#define ocrdma_err(format, arg...) printk(KERN_ERR format, ##arg) +#define OC_NAME_SH OCRDMA_NODE_DESC "(Skyhawk)" +#define OC_NAME_UNKNOWN OCRDMA_NODE_DESC "(Unknown)" +#define OC_SKH_DEVICE_PF 0x720 +#define OC_SKH_DEVICE_VF 0x728 #define OCRDMA_MAX_AH 512 #define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) +#define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo) + struct ocrdma_dev_attr { u8 fw_ver[32]; u32 vendor_id; @@ -58,13 +66,16 @@ struct ocrdma_dev_attr { u16 max_qp; u16 max_wqe; u16 max_rqe; + u16 max_srq; u32 max_inline_data; int max_send_sge; int max_recv_sge; int max_srq_sge; + int max_rdma_sge; int max_mr; u64 max_mr_size; u32 max_num_mr_pbl; + int max_mw; int max_fmr; int max_map_per_fmr; int max_pages_per_frmr; @@ -83,6 +94,12 @@ struct ocrdma_dev_attr { u8 num_ird_pages; }; +struct ocrdma_dma_mem { + void *va; + dma_addr_t pa; + u32 size; +}; + struct ocrdma_pbl { void *va; dma_addr_t pa; @@ -97,7 +114,6 @@ struct ocrdma_queue_info { u16 id; /* qid, where to ring the doorbell. */ u16 head, tail; bool created; - atomic_t used; /* Number of valid elements in the queue */ }; struct ocrdma_eq { @@ -123,6 +139,52 @@ struct mqe_ctx { bool cmd_done; }; +struct ocrdma_hw_mr { + u32 lkey; + u8 fr_mr; + u8 remote_atomic; + u8 remote_rd; + u8 remote_wr; + u8 local_rd; + u8 local_wr; + u8 mw_bind; + u8 rsvd; + u64 len; + struct ocrdma_pbl *pbl_table; + u32 num_pbls; + u32 num_pbes; + u32 pbl_size; + u32 pbe_size; + u64 fbo; + u64 va; +}; + +struct ocrdma_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct ocrdma_hw_mr hwmr; +}; + +struct ocrdma_stats { + u8 type; + struct ocrdma_dev *dev; +}; + +struct stats_mem { + struct ocrdma_mqe mqe; + void *va; + dma_addr_t pa; + u32 size; + char *debugfs_mem; +}; + +struct phy_info { + u16 auto_speeds_supported; + u16 fixed_speeds_supported; + u16 phy_type; + u16 interface_type; +}; + struct ocrdma_dev { struct ib_device ibdev; struct ocrdma_dev_attr attr; @@ -133,8 +195,7 @@ struct ocrdma_dev { struct ocrdma_cq **cq_tbl; struct ocrdma_qp **qp_tbl; - struct ocrdma_eq meq; - struct ocrdma_eq *qp_eq_tbl; + struct ocrdma_eq *eq_tbl; int eq_cnt; u16 base_eqid; u16 max_eq; @@ -167,15 +228,34 @@ struct ocrdma_dev { struct mqe_ctx mqe_ctx; struct be_dev_info nic_info; + struct phy_info phy; + char model_number[32]; + u32 hba_port_num; struct list_head entry; struct rcu_head rcu; int id; + u64 stag_arr[OCRDMA_MAX_STAG]; + u16 pvid; + u32 asic_id; + + ulong last_stats_time; + struct mutex stats_lock; /* provide synch for debugfs operations */ + struct stats_mem stats_mem; + struct ocrdma_stats rsrc_stats; + struct ocrdma_stats rx_stats; + struct ocrdma_stats wqe_stats; + struct ocrdma_stats tx_stats; + struct ocrdma_stats db_err_stats; + struct ocrdma_stats tx_qp_err_stats; + struct ocrdma_stats rx_qp_err_stats; + struct ocrdma_stats tx_dbg_stats; + struct ocrdma_stats rx_dbg_stats; + struct dentry *dir; }; struct ocrdma_cq { struct ib_cq ibcq; - struct ocrdma_dev *dev; struct ocrdma_cqe *va; u32 phase; u32 getp; /* pointer to pending wrs to @@ -184,8 +264,8 @@ struct ocrdma_cq { */ u32 max_hw_cqe; bool phase_change; - bool armed, solicited; - bool arm_needed; + bool deferred_arm, deferred_sol; + bool first_arm; spinlock_t cq_lock ____cacheline_aligned; /* provide synchronization * to cq polling @@ -198,7 +278,7 @@ struct ocrdma_cq { struct ocrdma_ucontext *ucontext; dma_addr_t pa; u32 len; - atomic_t use_cnt; + u32 cqe_cnt; /* head of all qp's sq and rq for which cqes need to be flushed * by the software. @@ -208,9 +288,7 @@ struct ocrdma_cq { struct ocrdma_pd { struct ib_pd ibpd; - struct ocrdma_dev *dev; struct ocrdma_ucontext *uctx; - atomic_t use_cnt; u32 id; int num_dpp_qp; u32 dpp_page; @@ -219,7 +297,6 @@ struct ocrdma_pd { struct ocrdma_ah { struct ib_ah ibah; - struct ocrdma_dev *dev; struct ocrdma_av *av; u16 sgid_index; u32 id; @@ -239,18 +316,17 @@ struct ocrdma_qp_hwq_info { struct ocrdma_srq { struct ib_srq ibsrq; - struct ocrdma_dev *dev; u8 __iomem *db; + struct ocrdma_qp_hwq_info rq; + u64 *rqe_wr_id_tbl; + u32 *idx_bit_fields; + u32 bit_fields_len; + /* provide synchronization to multiple context(s) posting rqe */ spinlock_t q_lock ____cacheline_aligned; - struct ocrdma_qp_hwq_info rq; struct ocrdma_pd *pd; - atomic_t use_cnt; u32 id; - u64 *rqe_wr_id_tbl; - u32 *idx_bit_fields; - u32 bit_fields_len; }; struct ocrdma_qp { @@ -258,8 +334,6 @@ struct ocrdma_qp { struct ocrdma_dev *dev; u8 __iomem *sq_db; - /* provide synchronization to multiple context(s) posting wqe, rqe */ - spinlock_t q_lock ____cacheline_aligned; struct ocrdma_qp_hwq_info sq; struct { uint64_t wrid; @@ -269,6 +343,9 @@ struct ocrdma_qp { uint8_t rsvd[3]; } *wqe_wr_id_tbl; u32 max_inline_data; + + /* provide synchronization to multiple context(s) posting wqe, rqe */ + spinlock_t q_lock ____cacheline_aligned; struct ocrdma_cq *sq_cq; /* list maintained per CQ to flush SQ errors */ struct list_head sq_entry; @@ -294,46 +371,17 @@ struct ocrdma_qp { u32 qkey; bool dpp_enabled; u8 *ird_q_va; -}; - -#define OCRDMA_GET_NUM_POSTED_SHIFT_VAL(qp) \ - (((qp->dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) && \ - (qp->id < 64)) ? 24 : 16) - -struct ocrdma_hw_mr { - struct ocrdma_dev *dev; - u32 lkey; - u8 fr_mr; - u8 remote_atomic; - u8 remote_rd; - u8 remote_wr; - u8 local_rd; - u8 local_wr; - u8 mw_bind; - u8 rsvd; - u64 len; - struct ocrdma_pbl *pbl_table; - u32 num_pbls; - u32 num_pbes; - u32 pbl_size; - u32 pbe_size; - u64 fbo; - u64 va; -}; - -struct ocrdma_mr { - struct ib_mr ibmr; - struct ib_umem *umem; - struct ocrdma_hw_mr hwmr; - struct ocrdma_pd *pd; + bool signaled; }; struct ocrdma_ucontext { struct ib_ucontext ibucontext; - struct ocrdma_dev *dev; struct list_head mm_head; struct mutex mm_list_lock; /* protects list entries of mm type */ + struct ocrdma_pd *cntxt_pd; + int pd_in_use; + struct { u32 *va; dma_addr_t pa; @@ -390,4 +438,84 @@ static inline struct ocrdma_srq *get_ocrdma_srq(struct ib_srq *ibsrq) return container_of(ibsrq, struct ocrdma_srq, ibsrq); } +static inline int is_cqe_valid(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe) +{ + int cqe_valid; + cqe_valid = le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_VALID; + return (cqe_valid == cq->phase); +} + +static inline int is_cqe_for_sq(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_QTYPE) ? 0 : 1; +} + +static inline int is_cqe_invalidated(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_INVALIDATE) ? 1 : 0; +} + +static inline int is_cqe_imm(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_IMM) ? 1 : 0; +} + +static inline int is_cqe_wr_imm(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_WRITE_IMM) ? 1 : 0; +} + +static inline int ocrdma_resolve_dmac(struct ocrdma_dev *dev, + struct ib_ah_attr *ah_attr, u8 *mac_addr) +{ + struct in6_addr in6; + + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) + rdma_get_mcast_mac(&in6, mac_addr); + else + memcpy(mac_addr, ah_attr->dmac, ETH_ALEN); + return 0; +} + +static inline char *hca_name(struct ocrdma_dev *dev) +{ + switch (dev->nic_info.pdev->device) { + case OC_SKH_DEVICE_PF: + case OC_SKH_DEVICE_VF: + return OC_NAME_SH; + default: + return OC_NAME_UNKNOWN; + } +} + +static inline int ocrdma_get_eq_table_index(struct ocrdma_dev *dev, + int eqid) +{ + int indx; + + for (indx = 0; indx < dev->eq_cnt; indx++) { + if (dev->eq_tbl[indx].q.id == eqid) + return indx; + } + + return -EINVAL; +} + +static inline u8 ocrdma_get_asic_type(struct ocrdma_dev *dev) +{ + if (dev->nic_info.dev_family == 0xF && !dev->asic_id) { + pci_read_config_dword( + dev->nic_info.pdev, + OCRDMA_SLI_ASIC_ID_OFFSET, &dev->asic_id); + } + + return (dev->asic_id & OCRDMA_SLI_ASIC_GEN_NUM_MASK) >> + OCRDMA_SLI_ASIC_GEN_NUM_SHIFT; +} + #endif diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h index 517ab20b727..1554cca5712 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h @@ -28,6 +28,10 @@ #ifndef __OCRDMA_ABI_H__ #define __OCRDMA_ABI_H__ +#define OCRDMA_ABI_VERSION 2 +#define OCRDMA_BE_ROCE_ABI_VERSION 1 +/* user kernel communication data structures. */ + struct ocrdma_alloc_ucontext_resp { u32 dev_id; u32 wqe_size; @@ -35,16 +39,16 @@ struct ocrdma_alloc_ucontext_resp { u32 dpp_wqe_size; u64 ah_tbl_page; u32 ah_tbl_len; - u32 rsvd; - u8 fw_ver[32]; u32 rqe_size; + u8 fw_ver[32]; + /* for future use/new features in progress */ u64 rsvd1; -} __packed; + u64 rsvd2; +}; -/* user kernel communication data structures. */ struct ocrdma_alloc_pd_ureq { u64 rsvd1; -} __packed; +}; struct ocrdma_alloc_pd_uresp { u32 id; @@ -52,12 +56,12 @@ struct ocrdma_alloc_pd_uresp { u32 dpp_page_addr_hi; u32 dpp_page_addr_lo; u64 rsvd1; -} __packed; +}; struct ocrdma_create_cq_ureq { u32 dpp_cq; - u32 rsvd; -} __packed; + u32 rsvd; /* pad */ +}; #define MAX_CQ_PAGES 8 struct ocrdma_create_cq_uresp { @@ -69,9 +73,10 @@ struct ocrdma_create_cq_uresp { u64 db_page_addr; u32 db_page_size; u32 phase_change; + /* for future use/new features in progress */ u64 rsvd1; u64 rsvd2; -} __packed; +}; #define MAX_QP_PAGES 8 #define MAX_UD_AV_PAGES 8 @@ -80,14 +85,14 @@ struct ocrdma_create_qp_ureq { u8 enable_dpp_cq; u8 rsvd; u16 dpp_cq_id; - u32 rsvd1; + u32 rsvd1; /* pad */ }; struct ocrdma_create_qp_uresp { u16 qp_id; u16 sq_dbid; u16 rq_dbid; - u16 resv0; + u16 resv0; /* pad */ u32 sq_page_size; u32 rq_page_size; u32 num_sq_pages; @@ -98,19 +103,17 @@ struct ocrdma_create_qp_uresp { u32 db_page_size; u32 dpp_credit; u32 dpp_offset; - u32 rsvd1; u32 num_wqe_allocated; u32 num_rqe_allocated; u32 db_sq_offset; u32 db_rq_offset; u32 db_shift; - u64 rsvd2; - u64 rsvd3; + u64 rsvd[11]; } __packed; struct ocrdma_create_srq_uresp { u16 rq_dbid; - u16 resv0; + u16 resv0; /* pad */ u32 resv1; u32 rq_page_size; @@ -126,6 +129,6 @@ struct ocrdma_create_srq_uresp { u64 rsvd2; u64 rsvd3; -} __packed; +}; #endif /* __OCRDMA_ABI_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index a877a8ed790..d4cc01f10c0 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -29,19 +29,17 @@ #include <net/netevent.h> #include <rdma/ib_addr.h> -#include <rdma/ib_cache.h> #include "ocrdma.h" #include "ocrdma_verbs.h" #include "ocrdma_ah.h" #include "ocrdma_hw.h" -static inline int set_av_attr(struct ocrdma_ah *ah, +static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, struct ib_ah_attr *attr, int pdid) { int status = 0; u16 vlan_tag; bool vlan_enabled = false; - struct ocrdma_dev *dev = ah->dev; struct ocrdma_eth_vlan eth; struct ocrdma_grh grh; int eth_sz; @@ -51,7 +49,9 @@ static inline int set_av_attr(struct ocrdma_ah *ah, ah->sgid_index = attr->grh.sgid_index; - vlan_tag = rdma_get_vlan_id(&attr->grh.dgid); + vlan_tag = attr->vlan_id; + if (!vlan_tag || (vlan_tag > 0xFFF)) + vlan_tag = dev->pvid; if (vlan_tag && (vlan_tag < 0x1000)) { eth.eth_type = cpu_to_be16(0x8100); eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); @@ -64,7 +64,8 @@ static inline int set_av_attr(struct ocrdma_ah *ah, eth_sz = sizeof(struct ocrdma_eth_basic); } memcpy(ð.smac[0], &dev->nic_info.mac_addr[0], ETH_ALEN); - status = ocrdma_resolve_dgid(dev, &attr->grh.dgid, ð.dmac[0]); + memcpy(ð.dmac[0], attr->dmac, ETH_ALEN); + status = ocrdma_resolve_dmac(dev, attr, ð.dmac[0]); if (status) return status; status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index, @@ -84,6 +85,7 @@ static inline int set_av_attr(struct ocrdma_ah *ah, memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh)); if (vlan_enabled) ah->av->valid |= OCRDMA_AV_VLAN_VALID; + ah->av->valid = cpu_to_le32(ah->av->valid); return status; } @@ -93,20 +95,19 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) int status; struct ocrdma_ah *ah; struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); - struct ocrdma_dev *dev = pd->dev; + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); if (!(attr->ah_flags & IB_AH_GRH)) return ERR_PTR(-EINVAL); - ah = kzalloc(sizeof *ah, GFP_ATOMIC); + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); - ah->dev = pd->dev; status = ocrdma_alloc_av(dev, ah); if (status) goto av_err; - status = set_av_attr(ah, attr, pd->id); + status = set_av_attr(dev, ah, attr, pd->id); if (status) goto av_conf_err; @@ -127,7 +128,9 @@ av_err: int ocrdma_destroy_ah(struct ib_ah *ibah) { struct ocrdma_ah *ah = get_ocrdma_ah(ibah); - ocrdma_free_av(ah->dev, ah); + struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device); + + ocrdma_free_av(dev, ah); kfree(ah); return 0; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 71942af4fce..3bbf2010a82 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -32,7 +32,6 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> -#include <rdma/ib_addr.h> #include "ocrdma.h" #include "ocrdma_hw.h" @@ -94,7 +93,7 @@ enum cqe_status { static inline void *ocrdma_get_eqe(struct ocrdma_eq *eq) { - return (u8 *)eq->q.va + (eq->q.tail * sizeof(struct ocrdma_eqe)); + return eq->q.va + (eq->q.tail * sizeof(struct ocrdma_eqe)); } static inline void ocrdma_eq_inc_tail(struct ocrdma_eq *eq) @@ -105,8 +104,7 @@ static inline void ocrdma_eq_inc_tail(struct ocrdma_eq *eq) static inline void *ocrdma_get_mcqe(struct ocrdma_dev *dev) { struct ocrdma_mcqe *cqe = (struct ocrdma_mcqe *) - ((u8 *) dev->mq.cq.va + - (dev->mq.cq.tail * sizeof(struct ocrdma_mcqe))); + (dev->mq.cq.va + (dev->mq.cq.tail * sizeof(struct ocrdma_mcqe))); if (!(le32_to_cpu(cqe->valid_ae_cmpl_cons) & OCRDMA_MCQE_VALID_MASK)) return NULL; @@ -120,21 +118,17 @@ static inline void ocrdma_mcq_inc_tail(struct ocrdma_dev *dev) static inline struct ocrdma_mqe *ocrdma_get_mqe(struct ocrdma_dev *dev) { - return (struct ocrdma_mqe *)((u8 *) dev->mq.sq.va + - (dev->mq.sq.head * - sizeof(struct ocrdma_mqe))); + return dev->mq.sq.va + (dev->mq.sq.head * sizeof(struct ocrdma_mqe)); } static inline void ocrdma_mq_inc_head(struct ocrdma_dev *dev) { dev->mq.sq.head = (dev->mq.sq.head + 1) & (OCRDMA_MQ_LEN - 1); - atomic_inc(&dev->mq.sq.used); } static inline void *ocrdma_get_mqe_rsp(struct ocrdma_dev *dev) { - return (void *)((u8 *) dev->mq.sq.va + - (dev->mqe_ctx.tag * sizeof(struct ocrdma_mqe))); + return dev->mq.sq.va + (dev->mqe_ctx.tag * sizeof(struct ocrdma_mqe)); } enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps) @@ -155,7 +149,7 @@ enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps) return IB_QPS_SQE; case OCRDMA_QPS_ERR: return IB_QPS_ERR; - }; + } return IB_QPS_ERR; } @@ -176,13 +170,13 @@ static enum ocrdma_qp_state get_ocrdma_qp_state(enum ib_qp_state qps) return OCRDMA_QPS_SQE; case IB_QPS_ERR: return OCRDMA_QPS_ERR; - }; + } return OCRDMA_QPS_ERR; } static int ocrdma_get_mbx_errno(u32 status) { - int err_num = -EFAULT; + int err_num; u8 mbox_status = (status & OCRDMA_MBX_RSP_STATUS_MASK) >> OCRDMA_MBX_RSP_STATUS_SHIFT; u8 add_status = (status & OCRDMA_MBX_RSP_ASTATUS_MASK) >> @@ -248,6 +242,23 @@ static int ocrdma_get_mbx_errno(u32 status) return err_num; } +char *port_speed_string(struct ocrdma_dev *dev) +{ + char *str = ""; + u16 speeds_supported; + + speeds_supported = dev->phy.fixed_speeds_supported | + dev->phy.auto_speeds_supported; + if (speeds_supported & OCRDMA_PHY_SPEED_40GBPS) + str = "40Gbps "; + else if (speeds_supported & OCRDMA_PHY_SPEED_10GBPS) + str = "10Gbps "; + else if (speeds_supported & OCRDMA_PHY_SPEED_1GBPS) + str = "1Gbps "; + + return str; +} + static int ocrdma_get_mbx_cqe_errno(u16 cqe_status) { int err_num = -EINVAL; @@ -261,10 +272,11 @@ static int ocrdma_get_mbx_cqe_errno(u16 cqe_status) break; case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES: case OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING: - err_num = -EAGAIN; + err_num = -EINVAL; break; case OCRDMA_MBX_CQE_STATUS_DMA_FAILED: - err_num = -EIO; + default: + err_num = -EINVAL; break; } return err_num; @@ -336,6 +348,11 @@ static void *ocrdma_init_emb_mqe(u8 opcode, u32 cmd_len) return mqe; } +static void *ocrdma_alloc_mqe(void) +{ + return kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL); +} + static void ocrdma_free_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q) { dma_free_coherent(&dev->nic_info.pdev->dev, q->size, q->va, q->dma); @@ -368,24 +385,8 @@ static void ocrdma_build_q_pages(struct ocrdma_pa *q_pa, int cnt, } } -static void ocrdma_assign_eq_vect_gen2(struct ocrdma_dev *dev, - struct ocrdma_eq *eq) -{ - /* assign vector and update vector id for next EQ */ - eq->vector = dev->nic_info.msix.start_vector; - dev->nic_info.msix.start_vector += 1; -} - -static void ocrdma_free_eq_vect_gen2(struct ocrdma_dev *dev) -{ - /* this assumes that EQs are freed in exactly reverse order - * as its allocation. - */ - dev->nic_info.msix.start_vector -= 1; -} - -static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q, - int queue_type) +static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev, + struct ocrdma_queue_info *q, int queue_type) { u8 opcode = 0; int status; @@ -424,11 +425,8 @@ static int ocrdma_mbx_create_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) memset(cmd, 0, sizeof(*cmd)); ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_EQ, OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) - cmd->req.rsvd_version = 0; - else - cmd->req.rsvd_version = 2; + cmd->req.rsvd_version = 2; cmd->num_pages = 4; cmd->valid = OCRDMA_CREATE_EQ_VALID; cmd->cnt = 4 << OCRDMA_CREATE_EQ_CNT_SHIFT; @@ -439,12 +437,7 @@ static int ocrdma_mbx_create_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) NULL); if (!status) { eq->q.id = rsp->vector_eqid & 0xffff; - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) - ocrdma_assign_eq_vect_gen2(dev, eq); - else { - eq->vector = (rsp->vector_eqid >> 16) & 0xffff; - dev->nic_info.msix.start_vector += 1; - } + eq->vector = (rsp->vector_eqid >> 16) & 0xffff; eq->q.created = true; } return status; @@ -472,7 +465,7 @@ mbx_err: return status; } -static int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) { int irq; @@ -487,8 +480,6 @@ static void _ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) { if (eq->q.created) { ocrdma_mbx_delete_q(dev, &eq->q, QTYPE_EQ); - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) - ocrdma_free_eq_vect_gen2(dev); ocrdma_free_q(dev, &eq->q); } } @@ -507,13 +498,12 @@ static void ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) _ocrdma_destroy_eq(dev, eq); } -static void ocrdma_destroy_qp_eqs(struct ocrdma_dev *dev) +static void ocrdma_destroy_eqs(struct ocrdma_dev *dev) { int i; - /* deallocate the data path eqs */ for (i = 0; i < dev->eq_cnt; i++) - ocrdma_destroy_eq(dev, &dev->qp_eq_tbl[i]); + ocrdma_destroy_eq(dev, &dev->eq_tbl[i]); } static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev, @@ -528,16 +518,21 @@ static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev, ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_CQ, OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); - cmd->pgsz_pgcnt = PAGES_4K_SPANNED(cq->va, cq->size); + cmd->req.rsvd_version = OCRDMA_CREATE_CQ_VER2; + cmd->pgsz_pgcnt = (cq->size / OCRDMA_MIN_Q_PAGE_SIZE) << + OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT; + cmd->pgsz_pgcnt |= PAGES_4K_SPANNED(cq->va, cq->size); + cmd->ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS; - cmd->eqn = (eq->id << OCRDMA_CREATE_CQ_EQID_SHIFT); + cmd->eqn = eq->id; + cmd->cqe_count = cq->size / sizeof(struct ocrdma_mcqe); - ocrdma_build_q_pages(&cmd->pa[0], cmd->pgsz_pgcnt, + ocrdma_build_q_pages(&cmd->pa[0], cq->size / OCRDMA_MIN_Q_PAGE_SIZE, cq->dma, PAGE_SIZE_4K); status = be_roce_mcc_cmd(dev->nic_info.netdev, cmd, sizeof(*cmd), NULL, NULL); if (!status) { - cq->id = (rsp->cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK); + cq->id = (u16) (rsp->cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK); cq->created = true; } return status; @@ -564,32 +559,22 @@ static int ocrdma_mbx_create_mq(struct ocrdma_dev *dev, memset(cmd, 0, sizeof(*cmd)); num_pages = PAGES_4K_SPANNED(mq->va, mq->size); - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { - ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_MQ, - OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); - cmd->v0.pages = num_pages; - cmd->v0.async_cqid_valid = OCRDMA_CREATE_MQ_ASYNC_CQ_VALID; - cmd->v0.async_cqid_valid = (cq->id << 1); - cmd->v0.cqid_ringsize |= (ocrdma_encoded_q_len(mq->len) << - OCRDMA_CREATE_MQ_RING_SIZE_SHIFT); - cmd->v0.cqid_ringsize |= - (cq->id << OCRDMA_CREATE_MQ_V0_CQ_ID_SHIFT); - cmd->v0.valid = OCRDMA_CREATE_MQ_VALID; - pa = &cmd->v0.pa[0]; - } else { - ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_MQ_EXT, - OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); - cmd->req.rsvd_version = 1; - cmd->v1.cqid_pages = num_pages; - cmd->v1.cqid_pages |= (cq->id << OCRDMA_CREATE_MQ_CQ_ID_SHIFT); - cmd->v1.async_cqid_valid = OCRDMA_CREATE_MQ_ASYNC_CQ_VALID; - cmd->v1.async_event_bitmap = Bit(20); - cmd->v1.async_cqid_ringsize = cq->id; - cmd->v1.async_cqid_ringsize |= (ocrdma_encoded_q_len(mq->len) << - OCRDMA_CREATE_MQ_RING_SIZE_SHIFT); - cmd->v1.valid = OCRDMA_CREATE_MQ_VALID; - pa = &cmd->v1.pa[0]; - } + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_MQ_EXT, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + cmd->req.rsvd_version = 1; + cmd->cqid_pages = num_pages; + cmd->cqid_pages |= (cq->id << OCRDMA_CREATE_MQ_CQ_ID_SHIFT); + cmd->async_cqid_valid = OCRDMA_CREATE_MQ_ASYNC_CQ_VALID; + + cmd->async_event_bitmap = Bit(OCRDMA_ASYNC_GRP5_EVE_CODE); + cmd->async_event_bitmap |= Bit(OCRDMA_ASYNC_RDMA_EVE_CODE); + + cmd->async_cqid_ringsize = cq->id; + cmd->async_cqid_ringsize |= (ocrdma_encoded_q_len(mq->len) << + OCRDMA_CREATE_MQ_RING_SIZE_SHIFT); + cmd->valid = OCRDMA_CREATE_MQ_VALID; + pa = &cmd->pa[0]; + ocrdma_build_q_pages(pa, num_pages, mq->dma, PAGE_SIZE_4K); status = be_roce_mcc_cmd(dev->nic_info.netdev, cmd, sizeof(*cmd), NULL, NULL); @@ -610,7 +595,8 @@ static int ocrdma_create_mq(struct ocrdma_dev *dev) if (status) goto alloc_err; - status = ocrdma_mbx_mq_cq_create(dev, &dev->mq.cq, &dev->meq.q); + dev->eq_tbl[0].cq_cnt++; + status = ocrdma_mbx_mq_cq_create(dev, &dev->mq.cq, &dev->eq_tbl[0].q); if (status) goto mbx_cq_free; @@ -667,7 +653,7 @@ static void ocrdma_process_qpcat_error(struct ocrdma_dev *dev, if (qp == NULL) BUG(); - ocrdma_qp_state_machine(qp, new_ib_qps, &old_ib_qps); + ocrdma_qp_state_change(qp, new_ib_qps, &old_ib_qps); } static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, @@ -675,7 +661,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, { struct ocrdma_qp *qp = NULL; struct ocrdma_cq *cq = NULL; - struct ib_event ib_evt; + struct ib_event ib_evt = { 0 }; int cq_event = 0; int qp_event = 1; int srq_event = 0; @@ -700,6 +686,8 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, case OCRDMA_CQ_OVERRUN_ERROR: ib_evt.element.cq = &cq->ibcq; ib_evt.event = IB_EVENT_CQ_ERR; + cq_event = 1; + qp_event = 0; break; case OCRDMA_CQ_QPCAT_ERROR: ib_evt.element.qp = &qp->ibqp; @@ -745,7 +733,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, qp_event = 0; srq_event = 0; dev_event = 0; - ocrdma_err("%s() unknown type=0x%x\n", __func__, type); + pr_err("%s() unknown type=0x%x\n", __func__, type); break; } @@ -760,11 +748,35 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, qp->srq->ibsrq.event_handler(&ib_evt, qp->srq->ibsrq. srq_context); - } else if (dev_event) + } else if (dev_event) { + pr_err("%s: Fatal event received\n", dev->ibdev.name); ib_dispatch_event(&ib_evt); + } } +static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev, + struct ocrdma_ae_mcqe *cqe) +{ + struct ocrdma_ae_pvid_mcqe *evt; + int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >> + OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT; + + switch (type) { + case OCRDMA_ASYNC_EVENT_PVID_STATE: + evt = (struct ocrdma_ae_pvid_mcqe *)cqe; + if ((evt->tag_enabled & OCRDMA_AE_PVID_MCQE_ENABLED_MASK) >> + OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT) + dev->pvid = ((evt->tag_enabled & + OCRDMA_AE_PVID_MCQE_TAG_MASK) >> + OCRDMA_AE_PVID_MCQE_TAG_SHIFT); + break; + default: + /* Not interested evts. */ + break; + } +} + static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe) { /* async CQE processing */ @@ -772,11 +784,13 @@ static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe) u32 evt_code = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_CODE_MASK) >> OCRDMA_AE_MCQE_EVENT_CODE_SHIFT; - if (evt_code == OCRDMA_ASYNC_EVE_CODE) + if (evt_code == OCRDMA_ASYNC_RDMA_EVE_CODE) ocrdma_dispatch_ibevent(dev, cqe); + else if (evt_code == OCRDMA_ASYNC_GRP5_EVE_CODE) + ocrdma_process_grp5_aync(dev, cqe); else - ocrdma_err("%s(%d) invalid evt code=0x%x\n", - __func__, dev->id, evt_code); + pr_err("%s(%d) invalid evt code=0x%x\n", __func__, + dev->id, evt_code); } static void ocrdma_process_mcqe(struct ocrdma_dev *dev, struct ocrdma_mcqe *cqe) @@ -790,8 +804,8 @@ static void ocrdma_process_mcqe(struct ocrdma_dev *dev, struct ocrdma_mcqe *cqe) dev->mqe_ctx.cmd_done = true; wake_up(&dev->mqe_ctx.cmd_wait); } else - ocrdma_err("%s() cqe for invalid tag0x%x.expected=0x%x\n", - __func__, cqe->tag_lo, dev->mqe_ctx.tag); + pr_err("%s() cqe for invalid tag0x%x.expected=0x%x\n", + __func__, cqe->tag_lo, dev->mqe_ctx.tag); } static int ocrdma_mq_cq_handler(struct ocrdma_dev *dev, u16 cq_id) @@ -809,8 +823,6 @@ static int ocrdma_mq_cq_handler(struct ocrdma_dev *dev, u16 cq_id) ocrdma_process_acqe(dev, cqe); else if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_CMPL_MASK) ocrdma_process_mcqe(dev, cqe); - else - ocrdma_err("%s() cqe->compl is not set.\n", __func__); memset(cqe, 0, sizeof(struct ocrdma_mcqe)); ocrdma_mcq_inc_tail(dev); } @@ -868,16 +880,8 @@ static void ocrdma_qp_cq_handler(struct ocrdma_dev *dev, u16 cq_idx) BUG(); cq = dev->cq_tbl[cq_idx]; - if (cq == NULL) { - ocrdma_err("%s%d invalid id=0x%x\n", __func__, dev->id, cq_idx); + if (cq == NULL) return; - } - spin_lock_irqsave(&cq->cq_lock, flags); - cq->armed = false; - cq->solicited = false; - spin_unlock_irqrestore(&cq->cq_lock, flags); - - ocrdma_ring_cq_db(dev, cq->id, false, false, 0); if (cq->ibcq.comp_handler) { spin_lock_irqsave(&cq->comp_handler_lock, flags); @@ -902,27 +906,35 @@ static irqreturn_t ocrdma_irq_handler(int irq, void *handle) struct ocrdma_dev *dev = eq->dev; struct ocrdma_eqe eqe; struct ocrdma_eqe *ptr; - u16 eqe_popped = 0; u16 cq_id; - while (1) { + int budget = eq->cq_cnt; + + do { ptr = ocrdma_get_eqe(eq); eqe = *ptr; ocrdma_le32_to_cpu(&eqe, sizeof(eqe)); if ((eqe.id_valid & OCRDMA_EQE_VALID_MASK) == 0) break; - eqe_popped += 1; + ptr->id_valid = 0; + /* ring eq doorbell as soon as its consumed. */ + ocrdma_ring_eq_db(dev, eq->q.id, false, true, 1); /* check whether its CQE or not. */ if ((eqe.id_valid & OCRDMA_EQE_FOR_CQE_MASK) == 0) { cq_id = eqe.id_valid >> OCRDMA_EQE_RESOURCE_ID_SHIFT; ocrdma_cq_handler(dev, cq_id); } ocrdma_eq_inc_tail(eq); - } - ocrdma_ring_eq_db(dev, eq->q.id, true, true, eqe_popped); - /* Ring EQ doorbell with num_popped to 0 to enable interrupts again. */ - if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) - ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0); + + /* There can be a stale EQE after the last bound CQ is + * destroyed. EQE valid and budget == 0 implies this. + */ + if (budget) + budget--; + + } while (budget); + + ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0); return IRQ_HANDLED; } @@ -959,7 +971,8 @@ static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe) { int status = 0; u16 cqe_status, ext_status; - struct ocrdma_mqe *rsp; + struct ocrdma_mqe *rsp_mqe; + struct ocrdma_mbx_rsp *rsp = NULL; mutex_lock(&dev->mqe_ctx.lock); ocrdma_post_mqe(dev, mqe); @@ -968,24 +981,61 @@ static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe) goto mbx_err; cqe_status = dev->mqe_ctx.cqe_status; ext_status = dev->mqe_ctx.ext_status; - rsp = ocrdma_get_mqe_rsp(dev); - ocrdma_copy_le32_to_cpu(mqe, rsp, (sizeof(*mqe))); + rsp_mqe = ocrdma_get_mqe_rsp(dev); + ocrdma_copy_le32_to_cpu(mqe, rsp_mqe, (sizeof(*mqe))); + if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> + OCRDMA_MQE_HDR_EMB_SHIFT) + rsp = &mqe->u.rsp; + if (cqe_status || ext_status) { - ocrdma_err - ("%s() opcode=0x%x, cqe_status=0x%x, ext_status=0x%x\n", - __func__, - (rsp->u.rsp.subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> - OCRDMA_MBX_RSP_OPCODE_SHIFT, cqe_status, ext_status); + pr_err("%s() cqe_status=0x%x, ext_status=0x%x,", + __func__, cqe_status, ext_status); + if (rsp) { + /* This is for embedded cmds. */ + pr_err("opcode=0x%x, subsystem=0x%x\n", + (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> + OCRDMA_MBX_RSP_OPCODE_SHIFT, + (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> + OCRDMA_MBX_RSP_SUBSYS_SHIFT); + } status = ocrdma_get_mbx_cqe_errno(cqe_status); goto mbx_err; } - if (mqe->u.rsp.status & OCRDMA_MBX_RSP_STATUS_MASK) + /* For non embedded, rsp errors are handled in ocrdma_nonemb_mbx_cmd */ + if (rsp && (mqe->u.rsp.status & OCRDMA_MBX_RSP_STATUS_MASK)) status = ocrdma_get_mbx_errno(mqe->u.rsp.status); mbx_err: mutex_unlock(&dev->mqe_ctx.lock); return status; } +static int ocrdma_nonemb_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe, + void *payload_va) +{ + int status = 0; + struct ocrdma_mbx_rsp *rsp = payload_va; + + if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> + OCRDMA_MQE_HDR_EMB_SHIFT) + BUG(); + + status = ocrdma_mbx_cmd(dev, mqe); + if (!status) + /* For non embedded, only CQE failures are handled in + * ocrdma_mbx_cmd. We need to check for RSP errors. + */ + if (rsp->status & OCRDMA_MBX_RSP_STATUS_MASK) + status = ocrdma_get_mbx_errno(rsp->status); + + if (status) + pr_err("opcode=0x%x, subsystem=0x%x\n", + (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> + OCRDMA_MBX_RSP_OPCODE_SHIFT, + (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> + OCRDMA_MBX_RSP_SUBSYS_SHIFT); + return status; +} + static void ocrdma_get_attr(struct ocrdma_dev *dev, struct ocrdma_dev_attr *attr, struct ocrdma_mbx_query_config *rsp) @@ -996,6 +1046,9 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->max_qp = (rsp->qp_srq_cq_ird_ord & OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT; + attr->max_srq = + (rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET; attr->max_send_sge = ((rsp->max_write_send_sge & OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT); @@ -1005,6 +1058,9 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->max_srq_sge = (rsp->max_srq_rqe_sge & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET; + attr->max_rdma_sge = (rsp->max_write_send_sge & + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT; attr->max_ord_per_qp = (rsp->max_ird_ord_per_qp & OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT; @@ -1020,6 +1076,7 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->local_ca_ack_delay = (rsp->max_pd_ca_ack_delay & OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK) >> OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT; + attr->max_mw = rsp->max_mw; attr->max_mr = rsp->max_mr; attr->max_mr_size = ~0ull; attr->max_fmr = 0; @@ -1027,6 +1084,9 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->max_num_mr_pbl = rsp->max_num_mr_pbl; attr->max_cqe = rsp->max_cq_cqes_per_cq & OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK; + attr->max_cq = (rsp->max_cq_cqes_per_cq & + OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET; attr->wqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs & OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET) * @@ -1038,7 +1098,7 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->max_inline_data = attr->wqe_size - (sizeof(struct ocrdma_hdr_wqe) + sizeof(struct ocrdma_sge)); - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { attr->ird = 1; attr->ird_page_size = OCRDMA_MIN_Q_PAGE_SIZE; attr->num_ird_pages = MAX_OCRDMA_IRD_PAGES; @@ -1059,7 +1119,6 @@ static int ocrdma_check_fw_config(struct ocrdma_dev *dev, return -EINVAL; dev->base_eqid = conf->base_eqid; dev->max_eq = conf->max_eq; - dev->attr.max_cq = OCRDMA_MAX_CQ - 1; return 0; } @@ -1113,6 +1172,96 @@ mbx_err: return status; } +int ocrdma_mbx_rdma_stats(struct ocrdma_dev *dev, bool reset) +{ + struct ocrdma_rdma_stats_req *req = dev->stats_mem.va; + struct ocrdma_mqe *mqe = &dev->stats_mem.mqe; + struct ocrdma_rdma_stats_resp *old_stats = NULL; + int status; + + old_stats = kzalloc(sizeof(*old_stats), GFP_KERNEL); + if (old_stats == NULL) + return -ENOMEM; + + memset(mqe, 0, sizeof(*mqe)); + mqe->hdr.pyld_len = dev->stats_mem.size; + mqe->hdr.spcl_sge_cnt_emb |= + (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dev->stats_mem.pa & 0xffffffff); + mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dev->stats_mem.pa); + mqe->u.nonemb_req.sge[0].len = dev->stats_mem.size; + + /* Cache the old stats */ + memcpy(old_stats, req, sizeof(struct ocrdma_rdma_stats_resp)); + memset(req, 0, dev->stats_mem.size); + + ocrdma_init_mch((struct ocrdma_mbx_hdr *)req, + OCRDMA_CMD_GET_RDMA_STATS, + OCRDMA_SUBSYS_ROCE, + dev->stats_mem.size); + if (reset) + req->reset_stats = reset; + + status = ocrdma_nonemb_mbx_cmd(dev, mqe, dev->stats_mem.va); + if (status) + /* Copy from cache, if mbox fails */ + memcpy(req, old_stats, sizeof(struct ocrdma_rdma_stats_resp)); + else + ocrdma_le32_to_cpu(req, dev->stats_mem.size); + + kfree(old_stats); + return status; +} + +static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_dma_mem dma; + struct ocrdma_mqe *mqe; + struct ocrdma_get_ctrl_attribs_rsp *ctrl_attr_rsp; + struct mgmt_hba_attribs *hba_attribs; + + mqe = ocrdma_alloc_mqe(); + if (!mqe) + return status; + memset(mqe, 0, sizeof(*mqe)); + + dma.size = sizeof(struct ocrdma_get_ctrl_attribs_rsp); + dma.va = dma_alloc_coherent(&dev->nic_info.pdev->dev, + dma.size, &dma.pa, GFP_KERNEL); + if (!dma.va) + goto free_mqe; + + mqe->hdr.pyld_len = dma.size; + mqe->hdr.spcl_sge_cnt_emb |= + (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dma.pa & 0xffffffff); + mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dma.pa); + mqe->u.nonemb_req.sge[0].len = dma.size; + + memset(dma.va, 0, dma.size); + ocrdma_init_mch((struct ocrdma_mbx_hdr *)dma.va, + OCRDMA_CMD_GET_CTRL_ATTRIBUTES, + OCRDMA_SUBSYS_COMMON, + dma.size); + + status = ocrdma_nonemb_mbx_cmd(dev, mqe, dma.va); + if (!status) { + ctrl_attr_rsp = (struct ocrdma_get_ctrl_attribs_rsp *)dma.va; + hba_attribs = &ctrl_attr_rsp->ctrl_attribs.hba_attribs; + + dev->hba_port_num = hba_attribs->phy_port; + strncpy(dev->model_number, + hba_attribs->controller_model_number, 31); + } + dma_free_coherent(&dev->nic_info.pdev->dev, dma.size, dma.va, dma.pa); +free_mqe: + kfree(mqe); + return status; +} + static int ocrdma_mbx_query_dev(struct ocrdma_dev *dev) { int status = -ENOMEM; @@ -1132,6 +1281,63 @@ mbx_err: return status; } +int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed) +{ + int status = -ENOMEM; + struct ocrdma_get_link_speed_rsp *rsp; + struct ocrdma_mqe *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1, + sizeof(*cmd)); + if (!cmd) + return status; + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + ((struct ocrdma_mbx_hdr *)cmd->u.cmd)->rsvd_version = 0x1; + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_link_speed_rsp *)cmd; + *lnk_speed = rsp->phys_port_speed; + +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_mbx_get_phy_info(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mqe *cmd; + struct ocrdma_get_phy_info_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_PHY_DETAILS, sizeof(*cmd)); + if (!cmd) + return status; + + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_PHY_DETAILS, OCRDMA_SUBSYS_COMMON, + sizeof(*cmd)); + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_phy_info_rsp *)cmd; + dev->phy.phy_type = le16_to_cpu(rsp->phy_type); + dev->phy.auto_speeds_supported = + le16_to_cpu(rsp->auto_speeds_supported); + dev->phy.fixed_speeds_supported = + le16_to_cpu(rsp->fixed_speeds_supported); +mbx_err: + kfree(cmd); + return status; +} + int ocrdma_mbx_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) { int status = -ENOMEM; @@ -1201,7 +1407,7 @@ static int ocrdma_build_q_conf(u32 *num_entries, int entry_size, static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev) { - int i ; + int i; int status = 0; int max_ah; struct ocrdma_create_ah_tbl *cmd; @@ -1310,19 +1516,19 @@ static u16 ocrdma_bind_eq(struct ocrdma_dev *dev) u16 eq_id; mutex_lock(&dev->dev_lock); - cq_cnt = dev->qp_eq_tbl[0].cq_cnt; - eq_id = dev->qp_eq_tbl[0].q.id; + cq_cnt = dev->eq_tbl[0].cq_cnt; + eq_id = dev->eq_tbl[0].q.id; /* find the EQ which is has the least number of * CQs associated with it. */ for (i = 0; i < dev->eq_cnt; i++) { - if (dev->qp_eq_tbl[i].cq_cnt < cq_cnt) { - cq_cnt = dev->qp_eq_tbl[i].cq_cnt; - eq_id = dev->qp_eq_tbl[i].q.id; + if (dev->eq_tbl[i].cq_cnt < cq_cnt) { + cq_cnt = dev->eq_tbl[i].cq_cnt; + eq_id = dev->eq_tbl[i].q.id; selected_eq = i; } } - dev->qp_eq_tbl[selected_eq].cq_cnt += 1; + dev->eq_tbl[selected_eq].cq_cnt += 1; mutex_unlock(&dev->dev_lock); return eq_id; } @@ -1332,17 +1538,15 @@ static void ocrdma_unbind_eq(struct ocrdma_dev *dev, u16 eq_id) int i; mutex_lock(&dev->dev_lock); - for (i = 0; i < dev->eq_cnt; i++) { - if (dev->qp_eq_tbl[i].q.id != eq_id) - continue; - dev->qp_eq_tbl[i].cq_cnt -= 1; - break; - } + i = ocrdma_get_eq_table_index(dev, eq_id); + if (i == -EINVAL) + BUG(); + dev->eq_tbl[i].cq_cnt -= 1; mutex_unlock(&dev->dev_lock); } int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, - int entries, int dpp_cq) + int entries, int dpp_cq, u16 pd_id) { int status = -ENOMEM; int max_hw_cqe; struct pci_dev *pdev = dev->nic_info.pdev; @@ -1350,14 +1554,12 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, struct ocrdma_create_cq_rsp *rsp; u32 hw_pages, cqe_size, page_size, cqe_count; - if (dpp_cq) - return -EINVAL; if (entries > dev->attr.max_cqe) { - ocrdma_err("%s(%d) max_cqe=0x%x, requester_cqe=0x%x\n", - __func__, dev->id, dev->attr.max_cqe, entries); + pr_err("%s(%d) max_cqe=0x%x, requester_cqe=0x%x\n", + __func__, dev->id, dev->attr.max_cqe, entries); return -EINVAL; } - if (dpp_cq && (dev->nic_info.dev_family != OCRDMA_GEN2_FAMILY)) + if (dpp_cq && (ocrdma_get_asic_type(dev) != OCRDMA_ASIC_GEN_SKH_R)) return -EINVAL; if (dpp_cq) { @@ -1391,15 +1593,14 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, cmd->cmd.pgsz_pgcnt |= hw_pages; cmd->cmd.ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS; - if (dev->eq_cnt < 0) - goto eq_err; cq->eqn = ocrdma_bind_eq(dev); - cmd->cmd.req.rsvd_version = OCRDMA_CREATE_CQ_VER2; + cmd->cmd.req.rsvd_version = OCRDMA_CREATE_CQ_VER3; cqe_count = cq->len / cqe_size; - if (cqe_count > 1024) + cq->cqe_cnt = cqe_count; + if (cqe_count > 1024) { /* Set cnt to 3 to indicate more than 1024 cq entries */ cmd->cmd.ev_cnt_flags |= (0x3 << OCRDMA_CREATE_CQ_CNT_SHIFT); - else { + } else { u8 count = 0; switch (cqe_count) { case 256: @@ -1418,7 +1619,7 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, } /* shared eq between all the consumer cqs. */ cmd->cmd.eqn = cq->eqn; - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { if (dpp_cq) cmd->cmd.pgsz_pgcnt |= OCRDMA_CREATE_CQ_DPP << OCRDMA_CREATE_CQ_TYPE_SHIFT; @@ -1430,6 +1631,7 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, cq->phase_change = true; } + cmd->cmd.pd_id = pd_id; /* valid only for v3 */ ocrdma_build_q_pages(&cmd->cmd.pa[0], hw_pages, cq->pa, page_size); status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); if (status) @@ -1441,7 +1643,6 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, return 0; mbx_err: ocrdma_unbind_eq(dev, cq->eqn); -eq_err: dma_free_coherent(&pdev->dev, cq->len, cq->va, cq->pa); mem_err: kfree(cmd); @@ -1463,12 +1664,9 @@ int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq) (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) & OCRDMA_DESTROY_CQ_QID_MASK; - ocrdma_unbind_eq(dev, cq->eqn); status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); - if (status) - goto mbx_err; + ocrdma_unbind_eq(dev, cq->eqn); dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa); -mbx_err: kfree(cmd); return status; } @@ -1538,6 +1736,7 @@ static int ocrdma_mbx_reg_mr(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr, return -ENOMEM; cmd->num_pbl_pdid = pdid | (hwmr->num_pbls << OCRDMA_REG_NSMR_NUM_PBL_SHIFT); + cmd->fr_mr = hwmr->fr_mr; cmd->flags_hpage_pbe_sz |= (hwmr->remote_wr << OCRDMA_REG_NSMR_REMOTE_WR_SHIFT); @@ -1621,7 +1820,7 @@ int ocrdma_reg_mr(struct ocrdma_dev *dev, status = ocrdma_mbx_reg_mr(dev, hwmr, pdid, cur_pbl_cnt, hwmr->pbe_size, last); if (status) { - ocrdma_err("%s() status=%d\n", __func__, status); + pr_err("%s() status=%d\n", __func__, status); return status; } /* if there is no more pbls to register then exit. */ @@ -1644,7 +1843,7 @@ int ocrdma_reg_mr(struct ocrdma_dev *dev, break; } if (status) - ocrdma_err("%s() err. status=%d\n", __func__, status); + pr_err("%s() err. status=%d\n", __func__, status); return status; } @@ -1692,8 +1891,16 @@ void ocrdma_flush_qp(struct ocrdma_qp *qp) spin_unlock_irqrestore(&qp->dev->flush_q_lock, flags); } -int ocrdma_qp_state_machine(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state, - enum ib_qp_state *old_ib_state) +static void ocrdma_init_hwq_ptr(struct ocrdma_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; +} + +int ocrdma_qp_state_change(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state, + enum ib_qp_state *old_ib_state) { unsigned long flags; int status = 0; @@ -1710,96 +1917,15 @@ int ocrdma_qp_state_machine(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state, return 1; } - switch (qp->state) { - case OCRDMA_QPS_RST: - switch (new_state) { - case OCRDMA_QPS_RST: - case OCRDMA_QPS_INIT: - break; - default: - status = -EINVAL; - break; - }; - break; - case OCRDMA_QPS_INIT: - /* qps: INIT->XXX */ - switch (new_state) { - case OCRDMA_QPS_INIT: - case OCRDMA_QPS_RTR: - break; - case OCRDMA_QPS_ERR: - ocrdma_flush_qp(qp); - break; - default: - status = -EINVAL; - break; - }; - break; - case OCRDMA_QPS_RTR: - /* qps: RTS->XXX */ - switch (new_state) { - case OCRDMA_QPS_RTS: - break; - case OCRDMA_QPS_ERR: - ocrdma_flush_qp(qp); - break; - default: - status = -EINVAL; - break; - }; - break; - case OCRDMA_QPS_RTS: - /* qps: RTS->XXX */ - switch (new_state) { - case OCRDMA_QPS_SQD: - case OCRDMA_QPS_SQE: - break; - case OCRDMA_QPS_ERR: - ocrdma_flush_qp(qp); - break; - default: - status = -EINVAL; - break; - }; - break; - case OCRDMA_QPS_SQD: - /* qps: SQD->XXX */ - switch (new_state) { - case OCRDMA_QPS_RTS: - case OCRDMA_QPS_SQE: - case OCRDMA_QPS_ERR: - break; - default: - status = -EINVAL; - break; - }; - break; - case OCRDMA_QPS_SQE: - switch (new_state) { - case OCRDMA_QPS_RTS: - case OCRDMA_QPS_ERR: - break; - default: - status = -EINVAL; - break; - }; - break; - case OCRDMA_QPS_ERR: - /* qps: ERR->XXX */ - switch (new_state) { - case OCRDMA_QPS_RST: - break; - default: - status = -EINVAL; - break; - }; - break; - default: - status = -EINVAL; - break; - }; - if (!status) - qp->state = new_state; + + if (new_state == OCRDMA_QPS_INIT) { + ocrdma_init_hwq_ptr(qp); + ocrdma_del_flush_qp(qp); + } else if (new_state == OCRDMA_QPS_ERR) { + ocrdma_flush_qp(qp); + } + + qp->state = new_state; spin_unlock_irqrestore(&qp->q_lock, flags); return status; @@ -1833,16 +1959,15 @@ static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd, u32 max_wqe_allocated; u32 max_sges = attrs->cap.max_send_sge; - max_wqe_allocated = attrs->cap.max_send_wr; - /* need to allocate one extra to for GEN1 family */ - if (dev->nic_info.dev_family != OCRDMA_GEN2_FAMILY) - max_wqe_allocated += 1; + /* QP1 may exceed 127 */ + max_wqe_allocated = min_t(u32, attrs->cap.max_send_wr + 1, + dev->attr.max_wqe); status = ocrdma_build_q_conf(&max_wqe_allocated, dev->attr.wqe_size, &hw_pages, &hw_page_size); if (status) { - ocrdma_err("%s() req. max_send_wr=0x%x\n", __func__, - max_wqe_allocated); + pr_err("%s() req. max_send_wr=0x%x\n", __func__, + max_wqe_allocated); return -EINVAL; } qp->sq.max_cnt = max_wqe_allocated; @@ -1891,8 +2016,8 @@ static int ocrdma_set_create_qp_rq_cmd(struct ocrdma_create_qp_req *cmd, status = ocrdma_build_q_conf(&max_rqe_allocated, dev->attr.rqe_size, &hw_pages, &hw_page_size); if (status) { - ocrdma_err("%s() req. max_recv_wr=0x%x\n", __func__, - attrs->cap.max_recv_wr + 1); + pr_err("%s() req. max_recv_wr=0x%x\n", __func__, + attrs->cap.max_recv_wr + 1); return status; } qp->rq.max_cnt = max_rqe_allocated; @@ -1900,7 +2025,7 @@ static int ocrdma_set_create_qp_rq_cmd(struct ocrdma_create_qp_req *cmd, qp->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); if (!qp->rq.va) - return status; + return -ENOMEM; memset(qp->rq.va, 0, len); qp->rq.pa = pa; qp->rq.len = len; @@ -1948,6 +2073,8 @@ static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd, dma_addr_t pa = 0; int ird_page_size = dev->attr.ird_page_size; int ird_q_len = dev->attr.num_ird_pages * ird_page_size; + struct ocrdma_hdr_wqe *rqe; + int i = 0; if (dev->attr.ird == 0) return 0; @@ -1959,6 +2086,15 @@ static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd, memset(qp->ird_q_va, 0, ird_q_len); ocrdma_build_q_pages(&cmd->ird_addr[0], dev->attr.num_ird_pages, pa, ird_page_size); + for (; i < ird_q_len / dev->attr.rqe_size; i++) { + rqe = (struct ocrdma_hdr_wqe *)(qp->ird_q_va + + (i * dev->attr.rqe_size)); + rqe->cw = 0; + rqe->cw |= 2; + rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + rqe->cw |= (8 << OCRDMA_WQE_SIZE_SHIFT); + rqe->cw |= (8 << OCRDMA_WQE_NXT_WQE_SIZE_SHIFT); + } return 0; } @@ -2023,7 +2159,7 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs, break; default: return -EINVAL; - }; + } cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_QP, sizeof(*cmd)); if (!cmd) @@ -2070,10 +2206,10 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs, OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK; qp->rq_cq = cq; - if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp && - (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) + if (pd->dpp_enabled && pd->num_dpp_qp) { ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq, dpp_cq_id); + } status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); if (status) @@ -2087,10 +2223,10 @@ mbx_err: if (qp->rq.va) dma_free_coherent(&pdev->dev, qp->rq.len, qp->rq.va, qp->rq.pa); rq_err: - ocrdma_err("%s(%d) rq_err\n", __func__, dev->id); + pr_err("%s(%d) rq_err\n", __func__, dev->id); dma_free_coherent(&pdev->dev, qp->sq.len, qp->sq.va, qp->sq.pa); sq_err: - ocrdma_err("%s(%d) sq_err\n", __func__, dev->id); + pr_err("%s(%d) sq_err\n", __func__, dev->id); kfree(cmd); return status; } @@ -2116,69 +2252,60 @@ mbx_err: return status; } -int ocrdma_resolve_dgid(struct ocrdma_dev *dev, union ib_gid *dgid, - u8 *mac_addr) -{ - struct in6_addr in6; - - memcpy(&in6, dgid, sizeof in6); - if (rdma_is_multicast_addr(&in6)) - rdma_get_mcast_mac(&in6, mac_addr); - else if (rdma_link_local_addr(&in6)) - rdma_get_ll_mac(&in6, mac_addr); - else { - ocrdma_err("%s() fail to resolve mac_addr.\n", __func__); - return -EINVAL; - } - return 0; -} - -static void ocrdma_set_av_params(struct ocrdma_qp *qp, +static int ocrdma_set_av_params(struct ocrdma_qp *qp, struct ocrdma_modify_qp *cmd, struct ib_qp_attr *attrs) { + int status; struct ib_ah_attr *ah_attr = &attrs->ah_attr; - union ib_gid sgid; + union ib_gid sgid, zgid; u32 vlan_id; u8 mac_addr[6]; + if ((ah_attr->ah_flags & IB_AH_GRH) == 0) - return; + return -EINVAL; cmd->params.tclass_sq_psn |= (ah_attr->grh.traffic_class << OCRDMA_QP_PARAMS_TCLASS_SHIFT); cmd->params.rnt_rc_sl_fl |= (ah_attr->grh.flow_label & OCRDMA_QP_PARAMS_FLOW_LABEL_MASK); + cmd->params.rnt_rc_sl_fl |= (ah_attr->sl << OCRDMA_QP_PARAMS_SL_SHIFT); cmd->params.hop_lmt_rq_psn |= (ah_attr->grh.hop_limit << OCRDMA_QP_PARAMS_HOP_LMT_SHIFT); cmd->flags |= OCRDMA_QP_PARA_FLOW_LBL_VALID; memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0], sizeof(cmd->params.dgid)); - ocrdma_query_gid(&qp->dev->ibdev, 1, - ah_attr->grh.sgid_index, &sgid); + status = ocrdma_query_gid(&qp->dev->ibdev, 1, + ah_attr->grh.sgid_index, &sgid); + if (status) + return status; + + memset(&zgid, 0, sizeof(zgid)); + if (!memcmp(&sgid, &zgid, sizeof(zgid))) + return -EINVAL; + qp->sgid_idx = ah_attr->grh.sgid_index; memcpy(&cmd->params.sgid[0], &sgid.raw[0], sizeof(cmd->params.sgid)); - ocrdma_resolve_dgid(qp->dev, &ah_attr->grh.dgid, &mac_addr[0]); + ocrdma_resolve_dmac(qp->dev, ah_attr, &mac_addr[0]); cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) | (mac_addr[2] << 16) | (mac_addr[3] << 24); /* convert them to LE format. */ ocrdma_cpu_to_le32(&cmd->params.dgid[0], sizeof(cmd->params.dgid)); ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid)); cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8); - vlan_id = rdma_get_vlan_id(&sgid); + vlan_id = ah_attr->vlan_id; if (vlan_id && (vlan_id < 0x1000)) { cmd->params.vlan_dmac_b4_to_b5 |= vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT; cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID; } + return 0; } static int ocrdma_set_qp_params(struct ocrdma_qp *qp, struct ocrdma_modify_qp *cmd, - struct ib_qp_attr *attrs, int attr_mask, - enum ib_qp_state old_qps) + struct ib_qp_attr *attrs, int attr_mask) { int status = 0; - struct net_device *netdev = qp->dev->nic_info.netdev; - int eth_mtu = iboe_get_mtu(netdev->mtu); if (attr_mask & IB_QP_PKEY_INDEX) { cmd->params.path_mtu_pkey_indx |= (attrs->pkey_index & @@ -2190,9 +2317,11 @@ static int ocrdma_set_qp_params(struct ocrdma_qp *qp, cmd->params.qkey = attrs->qkey; cmd->flags |= OCRDMA_QP_PARA_QKEY_VALID; } - if (attr_mask & IB_QP_AV) - ocrdma_set_av_params(qp, cmd, attrs); - else if (qp->qp_type == IB_QPT_GSI || qp->qp_type == IB_QPT_UD) { + if (attr_mask & IB_QP_AV) { + status = ocrdma_set_av_params(qp, cmd, attrs); + if (status) + return status; + } else if (qp->qp_type == IB_QPT_GSI || qp->qp_type == IB_QPT_UD) { /* set the default mac address for UD, GSI QPs */ cmd->params.dmac_b0_to_b3 = qp->dev->nic_info.mac_addr[0] | (qp->dev->nic_info.mac_addr[1] << 8) | @@ -2213,8 +2342,8 @@ static int ocrdma_set_qp_params(struct ocrdma_qp *qp, cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID; } if (attr_mask & IB_QP_PATH_MTU) { - if (ib_mtu_enum_to_int(eth_mtu) < - ib_mtu_enum_to_int(attrs->path_mtu)) { + if (attrs->path_mtu < IB_MTU_256 || + attrs->path_mtu > IB_MTU_4096) { status = -EINVAL; goto pmtu_err; } @@ -2279,8 +2408,7 @@ pmtu_err: } int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp, - struct ib_qp_attr *attrs, int attr_mask, - enum ib_qp_state old_qps) + struct ib_qp_attr *attrs, int attr_mask) { int status = -ENOMEM; struct ocrdma_modify_qp *cmd; @@ -2297,11 +2425,13 @@ int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp, OCRDMA_QP_PARAMS_STATE_SHIFT) & OCRDMA_QP_PARAMS_STATE_MASK; cmd->flags |= OCRDMA_QP_PARA_QPS_VALID; - } else + } else { cmd->params.max_sge_recv_flags |= (qp->state << OCRDMA_QP_PARAMS_STATE_SHIFT) & OCRDMA_QP_PARAMS_STATE_MASK; - status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask, old_qps); + } + + status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask); if (status) goto mbx_err; status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); @@ -2338,7 +2468,7 @@ mbx_err: return status; } -int ocrdma_mbx_create_srq(struct ocrdma_srq *srq, +int ocrdma_mbx_create_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq, struct ib_srq_init_attr *srq_attr, struct ocrdma_pd *pd) { @@ -2348,7 +2478,6 @@ int ocrdma_mbx_create_srq(struct ocrdma_srq *srq, struct ocrdma_create_srq_rsp *rsp; struct ocrdma_create_srq *cmd; dma_addr_t pa; - struct ocrdma_dev *dev = srq->dev; struct pci_dev *pdev = dev->nic_info.pdev; u32 max_rqe_allocated; @@ -2362,8 +2491,8 @@ int ocrdma_mbx_create_srq(struct ocrdma_srq *srq, dev->attr.rqe_size, &hw_pages, &hw_page_size); if (status) { - ocrdma_err("%s() req. max_wr=0x%x\n", __func__, - srq_attr->attr.max_wr); + pr_err("%s() req. max_wr=0x%x\n", __func__, + srq_attr->attr.max_wr); status = -EINVAL; goto ret; } @@ -2418,13 +2547,16 @@ int ocrdma_mbx_modify_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr) { int status = -ENOMEM; struct ocrdma_modify_srq *cmd; - cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_SRQ, sizeof(*cmd)); + struct ocrdma_pd *pd = srq->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_SRQ, sizeof(*cmd)); if (!cmd) return status; cmd->id = srq->id; cmd->limit_max_rqe |= srq_attr->srq_limit << OCRDMA_MODIFY_SRQ_LIMIT_SHIFT; - status = ocrdma_mbx_cmd(srq->dev, (struct ocrdma_mqe *)cmd); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); kfree(cmd); return status; } @@ -2433,11 +2565,13 @@ int ocrdma_mbx_query_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr) { int status = -ENOMEM; struct ocrdma_query_srq *cmd; - cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_SRQ, sizeof(*cmd)); + struct ocrdma_dev *dev = get_ocrdma_dev(srq->ibsrq.device); + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_SRQ, sizeof(*cmd)); if (!cmd) return status; cmd->id = srq->rq.dbid; - status = ocrdma_mbx_cmd(srq->dev, (struct ocrdma_mqe *)cmd); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); if (status == 0) { struct ocrdma_query_srq_rsp *rsp = (struct ocrdma_query_srq_rsp *)cmd; @@ -2462,7 +2596,7 @@ int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq) if (!cmd) return status; cmd->id = srq->id; - status = ocrdma_mbx_cmd(srq->dev, (struct ocrdma_mqe *)cmd); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); if (srq->rq.va) dma_free_coherent(&pdev->dev, srq->rq.len, srq->rq.va, srq->rq.pa); @@ -2504,38 +2638,7 @@ int ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah) return 0; } -static int ocrdma_create_mq_eq(struct ocrdma_dev *dev) -{ - int status; - int irq; - unsigned long flags = 0; - int num_eq = 0; - - if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) - flags = IRQF_SHARED; - else { - num_eq = dev->nic_info.msix.num_vectors - - dev->nic_info.msix.start_vector; - /* minimum two vectors/eq are required for rdma to work. - * one for control path and one for data path. - */ - if (num_eq < 2) - return -EBUSY; - } - - status = ocrdma_create_eq(dev, &dev->meq, OCRDMA_EQ_LEN); - if (status) - return status; - sprintf(dev->meq.irq_name, "ocrdma_mq%d", dev->id); - irq = ocrdma_get_irq(dev, &dev->meq); - status = request_irq(irq, ocrdma_irq_handler, flags, dev->meq.irq_name, - &dev->meq); - if (status) - _ocrdma_destroy_eq(dev, &dev->meq); - return status; -} - -static int ocrdma_create_qp_eqs(struct ocrdma_dev *dev) +static int ocrdma_create_eqs(struct ocrdma_dev *dev) { int num_eq, i, status = 0; int irq; @@ -2546,49 +2649,47 @@ static int ocrdma_create_qp_eqs(struct ocrdma_dev *dev) if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) { num_eq = 1; flags = IRQF_SHARED; - } else + } else { num_eq = min_t(u32, num_eq, num_online_cpus()); - dev->qp_eq_tbl = kzalloc(sizeof(struct ocrdma_eq) * num_eq, GFP_KERNEL); - if (!dev->qp_eq_tbl) + } + + if (!num_eq) + return -EINVAL; + + dev->eq_tbl = kzalloc(sizeof(struct ocrdma_eq) * num_eq, GFP_KERNEL); + if (!dev->eq_tbl) return -ENOMEM; for (i = 0; i < num_eq; i++) { - status = ocrdma_create_eq(dev, &dev->qp_eq_tbl[i], - OCRDMA_EQ_LEN); + status = ocrdma_create_eq(dev, &dev->eq_tbl[i], + OCRDMA_EQ_LEN); if (status) { status = -EINVAL; break; } - sprintf(dev->qp_eq_tbl[i].irq_name, "ocrdma_qp%d-%d", + sprintf(dev->eq_tbl[i].irq_name, "ocrdma%d-%d", dev->id, i); - irq = ocrdma_get_irq(dev, &dev->qp_eq_tbl[i]); + irq = ocrdma_get_irq(dev, &dev->eq_tbl[i]); status = request_irq(irq, ocrdma_irq_handler, flags, - dev->qp_eq_tbl[i].irq_name, - &dev->qp_eq_tbl[i]); - if (status) { - _ocrdma_destroy_eq(dev, &dev->qp_eq_tbl[i]); - status = -EINVAL; - break; - } + dev->eq_tbl[i].irq_name, + &dev->eq_tbl[i]); + if (status) + goto done; dev->eq_cnt += 1; } /* one eq is sufficient for data path to work */ - if (dev->eq_cnt >= 1) - return 0; - if (status) - ocrdma_destroy_qp_eqs(dev); + return 0; +done: + ocrdma_destroy_eqs(dev); return status; } int ocrdma_init_hw(struct ocrdma_dev *dev) { int status; - /* set up control path eq */ - status = ocrdma_create_mq_eq(dev); - if (status) - return status; - /* set up data path eq */ - status = ocrdma_create_qp_eqs(dev); + + /* create the eqs */ + status = ocrdma_create_eqs(dev); if (status) goto qpeq_err; status = ocrdma_create_mq(dev); @@ -2606,15 +2707,21 @@ int ocrdma_init_hw(struct ocrdma_dev *dev) status = ocrdma_mbx_create_ah_tbl(dev); if (status) goto conf_err; + status = ocrdma_mbx_get_phy_info(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_get_ctrl_attribs(dev); + if (status) + goto conf_err; + return 0; conf_err: ocrdma_destroy_mq(dev); mq_err: - ocrdma_destroy_qp_eqs(dev); + ocrdma_destroy_eqs(dev); qpeq_err: - ocrdma_destroy_eq(dev, &dev->meq); - ocrdma_err("%s() status=%d\n", __func__, status); + pr_err("%s() status=%d\n", __func__, status); return status; } @@ -2622,10 +2729,9 @@ void ocrdma_cleanup_hw(struct ocrdma_dev *dev) { ocrdma_mbx_delete_ah_tbl(dev); - /* cleanup the data path eqs */ - ocrdma_destroy_qp_eqs(dev); + /* cleanup the eqs */ + ocrdma_destroy_eqs(dev); /* cleanup the control path */ ocrdma_destroy_mq(dev); - ocrdma_destroy_eq(dev, &dev->meq); } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h index be5db77404d..e513f729314 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h @@ -78,6 +78,11 @@ static inline void ocrdma_copy_le32_to_cpu(void *dst, void *src, u32 len) #endif } +static inline u64 ocrdma_get_db_addr(struct ocrdma_dev *dev, u32 pdid) +{ + return dev->nic_info.unmapped_db + (pdid * dev->nic_info.db_page_size); +} + int ocrdma_init_hw(struct ocrdma_dev *); void ocrdma_cleanup_hw(struct ocrdma_dev *); @@ -86,9 +91,9 @@ void ocrdma_ring_cq_db(struct ocrdma_dev *, u16 cq_id, bool armed, bool solicited, u16 cqe_popped); /* verbs specific mailbox commands */ +int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed); int ocrdma_query_config(struct ocrdma_dev *, struct ocrdma_mbx_query_config *config); -int ocrdma_resolve_dgid(struct ocrdma_dev *, union ib_gid *dgid, u8 *mac_addr); int ocrdma_mbx_alloc_pd(struct ocrdma_dev *, struct ocrdma_pd *); int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *, struct ocrdma_pd *); @@ -100,20 +105,18 @@ int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *, int fmr, u32 lkey); int ocrdma_reg_mr(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr, u32 pd_id, int acc); int ocrdma_mbx_create_cq(struct ocrdma_dev *, struct ocrdma_cq *, - int entries, int dpp_cq); + int entries, int dpp_cq, u16 pd_id); int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *); int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs, u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset, u16 *dpp_credit_lmt); int ocrdma_mbx_modify_qp(struct ocrdma_dev *, struct ocrdma_qp *, - struct ib_qp_attr *attrs, int attr_mask, - enum ib_qp_state old_qps); + struct ib_qp_attr *attrs, int attr_mask); int ocrdma_mbx_query_qp(struct ocrdma_dev *, struct ocrdma_qp *, struct ocrdma_qp_params *param); int ocrdma_mbx_destroy_qp(struct ocrdma_dev *, struct ocrdma_qp *); - -int ocrdma_mbx_create_srq(struct ocrdma_srq *, +int ocrdma_mbx_create_srq(struct ocrdma_dev *, struct ocrdma_srq *, struct ib_srq_init_attr *, struct ocrdma_pd *); int ocrdma_mbx_modify_srq(struct ocrdma_srq *, struct ib_srq_attr *); @@ -123,10 +126,13 @@ int ocrdma_mbx_destroy_srq(struct ocrdma_dev *, struct ocrdma_srq *); int ocrdma_alloc_av(struct ocrdma_dev *, struct ocrdma_ah *); int ocrdma_free_av(struct ocrdma_dev *, struct ocrdma_ah *); -int ocrdma_qp_state_machine(struct ocrdma_qp *, enum ib_qp_state new_state, +int ocrdma_qp_state_change(struct ocrdma_qp *, enum ib_qp_state new_state, enum ib_qp_state *old_ib_state); bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *); bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *); void ocrdma_flush_qp(struct ocrdma_qp *); +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq); +int ocrdma_mbx_rdma_stats(struct ocrdma_dev *, bool reset); +char *port_speed_string(struct ocrdma_dev *dev); #endif /* __OCRDMA_HW_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index c4e0131f1b5..7c504e07974 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -39,9 +39,11 @@ #include "ocrdma_ah.h" #include "be_roce.h" #include "ocrdma_hw.h" +#include "ocrdma_stats.h" +#include "ocrdma_abi.h" -MODULE_VERSION(OCRDMA_ROCE_DEV_VERSION); -MODULE_DESCRIPTION("Emulex RoCE HCA Driver"); +MODULE_VERSION(OCRDMA_ROCE_DRV_VERSION); +MODULE_DESCRIPTION(OCRDMA_ROCE_DRV_DESC " " OCRDMA_ROCE_DRV_VERSION); MODULE_AUTHOR("Emulex Corporation"); MODULE_LICENSE("GPL"); @@ -51,18 +53,6 @@ static DEFINE_IDR(ocrdma_dev_id); static union ib_gid ocrdma_zero_sgid; -static int ocrdma_get_instance(void) -{ - int instance = 0; - - /* Assign an unused number */ - if (!idr_pre_get(&ocrdma_dev_id, GFP_KERNEL)) - return -1; - if (idr_get_new(&ocrdma_dev_id, NULL, &instance)) - return -1; - return instance; -} - void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid) { u8 mac_addr[6]; @@ -78,46 +68,24 @@ void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid) guid[7] = mac_addr[5]; } -static void ocrdma_build_sgid_mac(union ib_gid *sgid, unsigned char *mac_addr, - bool is_vlan, u16 vlan_id) -{ - sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); - sgid->raw[8] = mac_addr[0] ^ 2; - sgid->raw[9] = mac_addr[1]; - sgid->raw[10] = mac_addr[2]; - if (is_vlan) { - sgid->raw[11] = vlan_id >> 8; - sgid->raw[12] = vlan_id & 0xff; - } else { - sgid->raw[11] = 0xff; - sgid->raw[12] = 0xfe; - } - sgid->raw[13] = mac_addr[3]; - sgid->raw[14] = mac_addr[4]; - sgid->raw[15] = mac_addr[5]; -} - -static bool ocrdma_add_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr, - bool is_vlan, u16 vlan_id) +static bool ocrdma_add_sgid(struct ocrdma_dev *dev, union ib_gid *new_sgid) { int i; - union ib_gid new_sgid; unsigned long flags; memset(&ocrdma_zero_sgid, 0, sizeof(union ib_gid)); - ocrdma_build_sgid_mac(&new_sgid, mac_addr, is_vlan, vlan_id); spin_lock_irqsave(&dev->sgid_lock, flags); for (i = 0; i < OCRDMA_MAX_SGID; i++) { if (!memcmp(&dev->sgid_tbl[i], &ocrdma_zero_sgid, sizeof(union ib_gid))) { /* found free entry */ - memcpy(&dev->sgid_tbl[i], &new_sgid, + memcpy(&dev->sgid_tbl[i], new_sgid, sizeof(union ib_gid)); spin_unlock_irqrestore(&dev->sgid_lock, flags); return true; - } else if (!memcmp(&dev->sgid_tbl[i], &new_sgid, + } else if (!memcmp(&dev->sgid_tbl[i], new_sgid, sizeof(union ib_gid))) { /* entry already present, no addition is required. */ spin_unlock_irqrestore(&dev->sgid_lock, flags); @@ -128,20 +96,17 @@ static bool ocrdma_add_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr, return false; } -static bool ocrdma_del_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr, - bool is_vlan, u16 vlan_id) +static bool ocrdma_del_sgid(struct ocrdma_dev *dev, union ib_gid *sgid) { int found = false; int i; - union ib_gid sgid; unsigned long flags; - ocrdma_build_sgid_mac(&sgid, mac_addr, is_vlan, vlan_id); spin_lock_irqsave(&dev->sgid_lock, flags); /* first is default sgid, which cannot be deleted. */ for (i = 1; i < OCRDMA_MAX_SGID; i++) { - if (!memcmp(&dev->sgid_tbl[i], &sgid, sizeof(union ib_gid))) { + if (!memcmp(&dev->sgid_tbl[i], sgid, sizeof(union ib_gid))) { /* found matching entry */ memset(&dev->sgid_tbl[i], 0, sizeof(union ib_gid)); found = true; @@ -152,75 +117,18 @@ static bool ocrdma_del_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr, return found; } -static void ocrdma_add_default_sgid(struct ocrdma_dev *dev) -{ - /* GID Index 0 - Invariant manufacturer-assigned EUI-64 */ - union ib_gid *sgid = &dev->sgid_tbl[0]; - - sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); - ocrdma_get_guid(dev, &sgid->raw[8]); -} - -#if IS_ENABLED(CONFIG_VLAN_8021Q) -static void ocrdma_add_vlan_sgids(struct ocrdma_dev *dev) -{ - struct net_device *netdev, *tmp; - u16 vlan_id; - bool is_vlan; - - netdev = dev->nic_info.netdev; - - rcu_read_lock(); - for_each_netdev_rcu(&init_net, tmp) { - if (netdev == tmp || vlan_dev_real_dev(tmp) == netdev) { - if (!netif_running(tmp) || !netif_oper_up(tmp)) - continue; - if (netdev != tmp) { - vlan_id = vlan_dev_vlan_id(tmp); - is_vlan = true; - } else { - is_vlan = false; - vlan_id = 0; - tmp = netdev; - } - ocrdma_add_sgid(dev, tmp->dev_addr, is_vlan, vlan_id); - } - } - rcu_read_unlock(); -} -#else -static void ocrdma_add_vlan_sgids(struct ocrdma_dev *dev) -{ - -} -#endif /* VLAN */ - -static int ocrdma_build_sgid_tbl(struct ocrdma_dev *dev) -{ - ocrdma_add_default_sgid(dev); - ocrdma_add_vlan_sgids(dev); - return 0; -} - -#if IS_ENABLED(CONFIG_IPV6) - -static int ocrdma_inet6addr_event(struct notifier_block *notifier, - unsigned long event, void *ptr) +static int ocrdma_addr_event(unsigned long event, struct net_device *netdev, + union ib_gid *gid) { - struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; - struct net_device *netdev = ifa->idev->dev; struct ib_event gid_event; struct ocrdma_dev *dev; bool found = false; bool updated = false; bool is_vlan = false; - u16 vid = 0; is_vlan = netdev->priv_flags & IFF_802_1Q_VLAN; - if (is_vlan) { - vid = vlan_dev_vlan_id(netdev); - netdev = vlan_dev_real_dev(netdev); - } + if (is_vlan) + netdev = rdma_vlan_dev_real_dev(netdev); rcu_read_lock(); list_for_each_entry_rcu(dev, &ocrdma_dev_list, entry) { @@ -233,16 +141,14 @@ static int ocrdma_inet6addr_event(struct notifier_block *notifier, if (!found) return NOTIFY_DONE; - if (!rdma_link_local_addr((struct in6_addr *)&ifa->addr)) - return NOTIFY_DONE; mutex_lock(&dev->dev_lock); switch (event) { case NETDEV_UP: - updated = ocrdma_add_sgid(dev, netdev->dev_addr, is_vlan, vid); + updated = ocrdma_add_sgid(dev, gid); break; case NETDEV_DOWN: - updated = ocrdma_del_sgid(dev, netdev->dev_addr, is_vlan, vid); + updated = ocrdma_del_sgid(dev, gid); break; default: break; @@ -258,6 +164,32 @@ static int ocrdma_inet6addr_event(struct notifier_block *notifier, return NOTIFY_OK; } +static int ocrdma_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + union ib_gid gid; + struct net_device *netdev = ifa->ifa_dev->dev; + + ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid); + return ocrdma_addr_event(event, netdev, &gid); +} + +static struct notifier_block ocrdma_inetaddr_notifier = { + .notifier_call = ocrdma_inetaddr_event +}; + +#if IS_ENABLED(CONFIG_IPV6) + +static int ocrdma_inet6addr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + union ib_gid *gid = (union ib_gid *)&ifa->addr; + struct net_device *netdev = ifa->idev->dev; + return ocrdma_addr_event(event, netdev, gid); +} + static struct notifier_block ocrdma_inet6addr_notifier = { .notifier_call = ocrdma_inet6addr_event }; @@ -277,6 +209,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, sizeof(OCRDMA_NODE_DESC)); dev->ibdev.owner = THIS_MODULE; + dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION; dev->ibdev.uverbs_cmd_mask = OCRDMA_UVERBS(GET_CONTEXT) | OCRDMA_UVERBS(QUERY_DEVICE) | @@ -338,9 +271,14 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.req_notify_cq = ocrdma_arm_cq; dev->ibdev.get_dma_mr = ocrdma_get_dma_mr; + dev->ibdev.reg_phys_mr = ocrdma_reg_kernel_mr; dev->ibdev.dereg_mr = ocrdma_dereg_mr; dev->ibdev.reg_user_mr = ocrdma_reg_user_mr; + dev->ibdev.alloc_fast_reg_mr = ocrdma_alloc_frmr; + dev->ibdev.alloc_fast_reg_page_list = ocrdma_alloc_frmr_page_list; + dev->ibdev.free_fast_reg_page_list = ocrdma_free_frmr_page_list; + /* mandatory to support user space verbs consumer. */ dev->ibdev.alloc_ucontext = ocrdma_alloc_ucontext; dev->ibdev.dealloc_ucontext = ocrdma_dealloc_ucontext; @@ -349,7 +287,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.process_mad = ocrdma_process_mad; - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { dev->ibdev.uverbs_cmd_mask |= OCRDMA_UVERBS(CREATE_SRQ) | OCRDMA_UVERBS(MODIFY_SRQ) | @@ -390,7 +328,7 @@ static int ocrdma_alloc_resources(struct ocrdma_dev *dev) spin_lock_init(&dev->flush_q_lock); return 0; alloc_err: - ocrdma_err("%s(%d) error.\n", __func__, dev->id); + pr_err("%s(%d) error.\n", __func__, dev->id); return -ENOMEM; } @@ -401,14 +339,47 @@ static void ocrdma_free_resources(struct ocrdma_dev *dev) kfree(dev->sgid_tbl); } +/* OCRDMA sysfs interface */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "%s", &dev->attr.fw_ver[0]); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); + +static struct device_attribute *ocrdma_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver +}; + +static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) + device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]); +} + static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) { - int status = 0; + int status = 0, i; struct ocrdma_dev *dev; dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev)); if (!dev) { - ocrdma_err("Unable to allocate ib device\n"); + pr_err("Unable to allocate ib device\n"); return NULL; } dev->mbx_cmd = kzalloc(sizeof(struct ocrdma_mqe_emb_cmd), GFP_KERNEL); @@ -416,7 +387,7 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) goto idr_err; memcpy(&dev->nic_info, dev_info, sizeof(*dev_info)); - dev->id = ocrdma_get_instance(); + dev->id = idr_alloc(&ocrdma_dev_id, NULL, 0, 0, GFP_KERNEL); if (dev->id < 0) goto idr_err; @@ -428,19 +399,29 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) if (status) goto alloc_err; - status = ocrdma_build_sgid_tbl(dev); - if (status) - goto alloc_err; - status = ocrdma_register_device(dev); if (status) goto alloc_err; + for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) + if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i])) + goto sysfs_err; spin_lock(&ocrdma_devlist_lock); list_add_tail_rcu(&dev->entry, &ocrdma_dev_list); spin_unlock(&ocrdma_devlist_lock); + /* Init stats */ + ocrdma_add_port_stats(dev); + + pr_info("%s %s: %s \"%s\" port %d\n", + dev_name(&dev->nic_info.pdev->dev), hca_name(dev), + port_speed_string(dev), dev->model_number, + dev->hba_port_num); + pr_info("%s ocrdma%d driver loaded successfully\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); return dev; +sysfs_err: + ocrdma_remove_sysfiles(dev); alloc_err: ocrdma_free_resources(dev); ocrdma_cleanup_hw(dev); @@ -449,7 +430,7 @@ init_err: idr_err: kfree(dev->mbx_cmd); ib_dealloc_device(&dev->ibdev); - ocrdma_err("%s() leaving. ret=%d\n", __func__, status); + pr_err("%s() leaving. ret=%d\n", __func__, status); return NULL; } @@ -457,9 +438,6 @@ static void ocrdma_remove_free(struct rcu_head *rcu) { struct ocrdma_dev *dev = container_of(rcu, struct ocrdma_dev, rcu); - ocrdma_free_resources(dev); - ocrdma_cleanup_hw(dev); - idr_remove(&ocrdma_dev_id, dev->id); kfree(dev->mbx_cmd); ib_dealloc_device(&dev->ibdev); @@ -470,11 +448,18 @@ static void ocrdma_remove(struct ocrdma_dev *dev) /* first unregister with stack to stop all the active traffic * of the registered clients. */ + ocrdma_rem_port_stats(dev); + ocrdma_remove_sysfiles(dev); + ib_unregister_device(&dev->ibdev); spin_lock(&ocrdma_devlist_lock); list_del_rcu(&dev->entry); spin_unlock(&ocrdma_devlist_lock); + + ocrdma_free_resources(dev); + ocrdma_cleanup_hw(dev); + call_rcu(&dev->rcu, ocrdma_remove_free); } @@ -503,7 +488,7 @@ static int ocrdma_close(struct ocrdma_dev *dev) cur_qp = dev->qp_tbl; for (i = 0; i < OCRDMA_MAX_QP; i++) { qp = cur_qp[i]; - if (qp) { + if (qp && qp->ibqp.qp_type != IB_QPT_GSI) { /* change the QP state to ERROR */ _ocrdma_modify_qp(&qp->ibqp, &attrs, attr_mask); @@ -536,7 +521,7 @@ static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event) case BE_DEV_DOWN: ocrdma_close(dev); break; - }; + } } static struct ocrdma_driver ocrdma_drv = { @@ -544,6 +529,7 @@ static struct ocrdma_driver ocrdma_drv = { .add = ocrdma_add, .remove = ocrdma_remove, .state_change_handler = ocrdma_event_handler, + .be_abi_version = OCRDMA_BE_ROCE_ABI_VERSION, }; static void ocrdma_unregister_inet6addr_notifier(void) @@ -553,20 +539,37 @@ static void ocrdma_unregister_inet6addr_notifier(void) #endif } +static void ocrdma_unregister_inetaddr_notifier(void) +{ + unregister_inetaddr_notifier(&ocrdma_inetaddr_notifier); +} + static int __init ocrdma_init_module(void) { int status; + ocrdma_init_debugfs(); + + status = register_inetaddr_notifier(&ocrdma_inetaddr_notifier); + if (status) + return status; + #if IS_ENABLED(CONFIG_IPV6) status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier); if (status) - return status; + goto err_notifier6; #endif status = be_roce_register_driver(&ocrdma_drv); if (status) - ocrdma_unregister_inet6addr_notifier(); + goto err_be_reg; + return 0; + +err_be_reg: + ocrdma_unregister_inet6addr_notifier(); +err_notifier6: + ocrdma_unregister_inetaddr_notifier(); return status; } @@ -574,6 +577,8 @@ static void __exit ocrdma_exit_module(void) { be_roce_unregister_driver(&ocrdma_drv); ocrdma_unregister_inet6addr_notifier(); + ocrdma_unregister_inetaddr_notifier(); + ocrdma_rem_debugfs(); } module_init(ocrdma_init_module); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index c75cbdfa87e..96c9ee602ba 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -30,8 +30,16 @@ #define Bit(_b) (1 << (_b)) -#define OCRDMA_GEN1_FAMILY 0xB -#define OCRDMA_GEN2_FAMILY 0x2 +enum { + OCRDMA_ASIC_GEN_SKH_R = 0x04, + OCRDMA_ASIC_GEN_LANCER = 0x0B +}; + +enum { + OCRDMA_ASIC_REV_A0 = 0x00, + OCRDMA_ASIC_REV_B0 = 0x10, + OCRDMA_ASIC_REV_C0 = 0x20 +}; #define OCRDMA_SUBSYS_ROCE 10 enum { @@ -64,21 +72,25 @@ enum { OCRDMA_CMD_ATTACH_MCAST, OCRDMA_CMD_DETACH_MCAST, + OCRDMA_CMD_GET_RDMA_STATS, OCRDMA_CMD_MAX }; #define OCRDMA_SUBSYS_COMMON 1 enum { + OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1 = 5, OCRDMA_CMD_CREATE_CQ = 12, OCRDMA_CMD_CREATE_EQ = 13, OCRDMA_CMD_CREATE_MQ = 21, + OCRDMA_CMD_GET_CTRL_ATTRIBUTES = 32, OCRDMA_CMD_GET_FW_VER = 35, OCRDMA_CMD_DELETE_MQ = 53, OCRDMA_CMD_DELETE_CQ = 54, OCRDMA_CMD_DELETE_EQ = 55, OCRDMA_CMD_GET_FW_CONFIG = 58, - OCRDMA_CMD_CREATE_MQ_EXT = 90 + OCRDMA_CMD_CREATE_MQ_EXT = 90, + OCRDMA_CMD_PHY_DETAILS = 102 }; enum { @@ -91,18 +103,21 @@ enum { #define OCRDMA_MAX_QP 2048 #define OCRDMA_MAX_CQ 2048 +#define OCRDMA_MAX_STAG 8192 enum { OCRDMA_DB_RQ_OFFSET = 0xE0, - OCRDMA_DB_GEN2_RQ1_OFFSET = 0x100, - OCRDMA_DB_GEN2_RQ2_OFFSET = 0xC0, + OCRDMA_DB_GEN2_RQ_OFFSET = 0x100, OCRDMA_DB_SQ_OFFSET = 0x60, OCRDMA_DB_GEN2_SQ_OFFSET = 0x1C0, OCRDMA_DB_SRQ_OFFSET = OCRDMA_DB_RQ_OFFSET, - OCRDMA_DB_GEN2_SRQ_OFFSET = OCRDMA_DB_GEN2_RQ1_OFFSET, + OCRDMA_DB_GEN2_SRQ_OFFSET = OCRDMA_DB_GEN2_RQ_OFFSET, OCRDMA_DB_CQ_OFFSET = 0x120, OCRDMA_DB_EQ_OFFSET = OCRDMA_DB_CQ_OFFSET, - OCRDMA_DB_MQ_OFFSET = 0x140 + OCRDMA_DB_MQ_OFFSET = 0x140, + + OCRDMA_DB_SQ_SHIFT = 16, + OCRDMA_DB_RQ_SHIFT = 24 }; #define OCRDMA_DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ @@ -137,14 +152,21 @@ enum { #define OCRDMA_MIN_Q_PAGE_SIZE (4096) #define OCRDMA_MAX_Q_PAGES (8) +#define OCRDMA_SLI_ASIC_ID_OFFSET 0x9C +#define OCRDMA_SLI_ASIC_REV_MASK 0x000000FF +#define OCRDMA_SLI_ASIC_GEN_NUM_MASK 0x0000FF00 +#define OCRDMA_SLI_ASIC_GEN_NUM_SHIFT 0x08 /* # 0: 4K Bytes # 1: 8K Bytes # 2: 16K Bytes # 3: 32K Bytes # 4: 64K Bytes +# 5: 128K Bytes +# 6: 256K Bytes +# 7: 512K Bytes */ -#define OCRDMA_MAX_Q_PAGE_SIZE_CNT (5) +#define OCRDMA_MAX_Q_PAGE_SIZE_CNT (8) #define OCRDMA_Q_PAGE_BASE_SIZE (OCRDMA_MIN_Q_PAGE_SIZE * OCRDMA_MAX_Q_PAGES) #define MAX_OCRDMA_QP_PAGES (8) @@ -177,7 +199,7 @@ struct ocrdma_mbx_hdr { u32 timeout; /* in seconds */ u32 cmd_len; u32 rsvd_version; -} __packed; +}; enum { OCRDMA_MBX_RSP_OPCODE_SHIFT = 0, @@ -197,7 +219,7 @@ struct ocrdma_mbx_rsp { u32 status; u32 rsp_len; u32 add_rsp_len; -} __packed; +}; enum { OCRDMA_MQE_EMBEDDED = 1, @@ -208,7 +230,7 @@ struct ocrdma_mqe_sge { u32 pa_lo; u32 pa_hi; u32 len; -} __packed; +}; enum { OCRDMA_MQE_HDR_EMB_SHIFT = 0, @@ -225,12 +247,12 @@ struct ocrdma_mqe_hdr { u32 tag_lo; u32 tag_hi; u32 rsvd3; -} __packed; +}; struct ocrdma_mqe_emb_cmd { struct ocrdma_mbx_hdr mch; u8 pyld[220]; -} __packed; +}; struct ocrdma_mqe { struct ocrdma_mqe_hdr hdr; @@ -242,7 +264,7 @@ struct ocrdma_mqe { u8 cmd[236]; struct ocrdma_mbx_rsp rsp; } u; -} __packed; +}; #define OCRDMA_EQ_LEN 4096 #define OCRDMA_MQ_CQ_LEN 256 @@ -259,12 +281,12 @@ struct ocrdma_mqe { struct ocrdma_delete_q_req { struct ocrdma_mbx_hdr req; u32 id; -} __packed; +}; struct ocrdma_pa { u32 lo; u32 hi; -} __packed; +}; #define MAX_OCRDMA_EQ_PAGES (8) struct ocrdma_create_eq_req { @@ -275,7 +297,7 @@ struct ocrdma_create_eq_req { u32 delay; u32 rsvd; struct ocrdma_pa pa[MAX_OCRDMA_EQ_PAGES]; -} __packed; +}; enum { OCRDMA_CREATE_EQ_VALID = Bit(29), @@ -310,7 +332,7 @@ struct ocrdma_mcqe { u32 tag_lo; u32 tag_hi; u32 valid_ae_cmpl_cons; -} __packed; +}; enum { OCRDMA_AE_MCQE_QPVALID = Bit(31), @@ -332,7 +354,21 @@ struct ocrdma_ae_mcqe { u32 cqvalid_cqid; u32 evt_tag; u32 valid_ae_event; -} __packed; +}; + +enum { + OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT = 0, + OCRDMA_AE_PVID_MCQE_ENABLED_MASK = 0xFF, + OCRDMA_AE_PVID_MCQE_TAG_SHIFT = 16, + OCRDMA_AE_PVID_MCQE_TAG_MASK = 0xFFFF << OCRDMA_AE_PVID_MCQE_TAG_SHIFT +}; + +struct ocrdma_ae_pvid_mcqe { + u32 tag_enabled; + u32 event_tag; + u32 rsvd1; + u32 rsvd2; +}; enum { OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT = 16, @@ -356,7 +392,7 @@ struct ocrdma_ae_mpa_mcqe { u32 w1; u32 w2; u32 valid_ae_event; -} __packed; +}; enum { OCRDMA_AE_QP_MCQE_NEW_QP_STATE_SHIFT = 0, @@ -382,9 +418,11 @@ struct ocrdma_ae_qp_mcqe { u32 w1; u32 w2; u32 valid_ae_event; -} __packed; +}; -#define OCRDMA_ASYNC_EVE_CODE 0x14 +#define OCRDMA_ASYNC_RDMA_EVE_CODE 0x14 +#define OCRDMA_ASYNC_GRP5_EVE_CODE 0x5 +#define OCRDMA_ASYNC_EVENT_PVID_STATE 0x3 enum OCRDMA_ASYNC_EVENT_TYPE { OCRDMA_CQ_ERROR = 0x00, @@ -487,7 +525,8 @@ struct ocrdma_mbx_query_config { u32 max_ird_ord_per_qp; u32 max_shared_ird_ord; u32 max_mr; - u64 max_mr_size; + u32 max_mr_size_lo; + u32 max_mr_size_hi; u32 max_num_mr_pbl; u32 max_mw; u32 max_fmr; @@ -502,14 +541,14 @@ struct ocrdma_mbx_query_config { u32 max_wqes_rqes_per_q; u32 max_cq_cqes_per_cq; u32 max_srq_rqe_sge; -} __packed; +}; struct ocrdma_fw_ver_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; u8 running_ver[32]; -} __packed; +}; struct ocrdma_fw_conf_rsp { struct ocrdma_mqe_hdr hdr; @@ -535,14 +574,65 @@ struct ocrdma_fw_conf_rsp { u32 base_eqid; u32 max_eq; -} __packed; +}; enum { OCRDMA_FN_MODE_RDMA = 0x4 }; +struct ocrdma_get_phy_info_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u16 phy_type; + u16 interface_type; + u32 misc_params; + u16 ext_phy_details; + u16 rsvd; + u16 auto_speeds_supported; + u16 fixed_speeds_supported; + u32 future_use[2]; +}; + +enum { + OCRDMA_PHY_SPEED_ZERO = 0x0, + OCRDMA_PHY_SPEED_10MBPS = 0x1, + OCRDMA_PHY_SPEED_100MBPS = 0x2, + OCRDMA_PHY_SPEED_1GBPS = 0x4, + OCRDMA_PHY_SPEED_10GBPS = 0x8, + OCRDMA_PHY_SPEED_40GBPS = 0x20 +}; + + +struct ocrdma_get_link_speed_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u8 pt_port_num; + u8 link_duplex; + u8 phys_port_speed; + u8 phys_port_fault; + u16 rsvd1; + u16 qos_lnk_speed; + u8 logical_lnk_status; + u8 rsvd2[3]; +}; + +enum { + OCRDMA_PHYS_LINK_SPEED_ZERO = 0x0, + OCRDMA_PHYS_LINK_SPEED_10MBPS = 0x1, + OCRDMA_PHYS_LINK_SPEED_100MBPS = 0x2, + OCRDMA_PHYS_LINK_SPEED_1GBPS = 0x3, + OCRDMA_PHYS_LINK_SPEED_10GBPS = 0x4, + OCRDMA_PHYS_LINK_SPEED_20GBPS = 0x5, + OCRDMA_PHYS_LINK_SPEED_25GBPS = 0x6, + OCRDMA_PHYS_LINK_SPEED_40GBPS = 0x7, + OCRDMA_PHYS_LINK_SPEED_100GBPS = 0x8 +}; + enum { OCRDMA_CREATE_CQ_VER2 = 2, + OCRDMA_CREATE_CQ_VER3 = 3, OCRDMA_CREATE_CQ_PAGE_CNT_MASK = 0xFFFF, OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT = 16, @@ -576,7 +666,8 @@ struct ocrdma_create_cq_cmd { u32 pgsz_pgcnt; u32 ev_cnt_flags; u32 eqn; - u32 cqe_count; + u16 cqe_count; + u16 pd_id; u32 rsvd6; struct ocrdma_pa pa[OCRDMA_CREATE_CQ_MAX_PAGES]; }; @@ -584,7 +675,7 @@ struct ocrdma_create_cq_cmd { struct ocrdma_create_cq { struct ocrdma_mqe_hdr hdr; struct ocrdma_create_cq_cmd cmd; -} __packed; +}; enum { OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK = 0xFFFF @@ -593,12 +684,12 @@ enum { struct ocrdma_create_cq_cmd_rsp { struct ocrdma_mbx_rsp rsp; u32 cq_id; -} __packed; +}; struct ocrdma_create_cq_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_create_cq_cmd_rsp rsp; -} __packed; +}; enum { OCRDMA_CREATE_MQ_V0_CQ_ID_SHIFT = 22, @@ -608,16 +699,8 @@ enum { OCRDMA_CREATE_MQ_ASYNC_CQ_VALID = Bit(0) }; -struct ocrdma_create_mq_v0 { - u32 pages; - u32 cqid_ringsize; - u32 valid; - u32 async_cqid_valid; - u32 rsvd; - struct ocrdma_pa pa[8]; -} __packed; - -struct ocrdma_create_mq_v1 { +struct ocrdma_create_mq_req { + struct ocrdma_mbx_hdr req; u32 cqid_pages; u32 async_event_bitmap; u32 async_cqid_ringsize; @@ -625,20 +708,12 @@ struct ocrdma_create_mq_v1 { u32 async_cqid_valid; u32 rsvd; struct ocrdma_pa pa[8]; -} __packed; - -struct ocrdma_create_mq_req { - struct ocrdma_mbx_hdr req; - union { - struct ocrdma_create_mq_v0 v0; - struct ocrdma_create_mq_v1 v1; - }; -} __packed; +}; struct ocrdma_create_mq_rsp { struct ocrdma_mbx_rsp rsp; u32 id; -} __packed; +}; enum { OCRDMA_DESTROY_CQ_QID_SHIFT = 0, @@ -653,12 +728,12 @@ struct ocrdma_destroy_cq { struct ocrdma_mbx_hdr req; u32 bypass_flush_qid; -} __packed; +}; struct ocrdma_destroy_cq_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; -} __packed; +}; enum { OCRDMA_QPT_GSI = 1, @@ -782,7 +857,7 @@ struct ocrdma_create_qp_req { u32 dpp_credits_cqid; u32 rpir_lkey; struct ocrdma_pa ird_addr[MAX_OCRDMA_IRD_PAGES]; -} __packed; +}; enum { OCRDMA_CREATE_QP_RSP_QP_ID_SHIFT = 0, @@ -836,18 +911,18 @@ struct ocrdma_create_qp_rsp { u32 max_ord_ird; u32 sq_rq_id; u32 dpp_response; -} __packed; +}; struct ocrdma_destroy_qp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_hdr req; u32 qp_id; -} __packed; +}; struct ocrdma_destroy_qp_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; -} __packed; +}; enum { OCRDMA_MODIFY_QP_ID_SHIFT = 0, @@ -991,7 +1066,7 @@ struct ocrdma_qp_params { u32 dmac_b0_to_b3; u32 vlan_dmac_b4_to_b5; u32 qkey; -} __packed; +}; struct ocrdma_modify_qp { @@ -1002,7 +1077,7 @@ struct ocrdma_modify_qp { u32 flags; u32 rdma_flags; u32 num_outstanding_atomic_rd; -} __packed; +}; enum { OCRDMA_MODIFY_QP_RSP_MAX_RQE_SHIFT = 0, @@ -1017,28 +1092,29 @@ enum { OCRDMA_MODIFY_QP_RSP_MAX_ORD_MASK = 0xFFFF << OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT }; + struct ocrdma_modify_qp_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; u32 max_wqe_rqe; u32 max_ord_ird; -} __packed; +}; struct ocrdma_query_qp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_hdr req; -#define OCRDMA_QUERY_UP_QP_ID_SHIFT 0 -#define OCRDMA_QUERY_UP_QP_ID_MASK 0xFFFFFF +#define OCRDMA_QUERY_UP_QP_ID_SHIFT 0 +#define OCRDMA_QUERY_UP_QP_ID_MASK 0xFFFFFF u32 qp_id; -} __packed; +}; struct ocrdma_query_qp_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; struct ocrdma_qp_params params; -} __packed; +}; enum { OCRDMA_CREATE_SRQ_PD_ID_SHIFT = 0, @@ -1067,7 +1143,7 @@ struct ocrdma_create_srq { u32 max_sge_rqe; u32 pages_rqe_sz; struct ocrdma_pa rq_addr[MAX_OCRDMA_SRQ_PAGES]; -} __packed; +}; enum { OCRDMA_CREATE_SRQ_RSP_SRQ_ID_SHIFT = 0, @@ -1086,7 +1162,7 @@ struct ocrdma_create_srq_rsp { u32 id; u32 max_sge_rqe_allocated; -} __packed; +}; enum { OCRDMA_MODIFY_SRQ_ID_SHIFT = 0, @@ -1105,7 +1181,7 @@ struct ocrdma_modify_srq { u32 id; u32 limit_max_rqe; -} __packed; +}; enum { OCRDMA_QUERY_SRQ_ID_SHIFT = 0, @@ -1117,7 +1193,7 @@ struct ocrdma_query_srq { struct ocrdma_mbx_rsp req; u32 id; -} __packed; +}; enum { OCRDMA_QUERY_SRQ_RSP_PD_ID_SHIFT = 0, @@ -1139,7 +1215,7 @@ struct ocrdma_query_srq_rsp { u32 max_rqe_pdid; u32 srq_lmt_max_sge; -} __packed; +}; enum { OCRDMA_DESTROY_SRQ_ID_SHIFT = 0, @@ -1151,7 +1227,7 @@ struct ocrdma_destroy_srq { struct ocrdma_mbx_rsp req; u32 id; -} __packed; +}; enum { OCRDMA_ALLOC_PD_ENABLE_DPP = BIT(16), @@ -1163,7 +1239,7 @@ struct ocrdma_alloc_pd { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_hdr req; u32 enable_dpp_rsvd; -} __packed; +}; enum { OCRDMA_ALLOC_PD_RSP_DPP = Bit(16), @@ -1175,18 +1251,18 @@ struct ocrdma_alloc_pd_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; u32 dpp_page_pdid; -} __packed; +}; struct ocrdma_dealloc_pd { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_hdr req; u32 id; -} __packed; +}; struct ocrdma_dealloc_pd_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; -} __packed; +}; enum { OCRDMA_ADDR_CHECK_ENABLE = 1, @@ -1222,7 +1298,7 @@ struct ocrdma_alloc_lkey { u32 pdid; u32 pbl_sz_flags; -} __packed; +}; struct ocrdma_alloc_lkey_rsp { struct ocrdma_mqe_hdr hdr; @@ -1230,7 +1306,7 @@ struct ocrdma_alloc_lkey_rsp { u32 lrkey; u32 num_pbl_rsvd; -} __packed; +}; struct ocrdma_dealloc_lkey { struct ocrdma_mqe_hdr hdr; @@ -1238,12 +1314,12 @@ struct ocrdma_dealloc_lkey { u32 lkey; u32 rsvd_frmr; -} __packed; +}; struct ocrdma_dealloc_lkey_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; -} __packed; +}; #define MAX_OCRDMA_NSMR_PBL (u32)22 #define MAX_OCRDMA_PBL_SIZE 65536 @@ -1289,7 +1365,7 @@ struct ocrdma_reg_nsmr { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_hdr cmd; - u32 lrkey_key_index; + u32 fr_mr; u32 num_pbl_pdid; u32 flags_hpage_pbe_sz; u32 totlen_low; @@ -1299,7 +1375,7 @@ struct ocrdma_reg_nsmr { u32 va_loaddr; u32 va_hiaddr; struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL]; -} __packed; +}; enum { OCRDMA_REG_NSMR_CONT_PBL_SHIFT = 0, @@ -1321,12 +1397,12 @@ struct ocrdma_reg_nsmr_cont { u32 last; struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL]; -} __packed; +}; struct ocrdma_pbe { u32 pa_hi; u32 pa_lo; -} __packed; +}; enum { OCRDMA_REG_NSMR_RSP_NUM_PBL_SHIFT = 16, @@ -1338,7 +1414,7 @@ struct ocrdma_reg_nsmr_rsp { u32 lrkey; u32 num_pbl; -} __packed; +}; enum { OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_SHIFT = 0, @@ -1358,7 +1434,7 @@ struct ocrdma_reg_nsmr_cont_rsp { u32 lrkey_key_index; u32 num_pbl; -} __packed; +}; enum { OCRDMA_ALLOC_MW_PD_ID_SHIFT = 0, @@ -1370,7 +1446,7 @@ struct ocrdma_alloc_mw { struct ocrdma_mbx_hdr req; u32 pdid; -} __packed; +}; enum { OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_SHIFT = 0, @@ -1382,7 +1458,7 @@ struct ocrdma_alloc_mw_rsp { struct ocrdma_mbx_rsp rsp; u32 lrkey_index; -} __packed; +}; struct ocrdma_attach_mcast { struct ocrdma_mqe_hdr hdr; @@ -1391,12 +1467,12 @@ struct ocrdma_attach_mcast { u8 mgid[16]; u32 mac_b0_to_b3; u32 vlan_mac_b4_to_b5; -} __packed; +}; struct ocrdma_attach_mcast_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; -} __packed; +}; struct ocrdma_detach_mcast { struct ocrdma_mqe_hdr hdr; @@ -1405,12 +1481,12 @@ struct ocrdma_detach_mcast { u8 mgid[16]; u32 mac_b0_to_b3; u32 vlan_mac_b4_to_b5; -} __packed; +}; struct ocrdma_detach_mcast_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; -} __packed; +}; enum { OCRDMA_CREATE_AH_NUM_PAGES_SHIFT = 19, @@ -1434,24 +1510,24 @@ struct ocrdma_create_ah_tbl { u32 ah_conf; struct ocrdma_pa tbl_addr[8]; -} __packed; +}; struct ocrdma_create_ah_tbl_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; u32 ahid; -} __packed; +}; struct ocrdma_delete_ah_tbl { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_hdr req; u32 ahid; -} __packed; +}; struct ocrdma_delete_ah_tbl_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; -} __packed; +}; enum { OCRDMA_EQE_VALID_SHIFT = 0, @@ -1464,7 +1540,7 @@ enum { struct ocrdma_eqe { u32 id_valid; -} __packed; +}; enum OCRDMA_CQE_STATUS { OCRDMA_CQE_SUCCESS = 0, @@ -1548,29 +1624,14 @@ struct ocrdma_cqe { } cmn; }; u32 flags_status_srcqpn; /* w3 */ -} __packed; - -#define is_cqe_valid(cq, cqe) \ - (((le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_VALID)\ - == cq->phase) ? 1 : 0) -#define is_cqe_for_sq(cqe) \ - ((le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_QTYPE) ? 0 : 1) -#define is_cqe_for_rq(cqe) \ - ((le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_QTYPE) ? 1 : 0) -#define is_cqe_invalidated(cqe) \ - ((le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_INVALIDATE) ? \ - 1 : 0) -#define is_cqe_imm(cqe) \ - ((le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_IMM) ? 1 : 0) -#define is_cqe_wr_imm(cqe) \ - ((le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_WRITE_IMM) ? 1 : 0) +}; struct ocrdma_sge { u32 addr_hi; u32 addr_lo; u32 lrkey; u32 len; -} __packed; +}; enum { OCRDMA_FLAG_SIG = 0x1, @@ -1594,6 +1655,7 @@ enum OCRDMA_WQE_OPCODE { OCRDMA_SEND = 0x00, OCRDMA_CMP_SWP = 0x14, OCRDMA_BIND_MW = 0x10, + OCRDMA_FR_MR = 0x11, OCRDMA_RESV1 = 0x0A, OCRDMA_LKEY_INV = 0x15, OCRDMA_FETCH_ADD = 0x13, @@ -1631,14 +1693,26 @@ struct ocrdma_hdr_wqe { u32 lkey; }; u32 total_len; -} __packed; +}; struct ocrdma_ewqe_ud_hdr { u32 rsvd_dest_qpn; u32 qkey; u32 rsvd_ahid; u32 rsvd; -} __packed; +}; + +/* extended wqe followed by hdr_wqe for Fast Memory register */ +struct ocrdma_ewqe_fr { + u32 va_hi; + u32 va_lo; + u32 fbo_hi; + u32 fbo_lo; + u32 size_sge; + u32 num_sges; + u32 rsvd; + u32 rsvd2; +}; struct ocrdma_eth_basic { u8 dmac[6]; @@ -1663,7 +1737,7 @@ struct ocrdma_grh { u16 rsvd; } __packed; -#define OCRDMA_AV_VALID Bit(0) +#define OCRDMA_AV_VALID Bit(7) #define OCRDMA_AV_VLAN_VALID Bit(1) struct ocrdma_av { @@ -1672,4 +1746,208 @@ struct ocrdma_av { u32 valid; } __packed; +struct ocrdma_rsrc_stats { + u32 dpp_pds; + u32 non_dpp_pds; + u32 rc_dpp_qps; + u32 uc_dpp_qps; + u32 ud_dpp_qps; + u32 rc_non_dpp_qps; + u32 rsvd; + u32 uc_non_dpp_qps; + u32 ud_non_dpp_qps; + u32 rsvd1; + u32 srqs; + u32 rbqs; + u32 r64K_nsmr; + u32 r64K_to_2M_nsmr; + u32 r2M_to_44M_nsmr; + u32 r44M_to_1G_nsmr; + u32 r1G_to_4G_nsmr; + u32 nsmr_count_4G_to_32G; + u32 r32G_to_64G_nsmr; + u32 r64G_to_128G_nsmr; + u32 r128G_to_higher_nsmr; + u32 embedded_nsmr; + u32 frmr; + u32 prefetch_qps; + u32 ondemand_qps; + u32 phy_mr; + u32 mw; + u32 rsvd2[7]; +}; + +struct ocrdma_db_err_stats { + u32 sq_doorbell_errors; + u32 cq_doorbell_errors; + u32 rq_srq_doorbell_errors; + u32 cq_overflow_errors; + u32 rsvd[4]; +}; + +struct ocrdma_wqe_stats { + u32 large_send_rc_wqes_lo; + u32 large_send_rc_wqes_hi; + u32 large_write_rc_wqes_lo; + u32 large_write_rc_wqes_hi; + u32 rsvd[4]; + u32 read_wqes_lo; + u32 read_wqes_hi; + u32 frmr_wqes_lo; + u32 frmr_wqes_hi; + u32 mw_bind_wqes_lo; + u32 mw_bind_wqes_hi; + u32 invalidate_wqes_lo; + u32 invalidate_wqes_hi; + u32 rsvd1[2]; + u32 dpp_wqe_drops; + u32 rsvd2[5]; +}; + +struct ocrdma_tx_stats { + u32 send_pkts_lo; + u32 send_pkts_hi; + u32 write_pkts_lo; + u32 write_pkts_hi; + u32 read_pkts_lo; + u32 read_pkts_hi; + u32 read_rsp_pkts_lo; + u32 read_rsp_pkts_hi; + u32 ack_pkts_lo; + u32 ack_pkts_hi; + u32 send_bytes_lo; + u32 send_bytes_hi; + u32 write_bytes_lo; + u32 write_bytes_hi; + u32 read_req_bytes_lo; + u32 read_req_bytes_hi; + u32 read_rsp_bytes_lo; + u32 read_rsp_bytes_hi; + u32 ack_timeouts; + u32 rsvd[5]; +}; + + +struct ocrdma_tx_qp_err_stats { + u32 local_length_errors; + u32 local_protection_errors; + u32 local_qp_operation_errors; + u32 retry_count_exceeded_errors; + u32 rnr_retry_count_exceeded_errors; + u32 rsvd[3]; +}; + +struct ocrdma_rx_stats { + u32 roce_frame_bytes_lo; + u32 roce_frame_bytes_hi; + u32 roce_frame_icrc_drops; + u32 roce_frame_payload_len_drops; + u32 ud_drops; + u32 qp1_drops; + u32 psn_error_request_packets; + u32 psn_error_resp_packets; + u32 rnr_nak_timeouts; + u32 rnr_nak_receives; + u32 roce_frame_rxmt_drops; + u32 nak_count_psn_sequence_errors; + u32 rc_drop_count_lookup_errors; + u32 rq_rnr_naks; + u32 srq_rnr_naks; + u32 roce_frames_lo; + u32 roce_frames_hi; + u32 rsvd; +}; + +struct ocrdma_rx_qp_err_stats { + u32 nak_invalid_requst_errors; + u32 nak_remote_operation_errors; + u32 nak_count_remote_access_errors; + u32 local_length_errors; + u32 local_protection_errors; + u32 local_qp_operation_errors; + u32 rsvd[2]; +}; + +struct ocrdma_tx_dbg_stats { + u32 data[100]; +}; + +struct ocrdma_rx_dbg_stats { + u32 data[200]; +}; + +struct ocrdma_rdma_stats_req { + struct ocrdma_mbx_hdr hdr; + u8 reset_stats; + u8 rsvd[3]; +} __packed; + +struct ocrdma_rdma_stats_resp { + struct ocrdma_mbx_hdr hdr; + struct ocrdma_rsrc_stats act_rsrc_stats; + struct ocrdma_rsrc_stats th_rsrc_stats; + struct ocrdma_db_err_stats db_err_stats; + struct ocrdma_wqe_stats wqe_stats; + struct ocrdma_tx_stats tx_stats; + struct ocrdma_tx_qp_err_stats tx_qp_err_stats; + struct ocrdma_rx_stats rx_stats; + struct ocrdma_rx_qp_err_stats rx_qp_err_stats; + struct ocrdma_tx_dbg_stats tx_dbg_stats; + struct ocrdma_rx_dbg_stats rx_dbg_stats; +} __packed; + + +struct mgmt_hba_attribs { + u8 flashrom_version_string[32]; + u8 manufacturer_name[32]; + u32 supported_modes; + u32 rsvd0[3]; + u8 ncsi_ver_string[12]; + u32 default_extended_timeout; + u8 controller_model_number[32]; + u8 controller_description[64]; + u8 controller_serial_number[32]; + u8 ip_version_string[32]; + u8 firmware_version_string[32]; + u8 bios_version_string[32]; + u8 redboot_version_string[32]; + u8 driver_version_string[32]; + u8 fw_on_flash_version_string[32]; + u32 functionalities_supported; + u16 max_cdblength; + u8 asic_revision; + u8 generational_guid[16]; + u8 hba_port_count; + u16 default_link_down_timeout; + u8 iscsi_ver_min_max; + u8 multifunction_device; + u8 cache_valid; + u8 hba_status; + u8 max_domains_supported; + u8 phy_port; + u32 firmware_post_status; + u32 hba_mtu[8]; + u32 rsvd1[4]; +}; + +struct mgmt_controller_attrib { + struct mgmt_hba_attribs hba_attribs; + u16 pci_vendor_id; + u16 pci_device_id; + u16 pci_sub_vendor_id; + u16 pci_sub_system_id; + u8 pci_bus_number; + u8 pci_device_number; + u8 pci_function_number; + u8 interface_type; + u64 unique_identifier; + u32 rsvd0[5]; +}; + +struct ocrdma_get_ctrl_attribs_rsp { + struct ocrdma_mbx_hdr hdr; + struct mgmt_controller_attrib ctrl_attribs; +}; + + #endif /* __OCRDMA_SLI_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c new file mode 100644 index 00000000000..41a9aec9998 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -0,0 +1,616 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2014 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include <rdma/ib_addr.h> +#include "ocrdma_stats.h" + +static struct dentry *ocrdma_dbgfs_dir; + +static int ocrdma_add_stat(char *start, char *pcur, + char *name, u64 count) +{ + char buff[128] = {0}; + int cpy_len = 0; + + snprintf(buff, 128, "%s: %llu\n", name, count); + cpy_len = strlen(buff); + + if (pcur + cpy_len > start + OCRDMA_MAX_DBGFS_MEM) { + pr_err("%s: No space in stats buff\n", __func__); + return 0; + } + + memcpy(pcur, buff, cpy_len); + return cpy_len; +} + +static bool ocrdma_alloc_stats_mem(struct ocrdma_dev *dev) +{ + struct stats_mem *mem = &dev->stats_mem; + + /* Alloc mbox command mem*/ + mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req), + sizeof(struct ocrdma_rdma_stats_resp)); + + mem->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, mem->size, + &mem->pa, GFP_KERNEL); + if (!mem->va) { + pr_err("%s: stats mbox allocation failed\n", __func__); + return false; + } + + memset(mem->va, 0, mem->size); + + /* Alloc debugfs mem */ + mem->debugfs_mem = kzalloc(OCRDMA_MAX_DBGFS_MEM, GFP_KERNEL); + if (!mem->debugfs_mem) { + pr_err("%s: stats debugfs mem allocation failed\n", __func__); + return false; + } + + return true; +} + +static void ocrdma_release_stats_mem(struct ocrdma_dev *dev) +{ + struct stats_mem *mem = &dev->stats_mem; + + if (mem->va) + dma_free_coherent(&dev->nic_info.pdev->dev, mem->size, + mem->va, mem->pa); + kfree(mem->debugfs_mem); +} + +static char *ocrdma_resource_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "active_dpp_pds", + (u64)rsrc_stats->dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "active_non_dpp_pds", + (u64)rsrc_stats->non_dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "active_rc_dpp_qps", + (u64)rsrc_stats->rc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_uc_dpp_qps", + (u64)rsrc_stats->uc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ud_dpp_qps", + (u64)rsrc_stats->ud_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_rc_non_dpp_qps", + (u64)rsrc_stats->rc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_uc_non_dpp_qps", + (u64)rsrc_stats->uc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ud_non_dpp_qps", + (u64)rsrc_stats->ud_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_srqs", + (u64)rsrc_stats->srqs); + pcur += ocrdma_add_stat(stats, pcur, "active_rbqs", + (u64)rsrc_stats->rbqs); + pcur += ocrdma_add_stat(stats, pcur, "active_64K_nsmr", + (u64)rsrc_stats->r64K_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_64K_to_2M_nsmr", + (u64)rsrc_stats->r64K_to_2M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_2M_to_44M_nsmr", + (u64)rsrc_stats->r2M_to_44M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_44M_to_1G_nsmr", + (u64)rsrc_stats->r44M_to_1G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_1G_to_4G_nsmr", + (u64)rsrc_stats->r1G_to_4G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_nsmr_count_4G_to_32G", + (u64)rsrc_stats->nsmr_count_4G_to_32G); + pcur += ocrdma_add_stat(stats, pcur, "active_32G_to_64G_nsmr", + (u64)rsrc_stats->r32G_to_64G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_64G_to_128G_nsmr", + (u64)rsrc_stats->r64G_to_128G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_128G_to_higher_nsmr", + (u64)rsrc_stats->r128G_to_higher_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_embedded_nsmr", + (u64)rsrc_stats->embedded_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_frmr", + (u64)rsrc_stats->frmr); + pcur += ocrdma_add_stat(stats, pcur, "active_prefetch_qps", + (u64)rsrc_stats->prefetch_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ondemand_qps", + (u64)rsrc_stats->ondemand_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_phy_mr", + (u64)rsrc_stats->phy_mr); + pcur += ocrdma_add_stat(stats, pcur, "active_mw", + (u64)rsrc_stats->mw); + + /* Print the threshold stats */ + rsrc_stats = &rdma_stats->th_rsrc_stats; + + pcur += ocrdma_add_stat(stats, pcur, "threshold_dpp_pds", + (u64)rsrc_stats->dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "threshold_non_dpp_pds", + (u64)rsrc_stats->non_dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_dpp_qps", + (u64)rsrc_stats->rc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_dpp_qps", + (u64)rsrc_stats->uc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_dpp_qps", + (u64)rsrc_stats->ud_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_non_dpp_qps", + (u64)rsrc_stats->rc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_non_dpp_qps", + (u64)rsrc_stats->uc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_non_dpp_qps", + (u64)rsrc_stats->ud_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_srqs", + (u64)rsrc_stats->srqs); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rbqs", + (u64)rsrc_stats->rbqs); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_nsmr", + (u64)rsrc_stats->r64K_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_to_2M_nsmr", + (u64)rsrc_stats->r64K_to_2M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_2M_to_44M_nsmr", + (u64)rsrc_stats->r2M_to_44M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_44M_to_1G_nsmr", + (u64)rsrc_stats->r44M_to_1G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_1G_to_4G_nsmr", + (u64)rsrc_stats->r1G_to_4G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_nsmr_count_4G_to_32G", + (u64)rsrc_stats->nsmr_count_4G_to_32G); + pcur += ocrdma_add_stat(stats, pcur, "threshold_32G_to_64G_nsmr", + (u64)rsrc_stats->r32G_to_64G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64G_to_128G_nsmr", + (u64)rsrc_stats->r64G_to_128G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_128G_to_higher_nsmr", + (u64)rsrc_stats->r128G_to_higher_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_embedded_nsmr", + (u64)rsrc_stats->embedded_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_frmr", + (u64)rsrc_stats->frmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_prefetch_qps", + (u64)rsrc_stats->prefetch_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ondemand_qps", + (u64)rsrc_stats->ondemand_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_phy_mr", + (u64)rsrc_stats->phy_mr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_mw", + (u64)rsrc_stats->mw); + return stats; +} + +static char *ocrdma_rx_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat + (stats, pcur, "roce_frame_bytes", + convert_to_64bit(rx_stats->roce_frame_bytes_lo, + rx_stats->roce_frame_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_icrc_drops", + (u64)rx_stats->roce_frame_icrc_drops); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_payload_len_drops", + (u64)rx_stats->roce_frame_payload_len_drops); + pcur += ocrdma_add_stat(stats, pcur, "ud_drops", + (u64)rx_stats->ud_drops); + pcur += ocrdma_add_stat(stats, pcur, "qp1_drops", + (u64)rx_stats->qp1_drops); + pcur += ocrdma_add_stat(stats, pcur, "psn_error_request_packets", + (u64)rx_stats->psn_error_request_packets); + pcur += ocrdma_add_stat(stats, pcur, "psn_error_resp_packets", + (u64)rx_stats->psn_error_resp_packets); + pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_timeouts", + (u64)rx_stats->rnr_nak_timeouts); + pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_receives", + (u64)rx_stats->rnr_nak_receives); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_rxmt_drops", + (u64)rx_stats->roce_frame_rxmt_drops); + pcur += ocrdma_add_stat(stats, pcur, "nak_count_psn_sequence_errors", + (u64)rx_stats->nak_count_psn_sequence_errors); + pcur += ocrdma_add_stat(stats, pcur, "rc_drop_count_lookup_errors", + (u64)rx_stats->rc_drop_count_lookup_errors); + pcur += ocrdma_add_stat(stats, pcur, "rq_rnr_naks", + (u64)rx_stats->rq_rnr_naks); + pcur += ocrdma_add_stat(stats, pcur, "srq_rnr_naks", + (u64)rx_stats->srq_rnr_naks); + pcur += ocrdma_add_stat(stats, pcur, "roce_frames", + convert_to_64bit(rx_stats->roce_frames_lo, + rx_stats->roce_frames_hi)); + + return stats; +} + +static char *ocrdma_tx_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "send_pkts", + convert_to_64bit(tx_stats->send_pkts_lo, + tx_stats->send_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "write_pkts", + convert_to_64bit(tx_stats->write_pkts_lo, + tx_stats->write_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_pkts", + convert_to_64bit(tx_stats->read_pkts_lo, + tx_stats->read_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_rsp_pkts", + convert_to_64bit(tx_stats->read_rsp_pkts_lo, + tx_stats->read_rsp_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "ack_pkts", + convert_to_64bit(tx_stats->ack_pkts_lo, + tx_stats->ack_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "send_bytes", + convert_to_64bit(tx_stats->send_bytes_lo, + tx_stats->send_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "write_bytes", + convert_to_64bit(tx_stats->write_bytes_lo, + tx_stats->write_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_req_bytes", + convert_to_64bit(tx_stats->read_req_bytes_lo, + tx_stats->read_req_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_rsp_bytes", + convert_to_64bit(tx_stats->read_rsp_bytes_lo, + tx_stats->read_rsp_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "ack_timeouts", + (u64)tx_stats->ack_timeouts); + + return stats; +} + +static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_wqe_stats *wqe_stats = &rdma_stats->wqe_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "large_send_rc_wqes", + convert_to_64bit(wqe_stats->large_send_rc_wqes_lo, + wqe_stats->large_send_rc_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "large_write_rc_wqes", + convert_to_64bit(wqe_stats->large_write_rc_wqes_lo, + wqe_stats->large_write_rc_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_wqes", + convert_to_64bit(wqe_stats->read_wqes_lo, + wqe_stats->read_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "frmr_wqes", + convert_to_64bit(wqe_stats->frmr_wqes_lo, + wqe_stats->frmr_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "mw_bind_wqes", + convert_to_64bit(wqe_stats->mw_bind_wqes_lo, + wqe_stats->mw_bind_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "invalidate_wqes", + convert_to_64bit(wqe_stats->invalidate_wqes_lo, + wqe_stats->invalidate_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "dpp_wqe_drops", + (u64)wqe_stats->dpp_wqe_drops); + return stats; +} + +static char *ocrdma_db_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_db_err_stats *db_err_stats = &rdma_stats->db_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "sq_doorbell_errors", + (u64)db_err_stats->sq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "cq_doorbell_errors", + (u64)db_err_stats->cq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "rq_srq_doorbell_errors", + (u64)db_err_stats->rq_srq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "cq_overflow_errors", + (u64)db_err_stats->cq_overflow_errors); + return stats; +} + +static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_qp_err_stats *rx_qp_err_stats = + &rdma_stats->rx_qp_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "nak_invalid_requst_errors", + (u64)rx_qp_err_stats->nak_invalid_requst_errors); + pcur += ocrdma_add_stat(stats, pcur, "nak_remote_operation_errors", + (u64)rx_qp_err_stats->nak_remote_operation_errors); + pcur += ocrdma_add_stat(stats, pcur, "nak_count_remote_access_errors", + (u64)rx_qp_err_stats->nak_count_remote_access_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", + (u64)rx_qp_err_stats->local_length_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", + (u64)rx_qp_err_stats->local_protection_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", + (u64)rx_qp_err_stats->local_qp_operation_errors); + return stats; +} + +static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_qp_err_stats *tx_qp_err_stats = + &rdma_stats->tx_qp_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", + (u64)tx_qp_err_stats->local_length_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", + (u64)tx_qp_err_stats->local_protection_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", + (u64)tx_qp_err_stats->local_qp_operation_errors); + pcur += ocrdma_add_stat(stats, pcur, "retry_count_exceeded_errors", + (u64)tx_qp_err_stats->retry_count_exceeded_errors); + pcur += ocrdma_add_stat(stats, pcur, "rnr_retry_count_exceeded_errors", + (u64)tx_qp_err_stats->rnr_retry_count_exceeded_errors); + return stats; +} + +static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) +{ + int i; + char *pstats = dev->stats_mem.debugfs_mem; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_dbg_stats *tx_dbg_stats = + &rdma_stats->tx_dbg_stats; + + memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + for (i = 0; i < 100; i++) + pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, + tx_dbg_stats->data[i]); + + return dev->stats_mem.debugfs_mem; +} + +static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) +{ + int i; + char *pstats = dev->stats_mem.debugfs_mem; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_dbg_stats *rx_dbg_stats = + &rdma_stats->rx_dbg_stats; + + memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + for (i = 0; i < 200; i++) + pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, + rx_dbg_stats->data[i]); + + return dev->stats_mem.debugfs_mem; +} + +static void ocrdma_update_stats(struct ocrdma_dev *dev) +{ + ulong now = jiffies, secs; + int status = 0; + + secs = jiffies_to_msecs(now - dev->last_stats_time) / 1000U; + if (secs) { + /* update */ + status = ocrdma_mbx_rdma_stats(dev, false); + if (status) + pr_err("%s: stats mbox failed with status = %d\n", + __func__, status); + dev->last_stats_time = jiffies; + } +} + +static ssize_t ocrdma_dbgfs_ops_read(struct file *filp, char __user *buffer, + size_t usr_buf_len, loff_t *ppos) +{ + struct ocrdma_stats *pstats = filp->private_data; + struct ocrdma_dev *dev = pstats->dev; + ssize_t status = 0; + char *data = NULL; + + /* No partial reads */ + if (*ppos != 0) + return 0; + + mutex_lock(&dev->stats_lock); + + ocrdma_update_stats(dev); + + switch (pstats->type) { + case OCRDMA_RSRC_STATS: + data = ocrdma_resource_stats(dev); + break; + case OCRDMA_RXSTATS: + data = ocrdma_rx_stats(dev); + break; + case OCRDMA_WQESTATS: + data = ocrdma_wqe_stats(dev); + break; + case OCRDMA_TXSTATS: + data = ocrdma_tx_stats(dev); + break; + case OCRDMA_DB_ERRSTATS: + data = ocrdma_db_errstats(dev); + break; + case OCRDMA_RXQP_ERRSTATS: + data = ocrdma_rxqp_errstats(dev); + break; + case OCRDMA_TXQP_ERRSTATS: + data = ocrdma_txqp_errstats(dev); + break; + case OCRDMA_TX_DBG_STATS: + data = ocrdma_tx_dbg_stats(dev); + break; + case OCRDMA_RX_DBG_STATS: + data = ocrdma_rx_dbg_stats(dev); + break; + + default: + status = -EFAULT; + goto exit; + } + + if (usr_buf_len < strlen(data)) { + status = -ENOSPC; + goto exit; + } + + status = simple_read_from_buffer(buffer, usr_buf_len, ppos, data, + strlen(data)); +exit: + mutex_unlock(&dev->stats_lock); + return status; +} + +static const struct file_operations ocrdma_dbg_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = ocrdma_dbgfs_ops_read, +}; + +void ocrdma_add_port_stats(struct ocrdma_dev *dev) +{ + if (!ocrdma_dbgfs_dir) + return; + + /* Create post stats base dir */ + dev->dir = debugfs_create_dir(dev->ibdev.name, ocrdma_dbgfs_dir); + if (!dev->dir) + goto err; + + dev->rsrc_stats.type = OCRDMA_RSRC_STATS; + dev->rsrc_stats.dev = dev; + if (!debugfs_create_file("resource_stats", S_IRUSR, dev->dir, + &dev->rsrc_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_stats.type = OCRDMA_RXSTATS; + dev->rx_stats.dev = dev; + if (!debugfs_create_file("rx_stats", S_IRUSR, dev->dir, + &dev->rx_stats, &ocrdma_dbg_ops)) + goto err; + + dev->wqe_stats.type = OCRDMA_WQESTATS; + dev->wqe_stats.dev = dev; + if (!debugfs_create_file("wqe_stats", S_IRUSR, dev->dir, + &dev->wqe_stats, &ocrdma_dbg_ops)) + goto err; + + dev->tx_stats.type = OCRDMA_TXSTATS; + dev->tx_stats.dev = dev; + if (!debugfs_create_file("tx_stats", S_IRUSR, dev->dir, + &dev->tx_stats, &ocrdma_dbg_ops)) + goto err; + + dev->db_err_stats.type = OCRDMA_DB_ERRSTATS; + dev->db_err_stats.dev = dev; + if (!debugfs_create_file("db_err_stats", S_IRUSR, dev->dir, + &dev->db_err_stats, &ocrdma_dbg_ops)) + goto err; + + + dev->tx_qp_err_stats.type = OCRDMA_TXQP_ERRSTATS; + dev->tx_qp_err_stats.dev = dev; + if (!debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir, + &dev->tx_qp_err_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_qp_err_stats.type = OCRDMA_RXQP_ERRSTATS; + dev->rx_qp_err_stats.dev = dev; + if (!debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir, + &dev->rx_qp_err_stats, &ocrdma_dbg_ops)) + goto err; + + + dev->tx_dbg_stats.type = OCRDMA_TX_DBG_STATS; + dev->tx_dbg_stats.dev = dev; + if (!debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir, + &dev->tx_dbg_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_dbg_stats.type = OCRDMA_RX_DBG_STATS; + dev->rx_dbg_stats.dev = dev; + if (!debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir, + &dev->rx_dbg_stats, &ocrdma_dbg_ops)) + goto err; + + /* Now create dma_mem for stats mbx command */ + if (!ocrdma_alloc_stats_mem(dev)) + goto err; + + mutex_init(&dev->stats_lock); + + return; +err: + ocrdma_release_stats_mem(dev); + debugfs_remove_recursive(dev->dir); + dev->dir = NULL; +} + +void ocrdma_rem_port_stats(struct ocrdma_dev *dev) +{ + if (!dev->dir) + return; + mutex_destroy(&dev->stats_lock); + ocrdma_release_stats_mem(dev); + debugfs_remove(dev->dir); +} + +void ocrdma_init_debugfs(void) +{ + /* Create base dir in debugfs root dir */ + ocrdma_dbgfs_dir = debugfs_create_dir("ocrdma", NULL); +} + +void ocrdma_rem_debugfs(void) +{ + debugfs_remove_recursive(ocrdma_dbgfs_dir); +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.h b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h new file mode 100644 index 00000000000..5f5e20c46d7 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h @@ -0,0 +1,54 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2014 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_STATS_H__ +#define __OCRDMA_STATS_H__ + +#include <linux/debugfs.h> +#include "ocrdma.h" +#include "ocrdma_hw.h" + +#define OCRDMA_MAX_DBGFS_MEM 4096 + +enum OCRDMA_STATS_TYPE { + OCRDMA_RSRC_STATS, + OCRDMA_RXSTATS, + OCRDMA_WQESTATS, + OCRDMA_TXSTATS, + OCRDMA_DB_ERRSTATS, + OCRDMA_RXQP_ERRSTATS, + OCRDMA_TXQP_ERRSTATS, + OCRDMA_TX_DBG_STATS, + OCRDMA_RX_DBG_STATS +}; + +void ocrdma_rem_debugfs(void); +void ocrdma_init_debugfs(void); +void ocrdma_rem_port_stats(struct ocrdma_dev *dev); +void ocrdma_add_port_stats(struct ocrdma_dev *dev); + +#endif /* __OCRDMA_STATS_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index b29a4246ef4..edf6211d84b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -53,7 +53,7 @@ int ocrdma_query_gid(struct ib_device *ibdev, u8 port, dev = get_ocrdma_dev(ibdev); memset(sgid, 0, sizeof(*sgid)); - if (index >= OCRDMA_MAX_SGID) + if (index > OCRDMA_MAX_SGID) return -EINVAL; memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid)); @@ -75,20 +75,21 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) attr->vendor_part_id = dev->nic_info.pdev->device; attr->hw_ver = 0; attr->max_qp = dev->attr.max_qp; - attr->max_ah = dev->attr.max_qp; + attr->max_ah = OCRDMA_MAX_AH; attr->max_qp_wr = dev->attr.max_wqe; attr->device_cap_flags = IB_DEVICE_CURR_QP_STATE_MOD | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_SHUTDOWN_PORT | IB_DEVICE_SYS_IMAGE_GUID | - IB_DEVICE_LOCAL_DMA_LKEY; + IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_MGT_EXTENSIONS; attr->max_sge = min(dev->attr.max_send_sge, dev->attr.max_srq_sge); attr->max_sge_rd = 0; attr->max_cq = dev->attr.max_cq; attr->max_cqe = dev->attr.max_cqe; attr->max_mr = dev->attr.max_mr; - attr->max_mw = 0; + attr->max_mw = dev->attr.max_mw; attr->max_pd = dev->attr.max_pd; attr->atomic_cap = 0; attr->max_fmr = 0; @@ -96,7 +97,7 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) attr->max_qp_rd_atom = min(dev->attr.max_ord_per_qp, dev->attr.max_ird_per_qp); attr->max_qp_init_rd_atom = dev->attr.max_ord_per_qp; - attr->max_srq = (dev->attr.max_qp - 1); + attr->max_srq = dev->attr.max_srq; attr->max_srq_sge = dev->attr.max_srq_sge; attr->max_srq_wr = dev->attr.max_rqe; attr->local_ca_ack_delay = dev->attr.local_ca_ack_delay; @@ -105,6 +106,44 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) return 0; } +static inline void get_link_speed_and_width(struct ocrdma_dev *dev, + u8 *ib_speed, u8 *ib_width) +{ + int status; + u8 speed; + + status = ocrdma_mbx_get_link_speed(dev, &speed); + if (status) + speed = OCRDMA_PHYS_LINK_SPEED_ZERO; + + switch (speed) { + case OCRDMA_PHYS_LINK_SPEED_1GBPS: + *ib_speed = IB_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + break; + + case OCRDMA_PHYS_LINK_SPEED_10GBPS: + *ib_speed = IB_SPEED_QDR; + *ib_width = IB_WIDTH_1X; + break; + + case OCRDMA_PHYS_LINK_SPEED_20GBPS: + *ib_speed = IB_SPEED_DDR; + *ib_width = IB_WIDTH_4X; + break; + + case OCRDMA_PHYS_LINK_SPEED_40GBPS: + *ib_speed = IB_SPEED_QDR; + *ib_width = IB_WIDTH_4X; + break; + + default: + /* Unsupported */ + *ib_speed = IB_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + } +} + int ocrdma_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { @@ -114,8 +153,8 @@ int ocrdma_query_port(struct ib_device *ibdev, dev = get_ocrdma_dev(ibdev); if (port > 1) { - ocrdma_err("%s(%d) invalid_port=0x%x\n", __func__, - dev->id, port); + pr_err("%s(%d) invalid_port=0x%x\n", __func__, + dev->id, port); return -EINVAL; } netdev = dev->nic_info.netdev; @@ -136,13 +175,13 @@ int ocrdma_query_port(struct ib_device *ibdev, props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | - IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP; + IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS; props->gid_tbl_len = OCRDMA_MAX_SGID; props->pkey_tbl_len = 1; props->bad_pkey_cntr = 0; props->qkey_viol_cntr = 0; - props->active_width = IB_WIDTH_1X; - props->active_speed = 4; + get_link_speed_and_width(dev, &props->active_speed, + &props->active_width); props->max_msg_sz = 0x80000000; props->max_vl_num = 4; return 0; @@ -155,8 +194,7 @@ int ocrdma_modify_port(struct ib_device *ibdev, u8 port, int mask, dev = get_ocrdma_dev(ibdev); if (port > 1) { - ocrdma_err("%s(%d) invalid_port=0x%x\n", __func__, - dev->id, port); + pr_err("%s(%d) invalid_port=0x%x\n", __func__, dev->id, port); return -EINVAL; } return 0; @@ -187,7 +225,7 @@ static void ocrdma_del_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, mutex_lock(&uctx->mm_list_lock); list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { - if (len != mm->key.len || phy_addr != mm->key.phy_addr) + if (len != mm->key.len && phy_addr != mm->key.phy_addr) continue; list_del(&mm->entry); @@ -205,7 +243,7 @@ static bool ocrdma_search_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, mutex_lock(&uctx->mm_list_lock); list_for_each_entry(mm, &uctx->mm_head, entry) { - if (len != mm->key.len || phy_addr != mm->key.phy_addr) + if (len != mm->key.len && phy_addr != mm->key.phy_addr) continue; found = true; @@ -215,6 +253,108 @@ static bool ocrdma_search_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, return found; } +static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, + struct ocrdma_ucontext *uctx, + struct ib_udata *udata) +{ + struct ocrdma_pd *pd = NULL; + int status = 0; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + if (udata && uctx) { + pd->dpp_enabled = + ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R; + pd->num_dpp_qp = + pd->dpp_enabled ? OCRDMA_PD_MAX_DPP_ENABLED_QP : 0; + } + +retry: + status = ocrdma_mbx_alloc_pd(dev, pd); + if (status) { + if (pd->dpp_enabled) { + pd->dpp_enabled = false; + pd->num_dpp_qp = 0; + goto retry; + } else { + kfree(pd); + return ERR_PTR(status); + } + } + + return pd; +} + +static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, + struct ocrdma_pd *pd) +{ + return (uctx->cntxt_pd == pd ? true : false); +} + +static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev, + struct ocrdma_pd *pd) +{ + int status = 0; + + status = ocrdma_mbx_dealloc_pd(dev, pd); + kfree(pd); + return status; +} + +static int ocrdma_alloc_ucontext_pd(struct ocrdma_dev *dev, + struct ocrdma_ucontext *uctx, + struct ib_udata *udata) +{ + int status = 0; + + uctx->cntxt_pd = _ocrdma_alloc_pd(dev, uctx, udata); + if (IS_ERR(uctx->cntxt_pd)) { + status = PTR_ERR(uctx->cntxt_pd); + uctx->cntxt_pd = NULL; + goto err; + } + + uctx->cntxt_pd->uctx = uctx; + uctx->cntxt_pd->ibpd.device = &dev->ibdev; +err: + return status; +} + +static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + int status = 0; + struct ocrdma_pd *pd = uctx->cntxt_pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + + BUG_ON(uctx->pd_in_use); + uctx->cntxt_pd = NULL; + status = _ocrdma_dealloc_pd(dev, pd); + return status; +} + +static struct ocrdma_pd *ocrdma_get_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + struct ocrdma_pd *pd = NULL; + + mutex_lock(&uctx->mm_list_lock); + if (!uctx->pd_in_use) { + uctx->pd_in_use = true; + pd = uctx->cntxt_pd; + } + mutex_unlock(&uctx->mm_list_lock); + + return pd; +} + +static void ocrdma_release_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + mutex_lock(&uctx->mm_list_lock); + uctx->pd_in_use = false; + mutex_unlock(&uctx->mm_list_lock); +} + struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { @@ -230,7 +370,6 @@ struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); - ctx->dev = dev; INIT_LIST_HEAD(&ctx->mm_head); mutex_init(&ctx->mm_list_lock); @@ -243,18 +382,23 @@ struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, memset(ctx->ah_tbl.va, 0, map_len); ctx->ah_tbl.len = map_len; + memset(&resp, 0, sizeof(resp)); resp.ah_tbl_len = ctx->ah_tbl.len; resp.ah_tbl_page = ctx->ah_tbl.pa; status = ocrdma_add_mmap(ctx, resp.ah_tbl_page, resp.ah_tbl_len); if (status) goto map_err; + + status = ocrdma_alloc_ucontext_pd(dev, ctx, udata); + if (status) + goto pd_err; + resp.dev_id = dev->id; resp.max_inline_data = dev->attr.max_inline_data; resp.wqe_size = dev->attr.wqe_size; resp.rqe_size = dev->attr.rqe_size; resp.dpp_wqe_size = dev->attr.wqe_size; - resp.rsvd = 0; memcpy(resp.fw_ver, dev->attr.fw_ver, sizeof(resp.fw_ver)); status = ib_copy_to_udata(udata, &resp, sizeof(resp)); @@ -263,6 +407,7 @@ struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, return &ctx->ibucontext; cpy_err: +pd_err: ocrdma_del_mmap(ctx, ctx->ah_tbl.pa, ctx->ah_tbl.len); map_err: dma_free_coherent(&pdev->dev, ctx->ah_tbl.len, ctx->ah_tbl.va, @@ -273,9 +418,13 @@ map_err: int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx) { + int status = 0; struct ocrdma_mm *mm, *tmp; struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx); - struct pci_dev *pdev = uctx->dev->nic_info.pdev; + struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device); + struct pci_dev *pdev = dev->nic_info.pdev; + + status = ocrdma_dealloc_ucontext_pd(uctx); ocrdma_del_mmap(uctx, uctx->ah_tbl.pa, uctx->ah_tbl.len); dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va, @@ -286,13 +435,13 @@ int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx) kfree(mm); } kfree(uctx); - return 0; + return status; } int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct ocrdma_ucontext *ucontext = get_ocrdma_ucontext(context); - struct ocrdma_dev *dev = ucontext->dev; + struct ocrdma_dev *dev = get_ocrdma_dev(context->device); unsigned long vm_page = vma->vm_pgoff << PAGE_SHIFT; u64 unmapped_db = (u64) dev->nic_info.unmapped_db; unsigned long len = (vma->vm_end - vma->vm_start); @@ -308,7 +457,10 @@ int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) if ((vm_page >= unmapped_db) && (vm_page <= (unmapped_db + dev->nic_info.db_total_size)) && (len <= dev->nic_info.db_page_size)) { - /* doorbell mapping */ + if (vma->vm_flags & VM_READ) + return -EPERM; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, len, vma->vm_page_prot); } else if (dev->nic_info.dpp_unmapped_len && @@ -316,19 +468,20 @@ int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) (vm_page <= (u64) (dev->nic_info.dpp_unmapped_addr + dev->nic_info.dpp_unmapped_len)) && (len <= dev->nic_info.dpp_unmapped_len)) { - /* dpp area mapping */ + if (vma->vm_flags & VM_READ) + return -EPERM; + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, len, vma->vm_page_prot); } else { - /* queue memory mapping */ status = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, len, vma->vm_page_prot); } return status; } -static int ocrdma_copy_pd_uresp(struct ocrdma_pd *pd, +static int ocrdma_copy_pd_uresp(struct ocrdma_dev *dev, struct ocrdma_pd *pd, struct ib_ucontext *ib_ctx, struct ib_udata *udata) { @@ -339,21 +492,21 @@ static int ocrdma_copy_pd_uresp(struct ocrdma_pd *pd, struct ocrdma_alloc_pd_uresp rsp; struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx); + memset(&rsp, 0, sizeof(rsp)); rsp.id = pd->id; rsp.dpp_enabled = pd->dpp_enabled; - db_page_addr = pd->dev->nic_info.unmapped_db + - (pd->id * pd->dev->nic_info.db_page_size); - db_page_size = pd->dev->nic_info.db_page_size; + db_page_addr = ocrdma_get_db_addr(dev, pd->id); + db_page_size = dev->nic_info.db_page_size; status = ocrdma_add_mmap(uctx, db_page_addr, db_page_size); if (status) return status; if (pd->dpp_enabled) { - dpp_page_addr = pd->dev->nic_info.dpp_unmapped_addr + - (pd->id * OCRDMA_DPP_PAGE_SIZE); + dpp_page_addr = dev->nic_info.dpp_unmapped_addr + + (pd->id * PAGE_SIZE); status = ocrdma_add_mmap(uctx, dpp_page_addr, - OCRDMA_DPP_PAGE_SIZE); + PAGE_SIZE); if (status) goto dpp_map_err; rsp.dpp_page_addr_hi = upper_32_bits(dpp_page_addr); @@ -369,7 +522,7 @@ static int ocrdma_copy_pd_uresp(struct ocrdma_pd *pd, ucopy_err: if (pd->dpp_enabled) - ocrdma_del_mmap(pd->uctx, dpp_page_addr, OCRDMA_DPP_PAGE_SIZE); + ocrdma_del_mmap(pd->uctx, dpp_page_addr, PAGE_SIZE); dpp_map_err: ocrdma_del_mmap(pd->uctx, db_page_addr, db_page_size); return status; @@ -381,84 +534,75 @@ struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev, { struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); struct ocrdma_pd *pd; + struct ocrdma_ucontext *uctx = NULL; int status; + u8 is_uctx_pd = false; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - pd->dev = dev; if (udata && context) { - pd->dpp_enabled = (dev->nic_info.dev_family == - OCRDMA_GEN2_FAMILY) ? true : false; - pd->num_dpp_qp = - pd->dpp_enabled ? OCRDMA_PD_MAX_DPP_ENABLED_QP : 0; + uctx = get_ocrdma_ucontext(context); + pd = ocrdma_get_ucontext_pd(uctx); + if (pd) { + is_uctx_pd = true; + goto pd_mapping; + } } - status = ocrdma_mbx_alloc_pd(dev, pd); - if (status) { - kfree(pd); - return ERR_PTR(status); + + pd = _ocrdma_alloc_pd(dev, uctx, udata); + if (IS_ERR(pd)) { + status = PTR_ERR(pd); + goto exit; } - atomic_set(&pd->use_cnt, 0); +pd_mapping: if (udata && context) { - status = ocrdma_copy_pd_uresp(pd, context, udata); + status = ocrdma_copy_pd_uresp(dev, pd, context, udata); if (status) goto err; } return &pd->ibpd; err: - ocrdma_dealloc_pd(&pd->ibpd); + if (is_uctx_pd) { + ocrdma_release_ucontext_pd(uctx); + } else { + status = ocrdma_mbx_dealloc_pd(dev, pd); + kfree(pd); + } +exit: return ERR_PTR(status); } int ocrdma_dealloc_pd(struct ib_pd *ibpd) { struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); - struct ocrdma_dev *dev = pd->dev; - int status; + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_ucontext *uctx = NULL; + int status = 0; u64 usr_db; - if (atomic_read(&pd->use_cnt)) { - ocrdma_err("%s(%d) pd=0x%x is in use.\n", - __func__, dev->id, pd->id); - status = -EFAULT; - goto dealloc_err; - } - status = ocrdma_mbx_dealloc_pd(dev, pd); - if (pd->uctx) { + uctx = pd->uctx; + if (uctx) { u64 dpp_db = dev->nic_info.dpp_unmapped_addr + - (pd->id * OCRDMA_DPP_PAGE_SIZE); + (pd->id * PAGE_SIZE); if (pd->dpp_enabled) - ocrdma_del_mmap(pd->uctx, dpp_db, OCRDMA_DPP_PAGE_SIZE); - usr_db = dev->nic_info.unmapped_db + - (pd->id * dev->nic_info.db_page_size); + ocrdma_del_mmap(pd->uctx, dpp_db, PAGE_SIZE); + usr_db = ocrdma_get_db_addr(dev, pd->id); ocrdma_del_mmap(pd->uctx, usr_db, dev->nic_info.db_page_size); + + if (is_ucontext_pd(uctx, pd)) { + ocrdma_release_ucontext_pd(uctx); + return status; + } } - kfree(pd); -dealloc_err: + status = _ocrdma_dealloc_pd(dev, pd); return status; } -static struct ocrdma_mr *ocrdma_alloc_lkey(struct ib_pd *ibpd, - int acc, u32 num_pbls, - u32 addr_check) +static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr, + u32 pdid, int acc, u32 num_pbls, u32 addr_check) { int status; - struct ocrdma_mr *mr; - struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); - struct ocrdma_dev *dev = pd->dev; - - if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) { - ocrdma_err("%s(%d) leaving err, invalid access rights\n", - __func__, dev->id); - return ERR_PTR(-EINVAL); - } - mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) - return ERR_PTR(-ENOMEM); - mr->hwmr.dev = dev; mr->hwmr.fr_mr = 0; mr->hwmr.local_rd = 1; mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; @@ -468,26 +612,38 @@ static struct ocrdma_mr *ocrdma_alloc_lkey(struct ib_pd *ibpd, mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; mr->hwmr.num_pbls = num_pbls; - status = ocrdma_mbx_alloc_lkey(dev, &mr->hwmr, pd->id, addr_check); - if (status) { - kfree(mr); - return ERR_PTR(-ENOMEM); - } - mr->pd = pd; - atomic_inc(&pd->use_cnt); + status = ocrdma_mbx_alloc_lkey(dev, &mr->hwmr, pdid, addr_check); + if (status) + return status; + mr->ibmr.lkey = mr->hwmr.lkey; if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) mr->ibmr.rkey = mr->hwmr.lkey; - return mr; + return 0; } struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *ibpd, int acc) { + int status; struct ocrdma_mr *mr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + + if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) { + pr_err("%s err, invalid access rights\n", __func__); + return ERR_PTR(-EINVAL); + } - mr = ocrdma_alloc_lkey(ibpd, acc, 0, OCRDMA_ADDR_CHECK_DISABLE); - if (IS_ERR(mr)) - return ERR_CAST(mr); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + status = ocrdma_alloc_lkey(dev, mr, pd->id, acc, 0, + OCRDMA_ADDR_CHECK_DISABLE); + if (status) { + kfree(mr); + return ERR_PTR(status); + } return &mr->ibmr; } @@ -511,7 +667,8 @@ static void ocrdma_free_mr_pbl_tbl(struct ocrdma_dev *dev, } } -static int ocrdma_get_pbl_info(struct ocrdma_mr *mr, u32 num_pbes) +static int ocrdma_get_pbl_info(struct ocrdma_dev *dev, struct ocrdma_mr *mr, + u32 num_pbes) { u32 num_pbls = 0; u32 idx = 0; @@ -527,7 +684,7 @@ static int ocrdma_get_pbl_info(struct ocrdma_mr *mr, u32 num_pbes) num_pbls = roundup(num_pbes, (pbl_size / sizeof(u64))); num_pbls = num_pbls / (pbl_size / sizeof(u64)); idx++; - } while (num_pbls >= mr->hwmr.dev->attr.max_num_mr_pbl); + } while (num_pbls >= dev->attr.max_num_mr_pbl); mr->hwmr.num_pbes = num_pbes; mr->hwmr.num_pbls = num_pbls; @@ -568,10 +725,10 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr, u32 num_pbes) { struct ocrdma_pbe *pbe; - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table; struct ib_umem *umem = mr->umem; - int i, shift, pg_cnt, pages, pbe_cnt, total_num_pbes = 0; + int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0; if (!mr->hwmr.num_pbes) return; @@ -581,39 +738,37 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr, shift = ilog2(umem->page_size); - list_for_each_entry(chunk, &umem->chunk_list, list) { - /* get all the dma regions from the chunk. */ - for (i = 0; i < chunk->nmap; i++) { - pages = sg_dma_len(&chunk->page_list[i]) >> shift; - for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { - /* store the page address in pbe */ - pbe->pa_lo = - cpu_to_le32(sg_dma_address - (&chunk->page_list[i]) + - (umem->page_size * pg_cnt)); - pbe->pa_hi = - cpu_to_le32(upper_32_bits - ((sg_dma_address - (&chunk->page_list[i]) + - umem->page_size * pg_cnt))); - pbe_cnt += 1; - total_num_pbes += 1; - pbe++; - - /* if done building pbes, issue the mbx cmd. */ - if (total_num_pbes == num_pbes) - return; - - /* if the given pbl is full storing the pbes, - * move to next pbl. - */ - if (pbe_cnt == - (mr->hwmr.pbl_size / sizeof(u64))) { - pbl_tbl++; - pbe = (struct ocrdma_pbe *)pbl_tbl->va; - pbe_cnt = 0; - } + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + pages = sg_dma_len(sg) >> shift; + for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { + /* store the page address in pbe */ + pbe->pa_lo = + cpu_to_le32(sg_dma_address + (sg) + + (umem->page_size * pg_cnt)); + pbe->pa_hi = + cpu_to_le32(upper_32_bits + ((sg_dma_address + (sg) + + umem->page_size * pg_cnt))); + pbe_cnt += 1; + total_num_pbes += 1; + pbe++; + + /* if done building pbes, issue the mbx cmd. */ + if (total_num_pbes == num_pbes) + return; + + /* if the given pbl is full storing the pbes, + * move to next pbl. + */ + if (pbe_cnt == + (mr->hwmr.pbl_size / sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + pbe_cnt = 0; } + } } } @@ -622,13 +777,12 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, u64 usr_addr, int acc, struct ib_udata *udata) { int status = -ENOMEM; - struct ocrdma_dev *dev; + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); struct ocrdma_mr *mr; struct ocrdma_pd *pd; u32 num_pbes; pd = get_ocrdma_pd(ibpd); - dev = pd->dev; if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) return ERR_PTR(-EINVAL); @@ -636,14 +790,13 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(status); - mr->hwmr.dev = dev; mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0); if (IS_ERR(mr->umem)) { status = -EFAULT; goto umem_err; } num_pbes = ib_umem_page_count(mr->umem); - status = ocrdma_get_pbl_info(mr, num_pbes); + status = ocrdma_get_pbl_info(dev, mr, num_pbes); if (status) goto umem_err; @@ -663,8 +816,6 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc); if (status) goto mbx_err; - mr->pd = pd; - atomic_inc(&pd->use_cnt); mr->ibmr.lkey = mr->hwmr.lkey; if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) mr->ibmr.rkey = mr->hwmr.lkey; @@ -681,15 +832,13 @@ umem_err: int ocrdma_dereg_mr(struct ib_mr *ib_mr) { struct ocrdma_mr *mr = get_ocrdma_mr(ib_mr); - struct ocrdma_dev *dev = mr->hwmr.dev; + struct ocrdma_dev *dev = get_ocrdma_dev(ib_mr->device); int status; status = ocrdma_mbx_dealloc_lkey(dev, mr->hwmr.fr_mr, mr->hwmr.lkey); - if (mr->hwmr.fr_mr == 0) - ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); - atomic_dec(&mr->pd->use_cnt); /* it could be user registered memory. */ if (mr->umem) ib_umem_release(mr->umem); @@ -697,28 +846,29 @@ int ocrdma_dereg_mr(struct ib_mr *ib_mr) return status; } -static int ocrdma_copy_cq_uresp(struct ocrdma_cq *cq, struct ib_udata *udata, +static int ocrdma_copy_cq_uresp(struct ocrdma_dev *dev, struct ocrdma_cq *cq, + struct ib_udata *udata, struct ib_ucontext *ib_ctx) { int status; - struct ocrdma_ucontext *uctx; + struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx); struct ocrdma_create_cq_uresp uresp; + memset(&uresp, 0, sizeof(uresp)); uresp.cq_id = cq->id; - uresp.page_size = cq->len; + uresp.page_size = PAGE_ALIGN(cq->len); uresp.num_pages = 1; uresp.max_hw_cqe = cq->max_hw_cqe; uresp.page_addr[0] = cq->pa; - uresp.db_page_addr = cq->dev->nic_info.unmapped_db; - uresp.db_page_size = cq->dev->nic_info.db_page_size; + uresp.db_page_addr = ocrdma_get_db_addr(dev, uctx->cntxt_pd->id); + uresp.db_page_size = dev->nic_info.db_page_size; uresp.phase_change = cq->phase_change ? 1 : 0; status = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (status) { - ocrdma_err("%s(%d) copy error cqid=0x%x.\n", - __func__, cq->dev->id, cq->id); + pr_err("%s(%d) copy error cqid=0x%x.\n", + __func__, dev->id, cq->id); goto err; } - uctx = get_ocrdma_ucontext(ib_ctx); status = ocrdma_add_mmap(uctx, uresp.db_page_addr, uresp.db_page_size); if (status) goto err; @@ -738,6 +888,8 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, int entries, int vector, { struct ocrdma_cq *cq; struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + struct ocrdma_ucontext *uctx = NULL; + u16 pd_id = 0; int status; struct ocrdma_create_cq_ureq ureq; @@ -752,25 +904,27 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, int entries, int vector, spin_lock_init(&cq->cq_lock); spin_lock_init(&cq->comp_handler_lock); - atomic_set(&cq->use_cnt, 0); INIT_LIST_HEAD(&cq->sq_head); INIT_LIST_HEAD(&cq->rq_head); - cq->dev = dev; + cq->first_arm = true; - status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq); + if (ib_ctx) { + uctx = get_ocrdma_ucontext(ib_ctx); + pd_id = uctx->cntxt_pd->id; + } + + status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id); if (status) { kfree(cq); return ERR_PTR(status); } if (ib_ctx) { - status = ocrdma_copy_cq_uresp(cq, udata, ib_ctx); + status = ocrdma_copy_cq_uresp(dev, cq, udata, ib_ctx); if (status) goto ctx_err; } cq->phase = OCRDMA_CQE_VALID; - cq->arm_needed = true; dev->cq_tbl[cq->id] = cq; - return &cq->ibcq; ctx_err: @@ -793,23 +947,60 @@ int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt, return status; } +static void ocrdma_flush_cq(struct ocrdma_cq *cq) +{ + int cqe_cnt; + int valid_count = 0; + unsigned long flags; + + struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device); + struct ocrdma_cqe *cqe = NULL; + + cqe = cq->va; + cqe_cnt = cq->cqe_cnt; + + /* Last irq might have scheduled a polling thread + * sync-up with it before hard flushing. + */ + spin_lock_irqsave(&cq->cq_lock, flags); + while (cqe_cnt) { + if (is_cqe_valid(cq, cqe)) + valid_count++; + cqe++; + cqe_cnt--; + } + ocrdma_ring_cq_db(dev, cq->id, false, false, valid_count); + spin_unlock_irqrestore(&cq->cq_lock, flags); +} + int ocrdma_destroy_cq(struct ib_cq *ibcq) { int status; struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); - struct ocrdma_dev *dev = cq->dev; + struct ocrdma_eq *eq = NULL; + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); + int pdid = 0; + u32 irq, indx; - if (atomic_read(&cq->use_cnt)) - return -EINVAL; + dev->cq_tbl[cq->id] = NULL; + indx = ocrdma_get_eq_table_index(dev, cq->eqn); + if (indx == -EINVAL) + BUG(); - status = ocrdma_mbx_destroy_cq(dev, cq); + eq = &dev->eq_tbl[indx]; + irq = ocrdma_get_irq(dev, eq); + synchronize_irq(irq); + ocrdma_flush_cq(cq); + status = ocrdma_mbx_destroy_cq(dev, cq); if (cq->ucontext) { - ocrdma_del_mmap(cq->ucontext, (u64) cq->pa, cq->len); - ocrdma_del_mmap(cq->ucontext, dev->nic_info.unmapped_db, + pdid = cq->ucontext->cntxt_pd->id; + ocrdma_del_mmap(cq->ucontext, (u64) cq->pa, + PAGE_ALIGN(cq->len)); + ocrdma_del_mmap(cq->ucontext, + ocrdma_get_db_addr(dev, pdid), dev->nic_info.db_page_size); } - dev->cq_tbl[cq->id] = NULL; kfree(cq); return status; @@ -834,70 +1025,70 @@ static void ocrdma_del_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp) static int ocrdma_check_qp_params(struct ib_pd *ibpd, struct ocrdma_dev *dev, struct ib_qp_init_attr *attrs) { - if (attrs->qp_type != IB_QPT_GSI && - attrs->qp_type != IB_QPT_RC && - attrs->qp_type != IB_QPT_UD) { - ocrdma_err("%s(%d) unsupported qp type=0x%x requested\n", - __func__, dev->id, attrs->qp_type); + if ((attrs->qp_type != IB_QPT_GSI) && + (attrs->qp_type != IB_QPT_RC) && + (attrs->qp_type != IB_QPT_UC) && + (attrs->qp_type != IB_QPT_UD)) { + pr_err("%s(%d) unsupported qp type=0x%x requested\n", + __func__, dev->id, attrs->qp_type); return -EINVAL; } - if (attrs->cap.max_send_wr > dev->attr.max_wqe) { - ocrdma_err("%s(%d) unsupported send_wr=0x%x requested\n", - __func__, dev->id, attrs->cap.max_send_wr); - ocrdma_err("%s(%d) supported send_wr=0x%x\n", - __func__, dev->id, dev->attr.max_wqe); + /* Skip the check for QP1 to support CM size of 128 */ + if ((attrs->qp_type != IB_QPT_GSI) && + (attrs->cap.max_send_wr > dev->attr.max_wqe)) { + pr_err("%s(%d) unsupported send_wr=0x%x requested\n", + __func__, dev->id, attrs->cap.max_send_wr); + pr_err("%s(%d) supported send_wr=0x%x\n", + __func__, dev->id, dev->attr.max_wqe); return -EINVAL; } if (!attrs->srq && (attrs->cap.max_recv_wr > dev->attr.max_rqe)) { - ocrdma_err("%s(%d) unsupported recv_wr=0x%x requested\n", - __func__, dev->id, attrs->cap.max_recv_wr); - ocrdma_err("%s(%d) supported recv_wr=0x%x\n", - __func__, dev->id, dev->attr.max_rqe); + pr_err("%s(%d) unsupported recv_wr=0x%x requested\n", + __func__, dev->id, attrs->cap.max_recv_wr); + pr_err("%s(%d) supported recv_wr=0x%x\n", + __func__, dev->id, dev->attr.max_rqe); return -EINVAL; } if (attrs->cap.max_inline_data > dev->attr.max_inline_data) { - ocrdma_err("%s(%d) unsupported inline data size=0x%x" - " requested\n", __func__, dev->id, - attrs->cap.max_inline_data); - ocrdma_err("%s(%d) supported inline data size=0x%x\n", - __func__, dev->id, dev->attr.max_inline_data); + pr_err("%s(%d) unsupported inline data size=0x%x requested\n", + __func__, dev->id, attrs->cap.max_inline_data); + pr_err("%s(%d) supported inline data size=0x%x\n", + __func__, dev->id, dev->attr.max_inline_data); return -EINVAL; } if (attrs->cap.max_send_sge > dev->attr.max_send_sge) { - ocrdma_err("%s(%d) unsupported send_sge=0x%x requested\n", - __func__, dev->id, attrs->cap.max_send_sge); - ocrdma_err("%s(%d) supported send_sge=0x%x\n", - __func__, dev->id, dev->attr.max_send_sge); + pr_err("%s(%d) unsupported send_sge=0x%x requested\n", + __func__, dev->id, attrs->cap.max_send_sge); + pr_err("%s(%d) supported send_sge=0x%x\n", + __func__, dev->id, dev->attr.max_send_sge); return -EINVAL; } if (attrs->cap.max_recv_sge > dev->attr.max_recv_sge) { - ocrdma_err("%s(%d) unsupported recv_sge=0x%x requested\n", - __func__, dev->id, attrs->cap.max_recv_sge); - ocrdma_err("%s(%d) supported recv_sge=0x%x\n", - __func__, dev->id, dev->attr.max_recv_sge); + pr_err("%s(%d) unsupported recv_sge=0x%x requested\n", + __func__, dev->id, attrs->cap.max_recv_sge); + pr_err("%s(%d) supported recv_sge=0x%x\n", + __func__, dev->id, dev->attr.max_recv_sge); return -EINVAL; } /* unprivileged user space cannot create special QP */ if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) { - ocrdma_err + pr_err ("%s(%d) Userspace can't create special QPs of type=0x%x\n", __func__, dev->id, attrs->qp_type); return -EINVAL; } /* allow creating only one GSI type of QP */ if (attrs->qp_type == IB_QPT_GSI && dev->gsi_qp_created) { - ocrdma_err("%s(%d) GSI special QPs already created.\n", - __func__, dev->id); + pr_err("%s(%d) GSI special QPs already created.\n", + __func__, dev->id); return -EINVAL; } /* verify consumer QPs are not trying to use GSI QP's CQ */ if ((attrs->qp_type != IB_QPT_GSI) && (dev->gsi_qp_created)) { if ((dev->gsi_sqcq == get_ocrdma_cq(attrs->send_cq)) || - (dev->gsi_sqcq == get_ocrdma_cq(attrs->recv_cq)) || - (dev->gsi_rqcq == get_ocrdma_cq(attrs->send_cq)) || - (dev->gsi_rqcq == get_ocrdma_cq(attrs->recv_cq))) { - ocrdma_err("%s(%d) Consumer QP cannot use GSI CQs.\n", - __func__, dev->id); + (dev->gsi_rqcq == get_ocrdma_cq(attrs->recv_cq))) { + pr_err("%s(%d) Consumer QP cannot use GSI CQs.\n", + __func__, dev->id); return -EINVAL; } } @@ -920,28 +1111,21 @@ static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp, uresp.qp_id = qp->id; uresp.sq_dbid = qp->sq.dbid; uresp.num_sq_pages = 1; - uresp.sq_page_size = qp->sq.len; + uresp.sq_page_size = PAGE_ALIGN(qp->sq.len); uresp.sq_page_addr[0] = qp->sq.pa; uresp.num_wqe_allocated = qp->sq.max_cnt; if (!srq) { uresp.rq_dbid = qp->rq.dbid; uresp.num_rq_pages = 1; - uresp.rq_page_size = qp->rq.len; + uresp.rq_page_size = PAGE_ALIGN(qp->rq.len); uresp.rq_page_addr[0] = qp->rq.pa; uresp.num_rqe_allocated = qp->rq.max_cnt; } uresp.db_page_addr = usr_db; uresp.db_page_size = dev->nic_info.db_page_size; - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { - uresp.db_sq_offset = OCRDMA_DB_GEN2_SQ_OFFSET; - uresp.db_rq_offset = ((qp->id & 0xFFFF) < 128) ? - OCRDMA_DB_GEN2_RQ1_OFFSET : OCRDMA_DB_GEN2_RQ2_OFFSET; - uresp.db_shift = (qp->id < 128) ? 24 : 16; - } else { - uresp.db_sq_offset = OCRDMA_DB_SQ_OFFSET; - uresp.db_rq_offset = OCRDMA_DB_RQ_OFFSET; - uresp.db_shift = 16; - } + uresp.db_sq_offset = OCRDMA_DB_GEN2_SQ_OFFSET; + uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; + uresp.db_shift = OCRDMA_DB_RQ_SHIFT; if (qp->dpp_enabled) { uresp.dpp_credit = dpp_credit_lmt; @@ -949,7 +1133,7 @@ static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp, } status = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (status) { - ocrdma_err("%s(%d) user copy error.\n", __func__, dev->id); + pr_err("%s(%d) user copy error.\n", __func__, dev->id); goto err; } status = ocrdma_add_mmap(pd->uctx, uresp.sq_page_addr[0], @@ -973,14 +1157,13 @@ err: static void ocrdma_set_qp_db(struct ocrdma_dev *dev, struct ocrdma_qp *qp, struct ocrdma_pd *pd) { - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { qp->sq_db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size) + OCRDMA_DB_GEN2_SQ_OFFSET; qp->rq_db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size) + - ((qp->id < 128) ? - OCRDMA_DB_GEN2_RQ1_OFFSET : OCRDMA_DB_GEN2_RQ2_OFFSET); + OCRDMA_DB_GEN2_RQ_OFFSET; } else { qp->sq_db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size) + @@ -1021,16 +1204,7 @@ static void ocrdma_set_qp_init_params(struct ocrdma_qp *qp, qp->sq.max_sges = attrs->cap.max_send_sge; qp->rq.max_sges = attrs->cap.max_recv_sge; qp->state = OCRDMA_QPS_RST; -} - -static void ocrdma_set_qp_use_cnt(struct ocrdma_qp *qp, struct ocrdma_pd *pd) -{ - atomic_inc(&pd->use_cnt); - atomic_inc(&qp->sq_cq->use_cnt); - atomic_inc(&qp->rq_cq->use_cnt); - if (qp->srq) - atomic_inc(&qp->srq->use_cnt); - qp->ibqp.qp_num = qp->id; + qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? true : false; } static void ocrdma_store_gsi_qp_cq(struct ocrdma_dev *dev, @@ -1050,7 +1224,7 @@ struct ib_qp *ocrdma_create_qp(struct ib_pd *ibpd, int status; struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); struct ocrdma_qp *qp; - struct ocrdma_dev *dev = pd->dev; + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); struct ocrdma_create_qp_ureq ureq; u16 dpp_credit_lmt, dpp_offset; @@ -1070,6 +1244,9 @@ struct ib_qp *ocrdma_create_qp(struct ib_pd *ibpd, } qp->dev = dev; ocrdma_set_qp_init_params(qp, pd, attrs); + if (udata == NULL) + qp->cap_flags |= (OCRDMA_QP_MW_BIND | OCRDMA_QP_LKEY0 | + OCRDMA_QP_FAST_REG); mutex_lock(&dev->dev_lock); status = ocrdma_mbx_create_qp(qp, attrs, ureq.enable_dpp_cq, @@ -1080,8 +1257,6 @@ struct ib_qp *ocrdma_create_qp(struct ib_pd *ibpd, /* user space QP's wr_id table are managed in library */ if (udata == NULL) { - qp->cap_flags |= (OCRDMA_QP_MW_BIND | OCRDMA_QP_LKEY0 | - OCRDMA_QP_FAST_REG); status = ocrdma_alloc_wr_id_tbl(qp); if (status) goto map_err; @@ -1099,7 +1274,7 @@ struct ib_qp *ocrdma_create_qp(struct ib_pd *ibpd, goto cpy_err; } ocrdma_store_gsi_qp_cq(dev, attrs); - ocrdma_set_qp_use_cnt(qp, pd); + qp->ibqp.qp_num = qp->id; mutex_unlock(&dev->dev_lock); return &qp->ibqp; @@ -1112,7 +1287,7 @@ mbx_err: kfree(qp->wqe_wr_id_tbl); kfree(qp->rqe_wr_id_tbl); kfree(qp); - ocrdma_err("%s(%d) error=%d\n", __func__, dev->id, status); + pr_err("%s(%d) error=%d\n", __func__, dev->id, status); gen_err: return ERR_PTR(status); } @@ -1128,13 +1303,14 @@ int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp = get_ocrdma_qp(ibqp); dev = qp->dev; if (attr_mask & IB_QP_STATE) - status = ocrdma_qp_state_machine(qp, attr->qp_state, &old_qps); + status = ocrdma_qp_state_change(qp, attr->qp_state, &old_qps); /* if new and previous states are same hw doesn't need to * know about it. */ if (status < 0) return status; - status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask, old_qps); + status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask); + return status; } @@ -1161,11 +1337,12 @@ int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_qps = old_qps; spin_unlock_irqrestore(&qp->q_lock, flags); - if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask)) { - ocrdma_err("%s(%d) invalid attribute mask=0x%x specified for " - "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n", - __func__, dev->id, attr_mask, qp->id, ibqp->qp_type, - old_qps, new_qps); + if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_ETHERNET)) { + pr_err("%s(%d) invalid attribute mask=0x%x specified for\n" + "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n", + __func__, dev->id, attr_mask, qp->id, ibqp->qp_type, + old_qps, new_qps); goto param_err; } @@ -1239,7 +1416,7 @@ int ocrdma_query_qp(struct ib_qp *ibqp, qp_attr->cap.max_recv_wr = qp->rq.max_cnt - 1; qp_attr->cap.max_send_sge = qp->sq.max_sges; qp_attr->cap.max_recv_sge = qp->rq.max_sges; - qp_attr->cap.max_inline_data = dev->attr.max_inline_data; + qp_attr->cap.max_inline_data = qp->max_inline_data; qp_init_attr->cap = qp_attr->cap; memcpy(&qp_attr->ah_attr.grh.dgid, ¶ms.dgid[0], sizeof(params.dgid)); @@ -1250,7 +1427,7 @@ int ocrdma_query_qp(struct ib_qp *ibqp, OCRDMA_QP_PARAMS_HOP_LMT_MASK) >> OCRDMA_QP_PARAMS_HOP_LMT_SHIFT; qp_attr->ah_attr.grh.traffic_class = (params.tclass_sq_psn & - OCRDMA_QP_PARAMS_SQ_PSN_MASK) >> + OCRDMA_QP_PARAMS_TCLASS_MASK) >> OCRDMA_QP_PARAMS_TCLASS_SHIFT; qp_attr->ah_attr.ah_flags = IB_AH_GRH; @@ -1302,23 +1479,17 @@ static void ocrdma_srq_toggle_bit(struct ocrdma_srq *srq, int idx) static int ocrdma_hwq_free_cnt(struct ocrdma_qp_hwq_info *q) { - int free_cnt; - if (q->head >= q->tail) - free_cnt = (q->max_cnt - q->head) + q->tail; - else - free_cnt = q->tail - q->head; - return free_cnt; + return ((q->max_wqe_idx - q->head) + q->tail) % q->max_cnt; } static int is_hw_sq_empty(struct ocrdma_qp *qp) { - return (qp->sq.tail == qp->sq.head && - ocrdma_hwq_free_cnt(&qp->sq) ? 1 : 0); + return (qp->sq.tail == qp->sq.head); } static int is_hw_rq_empty(struct ocrdma_qp *qp) { - return (qp->rq.tail == qp->rq.head) ? 1 : 0; + return (qp->rq.tail == qp->rq.head); } static void *ocrdma_hwq_head(struct ocrdma_qp_hwq_info *q) @@ -1350,7 +1521,7 @@ static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq) int discard_cnt = 0; u32 cur_getp, stop_getp; struct ocrdma_cqe *cqe; - u32 qpn = 0; + u32 qpn = 0, wqe_idx = 0; spin_lock_irqsave(&cq->cq_lock, cq_flags); @@ -1379,30 +1550,36 @@ static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq) if (qpn == 0 || qpn != qp->id) goto skip_cqe; - /* mark cqe discarded so that it is not picked up later - * in the poll_cq(). - */ - discard_cnt += 1; - cqe->cmn.qpn = 0; - if (is_cqe_for_sq(cqe)) + if (is_cqe_for_sq(cqe)) { ocrdma_hwq_inc_tail(&qp->sq); - else { + } else { if (qp->srq) { + wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >> + OCRDMA_CQE_BUFTAG_SHIFT) & + qp->srq->rq.max_wqe_idx; + if (wqe_idx < 1) + BUG(); spin_lock_irqsave(&qp->srq->q_lock, flags); ocrdma_hwq_inc_tail(&qp->srq->rq); - ocrdma_srq_toggle_bit(qp->srq, cur_getp); + ocrdma_srq_toggle_bit(qp->srq, wqe_idx - 1); spin_unlock_irqrestore(&qp->srq->q_lock, flags); - } else + } else { ocrdma_hwq_inc_tail(&qp->rq); + } } + /* mark cqe discarded so that it is not picked up later + * in the poll_cq(). + */ + discard_cnt += 1; + cqe->cmn.qpn = 0; skip_cqe: cur_getp = (cur_getp + 1) % cq->max_hw_cqe; } while (cur_getp != stop_getp); spin_unlock_irqrestore(&cq->cq_lock, cq_flags); } -static void ocrdma_del_flush_qp(struct ocrdma_qp *qp) +void ocrdma_del_flush_qp(struct ocrdma_qp *qp) { int found = false; unsigned long flags; @@ -1468,39 +1645,38 @@ int ocrdma_destroy_qp(struct ib_qp *ibqp) mutex_unlock(&dev->dev_lock); if (pd->uctx) { - ocrdma_del_mmap(pd->uctx, (u64) qp->sq.pa, qp->sq.len); + ocrdma_del_mmap(pd->uctx, (u64) qp->sq.pa, + PAGE_ALIGN(qp->sq.len)); if (!qp->srq) - ocrdma_del_mmap(pd->uctx, (u64) qp->rq.pa, qp->rq.len); + ocrdma_del_mmap(pd->uctx, (u64) qp->rq.pa, + PAGE_ALIGN(qp->rq.len)); } ocrdma_del_flush_qp(qp); - atomic_dec(&qp->pd->use_cnt); - atomic_dec(&qp->sq_cq->use_cnt); - atomic_dec(&qp->rq_cq->use_cnt); - if (qp->srq) - atomic_dec(&qp->srq->use_cnt); kfree(qp->wqe_wr_id_tbl); kfree(qp->rqe_wr_id_tbl); kfree(qp); return status; } -static int ocrdma_copy_srq_uresp(struct ocrdma_srq *srq, struct ib_udata *udata) +static int ocrdma_copy_srq_uresp(struct ocrdma_dev *dev, struct ocrdma_srq *srq, + struct ib_udata *udata) { int status; struct ocrdma_create_srq_uresp uresp; + memset(&uresp, 0, sizeof(uresp)); uresp.rq_dbid = srq->rq.dbid; uresp.num_rq_pages = 1; uresp.rq_page_addr[0] = srq->rq.pa; uresp.rq_page_size = srq->rq.len; - uresp.db_page_addr = srq->dev->nic_info.unmapped_db + - (srq->pd->id * srq->dev->nic_info.db_page_size); - uresp.db_page_size = srq->dev->nic_info.db_page_size; + uresp.db_page_addr = dev->nic_info.unmapped_db + + (srq->pd->id * dev->nic_info.db_page_size); + uresp.db_page_size = dev->nic_info.db_page_size; uresp.num_rqe_allocated = srq->rq.max_cnt; - if (srq->dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { - uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ1_OFFSET; + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; uresp.db_shift = 24; } else { uresp.db_rq_offset = OCRDMA_DB_RQ_OFFSET; @@ -1523,7 +1699,7 @@ struct ib_srq *ocrdma_create_srq(struct ib_pd *ibpd, { int status = -ENOMEM; struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); - struct ocrdma_dev *dev = pd->dev; + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); struct ocrdma_srq *srq; if (init_attr->attr.max_sge > dev->attr.max_recv_sge) @@ -1536,10 +1712,9 @@ struct ib_srq *ocrdma_create_srq(struct ib_pd *ibpd, return ERR_PTR(status); spin_lock_init(&srq->q_lock); - srq->dev = dev; srq->pd = pd; srq->db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size); - status = ocrdma_mbx_create_srq(srq, init_attr, pd); + status = ocrdma_mbx_create_srq(dev, srq, init_attr, pd); if (status) goto err; @@ -1565,14 +1740,12 @@ struct ib_srq *ocrdma_create_srq(struct ib_pd *ibpd, goto arm_err; } - atomic_set(&srq->use_cnt, 0); if (udata) { - status = ocrdma_copy_srq_uresp(srq, udata); + status = ocrdma_copy_srq_uresp(dev, srq, udata); if (status) goto arm_err; } - atomic_inc(&pd->use_cnt); return &srq->ibsrq; arm_err: @@ -1614,22 +1787,16 @@ int ocrdma_destroy_srq(struct ib_srq *ibsrq) { int status; struct ocrdma_srq *srq; - struct ocrdma_dev *dev; + struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device); srq = get_ocrdma_srq(ibsrq); - dev = srq->dev; - if (atomic_read(&srq->use_cnt)) { - ocrdma_err("%s(%d) err, srq=0x%x in use\n", - __func__, dev->id, srq->id); - return -EAGAIN; - } status = ocrdma_mbx_destroy_srq(dev, srq); if (srq->pd->uctx) - ocrdma_del_mmap(srq->pd->uctx, (u64) srq->rq.pa, srq->rq.len); + ocrdma_del_mmap(srq->pd->uctx, (u64) srq->rq.pa, + PAGE_ALIGN(srq->rq.len)); - atomic_dec(&srq->pd->use_cnt); kfree(srq->idx_bit_fields); kfree(srq->rqe_wr_id_tbl); kfree(srq); @@ -1670,23 +1837,43 @@ static void ocrdma_build_sges(struct ocrdma_hdr_wqe *hdr, memset(sge, 0, sizeof(*sge)); } +static inline uint32_t ocrdma_sglist_len(struct ib_sge *sg_list, int num_sge) +{ + uint32_t total_len = 0, i; + + for (i = 0; i < num_sge; i++) + total_len += sg_list[i].length; + return total_len; +} + + static int ocrdma_build_inline_sges(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, struct ocrdma_sge *sge, struct ib_send_wr *wr, u32 wqe_size) { - if (wr->send_flags & IB_SEND_INLINE) { - if (wr->sg_list[0].length > qp->max_inline_data) { - ocrdma_err("%s() supported_len=0x%x," - " unspported len req=0x%x\n", __func__, - qp->max_inline_data, wr->sg_list[0].length); + int i; + char *dpp_addr; + + if (wr->send_flags & IB_SEND_INLINE && qp->qp_type != IB_QPT_UD) { + hdr->total_len = ocrdma_sglist_len(wr->sg_list, wr->num_sge); + if (unlikely(hdr->total_len > qp->max_inline_data)) { + pr_err("%s() supported_len=0x%x,\n" + " unspported len req=0x%x\n", __func__, + qp->max_inline_data, hdr->total_len); return -EINVAL; } - memcpy(sge, - (void *)(unsigned long)wr->sg_list[0].addr, - wr->sg_list[0].length); - hdr->total_len = wr->sg_list[0].length; + dpp_addr = (char *)sge; + for (i = 0; i < wr->num_sge; i++) { + memcpy(dpp_addr, + (void *)(unsigned long)wr->sg_list[i].addr, + wr->sg_list[i].length); + dpp_addr += wr->sg_list[i].length; + } + wqe_size += roundup(hdr->total_len, OCRDMA_WQE_ALIGN_BYTES); + if (0 == hdr->total_len) + wqe_size += sizeof(struct ocrdma_sge); hdr->cw |= (OCRDMA_TYPE_INLINE << OCRDMA_WQE_TYPE_SHIFT); } else { ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list); @@ -1711,8 +1898,9 @@ static int ocrdma_build_send(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, ocrdma_build_ud_hdr(qp, hdr, wr); sge = (struct ocrdma_sge *)(hdr + 2); wqe_size += sizeof(struct ocrdma_ewqe_ud_hdr); - } else + } else { sge = (struct ocrdma_sge *)(hdr + 1); + } status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size); return status; @@ -1755,9 +1943,97 @@ static void ocrdma_build_read(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, ext_rw->len = hdr->total_len; } +static void build_frmr_pbes(struct ib_send_wr *wr, struct ocrdma_pbl *pbl_tbl, + struct ocrdma_hw_mr *hwmr) +{ + int i; + u64 buf_addr = 0; + int num_pbes; + struct ocrdma_pbe *pbe; + + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + num_pbes = 0; + + /* go through the OS phy regions & fill hw pbe entries into pbls. */ + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + /* number of pbes can be more for one OS buf, when + * buffers are of different sizes. + * split the ib_buf to one or more pbes. + */ + buf_addr = wr->wr.fast_reg.page_list->page_list[i]; + pbe->pa_lo = cpu_to_le32((u32) (buf_addr & PAGE_MASK)); + pbe->pa_hi = cpu_to_le32((u32) upper_32_bits(buf_addr)); + num_pbes += 1; + pbe++; + + /* if the pbl is full storing the pbes, + * move to next pbl. + */ + if (num_pbes == (hwmr->pbl_size/sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + } + } + return; +} + +static int get_encoded_page_size(int pg_sz) +{ + /* Max size is 256M 4096 << 16 */ + int i = 0; + for (; i < 17; i++) + if (pg_sz == (4096 << i)) + break; + return i; +} + + +static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + u64 fbo; + struct ocrdma_ewqe_fr *fast_reg = (struct ocrdma_ewqe_fr *)(hdr + 1); + struct ocrdma_mr *mr; + u32 wqe_size = sizeof(*fast_reg) + sizeof(*hdr); + + wqe_size = roundup(wqe_size, OCRDMA_WQE_ALIGN_BYTES); + + if (wr->wr.fast_reg.page_list_len > qp->dev->attr.max_pages_per_frmr) + return -EINVAL; + + hdr->cw |= (OCRDMA_FR_MR << OCRDMA_WQE_OPCODE_SHIFT); + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + + if (wr->wr.fast_reg.page_list_len == 0) + BUG(); + if (wr->wr.fast_reg.access_flags & IB_ACCESS_LOCAL_WRITE) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_LOCAL_WR; + if (wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_WRITE) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_WR; + if (wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_READ) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_RD; + hdr->lkey = wr->wr.fast_reg.rkey; + hdr->total_len = wr->wr.fast_reg.length; + + fbo = wr->wr.fast_reg.iova_start - + (wr->wr.fast_reg.page_list->page_list[0] & PAGE_MASK); + + fast_reg->va_hi = upper_32_bits(wr->wr.fast_reg.iova_start); + fast_reg->va_lo = (u32) (wr->wr.fast_reg.iova_start & 0xffffffff); + fast_reg->fbo_hi = upper_32_bits(fbo); + fast_reg->fbo_lo = (u32) fbo & 0xffffffff; + fast_reg->num_sges = wr->wr.fast_reg.page_list_len; + fast_reg->size_sge = + get_encoded_page_size(1 << wr->wr.fast_reg.page_shift); + mr = (struct ocrdma_mr *) (unsigned long) + qp->dev->stag_arr[(hdr->lkey >> 8) & (OCRDMA_MAX_STAG - 1)]; + build_frmr_pbes(wr, mr->hwmr.pbl_table, &mr->hwmr); + return 0; +} + static void ocrdma_ring_sq_db(struct ocrdma_qp *qp) { - u32 val = qp->sq.dbid | (1 << 16); + u32 val = qp->sq.dbid | (1 << OCRDMA_DB_SQ_SHIFT); iowrite32(val, qp->sq_db); } @@ -1773,18 +2049,20 @@ int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, spin_lock_irqsave(&qp->q_lock, flags); if (qp->state != OCRDMA_QPS_RTS && qp->state != OCRDMA_QPS_SQD) { spin_unlock_irqrestore(&qp->q_lock, flags); + *bad_wr = wr; return -EINVAL; } while (wr) { if (ocrdma_hwq_free_cnt(&qp->sq) == 0 || wr->num_sge > qp->sq.max_sges) { + *bad_wr = wr; status = -ENOMEM; break; } hdr = ocrdma_hwq_head(&qp->sq); hdr->cw = 0; - if (wr->send_flags & IB_SEND_SIGNALED) + if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled) hdr->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT); if (wr->send_flags & IB_SEND_FENCE) hdr->cw |= @@ -1822,10 +2100,14 @@ int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, case IB_WR_LOCAL_INV: hdr->cw |= (OCRDMA_LKEY_INV << OCRDMA_WQE_OPCODE_SHIFT); - hdr->cw |= (sizeof(struct ocrdma_hdr_wqe) / + hdr->cw |= ((sizeof(struct ocrdma_hdr_wqe) + + sizeof(struct ocrdma_sge)) / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT; hdr->lkey = wr->ex.invalidate_rkey; break; + case IB_WR_FAST_REG_MR: + status = ocrdma_build_fr(qp, hdr, wr); + break; default: status = -EINVAL; break; @@ -1834,7 +2116,7 @@ int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, *bad_wr = wr; break; } - if (wr->send_flags & IB_SEND_SIGNALED) + if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled) qp->wqe_wr_id_tbl[qp->sq.head].signaled = 1; else qp->wqe_wr_id_tbl[qp->sq.head].signaled = 0; @@ -1856,7 +2138,7 @@ int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, static void ocrdma_ring_rq_db(struct ocrdma_qp *qp) { - u32 val = qp->rq.dbid | (1 << OCRDMA_GET_NUM_POSTED_SHIFT_VAL(qp)); + u32 val = qp->rq.dbid | (1 << OCRDMA_DB_RQ_SHIFT); iowrite32(val, qp->rq_db); } @@ -1944,7 +2226,7 @@ static int ocrdma_srq_get_idx(struct ocrdma_srq *srq) if (row == srq->bit_fields_len) BUG(); - return indx; + return indx + 1; /* Use from index 1 */ } static void ocrdma_ring_srq_db(struct ocrdma_srq *srq) @@ -1992,7 +2274,7 @@ int ocrdma_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, static enum ib_wc_status ocrdma_to_ibwc_err(u16 status) { - enum ib_wc_status ibwc_status = IB_WC_GENERAL_ERR; + enum ib_wc_status ibwc_status; switch (status) { case OCRDMA_CQE_GENERAL_ERR: @@ -2061,7 +2343,7 @@ static enum ib_wc_status ocrdma_to_ibwc_err(u16 status) default: ibwc_status = IB_WC_GENERAL_ERR; break; - }; + } return ibwc_status; } @@ -2089,15 +2371,18 @@ static void ocrdma_update_wc(struct ocrdma_qp *qp, struct ib_wc *ibwc, case OCRDMA_SEND: ibwc->opcode = IB_WC_SEND; break; + case OCRDMA_FR_MR: + ibwc->opcode = IB_WC_FAST_REG_MR; + break; case OCRDMA_LKEY_INV: ibwc->opcode = IB_WC_LOCAL_INV; break; default: ibwc->status = IB_WC_GENERAL_ERR; - ocrdma_err("%s() invalid opcode received = 0x%x\n", - __func__, hdr->cw & OCRDMA_WQE_OPCODE_MASK); + pr_err("%s() invalid opcode received = 0x%x\n", + __func__, hdr->cw & OCRDMA_WQE_OPCODE_MASK); break; - }; + } } static void ocrdma_set_cqe_status_flushed(struct ocrdma_qp *qp, @@ -2142,7 +2427,7 @@ static bool ocrdma_update_err_cqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe, ibwc->status = ocrdma_to_ibwc_err(status); ocrdma_flush_qp(qp); - ocrdma_qp_state_machine(qp, IB_QPS_ERR, NULL); + ocrdma_qp_state_change(qp, IB_QPS_ERR, NULL); /* if wqe/rqe pending for which cqe needs to be returned, * trigger inflating it. @@ -2227,7 +2512,8 @@ static bool ocrdma_poll_success_scqe(struct ocrdma_qp *qp, ocrdma_update_wc(qp, ibwc, tail); *polled = true; } - wqe_idx = le32_to_cpu(cqe->wq.wqeidx) & OCRDMA_CQE_WQEIDX_MASK; + wqe_idx = (le32_to_cpu(cqe->wq.wqeidx) & + OCRDMA_CQE_WQEIDX_MASK) & qp->sq.max_wqe_idx; if (tail != wqe_idx) expand = true; /* Coalesced CQE can't be consumed yet */ @@ -2276,10 +2562,14 @@ static void ocrdma_update_free_srq_cqe(struct ib_wc *ibwc, u32 wqe_idx; srq = get_ocrdma_srq(qp->ibqp.srq); - wqe_idx = le32_to_cpu(cqe->rq.buftag_qpn) >> OCRDMA_CQE_BUFTAG_SHIFT; + wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >> + OCRDMA_CQE_BUFTAG_SHIFT) & srq->rq.max_wqe_idx; + if (wqe_idx < 1) + BUG(); + ibwc->wr_id = srq->rqe_wr_id_tbl[wqe_idx]; spin_lock_irqsave(&srq->q_lock, flags); - ocrdma_srq_toggle_bit(srq, wqe_idx); + ocrdma_srq_toggle_bit(srq, wqe_idx - 1); spin_unlock_irqrestore(&srq->q_lock, flags); ocrdma_hwq_inc_tail(&srq->rq); } @@ -2333,9 +2623,9 @@ static void ocrdma_poll_success_rcqe(struct ocrdma_qp *qp, ibwc->ex.invalidate_rkey = le32_to_cpu(cqe->rq.lkey_immdt); ibwc->wc_flags |= IB_WC_WITH_INVALIDATE; } - if (qp->ibqp.srq) + if (qp->ibqp.srq) { ocrdma_update_free_srq_cqe(ibwc, cqe, qp); - else { + } else { ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; ocrdma_hwq_inc_tail(&qp->rq); } @@ -2348,13 +2638,14 @@ static bool ocrdma_poll_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, bool expand = false; ibwc->wc_flags = 0; - if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) { status = (le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_UD_STATUS_MASK) >> OCRDMA_CQE_UD_STATUS_SHIFT; - else + } else { status = (le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT; + } if (status == OCRDMA_CQE_SUCCESS) { *polled = true; @@ -2372,9 +2663,10 @@ static void ocrdma_change_cq_phase(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe, if (cq->phase_change) { if (cur_getp == 0) cq->phase = (~cq->phase & OCRDMA_CQE_VALID); - } else + } else { /* clear valid bit */ cqe->flags_status_srcqpn = 0; + } } static int ocrdma_poll_hwcq(struct ocrdma_cq *cq, int num_entries, @@ -2385,7 +2677,7 @@ static int ocrdma_poll_hwcq(struct ocrdma_cq *cq, int num_entries, bool expand = false; int polled_hw_cqes = 0; struct ocrdma_qp *qp = NULL; - struct ocrdma_dev *dev = cq->dev; + struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device); struct ocrdma_cqe *cqe; u16 cur_getp; bool polled = false; bool stop = false; @@ -2429,10 +2721,18 @@ expand_cqe: } stop_cqe: cq->getp = cur_getp; - if (polled_hw_cqes || expand || stop) { - ocrdma_ring_cq_db(dev, cq->id, cq->armed, cq->solicited, + if (cq->deferred_arm) { + ocrdma_ring_cq_db(dev, cq->id, true, cq->deferred_sol, + polled_hw_cqes); + cq->deferred_arm = false; + cq->deferred_sol = false; + } else { + /* We need to pop the CQE. No need to arm */ + ocrdma_ring_cq_db(dev, cq->id, false, cq->deferred_sol, polled_hw_cqes); + cq->deferred_sol = false; } + return i; } @@ -2451,8 +2751,9 @@ static int ocrdma_add_err_cqe(struct ocrdma_cq *cq, int num_entries, } else if (!is_hw_rq_empty(qp) && qp->rq_cq == cq) { ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; ocrdma_hwq_inc_tail(&qp->rq); - } else + } else { return err_cqes; + } ibwc->byte_len = 0; ibwc->status = IB_WC_WR_FLUSH_ERR; ibwc = ibwc + 1; @@ -2465,14 +2766,11 @@ static int ocrdma_add_err_cqe(struct ocrdma_cq *cq, int num_entries, int ocrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { int cqes_to_poll = num_entries; - struct ocrdma_cq *cq = NULL; - unsigned long flags; - struct ocrdma_dev *dev; + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); int num_os_cqe = 0, err_cqes = 0; struct ocrdma_qp *qp; - - cq = get_ocrdma_cq(ibcq); - dev = cq->dev; + unsigned long flags; /* poll cqes from adapter CQ */ spin_lock_irqsave(&cq->cq_lock, flags); @@ -2503,34 +2801,254 @@ int ocrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags) { - struct ocrdma_cq *cq; - unsigned long flags; - struct ocrdma_dev *dev; + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); u16 cq_id; - u16 cur_getp; - struct ocrdma_cqe *cqe; + unsigned long flags; + bool arm_needed = false, sol_needed = false; - cq = get_ocrdma_cq(ibcq); cq_id = cq->id; - dev = cq->dev; spin_lock_irqsave(&cq->cq_lock, flags); if (cq_flags & IB_CQ_NEXT_COMP || cq_flags & IB_CQ_SOLICITED) - cq->armed = true; + arm_needed = true; if (cq_flags & IB_CQ_SOLICITED) - cq->solicited = true; + sol_needed = true; - cur_getp = cq->getp; - cqe = cq->va + cur_getp; - - /* check whether any valid cqe exist or not, if not then safe to - * arm. If cqe is not yet consumed, then let it get consumed and then - * we arm it to avoid false interrupts. - */ - if (!is_cqe_valid(cq, cqe) || cq->arm_needed) { - cq->arm_needed = false; - ocrdma_ring_cq_db(dev, cq_id, cq->armed, cq->solicited, 0); + if (cq->first_arm) { + ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0); + cq->first_arm = false; + goto skip_defer; } + cq->deferred_arm = true; + +skip_defer: + cq->deferred_sol = sol_needed; spin_unlock_irqrestore(&cq->cq_lock, flags); + return 0; } + +struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len) +{ + int status; + struct ocrdma_mr *mr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + + if (max_page_list_len > dev->attr.max_pages_per_frmr) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + status = ocrdma_get_pbl_info(dev, mr, max_page_list_len); + if (status) + goto pbl_err; + mr->hwmr.fr_mr = 1; + mr->hwmr.remote_rd = 0; + mr->hwmr.remote_wr = 0; + mr->hwmr.local_rd = 0; + mr->hwmr.local_wr = 0; + mr->hwmr.mw_bind = 0; + status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); + if (status) + goto pbl_err; + status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, 0); + if (status) + goto mbx_err; + mr->ibmr.rkey = mr->hwmr.lkey; + mr->ibmr.lkey = mr->hwmr.lkey; + dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = + (unsigned long) mr; + return &mr->ibmr; +mbx_err: + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); +pbl_err: + kfree(mr); + return ERR_PTR(-ENOMEM); +} + +struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device + *ibdev, + int page_list_len) +{ + struct ib_fast_reg_page_list *frmr_list; + int size; + + size = sizeof(*frmr_list) + (page_list_len * sizeof(u64)); + frmr_list = kzalloc(size, GFP_KERNEL); + if (!frmr_list) + return ERR_PTR(-ENOMEM); + frmr_list->page_list = (u64 *)(frmr_list + 1); + return frmr_list; +} + +void ocrdma_free_frmr_page_list(struct ib_fast_reg_page_list *page_list) +{ + kfree(page_list); +} + +#define MAX_KERNEL_PBE_SIZE 65536 +static inline int count_kernel_pbes(struct ib_phys_buf *buf_list, + int buf_cnt, u32 *pbe_size) +{ + u64 total_size = 0; + u64 buf_size = 0; + int i; + *pbe_size = roundup(buf_list[0].size, PAGE_SIZE); + *pbe_size = roundup_pow_of_two(*pbe_size); + + /* find the smallest PBE size that we can have */ + for (i = 0; i < buf_cnt; i++) { + /* first addr may not be page aligned, so ignore checking */ + if ((i != 0) && ((buf_list[i].addr & ~PAGE_MASK) || + (buf_list[i].size & ~PAGE_MASK))) { + return 0; + } + + /* if configured PBE size is greater then the chosen one, + * reduce the PBE size. + */ + buf_size = roundup(buf_list[i].size, PAGE_SIZE); + /* pbe_size has to be even multiple of 4K 1,2,4,8...*/ + buf_size = roundup_pow_of_two(buf_size); + if (*pbe_size > buf_size) + *pbe_size = buf_size; + + total_size += buf_size; + } + *pbe_size = *pbe_size > MAX_KERNEL_PBE_SIZE ? + (MAX_KERNEL_PBE_SIZE) : (*pbe_size); + + /* num_pbes = total_size / (*pbe_size); this is implemented below. */ + + return total_size >> ilog2(*pbe_size); +} + +static void build_kernel_pbes(struct ib_phys_buf *buf_list, int ib_buf_cnt, + u32 pbe_size, struct ocrdma_pbl *pbl_tbl, + struct ocrdma_hw_mr *hwmr) +{ + int i; + int idx; + int pbes_per_buf = 0; + u64 buf_addr = 0; + int num_pbes; + struct ocrdma_pbe *pbe; + int total_num_pbes = 0; + + if (!hwmr->num_pbes) + return; + + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + num_pbes = 0; + + /* go through the OS phy regions & fill hw pbe entries into pbls. */ + for (i = 0; i < ib_buf_cnt; i++) { + buf_addr = buf_list[i].addr; + pbes_per_buf = + roundup_pow_of_two(roundup(buf_list[i].size, PAGE_SIZE)) / + pbe_size; + hwmr->len += buf_list[i].size; + /* number of pbes can be more for one OS buf, when + * buffers are of different sizes. + * split the ib_buf to one or more pbes. + */ + for (idx = 0; idx < pbes_per_buf; idx++) { + /* we program always page aligned addresses, + * first unaligned address is taken care by fbo. + */ + if (i == 0) { + /* for non zero fbo, assign the + * start of the page. + */ + pbe->pa_lo = + cpu_to_le32((u32) (buf_addr & PAGE_MASK)); + pbe->pa_hi = + cpu_to_le32((u32) upper_32_bits(buf_addr)); + } else { + pbe->pa_lo = + cpu_to_le32((u32) (buf_addr & 0xffffffff)); + pbe->pa_hi = + cpu_to_le32((u32) upper_32_bits(buf_addr)); + } + buf_addr += pbe_size; + num_pbes += 1; + total_num_pbes += 1; + pbe++; + + if (total_num_pbes == hwmr->num_pbes) + goto mr_tbl_done; + /* if the pbl is full storing the pbes, + * move to next pbl. + */ + if (num_pbes == (hwmr->pbl_size/sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + num_pbes = 0; + } + } + } +mr_tbl_done: + return; +} + +struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *ibpd, + struct ib_phys_buf *buf_list, + int buf_cnt, int acc, u64 *iova_start) +{ + int status = -ENOMEM; + struct ocrdma_mr *mr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + u32 num_pbes; + u32 pbe_size = 0; + + if ((acc & IB_ACCESS_REMOTE_WRITE) && !(acc & IB_ACCESS_LOCAL_WRITE)) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(status); + + num_pbes = count_kernel_pbes(buf_list, buf_cnt, &pbe_size); + if (num_pbes == 0) { + status = -EINVAL; + goto pbl_err; + } + status = ocrdma_get_pbl_info(dev, mr, num_pbes); + if (status) + goto pbl_err; + + mr->hwmr.pbe_size = pbe_size; + mr->hwmr.fbo = *iova_start - (buf_list[0].addr & PAGE_MASK); + mr->hwmr.va = *iova_start; + mr->hwmr.local_rd = 1; + mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; + mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 1 : 0; + + status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); + if (status) + goto pbl_err; + build_kernel_pbes(buf_list, buf_cnt, pbe_size, mr->hwmr.pbl_table, + &mr->hwmr); + status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc); + if (status) + goto mbx_err; + + mr->ibmr.lkey = mr->hwmr.lkey; + if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) + mr->ibmr.rkey = mr->hwmr.lkey; + return &mr->ibmr; + +mbx_err: + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); +pbl_err: + kfree(mr); + return ERR_PTR(status); +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index 633f03d8027..b8f7853fd36 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -72,6 +72,7 @@ int ocrdma_query_qp(struct ib_qp *, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *); int ocrdma_destroy_qp(struct ib_qp *); +void ocrdma_del_flush_qp(struct ocrdma_qp *qp); struct ib_srq *ocrdma_create_srq(struct ib_pd *, struct ib_srq_init_attr *, struct ib_udata *); @@ -89,5 +90,10 @@ struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *, int num_phys_buf, int acc, u64 *iova_start); struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length, u64 virt, int acc, struct ib_udata *); +struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *pd, int max_page_list_len); +struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device + *ibdev, + int page_list_len); +void ocrdma_free_frmr_page_list(struct ib_fast_reg_page_list *page_list); #endif /* __OCRDMA_VERBS_H__ */ diff --git a/drivers/infiniband/hw/qib/Kconfig b/drivers/infiniband/hw/qib/Kconfig index 8349f9c5064..495be09781b 100644 --- a/drivers/infiniband/hw/qib/Kconfig +++ b/drivers/infiniband/hw/qib/Kconfig @@ -1,7 +1,15 @@ config INFINIBAND_QIB - tristate "QLogic PCIe HCA support" + tristate "Intel PCIe HCA support" depends on 64BIT ---help--- - This is a low-level driver for QLogic PCIe QLE InfiniBand host - channel adapters. This driver does not support the QLogic + This is a low-level driver for Intel PCIe QLE InfiniBand host + channel adapters. This driver does not support the Intel HyperTransport card (model QHT7140). + +config INFINIBAND_QIB_DCA + bool "QIB DCA support" + depends on INFINIBAND_QIB && DCA && SMP && !(INFINIBAND_QIB=y && DCA=m) + default y + ---help--- + Setting this enables DCA support on some Intel chip sets + with the iba7322 HCA. diff --git a/drivers/infiniband/hw/qib/Makefile b/drivers/infiniband/hw/qib/Makefile index f12d7bb8b39..57f8103e51f 100644 --- a/drivers/infiniband/hw/qib/Makefile +++ b/drivers/infiniband/hw/qib/Makefile @@ -13,3 +13,4 @@ ib_qib-$(CONFIG_PCI_MSI) += qib_iba6120.o ib_qib-$(CONFIG_X86_64) += qib_wc_x86_64.o ib_qib-$(CONFIG_PPC64) += qib_wc_ppc64.o +ib_qib-$(CONFIG_DEBUG_FS) += qib_debugfs.o diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 4d11575c201..c00ae093b6f 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1,7 +1,7 @@ #ifndef _QIB_KERNEL_H #define _QIB_KERNEL_H /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * @@ -51,6 +51,7 @@ #include <linux/completion.h> #include <linux/kref.h> #include <linux/sched.h> +#include <linux/kthread.h> #include "qib_common.h" #include "qib_verbs.h" @@ -88,7 +89,6 @@ struct qlogic_ib_stats { extern struct qlogic_ib_stats qib_stats; extern const struct pci_error_handlers qib_pci_err_handler; -extern struct pci_driver qib_driver; #define QIB_CHIP_SWVERSION QIB_CHIP_VERS_MAJ /* @@ -114,6 +114,11 @@ struct qib_eep_log_mask { /* * Below contains all data related to a single context (formerly called port). */ + +#ifdef CONFIG_DEBUG_FS +struct qib_opcode_stats_perctx; +#endif + struct qib_ctxtdata { void **rcvegrbuf; dma_addr_t *rcvegrbuf_phys; @@ -154,6 +159,8 @@ struct qib_ctxtdata { */ /* instead of calculating it */ unsigned ctxt; + /* local node of context */ + int node_id; /* non-zero if ctxt is being shared. */ u16 subctxt_cnt; /* non-zero if ctxt is being shared. */ @@ -222,12 +229,15 @@ struct qib_ctxtdata { u8 redirect_seq_cnt; /* ctxt rcvhdrq head offset */ u32 head; - u32 pkt_count; /* lookaside fields */ struct qib_qp *lookaside_qp; u32 lookaside_qpn; /* QPs waiting for context processing */ struct list_head qp_wait_list; +#ifdef CONFIG_DEBUG_FS + /* verbs stats per CTX */ + struct qib_opcode_stats_perctx *opstats; +#endif }; struct qib_sge_state; @@ -428,9 +438,19 @@ struct qib_verbs_txreq { #define ACTIVITY_TIMER 5 #define MAX_NAME_SIZE 64 + +#ifdef CONFIG_INFINIBAND_QIB_DCA +struct qib_irq_notify; +#endif + struct qib_msix_entry { struct msix_entry msix; void *arg; +#ifdef CONFIG_INFINIBAND_QIB_DCA + int dca; + int rcv; + struct qib_irq_notify *notifier; +#endif char name[MAX_NAME_SIZE]; cpumask_var_t mask; }; @@ -555,11 +575,13 @@ struct qib_pportdata { /* read/write using lock */ spinlock_t sdma_lock ____cacheline_aligned_in_smp; struct list_head sdma_activelist; + struct list_head sdma_userpending; u64 sdma_descq_added; u64 sdma_descq_removed; u16 sdma_descq_tail; u16 sdma_descq_head; u8 sdma_generation; + u8 sdma_intrequest; struct tasklet_struct sdma_sw_clean_up_task ____cacheline_aligned_in_smp; @@ -828,6 +850,9 @@ struct qib_devdata { struct qib_ctxtdata *); void (*f_writescratch)(struct qib_devdata *, u32); int (*f_tempsense_rd)(struct qib_devdata *, int regnum); +#ifdef CONFIG_INFINIBAND_QIB_DCA + int (*f_notify_dca)(struct qib_devdata *, unsigned long event); +#endif char *boardname; /* human readable board info */ @@ -843,8 +868,10 @@ struct qib_devdata { /* last buffer for user use */ u32 lastctxt_piobuf; - /* saturating counter of (non-port-specific) device interrupts */ - u32 int_counter; + /* reset value */ + u64 z_int_counter; + /* percpu intcounter */ + u64 __percpu *int_counter; /* pio bufs allocated per ctxt */ u32 pbufsctxt; @@ -1075,6 +1102,10 @@ struct qib_devdata { u16 psxmitwait_check_rate; /* high volume overflow errors defered to tasklet */ struct tasklet_struct error_tasklet; + /* per device cq worker */ + struct kthread_worker *worker; + + int assigned_node_id; /* NUMA node closest to HCA */ }; /* hol_state values */ @@ -1154,8 +1185,8 @@ int qib_create_rcvhdrq(struct qib_devdata *, struct qib_ctxtdata *); int qib_setup_eagerbufs(struct qib_ctxtdata *); void qib_set_ctxtcnt(struct qib_devdata *); int qib_create_ctxts(struct qib_devdata *dd); -struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32); -void qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8); +struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32, int); +int qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8); void qib_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *); u32 qib_kreceive(struct qib_ctxtdata *, u32 *, u32 *); @@ -1298,6 +1329,8 @@ int qib_setup_sdma(struct qib_pportdata *); void qib_teardown_sdma(struct qib_pportdata *); void __qib_sdma_intr(struct qib_pportdata *); void qib_sdma_intr(struct qib_pportdata *); +void qib_user_sdma_send_desc(struct qib_pportdata *dd, + struct list_head *pktlist); int qib_sdma_verbs_send(struct qib_pportdata *, struct qib_sge_state *, u32, struct qib_verbs_txreq *); /* ppd->sdma_lock should be locked before calling this. */ @@ -1320,7 +1353,7 @@ static inline int __qib_sdma_running(struct qib_pportdata *ppd) return ppd->sdma_state.current_state == qib_sdma_state_s99_running; } int qib_sdma_running(struct qib_pportdata *); - +void dump_sdma_state(struct qib_pportdata *ppd); void __qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events); void qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events); @@ -1418,6 +1451,10 @@ void qib_nomsi(struct qib_devdata *); void qib_nomsix(struct qib_devdata *); void qib_pcie_getcmd(struct qib_devdata *, u16 *, u8 *, u8 *); void qib_pcie_reenable(struct qib_devdata *, u16, u8, u8); +/* interrupts for device */ +u64 qib_int_counter(struct qib_devdata *); +/* interrupt for all devices */ +u64 qib_sps_ints(void); /* * dma_addr wrappers - all 0's invalid for hw @@ -1445,6 +1482,7 @@ extern unsigned qib_n_krcv_queues; extern unsigned qib_sdma_fetch_arb; extern unsigned qib_compat_ddr_negotiate; extern int qib_special_trigger; +extern unsigned qib_numa_aware; extern struct mutex qib_mutex; @@ -1474,27 +1512,23 @@ extern struct mutex qib_mutex; * first to avoid possible serial port delays from printk. */ #define qib_early_err(dev, fmt, ...) \ - do { \ - dev_err(dev, fmt, ##__VA_ARGS__); \ - } while (0) + dev_err(dev, fmt, ##__VA_ARGS__) #define qib_dev_err(dd, fmt, ...) \ - do { \ - dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ - qib_get_unit_name((dd)->unit), ##__VA_ARGS__); \ - } while (0) + dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ + qib_get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define qib_dev_warn(dd, fmt, ...) \ + dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ + qib_get_unit_name((dd)->unit), ##__VA_ARGS__) #define qib_dev_porterr(dd, port, fmt, ...) \ - do { \ - dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \ - qib_get_unit_name((dd)->unit), (dd)->unit, (port), \ - ##__VA_ARGS__); \ - } while (0) + dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \ + qib_get_unit_name((dd)->unit), (dd)->unit, (port), \ + ##__VA_ARGS__) #define qib_devinfo(pcidev, fmt, ...) \ - do { \ - dev_info(&(pcidev)->dev, fmt, ##__VA_ARGS__); \ - } while (0) + dev_info(&(pcidev)->dev, fmt, ##__VA_ARGS__) /* * this is used for formatting hw error messages... diff --git a/drivers/infiniband/hw/qib/qib_common.h b/drivers/infiniband/hw/qib/qib_common.h index d39e0183ff8..5670ace27c6 100644 --- a/drivers/infiniband/hw/qib/qib_common.h +++ b/drivers/infiniband/hw/qib/qib_common.h @@ -279,7 +279,7 @@ struct qib_base_info { * may not be implemented; the user code must deal with this if it * cares, or it must abort after initialization reports the difference. */ -#define QIB_USER_SWMINOR 11 +#define QIB_USER_SWMINOR 13 #define QIB_USER_SWVERSION ((QIB_USER_SWMAJOR << 16) | QIB_USER_SWMINOR) @@ -701,7 +701,37 @@ struct qib_message_header { __be32 bth[3]; /* fields below this point are in host byte order */ struct qib_header iph; + /* fields below are simplified, but should match PSM */ + /* some are accessed by driver when packet spliting is needed */ __u8 sub_opcode; + __u8 flags; + __u16 commidx; + __u32 ack_seq_num; + __u8 flowid; + __u8 hdr_dlen; + __u16 mqhdr; + __u32 uwords[4]; +}; + +/* sequence number bits for message */ +union qib_seqnum { + struct { + __u32 seq:11; + __u32 gen:8; + __u32 flow:5; + }; + struct { + __u32 pkt:16; + __u32 msg:8; + }; + __u32 val; +}; + +/* qib receiving-dma tid-session-member */ +struct qib_tid_session_member { + __u16 tid; + __u16 offset; + __u16 length; }; /* IB - LRH header consts */ diff --git a/drivers/infiniband/hw/qib/qib_cq.c b/drivers/infiniband/hw/qib/qib_cq.c index 5246aa486bb..ab4e11cfab1 100644 --- a/drivers/infiniband/hw/qib/qib_cq.c +++ b/drivers/infiniband/hw/qib/qib_cq.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006, 2007, 2008, 2010 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -34,8 +35,10 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/kthread.h> #include "qib_verbs.h" +#include "qib.h" /** * qib_cq_enter - add a new entry to the completion queue @@ -102,13 +105,18 @@ void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int solicited) if (cq->notify == IB_CQ_NEXT_COMP || (cq->notify == IB_CQ_SOLICITED && (solicited || entry->status != IB_WC_SUCCESS))) { - cq->notify = IB_CQ_NONE; - cq->triggered++; + struct kthread_worker *worker; /* * This will cause send_complete() to be called in * another thread. */ - queue_work(qib_cq_wq, &cq->comptask); + smp_rmb(); + worker = cq->dd->worker; + if (likely(worker)) { + cq->notify = IB_CQ_NONE; + cq->triggered++; + queue_kthread_work(worker, &cq->comptask); + } } spin_unlock_irqrestore(&cq->lock, flags); @@ -163,7 +171,7 @@ bail: return npolled; } -static void send_complete(struct work_struct *work) +static void send_complete(struct kthread_work *work) { struct qib_cq *cq = container_of(work, struct qib_cq, comptask); @@ -287,11 +295,12 @@ struct ib_cq *qib_create_cq(struct ib_device *ibdev, int entries, * The number of entries should be >= the number requested or return * an error. */ + cq->dd = dd_from_dev(dev); cq->ibcq.cqe = entries; cq->notify = IB_CQ_NONE; cq->triggered = 0; spin_lock_init(&cq->lock); - INIT_WORK(&cq->comptask, send_complete); + init_kthread_work(&cq->comptask, send_complete); wc->head = 0; wc->tail = 0; cq->queue = wc; @@ -323,7 +332,7 @@ int qib_destroy_cq(struct ib_cq *ibcq) struct qib_ibdev *dev = to_idev(ibcq->device); struct qib_cq *cq = to_icq(ibcq); - flush_work(&cq->comptask); + flush_kthread_work(&cq->comptask); spin_lock(&dev->n_cqs_lock); dev->n_cqs_allocated--; spin_unlock(&dev->n_cqs_lock); @@ -483,3 +492,49 @@ bail_free: bail: return ret; } + +int qib_cq_init(struct qib_devdata *dd) +{ + int ret = 0; + int cpu; + struct task_struct *task; + + if (dd->worker) + return 0; + dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL); + if (!dd->worker) + return -ENOMEM; + init_kthread_worker(dd->worker); + task = kthread_create_on_node( + kthread_worker_fn, + dd->worker, + dd->assigned_node_id, + "qib_cq%d", dd->unit); + if (IS_ERR(task)) + goto task_fail; + cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id)); + kthread_bind(task, cpu); + wake_up_process(task); +out: + return ret; +task_fail: + ret = PTR_ERR(task); + kfree(dd->worker); + dd->worker = NULL; + goto out; +} + +void qib_cq_exit(struct qib_devdata *dd) +{ + struct kthread_worker *worker; + + worker = dd->worker; + if (!worker) + return; + /* blocks future queuing from send_complete() */ + dd->worker = NULL; + smp_wmb(); + flush_kthread_worker(worker); + kthread_stop(worker->task); + kfree(worker); +} diff --git a/drivers/infiniband/hw/qib/qib_debugfs.c b/drivers/infiniband/hw/qib/qib_debugfs.c new file mode 100644 index 00000000000..799a0c3bffc --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_debugfs.c @@ -0,0 +1,283 @@ +#ifdef CONFIG_DEBUG_FS +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/kernel.h> +#include <linux/export.h> + +#include "qib.h" +#include "qib_verbs.h" +#include "qib_debugfs.h" + +static struct dentry *qib_dbg_root; + +#define DEBUGFS_FILE(name) \ +static const struct seq_operations _##name##_seq_ops = { \ + .start = _##name##_seq_start, \ + .next = _##name##_seq_next, \ + .stop = _##name##_seq_stop, \ + .show = _##name##_seq_show \ +}; \ +static int _##name##_open(struct inode *inode, struct file *s) \ +{ \ + struct seq_file *seq; \ + int ret; \ + ret = seq_open(s, &_##name##_seq_ops); \ + if (ret) \ + return ret; \ + seq = s->private_data; \ + seq->private = inode->i_private; \ + return 0; \ +} \ +static const struct file_operations _##name##_file_ops = { \ + .owner = THIS_MODULE, \ + .open = _##name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = seq_release \ +}; + +#define DEBUGFS_FILE_CREATE(name) \ +do { \ + struct dentry *ent; \ + ent = debugfs_create_file(#name , 0400, ibd->qib_ibdev_dbg, \ + ibd, &_##name##_file_ops); \ + if (!ent) \ + pr_warn("create of " #name " failed\n"); \ +} while (0) + +static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct qib_opcode_stats_perctx *opstats; + + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct qib_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + + +static void _opcode_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _opcode_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + for (j = 0; j < dd->first_user_ctxt; j++) { + if (!dd->rcd[j]) + continue; + n_packets += dd->rcd[j]->opstats->stats[i].n_packets; + n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes; + } + if (!n_packets && !n_bytes) + return SEQ_SKIP; + seq_printf(s, "%02llx %llu/%llu\n", i, + (unsigned long long) n_packets, + (unsigned long long) n_bytes); + + return 0; +} + +DEBUGFS_FILE(opcode_stats) + +static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (!*pos) + return SEQ_START_TOKEN; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) + return pos; + + ++*pos; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static void _ctx_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _ctx_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos; + loff_t i, j; + u64 n_packets = 0; + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) { + seq_puts(s, "Ctx:npkts\n"); + return 0; + } + + spos = v; + i = *spos; + + if (!dd->rcd[i]) + return SEQ_SKIP; + + for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++) + n_packets += dd->rcd[i]->opstats->stats[j].n_packets; + + if (!n_packets) + return SEQ_SKIP; + + seq_printf(s, " %llu:%llu\n", i, n_packets); + return 0; +} + +DEBUGFS_FILE(ctx_stats) + +static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct qib_qp_iter *iter; + loff_t n = *pos; + + iter = qib_qp_iter_init(s->private); + if (!iter) + return NULL; + + while (n--) { + if (qib_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, + loff_t *pos) +{ + struct qib_qp_iter *iter = iter_ptr; + + (*pos)++; + + if (qib_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr) +{ + /* nothing for now */ +} + +static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr) +{ + struct qib_qp_iter *iter = iter_ptr; + + if (!iter) + return 0; + + qib_qp_iter_print(s, iter); + + return 0; +} + +DEBUGFS_FILE(qp_stats) + +void qib_dbg_ibdev_init(struct qib_ibdev *ibd) +{ + char name[10]; + + snprintf(name, sizeof(name), "qib%d", dd_from_dev(ibd)->unit); + ibd->qib_ibdev_dbg = debugfs_create_dir(name, qib_dbg_root); + if (!ibd->qib_ibdev_dbg) { + pr_warn("create of %s failed\n", name); + return; + } + DEBUGFS_FILE_CREATE(opcode_stats); + DEBUGFS_FILE_CREATE(ctx_stats); + DEBUGFS_FILE_CREATE(qp_stats); + return; +} + +void qib_dbg_ibdev_exit(struct qib_ibdev *ibd) +{ + if (!qib_dbg_root) + goto out; + debugfs_remove_recursive(ibd->qib_ibdev_dbg); +out: + ibd->qib_ibdev_dbg = NULL; +} + +void qib_dbg_init(void) +{ + qib_dbg_root = debugfs_create_dir(QIB_DRV_NAME, NULL); + if (!qib_dbg_root) + pr_warn("init of debugfs failed\n"); +} + +void qib_dbg_exit(void) +{ + debugfs_remove_recursive(qib_dbg_root); + qib_dbg_root = NULL; +} + +#endif + diff --git a/drivers/infiniband/hw/qib/qib_debugfs.h b/drivers/infiniband/hw/qib/qib_debugfs.h new file mode 100644 index 00000000000..7ae983a91b8 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_debugfs.h @@ -0,0 +1,45 @@ +#ifndef _QIB_DEBUGFS_H +#define _QIB_DEBUGFS_H + +#ifdef CONFIG_DEBUG_FS +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +struct qib_ibdev; +void qib_dbg_ibdev_init(struct qib_ibdev *ibd); +void qib_dbg_ibdev_exit(struct qib_ibdev *ibd); +void qib_dbg_init(void); +void qib_dbg_exit(void); + +#endif + +#endif /* _QIB_DEBUGFS_H */ diff --git a/drivers/infiniband/hw/qib/qib_diag.c b/drivers/infiniband/hw/qib/qib_diag.c index 1686fd4bda8..5dfda4c5cc9 100644 --- a/drivers/infiniband/hw/qib/qib_diag.c +++ b/drivers/infiniband/hw/qib/qib_diag.c @@ -546,7 +546,7 @@ static ssize_t qib_diagpkt_write(struct file *fp, size_t count, loff_t *off) { u32 __iomem *piobuf; - u32 plen, clen, pbufn; + u32 plen, pbufn, maxlen_reserve; struct qib_diag_xpkt dp; u32 *tmpbuf = NULL; struct qib_devdata *dd; @@ -590,15 +590,20 @@ static ssize_t qib_diagpkt_write(struct file *fp, } ppd = &dd->pport[dp.port - 1]; - /* need total length before first word written */ - /* +1 word is for the qword padding */ - plen = sizeof(u32) + dp.len; - clen = dp.len >> 2; - - if ((plen + 4) > ppd->ibmaxlen) { + /* + * need total length before first word written, plus 2 Dwords. One Dword + * is for padding so we get the full user data when not aligned on + * a word boundary. The other Dword is to make sure we have room for the + * ICRC which gets tacked on later. + */ + maxlen_reserve = 2 * sizeof(u32); + if (dp.len > ppd->ibmaxlen - maxlen_reserve) { ret = -EINVAL; - goto bail; /* before writing pbc */ + goto bail; } + + plen = sizeof(u32) + dp.len; + tmpbuf = vmalloc(plen); if (!tmpbuf) { qib_devinfo(dd->pcidev, @@ -638,11 +643,11 @@ static ssize_t qib_diagpkt_write(struct file *fp, */ if (dd->flags & QIB_PIO_FLUSH_WC) { qib_flush_wc(); - qib_pio_copy(piobuf + 2, tmpbuf, clen - 1); + qib_pio_copy(piobuf + 2, tmpbuf, plen - 1); qib_flush_wc(); - __raw_writel(tmpbuf[clen - 1], piobuf + clen + 1); + __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1); } else - qib_pio_copy(piobuf + 2, tmpbuf, clen); + qib_pio_copy(piobuf + 2, tmpbuf, plen); if (dd->flags & QIB_USE_SPCL_TRIG) { u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; @@ -689,28 +694,23 @@ int qib_register_observer(struct qib_devdata *dd, const struct diag_observer *op) { struct diag_observer_list_elt *olp; - int ret = -EINVAL; + unsigned long flags; if (!dd || !op) - goto bail; - ret = -ENOMEM; + return -EINVAL; olp = vmalloc(sizeof *olp); if (!olp) { pr_err("vmalloc for observer failed\n"); - goto bail; + return -ENOMEM; } - if (olp) { - unsigned long flags; - spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); - olp->op = op; - olp->next = dd->diag_observer_list; - dd->diag_observer_list = olp; - spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); - ret = 0; - } -bail: - return ret; + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + olp->op = op; + olp->next = dd->diag_observer_list; + dd->diag_observer_list = olp; + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + + return 0; } /* Remove all registered observers when device is closed */ diff --git a/drivers/infiniband/hw/qib/qib_dma.c b/drivers/infiniband/hw/qib/qib_dma.c index 2920bb39a65..59fe092b4b0 100644 --- a/drivers/infiniband/hw/qib/qib_dma.c +++ b/drivers/infiniband/hw/qib/qib_dma.c @@ -108,6 +108,10 @@ static int qib_map_sg(struct ib_device *dev, struct scatterlist *sgl, ret = 0; break; } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif } return ret; } @@ -119,21 +123,6 @@ static void qib_unmap_sg(struct ib_device *dev, BUG_ON(!valid_dma_direction(direction)); } -static u64 qib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg) -{ - u64 addr = (u64) page_address(sg_page(sg)); - - if (addr) - addr += sg->offset; - return addr; -} - -static unsigned int qib_sg_dma_len(struct ib_device *dev, - struct scatterlist *sg) -{ - return sg->length; -} - static void qib_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { @@ -173,8 +162,6 @@ struct ib_dma_mapping_ops qib_dma_mapping_ops = { .unmap_page = qib_dma_unmap_page, .map_sg = qib_map_sg, .unmap_sg = qib_unmap_sg, - .dma_address = qib_sg_dma_address, - .dma_len = qib_sg_dma_len, .sync_single_for_cpu = qib_sync_single_for_cpu, .sync_single_for_device = qib_sync_single_for_device, .alloc_coherent = qib_dma_alloc_coherent, diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index 5423edcab51..5bee08f16d7 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * @@ -63,8 +64,8 @@ MODULE_PARM_DESC(compat_ddr_negotiate, "Attempt pre-IBTA 1.2 DDR speed negotiation"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("QLogic <support@qlogic.com>"); -MODULE_DESCRIPTION("QLogic IB driver"); +MODULE_AUTHOR("Intel <ibsupport@intel.com>"); +MODULE_DESCRIPTION("Intel IB driver"); MODULE_VERSION(QIB_DRIVER_VERSION); /* @@ -557,7 +558,6 @@ move_along: } rcd->head = l; - rcd->pkt_count += i; /* * Iterate over all QPs waiting to respond. diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 959a5c4ff81..b15e34eeef6 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * @@ -39,7 +39,7 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/io.h> -#include <linux/uio.h> +#include <linux/aio.h> #include <linux/jiffies.h> #include <asm/pgtable.h> #include <linux/delay.h> @@ -1155,6 +1155,49 @@ static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt) return pollflag; } +static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd) +{ + struct qib_filedata *fd = fp->private_data; + const unsigned int weight = cpumask_weight(¤t->cpus_allowed); + const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus); + int local_cpu; + + /* + * If process has NOT already set it's affinity, select and + * reserve a processor for it on the local NUMA node. + */ + if ((weight >= qib_cpulist_count) && + (cpumask_weight(local_mask) <= qib_cpulist_count)) { + for_each_cpu(local_cpu, local_mask) + if (!test_and_set_bit(local_cpu, qib_cpulist)) { + fd->rec_cpu_num = local_cpu; + return; + } + } + + /* + * If process has NOT already set it's affinity, select and + * reserve a processor for it, as a rendevous for all + * users of the driver. If they don't actually later + * set affinity to this cpu, or set it to some other cpu, + * it just means that sooner or later we don't recommend + * a cpu, and let the scheduler do it's best. + */ + if (weight >= qib_cpulist_count) { + int cpu; + cpu = find_first_zero_bit(qib_cpulist, + qib_cpulist_count); + if (cpu == qib_cpulist_count) + qib_dev_err(dd, + "no cpus avail for affinity PID %u\n", + current->pid); + else { + __set_bit(cpu, qib_cpulist); + fd->rec_cpu_num = cpu; + } + } +} + /* * Check that userland and driver are compatible for subcontexts. */ @@ -1177,7 +1220,7 @@ static int qib_compatible_subctxts(int user_swmajor, int user_swminor) return user_swminor == 3; default: /* >= 4 are compatible (or are expected to be) */ - return user_swminor >= 4; + return user_swminor <= QIB_USER_SWMINOR; } } /* make no promises yet for future major versions */ @@ -1259,12 +1302,20 @@ bail: static int setup_ctxt(struct qib_pportdata *ppd, int ctxt, struct file *fp, const struct qib_user_info *uinfo) { + struct qib_filedata *fd = fp->private_data; struct qib_devdata *dd = ppd->dd; struct qib_ctxtdata *rcd; void *ptmp = NULL; int ret; + int numa_id; + + assign_ctxt_affinity(fp, dd); + + numa_id = qib_numa_aware ? ((fd->rec_cpu_num != -1) ? + cpu_to_node(fd->rec_cpu_num) : + numa_node_id()) : dd->assigned_node_id; - rcd = qib_create_ctxtdata(ppd, ctxt); + rcd = qib_create_ctxtdata(ppd, ctxt, numa_id); /* * Allocate memory for use in qib_tid_update() at open to @@ -1296,6 +1347,9 @@ static int setup_ctxt(struct qib_pportdata *ppd, int ctxt, goto bail; bailerr: + if (fd->rec_cpu_num != -1) + __clear_bit(fd->rec_cpu_num, qib_cpulist); + dd->rcd[ctxt] = NULL; kfree(rcd); kfree(ptmp); @@ -1405,7 +1459,7 @@ static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo, cused++; else cfree++; - if (pusable && cfree && cused < inuse) { + if (cfree && cused < inuse) { udd = dd; inuse = cused; } @@ -1485,6 +1539,58 @@ static int qib_open(struct inode *in, struct file *fp) return fp->private_data ? 0 : -ENOMEM; } +static int find_hca(unsigned int cpu, int *unit) +{ + int ret = 0, devmax, npresent, nup, ndev; + + *unit = -1; + + devmax = qib_count_units(&npresent, &nup); + if (!npresent) { + ret = -ENXIO; + goto done; + } + if (!nup) { + ret = -ENETDOWN; + goto done; + } + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + if (dd) { + if (pcibus_to_node(dd->pcidev->bus) < 0) { + ret = -EINVAL; + goto done; + } + if (cpu_to_node(cpu) == + pcibus_to_node(dd->pcidev->bus)) { + *unit = ndev; + goto done; + } + } + } +done: + return ret; +} + +static int do_qib_user_sdma_queue_create(struct file *fp) +{ + struct qib_filedata *fd = fp->private_data; + struct qib_ctxtdata *rcd = fd->rcd; + struct qib_devdata *dd = rcd->dd; + + if (dd->flags & QIB_HAS_SEND_DMA) { + + fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev, + dd->unit, + rcd->ctxt, + fd->subctxt); + if (!fd->pq) + return -ENOMEM; + } + + return 0; +} + /* * Get ctxt early, so can set affinity prior to memory allocation. */ @@ -1517,61 +1623,36 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo) if (qib_compatible_subctxts(swmajor, swminor) && uinfo->spu_subctxt_cnt) { ret = find_shared_ctxt(fp, uinfo); - if (ret) { - if (ret > 0) - ret = 0; - goto done_chk_sdma; + if (ret > 0) { + ret = do_qib_user_sdma_queue_create(fp); + if (!ret) + assign_ctxt_affinity(fp, (ctxt_fp(fp))->dd); + goto done_ok; } } - i_minor = iminor(fp->f_dentry->d_inode) - QIB_USER_MINOR_BASE; + i_minor = iminor(file_inode(fp)) - QIB_USER_MINOR_BASE; if (i_minor) ret = find_free_ctxt(i_minor - 1, fp, uinfo); - else + else { + int unit; + const unsigned int cpu = cpumask_first(¤t->cpus_allowed); + const unsigned int weight = + cpumask_weight(¤t->cpus_allowed); + + if (weight == 1 && !test_bit(cpu, qib_cpulist)) + if (!find_hca(cpu, &unit) && unit >= 0) + if (!find_free_ctxt(unit, fp, uinfo)) { + ret = 0; + goto done_chk_sdma; + } ret = get_a_ctxt(fp, uinfo, alg); - -done_chk_sdma: - if (!ret) { - struct qib_filedata *fd = fp->private_data; - const struct qib_ctxtdata *rcd = fd->rcd; - const struct qib_devdata *dd = rcd->dd; - unsigned int weight; - - if (dd->flags & QIB_HAS_SEND_DMA) { - fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev, - dd->unit, - rcd->ctxt, - fd->subctxt); - if (!fd->pq) - ret = -ENOMEM; - } - - /* - * If process has NOT already set it's affinity, select and - * reserve a processor for it, as a rendezvous for all - * users of the driver. If they don't actually later - * set affinity to this cpu, or set it to some other cpu, - * it just means that sooner or later we don't recommend - * a cpu, and let the scheduler do it's best. - */ - weight = cpumask_weight(tsk_cpus_allowed(current)); - if (!ret && weight >= qib_cpulist_count) { - int cpu; - cpu = find_first_zero_bit(qib_cpulist, - qib_cpulist_count); - if (cpu != qib_cpulist_count) { - __set_bit(cpu, qib_cpulist); - fd->rec_cpu_num = cpu; - } - } else if (weight == 1 && - test_bit(cpumask_first(tsk_cpus_allowed(current)), - qib_cpulist)) - qib_devinfo(dd->pcidev, - "%s PID %u affinity set to cpu %d; already allocated\n", - current->comm, current->pid, - cpumask_first(tsk_cpus_allowed(current))); } +done_chk_sdma: + if (!ret) + ret = do_qib_user_sdma_queue_create(fp); +done_ok: mutex_unlock(&qib_mutex); done: @@ -2208,7 +2289,7 @@ int qib_cdev_init(int minor, const char *name, goto err_cdev; } - device = device_create(qib_class, NULL, dev, NULL, name); + device = device_create(qib_class, NULL, dev, NULL, "%s", name); if (!IS_ERR(device)) goto done; ret = PTR_ERR(device); diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index 65a2a23f6f8..cab610ccd50 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -45,7 +45,7 @@ static struct super_block *qib_super; -#define private2dd(file) ((file)->f_dentry->d_inode->i_private) +#define private2dd(file) (file_inode(file)->i_private) static int qibfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, const struct file_operations *fops, @@ -105,6 +105,7 @@ static int create_file(const char *name, umode_t mode, static ssize_t driver_stats_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + qib_stats.sps_ints = qib_sps_ints(); return simple_read_from_buffer(buf, count, ppos, &qib_stats, sizeof qib_stats); } @@ -171,7 +172,7 @@ static const struct file_operations cntr_ops[] = { }; /* - * Could use file->f_dentry->d_inode->i_ino to figure out which file, + * Could use file_inode(file)->i_ino to figure out which file, * instead of separate routine for each, but for now, this works... */ @@ -456,13 +457,13 @@ static int remove_file(struct dentry *parent, char *name) spin_lock(&tmp->d_lock); if (!(d_unhashed(tmp) && tmp->d_inode)) { - dget_dlock(tmp); __d_drop(tmp); spin_unlock(&tmp->d_lock); simple_unlink(parent->d_inode, tmp); } else { spin_unlock(&tmp->d_lock); } + dput(tmp); ret = 0; bail: @@ -491,6 +492,7 @@ static int remove_device_files(struct super_block *sb, goto bail; } + mutex_lock(&dir->d_inode->i_mutex); remove_file(dir, "counters"); remove_file(dir, "counter_names"); remove_file(dir, "portcounter_names"); @@ -505,8 +507,10 @@ static int remove_device_files(struct super_block *sb, } } remove_file(dir, "flash"); - d_delete(dir); + mutex_unlock(&dir->d_inode->i_mutex); ret = simple_rmdir(root->d_inode, dir); + d_delete(dir); + dput(dir); bail: mutex_unlock(&root->d_inode->i_mutex); @@ -604,6 +608,7 @@ static struct file_system_type qibfs_fs_type = { .mount = qibfs_mount, .kill_sb = qibfs_kill_super, }; +MODULE_ALIAS_FS("ipathfs"); int __init qib_init_qibfs(void) { diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c index a099ac171e2..d68266ac761 100644 --- a/drivers/infiniband/hw/qib/qib_iba6120.c +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. * All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. @@ -51,7 +52,7 @@ static u32 qib_6120_iblink_state(u64); /* * This file contains all the chip-specific register information and - * access functions for the QLogic QLogic_IB PCI-Express chip. + * access functions for the Intel Intel_IB PCI-Express chip. * */ @@ -1633,9 +1634,7 @@ static irqreturn_t qib_6120intr(int irq, void *data) goto bail; } - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT | QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR))) @@ -1807,7 +1806,8 @@ static int qib_6120_setup_reset(struct qib_devdata *dd) * isn't set. */ dd->flags &= ~(QIB_INITTED | QIB_PRESENT); - dd->int_counter = 0; /* so we check interrupts work again */ + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); val = dd->control | QLOGIC_IB_C_RESET; writeq(val, &dd->kregbase[kr_control]); mb(); /* prevent compiler re-ordering around actual reset */ @@ -3265,7 +3265,9 @@ static int init_6120_variables(struct qib_devdata *dd) dd->eep_st_masks[2].errs_to_log = ERR_MASK(ResetNegated); - qib_init_pportdata(ppd, dd, 0, 1); + ret = qib_init_pportdata(ppd, dd, 0, 1); + if (ret) + goto bail; ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; ppd->link_speed_supported = QIB_IB_SDR; ppd->link_width_enabled = IB_WIDTH_4X; @@ -3463,6 +3465,13 @@ static int qib_6120_tempsense_rd(struct qib_devdata *dd, int regnum) return -ENXIO; } +#ifdef CONFIG_INFINIBAND_QIB_DCA +static int qib_6120_notify_dca(struct qib_devdata *dd, unsigned long event) +{ + return 0; +} +#endif + /* Dummy function, as 6120 boards never disable EEPROM Write */ static int qib_6120_eeprom_wen(struct qib_devdata *dd, int wen) { @@ -3538,6 +3547,9 @@ struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *pdev, dd->f_xgxs_reset = qib_6120_xgxs_reset; dd->f_writescratch = writescratch; dd->f_tempsense_rd = qib_6120_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_6120_notify_dca; +#endif /* * Do remaining pcie setup and save pcie values in dd. * Any error printing is already done by the init code. diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index 64d0ecb90cd..7dec89fdc12 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -1962,10 +1962,7 @@ static irqreturn_t qib_7220intr(int irq, void *data) goto bail; } - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; - + this_cpu_inc(*dd->int_counter); if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT | QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR))) unlikely_7220_intr(dd, istat); @@ -2120,7 +2117,8 @@ static int qib_setup_7220_reset(struct qib_devdata *dd) * isn't set. */ dd->flags &= ~(QIB_INITTED | QIB_PRESENT); - dd->int_counter = 0; /* so we check interrupts work again */ + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); val = dd->control | QLOGIC_IB_C_RESET; writeq(val, &dd->kregbase[kr_control]); mb(); /* prevent compiler reordering around actual reset */ @@ -4061,7 +4059,9 @@ static int qib_init_7220_variables(struct qib_devdata *dd) init_waitqueue_head(&cpspec->autoneg_wait); INIT_DELAYED_WORK(&cpspec->autoneg_work, autoneg_7220_work); - qib_init_pportdata(ppd, dd, 0, 1); + ret = qib_init_pportdata(ppd, dd, 0, 1); + if (ret) + goto bail; ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; ppd->link_speed_supported = QIB_IB_SDR | QIB_IB_DDR; @@ -4513,6 +4513,13 @@ bail: return ret; } +#ifdef CONFIG_INFINIBAND_QIB_DCA +static int qib_7220_notify_dca(struct qib_devdata *dd, unsigned long event) +{ + return 0; +} +#endif + /* Dummy function, as 7220 boards never disable EEPROM Write */ static int qib_7220_eeprom_wen(struct qib_devdata *dd, int wen) { @@ -4587,6 +4594,9 @@ struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *pdev, dd->f_xgxs_reset = qib_7220_xgxs_reset; dd->f_writescratch = writescratch; dd->f_tempsense_rd = qib_7220_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_7220_notify_dca; +#endif /* * Do remaining pcie setup and save pcie values in dd. * Any error printing is already done by the init code. diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 3f6b21e9dc1..a7eb32517a0 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -44,6 +44,9 @@ #include <linux/module.h> #include <rdma/ib_verbs.h> #include <rdma/ib_smi.h> +#ifdef CONFIG_INFINIBAND_QIB_DCA +#include <linux/dca.h> +#endif #include "qib.h" #include "qib_7322_regs.h" @@ -80,6 +83,7 @@ static void ibsd_wr_allchans(struct qib_pportdata *, int, unsigned, unsigned); static void serdes_7322_los_enable(struct qib_pportdata *, int); static int serdes_7322_init_old(struct qib_pportdata *); static int serdes_7322_init_new(struct qib_pportdata *); +static void dump_sdma_7322_state(struct qib_pportdata *); #define BMASK(msb, lsb) (((1 << ((msb) + 1 - (lsb))) - 1) << (lsb)) @@ -519,6 +523,14 @@ static const u8 qib_7322_physportstate[0x20] = { [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN }; +#ifdef CONFIG_INFINIBAND_QIB_DCA +struct qib_irq_notify { + int rcv; + void *arg; + struct irq_affinity_notify notify; +}; +#endif + struct qib_chip_specific { u64 __iomem *cregbase; u64 *cntrs; @@ -546,6 +558,12 @@ struct qib_chip_specific { u32 lastbuf_for_pio; u32 stay_in_freeze; u32 recovery_ports_initted; +#ifdef CONFIG_INFINIBAND_QIB_DCA + u32 dca_ctrl; + int rhdr_cpu[18]; + int sdma_cpu[2]; + u64 dca_rcvhdr_ctrl[5]; /* B, C, D, E, F */ +#endif struct qib_msix_entry *msix_entries; unsigned long *sendchkenable; unsigned long *sendgrhchk; @@ -573,7 +591,7 @@ struct vendor_txdds_ent { static void write_tx_serdes_param(struct qib_pportdata *, struct txdds_ent *); #define TXDDS_TABLE_SZ 16 /* number of entries per speed in onchip table */ -#define TXDDS_EXTRA_SZ 13 /* number of extra tx settings entries */ +#define TXDDS_EXTRA_SZ 18 /* number of extra tx settings entries */ #define TXDDS_MFG_SZ 2 /* number of mfg tx settings entries */ #define SERDES_CHANS 4 /* yes, it's obvious, but one less magic number */ @@ -635,6 +653,7 @@ struct qib_chippport_specific { u8 ibmalfusesnap; struct qib_qsfp_data qsfp_data; char epmsgbuf[192]; /* for port error interrupt msg buffer */ + char sdmamsgbuf[192]; /* for per-port sdma error messages */ }; static struct { @@ -642,27 +661,75 @@ static struct { irq_handler_t handler; int lsb; int port; /* 0 if not port-specific, else port # */ + int dca; } irq_table[] = { - { "", qib_7322intr, -1, 0 }, + { "", qib_7322intr, -1, 0, 0 }, { " (buf avail)", qib_7322bufavail, - SYM_LSB(IntStatus, SendBufAvail), 0 }, + SYM_LSB(IntStatus, SendBufAvail), 0, 0}, { " (sdma 0)", sdma_intr, - SYM_LSB(IntStatus, SDmaInt_0), 1 }, + SYM_LSB(IntStatus, SDmaInt_0), 1, 1 }, { " (sdma 1)", sdma_intr, - SYM_LSB(IntStatus, SDmaInt_1), 2 }, + SYM_LSB(IntStatus, SDmaInt_1), 2, 1 }, { " (sdmaI 0)", sdma_idle_intr, - SYM_LSB(IntStatus, SDmaIdleInt_0), 1 }, + SYM_LSB(IntStatus, SDmaIdleInt_0), 1, 1}, { " (sdmaI 1)", sdma_idle_intr, - SYM_LSB(IntStatus, SDmaIdleInt_1), 2 }, + SYM_LSB(IntStatus, SDmaIdleInt_1), 2, 1}, { " (sdmaP 0)", sdma_progress_intr, - SYM_LSB(IntStatus, SDmaProgressInt_0), 1 }, + SYM_LSB(IntStatus, SDmaProgressInt_0), 1, 1 }, { " (sdmaP 1)", sdma_progress_intr, - SYM_LSB(IntStatus, SDmaProgressInt_1), 2 }, + SYM_LSB(IntStatus, SDmaProgressInt_1), 2, 1 }, { " (sdmaC 0)", sdma_cleanup_intr, - SYM_LSB(IntStatus, SDmaCleanupDone_0), 1 }, + SYM_LSB(IntStatus, SDmaCleanupDone_0), 1, 0 }, { " (sdmaC 1)", sdma_cleanup_intr, - SYM_LSB(IntStatus, SDmaCleanupDone_1), 2 }, + SYM_LSB(IntStatus, SDmaCleanupDone_1), 2 , 0}, +}; + +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static const struct dca_reg_map { + int shadow_inx; + int lsb; + u64 mask; + u16 regno; +} dca_rcvhdr_reg_map[] = { + { 0, SYM_LSB(DCACtrlB, RcvHdrq0DCAOPH), + ~SYM_MASK(DCACtrlB, RcvHdrq0DCAOPH) , KREG_IDX(DCACtrlB) }, + { 0, SYM_LSB(DCACtrlB, RcvHdrq1DCAOPH), + ~SYM_MASK(DCACtrlB, RcvHdrq1DCAOPH) , KREG_IDX(DCACtrlB) }, + { 0, SYM_LSB(DCACtrlB, RcvHdrq2DCAOPH), + ~SYM_MASK(DCACtrlB, RcvHdrq2DCAOPH) , KREG_IDX(DCACtrlB) }, + { 0, SYM_LSB(DCACtrlB, RcvHdrq3DCAOPH), + ~SYM_MASK(DCACtrlB, RcvHdrq3DCAOPH) , KREG_IDX(DCACtrlB) }, + { 1, SYM_LSB(DCACtrlC, RcvHdrq4DCAOPH), + ~SYM_MASK(DCACtrlC, RcvHdrq4DCAOPH) , KREG_IDX(DCACtrlC) }, + { 1, SYM_LSB(DCACtrlC, RcvHdrq5DCAOPH), + ~SYM_MASK(DCACtrlC, RcvHdrq5DCAOPH) , KREG_IDX(DCACtrlC) }, + { 1, SYM_LSB(DCACtrlC, RcvHdrq6DCAOPH), + ~SYM_MASK(DCACtrlC, RcvHdrq6DCAOPH) , KREG_IDX(DCACtrlC) }, + { 1, SYM_LSB(DCACtrlC, RcvHdrq7DCAOPH), + ~SYM_MASK(DCACtrlC, RcvHdrq7DCAOPH) , KREG_IDX(DCACtrlC) }, + { 2, SYM_LSB(DCACtrlD, RcvHdrq8DCAOPH), + ~SYM_MASK(DCACtrlD, RcvHdrq8DCAOPH) , KREG_IDX(DCACtrlD) }, + { 2, SYM_LSB(DCACtrlD, RcvHdrq9DCAOPH), + ~SYM_MASK(DCACtrlD, RcvHdrq9DCAOPH) , KREG_IDX(DCACtrlD) }, + { 2, SYM_LSB(DCACtrlD, RcvHdrq10DCAOPH), + ~SYM_MASK(DCACtrlD, RcvHdrq10DCAOPH) , KREG_IDX(DCACtrlD) }, + { 2, SYM_LSB(DCACtrlD, RcvHdrq11DCAOPH), + ~SYM_MASK(DCACtrlD, RcvHdrq11DCAOPH) , KREG_IDX(DCACtrlD) }, + { 3, SYM_LSB(DCACtrlE, RcvHdrq12DCAOPH), + ~SYM_MASK(DCACtrlE, RcvHdrq12DCAOPH) , KREG_IDX(DCACtrlE) }, + { 3, SYM_LSB(DCACtrlE, RcvHdrq13DCAOPH), + ~SYM_MASK(DCACtrlE, RcvHdrq13DCAOPH) , KREG_IDX(DCACtrlE) }, + { 3, SYM_LSB(DCACtrlE, RcvHdrq14DCAOPH), + ~SYM_MASK(DCACtrlE, RcvHdrq14DCAOPH) , KREG_IDX(DCACtrlE) }, + { 3, SYM_LSB(DCACtrlE, RcvHdrq15DCAOPH), + ~SYM_MASK(DCACtrlE, RcvHdrq15DCAOPH) , KREG_IDX(DCACtrlE) }, + { 4, SYM_LSB(DCACtrlF, RcvHdrq16DCAOPH), + ~SYM_MASK(DCACtrlF, RcvHdrq16DCAOPH) , KREG_IDX(DCACtrlF) }, + { 4, SYM_LSB(DCACtrlF, RcvHdrq17DCAOPH), + ~SYM_MASK(DCACtrlF, RcvHdrq17DCAOPH) , KREG_IDX(DCACtrlF) }, }; +#endif /* ibcctrl bits */ #define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1 @@ -686,6 +753,13 @@ static void write_7322_init_portregs(struct qib_pportdata *); static void setup_7322_link_recovery(struct qib_pportdata *, u32); static void check_7322_rxe_status(struct qib_pportdata *); static u32 __iomem *qib_7322_getsendbuf(struct qib_pportdata *, u64, u32 *); +#ifdef CONFIG_INFINIBAND_QIB_DCA +static void qib_setup_dca(struct qib_devdata *dd); +static void setup_dca_notifier(struct qib_devdata *dd, + struct qib_msix_entry *m); +static void reset_dca_notifier(struct qib_devdata *dd, + struct qib_msix_entry *m); +#endif /** * qib_read_ureg32 - read 32-bit virtualized per-context register @@ -1522,6 +1596,8 @@ static void sdma_7322_p_errors(struct qib_pportdata *ppd, u64 errs) struct qib_devdata *dd = ppd->dd; errs &= QIB_E_P_SDMAERRS; + err_decode(ppd->cpspec->sdmamsgbuf, sizeof(ppd->cpspec->sdmamsgbuf), + errs, qib_7322p_error_msgs); if (errs & QIB_E_P_SDMAUNEXPDATA) qib_dev_err(dd, "IB%u:%u SDmaUnexpData\n", dd->unit, @@ -1529,6 +1605,15 @@ static void sdma_7322_p_errors(struct qib_pportdata *ppd, u64 errs) spin_lock_irqsave(&ppd->sdma_lock, flags); + if (errs != QIB_E_P_SDMAHALT) { + /* SDMA errors have QIB_E_P_SDMAHALT and another bit set */ + qib_dev_porterr(dd, ppd->port, + "SDMA %s 0x%016llx %s\n", + qib_sdma_state_names[ppd->sdma_state.current_state], + errs, ppd->cpspec->sdmamsgbuf); + dump_sdma_7322_state(ppd); + } + switch (ppd->sdma_state.current_state) { case qib_sdma_state_s00_hw_down: break; @@ -2084,6 +2169,29 @@ static void qib_7322_handle_hwerrors(struct qib_devdata *dd, char *msg, qib_dev_err(dd, "%s hardware error\n", msg); + if (hwerrs & + (SYM_MASK(HwErrMask, SDmaMemReadErrMask_0) | + SYM_MASK(HwErrMask, SDmaMemReadErrMask_1))) { + int pidx = 0; + int err; + unsigned long flags; + struct qib_pportdata *ppd = dd->pport; + for (; pidx < dd->num_pports; ++pidx, ppd++) { + err = 0; + if (pidx == 0 && (hwerrs & + SYM_MASK(HwErrMask, SDmaMemReadErrMask_0))) + err++; + if (pidx == 1 && (hwerrs & + SYM_MASK(HwErrMask, SDmaMemReadErrMask_1))) + err++; + if (err) { + spin_lock_irqsave(&ppd->sdma_lock, flags); + dump_sdma_7322_state(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + } + } + if (isfatal && !dd->diag_client) { qib_dev_err(dd, "Fatal Hardware Error, no longer usable, SN %.16s\n", @@ -2287,6 +2395,11 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); qib_write_kreg(dd, kr_scratch, 0ULL); + /* ensure previous Tx parameters are not still forced */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + if (qib_compat_ddr_negotiate) { ppd->cpspec->ibdeltainprog = 1; ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd, @@ -2558,6 +2671,162 @@ static void qib_setup_7322_setextled(struct qib_pportdata *ppd, u32 on) qib_write_kreg_port(ppd, krp_rcvpktledcnt, ledblink); } +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static int qib_7322_notify_dca(struct qib_devdata *dd, unsigned long event) +{ + switch (event) { + case DCA_PROVIDER_ADD: + if (dd->flags & QIB_DCA_ENABLED) + break; + if (!dca_add_requester(&dd->pcidev->dev)) { + qib_devinfo(dd->pcidev, "DCA enabled\n"); + dd->flags |= QIB_DCA_ENABLED; + qib_setup_dca(dd); + } + break; + case DCA_PROVIDER_REMOVE: + if (dd->flags & QIB_DCA_ENABLED) { + dca_remove_requester(&dd->pcidev->dev); + dd->flags &= ~QIB_DCA_ENABLED; + dd->cspec->dca_ctrl = 0; + qib_write_kreg(dd, KREG_IDX(DCACtrlA), + dd->cspec->dca_ctrl); + } + break; + } + return 0; +} + +static void qib_update_rhdrq_dca(struct qib_ctxtdata *rcd, int cpu) +{ + struct qib_devdata *dd = rcd->dd; + struct qib_chip_specific *cspec = dd->cspec; + + if (!(dd->flags & QIB_DCA_ENABLED)) + return; + if (cspec->rhdr_cpu[rcd->ctxt] != cpu) { + const struct dca_reg_map *rmp; + + cspec->rhdr_cpu[rcd->ctxt] = cpu; + rmp = &dca_rcvhdr_reg_map[rcd->ctxt]; + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] &= rmp->mask; + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] |= + (u64) dca3_get_tag(&dd->pcidev->dev, cpu) << rmp->lsb; + qib_devinfo(dd->pcidev, + "Ctxt %d cpu %d dca %llx\n", rcd->ctxt, cpu, + (long long) cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]); + qib_write_kreg(dd, rmp->regno, + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]); + cspec->dca_ctrl |= SYM_MASK(DCACtrlA, RcvHdrqDCAEnable); + qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl); + } +} + +static void qib_update_sdma_dca(struct qib_pportdata *ppd, int cpu) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_chip_specific *cspec = dd->cspec; + unsigned pidx = ppd->port - 1; + + if (!(dd->flags & QIB_DCA_ENABLED)) + return; + if (cspec->sdma_cpu[pidx] != cpu) { + cspec->sdma_cpu[pidx] = cpu; + cspec->dca_rcvhdr_ctrl[4] &= ~(ppd->hw_pidx ? + SYM_MASK(DCACtrlF, SendDma1DCAOPH) : + SYM_MASK(DCACtrlF, SendDma0DCAOPH)); + cspec->dca_rcvhdr_ctrl[4] |= + (u64) dca3_get_tag(&dd->pcidev->dev, cpu) << + (ppd->hw_pidx ? + SYM_LSB(DCACtrlF, SendDma1DCAOPH) : + SYM_LSB(DCACtrlF, SendDma0DCAOPH)); + qib_devinfo(dd->pcidev, + "sdma %d cpu %d dca %llx\n", ppd->hw_pidx, cpu, + (long long) cspec->dca_rcvhdr_ctrl[4]); + qib_write_kreg(dd, KREG_IDX(DCACtrlF), + cspec->dca_rcvhdr_ctrl[4]); + cspec->dca_ctrl |= ppd->hw_pidx ? + SYM_MASK(DCACtrlA, SendDMAHead1DCAEnable) : + SYM_MASK(DCACtrlA, SendDMAHead0DCAEnable); + qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl); + } +} + +static void qib_setup_dca(struct qib_devdata *dd) +{ + struct qib_chip_specific *cspec = dd->cspec; + int i; + + for (i = 0; i < ARRAY_SIZE(cspec->rhdr_cpu); i++) + cspec->rhdr_cpu[i] = -1; + for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++) + cspec->sdma_cpu[i] = -1; + cspec->dca_rcvhdr_ctrl[0] = + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq0DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq1DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq2DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq3DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[1] = + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq4DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq5DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq6DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq7DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[2] = + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq8DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq9DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq10DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq11DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[3] = + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq12DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq13DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq14DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq15DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[4] = + (1ULL << SYM_LSB(DCACtrlF, RcvHdrq16DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlF, RcvHdrq17DCAXfrCnt)); + for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++) + qib_write_kreg(dd, KREG_IDX(DCACtrlB) + i, + cspec->dca_rcvhdr_ctrl[i]); + for (i = 0; i < cspec->num_msix_entries; i++) + setup_dca_notifier(dd, &cspec->msix_entries[i]); +} + +static void qib_irq_notifier_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct qib_irq_notify *n = + container_of(notify, struct qib_irq_notify, notify); + int cpu = cpumask_first(mask); + + if (n->rcv) { + struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg; + qib_update_rhdrq_dca(rcd, cpu); + } else { + struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg; + qib_update_sdma_dca(ppd, cpu); + } +} + +static void qib_irq_notifier_release(struct kref *ref) +{ + struct qib_irq_notify *n = + container_of(ref, struct qib_irq_notify, notify.kref); + struct qib_devdata *dd; + + if (n->rcv) { + struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg; + dd = rcd->dd; + } else { + struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg; + dd = ppd->dd; + } + qib_devinfo(dd->pcidev, + "release on HCA notify 0x%p n 0x%p\n", ref, n); + kfree(n); +} +#endif + /* * Disable MSIx interrupt if enabled, call generic MSIx code * to cleanup, and clear pending MSIx interrupts. @@ -2575,6 +2844,9 @@ static void qib_7322_nomsix(struct qib_devdata *dd) dd->cspec->num_msix_entries = 0; for (i = 0; i < n; i++) { +#ifdef CONFIG_INFINIBAND_QIB_DCA + reset_dca_notifier(dd, &dd->cspec->msix_entries[i]); +#endif irq_set_affinity_hint( dd->cspec->msix_entries[i].msix.vector, NULL); free_cpumask_var(dd->cspec->msix_entries[i].mask); @@ -2602,6 +2874,15 @@ static void qib_setup_7322_cleanup(struct qib_devdata *dd) { int i; +#ifdef CONFIG_INFINIBAND_QIB_DCA + if (dd->flags & QIB_DCA_ENABLED) { + dca_remove_requester(&dd->pcidev->dev); + dd->flags &= ~QIB_DCA_ENABLED; + dd->cspec->dca_ctrl = 0; + qib_write_kreg(dd, KREG_IDX(DCACtrlA), dd->cspec->dca_ctrl); + } +#endif + qib_7322_free_irq(dd); kfree(dd->cspec->cntrs); kfree(dd->cspec->sendchkenable); @@ -2834,9 +3115,7 @@ static irqreturn_t qib_7322intr(int irq, void *data) goto bail; } - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* handle "errors" of various kinds first, device ahead of port */ if (unlikely(istat & (~QIB_I_BITSEXTANT | QIB_I_GPIO | @@ -2905,9 +3184,7 @@ static irqreturn_t qib_7322pintr(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, ((1ULL << QIB_I_RCVAVAIL_LSB) | @@ -2934,9 +3211,7 @@ static irqreturn_t qib_7322bufavail(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, QIB_I_SPIOBUFAVAIL); @@ -2967,9 +3242,7 @@ static irqreturn_t sdma_intr(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? @@ -2996,9 +3269,7 @@ static irqreturn_t sdma_idle_intr(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? @@ -3025,9 +3296,7 @@ static irqreturn_t sdma_progress_intr(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? @@ -3055,9 +3324,7 @@ static irqreturn_t sdma_cleanup_intr(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? @@ -3068,6 +3335,53 @@ static irqreturn_t sdma_cleanup_intr(int irq, void *data) return IRQ_HANDLED; } +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static void reset_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m) +{ + if (!m->dca) + return; + qib_devinfo(dd->pcidev, + "Disabling notifier on HCA %d irq %d\n", + dd->unit, + m->msix.vector); + irq_set_affinity_notifier( + m->msix.vector, + NULL); + m->notifier = NULL; +} + +static void setup_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m) +{ + struct qib_irq_notify *n; + + if (!m->dca) + return; + n = kzalloc(sizeof(*n), GFP_KERNEL); + if (n) { + int ret; + + m->notifier = n; + n->notify.irq = m->msix.vector; + n->notify.notify = qib_irq_notifier_notify; + n->notify.release = qib_irq_notifier_release; + n->arg = m->arg; + n->rcv = m->rcv; + qib_devinfo(dd->pcidev, + "set notifier irq %d rcv %d notify %p\n", + n->notify.irq, n->rcv, &n->notify); + ret = irq_set_affinity_notifier( + n->notify.irq, + &n->notify); + if (ret) { + m->notifier = NULL; + kfree(n); + } + } +} + +#endif + /* * Set up our chip-specific interrupt handler. * The interrupt type has already been setup, so @@ -3149,6 +3463,9 @@ try_intx: void *arg; u64 val; int lsb, reg, sh; +#ifdef CONFIG_INFINIBAND_QIB_DCA + int dca = 0; +#endif dd->cspec->msix_entries[msixnum]. name[sizeof(dd->cspec->msix_entries[msixnum].name) - 1] @@ -3161,6 +3478,9 @@ try_intx: arg = dd->pport + irq_table[i].port - 1; } else arg = dd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca = irq_table[i].dca; +#endif lsb = irq_table[i].lsb; handler = irq_table[i].handler; snprintf(dd->cspec->msix_entries[msixnum].name, @@ -3178,6 +3498,9 @@ try_intx: continue; if (qib_krcvq01_no_msi && ctxt < 2) continue; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca = 1; +#endif lsb = QIB_I_RCVAVAIL_LSB + ctxt; handler = qib_7322pintr; snprintf(dd->cspec->msix_entries[msixnum].name, @@ -3203,6 +3526,11 @@ try_intx: goto try_intx; } dd->cspec->msix_entries[msixnum].arg = arg; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->cspec->msix_entries[msixnum].dca = dca; + dd->cspec->msix_entries[msixnum].rcv = + handler == qib_7322pintr; +#endif if (lsb >= 0) { reg = lsb / IBA7322_REDIRECT_VEC_PER_REG; sh = (lsb % IBA7322_REDIRECT_VEC_PER_REG) * @@ -3381,7 +3709,8 @@ static int qib_do_7322_reset(struct qib_devdata *dd) dd->pport->cpspec->ibsymdelta = 0; dd->pport->cpspec->iblnkerrdelta = 0; dd->pport->cpspec->ibmalfdelta = 0; - dd->int_counter = 0; /* so we check interrupts work again */ + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); /* * Keep chip from being accessed until we are ready. Use @@ -5853,21 +6182,20 @@ static int setup_txselect(const char *str, struct kernel_param *kp) { struct qib_devdata *dd; unsigned long val; - int ret; - + char *n; if (strlen(str) >= MAX_ATTEN_LEN) { pr_info("txselect_values string too long\n"); return -ENOSPC; } - ret = kstrtoul(str, 0, &val); - if (ret || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + val = simple_strtoul(str, &n, 0); + if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ)) { pr_info("txselect_values must start with a number < %d\n", TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ); - return ret ? ret : -EINVAL; + return -EINVAL; } - strcpy(txselect_list, str); + list_for_each_entry(dd, &qib_dev_list, list) if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322) set_no_qsfp_atten(dd, 1); @@ -6216,7 +6544,11 @@ static int qib_init_7322_variables(struct qib_devdata *dd) } dd->num_pports++; - qib_init_pportdata(ppd, dd, pidx, dd->num_pports); + ret = qib_init_pportdata(ppd, dd, pidx, dd->num_pports); + if (ret) { + dd->num_pports--; + goto bail; + } ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; ppd->link_width_enabled = IB_WIDTH_4X; @@ -6452,6 +6784,86 @@ static void qib_sdma_set_7322_desc_cnt(struct qib_pportdata *ppd, unsigned cnt) qib_write_kreg_port(ppd, krp_senddmadesccnt, cnt); } +/* + * sdma_lock should be acquired before calling this routine + */ +static void dump_sdma_7322_state(struct qib_pportdata *ppd) +{ + u64 reg, reg1, reg2; + + reg = qib_read_kreg_port(ppd, krp_senddmastatus); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmastatus: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_sendctrl); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sendctrl: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmabase); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmabase: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmabufmask0); + reg1 = qib_read_kreg_port(ppd, krp_senddmabufmask1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabufmask2); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmabufmask 0:%llx 1:%llx 2:%llx\n", + reg, reg1, reg2); + + /* get bufuse bits, clear them, and print them again if non-zero */ + reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg); + reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg2); + /* 0 and 1 should always be zero, so print as short form */ + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA current senddmabuf_use 0:%llx 1:%llx 2:%llx\n", + reg, reg1, reg2); + reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0); + reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2); + /* 0 and 1 should always be zero, so print as short form */ + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA cleared senddmabuf_use 0:%llx 1:%llx 2:%llx\n", + reg, reg1, reg2); + + reg = qib_read_kreg_port(ppd, krp_senddmatail); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmatail: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmahead); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmahead: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaheadaddr); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmaheadaddr: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmalengen); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmalengen: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmadesccnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmadesccnt: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaidlecnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmaidlecnt: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaprioritythld); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmapriorityhld: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmareloadcnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmareloadcnt: 0x%016llx\n", reg); + + dump_sdma_state(ppd); +} + static struct sdma_set_state_action sdma_7322_action_table[] = { [qib_sdma_state_s00_hw_down] = { .go_s99_running_tofalse = 1, @@ -6885,6 +7297,9 @@ struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *pdev, dd->f_sdma_init_early = qib_7322_sdma_init_early; dd->f_writescratch = writescratch; dd->f_tempsense_rd = qib_7322_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_7322_notify_dca; +#endif /* * Do remaining PCIe setup and save PCIe values in dd. * Any error printing is already done by the init code. @@ -6921,7 +7336,7 @@ struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *pdev, actual_cnt -= dd->num_pports; tabsize = actual_cnt; - dd->cspec->msix_entries = kmalloc(tabsize * + dd->cspec->msix_entries = kzalloc(tabsize * sizeof(struct qib_msix_entry), GFP_KERNEL); if (!dd->cspec->msix_entries) { qib_dev_err(dd, "No memory for MSIx table\n"); @@ -6941,7 +7356,13 @@ struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *pdev, /* clear diagctrl register, in case diags were running and crashed */ qib_write_kreg(dd, kr_hwdiagctrl, 0); - +#ifdef CONFIG_INFINIBAND_QIB_DCA + if (!dca_add_requester(&pdev->dev)) { + qib_devinfo(dd->pcidev, "DCA enabled\n"); + dd->flags |= QIB_DCA_ENABLED; + qib_setup_dca(dd); + } +#endif goto bail; bail_cleanup: @@ -7156,15 +7577,20 @@ static const struct txdds_ent txdds_extra_sdr[TXDDS_EXTRA_SZ] = { { 0, 0, 0, 1 }, /* QMH7342 backplane settings */ { 0, 0, 0, 2 }, /* QMH7342 backplane settings */ { 0, 0, 0, 2 }, /* QMH7342 backplane settings */ - { 0, 0, 0, 11 }, /* QME7342 backplane settings */ - { 0, 0, 0, 11 }, /* QME7342 backplane settings */ - { 0, 0, 0, 11 }, /* QME7342 backplane settings */ - { 0, 0, 0, 11 }, /* QME7342 backplane settings */ - { 0, 0, 0, 11 }, /* QME7342 backplane settings */ - { 0, 0, 0, 11 }, /* QME7342 backplane settings */ - { 0, 0, 0, 11 }, /* QME7342 backplane settings */ { 0, 0, 0, 3 }, /* QMH7342 backplane settings */ { 0, 0, 0, 4 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ }; static const struct txdds_ent txdds_extra_ddr[TXDDS_EXTRA_SZ] = { @@ -7173,15 +7599,20 @@ static const struct txdds_ent txdds_extra_ddr[TXDDS_EXTRA_SZ] = { { 0, 0, 0, 7 }, /* QMH7342 backplane settings */ { 0, 0, 0, 8 }, /* QMH7342 backplane settings */ { 0, 0, 0, 8 }, /* QMH7342 backplane settings */ - { 0, 0, 0, 13 }, /* QME7342 backplane settings */ - { 0, 0, 0, 13 }, /* QME7342 backplane settings */ - { 0, 0, 0, 13 }, /* QME7342 backplane settings */ - { 0, 0, 0, 13 }, /* QME7342 backplane settings */ - { 0, 0, 0, 13 }, /* QME7342 backplane settings */ - { 0, 0, 0, 13 }, /* QME7342 backplane settings */ - { 0, 0, 0, 13 }, /* QME7342 backplane settings */ { 0, 0, 0, 9 }, /* QMH7342 backplane settings */ { 0, 0, 0, 10 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ }; static const struct txdds_ent txdds_extra_qdr[TXDDS_EXTRA_SZ] = { @@ -7190,15 +7621,20 @@ static const struct txdds_ent txdds_extra_qdr[TXDDS_EXTRA_SZ] = { { 0, 1, 0, 5 }, /* QMH7342 backplane settings */ { 0, 1, 0, 6 }, /* QMH7342 backplane settings */ { 0, 1, 0, 8 }, /* QMH7342 backplane settings */ - { 0, 1, 12, 10 }, /* QME7342 backplane setting */ - { 0, 1, 12, 11 }, /* QME7342 backplane setting */ - { 0, 1, 12, 12 }, /* QME7342 backplane setting */ - { 0, 1, 12, 14 }, /* QME7342 backplane setting */ - { 0, 1, 12, 6 }, /* QME7342 backplane setting */ - { 0, 1, 12, 7 }, /* QME7342 backplane setting */ - { 0, 1, 12, 8 }, /* QME7342 backplane setting */ { 0, 1, 0, 10 }, /* QMH7342 backplane settings */ { 0, 1, 0, 12 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ }; static const struct txdds_ent txdds_extra_mfg[TXDDS_MFG_SZ] = { diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 4443adfcd9e..8d3c78ddc90 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * @@ -39,10 +39,17 @@ #include <linux/idr.h> #include <linux/module.h> #include <linux/printk.h> +#ifdef CONFIG_INFINIBAND_QIB_DCA +#include <linux/dca.h> +#endif #include "qib.h" #include "qib_common.h" #include "qib_mad.h" +#ifdef CONFIG_DEBUG_FS +#include "qib_debugfs.h" +#include "qib_verbs.h" +#endif #undef pr_fmt #define pr_fmt(fmt) QIB_DRV_NAME ": " fmt @@ -64,6 +71,11 @@ ushort qib_cfgctxts; module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO); MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use"); +unsigned qib_numa_aware; +module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO); +MODULE_PARM_DESC(numa_aware, + "0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process"); + /* * If set, do not write to any regs if avoidable, hack to allow * check for deranged default register values. @@ -89,8 +101,6 @@ unsigned qib_wc_pat = 1; /* default (1) is to use PAT, not MTRR */ module_param_named(wc_pat, qib_wc_pat, uint, S_IRUGO); MODULE_PARM_DESC(wc_pat, "enable write-combining via PAT mechanism"); -struct workqueue_struct *qib_cq_wq; - static void verify_interrupt(unsigned long); static struct idr qib_unit_table; @@ -120,7 +130,11 @@ void qib_set_ctxtcnt(struct qib_devdata *dd) int qib_create_ctxts(struct qib_devdata *dd) { unsigned i; - int ret; + int local_node_id = pcibus_to_node(dd->pcidev->bus); + + if (local_node_id < 0) + local_node_id = numa_node_id(); + dd->assigned_node_id = local_node_id; /* * Allocate full ctxtcnt array, rather than just cfgctxts, because @@ -130,8 +144,7 @@ int qib_create_ctxts(struct qib_devdata *dd) if (!dd->rcd) { qib_dev_err(dd, "Unable to allocate ctxtdata array, failing\n"); - ret = -ENOMEM; - goto done; + return -ENOMEM; } /* create (one or more) kctxt */ @@ -143,38 +156,51 @@ int qib_create_ctxts(struct qib_devdata *dd) continue; ppd = dd->pport + (i % dd->num_pports); - rcd = qib_create_ctxtdata(ppd, i); + + rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id); if (!rcd) { qib_dev_err(dd, "Unable to allocate ctxtdata for Kernel ctxt, failing\n"); - ret = -ENOMEM; - goto done; + kfree(dd->rcd); + dd->rcd = NULL; + return -ENOMEM; } rcd->pkeys[0] = QIB_DEFAULT_P_KEY; rcd->seq_cnt = 1; } - ret = 0; -done: - return ret; + return 0; } /* * Common code for user and kernel context setup. */ -struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt) +struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt, + int node_id) { struct qib_devdata *dd = ppd->dd; struct qib_ctxtdata *rcd; - rcd = kzalloc(sizeof(*rcd), GFP_KERNEL); + rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id); if (rcd) { INIT_LIST_HEAD(&rcd->qp_wait_list); + rcd->node_id = node_id; rcd->ppd = ppd; rcd->dd = dd; rcd->cnt = 1; rcd->ctxt = ctxt; dd->rcd[ctxt] = rcd; - +#ifdef CONFIG_DEBUG_FS + if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */ + rcd->opstats = kzalloc_node(sizeof(*rcd->opstats), + GFP_KERNEL, node_id); + if (!rcd->opstats) { + kfree(rcd); + qib_dev_err(dd, + "Unable to allocate per ctxt stats buffer\n"); + return NULL; + } + } +#endif dd->f_init_ctxt(rcd); /* @@ -204,7 +230,7 @@ struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt) /* * Common code for initializing the physical port structure. */ -void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, +int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, u8 hw_pidx, u8 port) { int size; @@ -214,6 +240,7 @@ void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, spin_lock_init(&ppd->sdma_lock); spin_lock_init(&ppd->lflags_lock); + spin_lock_init(&ppd->cc_shadow_lock); init_waitqueue_head(&ppd->state_wait); init_timer(&ppd->symerr_clear_timer); @@ -221,8 +248,10 @@ void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, ppd->symerr_clear_timer.data = (unsigned long)ppd; ppd->qib_wq = NULL; - - spin_lock_init(&ppd->cc_shadow_lock); + ppd->ibport_data.pmastats = + alloc_percpu(struct qib_pma_counters); + if (!ppd->ibport_data.pmastats) + return -ENOMEM; if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) goto bail; @@ -270,7 +299,7 @@ void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, goto bail_3; } - return; + return 0; bail_3: kfree(ppd->ccti_entries_shadow); @@ -284,7 +313,7 @@ bail_1: bail: /* User is intentionally disabling the congestion control agent */ if (!qib_cc_table_size) - return; + return 0; if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) { qib_cc_table_size = 0; @@ -295,7 +324,7 @@ bail: qib_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port); - return; + return 0; } static int init_pioavailregs(struct qib_devdata *dd) @@ -429,6 +458,7 @@ static int loadtime_init(struct qib_devdata *dd) dd->intrchk_timer.function = verify_interrupt; dd->intrchk_timer.data = (unsigned long) dd; + ret = qib_cq_init(dd); done: return ret; } @@ -495,6 +525,7 @@ static void enable_chip(struct qib_devdata *dd) static void verify_interrupt(unsigned long opaque) { struct qib_devdata *dd = (struct qib_devdata *) opaque; + u64 int_counter; if (!dd) return; /* being torn down */ @@ -503,7 +534,8 @@ static void verify_interrupt(unsigned long opaque) * If we don't have a lid or any interrupts, let the user know and * don't bother checking again. */ - if (dd->int_counter == 0) { + int_counter = qib_int_counter(dd) - dd->z_int_counter; + if (int_counter == 0) { if (!dd->f_intr_fallback(dd)) dev_err(&dd->pcidev->dev, "No interrupts detected, not usable.\n"); @@ -603,6 +635,12 @@ wq_error: return -ENOMEM; } +static void qib_free_pportdata(struct qib_pportdata *ppd) +{ + free_percpu(ppd->ibport_data.pmastats); + ppd->ibport_data.pmastats = NULL; +} + /** * qib_init - do the actual initialization sequence on the chip * @dd: the qlogic_ib device @@ -890,6 +928,7 @@ static void qib_shutdown_device(struct qib_devdata *dd) destroy_workqueue(ppd->qib_wq); ppd->qib_wq = NULL; } + qib_free_pportdata(ppd); } qib_update_eeprom_log(dd); @@ -944,6 +983,10 @@ void qib_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd) vfree(rcd->subctxt_uregbase); vfree(rcd->subctxt_rcvegrbuf); vfree(rcd->subctxt_rcvhdr_base); +#ifdef CONFIG_DEBUG_FS + kfree(rcd->opstats); + rcd->opstats = NULL; +#endif kfree(rcd); } @@ -1033,7 +1076,6 @@ done: dd->f_set_armlaunch(dd, 1); } - void qib_free_devdata(struct qib_devdata *dd) { unsigned long flags; @@ -1043,9 +1085,37 @@ void qib_free_devdata(struct qib_devdata *dd) list_del(&dd->list); spin_unlock_irqrestore(&qib_devs_lock, flags); +#ifdef CONFIG_DEBUG_FS + qib_dbg_ibdev_exit(&dd->verbs_dev); +#endif + free_percpu(dd->int_counter); ib_dealloc_device(&dd->verbs_dev.ibdev); } +u64 qib_int_counter(struct qib_devdata *dd) +{ + int cpu; + u64 int_counter = 0; + + for_each_possible_cpu(cpu) + int_counter += *per_cpu_ptr(dd->int_counter, cpu); + return int_counter; +} + +u64 qib_sps_ints(void) +{ + unsigned long flags; + struct qib_devdata *dd; + u64 sps_ints = 0; + + spin_lock_irqsave(&qib_devs_lock, flags); + list_for_each_entry(dd, &qib_dev_list, list) { + sps_ints += qib_int_counter(dd); + } + spin_unlock_irqrestore(&qib_devs_lock, flags); + return sps_ints; +} + /* * Allocate our primary per-unit data structure. Must be done via verbs * allocator, because the verbs cleanup process both does cleanup and @@ -1060,28 +1130,34 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) struct qib_devdata *dd; int ret; - if (!idr_pre_get(&qib_unit_table, GFP_KERNEL)) { - dd = ERR_PTR(-ENOMEM); - goto bail; - } - dd = (struct qib_devdata *) ib_alloc_device(sizeof(*dd) + extra); - if (!dd) { - dd = ERR_PTR(-ENOMEM); - goto bail; - } + if (!dd) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&dd->list); + + idr_preload(GFP_KERNEL); spin_lock_irqsave(&qib_devs_lock, flags); - ret = idr_get_new(&qib_unit_table, dd, &dd->unit); - if (ret >= 0) + + ret = idr_alloc(&qib_unit_table, dd, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + dd->unit = ret; list_add(&dd->list, &qib_dev_list); + } + spin_unlock_irqrestore(&qib_devs_lock, flags); + idr_preload_end(); if (ret < 0) { qib_early_err(&pdev->dev, "Could not allocate unit ID: error %d\n", -ret); - ib_dealloc_device(&dd->verbs_dev.ibdev); - dd = ERR_PTR(ret); + goto bail; + } + dd->int_counter = alloc_percpu(u64); + if (!dd->int_counter) { + ret = -ENOMEM; + qib_early_err(&pdev->dev, + "Could not allocate per-cpu int_counter\n"); goto bail; } @@ -1095,9 +1171,15 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) qib_early_err(&pdev->dev, "Could not alloc cpulist info, cpu affinity might be wrong\n"); } - -bail: +#ifdef CONFIG_DEBUG_FS + qib_dbg_ibdev_init(&dd->verbs_dev); +#endif return dd; +bail: + if (!list_empty(&dd->list)) + list_del_init(&dd->list); + ib_dealloc_device(&dd->verbs_dev.ibdev); + return ERR_PTR(ret);; } /* @@ -1134,11 +1216,10 @@ void qib_disable_after_error(struct qib_devdata *dd) *dd->devstatusp |= QIB_STATUS_HWERROR; } -static void __devexit qib_remove_one(struct pci_dev *); -static int __devinit qib_init_one(struct pci_dev *, - const struct pci_device_id *); +static void qib_remove_one(struct pci_dev *); +static int qib_init_one(struct pci_dev *, const struct pci_device_id *); -#define DRIVER_LOAD_MSG "QLogic " QIB_DRV_NAME " loaded: " +#define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: " #define PFX QIB_DRV_NAME ": " static DEFINE_PCI_DEVICE_TABLE(qib_pci_tbl) = { @@ -1150,19 +1231,48 @@ static DEFINE_PCI_DEVICE_TABLE(qib_pci_tbl) = { MODULE_DEVICE_TABLE(pci, qib_pci_tbl); -struct pci_driver qib_driver = { +static struct pci_driver qib_driver = { .name = QIB_DRV_NAME, .probe = qib_init_one, - .remove = __devexit_p(qib_remove_one), + .remove = qib_remove_one, .id_table = qib_pci_tbl, .err_handler = &qib_pci_err_handler, }; +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static int qib_notify_dca(struct notifier_block *, unsigned long, void *); +static struct notifier_block dca_notifier = { + .notifier_call = qib_notify_dca, + .next = NULL, + .priority = 0 +}; + +static int qib_notify_dca_device(struct device *device, void *data) +{ + struct qib_devdata *dd = dev_get_drvdata(device); + unsigned long event = *(unsigned long *)data; + + return dd->f_notify_dca(dd, event); +} + +static int qib_notify_dca(struct notifier_block *nb, unsigned long event, + void *p) +{ + int rval; + + rval = driver_for_each_device(&qib_driver.driver, NULL, + &event, qib_notify_dca_device); + return rval ? NOTIFY_BAD : NOTIFY_DONE; +} + +#endif + /* * Do all the generic driver unit- and chip-independent memory * allocation and initialization. */ -static int __init qlogic_ib_init(void) +static int __init qib_ib_init(void) { int ret; @@ -1170,27 +1280,22 @@ static int __init qlogic_ib_init(void) if (ret) goto bail; - qib_cq_wq = create_singlethread_workqueue("qib_cq"); - if (!qib_cq_wq) { - ret = -ENOMEM; - goto bail_dev; - } - /* * These must be called before the driver is registered with * the PCI subsystem. */ idr_init(&qib_unit_table); - if (!idr_pre_get(&qib_unit_table, GFP_KERNEL)) { - pr_err("idr_pre_get() failed\n"); - ret = -ENOMEM; - goto bail_cq_wq; - } +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_register_notify(&dca_notifier); +#endif +#ifdef CONFIG_DEBUG_FS + qib_dbg_init(); +#endif ret = pci_register_driver(&qib_driver); if (ret < 0) { pr_err("Unable to register driver: error %d\n", -ret); - goto bail_unit; + goto bail_dev; } /* not fatal if it doesn't work */ @@ -1198,22 +1303,25 @@ static int __init qlogic_ib_init(void) pr_err("Unable to register ipathfs\n"); goto bail; /* all OK */ -bail_unit: - idr_destroy(&qib_unit_table); -bail_cq_wq: - destroy_workqueue(qib_cq_wq); bail_dev: +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_unregister_notify(&dca_notifier); +#endif +#ifdef CONFIG_DEBUG_FS + qib_dbg_exit(); +#endif + idr_destroy(&qib_unit_table); qib_dev_cleanup(); bail: return ret; } -module_init(qlogic_ib_init); +module_init(qib_ib_init); /* * Do the non-unit driver cleanup, memory free, etc. at unload. */ -static void __exit qlogic_ib_cleanup(void) +static void __exit qib_ib_cleanup(void) { int ret; @@ -1223,9 +1331,13 @@ static void __exit qlogic_ib_cleanup(void) "Unable to cleanup counter filesystem: error %d\n", -ret); +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_unregister_notify(&dca_notifier); +#endif pci_unregister_driver(&qib_driver); - - destroy_workqueue(qib_cq_wq); +#ifdef CONFIG_DEBUG_FS + qib_dbg_exit(); +#endif qib_cpulist_count = 0; kfree(qib_cpulist); @@ -1234,7 +1346,7 @@ static void __exit qlogic_ib_cleanup(void) qib_dev_cleanup(); } -module_exit(qlogic_ib_cleanup); +module_exit(qib_ib_cleanup); /* this can only be called after a successful initialization */ static void cleanup_device_data(struct qib_devdata *dd) @@ -1276,7 +1388,7 @@ static void cleanup_device_data(struct qib_devdata *dd) if (dd->pageshadow) { struct page **tmpp = dd->pageshadow; dma_addr_t *tmpd = dd->physshadow; - int i, cnt = 0; + int i; for (ctxt = 0; ctxt < dd->cfgctxts; ctxt++) { int ctxt_tidbase = ctxt * dd->rcvtidcnt; @@ -1289,13 +1401,13 @@ static void cleanup_device_data(struct qib_devdata *dd) PAGE_SIZE, PCI_DMA_FROMDEVICE); qib_release_user_pages(&tmpp[i], 1); tmpp[i] = NULL; - cnt++; } } - tmpp = dd->pageshadow; dd->pageshadow = NULL; vfree(tmpp); + dd->physshadow = NULL; + vfree(tmpd); } /* @@ -1317,6 +1429,7 @@ static void cleanup_device_data(struct qib_devdata *dd) } kfree(tmp); kfree(dd->boardname); + qib_cq_exit(dd); } /* @@ -1342,8 +1455,7 @@ static void qib_postinit_cleanup(struct qib_devdata *dd) qib_free_devdata(dd); } -static int __devinit qib_init_one(struct pci_dev *pdev, - const struct pci_device_id *ent) +static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { int ret, j, pidx, initfail; struct qib_devdata *dd = NULL; @@ -1362,7 +1474,7 @@ static int __devinit qib_init_one(struct pci_dev *pdev, dd = qib_init_iba6120_funcs(pdev, ent); #else qib_early_err(&pdev->dev, - "QLogic PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n", + "Intel PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n", ent->device); dd = ERR_PTR(-ENODEV); #endif @@ -1378,7 +1490,7 @@ static int __devinit qib_init_one(struct pci_dev *pdev, default: qib_early_err(&pdev->dev, - "Failing on unknown QLogic deviceid 0x%x\n", + "Failing on unknown Intel deviceid 0x%x\n", ent->device); ret = -ENODEV; } @@ -1448,7 +1560,7 @@ bail: return ret; } -static void __devexit qib_remove_one(struct pci_dev *pdev) +static void qib_remove_one(struct pci_dev *pdev) { struct qib_devdata *dd = pci_get_drvdata(pdev); int ret; @@ -1490,6 +1602,7 @@ static void __devexit qib_remove_one(struct pci_dev *pdev) int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd) { unsigned amt; + int old_node_id; if (!rcd->rcvhdrq) { dma_addr_t phys_hdrqtail; @@ -1499,9 +1612,13 @@ int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd) sizeof(u32), PAGE_SIZE); gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ? GFP_USER : GFP_KERNEL; + + old_node_id = dev_to_node(&dd->pcidev->dev); + set_dev_node(&dd->pcidev->dev, rcd->node_id); rcd->rcvhdrq = dma_alloc_coherent( &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys, gfp_flags | __GFP_COMP); + set_dev_node(&dd->pcidev->dev, old_node_id); if (!rcd->rcvhdrq) { qib_dev_err(dd, @@ -1517,9 +1634,11 @@ int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd) } if (!(dd->flags & QIB_NODMA_RTAIL)) { + set_dev_node(&dd->pcidev->dev, rcd->node_id); rcd->rcvhdrtail_kvaddr = dma_alloc_coherent( &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, gfp_flags); + set_dev_node(&dd->pcidev->dev, old_node_id); if (!rcd->rcvhdrtail_kvaddr) goto bail_free; rcd->rcvhdrqtailaddr_phys = phys_hdrqtail; @@ -1563,6 +1682,7 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; size_t size; gfp_t gfp_flags; + int old_node_id; /* * GFP_USER, but without GFP_FS, so buffer cache can be @@ -1581,25 +1701,29 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) size = rcd->rcvegrbuf_size; if (!rcd->rcvegrbuf) { rcd->rcvegrbuf = - kzalloc(chunk * sizeof(rcd->rcvegrbuf[0]), - GFP_KERNEL); + kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]), + GFP_KERNEL, rcd->node_id); if (!rcd->rcvegrbuf) goto bail; } if (!rcd->rcvegrbuf_phys) { rcd->rcvegrbuf_phys = - kmalloc(chunk * sizeof(rcd->rcvegrbuf_phys[0]), - GFP_KERNEL); + kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]), + GFP_KERNEL, rcd->node_id); if (!rcd->rcvegrbuf_phys) goto bail_rcvegrbuf; } for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { if (rcd->rcvegrbuf[e]) continue; + + old_node_id = dev_to_node(&dd->pcidev->dev); + set_dev_node(&dd->pcidev->dev, rcd->node_id); rcd->rcvegrbuf[e] = dma_alloc_coherent(&dd->pcidev->dev, size, &rcd->rcvegrbuf_phys[e], gfp_flags); + set_dev_node(&dd->pcidev->dev, old_node_id); if (!rcd->rcvegrbuf[e]) goto bail_rcvegrbuf_phys; } diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c index 81c7b73695d..3b9afccaaad 100644 --- a/drivers/infiniband/hw/qib/qib_keys.c +++ b/drivers/infiniband/hw/qib/qib_keys.c @@ -61,7 +61,7 @@ int qib_alloc_lkey(struct qib_mregion *mr, int dma_region) if (dma_region) { struct qib_mregion *tmr; - tmr = rcu_dereference(dev->dma_mr); + tmr = rcu_access_pointer(dev->dma_mr); if (!tmr) { qib_get_mr(mr); rcu_assign_pointer(dev->dma_mr, mr); diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index ccb119143d2..22c720e5740 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -1028,7 +1028,7 @@ static int set_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys) event.event = IB_EVENT_PKEY_CHANGE; event.device = &dd->verbs_dev.ibdev; - event.element.port_num = 1; + event.element.port_num = port; ib_dispatch_event(&event); } return 0; @@ -1634,6 +1634,23 @@ static int pma_get_portcounters_cong(struct ib_pma_mad *pmp, return reply((struct ib_smp *)pmp); } +static void qib_snapshot_pmacounters( + struct qib_ibport *ibp, + struct qib_pma_counters *pmacounters) +{ + struct qib_pma_counters *p; + int cpu; + + memset(pmacounters, 0, sizeof(*pmacounters)); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(ibp->pmastats, cpu); + pmacounters->n_unicast_xmit += p->n_unicast_xmit; + pmacounters->n_unicast_rcv += p->n_unicast_rcv; + pmacounters->n_multicast_xmit += p->n_multicast_xmit; + pmacounters->n_multicast_rcv += p->n_multicast_rcv; + } +} + static int pma_get_portcounters_ext(struct ib_pma_mad *pmp, struct ib_device *ibdev, u8 port) { @@ -1642,6 +1659,7 @@ static int pma_get_portcounters_ext(struct ib_pma_mad *pmp, struct qib_ibport *ibp = to_iport(ibdev, port); struct qib_pportdata *ppd = ppd_from_ibp(ibp); u64 swords, rwords, spkts, rpkts, xwait; + struct qib_pma_counters pma; u8 port_select = p->port_select; memset(pmp->data, 0, sizeof(pmp->data)); @@ -1664,10 +1682,17 @@ static int pma_get_portcounters_ext(struct ib_pma_mad *pmp, p->port_rcv_data = cpu_to_be64(rwords); p->port_xmit_packets = cpu_to_be64(spkts); p->port_rcv_packets = cpu_to_be64(rpkts); - p->port_unicast_xmit_packets = cpu_to_be64(ibp->n_unicast_xmit); - p->port_unicast_rcv_packets = cpu_to_be64(ibp->n_unicast_rcv); - p->port_multicast_xmit_packets = cpu_to_be64(ibp->n_multicast_xmit); - p->port_multicast_rcv_packets = cpu_to_be64(ibp->n_multicast_rcv); + + qib_snapshot_pmacounters(ibp, &pma); + + p->port_unicast_xmit_packets = cpu_to_be64(pma.n_unicast_xmit + - ibp->z_unicast_xmit); + p->port_unicast_rcv_packets = cpu_to_be64(pma.n_unicast_rcv + - ibp->z_unicast_rcv); + p->port_multicast_xmit_packets = cpu_to_be64(pma.n_multicast_xmit + - ibp->z_multicast_xmit); + p->port_multicast_rcv_packets = cpu_to_be64(pma.n_multicast_rcv + - ibp->z_multicast_rcv); bail: return reply((struct ib_smp *) pmp); @@ -1795,6 +1820,7 @@ static int pma_set_portcounters_ext(struct ib_pma_mad *pmp, struct qib_ibport *ibp = to_iport(ibdev, port); struct qib_pportdata *ppd = ppd_from_ibp(ibp); u64 swords, rwords, spkts, rpkts, xwait; + struct qib_pma_counters pma; qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait); @@ -1810,17 +1836,19 @@ static int pma_set_portcounters_ext(struct ib_pma_mad *pmp, if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS) ibp->z_port_rcv_packets = rpkts; + qib_snapshot_pmacounters(ibp, &pma); + if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS) - ibp->n_unicast_xmit = 0; + ibp->z_unicast_xmit = pma.n_unicast_xmit; if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS) - ibp->n_unicast_rcv = 0; + ibp->z_unicast_rcv = pma.n_unicast_rcv; if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS) - ibp->n_multicast_xmit = 0; + ibp->z_multicast_xmit = pma.n_multicast_xmit; if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS) - ibp->n_multicast_rcv = 0; + ibp->z_multicast_rcv = pma.n_multicast_rcv; return pma_get_portcounters_ext(pmp, ibdev, port); } diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h index 57bd3fa016b..941d4d50d8e 100644 --- a/drivers/infiniband/hw/qib/qib_mad.h +++ b/drivers/infiniband/hw/qib/qib_mad.h @@ -54,7 +54,7 @@ struct ib_node_info { __be32 revision; u8 local_port_num; u8 vendor_id[3]; -} __attribute__ ((packed)); +} __packed; struct ib_mad_notice_attr { u8 generic_type; @@ -73,7 +73,7 @@ struct ib_mad_notice_attr { __be16 reserved; __be16 lid; /* where violation happened */ u8 port_num; /* where violation happened */ - } __attribute__ ((packed)) ntc_129_131; + } __packed ntc_129_131; struct { __be16 reserved; @@ -83,14 +83,14 @@ struct ib_mad_notice_attr { __be32 new_cap_mask; /* new capability mask */ u8 reserved3; u8 change_flags; /* low 3 bits only */ - } __attribute__ ((packed)) ntc_144; + } __packed ntc_144; struct { __be16 reserved; __be16 lid; /* lid where sys guid changed */ __be16 reserved2; __be64 new_sys_guid; - } __attribute__ ((packed)) ntc_145; + } __packed ntc_145; struct { __be16 reserved; @@ -104,7 +104,7 @@ struct ib_mad_notice_attr { u8 reserved3; u8 dr_trunc_hop; u8 dr_rtn_path[30]; - } __attribute__ ((packed)) ntc_256; + } __packed ntc_256; struct { __be16 reserved; @@ -115,7 +115,7 @@ struct ib_mad_notice_attr { __be32 qp2; /* high 8 bits reserved */ union ib_gid gid1; union ib_gid gid2; - } __attribute__ ((packed)) ntc_257_258; + } __packed ntc_257_258; } details; }; @@ -209,7 +209,7 @@ struct ib_pma_portcounters_cong { __be64 port_rcv_packets; __be64 port_xmit_wait; __be64 port_adr_events; -} __attribute__ ((packed)); +} __packed; #define IB_PMA_CONG_HW_CONTROL_TIMER 0x00 #define IB_PMA_CONG_HW_CONTROL_SAMPLE 0x01 @@ -415,7 +415,6 @@ struct cc_table_shadow { struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX]; } __packed; -#endif /* _QIB_MAD_H */ /* * The PortSamplesControl.CounterMasks field is an array of 3 bit fields * which specify the N'th counter's capabilities. See ch. 16.1.3.2. @@ -428,3 +427,5 @@ struct cc_table_shadow { COUNTER_MASK(1, 2) | \ COUNTER_MASK(1, 3) | \ COUNTER_MASK(1, 4)) + +#endif /* _QIB_MAD_H */ diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c index e6687ded821..9bbb55347cc 100644 --- a/drivers/infiniband/hw/qib/qib_mr.c +++ b/drivers/infiniband/hw/qib/qib_mr.c @@ -232,8 +232,8 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { struct qib_mr *mr; struct ib_umem *umem; - struct ib_umem_chunk *chunk; - int n, m, i; + struct scatterlist *sg; + int n, m, entry; struct ib_mr *ret; if (length == 0) { @@ -246,9 +246,7 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ERR(umem)) return (void *) umem; - n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - n += chunk->nents; + n = umem->nmap; mr = alloc_mr(n, pd); if (IS_ERR(mr)) { @@ -268,11 +266,10 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->mr.page_shift = ilog2(umem->page_size); m = 0; n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) { - for (i = 0; i < chunk->nents; i++) { + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { void *vaddr; - vaddr = page_address(sg_page(&chunk->page_list[i])); + vaddr = page_address(sg_page(sg)); if (!vaddr) { ret = ERR_PTR(-EINVAL); goto bail; @@ -284,7 +281,6 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, m++; n = 0; } - } } ret = &mr->ibmr; diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index c574ec7c85e..61a0046efb7 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -51,8 +51,8 @@ * file calls, even though this violates some * expectations of harmlessness. */ -static int qib_tune_pcie_caps(struct qib_devdata *); -static int qib_tune_pcie_coalesce(struct qib_devdata *); +static void qib_tune_pcie_caps(struct qib_devdata *); +static void qib_tune_pcie_coalesce(struct qib_devdata *); /* * Do all the common PCIe setup and initialization. @@ -197,46 +197,47 @@ static void qib_msix_setup(struct qib_devdata *dd, int pos, u32 *msixcnt, struct qib_msix_entry *qib_msix_entry) { int ret; - u32 tabsize = 0; - u16 msix_flags; + int nvec = *msixcnt; struct msix_entry *msix_entry; int i; + ret = pci_msix_vec_count(dd->pcidev); + if (ret < 0) + goto do_intx; + + nvec = min(nvec, ret); + /* We can't pass qib_msix_entry array to qib_msix_setup * so use a dummy msix_entry array and copy the allocated * irq back to the qib_msix_entry array. */ - msix_entry = kmalloc(*msixcnt * sizeof(*msix_entry), GFP_KERNEL); - if (!msix_entry) { - ret = -ENOMEM; + msix_entry = kmalloc(nvec * sizeof(*msix_entry), GFP_KERNEL); + if (!msix_entry) goto do_intx; - } - for (i = 0; i < *msixcnt; i++) + + for (i = 0; i < nvec; i++) msix_entry[i] = qib_msix_entry[i].msix; - pci_read_config_word(dd->pcidev, pos + PCI_MSIX_FLAGS, &msix_flags); - tabsize = 1 + (msix_flags & PCI_MSIX_FLAGS_QSIZE); - if (tabsize > *msixcnt) - tabsize = *msixcnt; - ret = pci_enable_msix(dd->pcidev, msix_entry, tabsize); - if (ret > 0) { - tabsize = ret; - ret = pci_enable_msix(dd->pcidev, msix_entry, tabsize); - } -do_intx: - if (ret) { - qib_dev_err(dd, - "pci_enable_msix %d vectors failed: %d, falling back to INTx\n", - tabsize, ret); - tabsize = 0; - } - for (i = 0; i < tabsize; i++) + ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec); + if (ret < 0) + goto free_msix_entry; + else + nvec = ret; + + for (i = 0; i < nvec; i++) qib_msix_entry[i].msix = msix_entry[i]; + kfree(msix_entry); - *msixcnt = tabsize; + *msixcnt = nvec; + return; - if (ret) - qib_enable_intx(dd->pcidev); +free_msix_entry: + kfree(msix_entry); +do_intx: + qib_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, " + "falling back to INTx\n", nvec, ret); + *msixcnt = 0; + qib_enable_intx(dd->pcidev); } /** @@ -283,12 +284,12 @@ int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent, goto bail; } - pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSIX); + pos = dd->pcidev->msix_cap; if (nent && *nent && pos) { qib_msix_setup(dd, pos, nent, entry); ret = 0; /* did it, either MSIx or INTx */ } else { - pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSI); + pos = dd->pcidev->msi_cap; if (pos) ret = qib_msi_setup(dd, pos); else @@ -357,7 +358,7 @@ int qib_reinit_intr(struct qib_devdata *dd) if (!dd->msi_lo) goto bail; - pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSI); + pos = dd->pcidev->msi_cap; if (!pos) { qib_dev_err(dd, "Can't find MSI capability, can't restore MSI settings\n"); @@ -426,7 +427,7 @@ void qib_enable_intx(struct pci_dev *pdev) if (new != cw) pci_write_config_word(pdev, PCI_COMMAND, new); - pos = pci_find_capability(pdev, PCI_CAP_ID_MSI); + pos = pdev->msi_cap; if (pos) { /* then turn off MSI */ pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &cw); @@ -434,7 +435,7 @@ void qib_enable_intx(struct pci_dev *pdev) if (new != cw) pci_write_config_word(pdev, pos + PCI_MSI_FLAGS, new); } - pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX); + pos = pdev->msix_cap; if (pos) { /* then turn off MSIx */ pci_read_config_word(pdev, pos + PCI_MSIX_FLAGS, &cw); @@ -476,30 +477,6 @@ void qib_pcie_reenable(struct qib_devdata *dd, u16 cmd, u8 iline, u8 cline) "pci_enable_device failed after reset: %d\n", r); } -/* code to adjust PCIe capabilities. */ - -static int fld2val(int wd, int mask) -{ - int lsbmask; - - if (!mask) - return 0; - wd &= mask; - lsbmask = mask ^ (mask & (mask - 1)); - wd /= lsbmask; - return wd; -} - -static int val2fld(int wd, int mask) -{ - int lsbmask; - - if (!mask) - return 0; - lsbmask = mask ^ (mask & (mask - 1)); - wd *= lsbmask; - return wd; -} static int qib_pcie_coalesce; module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO); @@ -511,7 +488,7 @@ MODULE_PARM_DESC(pcie_coalesce, "tune PCIe colescing on some Intel chipsets"); * of these chipsets, with some BIOS settings, and enabling it on those * systems may result in the system crashing, and/or data corruption. */ -static int qib_tune_pcie_coalesce(struct qib_devdata *dd) +static void qib_tune_pcie_coalesce(struct qib_devdata *dd) { int r; struct pci_dev *parent; @@ -519,18 +496,18 @@ static int qib_tune_pcie_coalesce(struct qib_devdata *dd) u32 mask, bits, val; if (!qib_pcie_coalesce) - return 0; + return; /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; if (parent->bus->parent) { qib_devinfo(dd->pcidev, "Parent not root\n"); - return 1; + return; } if (!pci_is_pcie(parent)) - return 1; + return; if (parent->vendor != 0x8086) - return 1; + return; /* * - bit 12: Max_rdcmp_Imt_EN: need to set to 1 @@ -563,13 +540,12 @@ static int qib_tune_pcie_coalesce(struct qib_devdata *dd) mask = (3U << 24) | (7U << 10); } else { /* not one of the chipsets that we know about */ - return 1; + return; } pci_read_config_dword(parent, 0x48, &val); val &= ~mask; val |= bits; r = pci_write_config_dword(parent, 0x48, val); - return 0; } /* @@ -580,55 +556,44 @@ static int qib_pcie_caps; module_param_named(pcie_caps, qib_pcie_caps, int, S_IRUGO); MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); -static int qib_tune_pcie_caps(struct qib_devdata *dd) +static void qib_tune_pcie_caps(struct qib_devdata *dd) { - int ret = 1; /* Assume the worst */ struct pci_dev *parent; - u16 pcaps, pctl, ecaps, ectl; - int rc_sup, ep_sup; - int rc_cur, ep_cur; + u16 rc_mpss, rc_mps, ep_mpss, ep_mps; + u16 rc_mrrs, ep_mrrs, max_mrrs; /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; - if (parent->bus->parent) { + if (!pci_is_root_bus(parent->bus)) { qib_devinfo(dd->pcidev, "Parent not root\n"); - goto bail; + return; } if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) - goto bail; - pcie_capability_read_word(parent, PCI_EXP_DEVCAP, &pcaps); - pcie_capability_read_word(parent, PCI_EXP_DEVCTL, &pctl); + return; + + rc_mpss = parent->pcie_mpss; + rc_mps = ffs(pcie_get_mps(parent)) - 8; /* Find out supported and configured values for endpoint (us) */ - pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCAP, &ecaps); - pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); + ep_mpss = dd->pcidev->pcie_mpss; + ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8; - ret = 0; /* Find max payload supported by root, endpoint */ - rc_sup = fld2val(pcaps, PCI_EXP_DEVCAP_PAYLOAD); - ep_sup = fld2val(ecaps, PCI_EXP_DEVCAP_PAYLOAD); - if (rc_sup > ep_sup) - rc_sup = ep_sup; - - rc_cur = fld2val(pctl, PCI_EXP_DEVCTL_PAYLOAD); - ep_cur = fld2val(ectl, PCI_EXP_DEVCTL_PAYLOAD); + if (rc_mpss > ep_mpss) + rc_mpss = ep_mpss; /* If Supported greater than limit in module param, limit it */ - if (rc_sup > (qib_pcie_caps & 7)) - rc_sup = qib_pcie_caps & 7; + if (rc_mpss > (qib_pcie_caps & 7)) + rc_mpss = qib_pcie_caps & 7; /* If less than (allowed, supported), bump root payload */ - if (rc_sup > rc_cur) { - rc_cur = rc_sup; - pctl = (pctl & ~PCI_EXP_DEVCTL_PAYLOAD) | - val2fld(rc_cur, PCI_EXP_DEVCTL_PAYLOAD); - pcie_capability_write_word(parent, PCI_EXP_DEVCTL, pctl); + if (rc_mpss > rc_mps) { + rc_mps = rc_mpss; + pcie_set_mps(parent, 128 << rc_mps); } /* If less than (allowed, supported), bump endpoint payload */ - if (rc_sup > ep_cur) { - ep_cur = rc_sup; - ectl = (ectl & ~PCI_EXP_DEVCTL_PAYLOAD) | - val2fld(ep_cur, PCI_EXP_DEVCTL_PAYLOAD); - pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); + if (rc_mpss > ep_mps) { + ep_mps = rc_mpss; + pcie_set_mps(dd->pcidev, 128 << ep_mps); } /* @@ -636,26 +601,22 @@ static int qib_tune_pcie_caps(struct qib_devdata *dd) * No field for max supported, but PCIe spec limits it to 4096, * which is code '5' (log2(4096) - 7) */ - rc_sup = 5; - if (rc_sup > ((qib_pcie_caps >> 4) & 7)) - rc_sup = (qib_pcie_caps >> 4) & 7; - rc_cur = fld2val(pctl, PCI_EXP_DEVCTL_READRQ); - ep_cur = fld2val(ectl, PCI_EXP_DEVCTL_READRQ); - - if (rc_sup > rc_cur) { - rc_cur = rc_sup; - pctl = (pctl & ~PCI_EXP_DEVCTL_READRQ) | - val2fld(rc_cur, PCI_EXP_DEVCTL_READRQ); - pcie_capability_write_word(parent, PCI_EXP_DEVCTL, pctl); + max_mrrs = 5; + if (max_mrrs > ((qib_pcie_caps >> 4) & 7)) + max_mrrs = (qib_pcie_caps >> 4) & 7; + + max_mrrs = 128 << max_mrrs; + rc_mrrs = pcie_get_readrq(parent); + ep_mrrs = pcie_get_readrq(dd->pcidev); + + if (max_mrrs > rc_mrrs) { + rc_mrrs = max_mrrs; + pcie_set_readrq(parent, rc_mrrs); } - if (rc_sup > ep_cur) { - ep_cur = rc_sup; - ectl = (ectl & ~PCI_EXP_DEVCTL_READRQ) | - val2fld(ep_cur, PCI_EXP_DEVCTL_READRQ); - pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); + if (max_mrrs > ep_mrrs) { + ep_mrrs = max_mrrs; + pcie_set_readrq(dd->pcidev, ep_mrrs); } -bail: - return ret; } /* End of PCIe capability tuning */ diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 4850d03870c..7fcc150d603 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. * All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -35,6 +35,9 @@ #include <linux/err.h> #include <linux/vmalloc.h> #include <linux/jhash.h> +#ifdef CONFIG_DEBUG_FS +#include <linux/seq_file.h> +#endif #include "qib.h" @@ -222,8 +225,8 @@ static void insert_qp(struct qib_ibdev *dev, struct qib_qp *qp) unsigned long flags; unsigned n = qpn_hash(dev, qp->ibqp.qp_num); - spin_lock_irqsave(&dev->qpt_lock, flags); atomic_inc(&qp->refcount); + spin_lock_irqsave(&dev->qpt_lock, flags); if (qp->ibqp.qp_num == 0) rcu_assign_pointer(ibp->qp0, qp); @@ -235,7 +238,6 @@ static void insert_qp(struct qib_ibdev *dev, struct qib_qp *qp) } spin_unlock_irqrestore(&dev->qpt_lock, flags); - synchronize_rcu(); } /* @@ -247,40 +249,39 @@ static void remove_qp(struct qib_ibdev *dev, struct qib_qp *qp) struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); unsigned n = qpn_hash(dev, qp->ibqp.qp_num); unsigned long flags; + int removed = 1; spin_lock_irqsave(&dev->qpt_lock, flags); if (rcu_dereference_protected(ibp->qp0, lockdep_is_held(&dev->qpt_lock)) == qp) { - atomic_dec(&qp->refcount); rcu_assign_pointer(ibp->qp0, NULL); } else if (rcu_dereference_protected(ibp->qp1, lockdep_is_held(&dev->qpt_lock)) == qp) { - atomic_dec(&qp->refcount); rcu_assign_pointer(ibp->qp1, NULL); } else { struct qib_qp *q; struct qib_qp __rcu **qpp; + removed = 0; qpp = &dev->qp_table[n]; - q = rcu_dereference_protected(*qpp, - lockdep_is_held(&dev->qpt_lock)); - for (; q; qpp = &q->next) { + for (; (q = rcu_dereference_protected(*qpp, + lockdep_is_held(&dev->qpt_lock))) != NULL; + qpp = &q->next) if (q == qp) { - atomic_dec(&qp->refcount); - *qpp = qp->next; - rcu_assign_pointer(qp->next, NULL); - q = rcu_dereference_protected(*qpp, - lockdep_is_held(&dev->qpt_lock)); + rcu_assign_pointer(*qpp, + rcu_dereference_protected(qp->next, + lockdep_is_held(&dev->qpt_lock))); + removed = 1; break; } - q = rcu_dereference_protected(*qpp, - lockdep_is_held(&dev->qpt_lock)); - } } spin_unlock_irqrestore(&dev->qpt_lock, flags); - synchronize_rcu(); + if (removed) { + synchronize_rcu(); + atomic_dec(&qp->refcount); + } } /** @@ -338,26 +339,25 @@ struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn) { struct qib_qp *qp = NULL; + rcu_read_lock(); if (unlikely(qpn <= 1)) { - rcu_read_lock(); if (qpn == 0) qp = rcu_dereference(ibp->qp0); else qp = rcu_dereference(ibp->qp1); + if (qp) + atomic_inc(&qp->refcount); } else { struct qib_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev; unsigned n = qpn_hash(dev, qpn); - rcu_read_lock(); for (qp = rcu_dereference(dev->qp_table[n]); qp; qp = rcu_dereference(qp->next)) - if (qp->ibqp.qp_num == qpn) + if (qp->ibqp.qp_num == qpn) { + atomic_inc(&qp->refcount); break; + } } - if (qp) - if (unlikely(!atomic_inc_not_zero(&qp->refcount))) - qp = NULL; - rcu_read_unlock(); return qp; } @@ -585,7 +585,7 @@ int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask)) + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) goto inval; if (attr_mask & IB_QP_AV) { @@ -985,7 +985,8 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, struct ib_qp *ret; if (init_attr->cap.max_send_sge > ib_qib_max_sges || - init_attr->cap.max_send_wr > ib_qib_max_qp_wrs) { + init_attr->cap.max_send_wr > ib_qib_max_qp_wrs || + init_attr->create_flags) { ret = ERR_PTR(-EINVAL); goto bail; } @@ -1290,3 +1291,94 @@ void qib_get_credit(struct qib_qp *qp, u32 aeth) } } } + +#ifdef CONFIG_DEBUG_FS + +struct qib_qp_iter { + struct qib_ibdev *dev; + struct qib_qp *qp; + int n; +}; + +struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev) +{ + struct qib_qp_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + if (qib_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int qib_qp_iter_next(struct qib_qp_iter *iter) +{ + struct qib_ibdev *dev = iter->dev; + int n = iter->n; + int ret = 1; + struct qib_qp *pqp = iter->qp; + struct qib_qp *qp; + + rcu_read_lock(); + for (; n < dev->qp_table_size; n++) { + if (pqp) + qp = rcu_dereference(pqp->next); + else + qp = rcu_dereference(dev->qp_table[n]); + pqp = qp; + if (qp) { + if (iter->qp) + atomic_dec(&iter->qp->refcount); + atomic_inc(&qp->refcount); + rcu_read_unlock(); + iter->qp = qp; + iter->n = n; + return 0; + } + } + rcu_read_unlock(); + if (iter->qp) + atomic_dec(&iter->qp->refcount); + return ret; +} + +static const char * const qp_type_str[] = { + "SMI", "GSI", "RC", "UC", "UD", +}; + +void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter) +{ + struct qib_swqe *wqe; + struct qib_qp *qp = iter->qp; + + wqe = get_swqe_ptr(qp, qp->s_last); + seq_printf(s, + "N %d QP%u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x\n", + iter->n, + qp->ibqp.qp_num, + qp_type_str[qp->ibqp.qp_type], + qp->state, + wqe->wr.opcode, + qp->s_hdrwords, + qp->s_flags, + atomic_read(&qp->s_dma_busy), + !list_empty(&qp->iowait), + qp->timeout, + wqe->ssn, + qp->s_lsn, + qp->s_last_psn, + qp->s_psn, qp->s_next_psn, + qp->s_sending_psn, qp->s_sending_hpsn, + qp->s_last, qp->s_acked, qp->s_cur, + qp->s_tail, qp->s_head, qp->s_size, + qp->remote_qpn, + qp->remote_ah_attr.dlid); +} + +#endif diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 3ab341320ea..2f2501890c4 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -752,7 +752,7 @@ void qib_send_rc_ack(struct qib_qp *qp) qib_flush_wc(); qib_sendbuf_done(dd, pbufn); - ibp->n_unicast_xmit++; + this_cpu_inc(ibp->pmastats->n_unicast_xmit); goto done; queue_ack: diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index 357b6cfcd46..4c07a8b34ff 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -703,6 +703,7 @@ void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr, ohdr->bth[0] = cpu_to_be32(bth0); ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); ohdr->bth[2] = cpu_to_be32(bth2); + this_cpu_inc(ibp->pmastats->n_unicast_xmit); } /** diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c index 50a8a0d4fe6..911205d3d5a 100644 --- a/drivers/infiniband/hw/qib/qib_sd7220.c +++ b/drivers/infiniband/hw/qib/qib_sd7220.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c index 3fc51443121..c6d6a54d2e1 100644 --- a/drivers/infiniband/hw/qib/qib_sdma.c +++ b/drivers/infiniband/hw/qib/qib_sdma.c @@ -423,8 +423,11 @@ void qib_sdma_intr(struct qib_pportdata *ppd) void __qib_sdma_intr(struct qib_pportdata *ppd) { - if (__qib_sdma_running(ppd)) + if (__qib_sdma_running(ppd)) { qib_sdma_make_progress(ppd); + if (!list_empty(&ppd->sdma_userpending)) + qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); + } } int qib_setup_sdma(struct qib_pportdata *ppd) @@ -452,6 +455,9 @@ int qib_setup_sdma(struct qib_pportdata *ppd) ppd->sdma_descq_removed = 0; ppd->sdma_descq_added = 0; + ppd->sdma_intrequest = 0; + INIT_LIST_HEAD(&ppd->sdma_userpending); + INIT_LIST_HEAD(&ppd->sdma_activelist); tasklet_init(&ppd->sdma_sw_clean_up_task, sdma_sw_clean_up_task, @@ -708,6 +714,62 @@ unlock: return ret; } +/* + * sdma_lock should be acquired before calling this routine + */ +void dump_sdma_state(struct qib_pportdata *ppd) +{ + struct qib_sdma_desc *descq; + struct qib_sdma_txreq *txp, *txpnext; + __le64 *descqp; + u64 desc[2]; + u64 addr; + u16 gen, dwlen, dwoffset; + u16 head, tail, cnt; + + head = ppd->sdma_descq_head; + tail = ppd->sdma_descq_tail; + cnt = qib_sdma_descq_freecnt(ppd); + descq = ppd->sdma_descq; + + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA ppd->sdma_descq_head: %u\n", head); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA ppd->sdma_descq_tail: %u\n", tail); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sdma_descq_freecnt: %u\n", cnt); + + /* print info for each entry in the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 'x', 0 }; + + descqp = &descq[head].qw[0]; + desc[0] = le64_to_cpu(descqp[0]); + desc[1] = le64_to_cpu(descqp[1]); + flags[0] = (desc[0] & 1<<15) ? 'I' : '-'; + flags[1] = (desc[0] & 1<<14) ? 'L' : 'S'; + flags[2] = (desc[0] & 1<<13) ? 'H' : '-'; + flags[3] = (desc[0] & 1<<12) ? 'F' : '-'; + flags[4] = (desc[0] & 1<<11) ? 'L' : '-'; + addr = (desc[1] << 32) | ((desc[0] >> 32) & 0xfffffffcULL); + gen = (desc[0] >> 30) & 3ULL; + dwlen = (desc[0] >> 14) & (0x7ffULL << 2); + dwoffset = (desc[0] & 0x7ffULL) << 2; + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes offset:%u bytes\n", + head, flags, addr, gen, dwlen, dwoffset); + if (++head == ppd->sdma_descq_cnt) + head = 0; + } + + /* print dma descriptor indices from the TX requests */ + list_for_each_entry_safe(txp, txpnext, &ppd->sdma_activelist, + list) + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA txp->start_idx: %u txp->next_descq_idx: %u\n", + txp->start_idx, txp->next_descq_idx); +} + void qib_sdma_process_event(struct qib_pportdata *ppd, enum qib_sdma_events event) { diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index 034cc821de5..3c8e4e3caca 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -808,10 +808,14 @@ int qib_verbs_register_sysfs(struct qib_devdata *dd) for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) { ret = device_create_file(&dev->dev, qib_attributes[i]); if (ret) - return ret; + goto bail; } return 0; +bail: + for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) + device_remove_file(&dev->dev, qib_attributes[i]); + return ret; } /* diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index d6c7fe7f88d..aaf7039f8ed 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -57,13 +57,20 @@ static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) struct qib_sge *sge; struct ib_wc wc; u32 length; + enum ib_qp_type sqptype, dqptype; qp = qib_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn); if (!qp) { ibp->n_pkt_drops++; return; } - if (qp->ibqp.qp_type != sqp->ibqp.qp_type || + + sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : sqp->ibqp.qp_type; + dqptype = qp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : qp->ibqp.qp_type; + + if (dqptype != sqptype || !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) { ibp->n_pkt_drops++; goto drop; @@ -273,11 +280,11 @@ int qib_make_ud_req(struct qib_qp *qp) ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr; if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE) { if (ah_attr->dlid != QIB_PERMISSIVE_LID) - ibp->n_multicast_xmit++; + this_cpu_inc(ibp->pmastats->n_multicast_xmit); else - ibp->n_unicast_xmit++; + this_cpu_inc(ibp->pmastats->n_unicast_xmit); } else { - ibp->n_unicast_xmit++; + this_cpu_inc(ibp->pmastats->n_unicast_xmit); lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1); if (unlikely(lid == ppd->lid)) { /* diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index 82442085cbe..d2806cae234 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -52,21 +52,48 @@ /* attempt to drain the queue for 5secs */ #define QIB_USER_SDMA_DRAIN_TIMEOUT 500 +/* + * track how many times a process open this driver. + */ +static struct rb_root qib_user_sdma_rb_root = RB_ROOT; + +struct qib_user_sdma_rb_node { + struct rb_node node; + int refcount; + pid_t pid; +}; + struct qib_user_sdma_pkt { - u8 naddr; /* dimension of addr (1..3) ... */ + struct list_head list; /* list element */ + + u8 tiddma; /* if this is NEW tid-sdma */ + u8 largepkt; /* this is large pkt from kmalloc */ + u16 frag_size; /* frag size used by PSM */ + u16 index; /* last header index or push index */ + u16 naddr; /* dimension of addr (1..3) ... */ + u16 addrlimit; /* addr array size */ + u16 tidsmidx; /* current tidsm index */ + u16 tidsmcount; /* tidsm array item count */ + u16 payload_size; /* payload size so far for header */ + u32 bytes_togo; /* bytes for processing */ u32 counter; /* sdma pkts queued counter for this entry */ + struct qib_tid_session_member *tidsm; /* tid session member array */ + struct qib_user_sdma_queue *pq; /* which pq this pkt belongs to */ u64 added; /* global descq number of entries */ struct { - u32 offset; /* offset for kvaddr, addr */ - u32 length; /* length in page */ - u8 put_page; /* should we put_page? */ - u8 dma_mapped; /* is page dma_mapped? */ + u16 offset; /* offset for kvaddr, addr */ + u16 length; /* length in page */ + u16 first_desc; /* first desc */ + u16 last_desc; /* last desc */ + u16 put_page; /* should we put_page? */ + u16 dma_mapped; /* is page dma_mapped? */ + u16 dma_length; /* for dma_unmap_page() */ + u16 padding; struct page *page; /* may be NULL (coherent mem) */ void *kvaddr; /* FIXME: only for pio hack */ dma_addr_t addr; } addr[4]; /* max pages, any more and we coalesce */ - struct list_head list; /* list element */ }; struct qib_user_sdma_queue { @@ -77,6 +104,12 @@ struct qib_user_sdma_queue { */ struct list_head sent; + /* + * Because above list will be accessed by both process and + * signal handler, we need a spinlock for it. + */ + spinlock_t sent_lock ____cacheline_aligned_in_smp; + /* headers with expected length are allocated from here... */ char header_cache_name[64]; struct dma_pool *header_cache; @@ -88,27 +121,83 @@ struct qib_user_sdma_queue { /* as packets go on the queued queue, they are counted... */ u32 counter; u32 sent_counter; + /* pending packets, not sending yet */ + u32 num_pending; + /* sending packets, not complete yet */ + u32 num_sending; + /* global descq number of entry of last sending packet */ + u64 added; /* dma page table */ struct rb_root dma_pages_root; + struct qib_user_sdma_rb_node *sdma_rb_node; + /* protect everything above... */ struct mutex lock; }; +static struct qib_user_sdma_rb_node * +qib_user_sdma_rb_search(struct rb_root *root, pid_t pid) +{ + struct qib_user_sdma_rb_node *sdma_rb_node; + struct rb_node *node = root->rb_node; + + while (node) { + sdma_rb_node = container_of(node, + struct qib_user_sdma_rb_node, node); + if (pid < sdma_rb_node->pid) + node = node->rb_left; + else if (pid > sdma_rb_node->pid) + node = node->rb_right; + else + return sdma_rb_node; + } + return NULL; +} + +static int +qib_user_sdma_rb_insert(struct rb_root *root, struct qib_user_sdma_rb_node *new) +{ + struct rb_node **node = &(root->rb_node); + struct rb_node *parent = NULL; + struct qib_user_sdma_rb_node *got; + + while (*node) { + got = container_of(*node, struct qib_user_sdma_rb_node, node); + parent = *node; + if (new->pid < got->pid) + node = &((*node)->rb_left); + else if (new->pid > got->pid) + node = &((*node)->rb_right); + else + return 0; + } + + rb_link_node(&new->node, parent, node); + rb_insert_color(&new->node, root); + return 1; +} + struct qib_user_sdma_queue * qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt) { struct qib_user_sdma_queue *pq = kmalloc(sizeof(struct qib_user_sdma_queue), GFP_KERNEL); + struct qib_user_sdma_rb_node *sdma_rb_node; if (!pq) goto done; pq->counter = 0; pq->sent_counter = 0; - INIT_LIST_HEAD(&pq->sent); + pq->num_pending = 0; + pq->num_sending = 0; + pq->added = 0; + pq->sdma_rb_node = NULL; + INIT_LIST_HEAD(&pq->sent); + spin_lock_init(&pq->sent_lock); mutex_init(&pq->lock); snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name), @@ -131,8 +220,30 @@ qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt) pq->dma_pages_root = RB_ROOT; + sdma_rb_node = qib_user_sdma_rb_search(&qib_user_sdma_rb_root, + current->pid); + if (sdma_rb_node) { + sdma_rb_node->refcount++; + } else { + int ret; + sdma_rb_node = kmalloc(sizeof( + struct qib_user_sdma_rb_node), GFP_KERNEL); + if (!sdma_rb_node) + goto err_rb; + + sdma_rb_node->refcount = 1; + sdma_rb_node->pid = current->pid; + + ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root, + sdma_rb_node); + BUG_ON(ret == 0); + } + pq->sdma_rb_node = sdma_rb_node; + goto done; +err_rb: + dma_pool_destroy(pq->header_cache); err_slab: kmem_cache_destroy(pq->pkt_slab); err_kfree: @@ -144,34 +255,310 @@ done: } static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt, - int i, size_t offset, size_t len, - int put_page, int dma_mapped, - struct page *page, - void *kvaddr, dma_addr_t dma_addr) + int i, u16 offset, u16 len, + u16 first_desc, u16 last_desc, + u16 put_page, u16 dma_mapped, + struct page *page, void *kvaddr, + dma_addr_t dma_addr, u16 dma_length) { pkt->addr[i].offset = offset; pkt->addr[i].length = len; + pkt->addr[i].first_desc = first_desc; + pkt->addr[i].last_desc = last_desc; pkt->addr[i].put_page = put_page; pkt->addr[i].dma_mapped = dma_mapped; pkt->addr[i].page = page; pkt->addr[i].kvaddr = kvaddr; pkt->addr[i].addr = dma_addr; + pkt->addr[i].dma_length = dma_length; +} + +static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq, + size_t len, dma_addr_t *dma_addr) +{ + void *hdr; + + if (len == QIB_USER_SDMA_EXP_HEADER_LENGTH) + hdr = dma_pool_alloc(pq->header_cache, GFP_KERNEL, + dma_addr); + else + hdr = NULL; + + if (!hdr) { + hdr = kmalloc(len, GFP_KERNEL); + if (!hdr) + return NULL; + + *dma_addr = 0; + } + + return hdr; } -static void qib_user_sdma_init_header(struct qib_user_sdma_pkt *pkt, - u32 counter, size_t offset, - size_t len, int dma_mapped, - struct page *page, - void *kvaddr, dma_addr_t dma_addr) +static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + struct page *page, u16 put, + u16 offset, u16 len, void *kvaddr) { - pkt->naddr = 1; - pkt->counter = counter; - qib_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page, - kvaddr, dma_addr); + __le16 *pbc16; + void *pbcvaddr; + struct qib_message_header *hdr; + u16 newlen, pbclen, lastdesc, dma_mapped; + u32 vcto; + union qib_seqnum seqnum; + dma_addr_t pbcdaddr; + dma_addr_t dma_addr = + dma_map_page(&dd->pcidev->dev, + page, offset, len, DMA_TO_DEVICE); + int ret = 0; + + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + /* + * dma mapping error, pkt has not managed + * this page yet, return the page here so + * the caller can ignore this page. + */ + if (put) { + put_page(page); + } else { + /* coalesce case */ + kunmap(page); + __free_page(page); + } + ret = -ENOMEM; + goto done; + } + offset = 0; + dma_mapped = 1; + + +next_fragment: + + /* + * In tid-sdma, the transfer length is restricted by + * receiver side current tid page length. + */ + if (pkt->tiddma && len > pkt->tidsm[pkt->tidsmidx].length) + newlen = pkt->tidsm[pkt->tidsmidx].length; + else + newlen = len; + + /* + * Then the transfer length is restricted by MTU. + * the last descriptor flag is determined by: + * 1. the current packet is at frag size length. + * 2. the current tid page is done if tid-sdma. + * 3. there is no more byte togo if sdma. + */ + lastdesc = 0; + if ((pkt->payload_size + newlen) >= pkt->frag_size) { + newlen = pkt->frag_size - pkt->payload_size; + lastdesc = 1; + } else if (pkt->tiddma) { + if (newlen == pkt->tidsm[pkt->tidsmidx].length) + lastdesc = 1; + } else { + if (newlen == pkt->bytes_togo) + lastdesc = 1; + } + + /* fill the next fragment in this page */ + qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */ + offset, newlen, /* offset, len */ + 0, lastdesc, /* first last desc */ + put, dma_mapped, /* put page, dma mapped */ + page, kvaddr, /* struct page, virt addr */ + dma_addr, len); /* dma addr, dma length */ + pkt->bytes_togo -= newlen; + pkt->payload_size += newlen; + pkt->naddr++; + if (pkt->naddr == pkt->addrlimit) { + ret = -EFAULT; + goto done; + } + + /* If there is no more byte togo. (lastdesc==1) */ + if (pkt->bytes_togo == 0) { + /* The packet is done, header is not dma mapped yet. + * it should be from kmalloc */ + if (!pkt->addr[pkt->index].addr) { + pkt->addr[pkt->index].addr = + dma_map_single(&dd->pcidev->dev, + pkt->addr[pkt->index].kvaddr, + pkt->addr[pkt->index].dma_length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + pkt->addr[pkt->index].addr)) { + ret = -ENOMEM; + goto done; + } + pkt->addr[pkt->index].dma_mapped = 1; + } + + goto done; + } + + /* If tid-sdma, advance tid info. */ + if (pkt->tiddma) { + pkt->tidsm[pkt->tidsmidx].length -= newlen; + if (pkt->tidsm[pkt->tidsmidx].length) { + pkt->tidsm[pkt->tidsmidx].offset += newlen; + } else { + pkt->tidsmidx++; + if (pkt->tidsmidx == pkt->tidsmcount) { + ret = -EFAULT; + goto done; + } + } + } + + /* + * If this is NOT the last descriptor. (newlen==len) + * the current packet is not done yet, but the current + * send side page is done. + */ + if (lastdesc == 0) + goto done; + + /* + * If running this driver under PSM with message size + * fitting into one transfer unit, it is not possible + * to pass this line. otherwise, it is a buggggg. + */ + + /* + * Since the current packet is done, and there are more + * bytes togo, we need to create a new sdma header, copying + * from previous sdma header and modify both. + */ + pbclen = pkt->addr[pkt->index].length; + pbcvaddr = qib_user_sdma_alloc_header(pq, pbclen, &pbcdaddr); + if (!pbcvaddr) { + ret = -ENOMEM; + goto done; + } + /* Copy the previous sdma header to new sdma header */ + pbc16 = (__le16 *)pkt->addr[pkt->index].kvaddr; + memcpy(pbcvaddr, pbc16, pbclen); + + /* Modify the previous sdma header */ + hdr = (struct qib_message_header *)&pbc16[4]; + + /* New pbc length */ + pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->bytes_togo>>2)); + + /* New packet length */ + hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0])); + + if (pkt->tiddma) { + /* turn on the header suppression */ + hdr->iph.pkt_flags = + cpu_to_le16(le16_to_cpu(hdr->iph.pkt_flags)|0x2); + /* turn off ACK_REQ: 0x04 and EXPECTED_DONE: 0x20 */ + hdr->flags &= ~(0x04|0x20); + } else { + /* turn off extra bytes: 20-21 bits */ + hdr->bth[0] = cpu_to_be32(be32_to_cpu(hdr->bth[0])&0xFFCFFFFF); + /* turn off ACK_REQ: 0x04 */ + hdr->flags &= ~(0x04); + } + + /* New kdeth checksum */ + vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset); + hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH + + be16_to_cpu(hdr->lrh[2]) - + ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) - + le16_to_cpu(hdr->iph.pkt_flags)); + + /* The packet is done, header is not dma mapped yet. + * it should be from kmalloc */ + if (!pkt->addr[pkt->index].addr) { + pkt->addr[pkt->index].addr = + dma_map_single(&dd->pcidev->dev, + pkt->addr[pkt->index].kvaddr, + pkt->addr[pkt->index].dma_length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + pkt->addr[pkt->index].addr)) { + ret = -ENOMEM; + goto done; + } + pkt->addr[pkt->index].dma_mapped = 1; + } + + /* Modify the new sdma header */ + pbc16 = (__le16 *)pbcvaddr; + hdr = (struct qib_message_header *)&pbc16[4]; + + /* New pbc length */ + pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->payload_size>>2)); + + /* New packet length */ + hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0])); + + if (pkt->tiddma) { + /* Set new tid and offset for new sdma header */ + hdr->iph.ver_ctxt_tid_offset = cpu_to_le32( + (le32_to_cpu(hdr->iph.ver_ctxt_tid_offset)&0xFF000000) + + (pkt->tidsm[pkt->tidsmidx].tid<<QLOGIC_IB_I_TID_SHIFT) + + (pkt->tidsm[pkt->tidsmidx].offset>>2)); + } else { + /* Middle protocol new packet offset */ + hdr->uwords[2] += pkt->payload_size; + } + + /* New kdeth checksum */ + vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset); + hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH + + be16_to_cpu(hdr->lrh[2]) - + ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) - + le16_to_cpu(hdr->iph.pkt_flags)); + + /* Next sequence number in new sdma header */ + seqnum.val = be32_to_cpu(hdr->bth[2]); + if (pkt->tiddma) + seqnum.seq++; + else + seqnum.pkt++; + hdr->bth[2] = cpu_to_be32(seqnum.val); + + /* Init new sdma header. */ + qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */ + 0, pbclen, /* offset, len */ + 1, 0, /* first last desc */ + 0, 0, /* put page, dma mapped */ + NULL, pbcvaddr, /* struct page, virt addr */ + pbcdaddr, pbclen); /* dma addr, dma length */ + pkt->index = pkt->naddr; + pkt->payload_size = 0; + pkt->naddr++; + if (pkt->naddr == pkt->addrlimit) { + ret = -EFAULT; + goto done; + } + + /* Prepare for next fragment in this page */ + if (newlen != len) { + if (dma_mapped) { + put = 0; + dma_mapped = 0; + page = NULL; + kvaddr = NULL; + } + len -= newlen; + offset += newlen; + + goto next_fragment; + } + +done: + return ret; } /* we've too many pages in the iovec, coalesce to a single page */ static int qib_user_sdma_coalesce(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, struct qib_user_sdma_pkt *pkt, const struct iovec *iov, unsigned long niov) @@ -182,7 +569,6 @@ static int qib_user_sdma_coalesce(const struct qib_devdata *dd, char *mpage; int i; int len = 0; - dma_addr_t dma_addr; if (!page) { ret = -ENOMEM; @@ -205,17 +591,8 @@ static int qib_user_sdma_coalesce(const struct qib_devdata *dd, len += iov[i].iov_len; } - dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len, - DMA_TO_DEVICE); - if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { - ret = -ENOMEM; - goto free_unmap; - } - - qib_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save, - dma_addr); - pkt->naddr = 2; - + ret = qib_user_sdma_page_to_frags(dd, pq, pkt, + page, 0, 0, len, mpage_save); goto done; free_unmap: @@ -238,16 +615,6 @@ static int qib_user_sdma_num_pages(const struct iovec *iov) return 1 + ((epage - spage) >> PAGE_SHIFT); } -/* - * Truncate length to page boundary. - */ -static int qib_user_sdma_page_length(unsigned long addr, unsigned long len) -{ - const unsigned long offset = addr & ~PAGE_MASK; - - return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len; -} - static void qib_user_sdma_free_pkt_frag(struct device *dev, struct qib_user_sdma_queue *pq, struct qib_user_sdma_pkt *pkt, @@ -256,10 +623,11 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev, const int i = frag; if (pkt->addr[i].page) { + /* only user data has page */ if (pkt->addr[i].dma_mapped) dma_unmap_page(dev, pkt->addr[i].addr, - pkt->addr[i].length, + pkt->addr[i].dma_length, DMA_TO_DEVICE); if (pkt->addr[i].kvaddr) @@ -269,55 +637,80 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev, put_page(pkt->addr[i].page); else __free_page(pkt->addr[i].page); - } else if (pkt->addr[i].kvaddr) - /* free coherent mem from cache... */ - dma_pool_free(pq->header_cache, + } else if (pkt->addr[i].kvaddr) { + /* for headers */ + if (pkt->addr[i].dma_mapped) { + /* from kmalloc & dma mapped */ + dma_unmap_single(dev, + pkt->addr[i].addr, + pkt->addr[i].dma_length, + DMA_TO_DEVICE); + kfree(pkt->addr[i].kvaddr); + } else if (pkt->addr[i].addr) { + /* free coherent mem from cache... */ + dma_pool_free(pq->header_cache, pkt->addr[i].kvaddr, pkt->addr[i].addr); + } else { + /* from kmalloc but not dma mapped */ + kfree(pkt->addr[i].kvaddr); + } + } } /* return number of pages pinned... */ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, struct qib_user_sdma_pkt *pkt, unsigned long addr, int tlen, int npages) { - struct page *pages[2]; - int j; - int ret; - - ret = get_user_pages(current, current->mm, addr, - npages, 0, 1, pages, NULL); - - if (ret != npages) { - int i; - - for (i = 0; i < ret; i++) - put_page(pages[i]); - - ret = -ENOMEM; - goto done; - } + struct page *pages[8]; + int i, j; + int ret = 0; - for (j = 0; j < npages; j++) { - /* map the pages... */ - const int flen = qib_user_sdma_page_length(addr, tlen); - dma_addr_t dma_addr = - dma_map_page(&dd->pcidev->dev, - pages[j], 0, flen, DMA_TO_DEVICE); - unsigned long fofs = addr & ~PAGE_MASK; + while (npages) { + if (npages > 8) + j = 8; + else + j = npages; - if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = get_user_pages_fast(addr, j, 0, pages); + if (ret != j) { + i = 0; + j = ret; ret = -ENOMEM; - goto done; + goto free_pages; } - qib_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1, - pages[j], kmap(pages[j]), dma_addr); + for (i = 0; i < j; i++) { + /* map the pages... */ + unsigned long fofs = addr & ~PAGE_MASK; + int flen = ((fofs + tlen) > PAGE_SIZE) ? + (PAGE_SIZE - fofs) : tlen; + + ret = qib_user_sdma_page_to_frags(dd, pq, pkt, + pages[i], 1, fofs, flen, NULL); + if (ret < 0) { + /* current page has beed taken + * care of inside above call. + */ + i++; + goto free_pages; + } + + addr += flen; + tlen -= flen; + } - pkt->naddr++; - addr += flen; - tlen -= flen; + npages -= j; } + goto done; + + /* if error, return all pages not managed by pkt */ +free_pages: + while (i < j) + put_page(pages[i++]); + done: return ret; } @@ -335,7 +728,7 @@ static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd, const int npages = qib_user_sdma_num_pages(iov + idx); const unsigned long addr = (unsigned long) iov[idx].iov_base; - ret = qib_user_sdma_pin_pages(dd, pkt, addr, + ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr, iov[idx].iov_len, npages); if (ret < 0) goto free_pkt; @@ -344,9 +737,22 @@ static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd, goto done; free_pkt: - for (idx = 0; idx < pkt->naddr; idx++) + /* we need to ignore the first entry here */ + for (idx = 1; idx < pkt->naddr; idx++) qib_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx); + /* need to dma unmap the first entry, this is to restore to + * the original state so that caller can free the memory in + * error condition. Caller does not know if dma mapped or not*/ + if (pkt->addr[0].dma_mapped) { + dma_unmap_single(&dd->pcidev->dev, + pkt->addr[0].addr, + pkt->addr[0].dma_length, + DMA_TO_DEVICE); + pkt->addr[0].addr = 0; + pkt->addr[0].dma_mapped = 0; + } + done: return ret; } @@ -359,8 +765,9 @@ static int qib_user_sdma_init_payload(const struct qib_devdata *dd, { int ret = 0; - if (npages >= ARRAY_SIZE(pkt->addr)) - ret = qib_user_sdma_coalesce(dd, pkt, iov, niov); + if (pkt->frag_size == pkt->bytes_togo && + npages >= ARRAY_SIZE(pkt->addr)) + ret = qib_user_sdma_coalesce(dd, pq, pkt, iov, niov); else ret = qib_user_sdma_pin_pkt(dd, pq, pkt, iov, niov); @@ -380,7 +787,10 @@ static void qib_user_sdma_free_pkt_list(struct device *dev, for (i = 0; i < pkt->naddr; i++) qib_user_sdma_free_pkt_frag(dev, pq, pkt, i); - kmem_cache_free(pq->pkt_slab, pkt); + if (pkt->largepkt) + kfree(pkt); + else + kmem_cache_free(pq->pkt_slab, pkt); } INIT_LIST_HEAD(list); } @@ -393,63 +803,48 @@ static void qib_user_sdma_free_pkt_list(struct device *dev, * as, if there is an error we clean it... */ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, + struct qib_pportdata *ppd, struct qib_user_sdma_queue *pq, - struct list_head *list, const struct iovec *iov, unsigned long niov, - int maxpkts) + struct list_head *list, + int *maxpkts, int *ndesc) { unsigned long idx = 0; int ret = 0; int npkts = 0; - struct page *page = NULL; __le32 *pbc; dma_addr_t dma_addr; struct qib_user_sdma_pkt *pkt = NULL; size_t len; size_t nw; u32 counter = pq->counter; - int dma_mapped = 0; + u16 frag_size; - while (idx < niov && npkts < maxpkts) { + while (idx < niov && npkts < *maxpkts) { const unsigned long addr = (unsigned long) iov[idx].iov_base; const unsigned long idx_save = idx; unsigned pktnw; unsigned pktnwc; int nfrags = 0; int npages = 0; + int bytes_togo = 0; + int tiddma = 0; int cfur; - dma_mapped = 0; len = iov[idx].iov_len; nw = len >> 2; - page = NULL; - - pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL); - if (!pkt) { - ret = -ENOMEM; - goto free_list; - } if (len < QIB_USER_SDMA_MIN_HEADER_LENGTH || len > PAGE_SIZE || len & 3 || addr & 3) { ret = -EINVAL; - goto free_pkt; + goto free_list; } - if (len == QIB_USER_SDMA_EXP_HEADER_LENGTH) - pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL, - &dma_addr); - else - pbc = NULL; - + pbc = qib_user_sdma_alloc_header(pq, len, &dma_addr); if (!pbc) { - page = alloc_page(GFP_KERNEL); - if (!page) { - ret = -ENOMEM; - goto free_pkt; - } - pbc = kmap(page); + ret = -ENOMEM; + goto free_list; } cfur = copy_from_user(pbc, iov[idx].iov_base, len); @@ -474,8 +869,8 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, * we can verify that the packet is consistent with the * iovec lengths. */ - pktnw = le32_to_cpu(*pbc) & QIB_PBC_LENGTH_MASK; - if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) { + pktnw = le32_to_cpu(*pbc) & 0xFFFF; + if (pktnw < pktnwc) { ret = -EINVAL; goto free_pbc; } @@ -486,17 +881,14 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, const unsigned long faddr = (unsigned long) iov[idx].iov_base; - if (slen & 3 || faddr & 3 || !slen || - slen > PAGE_SIZE) { + if (slen & 3 || faddr & 3 || !slen) { ret = -EINVAL; goto free_pbc; } - npages++; - if ((faddr & PAGE_MASK) != - ((faddr + slen - 1) & PAGE_MASK)) - npages++; + npages += qib_user_sdma_num_pages(&iov[idx]); + bytes_togo += slen; pktnwc += slen >> 2; idx++; nfrags++; @@ -507,48 +899,139 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, goto free_pbc; } - if (page) { - dma_addr = dma_map_page(&dd->pcidev->dev, - page, 0, len, DMA_TO_DEVICE); - if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + frag_size = ((le32_to_cpu(*pbc))>>16) & 0xFFFF; + if (((frag_size ? frag_size : bytes_togo) + len) > + ppd->ibmaxlen) { + ret = -EINVAL; + goto free_pbc; + } + + if (frag_size) { + int pktsize, tidsmsize, n; + + n = npages*((2*PAGE_SIZE/frag_size)+1); + pktsize = sizeof(*pkt) + sizeof(pkt->addr[0])*n; + + /* + * Determine if this is tid-sdma or just sdma. + */ + tiddma = (((le32_to_cpu(pbc[7])>> + QLOGIC_IB_I_TID_SHIFT)& + QLOGIC_IB_I_TID_MASK) != + QLOGIC_IB_I_TID_MASK); + + if (tiddma) + tidsmsize = iov[idx].iov_len; + else + tidsmsize = 0; + + pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL); + if (!pkt) { ret = -ENOMEM; goto free_pbc; } + pkt->largepkt = 1; + pkt->frag_size = frag_size; + pkt->addrlimit = n + ARRAY_SIZE(pkt->addr); + + if (tiddma) { + char *tidsm = (char *)pkt + pktsize; + cfur = copy_from_user(tidsm, + iov[idx].iov_base, tidsmsize); + if (cfur) { + ret = -EFAULT; + goto free_pkt; + } + pkt->tidsm = + (struct qib_tid_session_member *)tidsm; + pkt->tidsmcount = tidsmsize/ + sizeof(struct qib_tid_session_member); + pkt->tidsmidx = 0; + idx++; + } - dma_mapped = 1; + /* + * pbc 'fill1' field is borrowed to pass frag size, + * we need to clear it after picking frag size, the + * hardware requires this field to be zero. + */ + *pbc = cpu_to_le32(le32_to_cpu(*pbc) & 0x0000FFFF); + } else { + pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL); + if (!pkt) { + ret = -ENOMEM; + goto free_pbc; + } + pkt->largepkt = 0; + pkt->frag_size = bytes_togo; + pkt->addrlimit = ARRAY_SIZE(pkt->addr); } - - qib_user_sdma_init_header(pkt, counter, 0, len, dma_mapped, - page, pbc, dma_addr); + pkt->bytes_togo = bytes_togo; + pkt->payload_size = 0; + pkt->counter = counter; + pkt->tiddma = tiddma; + + /* setup the first header */ + qib_user_sdma_init_frag(pkt, 0, /* index */ + 0, len, /* offset, len */ + 1, 0, /* first last desc */ + 0, 0, /* put page, dma mapped */ + NULL, pbc, /* struct page, virt addr */ + dma_addr, len); /* dma addr, dma length */ + pkt->index = 0; + pkt->naddr = 1; if (nfrags) { ret = qib_user_sdma_init_payload(dd, pq, pkt, iov + idx_save + 1, nfrags, npages); if (ret < 0) - goto free_pbc_dma; + goto free_pkt; + } else { + /* since there is no payload, mark the + * header as the last desc. */ + pkt->addr[0].last_desc = 1; + + if (dma_addr == 0) { + /* + * the header is not dma mapped yet. + * it should be from kmalloc. + */ + dma_addr = dma_map_single(&dd->pcidev->dev, + pbc, len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + dma_addr)) { + ret = -ENOMEM; + goto free_pkt; + } + pkt->addr[0].addr = dma_addr; + pkt->addr[0].dma_mapped = 1; + } } counter++; npkts++; + pkt->pq = pq; + pkt->index = 0; /* reset index for push on hw */ + *ndesc += pkt->naddr; list_add_tail(&pkt->list, list); } + *maxpkts = npkts; ret = idx; goto done; -free_pbc_dma: - if (dma_mapped) - dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE); +free_pkt: + if (pkt->largepkt) + kfree(pkt); + else + kmem_cache_free(pq->pkt_slab, pkt); free_pbc: - if (page) { - kunmap(page); - __free_page(page); - } else + if (dma_addr) dma_pool_free(pq->header_cache, pbc, dma_addr); -free_pkt: - kmem_cache_free(pq->pkt_slab, pkt); + else + kfree(pbc); free_list: qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list); done: @@ -569,10 +1052,20 @@ static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd, struct list_head free_list; struct qib_user_sdma_pkt *pkt; struct qib_user_sdma_pkt *pkt_prev; + unsigned long flags; int ret = 0; + if (!pq->num_sending) + return 0; + INIT_LIST_HEAD(&free_list); + /* + * We need this spin lock here because interrupt handler + * might modify this list in qib_user_sdma_send_desc(), also + * we can not get interrupted, otherwise it is a deadlock. + */ + spin_lock_irqsave(&pq->sent_lock, flags); list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) { s64 descd = ppd->sdma_descq_removed - pkt->added; @@ -583,7 +1076,9 @@ static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd, /* one more packet cleaned */ ret++; + pq->num_sending--; } + spin_unlock_irqrestore(&pq->sent_lock, flags); if (!list_empty(&free_list)) { u32 counter; @@ -604,8 +1099,13 @@ void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq) if (!pq) return; - kmem_cache_destroy(pq->pkt_slab); + pq->sdma_rb_node->refcount--; + if (pq->sdma_rb_node->refcount == 0) { + rb_erase(&pq->sdma_rb_node->node, &qib_user_sdma_rb_root); + kfree(pq->sdma_rb_node); + } dma_pool_destroy(pq->header_cache); + kmem_cache_destroy(pq->pkt_slab); kfree(pq); } @@ -627,6 +1127,7 @@ void qib_user_sdma_queue_drain(struct qib_pportdata *ppd, struct qib_user_sdma_queue *pq) { struct qib_devdata *dd = ppd->dd; + unsigned long flags; int i; if (!pq) @@ -634,7 +1135,7 @@ void qib_user_sdma_queue_drain(struct qib_pportdata *ppd, for (i = 0; i < QIB_USER_SDMA_DRAIN_TIMEOUT; i++) { mutex_lock(&pq->lock); - if (list_empty(&pq->sent)) { + if (!pq->num_pending && !pq->num_sending) { mutex_unlock(&pq->lock); break; } @@ -644,29 +1145,44 @@ void qib_user_sdma_queue_drain(struct qib_pportdata *ppd, msleep(10); } - if (!list_empty(&pq->sent)) { + if (pq->num_pending || pq->num_sending) { + struct qib_user_sdma_pkt *pkt; + struct qib_user_sdma_pkt *pkt_prev; struct list_head free_list; + mutex_lock(&pq->lock); + spin_lock_irqsave(&ppd->sdma_lock, flags); + /* + * Since we hold sdma_lock, it is safe without sent_lock. + */ + if (pq->num_pending) { + list_for_each_entry_safe(pkt, pkt_prev, + &ppd->sdma_userpending, list) { + if (pkt->pq == pq) { + list_move_tail(&pkt->list, &pq->sent); + pq->num_pending--; + pq->num_sending++; + } + } + } + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + qib_dev_err(dd, "user sdma lists not empty: forcing!\n"); INIT_LIST_HEAD(&free_list); - mutex_lock(&pq->lock); list_splice_init(&pq->sent, &free_list); + pq->num_sending = 0; qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); mutex_unlock(&pq->lock); } } -static inline __le64 qib_sdma_make_desc0(struct qib_pportdata *ppd, +static inline __le64 qib_sdma_make_desc0(u8 gen, u64 addr, u64 dwlen, u64 dwoffset) { - u8 tmpgen; - - tmpgen = ppd->sdma_generation; - return cpu_to_le64(/* SDmaPhyAddr[31:0] */ ((addr & 0xfffffffcULL) << 32) | /* SDmaGeneration[1:0] */ - ((tmpgen & 3ULL) << 30) | + ((gen & 3ULL) << 30) | /* SDmaDwordCount[10:0] */ ((dwlen & 0x7ffULL) << 16) | /* SDmaBufOffset[12:2] */ @@ -692,7 +1208,7 @@ static inline __le64 qib_sdma_make_desc1(u64 addr) static void qib_user_sdma_send_frag(struct qib_pportdata *ppd, struct qib_user_sdma_pkt *pkt, int idx, - unsigned ofs, u16 tail) + unsigned ofs, u16 tail, u8 gen) { const u64 addr = (u64) pkt->addr[idx].addr + (u64) pkt->addr[idx].offset; @@ -702,105 +1218,159 @@ static void qib_user_sdma_send_frag(struct qib_pportdata *ppd, descqp = &ppd->sdma_descq[tail].qw[0]; - descq0 = qib_sdma_make_desc0(ppd, addr, dwlen, ofs); - if (idx == 0) + descq0 = qib_sdma_make_desc0(gen, addr, dwlen, ofs); + if (pkt->addr[idx].first_desc) descq0 = qib_sdma_make_first_desc0(descq0); - if (idx == pkt->naddr - 1) + if (pkt->addr[idx].last_desc) { descq0 = qib_sdma_make_last_desc0(descq0); + if (ppd->sdma_intrequest) { + descq0 |= cpu_to_le64(1ULL << 15); + ppd->sdma_intrequest = 0; + } + } descqp[0] = descq0; descqp[1] = qib_sdma_make_desc1(addr); } -/* pq->lock must be held, get packets on the wire... */ -static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, - struct qib_user_sdma_queue *pq, - struct list_head *pktlist) +void qib_user_sdma_send_desc(struct qib_pportdata *ppd, + struct list_head *pktlist) { struct qib_devdata *dd = ppd->dd; - int ret = 0; - unsigned long flags; - u16 tail; - u8 generation; - u64 descq_added; - - if (list_empty(pktlist)) - return 0; - - if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE))) - return -ECOMM; - - spin_lock_irqsave(&ppd->sdma_lock, flags); + u16 nfree, nsent; + u16 tail, tail_c; + u8 gen, gen_c; - /* keep a copy for restoring purposes in case of problems */ - generation = ppd->sdma_generation; - descq_added = ppd->sdma_descq_added; - - if (unlikely(!__qib_sdma_running(ppd))) { - ret = -ECOMM; - goto unlock; - } + nfree = qib_sdma_descq_freecnt(ppd); + if (!nfree) + return; - tail = ppd->sdma_descq_tail; +retry: + nsent = 0; + tail_c = tail = ppd->sdma_descq_tail; + gen_c = gen = ppd->sdma_generation; while (!list_empty(pktlist)) { struct qib_user_sdma_pkt *pkt = list_entry(pktlist->next, struct qib_user_sdma_pkt, list); - int i; + int i, j, c = 0; unsigned ofs = 0; u16 dtail = tail; - if (pkt->naddr > qib_sdma_descq_freecnt(ppd)) - goto unlock_check_tail; - - for (i = 0; i < pkt->naddr; i++) { - qib_user_sdma_send_frag(ppd, pkt, i, ofs, tail); + for (i = pkt->index; i < pkt->naddr && nfree; i++) { + qib_user_sdma_send_frag(ppd, pkt, i, ofs, tail, gen); ofs += pkt->addr[i].length >> 2; if (++tail == ppd->sdma_descq_cnt) { tail = 0; - ++ppd->sdma_generation; + ++gen; + ppd->sdma_intrequest = 1; + } else if (tail == (ppd->sdma_descq_cnt>>1)) { + ppd->sdma_intrequest = 1; } - } - - if ((ofs << 2) > ppd->ibmaxlen) { - ret = -EMSGSIZE; - goto unlock; - } + nfree--; + if (pkt->addr[i].last_desc == 0) + continue; - /* - * If the packet is >= 2KB mtu equivalent, we have to use - * the large buffers, and have to mark each descriptor as - * part of a large buffer packet. - */ - if (ofs > dd->piosize2kmax_dwords) { - for (i = 0; i < pkt->naddr; i++) { - ppd->sdma_descq[dtail].qw[0] |= - cpu_to_le64(1ULL << 14); - if (++dtail == ppd->sdma_descq_cnt) - dtail = 0; + /* + * If the packet is >= 2KB mtu equivalent, we + * have to use the large buffers, and have to + * mark each descriptor as part of a large + * buffer packet. + */ + if (ofs > dd->piosize2kmax_dwords) { + for (j = pkt->index; j <= i; j++) { + ppd->sdma_descq[dtail].qw[0] |= + cpu_to_le64(1ULL << 14); + if (++dtail == ppd->sdma_descq_cnt) + dtail = 0; + } } + c += i + 1 - pkt->index; + pkt->index = i + 1; /* index for next first */ + tail_c = dtail = tail; + gen_c = gen; + ofs = 0; /* reset for next packet */ } - ppd->sdma_descq_added += pkt->naddr; - pkt->added = ppd->sdma_descq_added; - list_move_tail(&pkt->list, &pq->sent); - ret++; + ppd->sdma_descq_added += c; + nsent += c; + if (pkt->index == pkt->naddr) { + pkt->added = ppd->sdma_descq_added; + pkt->pq->added = pkt->added; + pkt->pq->num_pending--; + spin_lock(&pkt->pq->sent_lock); + pkt->pq->num_sending++; + list_move_tail(&pkt->list, &pkt->pq->sent); + spin_unlock(&pkt->pq->sent_lock); + } + if (!nfree || (nsent<<2) > ppd->sdma_descq_cnt) + break; } -unlock_check_tail: /* advance the tail on the chip if necessary */ - if (ppd->sdma_descq_tail != tail) - dd->f_sdma_update_tail(ppd, tail); + if (ppd->sdma_descq_tail != tail_c) { + ppd->sdma_generation = gen_c; + dd->f_sdma_update_tail(ppd, tail_c); + } + + if (nfree && !list_empty(pktlist)) + goto retry; + + return; +} + +/* pq->lock must be held, get packets on the wire... */ +static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq, + struct list_head *pktlist, int count) +{ + unsigned long flags; + + if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE))) + return -ECOMM; -unlock: - if (unlikely(ret < 0)) { - ppd->sdma_generation = generation; - ppd->sdma_descq_added = descq_added; + /* non-blocking mode */ + if (pq->sdma_rb_node->refcount > 1) { + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (unlikely(!__qib_sdma_running(ppd))) { + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return -ECOMM; + } + pq->num_pending += count; + list_splice_tail_init(pktlist, &ppd->sdma_userpending); + qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return 0; } - spin_unlock_irqrestore(&ppd->sdma_lock, flags); - return ret; + /* In this case, descriptors from this process are not + * linked to ppd pending queue, interrupt handler + * won't update this process, it is OK to directly + * modify without sdma lock. + */ + + + pq->num_pending += count; + /* + * Blocking mode for single rail process, we must + * release/regain sdma_lock to give other process + * chance to make progress. This is important for + * performance. + */ + do { + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (unlikely(!__qib_sdma_running(ppd))) { + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return -ECOMM; + } + qib_user_sdma_send_desc(ppd, pktlist); + if (!list_empty(pktlist)) + qib_sdma_make_progress(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } while (!list_empty(pktlist)); + + return 0; } int qib_user_sdma_writev(struct qib_ctxtdata *rcd, @@ -822,19 +1392,20 @@ int qib_user_sdma_writev(struct qib_ctxtdata *rcd, if (!qib_sdma_running(ppd)) goto done_unlock; - if (ppd->sdma_descq_added != ppd->sdma_descq_removed) { + /* if I have packets not complete yet */ + if (pq->added > ppd->sdma_descq_removed) qib_user_sdma_hwqueue_clean(ppd); + /* if I have complete packets to be freed */ + if (pq->num_sending) qib_user_sdma_queue_clean(ppd, pq); - } while (dim) { - const int mxp = 8; + int mxp = 1; + int ndesc = 0; - down_write(¤t->mm->mmap_sem); - ret = qib_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp); - up_write(¤t->mm->mmap_sem); - - if (ret <= 0) + ret = qib_user_sdma_queue_pkts(dd, ppd, pq, + iov, dim, &list, &mxp, &ndesc); + if (ret < 0) goto done_unlock; else { dim -= ret; @@ -844,24 +1415,20 @@ int qib_user_sdma_writev(struct qib_ctxtdata *rcd, /* force packets onto the sdma hw queue... */ if (!list_empty(&list)) { /* - * Lazily clean hw queue. the 4 is a guess of about - * how many sdma descriptors a packet will take (it - * doesn't have to be perfect). + * Lazily clean hw queue. */ - if (qib_sdma_descq_freecnt(ppd) < ret * 4) { + if (qib_sdma_descq_freecnt(ppd) < ndesc) { qib_user_sdma_hwqueue_clean(ppd); - qib_user_sdma_queue_clean(ppd, pq); + if (pq->num_sending) + qib_user_sdma_queue_clean(ppd, pq); } - ret = qib_user_sdma_push_pkts(ppd, pq, &list); + ret = qib_user_sdma_push_pkts(ppd, pq, &list, mxp); if (ret < 0) goto done_unlock; else { - npkts += ret; - pq->counter += ret; - - if (!list_empty(&list)) - goto done_unlock; + npkts += mxp; + pq->counter += mxp; } } } diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ba51a4715a1..9bcfbd84298 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -645,9 +645,11 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) } else goto drop; - opcode = be32_to_cpu(ohdr->bth[0]) >> 24; - ibp->opstats[opcode & 0x7f].n_bytes += tlen; - ibp->opstats[opcode & 0x7f].n_packets++; + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f; +#ifdef CONFIG_DEBUG_FS + rcd->opstats->stats[opcode].n_bytes += tlen; + rcd->opstats->stats[opcode].n_packets++; +#endif /* Get the destination QP number. */ qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK; @@ -660,7 +662,7 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) mcast = qib_mcast_find(ibp, &hdr->u.l.grh.dgid); if (mcast == NULL) goto drop; - ibp->n_multicast_rcv++; + this_cpu_inc(ibp->pmastats->n_multicast_rcv); list_for_each_entry_rcu(p, &mcast->qp_list, list) qib_qp_rcv(rcd, hdr, 1, data, tlen, p->qp); /* @@ -676,8 +678,8 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) &rcd->lookaside_qp->refcount)) wake_up( &rcd->lookaside_qp->wait); - rcd->lookaside_qp = NULL; - } + rcd->lookaside_qp = NULL; + } } if (!rcd->lookaside_qp) { qp = qib_lookup_qpn(ibp, qp_num); @@ -687,7 +689,7 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) rcd->lookaside_qpn = qp_num; } else qp = rcd->lookaside_qp; - ibp->n_unicast_rcv++; + this_cpu_inc(ibp->pmastats->n_unicast_rcv); qib_qp_rcv(rcd, hdr, lnh == QIB_LRH_GRH, data, tlen, qp); } return; @@ -2224,7 +2226,7 @@ int qib_register_ib_device(struct qib_devdata *dd) ibdev->dma_ops = &qib_dma_mapping_ops; snprintf(ibdev->node_desc, sizeof(ibdev->node_desc), - "QLogic Infiniband HCA %s", init_utsname()->nodename); + "Intel Infiniband HCA %s", init_utsname()->nodename); ret = ib_register_device(ibdev, qib_create_port_files); if (ret) @@ -2234,7 +2236,8 @@ int qib_register_ib_device(struct qib_devdata *dd) if (ret) goto err_agents; - if (qib_verbs_register_sysfs(dd)) + ret = qib_verbs_register_sysfs(dd); + if (ret) goto err_class; goto bail; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index aff8b2c1788..bfc8948fdd3 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -41,6 +41,7 @@ #include <linux/interrupt.h> #include <linux/kref.h> #include <linux/workqueue.h> +#include <linux/kthread.h> #include <linux/completion.h> #include <rdma/ib_pack.h> #include <rdma/ib_user_verbs.h> @@ -149,14 +150,14 @@ struct ib_reth { __be64 vaddr; __be32 rkey; __be32 length; -} __attribute__ ((packed)); +} __packed; struct ib_atomic_eth { __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ __be32 rkey; __be64 swap_data; __be64 compare_data; -} __attribute__ ((packed)); +} __packed; struct qib_other_headers { __be32 bth[3]; @@ -177,7 +178,7 @@ struct qib_other_headers { __be32 aeth; struct ib_atomic_eth atomic_eth; } u; -} __attribute__ ((packed)); +} __packed; /* * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes @@ -194,12 +195,12 @@ struct qib_ib_header { } l; struct qib_other_headers oth; } u; -} __attribute__ ((packed)); +} __packed; struct qib_pio_header { __le32 pbc[2]; struct qib_ib_header hdr; -} __attribute__ ((packed)); +} __packed; /* * There is one struct qib_mcast for each multicast GID. @@ -267,7 +268,8 @@ struct qib_cq_wc { */ struct qib_cq { struct ib_cq ibcq; - struct work_struct comptask; + struct kthread_work comptask; + struct qib_devdata *dd; spinlock_t lock; /* protect changes in this struct */ u8 notify; u8 triggered; @@ -658,6 +660,17 @@ struct qib_opcode_stats { u64 n_bytes; /* total number of bytes */ }; +struct qib_opcode_stats_perctx { + struct qib_opcode_stats stats[128]; +}; + +struct qib_pma_counters { + u64 n_unicast_xmit; /* total unicast packets sent */ + u64 n_unicast_rcv; /* total unicast packets received */ + u64 n_multicast_xmit; /* total multicast packets sent */ + u64 n_multicast_rcv; /* total multicast packets received */ +}; + struct qib_ibport { struct qib_qp __rcu *qp0; struct qib_qp __rcu *qp1; @@ -674,10 +687,11 @@ struct qib_ibport { __be64 mkey; __be64 guids[QIB_GUIDS_PER_PORT - 1]; /* writable GUIDs */ u64 tid; /* TID for traps */ - u64 n_unicast_xmit; /* total unicast packets sent */ - u64 n_unicast_rcv; /* total unicast packets received */ - u64 n_multicast_xmit; /* total multicast packets sent */ - u64 n_multicast_rcv; /* total multicast packets received */ + struct qib_pma_counters __percpu *pmastats; + u64 z_unicast_xmit; /* starting count for PMA */ + u64 z_unicast_rcv; /* starting count for PMA */ + u64 z_multicast_xmit; /* starting count for PMA */ + u64 z_multicast_rcv; /* starting count for PMA */ u64 z_symbol_error_counter; /* starting count for PMA */ u64 z_link_error_recovery_counter; /* starting count for PMA */ u64 z_link_downed_counter; /* starting count for PMA */ @@ -724,7 +738,6 @@ struct qib_ibport { u8 vl_high_limit; u8 sl_to_vl[16]; - struct qib_opcode_stats opstats[128]; }; @@ -768,6 +781,10 @@ struct qib_ibdev { spinlock_t n_srqs_lock; u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ spinlock_t n_mcast_grps_lock; +#ifdef CONFIG_DEBUG_FS + /* per HCA debugfs */ + struct dentry *qib_ibdev_dbg; +#endif }; struct qib_verbs_counters { @@ -832,8 +849,6 @@ static inline int qib_send_ok(struct qib_qp *qp) !(qp->s_flags & QIB_S_ANY_WAIT_SEND)); } -extern struct workqueue_struct *qib_cq_wq; - /* * This must be called with s_lock held. */ @@ -910,6 +925,18 @@ void qib_init_qpn_table(struct qib_devdata *dd, struct qib_qpn_table *qpt); void qib_free_qpn_table(struct qib_qpn_table *qpt); +#ifdef CONFIG_DEBUG_FS + +struct qib_qp_iter; + +struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev); + +int qib_qp_iter_next(struct qib_qp_iter *iter); + +void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter); + +#endif + void qib_get_credit(struct qib_qp *qp, u32 aeth); unsigned qib_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult); @@ -972,6 +999,10 @@ int qib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); int qib_destroy_srq(struct ib_srq *ibsrq); +int qib_cq_init(struct qib_devdata *dd); + +void qib_cq_exit(struct qib_devdata *dd); + void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int sig); int qib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); diff --git a/drivers/infiniband/hw/usnic/Kconfig b/drivers/infiniband/hw/usnic/Kconfig new file mode 100644 index 00000000000..29ab11c34f3 --- /dev/null +++ b/drivers/infiniband/hw/usnic/Kconfig @@ -0,0 +1,10 @@ +config INFINIBAND_USNIC + tristate "Verbs support for Cisco VIC" + depends on NETDEVICES && ETHERNET && INET && PCI && INTEL_IOMMU + select ENIC + select NET_VENDOR_CISCO + select PCI_IOV + select INFINIBAND_USER_ACCESS + ---help--- + This is a low-level driver for Cisco's Virtual Interface + Cards (VICs), including the VIC 1240 and 1280 cards. diff --git a/drivers/infiniband/hw/usnic/Makefile b/drivers/infiniband/hw/usnic/Makefile new file mode 100644 index 00000000000..99fb2db47cd --- /dev/null +++ b/drivers/infiniband/hw/usnic/Makefile @@ -0,0 +1,15 @@ +ccflags-y := -Idrivers/net/ethernet/cisco/enic + +obj-$(CONFIG_INFINIBAND_USNIC)+= usnic_verbs.o + +usnic_verbs-y=\ +usnic_fwd.o \ +usnic_transport.o \ +usnic_uiom.o \ +usnic_uiom_interval_tree.o \ +usnic_vnic.o \ +usnic_ib_main.o \ +usnic_ib_qp_grp.o \ +usnic_ib_sysfs.o \ +usnic_ib_verbs.o \ +usnic_debugfs.o \ diff --git a/drivers/infiniband/hw/usnic/usnic.h b/drivers/infiniband/hw/usnic/usnic.h new file mode 100644 index 00000000000..5be13d8991b --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_H_ +#define USNIC_H_ + +#define DRV_NAME "usnic_verbs" + +#define PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC 0x00cf /* User space NIC */ + +#define DRV_VERSION "1.0.3" +#define DRV_RELDATE "December 19, 2013" + +#endif /* USNIC_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_abi.h b/drivers/infiniband/hw/usnic/usnic_abi.h new file mode 100644 index 00000000000..04a66229584 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_abi.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#ifndef USNIC_ABI_H +#define USNIC_ABI_H + +/* ABI between userspace and kernel */ +#define USNIC_UVERBS_ABI_VERSION 4 + +#define USNIC_QP_GRP_MAX_WQS 8 +#define USNIC_QP_GRP_MAX_RQS 8 +#define USNIC_QP_GRP_MAX_CQS 16 + +enum usnic_transport_type { + USNIC_TRANSPORT_UNKNOWN = 0, + USNIC_TRANSPORT_ROCE_CUSTOM = 1, + USNIC_TRANSPORT_IPV4_UDP = 2, + USNIC_TRANSPORT_MAX = 3, +}; + +struct usnic_transport_spec { + enum usnic_transport_type trans_type; + union { + struct { + uint16_t port_num; + } usnic_roce; + struct { + uint32_t sock_fd; + } udp; + }; +}; + +struct usnic_ib_create_qp_cmd { + struct usnic_transport_spec spec; +}; + +/*TODO: Future - usnic_modify_qp needs to pass in generic filters */ +struct usnic_ib_create_qp_resp { + u32 vfid; + u32 qp_grp_id; + u64 bar_bus_addr; + u32 bar_len; +/* + * WQ, RQ, CQ are explicity specified bc exposing a generic resources inteface + * expands the scope of ABI to many files. + */ + u32 wq_cnt; + u32 rq_cnt; + u32 cq_cnt; + u32 wq_idx[USNIC_QP_GRP_MAX_WQS]; + u32 rq_idx[USNIC_QP_GRP_MAX_RQS]; + u32 cq_idx[USNIC_QP_GRP_MAX_CQS]; + u32 transport; + u32 reserved[9]; +}; + +#endif /* USNIC_ABI_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h new file mode 100644 index 00000000000..39356726614 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_CMN_PKT_HDR_H +#define USNIC_CMN_PKT_HDR_H + +#define USNIC_ROCE_ETHERTYPE (0x8915) +#define USNIC_ROCE_GRH_VER (8) +#define USNIC_PROTO_VER (1) +#define USNIC_ROCE_GRH_VER_SHIFT (4) + +#endif /* USNIC_COMMON_PKT_HDR_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_common_util.h b/drivers/infiniband/hw/usnic/usnic_common_util.h new file mode 100644 index 00000000000..9d737ed5e55 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_common_util.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_CMN_UTIL_H +#define USNIC_CMN_UTIL_H + +static inline void +usnic_mac_to_gid(const char *const mac, char *raw_gid) +{ + raw_gid[0] = 0xfe; + raw_gid[1] = 0x80; + memset(&raw_gid[2], 0, 6); + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +static inline void +usnic_mac_ip_to_gid(const char *const mac, const __be32 inaddr, char *raw_gid) +{ + raw_gid[0] = 0xfe; + raw_gid[1] = 0x80; + memset(&raw_gid[2], 0, 2); + memcpy(&raw_gid[4], &inaddr, 4); + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +static inline void +usnic_write_gid_if_id_from_mac(char *mac, char *raw_gid) +{ + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +#endif /* USNIC_COMMON_UTIL_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c new file mode 100644 index 00000000000..5d13860161a --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/debugfs.h> +#include <linux/module.h> + +#include "usnic.h" +#include "usnic_log.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_transport.h" + +static struct dentry *debugfs_root; +static struct dentry *flows_dentry; + +static ssize_t usnic_debugfs_buildinfo_read(struct file *f, char __user *data, + size_t count, loff_t *ppos) +{ + char buf[500]; + int res; + + if (*ppos > 0) + return 0; + + res = scnprintf(buf, sizeof(buf), + "version: %s\n" + "build date: %s\n", + DRV_VERSION, DRV_RELDATE); + + return simple_read_from_buffer(data, count, ppos, buf, res); +} + +static const struct file_operations usnic_debugfs_buildinfo_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = usnic_debugfs_buildinfo_read +}; + +static ssize_t flowinfo_read(struct file *f, char __user *data, + size_t count, loff_t *ppos) +{ + struct usnic_ib_qp_grp_flow *qp_flow; + int n; + int left; + char *ptr; + char buf[512]; + + qp_flow = f->private_data; + ptr = buf; + left = count; + + if (*ppos > 0) + return 0; + + spin_lock(&qp_flow->qp_grp->lock); + n = scnprintf(ptr, left, + "QP Grp ID: %d Transport: %s ", + qp_flow->qp_grp->grp_id, + usnic_transport_to_str(qp_flow->trans_type)); + UPDATE_PTR_LEFT(n, ptr, left); + if (qp_flow->trans_type == USNIC_TRANSPORT_ROCE_CUSTOM) { + n = scnprintf(ptr, left, "Port_Num:%hu\n", + qp_flow->usnic_roce.port_num); + UPDATE_PTR_LEFT(n, ptr, left); + } else if (qp_flow->trans_type == USNIC_TRANSPORT_IPV4_UDP) { + n = usnic_transport_sock_to_str(ptr, left, + qp_flow->udp.sock); + UPDATE_PTR_LEFT(n, ptr, left); + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + } + spin_unlock(&qp_flow->qp_grp->lock); + + return simple_read_from_buffer(data, count, ppos, buf, ptr - buf); +} + +static const struct file_operations flowinfo_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = flowinfo_read, +}; + +void usnic_debugfs_init(void) +{ + debugfs_root = debugfs_create_dir(DRV_NAME, NULL); + if (IS_ERR(debugfs_root)) { + usnic_err("Failed to create debugfs root dir, check if debugfs is enabled in kernel configuration\n"); + goto out_clear_root; + } + + flows_dentry = debugfs_create_dir("flows", debugfs_root); + if (IS_ERR_OR_NULL(flows_dentry)) { + usnic_err("Failed to create debugfs flow dir with err %ld\n", + PTR_ERR(flows_dentry)); + goto out_free_root; + } + + debugfs_create_file("build-info", S_IRUGO, debugfs_root, + NULL, &usnic_debugfs_buildinfo_ops); + return; + +out_free_root: + debugfs_remove_recursive(debugfs_root); +out_clear_root: + debugfs_root = NULL; +} + +void usnic_debugfs_exit(void) +{ + if (!debugfs_root) + return; + + debugfs_remove_recursive(debugfs_root); + debugfs_root = NULL; +} + +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow) +{ + if (IS_ERR_OR_NULL(flows_dentry)) + return; + + scnprintf(qp_flow->dentry_name, sizeof(qp_flow->dentry_name), + "%u", qp_flow->flow->flow_id); + qp_flow->dbgfs_dentry = debugfs_create_file(qp_flow->dentry_name, + S_IRUGO, + flows_dentry, + qp_flow, + &flowinfo_ops); + if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) { + usnic_err("Failed to create dbg fs entry for flow %u\n", + qp_flow->flow->flow_id); + } +} + +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow) +{ + if (!IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) + debugfs_remove(qp_flow->dbgfs_dentry); +} diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.h b/drivers/infiniband/hw/usnic/usnic_debugfs.h new file mode 100644 index 00000000000..4087d24a88f --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#ifndef USNIC_DEBUGFS_H_ +#define USNIC_DEBUGFS_H_ + +#include "usnic_ib_qp_grp.h" + +void usnic_debugfs_init(void); + +void usnic_debugfs_exit(void); +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow); +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow); + +#endif /*!USNIC_DEBUGFS_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.c b/drivers/infiniband/hw/usnic/usnic_fwd.c new file mode 100644 index 00000000000..e3c9bd9d3ba --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_fwd.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/netdevice.h> +#include <linux/pci.h> + +#include "enic_api.h" +#include "usnic_common_pkt_hdr.h" +#include "usnic_fwd.h" +#include "usnic_log.h" + +static int usnic_fwd_devcmd_locked(struct usnic_fwd_dev *ufdev, int vnic_idx, + enum vnic_devcmd_cmd cmd, u64 *a0, + u64 *a1) +{ + int status; + struct net_device *netdev = ufdev->netdev; + + lockdep_assert_held(&ufdev->lock); + + status = enic_api_devcmd_proxy_by_index(netdev, + vnic_idx, + cmd, + a0, a1, + 1000); + if (status) { + if (status == ERR_EINVAL && cmd == CMD_DEL_FILTER) { + usnic_dbg("Dev %s vnic idx %u cmd %u already deleted", + ufdev->name, vnic_idx, cmd); + } else { + usnic_err("Dev %s vnic idx %u cmd %u failed with status %d\n", + ufdev->name, vnic_idx, cmd, + status); + } + } else { + usnic_dbg("Dev %s vnic idx %u cmd %u success", + ufdev->name, vnic_idx, cmd); + } + + return status; +} + +static int usnic_fwd_devcmd(struct usnic_fwd_dev *ufdev, int vnic_idx, + enum vnic_devcmd_cmd cmd, u64 *a0, u64 *a1) +{ + int status; + + spin_lock(&ufdev->lock); + status = usnic_fwd_devcmd_locked(ufdev, vnic_idx, cmd, a0, a1); + spin_unlock(&ufdev->lock); + + return status; +} + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev) +{ + struct usnic_fwd_dev *ufdev; + + ufdev = kzalloc(sizeof(*ufdev), GFP_KERNEL); + if (!ufdev) + return NULL; + + ufdev->pdev = pdev; + ufdev->netdev = pci_get_drvdata(pdev); + spin_lock_init(&ufdev->lock); + strncpy(ufdev->name, netdev_name(ufdev->netdev), + sizeof(ufdev->name) - 1); + + return ufdev; +} + +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev) +{ + kfree(ufdev); +} + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]) +{ + spin_lock(&ufdev->lock); + memcpy(&ufdev->mac, mac, sizeof(ufdev->mac)); + spin_unlock(&ufdev->lock); +} + +int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr) +{ + int status; + + spin_lock(&ufdev->lock); + if (ufdev->inaddr == 0) { + ufdev->inaddr = inaddr; + status = 0; + } else { + status = -EFAULT; + } + spin_unlock(&ufdev->lock); + + return status; +} + +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->inaddr = 0; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->link_up = 1; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->link_up = 0; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu) +{ + spin_lock(&ufdev->lock); + ufdev->mtu = mtu; + spin_unlock(&ufdev->lock); +} + +static int usnic_fwd_dev_ready_locked(struct usnic_fwd_dev *ufdev) +{ + lockdep_assert_held(&ufdev->lock); + + if (!ufdev->link_up) + return -EPERM; + + return 0; +} + +static int validate_filter_locked(struct usnic_fwd_dev *ufdev, + struct filter *filter) +{ + + lockdep_assert_held(&ufdev->lock); + + if (filter->type == FILTER_IPV4_5TUPLE) { + if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_AD)) + return -EACCES; + if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_PT)) + return -EBUSY; + else if (ufdev->inaddr == 0) + return -EINVAL; + else if (filter->u.ipv4.dst_port == 0) + return -ERANGE; + else if (ntohl(ufdev->inaddr) != filter->u.ipv4.dst_addr) + return -EFAULT; + else + return 0; + } + + return 0; +} + +static void fill_tlv(struct filter_tlv *tlv, struct filter *filter, + struct filter_action *action) +{ + tlv->type = CLSF_TLV_FILTER; + tlv->length = sizeof(struct filter); + *((struct filter *)&tlv->val) = *filter; + + tlv = (struct filter_tlv *)((char *)tlv + sizeof(struct filter_tlv) + + sizeof(struct filter)); + tlv->type = CLSF_TLV_ACTION; + tlv->length = sizeof(struct filter_action); + *((struct filter_action *)&tlv->val) = *action; +} + +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, + struct usnic_filter_action *uaction) +{ + struct filter_tlv *tlv; + struct pci_dev *pdev; + struct usnic_fwd_flow *flow; + uint64_t a0, a1; + uint64_t tlv_size; + dma_addr_t tlv_pa; + int status; + + pdev = ufdev->pdev; + tlv_size = (2*sizeof(struct filter_tlv) + sizeof(struct filter) + + sizeof(struct filter_action)); + + flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + if (!flow) + return ERR_PTR(-ENOMEM); + + tlv = pci_alloc_consistent(pdev, tlv_size, &tlv_pa); + if (!tlv) { + usnic_err("Failed to allocate memory\n"); + status = -ENOMEM; + goto out_free_flow; + } + + fill_tlv(tlv, filter, &uaction->action); + + spin_lock(&ufdev->lock); + status = usnic_fwd_dev_ready_locked(ufdev); + if (status) { + usnic_err("Forwarding dev %s not ready with status %d\n", + ufdev->name, status); + goto out_free_tlv; + } + + status = validate_filter_locked(ufdev, filter); + if (status) { + usnic_err("Failed to validate filter with status %d\n", + status); + goto out_free_tlv; + } + + /* Issue Devcmd */ + a0 = tlv_pa; + a1 = tlv_size; + status = usnic_fwd_devcmd_locked(ufdev, uaction->vnic_idx, + CMD_ADD_FILTER, &a0, &a1); + if (status) { + usnic_err("VF %s Filter add failed with status:%d", + ufdev->name, status); + status = -EFAULT; + goto out_free_tlv; + } else { + usnic_dbg("VF %s FILTER ID:%llu", ufdev->name, a0); + } + + flow->flow_id = (uint32_t) a0; + flow->vnic_idx = uaction->vnic_idx; + flow->ufdev = ufdev; + +out_free_tlv: + spin_unlock(&ufdev->lock); + pci_free_consistent(pdev, tlv_size, tlv, tlv_pa); + if (!status) + return flow; +out_free_flow: + kfree(flow); + return ERR_PTR(status); +} + +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow) +{ + int status; + u64 a0, a1; + + a0 = flow->flow_id; + + status = usnic_fwd_devcmd(flow->ufdev, flow->vnic_idx, + CMD_DEL_FILTER, &a0, &a1); + if (status) { + if (status == ERR_EINVAL) { + usnic_dbg("Filter %u already deleted for VF Idx %u pf: %s status: %d", + flow->flow_id, flow->vnic_idx, + flow->ufdev->name, status); + } else { + usnic_err("PF %s VF Idx %u Filter: %u FILTER DELETE failed with status %d", + flow->ufdev->name, flow->vnic_idx, + flow->flow_id, status); + } + status = 0; + /* + * Log the error and fake success to the caller because if + * a flow fails to be deleted in the firmware, it is an + * unrecoverable error. + */ + } else { + usnic_dbg("PF %s VF Idx %u Filter: %u FILTER DELETED", + flow->ufdev->name, flow->vnic_idx, + flow->flow_id); + } + + kfree(flow); + return status; +} + +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ + int status; + struct net_device *pf_netdev; + u64 a0, a1; + + pf_netdev = ufdev->netdev; + a0 = qp_idx; + a1 = CMD_QP_RQWQ; + + status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_ENABLE, + &a0, &a1); + if (status) { + usnic_err("PF %s VNIC Index %u RQ Index: %u ENABLE Failed with status %d", + netdev_name(pf_netdev), + vnic_idx, + qp_idx, + status); + } else { + usnic_dbg("PF %s VNIC Index %u RQ Index: %u ENABLED", + netdev_name(pf_netdev), + vnic_idx, qp_idx); + } + + return status; +} + +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ + int status; + u64 a0, a1; + struct net_device *pf_netdev; + + pf_netdev = ufdev->netdev; + a0 = qp_idx; + a1 = CMD_QP_RQWQ; + + status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_DISABLE, + &a0, &a1); + if (status) { + usnic_err("PF %s VNIC Index %u RQ Index: %u DISABLE Failed with status %d", + netdev_name(pf_netdev), + vnic_idx, + qp_idx, + status); + } else { + usnic_dbg("PF %s VNIC Index %u RQ Index: %u DISABLED", + netdev_name(pf_netdev), + vnic_idx, + qp_idx); + } + + return status; +} diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.h b/drivers/infiniband/hw/usnic/usnic_fwd.h new file mode 100644 index 00000000000..93713a2230b --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_fwd.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_FWD_H_ +#define USNIC_FWD_H_ + +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/pci.h> +#include <linux/in.h> + +#include "usnic_abi.h" +#include "usnic_common_pkt_hdr.h" +#include "vnic_devcmd.h" + +struct usnic_fwd_dev { + struct pci_dev *pdev; + struct net_device *netdev; + spinlock_t lock; + /* + * The following fields can be read directly off the device. + * However, they should be set by a accessor function, except name, + * which cannot be changed. + */ + bool link_up; + char mac[ETH_ALEN]; + unsigned int mtu; + __be32 inaddr; + char name[IFNAMSIZ+1]; +}; + +struct usnic_fwd_flow { + uint32_t flow_id; + struct usnic_fwd_dev *ufdev; + unsigned int vnic_idx; +}; + +struct usnic_filter_action { + int vnic_idx; + struct filter_action action; +}; + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev); +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev); + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]); +int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr); +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev); +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu); + +/* + * Allocate a flow on this forwarding device. Whoever calls this function, + * must monitor netdev events on ufdev's netdevice. If NETDEV_REBOOT or + * NETDEV_DOWN is seen, flow will no longer function and must be + * immediately freed by calling usnic_dealloc_flow. + */ +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, + struct usnic_filter_action *action); +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow); +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); + +static inline void usnic_fwd_init_usnic_filter(struct filter *filter, + uint32_t usnic_id) +{ + filter->type = FILTER_USNIC_ID; + filter->u.usnic.ethtype = USNIC_ROCE_ETHERTYPE; + filter->u.usnic.flags = FILTER_FIELD_USNIC_ETHTYPE | + FILTER_FIELD_USNIC_ID | + FILTER_FIELD_USNIC_PROTO; + filter->u.usnic.proto_version = (USNIC_ROCE_GRH_VER << + USNIC_ROCE_GRH_VER_SHIFT) | + USNIC_PROTO_VER; + filter->u.usnic.usnic_id = usnic_id; +} + +static inline void usnic_fwd_init_udp_filter(struct filter *filter, + uint32_t daddr, uint16_t dport) +{ + filter->type = FILTER_IPV4_5TUPLE; + filter->u.ipv4.flags = FILTER_FIELD_5TUP_PROTO; + filter->u.ipv4.protocol = PROTO_UDP; + + if (daddr) { + filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_AD; + filter->u.ipv4.dst_addr = daddr; + } + + if (dport) { + filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_PT; + filter->u.ipv4.dst_port = dport; + } +} + +#endif /* !USNIC_FWD_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h new file mode 100644 index 00000000000..e5a9297dd1b --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_H_ +#define USNIC_IB_H_ + +#include <linux/iommu.h> +#include <linux/netdevice.h> + +#include <rdma/ib_verbs.h> + + +#include "usnic.h" +#include "usnic_abi.h" +#include "usnic_vnic.h" + +#define USNIC_IB_PORT_CNT 1 +#define USNIC_IB_NUM_COMP_VECTORS 1 + +extern unsigned int usnic_ib_share_vf; + +struct usnic_ib_ucontext { + struct ib_ucontext ibucontext; + /* Protected by usnic_ib_dev->usdev_lock */ + struct list_head qp_grp_list; + struct list_head link; +}; + +struct usnic_ib_pd { + struct ib_pd ibpd; + struct usnic_uiom_pd *umem_pd; +}; + +struct usnic_ib_mr { + struct ib_mr ibmr; + struct usnic_uiom_reg *umem; +}; + +struct usnic_ib_dev { + struct ib_device ib_dev; + struct pci_dev *pdev; + struct net_device *netdev; + struct usnic_fwd_dev *ufdev; + struct list_head ib_dev_link; + struct list_head vf_dev_list; + struct list_head ctx_list; + struct mutex usdev_lock; + + /* provisioning information */ + struct kref vf_cnt; + unsigned int vf_res_cnt[USNIC_VNIC_RES_TYPE_MAX]; + + /* sysfs vars for QPN reporting */ + struct kobject *qpn_kobj; +}; + +struct usnic_ib_vf { + struct usnic_ib_dev *pf; + spinlock_t lock; + struct usnic_vnic *vnic; + unsigned int qp_grp_ref_cnt; + struct usnic_ib_pd *pd; + struct list_head link; +}; + +static inline +struct usnic_ib_dev *to_usdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct usnic_ib_dev, ib_dev); +} + +static inline +struct usnic_ib_ucontext *to_ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_pd *to_upd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct usnic_ib_pd, ibpd); +} + +static inline +struct usnic_ib_ucontext *to_uucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_mr *to_umr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct usnic_ib_mr, ibmr); +} +void usnic_ib_log_vf(struct usnic_ib_vf *vf); + +#define UPDATE_PTR_LEFT(N, P, L) \ +do { \ + L -= (N); \ + P += (N); \ +} while (0) + +#endif /* USNIC_IB_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c new file mode 100644 index 00000000000..fb6d026f92c --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -0,0 +1,682 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Author: Upinder Malhi <umalhi@cisco.com> + * Author: Anant Deepak <anadeepa@cisco.com> + * Author: Cesare Cantu' <cantuc@cisco.com> + * Author: Jeff Squyres <jsquyres@cisco.com> + * Author: Kiran Thirumalai <kithirum@cisco.com> + * Author: Xuyang Wang <xuywang@cisco.com> + * Author: Reese Faucette <rfaucett@cisco.com> + * + */ + +#include <linux/module.h> +#include <linux/inetdevice.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/errno.h> +#include <linux/pci.h> +#include <linux/netdevice.h> + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> + +#include "usnic_abi.h" +#include "usnic_common_util.h" +#include "usnic_ib.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_log.h" +#include "usnic_fwd.h" +#include "usnic_debugfs.h" +#include "usnic_ib_verbs.h" +#include "usnic_transport.h" +#include "usnic_uiom.h" +#include "usnic_ib_sysfs.h" + +unsigned int usnic_log_lvl = USNIC_LOG_LVL_ERR; +unsigned int usnic_ib_share_vf = 1; + +static const char usnic_version[] = + DRV_NAME ": Cisco VIC (USNIC) Verbs Driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +static DEFINE_MUTEX(usnic_ib_ibdev_list_lock); +static LIST_HEAD(usnic_ib_ibdev_list); + +/* Callback dump funcs */ +static int usnic_ib_dump_vf_hdr(void *obj, char *buf, int buf_sz) +{ + struct usnic_ib_vf *vf = obj; + return scnprintf(buf, buf_sz, "PF: %s ", vf->pf->ib_dev.name); +} +/* End callback dump funcs */ + +static void usnic_ib_dump_vf(struct usnic_ib_vf *vf, char *buf, int buf_sz) +{ + usnic_vnic_dump(vf->vnic, buf, buf_sz, vf, + usnic_ib_dump_vf_hdr, + usnic_ib_qp_grp_dump_hdr, usnic_ib_qp_grp_dump_rows); +} + +void usnic_ib_log_vf(struct usnic_ib_vf *vf) +{ + char buf[1000]; + usnic_ib_dump_vf(vf, buf, sizeof(buf)); + usnic_dbg("%s\n", buf); +} + +/* Start of netdev section */ +static inline const char *usnic_ib_netdev_event_to_string(unsigned long event) +{ + const char *event2str[] = {"NETDEV_NONE", "NETDEV_UP", "NETDEV_DOWN", + "NETDEV_REBOOT", "NETDEV_CHANGE", + "NETDEV_REGISTER", "NETDEV_UNREGISTER", "NETDEV_CHANGEMTU", + "NETDEV_CHANGEADDR", "NETDEV_GOING_DOWN", "NETDEV_FEAT_CHANGE", + "NETDEV_BONDING_FAILOVER", "NETDEV_PRE_UP", + "NETDEV_PRE_TYPE_CHANGE", "NETDEV_POST_TYPE_CHANGE", + "NETDEV_POST_INT", "NETDEV_UNREGISTER_FINAL", "NETDEV_RELEASE", + "NETDEV_NOTIFY_PEERS", "NETDEV_JOIN" + }; + + if (event >= ARRAY_SIZE(event2str)) + return "UNKNOWN_NETDEV_EVENT"; + else + return event2str[event]; +} + +static void usnic_ib_qp_grp_modify_active_to_err(struct usnic_ib_dev *us_ibdev) +{ + struct usnic_ib_ucontext *ctx; + struct usnic_ib_qp_grp *qp_grp; + enum ib_qp_state cur_state; + int status; + + BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + + list_for_each_entry(ctx, &us_ibdev->ctx_list, link) { + list_for_each_entry(qp_grp, &ctx->qp_grp_list, link) { + cur_state = qp_grp->state; + if (cur_state == IB_QPS_INIT || + cur_state == IB_QPS_RTR || + cur_state == IB_QPS_RTS) { + status = usnic_ib_qp_grp_modify(qp_grp, + IB_QPS_ERR, + NULL); + if (status) { + usnic_err("Failed to transistion qp grp %u from %s to %s\n", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string + (cur_state), + usnic_ib_qp_grp_state_to_string + (IB_QPS_ERR)); + } + } + } + } +} + +static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, + unsigned long event) +{ + struct net_device *netdev; + struct ib_event ib_event; + + memset(&ib_event, 0, sizeof(ib_event)); + + mutex_lock(&us_ibdev->usdev_lock); + netdev = us_ibdev->netdev; + switch (event) { + case NETDEV_REBOOT: + usnic_info("PF Reset on %s\n", us_ibdev->ib_dev.name); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + if (!us_ibdev->ufdev->link_up && + netif_carrier_ok(netdev)) { + usnic_fwd_carrier_up(us_ibdev->ufdev); + usnic_info("Link UP on %s\n", us_ibdev->ib_dev.name); + ib_event.event = IB_EVENT_PORT_ACTIVE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else if (us_ibdev->ufdev->link_up && + !netif_carrier_ok(netdev)) { + usnic_fwd_carrier_down(us_ibdev->ufdev); + usnic_info("Link DOWN on %s\n", us_ibdev->ib_dev.name); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else { + usnic_dbg("Ignoring %s on %s\n", + usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + break; + case NETDEV_CHANGEADDR: + if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr, + sizeof(us_ibdev->ufdev->mac))) { + usnic_dbg("Ignoring addr change on %s\n", + us_ibdev->ib_dev.name); + } else { + usnic_info(" %s old mac: %pM new mac: %pM\n", + us_ibdev->ib_dev.name, + us_ibdev->ufdev->mac, + netdev->dev_addr); + usnic_fwd_set_mac(us_ibdev->ufdev, netdev->dev_addr); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } + + break; + case NETDEV_CHANGEMTU: + if (us_ibdev->ufdev->mtu != netdev->mtu) { + usnic_info("MTU Change on %s old: %u new: %u\n", + us_ibdev->ib_dev.name, + us_ibdev->ufdev->mtu, netdev->mtu); + usnic_fwd_set_mtu(us_ibdev->ufdev, netdev->mtu); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + } else { + usnic_dbg("Ignoring MTU change on %s\n", + us_ibdev->ib_dev.name); + } + break; + default: + usnic_dbg("Ignoring event %s on %s", + usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + mutex_unlock(&us_ibdev->usdev_lock); +} + +static int usnic_ib_netdevice_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct usnic_ib_dev *us_ibdev; + + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->netdev == netdev) { + usnic_ib_handle_usdev_event(us_ibdev, event); + break; + } + } + mutex_unlock(&usnic_ib_ibdev_list_lock); + + return NOTIFY_DONE; +} + +static struct notifier_block usnic_ib_netdevice_notifier = { + .notifier_call = usnic_ib_netdevice_event +}; +/* End of netdev section */ + +/* Start of inet section */ +static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct ib_event ib_event; + + mutex_lock(&us_ibdev->usdev_lock); + + switch (event) { + case NETDEV_DOWN: + usnic_info("%s via ip notifiers", + usnic_ib_netdev_event_to_string(event)); + usnic_fwd_del_ipaddr(us_ibdev->ufdev); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + case NETDEV_UP: + usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address); + usnic_info("%s via ip notifiers: ip %pI4", + usnic_ib_netdev_event_to_string(event), + &us_ibdev->ufdev->inaddr); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + default: + usnic_info("Ignoring event %s on %s", + usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + mutex_unlock(&us_ibdev->usdev_lock); + + return NOTIFY_DONE; +} + +static int usnic_ib_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct usnic_ib_dev *us_ibdev; + struct in_ifaddr *ifa = ptr; + struct net_device *netdev = ifa->ifa_dev->dev; + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->netdev == netdev) { + usnic_ib_handle_inet_event(us_ibdev, event, ptr); + break; + } + } + mutex_unlock(&usnic_ib_ibdev_list_lock); + + return NOTIFY_DONE; +} +static struct notifier_block usnic_ib_inetaddr_notifier = { + .notifier_call = usnic_ib_inetaddr_event +}; +/* End of inet section*/ + +/* Start of PF discovery section */ +static void *usnic_ib_device_add(struct pci_dev *dev) +{ + struct usnic_ib_dev *us_ibdev; + union ib_gid gid; + struct in_ifaddr *in; + struct net_device *netdev; + + usnic_dbg("\n"); + netdev = pci_get_drvdata(dev); + + us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev)); + if (IS_ERR_OR_NULL(us_ibdev)) { + usnic_err("Device %s context alloc failed\n", + netdev_name(pci_get_drvdata(dev))); + return ERR_PTR(us_ibdev ? PTR_ERR(us_ibdev) : -EFAULT); + } + + us_ibdev->ufdev = usnic_fwd_dev_alloc(dev); + if (IS_ERR_OR_NULL(us_ibdev->ufdev)) { + usnic_err("Failed to alloc ufdev for %s with err %ld\n", + pci_name(dev), PTR_ERR(us_ibdev->ufdev)); + goto err_dealloc; + } + + mutex_init(&us_ibdev->usdev_lock); + INIT_LIST_HEAD(&us_ibdev->vf_dev_list); + INIT_LIST_HEAD(&us_ibdev->ctx_list); + + us_ibdev->pdev = dev; + us_ibdev->netdev = pci_get_drvdata(dev); + us_ibdev->ib_dev.owner = THIS_MODULE; + us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP; + us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT; + us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; + us_ibdev->ib_dev.dma_device = &dev->dev; + us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; + strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX); + + us_ibdev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + us_ibdev->ib_dev.query_device = usnic_ib_query_device; + us_ibdev->ib_dev.query_port = usnic_ib_query_port; + us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey; + us_ibdev->ib_dev.query_gid = usnic_ib_query_gid; + us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer; + us_ibdev->ib_dev.alloc_pd = usnic_ib_alloc_pd; + us_ibdev->ib_dev.dealloc_pd = usnic_ib_dealloc_pd; + us_ibdev->ib_dev.create_qp = usnic_ib_create_qp; + us_ibdev->ib_dev.modify_qp = usnic_ib_modify_qp; + us_ibdev->ib_dev.query_qp = usnic_ib_query_qp; + us_ibdev->ib_dev.destroy_qp = usnic_ib_destroy_qp; + us_ibdev->ib_dev.create_cq = usnic_ib_create_cq; + us_ibdev->ib_dev.destroy_cq = usnic_ib_destroy_cq; + us_ibdev->ib_dev.reg_user_mr = usnic_ib_reg_mr; + us_ibdev->ib_dev.dereg_mr = usnic_ib_dereg_mr; + us_ibdev->ib_dev.alloc_ucontext = usnic_ib_alloc_ucontext; + us_ibdev->ib_dev.dealloc_ucontext = usnic_ib_dealloc_ucontext; + us_ibdev->ib_dev.mmap = usnic_ib_mmap; + us_ibdev->ib_dev.create_ah = usnic_ib_create_ah; + us_ibdev->ib_dev.destroy_ah = usnic_ib_destroy_ah; + us_ibdev->ib_dev.post_send = usnic_ib_post_send; + us_ibdev->ib_dev.post_recv = usnic_ib_post_recv; + us_ibdev->ib_dev.poll_cq = usnic_ib_poll_cq; + us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq; + us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr; + + + if (ib_register_device(&us_ibdev->ib_dev, NULL)) + goto err_fwd_dealloc; + + usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu); + usnic_fwd_set_mac(us_ibdev->ufdev, us_ibdev->netdev->dev_addr); + if (netif_carrier_ok(us_ibdev->netdev)) + usnic_fwd_carrier_up(us_ibdev->ufdev); + + in = ((struct in_device *)(netdev->ip_ptr))->ifa_list; + if (in != NULL) + usnic_fwd_add_ipaddr(us_ibdev->ufdev, in->ifa_address); + + usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr, + us_ibdev->ufdev->inaddr, &gid.raw[0]); + memcpy(&us_ibdev->ib_dev.node_guid, &gid.global.interface_id, + sizeof(gid.global.interface_id)); + kref_init(&us_ibdev->vf_cnt); + + usnic_info("Added ibdev: %s netdev: %s with mac %pM Link: %u MTU: %u\n", + us_ibdev->ib_dev.name, netdev_name(us_ibdev->netdev), + us_ibdev->ufdev->mac, us_ibdev->ufdev->link_up, + us_ibdev->ufdev->mtu); + return us_ibdev; + +err_fwd_dealloc: + usnic_fwd_dev_free(us_ibdev->ufdev); +err_dealloc: + usnic_err("failed -- deallocing device\n"); + ib_dealloc_device(&us_ibdev->ib_dev); + return NULL; +} + +static void usnic_ib_device_remove(struct usnic_ib_dev *us_ibdev) +{ + usnic_info("Unregistering %s\n", us_ibdev->ib_dev.name); + usnic_ib_sysfs_unregister_usdev(us_ibdev); + usnic_fwd_dev_free(us_ibdev->ufdev); + ib_unregister_device(&us_ibdev->ib_dev); + ib_dealloc_device(&us_ibdev->ib_dev); +} + +static void usnic_ib_undiscover_pf(struct kref *kref) +{ + struct usnic_ib_dev *us_ibdev, *tmp; + struct pci_dev *dev; + bool found = false; + + dev = container_of(kref, struct usnic_ib_dev, vf_cnt)->pdev; + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry_safe(us_ibdev, tmp, + &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->pdev == dev) { + list_del(&us_ibdev->ib_dev_link); + usnic_ib_device_remove(us_ibdev); + found = true; + break; + } + } + + WARN(!found, "Failed to remove PF %s\n", pci_name(dev)); + + mutex_unlock(&usnic_ib_ibdev_list_lock); +} + +static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic) +{ + struct usnic_ib_dev *us_ibdev; + struct pci_dev *parent_pci, *vf_pci; + int err; + + vf_pci = usnic_vnic_get_pdev(vnic); + parent_pci = pci_physfn(vf_pci); + + BUG_ON(!parent_pci); + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->pdev == parent_pci) { + kref_get(&us_ibdev->vf_cnt); + goto out; + } + } + + us_ibdev = usnic_ib_device_add(parent_pci); + if (IS_ERR_OR_NULL(us_ibdev)) { + us_ibdev = us_ibdev ? us_ibdev : ERR_PTR(-EFAULT); + goto out; + } + + err = usnic_ib_sysfs_register_usdev(us_ibdev); + if (err) { + usnic_ib_device_remove(us_ibdev); + us_ibdev = ERR_PTR(err); + goto out; + } + + list_add(&us_ibdev->ib_dev_link, &usnic_ib_ibdev_list); +out: + mutex_unlock(&usnic_ib_ibdev_list_lock); + return us_ibdev; +} +/* End of PF discovery section */ + +/* Start of PCI section */ + +static DEFINE_PCI_DEVICE_TABLE(usnic_ib_pci_ids) = { + {PCI_DEVICE(PCI_VENDOR_ID_CISCO, PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC)}, + {0,} +}; + +static int usnic_ib_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int err; + struct usnic_ib_dev *pf; + struct usnic_ib_vf *vf; + enum usnic_vnic_res_type res_type; + + vf = kzalloc(sizeof(*vf), GFP_KERNEL); + if (!vf) + return -ENOMEM; + + err = pci_enable_device(pdev); + if (err) { + usnic_err("Failed to enable %s with err %d\n", + pci_name(pdev), err); + goto out_clean_vf; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + usnic_err("Failed to request region for %s with err %d\n", + pci_name(pdev), err); + goto out_disable_device; + } + + pci_set_master(pdev); + pci_set_drvdata(pdev, vf); + + vf->vnic = usnic_vnic_alloc(pdev); + if (IS_ERR_OR_NULL(vf->vnic)) { + err = vf->vnic ? PTR_ERR(vf->vnic) : -ENOMEM; + usnic_err("Failed to alloc vnic for %s with err %d\n", + pci_name(pdev), err); + goto out_release_regions; + } + + pf = usnic_ib_discover_pf(vf->vnic); + if (IS_ERR_OR_NULL(pf)) { + usnic_err("Failed to discover pf of vnic %s with err%ld\n", + pci_name(pdev), PTR_ERR(pf)); + err = pf ? PTR_ERR(pf) : -EFAULT; + goto out_clean_vnic; + } + + vf->pf = pf; + spin_lock_init(&vf->lock); + mutex_lock(&pf->usdev_lock); + list_add_tail(&vf->link, &pf->vf_dev_list); + /* + * Save max settings (will be same for each VF, easier to re-write than + * to say "if (!set) { set_values(); set=1; } + */ + for (res_type = USNIC_VNIC_RES_TYPE_EOL+1; + res_type < USNIC_VNIC_RES_TYPE_MAX; + res_type++) { + pf->vf_res_cnt[res_type] = usnic_vnic_res_cnt(vf->vnic, + res_type); + } + + mutex_unlock(&pf->usdev_lock); + + usnic_info("Registering usnic VF %s into PF %s\n", pci_name(pdev), + pf->ib_dev.name); + usnic_ib_log_vf(vf); + return 0; + +out_clean_vnic: + usnic_vnic_free(vf->vnic); +out_release_regions: + pci_set_drvdata(pdev, NULL); + pci_clear_master(pdev); + pci_release_regions(pdev); +out_disable_device: + pci_disable_device(pdev); +out_clean_vf: + kfree(vf); + return err; +} + +static void usnic_ib_pci_remove(struct pci_dev *pdev) +{ + struct usnic_ib_vf *vf = pci_get_drvdata(pdev); + struct usnic_ib_dev *pf = vf->pf; + + mutex_lock(&pf->usdev_lock); + list_del(&vf->link); + mutex_unlock(&pf->usdev_lock); + + kref_put(&pf->vf_cnt, usnic_ib_undiscover_pf); + usnic_vnic_free(vf->vnic); + pci_set_drvdata(pdev, NULL); + pci_clear_master(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); + kfree(vf); + + usnic_info("Removed VF %s\n", pci_name(pdev)); +} + +/* PCI driver entry points */ +static struct pci_driver usnic_ib_pci_driver = { + .name = DRV_NAME, + .id_table = usnic_ib_pci_ids, + .probe = usnic_ib_pci_probe, + .remove = usnic_ib_pci_remove, +}; +/* End of PCI section */ + +/* Start of module section */ +static int __init usnic_ib_init(void) +{ + int err; + + printk_once(KERN_INFO "%s", usnic_version); + + err = usnic_uiom_init(DRV_NAME); + if (err) { + usnic_err("Unable to initalize umem with err %d\n", err); + return err; + } + + if (pci_register_driver(&usnic_ib_pci_driver)) { + usnic_err("Unable to register with PCI\n"); + goto out_umem_fini; + } + + err = register_netdevice_notifier(&usnic_ib_netdevice_notifier); + if (err) { + usnic_err("Failed to register netdev notifier\n"); + goto out_pci_unreg; + } + + err = register_inetaddr_notifier(&usnic_ib_inetaddr_notifier); + if (err) { + usnic_err("Failed to register inet addr notifier\n"); + goto out_unreg_netdev_notifier; + } + + err = usnic_transport_init(); + if (err) { + usnic_err("Failed to initialize transport\n"); + goto out_unreg_inetaddr_notifier; + } + + usnic_debugfs_init(); + + return 0; + +out_unreg_inetaddr_notifier: + unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); +out_unreg_netdev_notifier: + unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); +out_pci_unreg: + pci_unregister_driver(&usnic_ib_pci_driver); +out_umem_fini: + usnic_uiom_fini(); + + return err; +} + +static void __exit usnic_ib_destroy(void) +{ + usnic_dbg("\n"); + usnic_debugfs_exit(); + usnic_transport_fini(); + unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); + unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); + pci_unregister_driver(&usnic_ib_pci_driver); + usnic_uiom_fini(); +} + +MODULE_DESCRIPTION("Cisco VIC (usNIC) Verbs Driver"); +MODULE_AUTHOR("Upinder Malhi <umalhi@cisco.com>"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); +module_param(usnic_log_lvl, uint, S_IRUGO | S_IWUSR); +module_param(usnic_ib_share_vf, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(usnic_log_lvl, " Off=0, Err=1, Info=2, Debug=3"); +MODULE_PARM_DESC(usnic_ib_share_vf, "Off=0, On=1 VF sharing amongst QPs"); +MODULE_DEVICE_TABLE(pci, usnic_ib_pci_ids); + +module_init(usnic_ib_init); +module_exit(usnic_ib_destroy); +/* End of module section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c new file mode 100644 index 00000000000..f8dfd76be89 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/bug.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/spinlock.h> + +#include "usnic_log.h" +#include "usnic_vnic.h" +#include "usnic_fwd.h" +#include "usnic_uiom.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_ib_sysfs.h" +#include "usnic_transport.h" + +#define DFLT_RQ_IDX 0 + +const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return "Rst"; + case IB_QPS_INIT: + return "Init"; + case IB_QPS_RTR: + return "RTR"; + case IB_QPS_RTS: + return "RTS"; + case IB_QPS_SQD: + return "SQD"; + case IB_QPS_SQE: + return "SQE"; + case IB_QPS_ERR: + return "ERR"; + default: + return "UNKOWN STATE"; + + } +} + +int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz) +{ + return scnprintf(buf, buf_sz, "|QPN\t|State\t|PID\t|VF Idx\t|Fil ID"); +} + +int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz) +{ + struct usnic_ib_qp_grp *qp_grp = obj; + struct usnic_ib_qp_grp_flow *default_flow; + if (obj) { + default_flow = list_first_entry(&qp_grp->flows_lst, + struct usnic_ib_qp_grp_flow, link); + return scnprintf(buf, buf_sz, "|%d\t|%s\t|%d\t|%hu\t|%d", + qp_grp->ibqp.qp_num, + usnic_ib_qp_grp_state_to_string( + qp_grp->state), + qp_grp->owner_pid, + usnic_vnic_get_index(qp_grp->vf->vnic), + default_flow->flow->flow_id); + } else { + return scnprintf(buf, buf_sz, "|N/A\t|N/A\t|N/A\t|N/A\t|N/A"); + } +} + +static struct usnic_vnic_res_chunk * +get_qp_res_chunk(struct usnic_ib_qp_grp *qp_grp) +{ + lockdep_assert_held(&qp_grp->lock); + /* + * The QP res chunk, used to derive qp indices, + * are just indices of the RQs + */ + return usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); +} + +static int enable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ + + int status; + int i, vnic_idx; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *res; + + lockdep_assert_held(&qp_grp->lock); + + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + + res_chunk = get_qp_res_chunk(qp_grp); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get qp res with err %ld\n", + PTR_ERR(res_chunk)); + return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; + } + + for (i = 0; i < res_chunk->cnt; i++) { + res = res_chunk->res[i]; + status = usnic_fwd_enable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + if (status) { + usnic_err("Failed to enable qp %d of %s:%d\n with err %d\n", + res->vnic_idx, qp_grp->ufdev->name, + vnic_idx, status); + goto out_err; + } + } + + return 0; + +out_err: + for (i--; i >= 0; i--) { + res = res_chunk->res[i]; + usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + } + + return status; +} + +static int disable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ + int i, vnic_idx; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *res; + int status = 0; + + lockdep_assert_held(&qp_grp->lock); + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + + res_chunk = get_qp_res_chunk(qp_grp); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get qp res with err %ld\n", + PTR_ERR(res_chunk)); + return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; + } + + for (i = 0; i < res_chunk->cnt; i++) { + res = res_chunk->res[i]; + status = usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + if (status) { + usnic_err("Failed to disable rq %d of %s:%d\n with err %d\n", + res->vnic_idx, + qp_grp->ufdev->name, + vnic_idx, status); + } + } + + return status; + +} + +static int init_filter_action(struct usnic_ib_qp_grp *qp_grp, + struct usnic_filter_action *uaction) +{ + struct usnic_vnic_res_chunk *res_chunk; + + res_chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get %s with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), + PTR_ERR(res_chunk)); + return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; + } + + uaction->vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + uaction->action.type = FILTER_ACTION_RQ_STEERING; + uaction->action.u.rq_idx = res_chunk->res[DFLT_RQ_IDX]->vnic_idx; + + return 0; +} + +static struct usnic_ib_qp_grp_flow* +create_roce_custom_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + uint16_t port_num; + int err; + struct filter filter; + struct usnic_filter_action uaction; + struct usnic_ib_qp_grp_flow *qp_flow; + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + + trans_type = trans_spec->trans_type; + port_num = trans_spec->usnic_roce.port_num; + + /* Reserve Port */ + port_num = usnic_transport_rsrv_port(trans_type, port_num); + if (port_num == 0) + return ERR_PTR(-EINVAL); + + /* Create Flow */ + usnic_fwd_init_usnic_filter(&filter, port_num); + err = init_filter_action(qp_grp, &uaction); + if (err) + goto out_unreserve_port; + + flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); + if (IS_ERR_OR_NULL(flow)) { + usnic_err("Unable to alloc flow failed with err %ld\n", + PTR_ERR(flow)); + err = flow ? PTR_ERR(flow) : -EFAULT; + goto out_unreserve_port; + } + + /* Create Flow Handle */ + qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); + if (IS_ERR_OR_NULL(qp_flow)) { + err = qp_flow ? PTR_ERR(qp_flow) : -ENOMEM; + goto out_dealloc_flow; + } + qp_flow->flow = flow; + qp_flow->trans_type = trans_type; + qp_flow->usnic_roce.port_num = port_num; + qp_flow->qp_grp = qp_grp; + return qp_flow; + +out_dealloc_flow: + usnic_fwd_dealloc_flow(flow); +out_unreserve_port: + usnic_transport_unrsrv_port(trans_type, port_num); + return ERR_PTR(err); +} + +static void release_roce_custom_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_fwd_dealloc_flow(qp_flow->flow); + usnic_transport_unrsrv_port(qp_flow->trans_type, + qp_flow->usnic_roce.port_num); + kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_udp_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + struct socket *sock; + int sock_fd; + int err; + struct filter filter; + struct usnic_filter_action uaction; + struct usnic_ib_qp_grp_flow *qp_flow; + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + uint32_t addr; + uint16_t port_num; + int proto; + + trans_type = trans_spec->trans_type; + sock_fd = trans_spec->udp.sock_fd; + + /* Get and check socket */ + sock = usnic_transport_get_socket(sock_fd); + if (IS_ERR_OR_NULL(sock)) + return ERR_CAST(sock); + + err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port_num); + if (err) + goto out_put_sock; + + if (proto != IPPROTO_UDP) { + usnic_err("Protocol for fd %d is not UDP", sock_fd); + err = -EPERM; + goto out_put_sock; + } + + /* Create flow */ + usnic_fwd_init_udp_filter(&filter, addr, port_num); + err = init_filter_action(qp_grp, &uaction); + if (err) + goto out_put_sock; + + flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); + if (IS_ERR_OR_NULL(flow)) { + usnic_err("Unable to alloc flow failed with err %ld\n", + PTR_ERR(flow)); + err = flow ? PTR_ERR(flow) : -EFAULT; + goto out_put_sock; + } + + /* Create qp_flow */ + qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); + if (IS_ERR_OR_NULL(qp_flow)) { + err = qp_flow ? PTR_ERR(qp_flow) : -ENOMEM; + goto out_dealloc_flow; + } + qp_flow->flow = flow; + qp_flow->trans_type = trans_type; + qp_flow->udp.sock = sock; + qp_flow->qp_grp = qp_grp; + return qp_flow; + +out_dealloc_flow: + usnic_fwd_dealloc_flow(flow); +out_put_sock: + usnic_transport_put_socket(sock); + return ERR_PTR(err); +} + +static void release_udp_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_fwd_dealloc_flow(qp_flow->flow); + usnic_transport_put_socket(qp_flow->udp.sock); + kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_and_add_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + struct usnic_ib_qp_grp_flow *qp_flow; + enum usnic_transport_type trans_type; + + trans_type = trans_spec->trans_type; + switch (trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + qp_flow = create_roce_custom_flow(qp_grp, trans_spec); + break; + case USNIC_TRANSPORT_IPV4_UDP: + qp_flow = create_udp_flow(qp_grp, trans_spec); + break; + default: + usnic_err("Unsupported transport %u\n", + trans_spec->trans_type); + return ERR_PTR(-EINVAL); + } + + if (!IS_ERR_OR_NULL(qp_flow)) { + list_add_tail(&qp_flow->link, &qp_grp->flows_lst); + usnic_debugfs_flow_add(qp_flow); + } + + + return qp_flow; +} + +static void release_and_remove_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_debugfs_flow_remove(qp_flow); + list_del(&qp_flow->link); + + switch (qp_flow->trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + release_roce_custom_flow(qp_flow); + break; + case USNIC_TRANSPORT_IPV4_UDP: + release_udp_flow(qp_flow); + break; + default: + WARN(1, "Unsupported transport %u\n", + qp_flow->trans_type); + break; + } +} + +static void release_and_remove_all_flows(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_qp_grp_flow *qp_flow, *tmp; + list_for_each_entry_safe(qp_flow, tmp, &qp_grp->flows_lst, link) + release_and_remove_flow(qp_flow); +} + +int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, + enum ib_qp_state new_state, + void *data) +{ + int status = 0; + int vnic_idx; + struct ib_event ib_event; + enum ib_qp_state old_state; + struct usnic_transport_spec *trans_spec; + struct usnic_ib_qp_grp_flow *qp_flow; + + old_state = qp_grp->state; + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + trans_spec = (struct usnic_transport_spec *) data; + + spin_lock(&qp_grp->lock); + switch (new_state) { + case IB_QPS_RESET: + switch (old_state) { + case IB_QPS_RESET: + /* NO-OP */ + break; + case IB_QPS_INIT: + release_and_remove_all_flows(qp_grp); + status = 0; + break; + case IB_QPS_RTR: + case IB_QPS_RTS: + case IB_QPS_ERR: + status = disable_qp_grp(qp_grp); + release_and_remove_all_flows(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_INIT: + switch (old_state) { + case IB_QPS_RESET: + if (trans_spec) { + qp_flow = create_and_add_flow(qp_grp, + trans_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + break; + } + } else { + /* + * Optional to specify filters. + */ + status = 0; + } + break; + case IB_QPS_INIT: + if (trans_spec) { + qp_flow = create_and_add_flow(qp_grp, + trans_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + break; + } + } else { + /* + * Doesn't make sense to go into INIT state + * from INIT state w/o adding filters. + */ + status = -EINVAL; + } + break; + case IB_QPS_RTR: + status = disable_qp_grp(qp_grp); + break; + case IB_QPS_RTS: + status = disable_qp_grp(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_RTR: + switch (old_state) { + case IB_QPS_INIT: + status = enable_qp_grp(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_RTS: + switch (old_state) { + case IB_QPS_RTR: + /* NO-OP FOR NOW */ + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_ERR: + ib_event.device = &qp_grp->vf->pf->ib_dev; + ib_event.element.qp = &qp_grp->ibqp; + ib_event.event = IB_EVENT_QP_FATAL; + + switch (old_state) { + case IB_QPS_RESET: + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + case IB_QPS_INIT: + release_and_remove_all_flows(qp_grp); + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + case IB_QPS_RTR: + case IB_QPS_RTS: + status = disable_qp_grp(qp_grp); + release_and_remove_all_flows(qp_grp); + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + default: + status = -EINVAL; + } + break; + default: + status = -EINVAL; + } + spin_unlock(&qp_grp->lock); + + if (!status) { + qp_grp->state = new_state; + usnic_info("Transistioned %u from %s to %s", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string(old_state), + usnic_ib_qp_grp_state_to_string(new_state)); + } else { + usnic_err("Failed to transistion %u from %s to %s", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string(old_state), + usnic_ib_qp_grp_state_to_string(new_state)); + } + + return status; +} + +static struct usnic_vnic_res_chunk** +alloc_res_chunk_list(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec, void *owner_obj) +{ + enum usnic_vnic_res_type res_type; + struct usnic_vnic_res_chunk **res_chunk_list; + int err, i, res_cnt, res_lst_sz; + + for (res_lst_sz = 0; + res_spec->resources[res_lst_sz].type != USNIC_VNIC_RES_TYPE_EOL; + res_lst_sz++) { + /* Do Nothing */ + } + + res_chunk_list = kzalloc(sizeof(*res_chunk_list)*(res_lst_sz+1), + GFP_ATOMIC); + if (!res_chunk_list) + return ERR_PTR(-ENOMEM); + + for (i = 0; res_spec->resources[i].type != USNIC_VNIC_RES_TYPE_EOL; + i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + + res_chunk_list[i] = usnic_vnic_get_resources(vnic, res_type, + res_cnt, owner_obj); + if (IS_ERR_OR_NULL(res_chunk_list[i])) { + err = res_chunk_list[i] ? + PTR_ERR(res_chunk_list[i]) : -ENOMEM; + usnic_err("Failed to get %s from %s with err %d\n", + usnic_vnic_res_type_to_str(res_type), + usnic_vnic_pci_name(vnic), + err); + goto out_free_res; + } + } + + return res_chunk_list; + +out_free_res: + for (i--; i > 0; i--) + usnic_vnic_put_resources(res_chunk_list[i]); + kfree(res_chunk_list); + return ERR_PTR(err); +} + +static void free_qp_grp_res(struct usnic_vnic_res_chunk **res_chunk_list) +{ + int i; + for (i = 0; res_chunk_list[i]; i++) + usnic_vnic_put_resources(res_chunk_list[i]); + kfree(res_chunk_list); +} + +static int qp_grp_and_vf_bind(struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_ib_qp_grp *qp_grp) +{ + int err; + struct pci_dev *pdev; + + lockdep_assert_held(&vf->lock); + + pdev = usnic_vnic_get_pdev(vf->vnic); + if (vf->qp_grp_ref_cnt == 0) { + err = usnic_uiom_attach_dev_to_pd(pd->umem_pd, &pdev->dev); + if (err) { + usnic_err("Failed to attach %s to domain\n", + pci_name(pdev)); + return err; + } + vf->pd = pd; + } + vf->qp_grp_ref_cnt++; + + WARN_ON(vf->pd != pd); + qp_grp->vf = vf; + + return 0; +} + +static void qp_grp_and_vf_unbind(struct usnic_ib_qp_grp *qp_grp) +{ + struct pci_dev *pdev; + struct usnic_ib_pd *pd; + + lockdep_assert_held(&qp_grp->vf->lock); + + pd = qp_grp->vf->pd; + pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); + if (--qp_grp->vf->qp_grp_ref_cnt == 0) { + qp_grp->vf->pd = NULL; + usnic_uiom_detach_dev_from_pd(pd->umem_pd, &pdev->dev); + } + qp_grp->vf = NULL; +} + +static void log_spec(struct usnic_vnic_res_spec *res_spec) +{ + char buf[512]; + usnic_vnic_spec_dump(buf, sizeof(buf), res_spec); + usnic_dbg("%s\n", buf); +} + +static int qp_grp_id_from_flow(struct usnic_ib_qp_grp_flow *qp_flow, + uint32_t *id) +{ + enum usnic_transport_type trans_type = qp_flow->trans_type; + int err; + uint16_t port_num = 0; + + switch (trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + *id = qp_flow->usnic_roce.port_num; + break; + case USNIC_TRANSPORT_IPV4_UDP: + err = usnic_transport_sock_get_addr(qp_flow->udp.sock, + NULL, NULL, + &port_num); + if (err) + return err; + /* + * Copy port_num to stack first and then to *id, + * so that the short to int cast works for little + * and big endian systems. + */ + *id = port_num; + break; + default: + usnic_err("Unsupported transport %u\n", trans_type); + return -EINVAL; + } + + return 0; +} + +struct usnic_ib_qp_grp * +usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_vnic_res_spec *res_spec, + struct usnic_transport_spec *transport_spec) +{ + struct usnic_ib_qp_grp *qp_grp; + int err; + enum usnic_transport_type transport = transport_spec->trans_type; + struct usnic_ib_qp_grp_flow *qp_flow; + + lockdep_assert_held(&vf->lock); + + err = usnic_vnic_res_spec_satisfied(&min_transport_spec[transport], + res_spec); + if (err) { + usnic_err("Spec does not meet miniumum req for transport %d\n", + transport); + log_spec(res_spec); + return ERR_PTR(err); + } + + qp_grp = kzalloc(sizeof(*qp_grp), GFP_ATOMIC); + if (!qp_grp) { + usnic_err("Unable to alloc qp_grp - Out of memory\n"); + return NULL; + } + + qp_grp->res_chunk_list = alloc_res_chunk_list(vf->vnic, res_spec, + qp_grp); + if (IS_ERR_OR_NULL(qp_grp->res_chunk_list)) { + err = qp_grp->res_chunk_list ? + PTR_ERR(qp_grp->res_chunk_list) : -ENOMEM; + usnic_err("Unable to alloc res for %d with err %d\n", + qp_grp->grp_id, err); + goto out_free_qp_grp; + } + + err = qp_grp_and_vf_bind(vf, pd, qp_grp); + if (err) + goto out_free_res; + + INIT_LIST_HEAD(&qp_grp->flows_lst); + spin_lock_init(&qp_grp->lock); + qp_grp->ufdev = ufdev; + qp_grp->state = IB_QPS_RESET; + qp_grp->owner_pid = current->pid; + + qp_flow = create_and_add_flow(qp_grp, transport_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + usnic_err("Unable to create and add flow with err %ld\n", + PTR_ERR(qp_flow)); + err = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + goto out_qp_grp_vf_unbind; + } + + err = qp_grp_id_from_flow(qp_flow, &qp_grp->grp_id); + if (err) + goto out_release_flow; + qp_grp->ibqp.qp_num = qp_grp->grp_id; + + usnic_ib_sysfs_qpn_add(qp_grp); + + return qp_grp; + +out_release_flow: + release_and_remove_flow(qp_flow); +out_qp_grp_vf_unbind: + qp_grp_and_vf_unbind(qp_grp); +out_free_res: + free_qp_grp_res(qp_grp->res_chunk_list); +out_free_qp_grp: + kfree(qp_grp); + + return ERR_PTR(err); +} + +void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ + + WARN_ON(qp_grp->state != IB_QPS_RESET); + lockdep_assert_held(&qp_grp->vf->lock); + + release_and_remove_all_flows(qp_grp); + usnic_ib_sysfs_qpn_remove(qp_grp); + qp_grp_and_vf_unbind(qp_grp); + free_qp_grp_res(qp_grp->res_chunk_list); + kfree(qp_grp); +} + +struct usnic_vnic_res_chunk* +usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp, + enum usnic_vnic_res_type res_type) +{ + int i; + + for (i = 0; qp_grp->res_chunk_list[i]; i++) { + if (qp_grp->res_chunk_list[i]->type == res_type) + return qp_grp->res_chunk_list[i]; + } + + return ERR_PTR(-EINVAL); +} diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h new file mode 100644 index 00000000000..b0aafe8db0c --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_QP_GRP_H_ +#define USNIC_IB_QP_GRP_H_ + +#include <linux/debugfs.h> +#include <rdma/ib_verbs.h> + +#include "usnic_ib.h" +#include "usnic_abi.h" +#include "usnic_fwd.h" +#include "usnic_vnic.h" + +/* + * The qp group struct represents all the hw resources needed to present a ib_qp + */ +struct usnic_ib_qp_grp { + struct ib_qp ibqp; + enum ib_qp_state state; + int grp_id; + + struct usnic_fwd_dev *ufdev; + struct usnic_ib_ucontext *ctx; + struct list_head flows_lst; + + struct usnic_vnic_res_chunk **res_chunk_list; + + pid_t owner_pid; + struct usnic_ib_vf *vf; + struct list_head link; + + spinlock_t lock; + + struct kobject kobj; +}; + +struct usnic_ib_qp_grp_flow { + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + union { + struct { + uint16_t port_num; + } usnic_roce; + struct { + struct socket *sock; + } udp; + }; + struct usnic_ib_qp_grp *qp_grp; + struct list_head link; + + /* Debug FS */ + struct dentry *dbgfs_dentry; + char dentry_name[32]; +}; + +static const struct +usnic_vnic_res_spec min_transport_spec[USNIC_TRANSPORT_MAX] = { + { /*USNIC_TRANSPORT_UNKNOWN*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, + { /*USNIC_TRANSPORT_ROCE_CUSTOM*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, + { /*USNIC_TRANSPORT_IPV4_UDP*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, +}; + +const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state); +int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz); +int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz); +struct usnic_ib_qp_grp * +usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_vnic_res_spec *res_spec, + struct usnic_transport_spec *trans_spec); +void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp); +int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, + enum ib_qp_state new_state, + void *data); +struct usnic_vnic_res_chunk +*usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp, + enum usnic_vnic_res_type type); +static inline +struct usnic_ib_qp_grp *to_uqp_grp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct usnic_ib_qp_grp, ibqp); +} +#endif /* USNIC_IB_QP_GRP_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c new file mode 100644 index 00000000000..27dc67c1689 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> + +#include "usnic_common_util.h" +#include "usnic_ib.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_vnic.h" +#include "usnic_ib_verbs.h" +#include "usnic_log.h" + +static ssize_t usnic_ib_show_fw_ver(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev = + container_of(device, struct usnic_ib_dev, ib_dev.dev); + struct ethtool_drvinfo info; + + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); + mutex_unlock(&us_ibdev->usdev_lock); + + return scnprintf(buf, PAGE_SIZE, "%s\n", info.fw_version); +} + +static ssize_t usnic_ib_show_board(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev = + container_of(device, struct usnic_ib_dev, ib_dev.dev); + unsigned short subsystem_device_id; + + mutex_lock(&us_ibdev->usdev_lock); + subsystem_device_id = us_ibdev->pdev->subsystem_device; + mutex_unlock(&us_ibdev->usdev_lock); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", subsystem_device_id); +} + +/* + * Report the configuration for this PF + */ +static ssize_t +usnic_ib_show_config(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + char *ptr; + unsigned left; + unsigned n; + enum usnic_vnic_res_type res_type; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + /* Buffer space limit is 1 page */ + ptr = buf; + left = PAGE_SIZE; + + mutex_lock(&us_ibdev->usdev_lock); + if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) { + char *busname; + + /* + * bus name seems to come with annoying prefix. + * Remove it if it is predictable + */ + busname = us_ibdev->pdev->bus->name; + if (strncmp(busname, "PCI Bus ", 8) == 0) + busname += 8; + + n = scnprintf(ptr, left, + "%s: %s:%d.%d, %s, %pM, %u VFs\n Per VF:", + us_ibdev->ib_dev.name, + busname, + PCI_SLOT(us_ibdev->pdev->devfn), + PCI_FUNC(us_ibdev->pdev->devfn), + netdev_name(us_ibdev->netdev), + us_ibdev->ufdev->mac, + atomic_read(&us_ibdev->vf_cnt.refcount)); + UPDATE_PTR_LEFT(n, ptr, left); + + for (res_type = USNIC_VNIC_RES_TYPE_EOL; + res_type < USNIC_VNIC_RES_TYPE_MAX; + res_type++) { + if (us_ibdev->vf_res_cnt[res_type] == 0) + continue; + n = scnprintf(ptr, left, " %d %s%s", + us_ibdev->vf_res_cnt[res_type], + usnic_vnic_res_type_to_str(res_type), + (res_type < (USNIC_VNIC_RES_TYPE_MAX - 1)) ? + "," : ""); + UPDATE_PTR_LEFT(n, ptr, left); + } + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + } else { + n = scnprintf(ptr, left, "%s: no VFs\n", + us_ibdev->ib_dev.name); + UPDATE_PTR_LEFT(n, ptr, left); + } + mutex_unlock(&us_ibdev->usdev_lock); + + return ptr - buf; +} + +static ssize_t +usnic_ib_show_iface(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", + netdev_name(us_ibdev->netdev)); +} + +static ssize_t +usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + atomic_read(&us_ibdev->vf_cnt.refcount)); +} + +static ssize_t +usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + int qp_per_vf; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); + + return scnprintf(buf, PAGE_SIZE, + "%d\n", qp_per_vf); +} + +static ssize_t +usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]); +} + +static DEVICE_ATTR(fw_ver, S_IRUGO, usnic_ib_show_fw_ver, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL); +static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL); +static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL); +static DEVICE_ATTR(max_vf, S_IRUGO, usnic_ib_show_max_vf, NULL); +static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL); +static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL); + +static struct device_attribute *usnic_class_attributes[] = { + &dev_attr_fw_ver, + &dev_attr_board_id, + &dev_attr_config, + &dev_attr_iface, + &dev_attr_max_vf, + &dev_attr_qp_per_vf, + &dev_attr_cq_per_vf, +}; + +struct qpn_attribute { + struct attribute attr; + ssize_t (*show)(struct usnic_ib_qp_grp *, char *buf); +}; + +/* + * Definitions for supporting QPN entries in sysfs + */ +static ssize_t +usnic_ib_qpn_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct usnic_ib_qp_grp *qp_grp; + struct qpn_attribute *qpn_attr; + + qp_grp = container_of(kobj, struct usnic_ib_qp_grp, kobj); + qpn_attr = container_of(attr, struct qpn_attribute, attr); + + return qpn_attr->show(qp_grp, buf); +} + +static const struct sysfs_ops usnic_ib_qpn_sysfs_ops = { + .show = usnic_ib_qpn_attr_show +}; + +#define QPN_ATTR_RO(NAME) \ +struct qpn_attribute qpn_attr_##NAME = __ATTR_RO(NAME) + +static ssize_t context_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "0x%p\n", qp_grp->ctx); +} + +static ssize_t summary_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ + int i, j, n; + int left; + char *ptr; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *vnic_res; + + left = PAGE_SIZE; + ptr = buf; + + n = scnprintf(ptr, left, + "QPN: %d State: (%s) PID: %u VF Idx: %hu ", + qp_grp->ibqp.qp_num, + usnic_ib_qp_grp_state_to_string(qp_grp->state), + qp_grp->owner_pid, + usnic_vnic_get_index(qp_grp->vf->vnic)); + UPDATE_PTR_LEFT(n, ptr, left); + + for (i = 0; qp_grp->res_chunk_list[i]; i++) { + res_chunk = qp_grp->res_chunk_list[i]; + for (j = 0; j < res_chunk->cnt; j++) { + vnic_res = res_chunk->res[j]; + n = scnprintf(ptr, left, "%s[%d] ", + usnic_vnic_res_type_to_str(vnic_res->type), + vnic_res->vnic_idx); + UPDATE_PTR_LEFT(n, ptr, left); + } + } + + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + + return ptr - buf; +} + +static QPN_ATTR_RO(context); +static QPN_ATTR_RO(summary); + +static struct attribute *usnic_ib_qpn_default_attrs[] = { + &qpn_attr_context.attr, + &qpn_attr_summary.attr, + NULL +}; + +static struct kobj_type usnic_ib_qpn_type = { + .sysfs_ops = &usnic_ib_qpn_sysfs_ops, + .default_attrs = usnic_ib_qpn_default_attrs +}; + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev) +{ + int i; + int err; + for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { + err = device_create_file(&us_ibdev->ib_dev.dev, + usnic_class_attributes[i]); + if (err) { + usnic_err("Failed to create device file %d for %s eith err %d", + i, us_ibdev->ib_dev.name, err); + return -EINVAL; + } + } + + /* create kernel object for looking at individual QPs */ + kobject_get(&us_ibdev->ib_dev.dev.kobj); + us_ibdev->qpn_kobj = kobject_create_and_add("qpn", + &us_ibdev->ib_dev.dev.kobj); + if (us_ibdev->qpn_kobj == NULL) { + kobject_put(&us_ibdev->ib_dev.dev.kobj); + return -ENOMEM; + } + + return 0; +} + +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev) +{ + int i; + for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { + device_remove_file(&us_ibdev->ib_dev.dev, + usnic_class_attributes[i]); + } + + kobject_put(us_ibdev->qpn_kobj); +} + +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_dev *us_ibdev; + int err; + + us_ibdev = qp_grp->vf->pf; + + err = kobject_init_and_add(&qp_grp->kobj, &usnic_ib_qpn_type, + kobject_get(us_ibdev->qpn_kobj), + "%d", qp_grp->grp_id); + if (err) { + kobject_put(us_ibdev->qpn_kobj); + return; + } +} + +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = qp_grp->vf->pf; + + kobject_put(&qp_grp->kobj); + kobject_put(us_ibdev->qpn_kobj); +} diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h new file mode 100644 index 00000000000..0d09b493cd0 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_SYSFS_H_ +#define USNIC_IB_SYSFS_H_ + +#include "usnic_ib.h" + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp); +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp); + +#endif /* !USNIC_IB_SYSFS_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c new file mode 100644 index 00000000000..53bd6a2d9cd --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -0,0 +1,768 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/errno.h> + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> + +#include "usnic_abi.h" +#include "usnic_ib.h" +#include "usnic_common_util.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_fwd.h" +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_transport.h" + +#define USNIC_DEFAULT_TRANSPORT USNIC_TRANSPORT_ROCE_CUSTOM + +static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver) +{ + *fw_ver = (u64) *fw_ver_str; +} + +static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp, + struct ib_udata *udata) +{ + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_create_qp_resp resp; + struct pci_dev *pdev; + struct vnic_dev_bar *bar; + struct usnic_vnic_res_chunk *chunk; + struct usnic_ib_qp_grp_flow *default_flow; + int i, err; + + memset(&resp, 0, sizeof(resp)); + + us_ibdev = qp_grp->vf->pf; + pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); + if (!pdev) { + usnic_err("Failed to get pdev of qp_grp %d\n", + qp_grp->grp_id); + return -EFAULT; + } + + bar = usnic_vnic_get_bar(qp_grp->vf->vnic, 0); + if (!bar) { + usnic_err("Failed to get bar0 of qp_grp %d vf %s", + qp_grp->grp_id, pci_name(pdev)); + return -EFAULT; + } + + resp.vfid = usnic_vnic_get_index(qp_grp->vf->vnic); + resp.bar_bus_addr = bar->bus_addr; + resp.bar_len = bar->len; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_RQ); + resp.rq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.rq_idx[i] = chunk->res[i]->vnic_idx; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_WQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_WQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_WQ); + resp.wq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.wq_idx[i] = chunk->res[i]->vnic_idx; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_CQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_CQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_CQ); + resp.cq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.cq_idx[i] = chunk->res[i]->vnic_idx; + + default_flow = list_first_entry(&qp_grp->flows_lst, + struct usnic_ib_qp_grp_flow, link); + resp.transport = default_flow->trans_type; + + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { + usnic_err("Failed to copy udata for %s", us_ibdev->ib_dev.name); + return err; + } + + return 0; +} + +static struct usnic_ib_qp_grp* +find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, + struct usnic_ib_pd *pd, + struct usnic_transport_spec *trans_spec, + struct usnic_vnic_res_spec *res_spec) +{ + struct usnic_ib_vf *vf; + struct usnic_vnic *vnic; + struct usnic_ib_qp_grp *qp_grp; + struct device *dev, **dev_list; + int i, found = 0; + + BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + + if (list_empty(&us_ibdev->vf_dev_list)) { + usnic_info("No vfs to allocate\n"); + return NULL; + } + + if (usnic_ib_share_vf) { + /* Try to find resouces on a used vf which is in pd */ + dev_list = usnic_uiom_get_dev_list(pd->umem_pd); + for (i = 0; dev_list[i]; i++) { + dev = dev_list[i]; + vf = pci_get_drvdata(to_pci_dev(dev)); + spin_lock(&vf->lock); + vnic = vf->vnic; + if (!usnic_vnic_check_room(vnic, res_spec)) { + usnic_dbg("Found used vnic %s from %s\n", + us_ibdev->ib_dev.name, + pci_name(usnic_vnic_get_pdev( + vnic))); + found = 1; + break; + } + spin_unlock(&vf->lock); + + } + usnic_uiom_free_dev_list(dev_list); + } + + if (!found) { + /* Try to find resources on an unused vf */ + list_for_each_entry(vf, &us_ibdev->vf_dev_list, link) { + spin_lock(&vf->lock); + vnic = vf->vnic; + if (vf->qp_grp_ref_cnt == 0 && + usnic_vnic_check_room(vnic, res_spec) == 0) { + found = 1; + break; + } + spin_unlock(&vf->lock); + } + } + + if (!found) { + usnic_info("No free qp grp found on %s\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-ENOMEM); + } + + qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev, vf, pd, res_spec, + trans_spec); + spin_unlock(&vf->lock); + if (IS_ERR_OR_NULL(qp_grp)) { + usnic_err("Failed to allocate qp_grp\n"); + return ERR_PTR(qp_grp ? PTR_ERR(qp_grp) : -ENOMEM); + } + + return qp_grp; +} + +static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_vf *vf = qp_grp->vf; + + WARN_ON(qp_grp->state != IB_QPS_RESET); + + spin_lock(&vf->lock); + usnic_ib_qp_grp_destroy(qp_grp); + spin_unlock(&vf->lock); +} + +static void eth_speed_to_ib_speed(int speed, u8 *active_speed, + u8 *active_width) +{ + if (speed <= 10000) { + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_FDR10; + } else if (speed <= 20000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_DDR; + } else if (speed <= 30000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + } else if (speed <= 40000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_FDR10; + } else { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + } +} + +static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd) +{ + if (cmd.spec.trans_type <= USNIC_TRANSPORT_UNKNOWN || + cmd.spec.trans_type >= USNIC_TRANSPORT_MAX) + return -EINVAL; + + return 0; +} + +/* Start of ib callback functions */ + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +int usnic_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + union ib_gid gid; + struct ethtool_drvinfo info; + struct ethtool_cmd cmd; + int qp_per_vf; + + usnic_dbg("\n"); + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); + us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd); + memset(props, 0, sizeof(*props)); + usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, + &gid.raw[0]); + memcpy(&props->sys_image_guid, &gid.global.interface_id, + sizeof(gid.global.interface_id)); + usnic_ib_fw_string_to_u64(&info.fw_version[0], &props->fw_ver); + props->max_mr_size = USNIC_UIOM_MAX_MR_SIZE; + props->page_size_cap = USNIC_UIOM_PAGE_SIZE; + props->vendor_id = PCI_VENDOR_ID_CISCO; + props->vendor_part_id = PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC; + props->hw_ver = us_ibdev->pdev->subsystem_device; + qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); + props->max_qp = qp_per_vf * + atomic_read(&us_ibdev->vf_cnt.refcount); + props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] * + atomic_read(&us_ibdev->vf_cnt.refcount); + props->max_pd = USNIC_UIOM_MAX_PD_CNT; + props->max_mr = USNIC_UIOM_MAX_MR_CNT; + props->local_ca_ack_delay = 0; + props->max_pkeys = 0; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + props->max_qp_rd_atom = 0; + props->max_qp_init_rd_atom = 0; + props->max_res_rd_atom = 0; + props->max_srq = 0; + props->max_srq_wr = 0; + props->max_srq_sge = 0; + props->max_fast_reg_page_list_len = 0; + props->max_mcast_grp = 0; + props->max_mcast_qp_attach = 0; + props->max_total_mcast_qp_attach = 0; + props->max_map_per_fmr = 0; + /* Owned by Userspace + * max_qp_wr, max_sge, max_sge_rd, max_cqe */ + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + struct ethtool_cmd cmd; + + usnic_dbg("\n"); + + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd); + memset(props, 0, sizeof(*props)); + + props->lid = 0; + props->lmc = 1; + props->sm_lid = 0; + props->sm_sl = 0; + + if (!us_ibdev->ufdev->link_up) { + props->state = IB_PORT_DOWN; + props->phys_state = 3; + } else if (!us_ibdev->ufdev->inaddr) { + props->state = IB_PORT_INIT; + props->phys_state = 4; + } else { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + props->port_cap_flags = 0; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->bad_pkey_cntr = 0; + props->qkey_viol_cntr = 0; + eth_speed_to_ib_speed(cmd.speed, &props->active_speed, + &props->active_width); + props->max_mtu = IB_MTU_4096; + props->active_mtu = iboe_get_mtu(us_ibdev->ufdev->mtu); + /* Userspace will adjust for hdrs */ + props->max_msg_sz = us_ibdev->ufdev->mtu; + props->max_vl_num = 1; + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + int err; + + usnic_dbg("\n"); + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + qp_grp = to_uqp_grp(qp); + vf = qp_grp->vf; + mutex_lock(&vf->pf->usdev_lock); + usnic_dbg("\n"); + qp_attr->qp_state = qp_grp->state; + qp_attr->cur_qp_state = qp_grp->state; + + switch (qp_grp->ibqp.qp_type) { + case IB_QPT_UD: + qp_attr->qkey = 0; + break; + default: + usnic_err("Unexpected qp_type %d\n", qp_grp->ibqp.qp_type); + err = -EINVAL; + goto err_out; + } + + mutex_unlock(&vf->pf->usdev_lock); + return 0; + +err_out: + mutex_unlock(&vf->pf->usdev_lock); + return err; +} + +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + usnic_dbg("\n"); + + if (index > 1) + return -EINVAL; + + mutex_lock(&us_ibdev->usdev_lock); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, + &gid->raw[0]); + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + if (index > 1) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct usnic_ib_pd *pd; + void *umem_pd; + + usnic_dbg("\n"); + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + umem_pd = pd->umem_pd = usnic_uiom_alloc_pd(); + if (IS_ERR_OR_NULL(umem_pd)) { + kfree(pd); + return ERR_PTR(umem_pd ? PTR_ERR(umem_pd) : -ENOMEM); + } + + usnic_info("domain 0x%p allocated for context 0x%p and device %s\n", + pd, context, ibdev->name); + return &pd->ibpd; +} + +int usnic_ib_dealloc_pd(struct ib_pd *pd) +{ + usnic_info("freeing domain 0x%p\n", pd); + + usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd); + kfree(pd); + return 0; +} + +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + int err; + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_ucontext *ucontext; + int cq_cnt; + struct usnic_vnic_res_spec res_spec; + struct usnic_ib_create_qp_cmd cmd; + struct usnic_transport_spec trans_spec; + + usnic_dbg("\n"); + + ucontext = to_uucontext(pd->uobject->context); + us_ibdev = to_usdev(pd->device); + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (err) { + usnic_err("%s: cannot copy udata for create_qp\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-EINVAL); + } + + err = create_qp_validate_user_data(cmd); + if (err) { + usnic_err("%s: Failed to validate user data\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-EINVAL); + } + + if (init_attr->qp_type != IB_QPT_UD) { + usnic_err("%s asked to make a non-UD QP: %d\n", + us_ibdev->ib_dev.name, init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + trans_spec = cmd.spec; + mutex_lock(&us_ibdev->usdev_lock); + cq_cnt = (init_attr->send_cq == init_attr->recv_cq) ? 1 : 2; + res_spec = min_transport_spec[trans_spec.trans_type]; + usnic_vnic_res_spec_update(&res_spec, USNIC_VNIC_RES_TYPE_CQ, cq_cnt); + qp_grp = find_free_vf_and_create_qp_grp(us_ibdev, to_upd(pd), + &trans_spec, + &res_spec); + if (IS_ERR_OR_NULL(qp_grp)) { + err = qp_grp ? PTR_ERR(qp_grp) : -ENOMEM; + goto out_release_mutex; + } + + err = usnic_ib_fill_create_qp_resp(qp_grp, udata); + if (err) { + err = -EBUSY; + goto out_release_qp_grp; + } + + qp_grp->ctx = ucontext; + list_add_tail(&qp_grp->link, &ucontext->qp_grp_list); + usnic_ib_log_vf(qp_grp->vf); + mutex_unlock(&us_ibdev->usdev_lock); + return &qp_grp->ibqp; + +out_release_qp_grp: + qp_grp_destroy(qp_grp); +out_release_mutex: + mutex_unlock(&us_ibdev->usdev_lock); + return ERR_PTR(err); +} + +int usnic_ib_destroy_qp(struct ib_qp *qp) +{ + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + + usnic_dbg("\n"); + + qp_grp = to_uqp_grp(qp); + vf = qp_grp->vf; + mutex_lock(&vf->pf->usdev_lock); + if (usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RESET, NULL)) { + usnic_err("Failed to move qp grp %u to reset\n", + qp_grp->grp_id); + } + + list_del(&qp_grp->link); + qp_grp_destroy(qp_grp); + mutex_unlock(&vf->pf->usdev_lock); + + return 0; +} + +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct usnic_ib_qp_grp *qp_grp; + int status; + usnic_dbg("\n"); + + qp_grp = to_uqp_grp(ibqp); + + /* TODO: Future Support All States */ + mutex_lock(&qp_grp->vf->pf->usdev_lock); + if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_INIT, NULL); + } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTR) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTR, NULL); + } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTS) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTS, NULL); + } else { + usnic_err("Unexpected combination mask: %u state: %u\n", + attr_mask & IB_QP_STATE, attr->qp_state); + status = -EINVAL; + } + + mutex_unlock(&qp_grp->vf->pf->usdev_lock); + return status; +} + +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ib_cq *cq; + + usnic_dbg("\n"); + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-EBUSY); + + return cq; +} + +int usnic_ib_destroy_cq(struct ib_cq *cq) +{ + usnic_dbg("\n"); + kfree(cq); + return 0; +} + +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct usnic_ib_mr *mr; + int err; + + usnic_dbg("start 0x%llx va 0x%llx length 0x%llx\n", start, + virt_addr, length); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (IS_ERR_OR_NULL(mr)) + return ERR_PTR(mr ? PTR_ERR(mr) : -ENOMEM); + + mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length, + access_flags, 0); + if (IS_ERR_OR_NULL(mr->umem)) { + err = mr->umem ? PTR_ERR(mr->umem) : -EFAULT; + goto err_free; + } + + mr->ibmr.lkey = mr->ibmr.rkey = 0; + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int usnic_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct usnic_ib_mr *mr = to_umr(ibmr); + + usnic_dbg("va 0x%lx length 0x%zx\n", mr->umem->va, mr->umem->length); + + usnic_uiom_reg_release(mr->umem, ibmr->pd->uobject->context->closing); + kfree(mr); + return 0; +} + +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct usnic_ib_ucontext *context; + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + usnic_dbg("\n"); + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&context->qp_grp_list); + mutex_lock(&us_ibdev->usdev_lock); + list_add_tail(&context->link, &us_ibdev->ctx_list); + mutex_unlock(&us_ibdev->usdev_lock); + + return &context->ibucontext; +} + +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct usnic_ib_ucontext *context = to_uucontext(ibcontext); + struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device); + usnic_dbg("\n"); + + mutex_lock(&us_ibdev->usdev_lock); + BUG_ON(!list_empty(&context->qp_grp_list)); + list_del(&context->link); + mutex_unlock(&us_ibdev->usdev_lock); + kfree(context); + return 0; +} + +int usnic_ib_mmap(struct ib_ucontext *context, + struct vm_area_struct *vma) +{ + struct usnic_ib_ucontext *uctx = to_ucontext(context); + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + struct vnic_dev_bar *bar; + dma_addr_t bus_addr; + unsigned int len; + unsigned int vfid; + + usnic_dbg("\n"); + + us_ibdev = to_usdev(context->device); + vma->vm_flags |= VM_IO; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vfid = vma->vm_pgoff; + usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n", + vma->vm_pgoff, PAGE_SHIFT, vfid); + + mutex_lock(&us_ibdev->usdev_lock); + list_for_each_entry(qp_grp, &uctx->qp_grp_list, link) { + vf = qp_grp->vf; + if (usnic_vnic_get_index(vf->vnic) == vfid) { + bar = usnic_vnic_get_bar(vf->vnic, 0); + if ((vma->vm_end - vma->vm_start) != bar->len) { + usnic_err("Bar0 Len %lu - Request map %lu\n", + bar->len, + vma->vm_end - vma->vm_start); + mutex_unlock(&us_ibdev->usdev_lock); + return -EINVAL; + } + bus_addr = bar->bus_addr; + len = bar->len; + usnic_dbg("bus: %pa vaddr: %p size: %ld\n", + &bus_addr, bar->vaddr, bar->len); + mutex_unlock(&us_ibdev->usdev_lock); + + return remap_pfn_range(vma, + vma->vm_start, + bus_addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + } + + mutex_unlock(&us_ibdev->usdev_lock); + usnic_err("No VF %u found\n", vfid); + return -EINVAL; +} + +/* In ib callbacks section - Start of stub funcs */ +struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + usnic_dbg("\n"); + return ERR_PTR(-EPERM); +} + +int usnic_ib_destroy_ah(struct ib_ah *ah) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + usnic_dbg("\n"); + return ERR_PTR(-ENOMEM); +} + + +/* In ib callbacks section - End of stub funcs */ +/* End of ib callbacks section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h new file mode 100644 index 00000000000..bb864f5aed7 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_VERBS_H_ +#define USNIC_IB_VERBS_H_ + +#include "usnic_ib.h" + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, + u8 port_num); +int usnic_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props); +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int usnic_ib_dealloc_pd(struct ib_pd *pd); +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int usnic_ib_destroy_qp(struct ib_qp *qp); +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata); +int usnic_ib_destroy_cq(struct ib_cq *cq); +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int usnic_ib_dereg_mr(struct ib_mr *ibmr); +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata); +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext); +int usnic_ib_mmap(struct ib_ucontext *context, + struct vm_area_struct *vma); +struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr); +int usnic_ib_destroy_ah(struct ib_ah *ah); +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc); +int usnic_ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags); +struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc); +#endif /* !USNIC_IB_VERBS_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_log.h b/drivers/infiniband/hw/usnic/usnic_log.h new file mode 100644 index 00000000000..75777a66c68 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_log.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_LOG_H_ +#define USNIC_LOG_H_ + +#include "usnic.h" + +extern unsigned int usnic_log_lvl; + +#define USNIC_LOG_LVL_NONE (0) +#define USNIC_LOG_LVL_ERR (1) +#define USNIC_LOG_LVL_INFO (2) +#define USNIC_LOG_LVL_DBG (3) + +#define usnic_printk(lvl, args...) \ + do { \ + printk(lvl "%s:%s:%d: ", DRV_NAME, __func__, \ + __LINE__); \ + printk(args); \ + } while (0) + +#define usnic_dbg(args...) \ + do { \ + if (unlikely(usnic_log_lvl >= USNIC_LOG_LVL_DBG)) { \ + usnic_printk(KERN_INFO, args); \ + } \ +} while (0) + +#define usnic_info(args...) \ +do { \ + if (usnic_log_lvl >= USNIC_LOG_LVL_INFO) { \ + usnic_printk(KERN_INFO, args); \ + } \ +} while (0) + +#define usnic_err(args...) \ + do { \ + if (usnic_log_lvl >= USNIC_LOG_LVL_ERR) { \ + usnic_printk(KERN_ERR, args); \ + } \ + } while (0) +#endif /* !USNIC_LOG_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c new file mode 100644 index 00000000000..ddef6f77a78 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_transport.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/bitmap.h> +#include <linux/file.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <net/inet_sock.h> + +#include "usnic_transport.h" +#include "usnic_log.h" + +/* ROCE */ +static unsigned long *roce_bitmap; +static u16 roce_next_port = 1; +#define ROCE_BITMAP_SZ ((1 << (8 /*CHAR_BIT*/ * sizeof(u16)))/8 /*CHAR BIT*/) +static DEFINE_SPINLOCK(roce_bitmap_lock); + +const char *usnic_transport_to_str(enum usnic_transport_type type) +{ + switch (type) { + case USNIC_TRANSPORT_UNKNOWN: + return "Unknown"; + case USNIC_TRANSPORT_ROCE_CUSTOM: + return "roce custom"; + case USNIC_TRANSPORT_IPV4_UDP: + return "IPv4 UDP"; + case USNIC_TRANSPORT_MAX: + return "Max?"; + default: + return "Not known"; + } +} + +int usnic_transport_sock_to_str(char *buf, int buf_sz, + struct socket *sock) +{ + int err; + uint32_t addr; + uint16_t port; + int proto; + + memset(buf, 0, buf_sz); + err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port); + if (err) + return 0; + + return scnprintf(buf, buf_sz, "Proto:%u Addr:%pI4h Port:%hu", + proto, &addr, port); +} + +/* + * reserve a port number. if "0" specified, we will try to pick one + * starting at roce_next_port. roce_next_port will take on the values + * 1..4096 + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num) +{ + if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { + spin_lock(&roce_bitmap_lock); + if (!port_num) { + port_num = bitmap_find_next_zero_area(roce_bitmap, + ROCE_BITMAP_SZ, + roce_next_port /* start */, + 1 /* nr */, + 0 /* align */); + roce_next_port = (port_num & 4095) + 1; + } else if (test_bit(port_num, roce_bitmap)) { + usnic_err("Failed to allocate port for %s\n", + usnic_transport_to_str(type)); + spin_unlock(&roce_bitmap_lock); + goto out_fail; + } + bitmap_set(roce_bitmap, port_num, 1); + spin_unlock(&roce_bitmap_lock); + } else { + usnic_err("Failed to allocate port - transport %s unsupported\n", + usnic_transport_to_str(type)); + goto out_fail; + } + + usnic_dbg("Allocating port %hu for %s\n", port_num, + usnic_transport_to_str(type)); + return port_num; + +out_fail: + return 0; +} + +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num) +{ + if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { + spin_lock(&roce_bitmap_lock); + if (!port_num) { + usnic_err("Unreserved unvalid port num 0 for %s\n", + usnic_transport_to_str(type)); + goto out_roce_custom; + } + + if (!test_bit(port_num, roce_bitmap)) { + usnic_err("Unreserving invalid %hu for %s\n", + port_num, + usnic_transport_to_str(type)); + goto out_roce_custom; + } + bitmap_clear(roce_bitmap, port_num, 1); + usnic_dbg("Freeing port %hu for %s\n", port_num, + usnic_transport_to_str(type)); +out_roce_custom: + spin_unlock(&roce_bitmap_lock); + } else { + usnic_err("Freeing invalid port %hu for %d\n", port_num, type); + } +} + +struct socket *usnic_transport_get_socket(int sock_fd) +{ + struct socket *sock; + int err; + char buf[25]; + + /* sockfd_lookup will internally do a fget */ + sock = sockfd_lookup(sock_fd, &err); + if (!sock) { + usnic_err("Unable to lookup socket for fd %d with err %d\n", + sock_fd, err); + return ERR_PTR(-ENOENT); + } + + usnic_transport_sock_to_str(buf, sizeof(buf), sock); + usnic_dbg("Get sock %s\n", buf); + + return sock; +} + +void usnic_transport_put_socket(struct socket *sock) +{ + char buf[100]; + + usnic_transport_sock_to_str(buf, sizeof(buf), sock); + usnic_dbg("Put sock %s\n", buf); + sockfd_put(sock); +} + +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, + uint32_t *addr, uint16_t *port) +{ + int len; + int err; + struct sockaddr_in sock_addr; + + err = sock->ops->getname(sock, + (struct sockaddr *)&sock_addr, + &len, 0); + if (err) + return err; + + if (sock_addr.sin_family != AF_INET) + return -EINVAL; + + if (proto) + *proto = sock->sk->sk_protocol; + if (port) + *port = ntohs(((struct sockaddr_in *)&sock_addr)->sin_port); + if (addr) + *addr = ntohl(((struct sockaddr_in *) + &sock_addr)->sin_addr.s_addr); + + return 0; +} + +int usnic_transport_init(void) +{ + roce_bitmap = kzalloc(ROCE_BITMAP_SZ, GFP_KERNEL); + if (!roce_bitmap) { + usnic_err("Failed to allocate bit map"); + return -ENOMEM; + } + + /* Do not ever allocate bit 0, hence set it here */ + bitmap_set(roce_bitmap, 0, 1); + return 0; +} + +void usnic_transport_fini(void) +{ + kfree(roce_bitmap); +} diff --git a/drivers/infiniband/hw/usnic/usnic_transport.h b/drivers/infiniband/hw/usnic/usnic_transport.h new file mode 100644 index 00000000000..7e5dc6d9f46 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_transport.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_TRANSPORT_H_ +#define USNIC_TRANSPORT_H_ + +#include "usnic_abi.h" + +const char *usnic_transport_to_str(enum usnic_transport_type trans_type); +/* + * Returns number of bytes written, excluding null terminator. If + * nothing was written, the function returns 0. + */ +int usnic_transport_sock_to_str(char *buf, int buf_sz, + struct socket *sock); +/* + * Reserve a port. If "port_num" is set, then the function will try + * to reserve that particular port. + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num); +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num); +/* + * Do a fget on the socket refered to by sock_fd and returns the socket. + * Socket will not be destroyed before usnic_transport_put_socket has + * been called. + */ +struct socket *usnic_transport_get_socket(int sock_fd); +void usnic_transport_put_socket(struct socket *sock); +/* + * Call usnic_transport_get_socket before calling *_sock_get_addr + */ +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, + uint32_t *addr, uint16_t *port); +int usnic_transport_init(void); +void usnic_transport_fini(void); +#endif /* !USNIC_TRANSPORT_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c new file mode 100644 index 00000000000..801a1d6937e --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -0,0 +1,604 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mm.h> +#include <linux/dma-mapping.h> +#include <linux/sched.h> +#include <linux/hugetlb.h> +#include <linux/dma-attrs.h> +#include <linux/iommu.h> +#include <linux/workqueue.h> +#include <linux/list.h> +#include <linux/pci.h> + +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_uiom_interval_tree.h" + +static struct workqueue_struct *usnic_uiom_wq; + +#define USNIC_UIOM_PAGE_CHUNK \ + ((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list)) /\ + ((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] - \ + (void *) &((struct usnic_uiom_chunk *) 0)->page_list[0])) + +static void usnic_uiom_reg_account(struct work_struct *work) +{ + struct usnic_uiom_reg *umem = container_of(work, + struct usnic_uiom_reg, work); + + down_write(&umem->mm->mmap_sem); + umem->mm->locked_vm -= umem->diff; + up_write(&umem->mm->mmap_sem); + mmput(umem->mm); + kfree(umem); +} + +static int usnic_uiom_dma_fault(struct iommu_domain *domain, + struct device *dev, + unsigned long iova, int flags, + void *token) +{ + usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n", + dev_name(dev), + domain, iova, flags); + return -ENOSYS; +} + +static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) +{ + struct usnic_uiom_chunk *chunk, *tmp; + struct page *page; + struct scatterlist *sg; + int i; + dma_addr_t pa; + + list_for_each_entry_safe(chunk, tmp, chunk_list, list) { + for_each_sg(chunk->page_list, sg, chunk->nents, i) { + page = sg_page(sg); + pa = sg_phys(sg); + if (dirty) + set_page_dirty_lock(page); + put_page(page); + usnic_dbg("pa: %pa\n", &pa); + } + kfree(chunk); + } +} + +static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, + int dmasync, struct list_head *chunk_list) +{ + struct page **page_list; + struct scatterlist *sg; + struct usnic_uiom_chunk *chunk; + unsigned long locked; + unsigned long lock_limit; + unsigned long cur_base; + unsigned long npages; + int ret; + int off; + int i; + int flags; + dma_addr_t pa; + DEFINE_DMA_ATTRS(attrs); + + if (dmasync) + dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); + + if (!can_do_mlock()) + return -EPERM; + + INIT_LIST_HEAD(chunk_list); + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + locked = npages + current->mm->locked_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -ENOMEM; + goto out; + } + + flags = IOMMU_READ | IOMMU_CACHE; + flags |= (writable) ? IOMMU_WRITE : 0; + cur_base = addr & PAGE_MASK; + ret = 0; + + while (npages) { + ret = get_user_pages(current, current->mm, cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + 1, !writable, page_list, NULL); + + if (ret < 0) + goto out; + + npages -= ret; + off = 0; + + while (ret) { + chunk = kmalloc(sizeof(*chunk) + + sizeof(struct scatterlist) * + min_t(int, ret, USNIC_UIOM_PAGE_CHUNK), + GFP_KERNEL); + if (!chunk) { + ret = -ENOMEM; + goto out; + } + + chunk->nents = min_t(int, ret, USNIC_UIOM_PAGE_CHUNK); + sg_init_table(chunk->page_list, chunk->nents); + for_each_sg(chunk->page_list, sg, chunk->nents, i) { + sg_set_page(sg, page_list[i + off], + PAGE_SIZE, 0); + pa = sg_phys(sg); + usnic_dbg("va: 0x%lx pa: %pa\n", + cur_base + i*PAGE_SIZE, &pa); + } + cur_base += chunk->nents * PAGE_SIZE; + ret -= chunk->nents; + off += chunk->nents; + list_add_tail(&chunk->list, chunk_list); + } + + ret = 0; + } + +out: + if (ret < 0) + usnic_uiom_put_pages(chunk_list, 0); + else + current->mm->locked_vm = locked; + + up_write(¤t->mm->mmap_sem); + free_page((unsigned long) page_list); + return ret; +} + +static void usnic_uiom_unmap_sorted_intervals(struct list_head *intervals, + struct usnic_uiom_pd *pd) +{ + struct usnic_uiom_interval_node *interval, *tmp; + long unsigned va, size; + + list_for_each_entry_safe(interval, tmp, intervals, link) { + va = interval->start << PAGE_SHIFT; + size = ((interval->last - interval->start) + 1) << PAGE_SHIFT; + while (size > 0) { + /* Workaround for RH 970401 */ + usnic_dbg("va 0x%lx size 0x%lx", va, PAGE_SIZE); + iommu_unmap(pd->domain, va, PAGE_SIZE); + va += PAGE_SIZE; + size -= PAGE_SIZE; + } + } +} + +static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd, + struct usnic_uiom_reg *uiomr, + int dirty) +{ + int npages; + unsigned long vpn_start, vpn_last; + struct usnic_uiom_interval_node *interval, *tmp; + int writable = 0; + LIST_HEAD(rm_intervals); + + npages = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + vpn_start = (uiomr->va & PAGE_MASK) >> PAGE_SHIFT; + vpn_last = vpn_start + npages - 1; + + spin_lock(&pd->lock); + usnic_uiom_remove_interval(&pd->rb_root, vpn_start, + vpn_last, &rm_intervals); + usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd); + + list_for_each_entry_safe(interval, tmp, &rm_intervals, link) { + if (interval->flags & IOMMU_WRITE) + writable = 1; + list_del(&interval->link); + kfree(interval); + } + + usnic_uiom_put_pages(&uiomr->chunk_list, dirty & writable); + spin_unlock(&pd->lock); +} + +static int usnic_uiom_map_sorted_intervals(struct list_head *intervals, + struct usnic_uiom_reg *uiomr) +{ + int i, err; + size_t size; + struct usnic_uiom_chunk *chunk; + struct usnic_uiom_interval_node *interval_node; + dma_addr_t pa; + dma_addr_t pa_start = 0; + dma_addr_t pa_end = 0; + long int va_start = -EINVAL; + struct usnic_uiom_pd *pd = uiomr->pd; + long int va = uiomr->va & PAGE_MASK; + int flags = IOMMU_READ | IOMMU_CACHE; + + flags |= (uiomr->writable) ? IOMMU_WRITE : 0; + chunk = list_first_entry(&uiomr->chunk_list, struct usnic_uiom_chunk, + list); + list_for_each_entry(interval_node, intervals, link) { +iter_chunk: + for (i = 0; i < chunk->nents; i++, va += PAGE_SIZE) { + pa = sg_phys(&chunk->page_list[i]); + if ((va >> PAGE_SHIFT) < interval_node->start) + continue; + + if ((va >> PAGE_SHIFT) == interval_node->start) { + /* First page of the interval */ + va_start = va; + pa_start = pa; + pa_end = pa; + } + + WARN_ON(va_start == -EINVAL); + + if ((pa_end + PAGE_SIZE != pa) && + (pa != pa_start)) { + /* PAs are not contiguous */ + size = pa_end - pa_start + PAGE_SIZE; + usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x", + va_start, &pa_start, size, flags); + err = iommu_map(pd->domain, va_start, pa_start, + size, flags); + if (err) { + usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", + va_start, &pa_start, size, err); + goto err_out; + } + va_start = va; + pa_start = pa; + pa_end = pa; + } + + if ((va >> PAGE_SHIFT) == interval_node->last) { + /* Last page of the interval */ + size = pa - pa_start + PAGE_SIZE; + usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n", + va_start, &pa_start, size, flags); + err = iommu_map(pd->domain, va_start, pa_start, + size, flags); + if (err) { + usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", + va_start, &pa_start, size, err); + goto err_out; + } + break; + } + + if (pa != pa_start) + pa_end += PAGE_SIZE; + } + + if (i == chunk->nents) { + /* + * Hit last entry of the chunk, + * hence advance to next chunk + */ + chunk = list_first_entry(&chunk->list, + struct usnic_uiom_chunk, + list); + goto iter_chunk; + } + } + + return 0; + +err_out: + usnic_uiom_unmap_sorted_intervals(intervals, pd); + return err; +} + +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, + unsigned long addr, size_t size, + int writable, int dmasync) +{ + struct usnic_uiom_reg *uiomr; + unsigned long va_base, vpn_start, vpn_last; + unsigned long npages; + int offset, err; + LIST_HEAD(sorted_diff_intervals); + + /* + * Intel IOMMU map throws an error if a translation entry is + * changed from read to write. This module may not unmap + * and then remap the entry after fixing the permission + * b/c this open up a small windows where hw DMA may page fault + * Hence, make all entries to be writable. + */ + writable = 1; + + va_base = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + npages = PAGE_ALIGN(size + offset) >> PAGE_SHIFT; + vpn_start = (addr & PAGE_MASK) >> PAGE_SHIFT; + vpn_last = vpn_start + npages - 1; + + uiomr = kmalloc(sizeof(*uiomr), GFP_KERNEL); + if (!uiomr) + return ERR_PTR(-ENOMEM); + + uiomr->va = va_base; + uiomr->offset = offset; + uiomr->length = size; + uiomr->writable = writable; + uiomr->pd = pd; + + err = usnic_uiom_get_pages(addr, size, writable, dmasync, + &uiomr->chunk_list); + if (err) { + usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_free_uiomr; + } + + spin_lock(&pd->lock); + err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last, + (writable) ? IOMMU_WRITE : 0, + IOMMU_WRITE, + &pd->rb_root, + &sorted_diff_intervals); + if (err) { + usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_put_pages; + } + + err = usnic_uiom_map_sorted_intervals(&sorted_diff_intervals, uiomr); + if (err) { + usnic_err("Failed map interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_put_intervals; + + } + + err = usnic_uiom_insert_interval(&pd->rb_root, vpn_start, vpn_last, + (writable) ? IOMMU_WRITE : 0); + if (err) { + usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_unmap_intervals; + } + + usnic_uiom_put_interval_set(&sorted_diff_intervals); + spin_unlock(&pd->lock); + + return uiomr; + +out_unmap_intervals: + usnic_uiom_unmap_sorted_intervals(&sorted_diff_intervals, pd); +out_put_intervals: + usnic_uiom_put_interval_set(&sorted_diff_intervals); +out_put_pages: + usnic_uiom_put_pages(&uiomr->chunk_list, 0); + spin_unlock(&pd->lock); +out_free_uiomr: + kfree(uiomr); + return ERR_PTR(err); +} + +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing) +{ + struct mm_struct *mm; + unsigned long diff; + + __usnic_uiom_reg_release(uiomr->pd, uiomr, 1); + + mm = get_task_mm(current); + if (!mm) { + kfree(uiomr); + return; + } + + diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + + /* + * We may be called with the mm's mmap_sem already held. This + * can happen when a userspace munmap() is the call that drops + * the last reference to our file and calls our release + * method. If there are memory regions to destroy, we'll end + * up here and not be able to take the mmap_sem. In that case + * we defer the vm_locked accounting to the system workqueue. + */ + if (closing) { + if (!down_write_trylock(&mm->mmap_sem)) { + INIT_WORK(&uiomr->work, usnic_uiom_reg_account); + uiomr->mm = mm; + uiomr->diff = diff; + + queue_work(usnic_uiom_wq, &uiomr->work); + return; + } + } else + down_write(&mm->mmap_sem); + + current->mm->locked_vm -= diff; + up_write(&mm->mmap_sem); + mmput(mm); + kfree(uiomr); +} + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void) +{ + struct usnic_uiom_pd *pd; + void *domain; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + pd->domain = domain = iommu_domain_alloc(&pci_bus_type); + if (IS_ERR_OR_NULL(domain)) { + usnic_err("Failed to allocate IOMMU domain with err %ld\n", + PTR_ERR(pd->domain)); + kfree(pd); + return ERR_PTR(domain ? PTR_ERR(domain) : -ENOMEM); + } + + iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL); + + spin_lock_init(&pd->lock); + INIT_LIST_HEAD(&pd->devs); + + return pd; +} + +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd) +{ + iommu_domain_free(pd->domain); + kfree(pd); +} + +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev) +{ + struct usnic_uiom_dev *uiom_dev; + int err; + + uiom_dev = kzalloc(sizeof(*uiom_dev), GFP_ATOMIC); + if (!uiom_dev) + return -ENOMEM; + uiom_dev->dev = dev; + + err = iommu_attach_device(pd->domain, dev); + if (err) + goto out_free_dev; + + if (!iommu_domain_has_cap(pd->domain, IOMMU_CAP_CACHE_COHERENCY)) { + usnic_err("IOMMU of %s does not support cache coherency\n", + dev_name(dev)); + err = -EINVAL; + goto out_detach_device; + } + + spin_lock(&pd->lock); + list_add_tail(&uiom_dev->link, &pd->devs); + pd->dev_cnt++; + spin_unlock(&pd->lock); + + return 0; + +out_detach_device: + iommu_detach_device(pd->domain, dev); +out_free_dev: + kfree(uiom_dev); + return err; +} + +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, struct device *dev) +{ + struct usnic_uiom_dev *uiom_dev; + int found = 0; + + spin_lock(&pd->lock); + list_for_each_entry(uiom_dev, &pd->devs, link) { + if (uiom_dev->dev == dev) { + found = 1; + break; + } + } + + if (!found) { + usnic_err("Unable to free dev %s - not found\n", + dev_name(dev)); + spin_unlock(&pd->lock); + return; + } + + list_del(&uiom_dev->link); + pd->dev_cnt--; + spin_unlock(&pd->lock); + + return iommu_detach_device(pd->domain, dev); +} + +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd) +{ + struct usnic_uiom_dev *uiom_dev; + struct device **devs; + int i = 0; + + spin_lock(&pd->lock); + devs = kcalloc(pd->dev_cnt + 1, sizeof(*devs), GFP_ATOMIC); + if (!devs) { + devs = ERR_PTR(-ENOMEM); + goto out; + } + + list_for_each_entry(uiom_dev, &pd->devs, link) { + devs[i++] = uiom_dev->dev; + } +out: + spin_unlock(&pd->lock); + return devs; +} + +void usnic_uiom_free_dev_list(struct device **devs) +{ + kfree(devs); +} + +int usnic_uiom_init(char *drv_name) +{ + if (!iommu_present(&pci_bus_type)) { + usnic_err("IOMMU required but not present or enabled. USNIC QPs will not function w/o enabling IOMMU\n"); + return -EPERM; + } + + usnic_uiom_wq = create_workqueue(drv_name); + if (!usnic_uiom_wq) { + usnic_err("Unable to alloc wq for drv %s\n", drv_name); + return -ENOMEM; + } + + return 0; +} + +void usnic_uiom_fini(void) +{ + flush_workqueue(usnic_uiom_wq); + destroy_workqueue(usnic_uiom_wq); +} diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h new file mode 100644 index 00000000000..70440996e8f --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_UIOM_H_ +#define USNIC_UIOM_H_ + +#include <linux/list.h> +#include <linux/scatterlist.h> + +#include "usnic_uiom_interval_tree.h" + +#define USNIC_UIOM_READ (1) +#define USNIC_UIOM_WRITE (2) + +#define USNIC_UIOM_MAX_PD_CNT (1000) +#define USNIC_UIOM_MAX_MR_CNT (1000000) +#define USNIC_UIOM_MAX_MR_SIZE (~0UL) +#define USNIC_UIOM_PAGE_SIZE (PAGE_SIZE) + +struct usnic_uiom_dev { + struct device *dev; + struct list_head link; +}; + +struct usnic_uiom_pd { + struct iommu_domain *domain; + spinlock_t lock; + struct rb_root rb_root; + struct list_head devs; + int dev_cnt; +}; + +struct usnic_uiom_reg { + struct usnic_uiom_pd *pd; + unsigned long va; + size_t length; + int offset; + int page_size; + int writable; + struct list_head chunk_list; + struct work_struct work; + struct mm_struct *mm; + unsigned long diff; +}; + +struct usnic_uiom_chunk { + struct list_head list; + int nents; + struct scatterlist page_list[0]; +}; + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void); +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd); +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev); +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, + struct device *dev); +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd); +void usnic_uiom_free_dev_list(struct device **devs); +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, + unsigned long addr, size_t size, + int access, int dmasync); +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing); +int usnic_uiom_init(char *drv_name); +void usnic_uiom_fini(void); +#endif /* USNIC_UIOM_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c new file mode 100644 index 00000000000..3a4288e0fba --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/init.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/list_sort.h> + +#include <linux/interval_tree_generic.h> +#include "usnic_uiom_interval_tree.h" + +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) + +#define MAKE_NODE(node, start, end, ref_cnt, flags, err, err_out) \ + do { \ + node = usnic_uiom_interval_node_alloc(start, \ + end, ref_cnt, flags); \ + if (!node) { \ + err = -ENOMEM; \ + goto err_out; \ + } \ + } while (0) + +#define MARK_FOR_ADD(node, list) (list_add_tail(&node->link, list)) + +#define MAKE_NODE_AND_APPEND(node, start, end, ref_cnt, flags, err, \ + err_out, list) \ + do { \ + MAKE_NODE(node, start, end, \ + ref_cnt, flags, err, \ + err_out); \ + MARK_FOR_ADD(node, list); \ + } while (0) + +#define FLAGS_EQUAL(flags1, flags2, mask) \ + (((flags1) & (mask)) == ((flags2) & (mask))) + +static struct usnic_uiom_interval_node* +usnic_uiom_interval_node_alloc(long int start, long int last, int ref_cnt, + int flags) +{ + struct usnic_uiom_interval_node *interval = kzalloc(sizeof(*interval), + GFP_ATOMIC); + if (!interval) + return NULL; + + interval->start = start; + interval->last = last; + interval->flags = flags; + interval->ref_cnt = ref_cnt; + + return interval; +} + +static int interval_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct usnic_uiom_interval_node *node_a, *node_b; + + node_a = list_entry(a, struct usnic_uiom_interval_node, link); + node_b = list_entry(b, struct usnic_uiom_interval_node, link); + + /* long to int */ + if (node_a->start < node_b->start) + return -1; + else if (node_a->start > node_b->start) + return 1; + + return 0; +} + +static void +find_intervals_intersection_sorted(struct rb_root *root, unsigned long start, + unsigned long last, + struct list_head *list) +{ + struct usnic_uiom_interval_node *node; + + INIT_LIST_HEAD(list); + + for (node = usnic_uiom_interval_tree_iter_first(root, start, last); + node; + node = usnic_uiom_interval_tree_iter_next(node, start, last)) + list_add_tail(&node->link, list); + + list_sort(NULL, list, interval_cmp); +} + +int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last, + int flags, int flag_mask, + struct rb_root *root, + struct list_head *diff_set) +{ + struct usnic_uiom_interval_node *interval, *tmp; + int err = 0; + long int pivot = start; + LIST_HEAD(intersection_set); + + INIT_LIST_HEAD(diff_set); + + find_intervals_intersection_sorted(root, start, last, + &intersection_set); + + list_for_each_entry(interval, &intersection_set, link) { + if (pivot < interval->start) { + MAKE_NODE_AND_APPEND(tmp, pivot, interval->start - 1, + 1, flags, err, err_out, + diff_set); + pivot = interval->start; + } + + /* + * Invariant: Set [start, pivot] is either in diff_set or root, + * but not in both. + */ + + if (pivot > interval->last) { + continue; + } else if (pivot <= interval->last && + FLAGS_EQUAL(interval->flags, flags, + flag_mask)) { + pivot = interval->last + 1; + } + } + + if (pivot <= last) + MAKE_NODE_AND_APPEND(tmp, pivot, last, 1, flags, err, err_out, + diff_set); + + return 0; + +err_out: + list_for_each_entry_safe(interval, tmp, diff_set, link) { + list_del(&interval->link); + kfree(interval); + } + + return err; +} + +void usnic_uiom_put_interval_set(struct list_head *intervals) +{ + struct usnic_uiom_interval_node *interval, *tmp; + list_for_each_entry_safe(interval, tmp, intervals, link) + kfree(interval); +} + +int usnic_uiom_insert_interval(struct rb_root *root, unsigned long start, + unsigned long last, int flags) +{ + struct usnic_uiom_interval_node *interval, *tmp; + unsigned long istart, ilast; + int iref_cnt, iflags; + unsigned long lpivot = start; + int err = 0; + LIST_HEAD(to_add); + LIST_HEAD(intersection_set); + + find_intervals_intersection_sorted(root, start, last, + &intersection_set); + + list_for_each_entry(interval, &intersection_set, link) { + /* + * Invariant - lpivot is the left edge of next interval to be + * inserted + */ + istart = interval->start; + ilast = interval->last; + iref_cnt = interval->ref_cnt; + iflags = interval->flags; + + if (istart < lpivot) { + MAKE_NODE_AND_APPEND(tmp, istart, lpivot - 1, iref_cnt, + iflags, err, err_out, &to_add); + } else if (istart > lpivot) { + MAKE_NODE_AND_APPEND(tmp, lpivot, istart - 1, 1, flags, + err, err_out, &to_add); + lpivot = istart; + } else { + lpivot = istart; + } + + if (ilast > last) { + MAKE_NODE_AND_APPEND(tmp, lpivot, last, iref_cnt + 1, + iflags | flags, err, err_out, + &to_add); + MAKE_NODE_AND_APPEND(tmp, last + 1, ilast, iref_cnt, + iflags, err, err_out, &to_add); + } else { + MAKE_NODE_AND_APPEND(tmp, lpivot, ilast, iref_cnt + 1, + iflags | flags, err, err_out, + &to_add); + } + + lpivot = ilast + 1; + } + + if (lpivot <= last) + MAKE_NODE_AND_APPEND(tmp, lpivot, last, 1, flags, err, err_out, + &to_add); + + list_for_each_entry_safe(interval, tmp, &intersection_set, link) { + usnic_uiom_interval_tree_remove(interval, root); + kfree(interval); + } + + list_for_each_entry(interval, &to_add, link) + usnic_uiom_interval_tree_insert(interval, root); + + return 0; + +err_out: + list_for_each_entry_safe(interval, tmp, &to_add, link) + kfree(interval); + + return err; +} + +void usnic_uiom_remove_interval(struct rb_root *root, unsigned long start, + unsigned long last, struct list_head *removed) +{ + struct usnic_uiom_interval_node *interval; + + for (interval = usnic_uiom_interval_tree_iter_first(root, start, last); + interval; + interval = usnic_uiom_interval_tree_iter_next(interval, + start, + last)) { + if (--interval->ref_cnt == 0) + list_add_tail(&interval->link, removed); + } + + list_for_each_entry(interval, removed, link) + usnic_uiom_interval_tree_remove(interval, root); +} + +INTERVAL_TREE_DEFINE(struct usnic_uiom_interval_node, rb, + unsigned long, __subtree_last, + START, LAST, , usnic_uiom_interval_tree) diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h new file mode 100644 index 00000000000..d4f752e258f --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_UIOM_INTERVAL_TREE_H_ +#define USNIC_UIOM_INTERVAL_TREE_H_ + +#include <linux/rbtree.h> + +struct usnic_uiom_interval_node { + struct rb_node rb; + struct list_head link; + unsigned long start; + unsigned long last; + unsigned long __subtree_last; + unsigned int ref_cnt; + int flags; +}; + +extern void +usnic_uiom_interval_tree_insert(struct usnic_uiom_interval_node *node, + struct rb_root *root); +extern void +usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node, + struct rb_root *root); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_first(struct rb_root *root, + unsigned long start, + unsigned long last); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_next(struct usnic_uiom_interval_node *node, + unsigned long start, unsigned long last); +/* + * Inserts {start...last} into {root}. If there are overlaps, + * nodes will be broken up and merged + */ +int usnic_uiom_insert_interval(struct rb_root *root, + unsigned long start, unsigned long last, + int flags); +/* + * Removed {start...last} from {root}. The nodes removed are returned in + * 'removed.' The caller is responsibile for freeing memory of nodes in + * 'removed.' + */ +void usnic_uiom_remove_interval(struct rb_root *root, + unsigned long start, unsigned long last, + struct list_head *removed); +/* + * Returns {start...last} - {root} (relative complement of {start...last} in + * {root}) in diff_set sorted ascendingly + */ +int usnic_uiom_get_intervals_diff(unsigned long start, + unsigned long last, int flags, + int flag_mask, + struct rb_root *root, + struct list_head *diff_set); +/* Call this to free diff_set returned by usnic_uiom_get_intervals_diff */ +void usnic_uiom_put_interval_set(struct list_head *intervals); +#endif /* USNIC_UIOM_INTERVAL_TREE_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.c b/drivers/infiniband/hw/usnic/usnic_vnic.c new file mode 100644 index 00000000000..656b88c39ed --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_vnic.c @@ -0,0 +1,467 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include "usnic_ib.h" +#include "vnic_resource.h" +#include "usnic_log.h" +#include "usnic_vnic.h" + +struct usnic_vnic { + struct vnic_dev *vdev; + struct vnic_dev_bar bar[PCI_NUM_RESOURCES]; + struct usnic_vnic_res_chunk chunks[USNIC_VNIC_RES_TYPE_MAX]; + spinlock_t res_lock; +}; + +static enum vnic_res_type _to_vnic_res_type(enum usnic_vnic_res_type res_type) +{ +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + vnic_res_type, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + vnic_res_type, + static enum vnic_res_type usnic_vnic_type_2_vnic_type[] = { + USNIC_VNIC_RES_TYPES}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + + if (res_type >= USNIC_VNIC_RES_TYPE_MAX) + return RES_TYPE_MAX; + + return usnic_vnic_type_2_vnic_type[res_type]; +} + +const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type) +{ +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + desc, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + desc, + static const char * const usnic_vnic_res_type_desc[] = { + USNIC_VNIC_RES_TYPES}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + + if (res_type >= USNIC_VNIC_RES_TYPE_MAX) + return "unknown"; + + return usnic_vnic_res_type_desc[res_type]; + +} + +const char *usnic_vnic_pci_name(struct usnic_vnic *vnic) +{ + return pci_name(usnic_vnic_get_pdev(vnic)); +} + +int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, + int buf_sz, + void *hdr_obj, + int (*printtitle)(void *, char*, int), + int (*printcols)(char *, int), + int (*printrow)(void *, char *, int)) +{ + struct usnic_vnic_res_chunk *chunk; + struct usnic_vnic_res *res; + struct vnic_dev_bar *bar0; + int i, j, offset; + + offset = 0; + bar0 = usnic_vnic_get_bar(vnic, 0); + offset += scnprintf(buf + offset, buf_sz - offset, + "VF:%hu BAR0 bus_addr=%pa vaddr=0x%p size=%ld ", + usnic_vnic_get_index(vnic), + &bar0->bus_addr, + bar0->vaddr, bar0->len); + if (printtitle) + offset += printtitle(hdr_obj, buf + offset, buf_sz - offset); + offset += scnprintf(buf + offset, buf_sz - offset, "\n"); + offset += scnprintf(buf + offset, buf_sz - offset, + "|RES\t|CTRL_PIN\t\t|IN_USE\t"); + if (printcols) + offset += printcols(buf + offset, buf_sz - offset); + offset += scnprintf(buf + offset, buf_sz - offset, "\n"); + + spin_lock(&vnic->res_lock); + for (i = 0; i < ARRAY_SIZE(vnic->chunks); i++) { + chunk = &vnic->chunks[i]; + for (j = 0; j < chunk->cnt; j++) { + res = chunk->res[j]; + offset += scnprintf(buf + offset, buf_sz - offset, + "|%s[%u]\t|0x%p\t|%u\t", + usnic_vnic_res_type_to_str(res->type), + res->vnic_idx, res->ctrl, !!res->owner); + if (printrow) { + offset += printrow(res->owner, buf + offset, + buf_sz - offset); + } + offset += scnprintf(buf + offset, buf_sz - offset, + "\n"); + } + } + spin_unlock(&vnic->res_lock); + return offset; +} + +void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec, + enum usnic_vnic_res_type trgt_type, + u16 cnt) +{ + int i; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + if (spec->resources[i].type == trgt_type) { + spec->resources[i].cnt = cnt; + return; + } + } + + WARN_ON(1); +} + +int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec, + struct usnic_vnic_res_spec *res_spec) +{ + int found, i, j; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + found = 0; + + for (j = 0; j < USNIC_VNIC_RES_TYPE_MAX; j++) { + if (res_spec->resources[i].type != + min_spec->resources[i].type) + continue; + found = 1; + if (min_spec->resources[i].cnt > + res_spec->resources[i].cnt) + return -EINVAL; + break; + } + + if (!found) + return -EINVAL; + } + return 0; +} + +int usnic_vnic_spec_dump(char *buf, int buf_sz, + struct usnic_vnic_res_spec *res_spec) +{ + enum usnic_vnic_res_type res_type; + int res_cnt; + int i; + int offset = 0; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + offset += scnprintf(buf + offset, buf_sz - offset, + "Res: %s Cnt: %d ", + usnic_vnic_res_type_to_str(res_type), + res_cnt); + } + + return offset; +} + +int usnic_vnic_check_room(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec) +{ + int i; + enum usnic_vnic_res_type res_type; + int res_cnt; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + + if (res_type == USNIC_VNIC_RES_TYPE_EOL) + break; + + if (res_cnt > usnic_vnic_res_free_cnt(vnic, res_type)) + return -EBUSY; + } + + return 0; +} + +int usnic_vnic_res_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type) +{ + return vnic->chunks[type].cnt; +} + +int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type) +{ + return vnic->chunks[type].free_cnt; +} + +struct usnic_vnic_res_chunk * +usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type, + int cnt, void *owner) +{ + struct usnic_vnic_res_chunk *src, *ret; + struct usnic_vnic_res *res; + int i; + + if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 1 || !owner) + return ERR_PTR(-EINVAL); + + ret = kzalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) { + usnic_err("Failed to allocate chunk for %s - Out of memory\n", + usnic_vnic_pci_name(vnic)); + return ERR_PTR(-ENOMEM); + } + + ret->res = kzalloc(sizeof(*(ret->res))*cnt, GFP_ATOMIC); + if (!ret->res) { + usnic_err("Failed to allocate resources for %s. Out of memory\n", + usnic_vnic_pci_name(vnic)); + kfree(ret); + return ERR_PTR(-ENOMEM); + } + + spin_lock(&vnic->res_lock); + src = &vnic->chunks[type]; + for (i = 0; i < src->cnt && ret->cnt < cnt; i++) { + res = src->res[i]; + if (!res->owner) { + src->free_cnt--; + res->owner = owner; + ret->res[ret->cnt++] = res; + } + } + + spin_unlock(&vnic->res_lock); + ret->type = type; + ret->vnic = vnic; + WARN_ON(ret->cnt != cnt); + + return ret; +} + +void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk) +{ + + struct usnic_vnic_res *res; + int i; + struct usnic_vnic *vnic = chunk->vnic; + + spin_lock(&vnic->res_lock); + while ((i = --chunk->cnt) >= 0) { + res = chunk->res[i]; + chunk->res[i] = NULL; + res->owner = NULL; + vnic->chunks[res->type].free_cnt++; + } + spin_unlock(&vnic->res_lock); + + kfree(chunk->res); + kfree(chunk); +} + +u16 usnic_vnic_get_index(struct usnic_vnic *vnic) +{ + return usnic_vnic_get_pdev(vnic)->devfn - 1; +} + +static int usnic_vnic_alloc_res_chunk(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type, + struct usnic_vnic_res_chunk *chunk) +{ + int cnt, err, i; + struct usnic_vnic_res *res; + + cnt = vnic_dev_get_res_count(vnic->vdev, _to_vnic_res_type(type)); + if (cnt < 1) + return -EINVAL; + + chunk->cnt = chunk->free_cnt = cnt; + chunk->res = kzalloc(sizeof(*(chunk->res))*cnt, GFP_KERNEL); + if (!chunk->res) + return -ENOMEM; + + for (i = 0; i < cnt; i++) { + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) { + err = -ENOMEM; + goto fail; + } + res->type = type; + res->vnic_idx = i; + res->vnic = vnic; + res->ctrl = vnic_dev_get_res(vnic->vdev, + _to_vnic_res_type(type), i); + chunk->res[i] = res; + } + + chunk->vnic = vnic; + return 0; +fail: + for (i--; i >= 0; i--) + kfree(chunk->res[i]); + kfree(chunk->res); + return err; +} + +static void usnic_vnic_free_res_chunk(struct usnic_vnic_res_chunk *chunk) +{ + int i; + for (i = 0; i < chunk->cnt; i++) + kfree(chunk->res[i]); + kfree(chunk->res); +} + +static int usnic_vnic_discover_resources(struct pci_dev *pdev, + struct usnic_vnic *vnic) +{ + enum usnic_vnic_res_type res_type; + int i; + int err = 0; + + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + vnic->bar[i].len = pci_resource_len(pdev, i); + vnic->bar[i].vaddr = pci_iomap(pdev, i, vnic->bar[i].len); + if (!vnic->bar[i].vaddr) { + usnic_err("Cannot memory-map BAR %d, aborting\n", + i); + err = -ENODEV; + goto out_clean_bar; + } + vnic->bar[i].bus_addr = pci_resource_start(pdev, i); + } + + vnic->vdev = vnic_dev_register(NULL, pdev, pdev, vnic->bar, + ARRAY_SIZE(vnic->bar)); + if (!vnic->vdev) { + usnic_err("Failed to register device %s\n", + pci_name(pdev)); + err = -EINVAL; + goto out_clean_bar; + } + + for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1; + res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) { + err = usnic_vnic_alloc_res_chunk(vnic, res_type, + &vnic->chunks[res_type]); + if (err) { + usnic_err("Failed to alloc res %s with err %d\n", + usnic_vnic_res_type_to_str(res_type), + err); + goto out_clean_chunks; + } + } + + return 0; + +out_clean_chunks: + for (res_type--; res_type > USNIC_VNIC_RES_TYPE_EOL; res_type--) + usnic_vnic_free_res_chunk(&vnic->chunks[res_type]); + vnic_dev_unregister(vnic->vdev); +out_clean_bar: + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + if (!vnic->bar[i].vaddr) + break; + + iounmap(vnic->bar[i].vaddr); + } + + return err; +} + +struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic) +{ + return vnic_dev_get_pdev(vnic->vdev); +} + +struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic, + int bar_num) +{ + return (bar_num < ARRAY_SIZE(vnic->bar)) ? &vnic->bar[bar_num] : NULL; +} + +static void usnic_vnic_release_resources(struct usnic_vnic *vnic) +{ + int i; + struct pci_dev *pdev; + enum usnic_vnic_res_type res_type; + + pdev = usnic_vnic_get_pdev(vnic); + + for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1; + res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) + usnic_vnic_free_res_chunk(&vnic->chunks[res_type]); + + vnic_dev_unregister(vnic->vdev); + + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + iounmap(vnic->bar[i].vaddr); + } +} + +struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev) +{ + struct usnic_vnic *vnic; + int err = 0; + + if (!pci_is_enabled(pdev)) { + usnic_err("PCI dev %s is disabled\n", pci_name(pdev)); + return ERR_PTR(-EINVAL); + } + + vnic = kzalloc(sizeof(*vnic), GFP_KERNEL); + if (!vnic) { + usnic_err("Failed to alloc vnic for %s - out of memory\n", + pci_name(pdev)); + return ERR_PTR(-ENOMEM); + } + + spin_lock_init(&vnic->res_lock); + + err = usnic_vnic_discover_resources(pdev, vnic); + if (err) { + usnic_err("Failed to discover %s resources with err %d\n", + pci_name(pdev), err); + goto out_free_vnic; + } + + usnic_dbg("Allocated vnic for %s\n", usnic_vnic_pci_name(vnic)); + + return vnic; + +out_free_vnic: + kfree(vnic); + + return ERR_PTR(err); +} + +void usnic_vnic_free(struct usnic_vnic *vnic) +{ + usnic_vnic_release_resources(vnic); + kfree(vnic); +} diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.h b/drivers/infiniband/hw/usnic/usnic_vnic.h new file mode 100644 index 00000000000..14d931a8829 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_vnic.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_VNIC_H_ +#define USNIC_VNIC_H_ + +#include <linux/pci.h> + +#include "vnic_dev.h" + +/* =USNIC_VNIC_RES_TYPE= =VNIC_RES= =DESC= */ +#define USNIC_VNIC_RES_TYPES \ + DEFINE_USNIC_VNIC_RES_AT(EOL, RES_TYPE_EOL, "EOL", 0) \ + DEFINE_USNIC_VNIC_RES(WQ, RES_TYPE_WQ, "WQ") \ + DEFINE_USNIC_VNIC_RES(RQ, RES_TYPE_RQ, "RQ") \ + DEFINE_USNIC_VNIC_RES(CQ, RES_TYPE_CQ, "CQ") \ + DEFINE_USNIC_VNIC_RES(INTR, RES_TYPE_INTR_CTRL, "INT") \ + DEFINE_USNIC_VNIC_RES(MAX, RES_TYPE_MAX, "MAX")\ + +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t = val, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t, +enum usnic_vnic_res_type { + USNIC_VNIC_RES_TYPES +}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + +struct usnic_vnic_res { + enum usnic_vnic_res_type type; + unsigned int vnic_idx; + struct usnic_vnic *vnic; + void __iomem *ctrl; + void *owner; +}; + +struct usnic_vnic_res_chunk { + enum usnic_vnic_res_type type; + int cnt; + int free_cnt; + struct usnic_vnic_res **res; + struct usnic_vnic *vnic; +}; + +struct usnic_vnic_res_desc { + enum usnic_vnic_res_type type; + uint16_t cnt; +}; + +struct usnic_vnic_res_spec { + struct usnic_vnic_res_desc resources[USNIC_VNIC_RES_TYPE_MAX]; +}; + +const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type); +const char *usnic_vnic_pci_name(struct usnic_vnic *vnic); +int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, int buf_sz, + void *hdr_obj, + int (*printtitle)(void *, char*, int), + int (*printcols)(char *, int), + int (*printrow)(void *, char *, int)); +void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec, + enum usnic_vnic_res_type trgt_type, + u16 cnt); +int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_spec_dump(char *buf, int buf_sz, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_check_room(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_res_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type); +int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type); +struct usnic_vnic_res_chunk * +usnic_vnic_get_resources(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type, + int cnt, + void *owner); +void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk); +struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic); +struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic, + int bar_num); +struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev); +void usnic_vnic_free(struct usnic_vnic *vnic); +u16 usnic_vnic_get_index(struct usnic_vnic *vnic); + +#endif /*!USNIC_VNIC_H_*/ diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile new file mode 100644 index 00000000000..f3c7dcf0309 --- /dev/null +++ b/drivers/infiniband/ulp/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_INFINIBAND_IPOIB) += ipoib/ +obj-$(CONFIG_INFINIBAND_SRP) += srp/ +obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ +obj-$(CONFIG_INFINIBAND_ISER) += iser/ +obj-$(CONFIG_INFINIBAND_ISERT) += isert/ diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 07ca6fd5546..c639f90cfda 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -101,6 +101,7 @@ enum { IPOIB_MCAST_FLAG_SENDONLY = 1, IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, + IPOIB_MCAST_JOIN_STARTED = 4, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, @@ -117,6 +118,8 @@ enum { #define IPOIB_OP_CM (0) #endif +#define IPOIB_QPN_MASK ((__force u32) cpu_to_be32(0xFFFFFF)) + /* structs */ struct ipoib_header { @@ -149,6 +152,7 @@ struct ipoib_mcast { struct sk_buff_head pkt_queue; struct net_device *dev; + struct completion done; }; struct ipoib_rx_buf { @@ -297,7 +301,7 @@ struct ipoib_dev_priv { unsigned long flags; - struct mutex vlan_mutex; + struct rw_semaphore vlan_rwsem; struct rb_root path_tree; struct list_head path_list; @@ -760,4 +764,6 @@ extern int ipoib_debug_level; #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) +extern const char ipoib_driver_version[]; + #endif /* _IPOIB_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 72ae63f0072..933efcea0d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -140,7 +140,8 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, struct ipoib_cm_rx_buf *rx_ring, int id, int frags, - u64 mapping[IPOIB_CM_RX_SG]) + u64 mapping[IPOIB_CM_RX_SG], + gfp_t gfp) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; @@ -164,7 +165,7 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, } for (i = 0; i < frags; i++) { - struct page *page = alloc_page(GFP_ATOMIC); + struct page *page = alloc_page(gfp); if (!page) goto partial_error; @@ -382,7 +383,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, - rx->rx_ring[i].mapping)) { + rx->rx_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); ret = -ENOMEM; goto err_count; @@ -460,7 +462,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even goto err_qp; } - psn = random32() & 0xffffff; + psn = prandom_u32() & 0xffffff; ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); if (ret) goto err_modify; @@ -639,7 +641,8 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; - newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping); + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, + mapping, GFP_ATOMIC); if (unlikely(!newskb)) { /* * If we can't allocate a new RX buffer, dump @@ -741,6 +744,9 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ tx_req->mapping = addr; + skb_orphan(skb); + skb_dst_drop(skb); + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), addr, skb->len); if (unlikely(rc)) { @@ -755,9 +761,13 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", tx->qp->qp_num); - if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) - ipoib_warn(priv, "request notify on send CQ failed\n"); netif_stop_queue(dev); + rc = ib_req_notify_cq(priv->send_cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc < 0) + ipoib_warn(priv, "request notify on send CQ failed\n"); + else if (rc) + ipoib_send_comp_handler(priv->send_cq, dev); } } } @@ -810,7 +820,6 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); ipoib_neigh_free(neigh); tx->neigh = NULL; @@ -1021,10 +1030,20 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ .cap.max_send_sge = 1, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, - .qp_context = tx + .qp_context = tx, + .create_flags = IB_QP_CREATE_USE_GFP_NOIO }; - return ib_create_qp(priv->pd, &attr); + struct ib_qp *tx_qp; + + tx_qp = ib_create_qp(priv->pd, &attr); + if (PTR_ERR(tx_qp) == -EINVAL) { + ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n", + priv->ca->name); + attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; + tx_qp = ib_create_qp(priv->pd, &attr); + } + return tx_qp; } static int ipoib_cm_send_req(struct net_device *dev, @@ -1095,12 +1114,14 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, struct ipoib_dev_priv *priv = netdev_priv(p->dev); int ret; - p->tx_ring = vzalloc(ipoib_sendq_size * sizeof *p->tx_ring); + p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring, + GFP_NOIO, PAGE_KERNEL); if (!p->tx_ring) { ipoib_warn(priv, "failed to allocate tx ring\n"); ret = -ENOMEM; goto err_tx; } + memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); p->qp = ipoib_cm_create_tx_qp(p->dev, p); if (IS_ERR(p->qp)) { @@ -1227,7 +1248,6 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); ipoib_neigh_free(neigh); tx->neigh = NULL; @@ -1318,7 +1338,6 @@ static void ipoib_cm_tx_start(struct work_struct *work) neigh = p->neigh; if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); ipoib_neigh_free(neigh); } list_del(&p->list); @@ -1552,7 +1571,8 @@ int ipoib_cm_dev_init(struct net_device *dev) for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, priv->cm.num_frags - 1, - priv->cm.srq_ring[i].mapping)) { + priv->cm.srq_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate " "receive buffer %d\n", i); ipoib_cm_dev_cleanup(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 29bc7b5724a..078cadd6c79 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -39,7 +39,24 @@ static void ipoib_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { - strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1); + struct ipoib_dev_priv *priv = netdev_priv(netdev); + struct ib_device_attr *attr; + + attr = kmalloc(sizeof(*attr), GFP_KERNEL); + if (attr && !ib_query_device(priv->ca, attr)) + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", (int)(attr->fw_ver >> 32), + (int)(attr->fw_ver >> 16) & 0xffff, + (int)attr->fw_ver & 0xffff); + kfree(attr); + + strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device), + sizeof(drvinfo->bus_info)); + + strlcpy(drvinfo->version, ipoib_driver_version, + sizeof(drvinfo->version)); + + strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver)); } static int ipoib_get_coalesce(struct net_device *dev, @@ -88,5 +105,5 @@ static const struct ethtool_ops ipoib_ethtool_ops = { void ipoib_set_ethtool_ops(struct net_device *dev) { - SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops); + dev->ethtool_ops = &ipoib_ethtool_ops; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index f10221f4080..6a7003ddb0b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -600,6 +600,9 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, netif_stop_queue(dev); } + skb_orphan(skb); + skb_dst_drop(skb); + rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn, tx_req, phead, hlen); if (unlikely(rc)) { @@ -615,8 +618,6 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, address->last_send = priv->tx_head; ++priv->tx_head; - skb_orphan(skb); - } if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) @@ -684,15 +685,13 @@ int ipoib_ib_dev_open(struct net_device *dev) ret = ipoib_ib_post_receives(dev); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } ret = ipoib_cm_dev_open(dev); if (ret) { ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); @@ -703,6 +702,11 @@ int ipoib_ib_dev_open(struct net_device *dev) napi_enable(&priv->napi); return 0; +dev_stop: + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); + ipoib_ib_dev_stop(dev, 1); + return -1; } static void ipoib_pkey_dev_check_presence(struct net_device *dev) @@ -745,10 +749,8 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); set_bit(IPOIB_PKEY_STOP, &priv->flags); - cancel_delayed_work(&priv->pkey_poll_task); + cancel_delayed_work_sync(&priv->pkey_poll_task); mutex_unlock(&pkey_mutex); - if (flush) - flush_workqueue(ipoib_workqueue); } ipoib_mcast_stop_thread(dev, flush); @@ -931,14 +933,49 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) return 0; } +/* + * Takes whatever value which is in pkey index 0 and updates priv->pkey + * returns 0 if the pkey value was changed. + */ +static inline int update_parent_pkey(struct ipoib_dev_priv *priv) +{ + int result; + u16 prev_pkey; + + prev_pkey = priv->pkey; + result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); + if (result) { + ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n", + priv->port, result); + return result; + } + + priv->pkey |= 0x8000; + + if (prev_pkey != priv->pkey) { + ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n", + prev_pkey, priv->pkey); + /* + * Update the pkey in the broadcast address, while making sure to set + * the full membership bit, so that we join the right broadcast group. + */ + priv->dev->broadcast[8] = priv->pkey >> 8; + priv->dev->broadcast[9] = priv->pkey & 0xff; + return 0; + } + + return 1; +} + static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, enum ipoib_flush_level level) { struct ipoib_dev_priv *cpriv; struct net_device *dev = priv->dev; u16 new_index; + int result; - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); /* * Flush any child interfaces too -- they might be up even if @@ -947,9 +984,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, list_for_each_entry(cpriv, &priv->child_intfs, list) __ipoib_ib_dev_flush(cpriv, level); - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { + /* for non-child devices must check/update the pkey value here */ + if (level == IPOIB_FLUSH_HEAVY && + !test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + update_parent_pkey(priv); ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } @@ -960,21 +1001,32 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, } if (level == IPOIB_FLUSH_HEAVY) { - if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { - clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - ipoib_ib_dev_down(dev, 0); - ipoib_ib_dev_stop(dev, 0); - if (ipoib_pkey_dev_delay_open(dev)) + /* child devices chase their origin pkey value, while non-child + * (parent) devices should always takes what present in pkey index 0 + */ + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_stop(dev, 0); + if (ipoib_pkey_dev_delay_open(dev)) + return; + } + /* restart QP only if P_Key index is changed */ + if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && + new_index == priv->pkey_index) { + ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); return; + } + priv->pkey_index = new_index; + } else { + result = update_parent_pkey(priv); + /* restart QP only if P_Key value changed */ + if (result) { + ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n"); + return; + } } - - /* restart QP only if P_Key index is changed */ - if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && - new_index == priv->pkey_index) { - ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); - return; - } - priv->pkey_index = new_index; } if (level == IPOIB_FLUSH_LIGHT) { @@ -1030,6 +1082,11 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_dbg(priv, "cleaning up ib_dev\n"); + /* + * We must make sure there are no more (path) completions + * that may wish to touch priv fields that are no longer valid + */ + ipoib_flush_paths(dev); ipoib_mcast_stop_thread(dev, 1); ipoib_mcast_dev_flush(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 6fdc9e78da0..5786a78ff8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -49,9 +49,14 @@ #include <linux/jhash.h> #include <net/arp.h> +#define DRV_VERSION "1.0.0" + +const char ipoib_driver_version[] = DRV_VERSION; + MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; @@ -99,6 +104,8 @@ int ipoib_open(struct net_device *dev) ipoib_dbg(priv, "bringing up interface\n"); + netif_carrier_off(dev); + set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(dev)) @@ -114,7 +121,7 @@ int ipoib_open(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -124,7 +131,7 @@ int ipoib_open(struct net_device *dev) dev_change_flags(cpriv->dev, flags | IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } netif_start_queue(dev); @@ -157,7 +164,7 @@ static int ipoib_stop(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -167,7 +174,7 @@ static int ipoib_stop(struct net_device *dev) dev_change_flags(cpriv->dev, flags & ~IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } return 0; @@ -488,7 +495,6 @@ static void path_rec_completion(int status, path, neigh)); if (!ipoib_cm_get(neigh)) { - list_del(&neigh->list); ipoib_neigh_free(neigh); continue; } @@ -505,6 +511,9 @@ static void path_rec_completion(int status, spin_unlock_irqrestore(&priv->lock, flags); + if (IS_ERR_OR_NULL(ah)) + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); + if (old_ah) ipoib_put_ah(old_ah); @@ -610,7 +619,6 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { - list_del(&neigh->list); ipoib_neigh_free(neigh); goto err_drop; } @@ -631,7 +639,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, neigh->ah = NULL; if (!path->query && path_rec_start(dev, path)) - goto err_list; + goto err_path; __skb_queue_tail(&neigh->queue, skb); } @@ -640,9 +648,6 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, ipoib_neigh_put(neigh); return; -err_list: - list_del(&neigh->list); - err_path: ipoib_neigh_free(neigh); err_drop: @@ -722,7 +727,8 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) if ((header->proto != htons(ETH_P_IP)) && (header->proto != htons(ETH_P_IPV6)) && (header->proto != htons(ETH_P_ARP)) && - (header->proto != htons(ETH_P_RARP))) { + (header->proto != htons(ETH_P_RARP)) && + (header->proto != htons(ETH_P_TIPC))) { /* ethertype not supported by IPoIB */ ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); @@ -743,6 +749,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) switch (header->proto) { case htons(ETH_P_IP): case htons(ETH_P_IPV6): + case htons(ETH_P_TIPC): neigh = ipoib_neigh_get(dev, cb->hwaddr); if (unlikely(!neigh)) { neigh_add_path(skb, cb->hwaddr, dev); @@ -820,7 +827,7 @@ static int ipoib_hard_header(struct sk_buff *skb, */ memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); - return 0; + return sizeof *header; } static void ipoib_set_mcast_list(struct net_device *dev) @@ -844,10 +851,10 @@ static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) * different subnets. */ /* qpn octets[1:4) & port GUID octets[12:20) */ - u32 *daddr_32 = (u32 *) daddr; + u32 *d32 = (u32 *) daddr; u32 hv; - hv = jhash_3words(daddr_32[3], daddr_32[4], 0xFFFFFF & daddr_32[0], 0); + hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); return hv & htbl->mask; } @@ -1088,6 +1095,8 @@ void ipoib_neigh_free(struct ipoib_neigh *neigh) rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); return; } else { @@ -1343,7 +1352,7 @@ void ipoib_setup(struct net_device *dev) ipoib_set_ethtool_ops(dev); - netif_napi_add(dev, &priv->napi, ipoib_poll, 100); + netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT); dev->watchdog_timeo = HZ; @@ -1359,13 +1368,11 @@ void ipoib_setup(struct net_device *dev) memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); - netif_carrier_off(dev); - priv->dev = dev; spin_lock_init(&priv->lock); - mutex_init(&priv->vlan_mutex); + init_rwsem(&priv->vlan_rwsem); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); @@ -1451,7 +1458,7 @@ static ssize_t create_child(struct device *dev, if (sscanf(buf, "%i", &pkey) != 1) return -EINVAL; - if (pkey < 0 || pkey > 0xffff) + if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) return -EINVAL; /* @@ -1688,6 +1695,8 @@ static void ipoib_remove_one(struct ib_device *device) return; dev_list = ib_get_client_data(device, &ipoib_client); + if (!dev_list) + return; list_for_each_entry_safe(priv, tmp, dev_list, list) { ib_unregister_event_handler(&priv->event_handler); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index cecb98a4c66..d4e005720d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -386,8 +386,10 @@ static int ipoib_mcast_join_complete(int status, mcast->mcmember.mgid.raw, status); /* We trap for port events ourselves. */ - if (status == -ENETRESET) - return 0; + if (status == -ENETRESET) { + status = 0; + goto out; + } if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); @@ -407,7 +409,8 @@ static int ipoib_mcast_join_complete(int status, if (mcast == priv->broadcast) queue_work(ipoib_workqueue, &priv->carrier_on_task); - return 0; + status = 0; + goto out; } if (mcast->logcount++ < 20) { @@ -434,7 +437,8 @@ static int ipoib_mcast_join_complete(int status, mcast->backoff * HZ); spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); - +out: + complete(&mcast->done); return status; } @@ -484,11 +488,15 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, } set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); if (IS_ERR(mcast->mc)) { clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + complete(&mcast->done); ret = PTR_ERR(mcast->mc); ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); @@ -510,10 +518,18 @@ void ipoib_mcast_join_task(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, mcast_task.work); struct net_device *dev = priv->dev; + struct ib_port_attr port_attr; if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; + if (ib_query_port(priv->ca, priv->port, &port_attr) || + port_attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n", + port_attr.state); + return; + } + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) ipoib_warn(priv, "ib_query_gid() failed\n"); else @@ -751,6 +767,11 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); + /* seperate between the wait to the leave*/ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(dev, mcast); ipoib_mcast_free(mcast); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index 74685936c94..cdc7df4fdb8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -31,6 +31,7 @@ */ #include <linux/netdevice.h> +#include <linux/if_arp.h> /* For ARPHRD_xxx */ #include <linux/module.h> #include <net/rtnetlink.h> #include "ipoib.h" @@ -103,7 +104,7 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, return -EINVAL; pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); - if (!pdev) + if (!pdev || pdev->type != ARPHRD_INFINIBAND) return -ENODEV; ppriv = netdev_priv(pdev); @@ -119,6 +120,15 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, } else child_pkey = nla_get_u16(data[IFLA_IPOIB_PKEY]); + if (child_pkey == 0 || child_pkey == 0x8000) + return -EINVAL; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + child_pkey |= 0x8000; + err = __ipoib_vlan_add(ppriv, netdev_priv(dev), child_pkey, IPOIB_RTNL_CHILD); if (!err && data) @@ -133,10 +143,10 @@ static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head priv = netdev_priv(dev); ppriv = netdev_priv(priv->parent); - mutex_lock(&ppriv->vlan_mutex); + down_write(&ppriv->vlan_rwsem); unregister_netdevice_queue(dev, head); list_del(&priv->list); - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); } static size_t ipoib_get_size(const struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 049a997caff..c56d5d44c53 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -192,6 +192,9 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) + init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; + if (dev->features & NETIF_F_SG) init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 8292554bccb..9fad7b5ac8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -140,7 +140,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) if (!rtnl_trylock()) return restart_syscall(); - mutex_lock(&ppriv->vlan_mutex); + down_write(&ppriv->vlan_rwsem); /* * First ensure this isn't a duplicate. We check the parent device and @@ -163,7 +163,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); out: - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); if (result) free_netdev(priv->dev); @@ -185,7 +185,8 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) if (!rtnl_trylock()) return restart_syscall(); - mutex_lock(&ppriv->vlan_mutex); + + down_write(&ppriv->vlan_rwsem); list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { @@ -195,7 +196,8 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) break; } } - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); + rtnl_unlock(); if (dev) { diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 0ab8c9cc3a7..eb7973957a6 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -5,6 +5,7 @@ * Copyright (C) 2004 Alex Aizman * Copyright (C) 2005 Mike Christie * Copyright (c) 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. * maintained by openib-general@openib.org * * This software is available to you under a choice of one of two @@ -81,15 +82,24 @@ static unsigned int iscsi_max_lun = 512; module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO); int iser_debug_level = 0; +bool iser_pi_enable = false; +int iser_pi_guard = 0; -MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover " - "v" DRV_VER " (" DRV_DATE ")"); +MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); +MODULE_VERSION(DRV_VER); module_param_named(debug_level, iser_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)"); +module_param_named(pi_enable, iser_pi_enable, bool, 0644); +MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); + +module_param_named(pi_guard, iser_pi_guard, int, 0644); +MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:CRC)"); + +static struct workqueue_struct *release_wq; struct iser_global ig; void @@ -137,8 +147,8 @@ static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode) int iser_initialize_task_headers(struct iscsi_task *task, struct iser_tx_desc *tx_desc) { - struct iscsi_iser_conn *iser_conn = task->conn->dd_data; - struct iser_device *device = iser_conn->ib_conn->device; + struct iser_conn *ib_conn = task->conn->dd_data; + struct iser_device *device = ib_conn->device; struct iscsi_iser_task *iser_task = task->dd_data; u64 dma_addr; @@ -152,7 +162,7 @@ int iser_initialize_task_headers(struct iscsi_task *task, tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; tx_desc->tx_sg[0].lkey = device->mr->lkey; - iser_task->iser_conn = iser_conn; + iser_task->ib_conn = ib_conn; return 0; } /** @@ -175,6 +185,8 @@ iscsi_iser_task_init(struct iscsi_task *task) iser_task->command_sent = 0; iser_task_rdma_init(iser_task); + iser_task->sc = task->sc; + return 0; } @@ -277,10 +289,9 @@ iscsi_iser_task_xmit(struct iscsi_task *task) static void iscsi_iser_cleanup_task(struct iscsi_task *task) { struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_tx_desc *tx_desc = &iser_task->desc; - - struct iscsi_iser_conn *iser_conn = task->conn->dd_data; - struct iser_device *device = iser_conn->ib_conn->device; + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct iser_conn *ib_conn = task->conn->dd_data; + struct iser_device *device = ib_conn->device; ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); @@ -295,14 +306,25 @@ static void iscsi_iser_cleanup_task(struct iscsi_task *task) } } +static u8 iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + + if (iser_task->dir[ISER_DIR_IN]) + return iser_check_task_pi_status(iser_task, ISER_DIR_IN, + sector); + else + return iser_check_task_pi_status(iser_task, ISER_DIR_OUT, + sector); +} + static struct iscsi_cls_conn * iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx) { struct iscsi_conn *conn; struct iscsi_cls_conn *cls_conn; - struct iscsi_iser_conn *iser_conn; - cls_conn = iscsi_conn_setup(cls_session, sizeof(*iser_conn), conn_idx); + cls_conn = iscsi_conn_setup(cls_session, 0, conn_idx); if (!cls_conn) return NULL; conn = cls_conn->dd_data; @@ -313,39 +335,16 @@ iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx) */ conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN; - iser_conn = conn->dd_data; - conn->dd_data = iser_conn; - iser_conn->iscsi_conn = conn; - return cls_conn; } -static void -iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn) -{ - struct iscsi_conn *conn = cls_conn->dd_data; - struct iscsi_iser_conn *iser_conn = conn->dd_data; - struct iser_conn *ib_conn = iser_conn->ib_conn; - - iscsi_conn_teardown(cls_conn); - /* - * Userspace will normally call the stop callback and - * already have freed the ib_conn, but if it goofed up then - * we free it here. - */ - if (ib_conn) { - ib_conn->iser_conn = NULL; - iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ - } -} - static int iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, struct iscsi_cls_conn *cls_conn, uint64_t transport_eph, int is_leading) { struct iscsi_conn *conn = cls_conn->dd_data; - struct iscsi_iser_conn *iser_conn; + struct iscsi_session *session; struct iser_conn *ib_conn; struct iscsi_endpoint *ep; int error; @@ -364,41 +363,51 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, } ib_conn = ep->dd_data; - if (iser_alloc_rx_descriptors(ib_conn)) + session = conn->session; + if (iser_alloc_rx_descriptors(ib_conn, session)) return -ENOMEM; /* binds the iSER connection retrieved from the previously * connected ep_handle to the iSCSI layer connection. exchanges * connection pointers */ - iser_err("binding iscsi/iser conn %p %p to ib_conn %p\n", - conn, conn->dd_data, ib_conn); - iser_conn = conn->dd_data; - ib_conn->iser_conn = iser_conn; - iser_conn->ib_conn = ib_conn; - iser_conn_get(ib_conn); /* ref iscsi/ib conn binding */ + iser_info("binding iscsi conn %p to ib_conn %p\n", conn, ib_conn); + + conn->dd_data = ib_conn; + ib_conn->iscsi_conn = conn; + return 0; } +static int +iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *iscsi_conn; + struct iser_conn *ib_conn; + + iscsi_conn = cls_conn->dd_data; + ib_conn = iscsi_conn->dd_data; + reinit_completion(&ib_conn->stop_completion); + + return iscsi_conn_start(cls_conn); +} + static void iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) { struct iscsi_conn *conn = cls_conn->dd_data; - struct iscsi_iser_conn *iser_conn = conn->dd_data; - struct iser_conn *ib_conn = iser_conn->ib_conn; + struct iser_conn *ib_conn = conn->dd_data; + + iser_dbg("stopping iscsi_conn: %p, ib_conn: %p\n", conn, ib_conn); + iscsi_conn_stop(cls_conn, flag); /* * Userspace may have goofed up and not bound the connection or * might have only partially setup the connection. */ if (ib_conn) { - iscsi_conn_stop(cls_conn, flag); - /* - * There is no unbind event so the stop callback - * must release the ref from the bind. - */ - iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ + conn->dd_data = NULL; + complete(&ib_conn->stop_completion); } - iser_conn->ib_conn = NULL; } static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) @@ -410,6 +419,17 @@ static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) iscsi_host_free(shost); } +static inline unsigned int +iser_dif_prot_caps(int prot_caps) +{ + return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? SHOST_DIF_TYPE1_PROTECTION | + SHOST_DIX_TYPE1_PROTECTION : 0) | + ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? SHOST_DIF_TYPE2_PROTECTION | + SHOST_DIX_TYPE2_PROTECTION : 0) | + ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? SHOST_DIF_TYPE3_PROTECTION | + SHOST_DIX_TYPE3_PROTECTION : 0); +} + static struct iscsi_cls_session * iscsi_iser_session_create(struct iscsi_endpoint *ep, uint16_t cmds_max, uint16_t qdepth, @@ -418,12 +438,13 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, struct iscsi_cls_session *cls_session; struct iscsi_session *session; struct Scsi_Host *shost; - struct iser_conn *ib_conn; + struct iser_conn *ib_conn = NULL; shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); if (!shost) return NULL; shost->transportt = iscsi_iser_scsi_transport; + shost->cmd_per_lun = qdepth; shost->max_lun = iscsi_max_lun; shost->max_id = 0; shost->max_channel = 0; @@ -433,19 +454,31 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, * older userspace tools (before 2.0-870) did not pass us * the leading conn's ep so this will be NULL; */ - if (ep) + if (ep) { ib_conn = ep->dd_data; + if (ib_conn->pi_support) { + u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; + + scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); + if (iser_pi_guard) + scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); + else + scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); + } + } if (iscsi_host_add(shost, ep ? ib_conn->device->ib_device->dma_device : NULL)) goto free_host; - /* - * we do not support setting can_queue cmd_per_lun from userspace yet - * because we preallocate so many resources - */ + if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) { + iser_info("cmds_max changed from %u to %u\n", + cmds_max, ISER_DEF_XMIT_CMDS_MAX); + cmds_max = ISER_DEF_XMIT_CMDS_MAX; + } + cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, - ISCSI_DEF_XMIT_CMDS_MAX, 0, + cmds_max, 0, sizeof(struct iscsi_iser_task), initial_cmdsn, 0); if (!cls_session) @@ -475,28 +508,28 @@ iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn, case ISCSI_PARAM_HDRDGST_EN: sscanf(buf, "%d", &value); if (value) { - printk(KERN_ERR "DataDigest wasn't negotiated to None"); + iser_err("DataDigest wasn't negotiated to None\n"); return -EPROTO; } break; case ISCSI_PARAM_DATADGST_EN: sscanf(buf, "%d", &value); if (value) { - printk(KERN_ERR "DataDigest wasn't negotiated to None"); + iser_err("DataDigest wasn't negotiated to None\n"); return -EPROTO; } break; case ISCSI_PARAM_IFMARKER_EN: sscanf(buf, "%d", &value); if (value) { - printk(KERN_ERR "IFMarker wasn't negotiated to No"); + iser_err("IFMarker wasn't negotiated to No\n"); return -EPROTO; } break; case ISCSI_PARAM_OFMARKER_EN: sscanf(buf, "%d", &value); if (value) { - printk(KERN_ERR "OFMarker wasn't negotiated to No"); + iser_err("OFMarker wasn't negotiated to No\n"); return -EPROTO; } break; @@ -596,7 +629,7 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) ib_conn->state == ISER_CONN_DOWN)) rc = -1; - iser_err("ib conn %p rc = %d\n", ib_conn, rc); + iser_info("ib conn %p rc = %d\n", ib_conn, rc); if (rc > 0) return 1; /* success, this is the equivalent of POLLOUT */ @@ -612,19 +645,20 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) struct iser_conn *ib_conn; ib_conn = ep->dd_data; - if (ib_conn->iser_conn) - /* - * Must suspend xmit path if the ep is bound to the - * iscsi_conn, so we know we are not accessing the ib_conn - * when we free it. - * - * This may not be bound if the ep poll failed. - */ - iscsi_suspend_tx(ib_conn->iser_conn->iscsi_conn); - - - iser_err("ib conn %p state %d\n",ib_conn, ib_conn->state); + iser_info("ep %p ib conn %p state %d\n", ep, ib_conn, ib_conn->state); iser_conn_terminate(ib_conn); + + /* + * if iser_conn and iscsi_conn are bound, we must wait iscsi_conn_stop + * call and ISER_CONN_DOWN state before freeing the iser resources. + * otherwise we are safe to free resources immediately. + */ + if (ib_conn->iscsi_conn) { + INIT_WORK(&ib_conn->release_work, iser_release_work); + queue_work(release_wq, &ib_conn->release_work); + } else { + iser_conn_release(ib_conn); + } } static umode_t iser_attr_is_visible(int param_type, int param) @@ -671,6 +705,7 @@ static umode_t iser_attr_is_visible(int param_type, int param) case ISCSI_PARAM_TGT_RESET_TMO: case ISCSI_PARAM_IFACE_NAME: case ISCSI_PARAM_INITIATOR_NAME: + case ISCSI_PARAM_DISCOVERY_SESS: return S_IRUGO; default: return 0; @@ -682,7 +717,7 @@ static umode_t iser_attr_is_visible(int param_type, int param) static struct scsi_host_template iscsi_iser_sht = { .module = THIS_MODULE, - .name = "iSCSI Initiator over iSER, v." DRV_VER, + .name = "iSCSI Initiator over iSER", .queuecommand = iscsi_queuecommand, .change_queue_depth = iscsi_change_queue_depth, .sg_tablesize = ISCSI_ISER_SG_TABLESIZE, @@ -700,20 +735,20 @@ static struct scsi_host_template iscsi_iser_sht = { static struct iscsi_transport iscsi_iser_transport = { .owner = THIS_MODULE, .name = "iser", - .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T, + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO, /* session management */ .create_session = iscsi_iser_session_create, .destroy_session = iscsi_iser_session_destroy, /* connection management */ .create_conn = iscsi_iser_conn_create, .bind_conn = iscsi_iser_conn_bind, - .destroy_conn = iscsi_iser_conn_destroy, + .destroy_conn = iscsi_conn_teardown, .attr_is_visible = iser_attr_is_visible, .set_param = iscsi_iser_set_param, .get_conn_param = iscsi_conn_get_param, .get_ep_param = iscsi_iser_get_ep_param, .get_session_param = iscsi_session_get_param, - .start_conn = iscsi_conn_start, + .start_conn = iscsi_iser_conn_start, .stop_conn = iscsi_iser_conn_stop, /* iscsi host params */ .get_host_param = iscsi_host_get_param, @@ -725,6 +760,7 @@ static struct iscsi_transport iscsi_iser_transport = { .xmit_task = iscsi_iser_task_xmit, .cleanup_task = iscsi_iser_cleanup_task, .alloc_pdu = iscsi_iser_pdu_alloc, + .check_protection = iscsi_iser_check_protection, /* recovery */ .session_recovery_timedout = iscsi_session_recovery_timedout, @@ -740,7 +776,7 @@ static int __init iser_init(void) iser_dbg("Starting iSER datamover...\n"); if (iscsi_max_lun < 1) { - printk(KERN_ERR "Invalid max_lun value of %u\n", iscsi_max_lun); + iser_err("Invalid max_lun value of %u\n", iscsi_max_lun); return -EINVAL; } @@ -759,6 +795,12 @@ static int __init iser_init(void) mutex_init(&ig.connlist_mutex); INIT_LIST_HEAD(&ig.connlist); + release_wq = alloc_workqueue("release workqueue", 0, 0); + if (!release_wq) { + iser_err("failed to allocate release workqueue\n"); + return -ENOMEM; + } + iscsi_iser_scsi_transport = iscsi_register_transport( &iscsi_iser_transport); if (!iscsi_iser_scsi_transport) { @@ -777,7 +819,24 @@ register_transport_failure: static void __exit iser_exit(void) { + struct iser_conn *ib_conn, *n; + int connlist_empty; + iser_dbg("Removing iSER datamover...\n"); + destroy_workqueue(release_wq); + + mutex_lock(&ig.connlist_mutex); + connlist_empty = list_empty(&ig.connlist); + mutex_unlock(&ig.connlist_mutex); + + if (!connlist_empty) { + iser_err("Error cleanup stage completed but we still have iser " + "connections, destroying them anyway.\n"); + list_for_each_entry_safe(ib_conn, n, &ig.connlist, conn_list) { + iser_conn_release(ib_conn); + } + } + iscsi_unregister_transport(&iscsi_iser_transport); kmem_cache_destroy(ig.desc_cache); } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index ef7d3be46c3..97cd385bf7f 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -8,6 +8,7 @@ * * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -42,8 +43,11 @@ #include <linux/types.h> #include <linux/net.h> +#include <linux/printk.h> #include <scsi/libiscsi.h> #include <scsi/scsi_transport_iscsi.h> +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_device.h> #include <linux/interrupt.h> #include <linux/wait.h> @@ -65,12 +69,11 @@ #define DRV_NAME "iser" #define PFX DRV_NAME ": " -#define DRV_VER "0.1" -#define DRV_DATE "May 7th, 2006" +#define DRV_VER "1.4" #define iser_dbg(fmt, arg...) \ do { \ - if (iser_debug_level > 1) \ + if (iser_debug_level > 2) \ printk(KERN_DEBUG PFX "%s:" fmt,\ __func__ , ## arg); \ } while (0) @@ -78,7 +81,14 @@ #define iser_warn(fmt, arg...) \ do { \ if (iser_debug_level > 0) \ - printk(KERN_DEBUG PFX "%s:" fmt,\ + pr_warn(PFX "%s:" fmt, \ + __func__ , ## arg); \ + } while (0) + +#define iser_info(fmt, arg...) \ + do { \ + if (iser_debug_level > 1) \ + pr_info(PFX "%s:" fmt, \ __func__ , ## arg); \ } while (0) @@ -94,7 +104,13 @@ /* support up to 512KB in one RDMA */ #define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K) -#define ISER_DEF_CMD_PER_LUN 128 +#define ISER_DEF_XMIT_CMDS_DEFAULT 512 +#if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT + #define ISER_DEF_XMIT_CMDS_MAX ISCSI_DEF_XMIT_CMDS_MAX +#else + #define ISER_DEF_XMIT_CMDS_MAX ISER_DEF_XMIT_CMDS_DEFAULT +#endif +#define ISER_DEF_CMD_PER_LUN ISER_DEF_XMIT_CMDS_MAX /* QP settings */ /* Maximal bounds on received asynchronous PDUs */ @@ -103,9 +119,9 @@ #define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * * SCSI_TMFUNC(2), LOGOUT(1) */ -#define ISER_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) +#define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX) -#define ISER_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) +#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2) /* the max TX (send) WR supported by the iSER QP is defined by * * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * @@ -115,15 +131,26 @@ #define ISER_INFLIGHT_DATAOUTS 8 -#define ISER_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \ +#define ISER_QP_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ (1 + ISER_INFLIGHT_DATAOUTS) + \ ISER_MAX_TX_MISC_PDUS + \ ISER_MAX_RX_MISC_PDUS) +/* Max registration work requests per command */ +#define ISER_MAX_REG_WR_PER_CMD 5 + +/* For Signature we don't support DATAOUTs so no need to make room for them */ +#define ISER_QP_SIG_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ + (1 + ISER_MAX_REG_WR_PER_CMD) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS) + #define ISER_VER 0x10 #define ISER_WSV 0x08 #define ISER_RSV 0x04 +#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL + struct iser_hdr { u8 flags; u8 rsvd[3]; @@ -133,6 +160,15 @@ struct iser_hdr { __be64 read_va; } __attribute__((packed)); + +#define ISER_ZBVA_NOT_SUPPORTED 0x80 +#define ISER_SEND_W_INV_NOT_SUPPORTED 0x40 + +struct iser_cm_hdr { + u8 flags; + u8 rsvd[3]; +} __packed; + /* Constant PDU lengths calculations */ #define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) @@ -178,7 +214,6 @@ struct iser_data_buf { /* fwd declarations */ struct iser_device; struct iser_cq_desc; -struct iscsi_iser_conn; struct iscsi_iser_task; struct iscsi_endpoint; @@ -188,7 +223,7 @@ struct iser_mem_reg { u64 va; u64 len; void *mem_h; - int is_fmr; + int is_mr; }; struct iser_regd_buf { @@ -229,9 +264,13 @@ struct iser_rx_desc { #define ISER_MAX_CQ 4 +struct iser_conn; +struct iscsi_iser_task; + struct iser_device { struct ib_device *ib_device; struct ib_pd *pd; + struct ib_device_attr dev_attr; struct ib_cq *rx_cq[ISER_MAX_CQ]; struct ib_cq *tx_cq[ISER_MAX_CQ]; struct ib_mr *mr; @@ -242,10 +281,44 @@ struct iser_device { int cq_active_qps[ISER_MAX_CQ]; int cqs_used; struct iser_cq_desc *cq_desc; + int (*iser_alloc_rdma_reg_res)(struct iser_conn *ib_conn, + unsigned cmds_max); + void (*iser_free_rdma_reg_res)(struct iser_conn *ib_conn); + int (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); + void (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); +}; + +#define ISER_CHECK_GUARD 0xc0 +#define ISER_CHECK_REFTAG 0x0f +#define ISER_CHECK_APPTAG 0x30 + +enum iser_reg_indicator { + ISER_DATA_KEY_VALID = 1 << 0, + ISER_PROT_KEY_VALID = 1 << 1, + ISER_SIG_KEY_VALID = 1 << 2, + ISER_FASTREG_PROTECTED = 1 << 3, +}; + +struct iser_pi_context { + struct ib_mr *prot_mr; + struct ib_fast_reg_page_list *prot_frpl; + struct ib_mr *sig_mr; +}; + +struct fast_reg_descriptor { + struct list_head list; + /* For fast registration - FRWR */ + struct ib_mr *data_mr; + struct ib_fast_reg_page_list *data_frpl; + struct iser_pi_context *pi_ctx; + /* registration indicators container */ + u8 reg_indicators; }; struct iser_conn { - struct iscsi_iser_conn *iser_conn; /* iser conn for upcalls */ + struct iscsi_conn *iscsi_conn; struct iscsi_endpoint *ep; enum iser_ib_conn_state state; /* rdma connection state */ atomic_t refcount; @@ -253,13 +326,15 @@ struct iser_conn { struct iser_device *device; /* device context */ struct rdma_cm_id *cma_id; /* CMA ID */ struct ib_qp *qp; /* QP */ - struct ib_fmr_pool *fmr_pool; /* pool of IB FMRs */ wait_queue_head_t wait; /* waitq for conn/disconn */ + unsigned qp_max_recv_dtos; /* num of rx buffers */ + unsigned qp_max_recv_dtos_mask; /* above minus 1 */ + unsigned min_posted_rx; /* qp_max_recv_dtos >> 2 */ int post_recv_buf_count; /* posted rx count */ atomic_t post_send_buf_count; /* posted tx count */ char name[ISER_OBJECT_NAME_SIZE]; - struct iser_page_vec *page_vec; /* represents SG to fmr maps* - * maps serialized as tx is*/ + struct work_struct release_work; + struct completion stop_completion; struct list_head conn_list; /* entry in ig conn list */ char *login_buf; @@ -268,22 +343,34 @@ struct iser_conn { unsigned int rx_desc_head; struct iser_rx_desc *rx_descs; struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; -}; - -struct iscsi_iser_conn { - struct iscsi_conn *iscsi_conn;/* ptr to iscsi conn */ - struct iser_conn *ib_conn; /* iSER IB conn */ + bool pi_support; + + /* Connection memory registration pool */ + union { + struct { + struct ib_fmr_pool *pool; /* pool of IB FMRs */ + struct iser_page_vec *page_vec; /* represents SG to fmr maps* + * maps serialized as tx is*/ + } fmr; + struct { + struct list_head pool; + int pool_size; + } fastreg; + }; }; struct iscsi_iser_task { struct iser_tx_desc desc; - struct iscsi_iser_conn *iser_conn; + struct iser_conn *ib_conn; enum iser_task_status status; + struct scsi_cmnd *sc; int command_sent; /* set if command sent */ int dir[ISER_DIRS_NUM]; /* set if dir use*/ struct iser_regd_buf rdma_regd[ISER_DIRS_NUM];/* regd rdma buf */ struct iser_data_buf data[ISER_DIRS_NUM]; /* orig. data des*/ struct iser_data_buf data_copy[ISER_DIRS_NUM];/* contig. copy */ + struct iser_data_buf prot[ISER_DIRS_NUM]; /* prot desc */ + struct iser_data_buf prot_copy[ISER_DIRS_NUM];/* prot copy */ }; struct iser_page_vec { @@ -309,6 +396,8 @@ struct iser_global { extern struct iser_global ig; extern int iser_debug_level; +extern bool iser_pi_enable; +extern int iser_pi_guard; /* allocate connection resources needed for rdma functionality */ int iser_conn_set_full_featured_mode(struct iscsi_conn *conn); @@ -330,12 +419,12 @@ void iscsi_iser_recv(struct iscsi_conn *conn, void iser_conn_init(struct iser_conn *ib_conn); -void iser_conn_get(struct iser_conn *ib_conn); - -int iser_conn_put(struct iser_conn *ib_conn, int destroy_cma_id_allowed); +void iser_conn_release(struct iser_conn *ib_conn); void iser_conn_terminate(struct iser_conn *ib_conn); +void iser_release_work(struct work_struct *work); + void iser_rcv_completion(struct iser_rx_desc *desc, unsigned long dto_xfer_len, struct iser_conn *ib_conn); @@ -348,11 +437,15 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *task); void iser_free_rx_descriptors(struct iser_conn *ib_conn); -void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *task, - enum iser_data_dir cmd_dir); +void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_data_buf *mem_copy, + enum iser_data_dir cmd_dir); -int iser_reg_rdma_mem(struct iscsi_iser_task *task, - enum iser_data_dir cmd_dir); +int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, + enum iser_data_dir cmd_dir); +int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task, + enum iser_data_dir cmd_dir); int iser_connect(struct iser_conn *ib_conn, struct sockaddr_in *src_addr, @@ -363,7 +456,10 @@ int iser_reg_page_vec(struct iser_conn *ib_conn, struct iser_page_vec *page_vec, struct iser_mem_reg *mem_reg); -void iser_unreg_mem(struct iser_mem_reg *mem_reg); +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); int iser_post_recvl(struct iser_conn *ib_conn); int iser_post_recvm(struct iser_conn *ib_conn, int count); @@ -374,8 +470,15 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, enum iser_data_dir iser_dir, enum dma_data_direction dma_dir); -void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task); +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data); int iser_initialize_task_headers(struct iscsi_task *task, struct iser_tx_desc *tx_desc); -int iser_alloc_rx_descriptors(struct iser_conn *ib_conn); +int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session); +int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max); +void iser_free_fmr_pool(struct iser_conn *ib_conn); +int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max); +void iser_free_fastreg_pool(struct iser_conn *ib_conn); +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector); #endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index a00ccd1ca33..8d44a406063 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -40,14 +41,15 @@ #include "iscsi_iser.h" /* Register user buffer memory and initialize passive rdma - * dto descriptor. Total data size is stored in - * iser_task->data[ISER_DIR_IN].data_len + * dto descriptor. Data size is stored in + * task->data[ISER_DIR_IN].data_len, Protection size + * os stored in task->prot[ISER_DIR_IN].data_len */ -static int iser_prepare_read_cmd(struct iscsi_task *task, - unsigned int edtl) +static int iser_prepare_read_cmd(struct iscsi_task *task) { struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_device *device = iser_task->ib_conn->device; struct iser_regd_buf *regd_buf; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; @@ -60,15 +62,18 @@ static int iser_prepare_read_cmd(struct iscsi_task *task, if (err) return err; - if (edtl > iser_task->data[ISER_DIR_IN].data_len) { - iser_err("Total data length: %ld, less than EDTL: " - "%d, in READ cmd BHS itt: %d, conn: 0x%p\n", - iser_task->data[ISER_DIR_IN].data_len, edtl, - task->itt, iser_task->iser_conn); - return -EINVAL; + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_in = &iser_task->prot[ISER_DIR_IN]; + + err = iser_dma_map_task_data(iser_task, + pbuf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return err; } - err = iser_reg_rdma_mem(iser_task,ISER_DIR_IN); + err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN); if (err) { iser_err("Failed to set up Data-IN RDMA\n"); return err; @@ -87,8 +92,9 @@ static int iser_prepare_read_cmd(struct iscsi_task *task, } /* Register user buffer memory and initialize passive rdma - * dto descriptor. Total data size is stored in - * task->data[ISER_DIR_OUT].data_len + * dto descriptor. Data size is stored in + * task->data[ISER_DIR_OUT].data_len, Protection size + * is stored at task->prot[ISER_DIR_OUT].data_len */ static int iser_prepare_write_cmd(struct iscsi_task *task, @@ -97,6 +103,7 @@ iser_prepare_write_cmd(struct iscsi_task *task, unsigned int edtl) { struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_device *device = iser_task->ib_conn->device; struct iser_regd_buf *regd_buf; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; @@ -110,15 +117,18 @@ iser_prepare_write_cmd(struct iscsi_task *task, if (err) return err; - if (edtl > iser_task->data[ISER_DIR_OUT].data_len) { - iser_err("Total data length: %ld, less than EDTL: %d, " - "in WRITE cmd BHS itt: %d, conn: 0x%p\n", - iser_task->data[ISER_DIR_OUT].data_len, - edtl, task->itt, task->conn); - return -EINVAL; + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_out = &iser_task->prot[ISER_DIR_OUT]; + + err = iser_dma_map_task_data(iser_task, + pbuf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return err; } - err = iser_reg_rdma_mem(iser_task,ISER_DIR_OUT); + err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT); if (err != 0) { iser_err("Failed to register write cmd RDMA mem\n"); return err; @@ -169,8 +179,78 @@ static void iser_create_send_desc(struct iser_conn *ib_conn, } } +static void iser_free_login_buf(struct iser_conn *ib_conn) +{ + if (!ib_conn->login_buf) + return; + + if (ib_conn->login_req_dma) + ib_dma_unmap_single(ib_conn->device->ib_device, + ib_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + + if (ib_conn->login_resp_dma) + ib_dma_unmap_single(ib_conn->device->ib_device, + ib_conn->login_resp_dma, + ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + + kfree(ib_conn->login_buf); + + /* make sure we never redo any unmapping */ + ib_conn->login_req_dma = 0; + ib_conn->login_resp_dma = 0; + ib_conn->login_buf = NULL; +} + +static int iser_alloc_login_buf(struct iser_conn *ib_conn) +{ + struct iser_device *device; + int req_err, resp_err; + + BUG_ON(ib_conn->device == NULL); + + device = ib_conn->device; + + ib_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + + ISER_RX_LOGIN_SIZE, GFP_KERNEL); + if (!ib_conn->login_buf) + goto out_err; + + ib_conn->login_req_buf = ib_conn->login_buf; + ib_conn->login_resp_buf = ib_conn->login_buf + + ISCSI_DEF_MAX_RECV_SEG_LEN; + + ib_conn->login_req_dma = ib_dma_map_single(ib_conn->device->ib_device, + (void *)ib_conn->login_req_buf, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + + ib_conn->login_resp_dma = ib_dma_map_single(ib_conn->device->ib_device, + (void *)ib_conn->login_resp_buf, + ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + + req_err = ib_dma_mapping_error(device->ib_device, + ib_conn->login_req_dma); + resp_err = ib_dma_mapping_error(device->ib_device, + ib_conn->login_resp_dma); + + if (req_err || resp_err) { + if (req_err) + ib_conn->login_req_dma = 0; + if (resp_err) + ib_conn->login_resp_dma = 0; + goto free_login_buf; + } + return 0; + +free_login_buf: + iser_free_login_buf(ib_conn); + +out_err: + iser_err("unable to alloc or map login buf\n"); + return -ENOMEM; +} -int iser_alloc_rx_descriptors(struct iser_conn *ib_conn) +int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session) { int i, j; u64 dma_addr; @@ -178,14 +258,24 @@ int iser_alloc_rx_descriptors(struct iser_conn *ib_conn) struct ib_sge *rx_sg; struct iser_device *device = ib_conn->device; - ib_conn->rx_descs = kmalloc(ISER_QP_MAX_RECV_DTOS * + ib_conn->qp_max_recv_dtos = session->cmds_max; + ib_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ + ib_conn->min_posted_rx = ib_conn->qp_max_recv_dtos >> 2; + + if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max)) + goto create_rdma_reg_res_failed; + + if (iser_alloc_login_buf(ib_conn)) + goto alloc_login_buf_fail; + + ib_conn->rx_descs = kmalloc(session->cmds_max * sizeof(struct iser_rx_desc), GFP_KERNEL); if (!ib_conn->rx_descs) goto rx_desc_alloc_fail; rx_desc = ib_conn->rx_descs; - for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++) { + for (i = 0; i < ib_conn->qp_max_recv_dtos; i++, rx_desc++) { dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc, ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); if (ib_dma_mapping_error(device->ib_device, dma_addr)) @@ -206,10 +296,14 @@ rx_desc_dma_map_failed: rx_desc = ib_conn->rx_descs; for (j = 0; j < i; j++, rx_desc++) ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); kfree(ib_conn->rx_descs); ib_conn->rx_descs = NULL; rx_desc_alloc_fail: + iser_free_login_buf(ib_conn); +alloc_login_buf_fail: + device->iser_free_rdma_reg_res(ib_conn); +create_rdma_reg_res_failed: iser_err("failed allocating rx descriptors / data buffers\n"); return -ENOMEM; } @@ -221,18 +315,27 @@ void iser_free_rx_descriptors(struct iser_conn *ib_conn) struct iser_device *device = ib_conn->device; if (!ib_conn->rx_descs) - return; + goto free_login_buf; + + if (device->iser_free_rdma_reg_res) + device->iser_free_rdma_reg_res(ib_conn); rx_desc = ib_conn->rx_descs; - for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++) + for (i = 0; i < ib_conn->qp_max_recv_dtos; i++, rx_desc++) ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); kfree(ib_conn->rx_descs); + /* make sure we never redo any unmapping */ + ib_conn->rx_descs = NULL; + +free_login_buf: + iser_free_login_buf(ib_conn); } static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) { - struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_conn *ib_conn = conn->dd_data; + struct iscsi_session *session = conn->session; iser_dbg("req op %x flags %x\n", req->opcode, req->flags); /* check if this is the last login - going to full feature phase */ @@ -244,12 +347,18 @@ static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) * response) and no posted send buffers left - they must have been * consumed during previous login phases. */ - WARN_ON(iser_conn->ib_conn->post_recv_buf_count != 1); - WARN_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0); + WARN_ON(ib_conn->post_recv_buf_count != 1); + WARN_ON(atomic_read(&ib_conn->post_send_buf_count) != 0); + + if (session->discovery_sess) { + iser_info("Discovery session, re-using login RX buffer\n"); + return 0; + } else + iser_info("Normal session, posting batch of RX %d buffers\n", + ib_conn->min_posted_rx); - iser_dbg("Initially post: %d\n", ISER_MIN_POSTED_RX); /* Initial post receive buffers */ - if (iser_post_recvm(iser_conn->ib_conn, ISER_MIN_POSTED_RX)) + if (iser_post_recvm(ib_conn, ib_conn->min_posted_rx)) return -ENOMEM; return 0; @@ -261,11 +370,11 @@ static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) int iser_send_command(struct iscsi_conn *conn, struct iscsi_task *task) { - struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_conn *ib_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; unsigned long edtl; int err; - struct iser_data_buf *data_buf; + struct iser_data_buf *data_buf, *prot_buf; struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; struct scsi_cmnd *sc = task->sc; struct iser_tx_desc *tx_desc = &iser_task->desc; @@ -274,22 +383,31 @@ int iser_send_command(struct iscsi_conn *conn, /* build the tx desc regd header and add it to the tx desc dto */ tx_desc->type = ISCSI_TX_SCSI_COMMAND; - iser_create_send_desc(iser_conn->ib_conn, tx_desc); + iser_create_send_desc(ib_conn, tx_desc); - if (hdr->flags & ISCSI_FLAG_CMD_READ) + if (hdr->flags & ISCSI_FLAG_CMD_READ) { data_buf = &iser_task->data[ISER_DIR_IN]; - else + prot_buf = &iser_task->prot[ISER_DIR_IN]; + } else { data_buf = &iser_task->data[ISER_DIR_OUT]; + prot_buf = &iser_task->prot[ISER_DIR_OUT]; + } if (scsi_sg_count(sc)) { /* using a scatter list */ data_buf->buf = scsi_sglist(sc); data_buf->size = scsi_sg_count(sc); } - data_buf->data_len = scsi_bufflen(sc); + if (scsi_prot_sg_count(sc)) { + prot_buf->buf = scsi_prot_sglist(sc); + prot_buf->size = scsi_prot_sg_count(sc); + prot_buf->data_len = data_buf->data_len >> + ilog2(sc->device->sector_size) * 8; + } + if (hdr->flags & ISCSI_FLAG_CMD_READ) { - err = iser_prepare_read_cmd(task, edtl); + err = iser_prepare_read_cmd(task); if (err) goto send_command_error; } @@ -305,7 +423,7 @@ int iser_send_command(struct iscsi_conn *conn, iser_task->status = ISER_TASK_STATUS_STARTED; - err = iser_post_send(iser_conn->ib_conn, tx_desc); + err = iser_post_send(ib_conn, tx_desc); if (!err) return 0; @@ -321,7 +439,7 @@ int iser_send_data_out(struct iscsi_conn *conn, struct iscsi_task *task, struct iscsi_data *hdr) { - struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_conn *ib_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; struct iser_tx_desc *tx_desc = NULL; struct iser_regd_buf *regd_buf; @@ -370,7 +488,7 @@ int iser_send_data_out(struct iscsi_conn *conn, itt, buf_offset, data_seg_len); - err = iser_post_send(iser_conn->ib_conn, tx_desc); + err = iser_post_send(ib_conn, tx_desc); if (!err) return 0; @@ -383,19 +501,18 @@ send_data_out_error: int iser_send_control(struct iscsi_conn *conn, struct iscsi_task *task) { - struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_conn *ib_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; struct iser_tx_desc *mdesc = &iser_task->desc; unsigned long data_seg_len; int err = 0; struct iser_device *device; - struct iser_conn *ib_conn = iser_conn->ib_conn; /* build the tx desc regd header and add it to the tx desc dto */ mdesc->type = ISCSI_TX_CONTROL; - iser_create_send_desc(iser_conn->ib_conn, mdesc); + iser_create_send_desc(ib_conn, mdesc); - device = iser_conn->ib_conn->device; + device = ib_conn->device; data_seg_len = ntoh24(task->hdr->dlength); @@ -410,21 +527,22 @@ int iser_send_control(struct iscsi_conn *conn, ib_conn->login_req_dma, task->data_count, DMA_TO_DEVICE); - memcpy(iser_conn->ib_conn->login_req_buf, task->data, - task->data_count); + memcpy(ib_conn->login_req_buf, task->data, task->data_count); ib_dma_sync_single_for_device(device->ib_device, ib_conn->login_req_dma, task->data_count, DMA_TO_DEVICE); - tx_dsg->addr = iser_conn->ib_conn->login_req_dma; + tx_dsg->addr = ib_conn->login_req_dma; tx_dsg->length = task->data_count; tx_dsg->lkey = device->mr->lkey; mdesc->num_sge = 2; } if (task == conn->login_task) { - err = iser_post_recvl(iser_conn->ib_conn); + iser_dbg("op %x dsl %lx, posting login rx buffer\n", + task->hdr->opcode, data_seg_len); + err = iser_post_recvl(ib_conn); if (err) goto send_control_error; err = iser_post_rx_bufs(conn, task->hdr); @@ -432,7 +550,7 @@ int iser_send_control(struct iscsi_conn *conn, goto send_control_error; } - err = iser_post_send(iser_conn->ib_conn, mdesc); + err = iser_post_send(ib_conn, mdesc); if (!err) return 0; @@ -448,7 +566,6 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc, unsigned long rx_xfer_len, struct iser_conn *ib_conn) { - struct iscsi_iser_conn *conn = ib_conn->iser_conn; struct iscsi_hdr *hdr; u64 rx_dma; int rx_buflen, outstanding, count, err; @@ -470,25 +587,25 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc, iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN)); - iscsi_iser_recv(conn->iscsi_conn, hdr, - rx_desc->data, rx_xfer_len - ISER_HEADERS_LEN); + iscsi_iser_recv(ib_conn->iscsi_conn, hdr, rx_desc->data, + rx_xfer_len - ISER_HEADERS_LEN); ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma, - rx_buflen, DMA_FROM_DEVICE); + rx_buflen, DMA_FROM_DEVICE); /* decrementing conn->post_recv_buf_count only --after-- freeing the * * task eliminates the need to worry on tasks which are completed in * * parallel to the execution of iser_conn_term. So the code that waits * * for the posted rx bufs refcount to become zero handles everything */ - conn->ib_conn->post_recv_buf_count--; + ib_conn->post_recv_buf_count--; if (rx_dma == ib_conn->login_resp_dma) return; outstanding = ib_conn->post_recv_buf_count; - if (outstanding + ISER_MIN_POSTED_RX <= ISER_QP_MAX_RECV_DTOS) { - count = min(ISER_QP_MAX_RECV_DTOS - outstanding, - ISER_MIN_POSTED_RX); + if (outstanding + ib_conn->min_posted_rx <= ib_conn->qp_max_recv_dtos) { + count = min(ib_conn->qp_max_recv_dtos - outstanding, + ib_conn->min_posted_rx); err = iser_post_recvm(ib_conn, count); if (err) iser_err("posting %d rx bufs err %d\n", count, err); @@ -505,11 +622,12 @@ void iser_snd_completion(struct iser_tx_desc *tx_desc, ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); kmem_cache_free(ig.desc_cache, tx_desc); + tx_desc = NULL; } atomic_dec(&ib_conn->post_send_buf_count); - if (tx_desc->type == ISCSI_TX_CONTROL) { + if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) { /* this arithmetic is legal by libiscsi dd_data allocation */ task = (void *) ((long)(void *)tx_desc - sizeof(struct iscsi_task)); @@ -529,6 +647,9 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task) iser_task->data[ISER_DIR_IN].data_len = 0; iser_task->data[ISER_DIR_OUT].data_len = 0; + iser_task->prot[ISER_DIR_IN].data_len = 0; + iser_task->prot[ISER_DIR_OUT].data_len = 0; + memset(&iser_task->rdma_regd[ISER_DIR_IN], 0, sizeof(struct iser_regd_buf)); memset(&iser_task->rdma_regd[ISER_DIR_OUT], 0, @@ -537,34 +658,63 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task) void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) { - int is_rdma_aligned = 1; - struct iser_regd_buf *regd; + struct iser_device *device = iser_task->ib_conn->device; + int is_rdma_data_aligned = 1; + int is_rdma_prot_aligned = 1; + int prot_count = scsi_prot_sg_count(iser_task->sc); /* if we were reading, copy back to unaligned sglist, * anyway dma_unmap and free the copy */ if (iser_task->data_copy[ISER_DIR_IN].copy_buf != NULL) { - is_rdma_aligned = 0; - iser_finalize_rdma_unaligned_sg(iser_task, ISER_DIR_IN); + is_rdma_data_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->data[ISER_DIR_IN], + &iser_task->data_copy[ISER_DIR_IN], + ISER_DIR_IN); } + if (iser_task->data_copy[ISER_DIR_OUT].copy_buf != NULL) { - is_rdma_aligned = 0; - iser_finalize_rdma_unaligned_sg(iser_task, ISER_DIR_OUT); + is_rdma_data_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->data[ISER_DIR_OUT], + &iser_task->data_copy[ISER_DIR_OUT], + ISER_DIR_OUT); + } + + if (iser_task->prot_copy[ISER_DIR_IN].copy_buf != NULL) { + is_rdma_prot_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->prot[ISER_DIR_IN], + &iser_task->prot_copy[ISER_DIR_IN], + ISER_DIR_IN); + } + + if (iser_task->prot_copy[ISER_DIR_OUT].copy_buf != NULL) { + is_rdma_prot_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->prot[ISER_DIR_OUT], + &iser_task->prot_copy[ISER_DIR_OUT], + ISER_DIR_OUT); } if (iser_task->dir[ISER_DIR_IN]) { - regd = &iser_task->rdma_regd[ISER_DIR_IN]; - if (regd->reg.is_fmr) - iser_unreg_mem(®d->reg); + device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN); + if (is_rdma_data_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_IN]); + if (prot_count && is_rdma_prot_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_IN]); } if (iser_task->dir[ISER_DIR_OUT]) { - regd = &iser_task->rdma_regd[ISER_DIR_OUT]; - if (regd->reg.is_fmr) - iser_unreg_mem(®d->reg); + device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT); + if (is_rdma_data_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_OUT]); + if (prot_count && is_rdma_prot_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_OUT]); } - - /* if the data was unaligned, it was already unmapped and then copied */ - if (is_rdma_aligned) - iser_dma_unmap_task_data(iser_task); } diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 2033a928d34..47acd3ad3a1 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -44,13 +45,19 @@ * iser_start_rdma_unaligned_sg */ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + struct iser_data_buf *data_copy, enum iser_data_dir cmd_dir) { - int dma_nents; - struct ib_device *dev; + struct ib_device *dev = iser_task->ib_conn->device->ib_device; + struct scatterlist *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg; char *mem = NULL; - struct iser_data_buf *data = &iser_task->data[cmd_dir]; - unsigned long cmd_data_len = data->data_len; + unsigned long cmd_data_len = 0; + int dma_nents, i; + + for_each_sg(sgl, sg, data->size, i) + cmd_data_len += ib_sg_dma_len(dev, sg); if (cmd_data_len > ISER_KMALLOC_THRESHOLD) mem = (void *)__get_free_pages(GFP_ATOMIC, @@ -60,17 +67,16 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, if (mem == NULL) { iser_err("Failed to allocate mem size %d %d for copying sglist\n", - data->size,(int)cmd_data_len); + data->size, (int)cmd_data_len); return -ENOMEM; } if (cmd_dir == ISER_DIR_OUT) { /* copy the unaligned sg the buffer which is used for RDMA */ - struct scatterlist *sgl = (struct scatterlist *)data->buf; - struct scatterlist *sg; int i; char *p, *from; + sgl = (struct scatterlist *)data->buf; p = mem; for_each_sg(sgl, sg, data->size, i) { from = kmap_atomic(sg_page(sg)); @@ -82,39 +88,37 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, } } - sg_init_one(&iser_task->data_copy[cmd_dir].sg_single, mem, cmd_data_len); - iser_task->data_copy[cmd_dir].buf = - &iser_task->data_copy[cmd_dir].sg_single; - iser_task->data_copy[cmd_dir].size = 1; + sg_init_one(&data_copy->sg_single, mem, cmd_data_len); + data_copy->buf = &data_copy->sg_single; + data_copy->size = 1; + data_copy->copy_buf = mem; - iser_task->data_copy[cmd_dir].copy_buf = mem; - - dev = iser_task->iser_conn->ib_conn->device->ib_device; - dma_nents = ib_dma_map_sg(dev, - &iser_task->data_copy[cmd_dir].sg_single, - 1, + dma_nents = ib_dma_map_sg(dev, &data_copy->sg_single, 1, (cmd_dir == ISER_DIR_OUT) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); BUG_ON(dma_nents == 0); - iser_task->data_copy[cmd_dir].dma_nents = dma_nents; + data_copy->dma_nents = dma_nents; + data_copy->data_len = cmd_data_len; + return 0; } /** * iser_finalize_rdma_unaligned_sg */ + void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) + struct iser_data_buf *data, + struct iser_data_buf *data_copy, + enum iser_data_dir cmd_dir) { struct ib_device *dev; - struct iser_data_buf *mem_copy; unsigned long cmd_data_len; - dev = iser_task->iser_conn->ib_conn->device->ib_device; - mem_copy = &iser_task->data_copy[cmd_dir]; + dev = iser_task->ib_conn->device->ib_device; - ib_dma_unmap_sg(dev, &mem_copy->sg_single, 1, + ib_dma_unmap_sg(dev, &data_copy->sg_single, 1, (cmd_dir == ISER_DIR_OUT) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); @@ -126,10 +130,10 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, int i; /* copy back read RDMA to unaligned sg */ - mem = mem_copy->copy_buf; + mem = data_copy->copy_buf; - sgl = (struct scatterlist *)iser_task->data[ISER_DIR_IN].buf; - sg_size = iser_task->data[ISER_DIR_IN].size; + sgl = (struct scatterlist *)data->buf; + sg_size = data->size; p = mem; for_each_sg(sgl, sg, sg_size, i) { @@ -142,15 +146,15 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, } } - cmd_data_len = iser_task->data[cmd_dir].data_len; + cmd_data_len = data->data_len; if (cmd_data_len > ISER_KMALLOC_THRESHOLD) - free_pages((unsigned long)mem_copy->copy_buf, + free_pages((unsigned long)data_copy->copy_buf, ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); else - kfree(mem_copy->copy_buf); + kfree(data_copy->copy_buf); - mem_copy->copy_buf = NULL; + data_copy->copy_buf = NULL; } #define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) @@ -169,8 +173,8 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, */ static int iser_sg_to_page_vec(struct iser_data_buf *data, - struct iser_page_vec *page_vec, - struct ib_device *ibdev) + struct ib_device *ibdev, u64 *pages, + int *offset, int *data_size) { struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf; u64 start_addr, end_addr, page, chunk_start = 0; @@ -179,7 +183,7 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data, int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; /* compute the offset of first element */ - page_vec->offset = (u64) sgl[0].offset & ~MASK_4K; + *offset = (u64) sgl[0].offset & ~MASK_4K; new_chunk = 1; cur_page = 0; @@ -203,13 +207,14 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data, which might be unaligned */ page = chunk_start & MASK_4K; do { - page_vec->pages[cur_page++] = page; + pages[cur_page++] = page; page += SIZE_4K; } while (page < end_addr); } - page_vec->data_size = total_sz; - iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); + *data_size = total_sz; + iser_dbg("page_vec->data_size:%d cur_page %d\n", + *data_size, cur_page); return cur_page; } @@ -266,11 +271,8 @@ static void iser_data_buf_dump(struct iser_data_buf *data, struct scatterlist *sg; int i; - if (iser_debug_level == 0) - return; - for_each_sg(sgl, sg, data->dma_nents, i) - iser_warn("sg[%d] dma_addr:0x%lX page:0x%p " + iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p " "off:0x%x sz:0x%x dma_len:0x%x\n", i, (unsigned long)ib_sg_dma_address(ibdev, sg), sg_page(sg), sg->offset, @@ -297,8 +299,10 @@ static void iser_page_vec_build(struct iser_data_buf *data, page_vec->offset = 0; iser_dbg("Translating sg sz: %d\n", data->dma_nents); - page_vec_len = iser_sg_to_page_vec(data, page_vec, ibdev); - iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents,page_vec_len); + page_vec_len = iser_sg_to_page_vec(data, ibdev, page_vec->pages, + &page_vec->offset, + &page_vec->data_size); + iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len); page_vec->length = page_vec_len; @@ -318,7 +322,7 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, struct ib_device *dev; iser_task->dir[iser_dir] = 1; - dev = iser_task->iser_conn->ib_conn->device->ib_device; + dev = iser_task->ib_conn->device->ib_device; data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir); if (data->dma_nents == 0) { @@ -328,35 +332,52 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, return 0; } -void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task) +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data) { struct ib_device *dev; - struct iser_data_buf *data; - dev = iser_task->iser_conn->ib_conn->device->ib_device; + dev = iser_task->ib_conn->device->ib_device; + ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE); +} - if (iser_task->dir[ISER_DIR_IN]) { - data = &iser_task->data[ISER_DIR_IN]; - ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE); - } +static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, + struct ib_device *ibdev, + struct iser_data_buf *mem, + struct iser_data_buf *mem_copy, + enum iser_data_dir cmd_dir, + int aligned_len) +{ + struct iscsi_conn *iscsi_conn = iser_task->ib_conn->iscsi_conn; - if (iser_task->dir[ISER_DIR_OUT]) { - data = &iser_task->data[ISER_DIR_OUT]; - ib_dma_unmap_sg(dev, data->buf, data->size, DMA_TO_DEVICE); - } + iscsi_conn->fmr_unalign_cnt++; + iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n", + aligned_len, mem->size); + + if (iser_debug_level > 0) + iser_data_buf_dump(mem, ibdev); + + /* unmap the command data before accessing it */ + iser_dma_unmap_task_data(iser_task, mem); + + /* allocate copy buf, if we are writing, copy the */ + /* unaligned scatterlist, dma map the copy */ + if (iser_start_rdma_unaligned_sg(iser_task, mem, mem_copy, cmd_dir) != 0) + return -ENOMEM; + + return 0; } /** - * iser_reg_rdma_mem - Registers memory intended for RDMA, - * obtaining rkey and va + * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA, + * using FMR (if possible) obtaining rkey and va * * returns 0 on success, errno code on failure */ -int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) +int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) { - struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; - struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn; + struct iser_conn *ib_conn = iser_task->ib_conn; struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_task->data[cmd_dir]; @@ -370,18 +391,13 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task, aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != mem->dma_nents) { - iscsi_conn->fmr_unalign_cnt++; - iser_warn("rdma alignment violation %d/%d aligned\n", - aligned_len, mem->size); - iser_data_buf_dump(mem, ibdev); - - /* unmap the command data before accessing it */ - iser_dma_unmap_task_data(iser_task); - - /* allocate copy buf, if we are writing, copy the */ - /* unaligned scatterlist, dma map the copy */ - if (iser_start_rdma_unaligned_sg(iser_task, cmd_dir) != 0) - return -ENOMEM; + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->data_copy[cmd_dir], + cmd_dir, aligned_len); + if (err) { + iser_err("failed to allocate bounce buffer\n"); + return err; + } mem = &iser_task->data_copy[cmd_dir]; } @@ -393,7 +409,7 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task, regd_buf->reg.rkey = device->mr->rkey; regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); - regd_buf->reg.is_fmr = 0; + regd_buf->reg.is_mr = 0; iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " "va: 0x%08lX sz: %ld]\n", @@ -402,21 +418,383 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task, (unsigned long)regd_buf->reg.va, (unsigned long)regd_buf->reg.len); } else { /* use FMR for multiple dma entries */ - iser_page_vec_build(mem, ib_conn->page_vec, ibdev); - err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, ®d_buf->reg); - if (err) { + iser_page_vec_build(mem, ib_conn->fmr.page_vec, ibdev); + err = iser_reg_page_vec(ib_conn, ib_conn->fmr.page_vec, + ®d_buf->reg); + if (err && err != -EAGAIN) { iser_data_buf_dump(mem, ibdev); iser_err("mem->dma_nents = %d (dlength = 0x%x)\n", mem->dma_nents, ntoh24(iser_task->desc.iscsi_header.dlength)); iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n", - ib_conn->page_vec->data_size, ib_conn->page_vec->length, - ib_conn->page_vec->offset); - for (i=0 ; i<ib_conn->page_vec->length ; i++) + ib_conn->fmr.page_vec->data_size, + ib_conn->fmr.page_vec->length, + ib_conn->fmr.page_vec->offset); + for (i = 0; i < ib_conn->fmr.page_vec->length; i++) iser_err("page_vec[%d] = 0x%llx\n", i, - (unsigned long long) ib_conn->page_vec->pages[i]); + (unsigned long long) ib_conn->fmr.page_vec->pages[i]); + } + if (err) return err; + } + return 0; +} + +static inline enum ib_t10_dif_type +scsi2ib_prot_type(unsigned char prot_type) +{ + switch (prot_type) { + case SCSI_PROT_DIF_TYPE0: + return IB_T10DIF_NONE; + case SCSI_PROT_DIF_TYPE1: + return IB_T10DIF_TYPE1; + case SCSI_PROT_DIF_TYPE2: + return IB_T10DIF_TYPE2; + case SCSI_PROT_DIF_TYPE3: + return IB_T10DIF_TYPE3; + default: + return IB_T10DIF_NONE; + } +} + + +static int +iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) +{ + unsigned char scsi_ptype = scsi_get_prot_type(sc); + + sig_attrs->mem.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->mem.sig.dif.pi_interval = sc->device->sector_size; + sig_attrs->wire.sig.dif.pi_interval = sc->device->sector_size; + + switch (scsi_get_prot_op(sc)) { + case SCSI_PROT_WRITE_INSERT: + case SCSI_PROT_READ_STRIP: + sig_attrs->mem.sig.dif.type = IB_T10DIF_NONE; + sig_attrs->wire.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + break; + case SCSI_PROT_READ_INSERT: + case SCSI_PROT_WRITE_STRIP: + sig_attrs->mem.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + sig_attrs->wire.sig.dif.type = IB_T10DIF_NONE; + break; + case SCSI_PROT_READ_PASS: + case SCSI_PROT_WRITE_PASS: + sig_attrs->mem.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + sig_attrs->wire.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + break; + default: + iser_err("Unsupported PI operation %d\n", + scsi_get_prot_op(sc)); + return -EINVAL; + } + return 0; +} + + +static int +iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) +{ + switch (scsi_get_prot_type(sc)) { + case SCSI_PROT_DIF_TYPE0: + *mask = 0x0; + break; + case SCSI_PROT_DIF_TYPE1: + case SCSI_PROT_DIF_TYPE2: + *mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG; + break; + case SCSI_PROT_DIF_TYPE3: + *mask = ISER_CHECK_GUARD; + break; + default: + iser_err("Unsupported protection type %d\n", + scsi_get_prot_type(sc)); + return -EINVAL; + } + + return 0; +} + +static int +iser_reg_sig_mr(struct iscsi_iser_task *iser_task, + struct fast_reg_descriptor *desc, struct ib_sge *data_sge, + struct ib_sge *prot_sge, struct ib_sge *sig_sge) +{ + struct iser_conn *ib_conn = iser_task->ib_conn; + struct iser_pi_context *pi_ctx = desc->pi_ctx; + struct ib_send_wr sig_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + struct ib_sig_attrs sig_attrs; + int ret; + u32 key; + + memset(&sig_attrs, 0, sizeof(sig_attrs)); + ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); + if (ret) + goto err; + + ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); + if (ret) + goto err; + + if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); + } + + memset(&sig_wr, 0, sizeof(sig_wr)); + sig_wr.opcode = IB_WR_REG_SIG_MR; + sig_wr.wr_id = ISER_FASTREG_LI_WRID; + sig_wr.sg_list = data_sge; + sig_wr.num_sge = 1; + sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; + sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; + if (scsi_prot_sg_count(iser_task->sc)) + sig_wr.wr.sig_handover.prot = prot_sge; + sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + + if (!wr) + wr = &sig_wr; + else + wr->next = &sig_wr; + + ret = ib_post_send(ib_conn->qp, wr, &bad_wr); + if (ret) { + iser_err("reg_sig_mr failed, ret:%d\n", ret); + goto err; + } + desc->reg_indicators &= ~ISER_SIG_KEY_VALID; + + sig_sge->lkey = pi_ctx->sig_mr->lkey; + sig_sge->addr = 0; + sig_sge->length = data_sge->length + prot_sge->length; + if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT || + scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) { + sig_sge->length += (data_sge->length / + iser_task->sc->device->sector_size) * 8; + } + + iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n", + sig_sge->addr, sig_sge->length, + sig_sge->lkey); +err: + return ret; +} + +static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, + struct iser_regd_buf *regd_buf, + struct iser_data_buf *mem, + enum iser_reg_indicator ind, + struct ib_sge *sge) +{ + struct fast_reg_descriptor *desc = regd_buf->reg.mem_h; + struct iser_conn *ib_conn = iser_task->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_device *ibdev = device->ib_device; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + struct ib_send_wr fastreg_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + u8 key; + int ret, offset, size, plen; + + /* if there a single dma entry, dma mr suffices */ + if (mem->dma_nents == 1) { + struct scatterlist *sg = (struct scatterlist *)mem->buf; + + sge->lkey = device->mr->lkey; + sge->addr = ib_sg_dma_address(ibdev, &sg[0]); + sge->length = ib_sg_dma_len(ibdev, &sg[0]); + + iser_dbg("Single DMA entry: lkey=0x%x, addr=0x%llx, length=0x%x\n", + sge->lkey, sge->addr, sge->length); + return 0; + } + + if (ind == ISER_DATA_KEY_VALID) { + mr = desc->data_mr; + frpl = desc->data_frpl; + } else { + mr = desc->pi_ctx->prot_mr; + frpl = desc->pi_ctx->prot_frpl; + } + + plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list, + &offset, &size); + if (plen * SIZE_4K < size) { + iser_err("fast reg page_list too short to hold this SG\n"); + return -EINVAL; + } + + if (!(desc->reg_indicators & ind)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.ex.invalidate_rkey = mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + } + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = ISER_FASTREG_LI_WRID; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset; + fastreg_wr.wr.fast_reg.page_list = frpl; + fastreg_wr.wr.fast_reg.page_list_len = plen; + fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K; + fastreg_wr.wr.fast_reg.length = size; + fastreg_wr.wr.fast_reg.rkey = mr->rkey; + fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + if (!wr) + wr = &fastreg_wr; + else + wr->next = &fastreg_wr; + + ret = ib_post_send(ib_conn->qp, wr, &bad_wr); + if (ret) { + iser_err("fast registration failed, ret:%d\n", ret); + return ret; + } + desc->reg_indicators &= ~ind; + + sge->lkey = mr->lkey; + sge->addr = frpl->page_list[0] + offset; + sge->length = size; + + return ret; +} + +/** + * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA, + * using Fast Registration WR (if possible) obtaining rkey and va + * + * returns 0 on success, errno code on failure + */ +int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_conn *ib_conn = iser_task->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_device *ibdev = device->ib_device; + struct iser_data_buf *mem = &iser_task->data[cmd_dir]; + struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir]; + struct fast_reg_descriptor *desc = NULL; + struct ib_sge data_sge; + int err, aligned_len; + unsigned long flags; + + aligned_len = iser_data_buf_aligned_len(mem, ibdev); + if (aligned_len != mem->dma_nents) { + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->data_copy[cmd_dir], + cmd_dir, aligned_len); + if (err) { + iser_err("failed to allocate bounce buffer\n"); + return err; + } + mem = &iser_task->data_copy[cmd_dir]; + } + + if (mem->dma_nents != 1 || + scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { + spin_lock_irqsave(&ib_conn->lock, flags); + desc = list_first_entry(&ib_conn->fastreg.pool, + struct fast_reg_descriptor, list); + list_del(&desc->list); + spin_unlock_irqrestore(&ib_conn->lock, flags); + regd_buf->reg.mem_h = desc; + } + + err = iser_fast_reg_mr(iser_task, regd_buf, mem, + ISER_DATA_KEY_VALID, &data_sge); + if (err) + goto err_reg; + + if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { + struct ib_sge prot_sge, sig_sge; + + memset(&prot_sge, 0, sizeof(prot_sge)); + if (scsi_prot_sg_count(iser_task->sc)) { + mem = &iser_task->prot[cmd_dir]; + aligned_len = iser_data_buf_aligned_len(mem, ibdev); + if (aligned_len != mem->dma_nents) { + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->prot_copy[cmd_dir], + cmd_dir, aligned_len); + if (err) { + iser_err("failed to allocate bounce buffer\n"); + return err; + } + mem = &iser_task->prot_copy[cmd_dir]; + } + + err = iser_fast_reg_mr(iser_task, regd_buf, mem, + ISER_PROT_KEY_VALID, &prot_sge); + if (err) + goto err_reg; } + + err = iser_reg_sig_mr(iser_task, desc, &data_sge, + &prot_sge, &sig_sge); + if (err) { + iser_err("Failed to register signature mr\n"); + return err; + } + desc->reg_indicators |= ISER_FASTREG_PROTECTED; + + regd_buf->reg.lkey = sig_sge.lkey; + regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; + regd_buf->reg.va = sig_sge.addr; + regd_buf->reg.len = sig_sge.length; + regd_buf->reg.is_mr = 1; + } else { + if (desc) { + regd_buf->reg.rkey = desc->data_mr->rkey; + regd_buf->reg.is_mr = 1; + } else { + regd_buf->reg.rkey = device->mr->rkey; + regd_buf->reg.is_mr = 0; + } + + regd_buf->reg.lkey = data_sge.lkey; + regd_buf->reg.va = data_sge.addr; + regd_buf->reg.len = data_sge.length; } + return 0; +err_reg: + if (desc) { + spin_lock_irqsave(&ib_conn->lock, flags); + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + spin_unlock_irqrestore(&ib_conn->lock, flags); + } + + return err; } diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 95a49affee4..ea01075f9f9 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -70,12 +71,40 @@ static void iser_event_handler(struct ib_event_handler *handler, */ static int iser_create_device_ib_res(struct iser_device *device) { - int i, j; struct iser_cq_desc *cq_desc; + struct ib_device_attr *dev_attr = &device->dev_attr; + int ret, i, j; + + ret = ib_query_device(device->ib_device, dev_attr); + if (ret) { + pr_warn("Query device failed for %s\n", device->ib_device->name); + return ret; + } + + /* Assign function handles - based on FMR support */ + if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr && + device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) { + iser_info("FMR supported, using FMR for registration\n"); + device->iser_alloc_rdma_reg_res = iser_create_fmr_pool; + device->iser_free_rdma_reg_res = iser_free_fmr_pool; + device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr; + device->iser_unreg_rdma_mem = iser_unreg_mem_fmr; + } else + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + iser_info("FastReg supported, using FastReg for registration\n"); + device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool; + device->iser_free_rdma_reg_res = iser_free_fastreg_pool; + device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg; + device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg; + } else { + iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); + return -1; + } device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors); - iser_err("using %d CQs, device %s supports %d vectors\n", device->cqs_used, - device->ib_device->name, device->ib_device->num_comp_vectors); + iser_info("using %d CQs, device %s supports %d vectors\n", + device->cqs_used, device->ib_device->name, + device->ib_device->num_comp_vectors); device->cq_desc = kmalloc(sizeof(struct iser_cq_desc) * device->cqs_used, GFP_KERNEL); @@ -176,56 +205,23 @@ static void iser_free_device_ib_res(struct iser_device *device) } /** - * iser_create_ib_conn_res - Creates FMR pool and Queue-Pair (QP) + * iser_create_fmr_pool - Creates FMR pool and page_vector * - * returns 0 on success, -1 on failure + * returns 0 on success, or errno code on failure */ -static int iser_create_ib_conn_res(struct iser_conn *ib_conn) +int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max) { - struct iser_device *device; - struct ib_qp_init_attr init_attr; - int req_err, resp_err, ret = -ENOMEM; + struct iser_device *device = ib_conn->device; struct ib_fmr_pool_param params; - int index, min_index = 0; + int ret = -ENOMEM; - BUG_ON(ib_conn->device == NULL); - - device = ib_conn->device; - - ib_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + - ISER_RX_LOGIN_SIZE, GFP_KERNEL); - if (!ib_conn->login_buf) - goto out_err; + ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) + + (sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)), + GFP_KERNEL); + if (!ib_conn->fmr.page_vec) + return ret; - ib_conn->login_req_buf = ib_conn->login_buf; - ib_conn->login_resp_buf = ib_conn->login_buf + ISCSI_DEF_MAX_RECV_SEG_LEN; - - ib_conn->login_req_dma = ib_dma_map_single(ib_conn->device->ib_device, - (void *)ib_conn->login_req_buf, - ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); - - ib_conn->login_resp_dma = ib_dma_map_single(ib_conn->device->ib_device, - (void *)ib_conn->login_resp_buf, - ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); - - req_err = ib_dma_mapping_error(device->ib_device, ib_conn->login_req_dma); - resp_err = ib_dma_mapping_error(device->ib_device, ib_conn->login_resp_dma); - - if (req_err || resp_err) { - if (req_err) - ib_conn->login_req_dma = 0; - if (resp_err) - ib_conn->login_resp_dma = 0; - goto out_err; - } - - ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) + - (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)), - GFP_KERNEL); - if (!ib_conn->page_vec) - goto out_err; - - ib_conn->page_vec->pages = (u64 *) (ib_conn->page_vec + 1); + ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1); params.page_shift = SHIFT_4K; /* when the first/last SG element are not start/end * @@ -233,21 +229,224 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1; /* make the pool size twice the max number of SCSI commands * * the ML is expected to queue, watermark for unmap at 50% */ - params.pool_size = ISCSI_DEF_XMIT_CMDS_MAX * 2; - params.dirty_watermark = ISCSI_DEF_XMIT_CMDS_MAX; + params.pool_size = cmds_max * 2; + params.dirty_watermark = cmds_max; params.cache = 0; params.flush_function = NULL; params.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ); - ib_conn->fmr_pool = ib_create_fmr_pool(device->pd, ¶ms); - if (IS_ERR(ib_conn->fmr_pool)) { - ret = PTR_ERR(ib_conn->fmr_pool); - ib_conn->fmr_pool = NULL; - goto out_err; + ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, ¶ms); + if (!IS_ERR(ib_conn->fmr.pool)) + return 0; + + /* no FMR => no need for page_vec */ + kfree(ib_conn->fmr.page_vec); + ib_conn->fmr.page_vec = NULL; + + ret = PTR_ERR(ib_conn->fmr.pool); + ib_conn->fmr.pool = NULL; + if (ret != -ENOSYS) { + iser_err("FMR allocation failed, err %d\n", ret); + return ret; + } else { + iser_warn("FMRs are not supported, using unaligned mode\n"); + return 0; + } +} + +/** + * iser_free_fmr_pool - releases the FMR pool and page vec + */ +void iser_free_fmr_pool(struct iser_conn *ib_conn) +{ + iser_info("freeing conn %p fmr pool %p\n", + ib_conn, ib_conn->fmr.pool); + + if (ib_conn->fmr.pool != NULL) + ib_destroy_fmr_pool(ib_conn->fmr.pool); + + ib_conn->fmr.pool = NULL; + + kfree(ib_conn->fmr.page_vec); + ib_conn->fmr.page_vec = NULL; +} + +static int +iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, + bool pi_enable, struct fast_reg_descriptor *desc) +{ + int ret; + + desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(desc->data_frpl)) { + ret = PTR_ERR(desc->data_frpl); + iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", + ret); + return PTR_ERR(desc->data_frpl); } + desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(desc->data_mr)) { + ret = PTR_ERR(desc->data_mr); + iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); + goto fast_reg_mr_failure; + } + desc->reg_indicators |= ISER_DATA_KEY_VALID; + + if (pi_enable) { + struct ib_mr_init_attr mr_init_attr = {0}; + struct iser_pi_context *pi_ctx = NULL; + + desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); + if (!desc->pi_ctx) { + iser_err("Failed to allocate pi context\n"); + ret = -ENOMEM; + goto pi_ctx_alloc_failure; + } + pi_ctx = desc->pi_ctx; + + pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_frpl)) { + ret = PTR_ERR(pi_ctx->prot_frpl); + iser_err("Failed to allocate prot frpl ret=%d\n", + ret); + goto prot_frpl_failure; + } + + pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, + ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(pi_ctx->prot_mr)) { + ret = PTR_ERR(pi_ctx->prot_mr); + iser_err("Failed to allocate prot frmr ret=%d\n", + ret); + goto prot_mr_failure; + } + desc->reg_indicators |= ISER_PROT_KEY_VALID; + + mr_init_attr.max_reg_descriptors = 2; + mr_init_attr.flags |= IB_MR_SIGNATURE_EN; + pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + if (IS_ERR(pi_ctx->sig_mr)) { + ret = PTR_ERR(pi_ctx->sig_mr); + iser_err("Failed to allocate signature enabled mr err=%d\n", + ret); + goto sig_mr_failure; + } + desc->reg_indicators |= ISER_SIG_KEY_VALID; + } + desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + + iser_dbg("Create fr_desc %p page_list %p\n", + desc, desc->data_frpl->page_list); + + return 0; +sig_mr_failure: + ib_dereg_mr(desc->pi_ctx->prot_mr); +prot_mr_failure: + ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); +prot_frpl_failure: + kfree(desc->pi_ctx); +pi_ctx_alloc_failure: + ib_dereg_mr(desc->data_mr); +fast_reg_mr_failure: + ib_free_fast_reg_page_list(desc->data_frpl); + + return ret; +} + +/** + * iser_create_fastreg_pool - Creates pool of fast_reg descriptors + * for fast registration work requests. + * returns 0 on success, or errno code on failure + */ +int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max) +{ + struct iser_device *device = ib_conn->device; + struct fast_reg_descriptor *desc; + int i, ret; + + INIT_LIST_HEAD(&ib_conn->fastreg.pool); + ib_conn->fastreg.pool_size = 0; + for (i = 0; i < cmds_max; i++) { + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) { + iser_err("Failed to allocate a new fast_reg descriptor\n"); + ret = -ENOMEM; + goto err; + } + + ret = iser_create_fastreg_desc(device->ib_device, device->pd, + ib_conn->pi_support, desc); + if (ret) { + iser_err("Failed to create fastreg descriptor err=%d\n", + ret); + kfree(desc); + goto err; + } + + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + ib_conn->fastreg.pool_size++; + } + + return 0; + +err: + iser_free_fastreg_pool(ib_conn); + return ret; +} + +/** + * iser_free_fastreg_pool - releases the pool of fast_reg descriptors + */ +void iser_free_fastreg_pool(struct iser_conn *ib_conn) +{ + struct fast_reg_descriptor *desc, *tmp; + int i = 0; + + if (list_empty(&ib_conn->fastreg.pool)) + return; + + iser_info("freeing conn %p fr pool\n", ib_conn); + + list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) { + list_del(&desc->list); + ib_free_fast_reg_page_list(desc->data_frpl); + ib_dereg_mr(desc->data_mr); + if (desc->pi_ctx) { + ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); + ib_dereg_mr(desc->pi_ctx->prot_mr); + ib_destroy_mr(desc->pi_ctx->sig_mr); + kfree(desc->pi_ctx); + } + kfree(desc); + ++i; + } + + if (i < ib_conn->fastreg.pool_size) + iser_warn("pool still has %d regions registered\n", + ib_conn->fastreg.pool_size - i); +} + +/** + * iser_create_ib_conn_res - Queue-Pair (QP) + * + * returns 0 on success, -1 on failure + */ +static int iser_create_ib_conn_res(struct iser_conn *ib_conn) +{ + struct iser_device *device; + struct ib_qp_init_attr init_attr; + int ret = -ENOMEM; + int index, min_index = 0; + + BUG_ON(ib_conn->device == NULL); + + device = ib_conn->device; + memset(&init_attr, 0, sizeof init_attr); mutex_lock(&ig.connlist_mutex); @@ -258,27 +457,32 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) min_index = index; device->cq_active_qps[min_index]++; mutex_unlock(&ig.connlist_mutex); - iser_err("cq index %d used for ib_conn %p\n", min_index, ib_conn); + iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn); init_attr.event_handler = iser_qp_event_callback; init_attr.qp_context = (void *)ib_conn; init_attr.send_cq = device->tx_cq[min_index]; init_attr.recv_cq = device->rx_cq[min_index]; - init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; init_attr.cap.max_send_sge = 2; init_attr.cap.max_recv_sge = 1; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; init_attr.qp_type = IB_QPT_RC; + if (ib_conn->pi_support) { + init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS; + init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + } else { + init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; + } ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); if (ret) goto out_err; ib_conn->qp = ib_conn->cma_id->qp; - iser_err("setting conn %p cma_id %p: fmr_pool %p qp %p\n", - ib_conn, ib_conn->cma_id, - ib_conn->fmr_pool, ib_conn->cma_id->qp); + iser_info("setting conn %p cma_id %p qp %p\n", + ib_conn, ib_conn->cma_id, + ib_conn->cma_id->qp); return ret; out_err: @@ -287,21 +491,19 @@ out_err: } /** - * releases the FMR pool, QP and CMA ID objects, returns 0 on success, + * releases the QP objects, returns 0 on success, * -1 on failure */ -static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id) +static int iser_free_ib_conn_res(struct iser_conn *ib_conn) { int cq_index; BUG_ON(ib_conn == NULL); - iser_err("freeing conn %p cma_id %p fmr pool %p qp %p\n", - ib_conn, ib_conn->cma_id, - ib_conn->fmr_pool, ib_conn->qp); + iser_info("freeing conn %p cma_id %p qp %p\n", + ib_conn, ib_conn->cma_id, + ib_conn->qp); /* qp is created only once both addr & route are resolved */ - if (ib_conn->fmr_pool != NULL) - ib_destroy_fmr_pool(ib_conn->fmr_pool); if (ib_conn->qp != NULL) { cq_index = ((struct iser_cq_desc *)ib_conn->qp->recv_cq->cq_context)->cq_index; @@ -309,26 +511,8 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id) rdma_destroy_qp(ib_conn->cma_id); } - /* if cma handler context, the caller acts s.t the cma destroy the id */ - if (ib_conn->cma_id != NULL && can_destroy_id) - rdma_destroy_id(ib_conn->cma_id); - ib_conn->fmr_pool = NULL; ib_conn->qp = NULL; - ib_conn->cma_id = NULL; - kfree(ib_conn->page_vec); - - if (ib_conn->login_buf) { - if (ib_conn->login_req_dma) - ib_dma_unmap_single(ib_conn->device->ib_device, - ib_conn->login_req_dma, - ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); - if (ib_conn->login_resp_dma) - ib_dma_unmap_single(ib_conn->device->ib_device, - ib_conn->login_resp_dma, - ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); - kfree(ib_conn->login_buf); - } return 0; } @@ -375,7 +559,7 @@ static void iser_device_try_release(struct iser_device *device) { mutex_lock(&ig.device_list_mutex); device->refcount--; - iser_err("device %p refcount %d\n",device,device->refcount); + iser_info("device %p refcount %d\n", device, device->refcount); if (!device->refcount) { iser_free_device_ib_res(device); list_del(&device->ig_list); @@ -397,39 +581,46 @@ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, return ret; } +void iser_release_work(struct work_struct *work) +{ + struct iser_conn *ib_conn; + + ib_conn = container_of(work, struct iser_conn, release_work); + + /* wait for .conn_stop callback */ + wait_for_completion(&ib_conn->stop_completion); + + /* wait for the qp`s post send and post receive buffers to empty */ + wait_event_interruptible(ib_conn->wait, + ib_conn->state == ISER_CONN_DOWN); + + iser_conn_release(ib_conn); +} + /** * Frees all conn objects and deallocs conn descriptor */ -static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id) +void iser_conn_release(struct iser_conn *ib_conn) { struct iser_device *device = ib_conn->device; - BUG_ON(ib_conn->state != ISER_CONN_DOWN); + BUG_ON(ib_conn->state == ISER_CONN_UP); mutex_lock(&ig.connlist_mutex); list_del(&ib_conn->conn_list); mutex_unlock(&ig.connlist_mutex); iser_free_rx_descriptors(ib_conn); - iser_free_ib_conn_res(ib_conn, can_destroy_id); + iser_free_ib_conn_res(ib_conn); ib_conn->device = NULL; /* on EVENT_ADDR_ERROR there's no device yet for this conn */ if (device != NULL) iser_device_try_release(device); - iscsi_destroy_endpoint(ib_conn->ep); -} - -void iser_conn_get(struct iser_conn *ib_conn) -{ - atomic_inc(&ib_conn->refcount); -} - -int iser_conn_put(struct iser_conn *ib_conn, int can_destroy_id) -{ - if (atomic_dec_and_test(&ib_conn->refcount)) { - iser_conn_release(ib_conn, can_destroy_id); - return 1; + /* if cma handler context, the caller actually destroy the id */ + if (ib_conn->cma_id != NULL) { + rdma_destroy_id(ib_conn->cma_id); + ib_conn->cma_id = NULL; } - return 0; + iscsi_destroy_endpoint(ib_conn->ep); } /** @@ -449,24 +640,19 @@ void iser_conn_terminate(struct iser_conn *ib_conn) if (err) iser_err("Failed to disconnect, conn: 0x%p err %d\n", ib_conn,err); - - wait_event_interruptible(ib_conn->wait, - ib_conn->state == ISER_CONN_DOWN); - - iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */ } -static int iser_connect_error(struct rdma_cm_id *cma_id) +static void iser_connect_error(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; + ib_conn = (struct iser_conn *)cma_id->context; ib_conn->state = ISER_CONN_DOWN; wake_up_interruptible(&ib_conn->wait); - return iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */ } -static int iser_addr_handler(struct rdma_cm_id *cma_id) +static void iser_addr_handler(struct rdma_cm_id *cma_id) { struct iser_device *device; struct iser_conn *ib_conn; @@ -475,25 +661,39 @@ static int iser_addr_handler(struct rdma_cm_id *cma_id) device = iser_device_find_by_ib_device(cma_id); if (!device) { iser_err("device lookup/creation failed\n"); - return iser_connect_error(cma_id); + iser_connect_error(cma_id); + return; } ib_conn = (struct iser_conn *)cma_id->context; ib_conn->device = device; + /* connection T10-PI support */ + if (iser_pi_enable) { + if (!(device->dev_attr.device_cap_flags & + IB_DEVICE_SIGNATURE_HANDOVER)) { + iser_warn("T10-PI requested but not supported on %s, " + "continue without T10-PI\n", + ib_conn->device->ib_device->name); + ib_conn->pi_support = false; + } else { + ib_conn->pi_support = true; + } + } + ret = rdma_resolve_route(cma_id, 1000); if (ret) { iser_err("resolve route failed: %d\n", ret); - return iser_connect_error(cma_id); + iser_connect_error(cma_id); + return; } - - return 0; } -static int iser_route_handler(struct rdma_cm_id *cma_id) +static void iser_route_handler(struct rdma_cm_id *cma_id) { struct rdma_conn_param conn_param; int ret; + struct iser_cm_hdr req_hdr; ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context); if (ret) @@ -505,39 +705,52 @@ static int iser_route_handler(struct rdma_cm_id *cma_id) conn_param.retry_count = 7; conn_param.rnr_retry_count = 6; + memset(&req_hdr, 0, sizeof(req_hdr)); + req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED | + ISER_SEND_W_INV_NOT_SUPPORTED); + conn_param.private_data = (void *)&req_hdr; + conn_param.private_data_len = sizeof(struct iser_cm_hdr); + ret = rdma_connect(cma_id, &conn_param); if (ret) { iser_err("failure connecting: %d\n", ret); goto failure; } - return 0; + return; failure: - return iser_connect_error(cma_id); + iser_connect_error(cma_id); } static void iser_connected_handler(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; + struct ib_qp_attr attr; + struct ib_qp_init_attr init_attr; + + (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); + iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); ib_conn = (struct iser_conn *)cma_id->context; - ib_conn->state = ISER_CONN_UP; - wake_up_interruptible(&ib_conn->wait); + if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_PENDING, ISER_CONN_UP)) + wake_up_interruptible(&ib_conn->wait); } -static int iser_disconnected_handler(struct rdma_cm_id *cma_id) +static void iser_disconnected_handler(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; - int ret; ib_conn = (struct iser_conn *)cma_id->context; /* getting here when the state is UP means that the conn is being * * terminated asynchronously from the iSCSI layer's perspective. */ if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, - ISER_CONN_TERMINATING)) - iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn, - ISCSI_ERR_CONN_FAILED); + ISER_CONN_TERMINATING)){ + if (ib_conn->iscsi_conn) + iscsi_conn_failure(ib_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); + else + iser_err("iscsi_iser connection isn't bound\n"); + } /* Complete the termination process if no posts are pending */ if (ib_conn->post_recv_buf_count == 0 && @@ -545,24 +758,19 @@ static int iser_disconnected_handler(struct rdma_cm_id *cma_id) ib_conn->state = ISER_CONN_DOWN; wake_up_interruptible(&ib_conn->wait); } - - ret = iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */ - return ret; } static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { - int ret = 0; - - iser_err("event %d status %d conn %p id %p\n", - event->event, event->status, cma_id->context, cma_id); + iser_info("event %d status %d conn %p id %p\n", + event->event, event->status, cma_id->context, cma_id); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: - ret = iser_addr_handler(cma_id); + iser_addr_handler(cma_id); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: - ret = iser_route_handler(cma_id); + iser_route_handler(cma_id); break; case RDMA_CM_EVENT_ESTABLISHED: iser_connected_handler(cma_id); @@ -572,18 +780,18 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: - ret = iser_connect_error(cma_id); + iser_connect_error(cma_id); break; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_ADDR_CHANGE: - ret = iser_disconnected_handler(cma_id); + iser_disconnected_handler(cma_id); break; default: iser_err("Unexpected RDMA CM event (%d)\n", event->event); break; } - return ret; + return 0; } void iser_conn_init(struct iser_conn *ib_conn) @@ -592,7 +800,7 @@ void iser_conn_init(struct iser_conn *ib_conn) init_waitqueue_head(&ib_conn->wait); ib_conn->post_recv_buf_count = 0; atomic_set(&ib_conn->post_send_buf_count, 0); - atomic_set(&ib_conn->refcount, 1); /* ref ib conn allocation */ + init_completion(&ib_conn->stop_completion); INIT_LIST_HEAD(&ib_conn->conn_list); spin_lock_init(&ib_conn->lock); } @@ -615,12 +823,11 @@ int iser_connect(struct iser_conn *ib_conn, /* the device is known only --after-- address resolution */ ib_conn->device = NULL; - iser_err("connecting to: %pI4, port 0x%x\n", - &dst_addr->sin_addr, dst_addr->sin_port); + iser_info("connecting to: %pI4, port 0x%x\n", + &dst_addr->sin_addr, dst_addr->sin_port); ib_conn->state = ISER_CONN_PENDING; - iser_conn_get(ib_conn); /* ref ib conn's cma id */ ib_conn->cma_id = rdma_create_id(iser_cma_handler, (void *)ib_conn, RDMA_PS_TCP, IB_QPT_RC); @@ -657,9 +864,8 @@ id_failure: ib_conn->cma_id = NULL; addr_failure: ib_conn->state = ISER_CONN_DOWN; - iser_conn_put(ib_conn, 1); /* deref ib conn's cma id */ connect_failure: - iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */ + iser_conn_release(ib_conn); return err; } @@ -680,7 +886,7 @@ int iser_reg_page_vec(struct iser_conn *ib_conn, page_list = page_vec->pages; io_addr = page_list[0]; - mem = ib_fmr_pool_map_phys(ib_conn->fmr_pool, + mem = ib_fmr_pool_map_phys(ib_conn->fmr.pool, page_list, page_vec->length, io_addr); @@ -695,7 +901,7 @@ int iser_reg_page_vec(struct iser_conn *ib_conn, mem_reg->rkey = mem->fmr->rkey; mem_reg->len = page_vec->length * SIZE_4K; mem_reg->va = io_addr; - mem_reg->is_fmr = 1; + mem_reg->is_mr = 1; mem_reg->mem_h = (void *)mem; mem_reg->va += page_vec->offset; @@ -713,12 +919,18 @@ int iser_reg_page_vec(struct iser_conn *ib_conn, } /** - * Unregister (previosuly registered) memory. + * Unregister (previosuly registered using FMR) memory. + * If memory is non-FMR does nothing. */ -void iser_unreg_mem(struct iser_mem_reg *reg) +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) { + struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; int ret; + if (!reg->is_mr) + return; + iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); @@ -728,6 +940,23 @@ void iser_unreg_mem(struct iser_mem_reg *reg) reg->mem_h = NULL; } +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; + struct iser_conn *ib_conn = iser_task->ib_conn; + struct fast_reg_descriptor *desc = reg->mem_h; + + if (!reg->is_mr) + return; + + reg->mem_h = NULL; + reg->is_mr = 0; + spin_lock_bh(&ib_conn->lock); + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + spin_unlock_bh(&ib_conn->lock); +} + int iser_post_recvl(struct iser_conn *ib_conn) { struct ib_recv_wr rx_wr, *rx_wr_failed; @@ -765,7 +994,7 @@ int iser_post_recvm(struct iser_conn *ib_conn, int count) rx_wr->sg_list = &rx_desc->rx_sg; rx_wr->num_sge = 1; rx_wr->next = rx_wr + 1; - my_rx_head = (my_rx_head + 1) & (ISER_QP_MAX_RECV_DTOS - 1); + my_rx_head = (my_rx_head + 1) & ib_conn->qp_max_recv_dtos_mask; } rx_wr--; @@ -825,7 +1054,7 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc, * perspective. */ if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, ISER_CONN_TERMINATING)) - iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn, + iscsi_conn_failure(ib_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); /* no more non completed posts to the QP, complete the @@ -854,9 +1083,11 @@ static int iser_drain_tx_cq(struct iser_device *device, int cq_index) IB_WC_SEND, wc.opcode); } else { iser_err("tx id %llx status %d vend_err %x\n", - wc.wr_id, wc.status, wc.vendor_err); - atomic_dec(&ib_conn->post_send_buf_count); - iser_handle_comp_error(tx_desc, ib_conn); + wc.wr_id, wc.status, wc.vendor_err); + if (wc.wr_id != ISER_FASTREG_LI_WRID) { + atomic_dec(&ib_conn->post_send_buf_count); + iser_handle_comp_error(tx_desc, ib_conn); + } } completed_tx++; } @@ -874,8 +1105,12 @@ static void iser_cq_tasklet_fn(unsigned long data) struct iser_rx_desc *desc; unsigned long xfer_len; struct iser_conn *ib_conn; - int completed_tx, completed_rx; - completed_tx = completed_rx = 0; + int completed_tx, completed_rx = 0; + + /* First do tx drain, so in a case where we have rx flushes and a successful + * tx completion we will still go through completion error handling. + */ + completed_tx = iser_drain_tx_cq(device, cq_index); while (ib_poll_cq(cq, 1, &wc) == 1) { desc = (struct iser_rx_desc *) (unsigned long) wc.wr_id; @@ -903,7 +1138,6 @@ static void iser_cq_tasklet_fn(unsigned long data) * " would not cause interrupts to be missed" */ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - completed_tx += iser_drain_tx_cq(device, cq_index); iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx); } @@ -915,3 +1149,51 @@ static void iser_cq_callback(struct ib_cq *cq, void *cq_context) tasklet_schedule(&device->cq_tasklet[cq_index]); } + +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector) +{ + struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; + struct fast_reg_descriptor *desc = reg->mem_h; + unsigned long sector_size = iser_task->sc->device->sector_size; + struct ib_mr_status mr_status; + int ret; + + if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) { + desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + ret = ib_check_mr_status(desc->pi_ctx->sig_mr, + IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + goto err; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + sector_t sector_off = mr_status.sig_err.sig_err_offset; + + do_div(sector_off, sector_size + 8); + *sector = scsi_get_lba(iser_task->sc) + sector_off; + + pr_err("PI error found type %d at sector %llx " + "expected %x vs actual %x\n", + mr_status.sig_err.err_type, + (unsigned long long)*sector, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + return 0x1; + case IB_SIG_BAD_REFTAG: + return 0x3; + case IB_SIG_BAD_APPTAG: + return 0x2; + } + } + } + + return 0; +err: + /* Not alot we can do here, return ambiguous guard error */ + return 0x1; +} diff --git a/drivers/infiniband/ulp/isert/Kconfig b/drivers/infiniband/ulp/isert/Kconfig new file mode 100644 index 00000000000..02f9759ebb1 --- /dev/null +++ b/drivers/infiniband/ulp/isert/Kconfig @@ -0,0 +1,5 @@ +config INFINIBAND_ISERT + tristate "iSCSI Extensions for RDMA (iSER) target support" + depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET + ---help--- + Support for iSCSI Extensions for RDMA (iSER) Target on Infiniband fabrics. diff --git a/drivers/infiniband/ulp/isert/Makefile b/drivers/infiniband/ulp/isert/Makefile new file mode 100644 index 00000000000..c8bf2421f5b --- /dev/null +++ b/drivers/infiniband/ulp/isert/Makefile @@ -0,0 +1,2 @@ +ccflags-y := -Idrivers/target -Idrivers/target/iscsi +obj-$(CONFIG_INFINIBAND_ISERT) += ib_isert.o diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c new file mode 100644 index 00000000000..d4c7928a0f3 --- /dev/null +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -0,0 +1,3308 @@ +/******************************************************************************* + * This file contains iSCSI extentions for RDMA (iSER) Verbs + * + * (c) Copyright 2013 Datera, Inc. + * + * Nicholas A. Bellinger <nab@linux-iscsi.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + ****************************************************************************/ + +#include <linux/string.h> +#include <linux/module.h> +#include <linux/scatterlist.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/llist.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <target/target_core_base.h> +#include <target/target_core_fabric.h> +#include <target/iscsi/iscsi_transport.h> +#include <linux/semaphore.h> + +#include "isert_proto.h" +#include "ib_isert.h" + +#define ISERT_MAX_CONN 8 +#define ISER_MAX_RX_CQ_LEN (ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN) +#define ISER_MAX_TX_CQ_LEN (ISERT_QP_MAX_REQ_DTOS * ISERT_MAX_CONN) + +static DEFINE_MUTEX(device_list_mutex); +static LIST_HEAD(device_list); +static struct workqueue_struct *isert_rx_wq; +static struct workqueue_struct *isert_comp_wq; + +static void +isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); +static int +isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr); +static void +isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); +static int +isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr); +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd); + +static void +isert_qp_event_callback(struct ib_event *e, void *context) +{ + struct isert_conn *isert_conn = (struct isert_conn *)context; + + pr_err("isert_qp_event_callback event: %d\n", e->event); + switch (e->event) { + case IB_EVENT_COMM_EST: + rdma_notify(isert_conn->conn_cm_id, IB_EVENT_COMM_EST); + break; + case IB_EVENT_QP_LAST_WQE_REACHED: + pr_warn("Reached TX IB_EVENT_QP_LAST_WQE_REACHED:\n"); + break; + default: + break; + } +} + +static int +isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr) +{ + int ret; + + ret = ib_query_device(ib_dev, devattr); + if (ret) { + pr_err("ib_query_device() failed: %d\n", ret); + return ret; + } + pr_debug("devattr->max_sge: %d\n", devattr->max_sge); + pr_debug("devattr->max_sge_rd: %d\n", devattr->max_sge_rd); + + return 0; +} + +static int +isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id, + u8 protection) +{ + struct isert_device *device = isert_conn->conn_device; + struct ib_qp_init_attr attr; + int ret, index, min_index = 0; + + mutex_lock(&device_list_mutex); + for (index = 0; index < device->cqs_used; index++) + if (device->cq_active_qps[index] < + device->cq_active_qps[min_index]) + min_index = index; + device->cq_active_qps[min_index]++; + pr_debug("isert_conn_setup_qp: Using min_index: %d\n", min_index); + mutex_unlock(&device_list_mutex); + + memset(&attr, 0, sizeof(struct ib_qp_init_attr)); + attr.event_handler = isert_qp_event_callback; + attr.qp_context = isert_conn; + attr.send_cq = device->dev_tx_cq[min_index]; + attr.recv_cq = device->dev_rx_cq[min_index]; + attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS; + attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS; + /* + * FIXME: Use devattr.max_sge - 2 for max_send_sge as + * work-around for RDMA_READ.. + */ + attr.cap.max_send_sge = device->dev_attr.max_sge - 2; + isert_conn->max_sge = attr.cap.max_send_sge; + + attr.cap.max_recv_sge = 1; + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + attr.qp_type = IB_QPT_RC; + if (protection) + attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + + pr_debug("isert_conn_setup_qp cma_id->device: %p\n", + cma_id->device); + pr_debug("isert_conn_setup_qp conn_pd->device: %p\n", + isert_conn->conn_pd->device); + + ret = rdma_create_qp(cma_id, isert_conn->conn_pd, &attr); + if (ret) { + pr_err("rdma_create_qp failed for cma_id %d\n", ret); + return ret; + } + isert_conn->conn_qp = cma_id->qp; + pr_debug("rdma_create_qp() returned success >>>>>>>>>>>>>>>>>>>>>>>>>.\n"); + + return 0; +} + +static void +isert_cq_event_callback(struct ib_event *e, void *context) +{ + pr_debug("isert_cq_event_callback event: %d\n", e->event); +} + +static int +isert_alloc_rx_descriptors(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iser_rx_desc *rx_desc; + struct ib_sge *rx_sg; + u64 dma_addr; + int i, j; + + isert_conn->conn_rx_descs = kzalloc(ISERT_QP_MAX_RECV_DTOS * + sizeof(struct iser_rx_desc), GFP_KERNEL); + if (!isert_conn->conn_rx_descs) + goto fail; + + rx_desc = isert_conn->conn_rx_descs; + + for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { + dma_addr = ib_dma_map_single(ib_dev, (void *)rx_desc, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ib_dev, dma_addr)) + goto dma_map_fail; + + rx_desc->dma_addr = dma_addr; + + rx_sg = &rx_desc->rx_sg; + rx_sg->addr = rx_desc->dma_addr; + rx_sg->length = ISER_RX_PAYLOAD_SIZE; + rx_sg->lkey = isert_conn->conn_mr->lkey; + } + + isert_conn->conn_rx_desc_head = 0; + return 0; + +dma_map_fail: + rx_desc = isert_conn->conn_rx_descs; + for (j = 0; j < i; j++, rx_desc++) { + ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + } + kfree(isert_conn->conn_rx_descs); + isert_conn->conn_rx_descs = NULL; +fail: + return -ENOMEM; +} + +static void +isert_free_rx_descriptors(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iser_rx_desc *rx_desc; + int i; + + if (!isert_conn->conn_rx_descs) + return; + + rx_desc = isert_conn->conn_rx_descs; + for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { + ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + } + + kfree(isert_conn->conn_rx_descs); + isert_conn->conn_rx_descs = NULL; +} + +static void isert_cq_tx_work(struct work_struct *); +static void isert_cq_tx_callback(struct ib_cq *, void *); +static void isert_cq_rx_work(struct work_struct *); +static void isert_cq_rx_callback(struct ib_cq *, void *); + +static int +isert_create_device_ib_res(struct isert_device *device) +{ + struct ib_device *ib_dev = device->ib_device; + struct isert_cq_desc *cq_desc; + struct ib_device_attr *dev_attr; + int ret = 0, i, j; + + dev_attr = &device->dev_attr; + ret = isert_query_device(ib_dev, dev_attr); + if (ret) + return ret; + + /* asign function handlers */ + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && + dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { + device->use_fastreg = 1; + device->reg_rdma_mem = isert_reg_rdma; + device->unreg_rdma_mem = isert_unreg_rdma; + } else { + device->use_fastreg = 0; + device->reg_rdma_mem = isert_map_rdma; + device->unreg_rdma_mem = isert_unmap_cmd; + } + + /* Check signature cap */ + device->pi_capable = dev_attr->device_cap_flags & + IB_DEVICE_SIGNATURE_HANDOVER ? true : false; + + device->cqs_used = min_t(int, num_online_cpus(), + device->ib_device->num_comp_vectors); + device->cqs_used = min(ISERT_MAX_CQ, device->cqs_used); + pr_debug("Using %d CQs, device %s supports %d vectors support " + "Fast registration %d pi_capable %d\n", + device->cqs_used, device->ib_device->name, + device->ib_device->num_comp_vectors, device->use_fastreg, + device->pi_capable); + device->cq_desc = kzalloc(sizeof(struct isert_cq_desc) * + device->cqs_used, GFP_KERNEL); + if (!device->cq_desc) { + pr_err("Unable to allocate device->cq_desc\n"); + return -ENOMEM; + } + cq_desc = device->cq_desc; + + for (i = 0; i < device->cqs_used; i++) { + cq_desc[i].device = device; + cq_desc[i].cq_index = i; + + INIT_WORK(&cq_desc[i].cq_rx_work, isert_cq_rx_work); + device->dev_rx_cq[i] = ib_create_cq(device->ib_device, + isert_cq_rx_callback, + isert_cq_event_callback, + (void *)&cq_desc[i], + ISER_MAX_RX_CQ_LEN, i); + if (IS_ERR(device->dev_rx_cq[i])) { + ret = PTR_ERR(device->dev_rx_cq[i]); + device->dev_rx_cq[i] = NULL; + goto out_cq; + } + + INIT_WORK(&cq_desc[i].cq_tx_work, isert_cq_tx_work); + device->dev_tx_cq[i] = ib_create_cq(device->ib_device, + isert_cq_tx_callback, + isert_cq_event_callback, + (void *)&cq_desc[i], + ISER_MAX_TX_CQ_LEN, i); + if (IS_ERR(device->dev_tx_cq[i])) { + ret = PTR_ERR(device->dev_tx_cq[i]); + device->dev_tx_cq[i] = NULL; + goto out_cq; + } + + ret = ib_req_notify_cq(device->dev_rx_cq[i], IB_CQ_NEXT_COMP); + if (ret) + goto out_cq; + + ret = ib_req_notify_cq(device->dev_tx_cq[i], IB_CQ_NEXT_COMP); + if (ret) + goto out_cq; + } + + return 0; + +out_cq: + for (j = 0; j < i; j++) { + cq_desc = &device->cq_desc[j]; + + if (device->dev_rx_cq[j]) { + cancel_work_sync(&cq_desc->cq_rx_work); + ib_destroy_cq(device->dev_rx_cq[j]); + } + if (device->dev_tx_cq[j]) { + cancel_work_sync(&cq_desc->cq_tx_work); + ib_destroy_cq(device->dev_tx_cq[j]); + } + } + kfree(device->cq_desc); + + return ret; +} + +static void +isert_free_device_ib_res(struct isert_device *device) +{ + struct isert_cq_desc *cq_desc; + int i; + + for (i = 0; i < device->cqs_used; i++) { + cq_desc = &device->cq_desc[i]; + + cancel_work_sync(&cq_desc->cq_rx_work); + cancel_work_sync(&cq_desc->cq_tx_work); + ib_destroy_cq(device->dev_rx_cq[i]); + ib_destroy_cq(device->dev_tx_cq[i]); + device->dev_rx_cq[i] = NULL; + device->dev_tx_cq[i] = NULL; + } + + kfree(device->cq_desc); +} + +static void +isert_device_try_release(struct isert_device *device) +{ + mutex_lock(&device_list_mutex); + device->refcount--; + if (!device->refcount) { + isert_free_device_ib_res(device); + list_del(&device->dev_node); + kfree(device); + } + mutex_unlock(&device_list_mutex); +} + +static struct isert_device * +isert_device_find_by_ib_dev(struct rdma_cm_id *cma_id) +{ + struct isert_device *device; + int ret; + + mutex_lock(&device_list_mutex); + list_for_each_entry(device, &device_list, dev_node) { + if (device->ib_device->node_guid == cma_id->device->node_guid) { + device->refcount++; + mutex_unlock(&device_list_mutex); + return device; + } + } + + device = kzalloc(sizeof(struct isert_device), GFP_KERNEL); + if (!device) { + mutex_unlock(&device_list_mutex); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&device->dev_node); + + device->ib_device = cma_id->device; + ret = isert_create_device_ib_res(device); + if (ret) { + kfree(device); + mutex_unlock(&device_list_mutex); + return ERR_PTR(ret); + } + + device->refcount++; + list_add_tail(&device->dev_node, &device_list); + mutex_unlock(&device_list_mutex); + + return device; +} + +static void +isert_conn_free_fastreg_pool(struct isert_conn *isert_conn) +{ + struct fast_reg_descriptor *fr_desc, *tmp; + int i = 0; + + if (list_empty(&isert_conn->conn_fr_pool)) + return; + + pr_debug("Freeing conn %p fastreg pool", isert_conn); + + list_for_each_entry_safe(fr_desc, tmp, + &isert_conn->conn_fr_pool, list) { + list_del(&fr_desc->list); + ib_free_fast_reg_page_list(fr_desc->data_frpl); + ib_dereg_mr(fr_desc->data_mr); + if (fr_desc->pi_ctx) { + ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); + ib_dereg_mr(fr_desc->pi_ctx->prot_mr); + ib_destroy_mr(fr_desc->pi_ctx->sig_mr); + kfree(fr_desc->pi_ctx); + } + kfree(fr_desc); + ++i; + } + + if (i < isert_conn->conn_fr_pool_size) + pr_warn("Pool still has %d regions registered\n", + isert_conn->conn_fr_pool_size - i); +} + +static int +isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd, + struct fast_reg_descriptor *fr_desc, u8 protection) +{ + int ret; + + fr_desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(fr_desc->data_frpl)) { + pr_err("Failed to allocate data frpl err=%ld\n", + PTR_ERR(fr_desc->data_frpl)); + return PTR_ERR(fr_desc->data_frpl); + } + + fr_desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(fr_desc->data_mr)) { + pr_err("Failed to allocate data frmr err=%ld\n", + PTR_ERR(fr_desc->data_mr)); + ret = PTR_ERR(fr_desc->data_mr); + goto err_data_frpl; + } + pr_debug("Create fr_desc %p page_list %p\n", + fr_desc, fr_desc->data_frpl->page_list); + fr_desc->ind |= ISERT_DATA_KEY_VALID; + + if (protection) { + struct ib_mr_init_attr mr_init_attr = {0}; + struct pi_context *pi_ctx; + + fr_desc->pi_ctx = kzalloc(sizeof(*fr_desc->pi_ctx), GFP_KERNEL); + if (!fr_desc->pi_ctx) { + pr_err("Failed to allocate pi context\n"); + ret = -ENOMEM; + goto err_data_mr; + } + pi_ctx = fr_desc->pi_ctx; + + pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_frpl)) { + pr_err("Failed to allocate prot frpl err=%ld\n", + PTR_ERR(pi_ctx->prot_frpl)); + ret = PTR_ERR(pi_ctx->prot_frpl); + goto err_pi_ctx; + } + + pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_mr)) { + pr_err("Failed to allocate prot frmr err=%ld\n", + PTR_ERR(pi_ctx->prot_mr)); + ret = PTR_ERR(pi_ctx->prot_mr); + goto err_prot_frpl; + } + fr_desc->ind |= ISERT_PROT_KEY_VALID; + + mr_init_attr.max_reg_descriptors = 2; + mr_init_attr.flags |= IB_MR_SIGNATURE_EN; + pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + if (IS_ERR(pi_ctx->sig_mr)) { + pr_err("Failed to allocate signature enabled mr err=%ld\n", + PTR_ERR(pi_ctx->sig_mr)); + ret = PTR_ERR(pi_ctx->sig_mr); + goto err_prot_mr; + } + fr_desc->ind |= ISERT_SIG_KEY_VALID; + } + fr_desc->ind &= ~ISERT_PROTECTED; + + return 0; +err_prot_mr: + ib_dereg_mr(fr_desc->pi_ctx->prot_mr); +err_prot_frpl: + ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); +err_pi_ctx: + kfree(fr_desc->pi_ctx); +err_data_mr: + ib_dereg_mr(fr_desc->data_mr); +err_data_frpl: + ib_free_fast_reg_page_list(fr_desc->data_frpl); + + return ret; +} + +static int +isert_conn_create_fastreg_pool(struct isert_conn *isert_conn, u8 pi_support) +{ + struct fast_reg_descriptor *fr_desc; + struct isert_device *device = isert_conn->conn_device; + struct se_session *se_sess = isert_conn->conn->sess->se_sess; + struct se_node_acl *se_nacl = se_sess->se_node_acl; + int i, ret, tag_num; + /* + * Setup the number of FRMRs based upon the number of tags + * available to session in iscsi_target_locate_portal(). + */ + tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth); + tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS; + + isert_conn->conn_fr_pool_size = 0; + for (i = 0; i < tag_num; i++) { + fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL); + if (!fr_desc) { + pr_err("Failed to allocate fast_reg descriptor\n"); + ret = -ENOMEM; + goto err; + } + + ret = isert_create_fr_desc(device->ib_device, + isert_conn->conn_pd, fr_desc, + pi_support); + if (ret) { + pr_err("Failed to create fastreg descriptor err=%d\n", + ret); + kfree(fr_desc); + goto err; + } + + list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); + isert_conn->conn_fr_pool_size++; + } + + pr_debug("Creating conn %p fastreg pool size=%d", + isert_conn, isert_conn->conn_fr_pool_size); + + return 0; + +err: + isert_conn_free_fastreg_pool(isert_conn); + return ret; +} + +static int +isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + struct iscsi_np *np = cma_id->context; + struct isert_np *isert_np = np->np_context; + struct isert_conn *isert_conn; + struct isert_device *device; + struct ib_device *ib_dev = cma_id->device; + int ret = 0; + u8 pi_support; + + spin_lock_bh(&np->np_thread_lock); + if (!np->enabled) { + spin_unlock_bh(&np->np_thread_lock); + pr_debug("iscsi_np is not enabled, reject connect request\n"); + return rdma_reject(cma_id, NULL, 0); + } + spin_unlock_bh(&np->np_thread_lock); + + pr_debug("Entering isert_connect_request cma_id: %p, context: %p\n", + cma_id, cma_id->context); + + isert_conn = kzalloc(sizeof(struct isert_conn), GFP_KERNEL); + if (!isert_conn) { + pr_err("Unable to allocate isert_conn\n"); + return -ENOMEM; + } + isert_conn->state = ISER_CONN_INIT; + INIT_LIST_HEAD(&isert_conn->conn_accept_node); + init_completion(&isert_conn->conn_login_comp); + init_completion(&isert_conn->conn_wait); + init_completion(&isert_conn->conn_wait_comp_err); + kref_init(&isert_conn->conn_kref); + kref_get(&isert_conn->conn_kref); + mutex_init(&isert_conn->conn_mutex); + spin_lock_init(&isert_conn->conn_lock); + INIT_LIST_HEAD(&isert_conn->conn_fr_pool); + + cma_id->context = isert_conn; + isert_conn->conn_cm_id = cma_id; + isert_conn->responder_resources = event->param.conn.responder_resources; + isert_conn->initiator_depth = event->param.conn.initiator_depth; + pr_debug("Using responder_resources: %u initiator_depth: %u\n", + isert_conn->responder_resources, isert_conn->initiator_depth); + + isert_conn->login_buf = kzalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + + ISER_RX_LOGIN_SIZE, GFP_KERNEL); + if (!isert_conn->login_buf) { + pr_err("Unable to allocate isert_conn->login_buf\n"); + ret = -ENOMEM; + goto out; + } + + isert_conn->login_req_buf = isert_conn->login_buf; + isert_conn->login_rsp_buf = isert_conn->login_buf + + ISCSI_DEF_MAX_RECV_SEG_LEN; + pr_debug("Set login_buf: %p login_req_buf: %p login_rsp_buf: %p\n", + isert_conn->login_buf, isert_conn->login_req_buf, + isert_conn->login_rsp_buf); + + isert_conn->login_req_dma = ib_dma_map_single(ib_dev, + (void *)isert_conn->login_req_buf, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE); + + ret = ib_dma_mapping_error(ib_dev, isert_conn->login_req_dma); + if (ret) { + pr_err("ib_dma_mapping_error failed for login_req_dma: %d\n", + ret); + isert_conn->login_req_dma = 0; + goto out_login_buf; + } + + isert_conn->login_rsp_dma = ib_dma_map_single(ib_dev, + (void *)isert_conn->login_rsp_buf, + ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); + + ret = ib_dma_mapping_error(ib_dev, isert_conn->login_rsp_dma); + if (ret) { + pr_err("ib_dma_mapping_error failed for login_rsp_dma: %d\n", + ret); + isert_conn->login_rsp_dma = 0; + goto out_req_dma_map; + } + + device = isert_device_find_by_ib_dev(cma_id); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto out_rsp_dma_map; + } + + isert_conn->conn_device = device; + isert_conn->conn_pd = ib_alloc_pd(isert_conn->conn_device->ib_device); + if (IS_ERR(isert_conn->conn_pd)) { + ret = PTR_ERR(isert_conn->conn_pd); + pr_err("ib_alloc_pd failed for conn %p: ret=%d\n", + isert_conn, ret); + goto out_pd; + } + + isert_conn->conn_mr = ib_get_dma_mr(isert_conn->conn_pd, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(isert_conn->conn_mr)) { + ret = PTR_ERR(isert_conn->conn_mr); + pr_err("ib_get_dma_mr failed for conn %p: ret=%d\n", + isert_conn, ret); + goto out_mr; + } + + pi_support = np->tpg_np->tpg->tpg_attrib.t10_pi; + if (pi_support && !device->pi_capable) { + pr_err("Protection information requested but not supported, " + "rejecting connect request\n"); + ret = rdma_reject(cma_id, NULL, 0); + goto out_mr; + } + + ret = isert_conn_setup_qp(isert_conn, cma_id, pi_support); + if (ret) + goto out_conn_dev; + + mutex_lock(&isert_np->np_accept_mutex); + list_add_tail(&isert_conn->conn_accept_node, &isert_np->np_accept_list); + mutex_unlock(&isert_np->np_accept_mutex); + + pr_debug("isert_connect_request() up np_sem np: %p\n", np); + up(&isert_np->np_sem); + return 0; + +out_conn_dev: + ib_dereg_mr(isert_conn->conn_mr); +out_mr: + ib_dealloc_pd(isert_conn->conn_pd); +out_pd: + isert_device_try_release(device); +out_rsp_dma_map: + ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, + ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); +out_req_dma_map: + ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE); +out_login_buf: + kfree(isert_conn->login_buf); +out: + kfree(isert_conn); + return ret; +} + +static void +isert_connect_release(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_device *device = isert_conn->conn_device; + int cq_index; + + pr_debug("Entering isert_connect_release(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); + + if (device && device->use_fastreg) + isert_conn_free_fastreg_pool(isert_conn); + + if (isert_conn->conn_qp) { + cq_index = ((struct isert_cq_desc *) + isert_conn->conn_qp->recv_cq->cq_context)->cq_index; + pr_debug("isert_connect_release: cq_index: %d\n", cq_index); + isert_conn->conn_device->cq_active_qps[cq_index]--; + + rdma_destroy_qp(isert_conn->conn_cm_id); + } + + isert_free_rx_descriptors(isert_conn); + rdma_destroy_id(isert_conn->conn_cm_id); + + ib_dereg_mr(isert_conn->conn_mr); + ib_dealloc_pd(isert_conn->conn_pd); + + if (isert_conn->login_buf) { + ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, + ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); + ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_FROM_DEVICE); + kfree(isert_conn->login_buf); + } + kfree(isert_conn); + + if (device) + isert_device_try_release(device); + + pr_debug("Leaving isert_connect_release >>>>>>>>>>>>\n"); +} + +static void +isert_connected_handler(struct rdma_cm_id *cma_id) +{ + return; +} + +static void +isert_release_conn_kref(struct kref *kref) +{ + struct isert_conn *isert_conn = container_of(kref, + struct isert_conn, conn_kref); + + pr_debug("Calling isert_connect_release for final kref %s/%d\n", + current->comm, current->pid); + + isert_connect_release(isert_conn); +} + +static void +isert_put_conn(struct isert_conn *isert_conn) +{ + kref_put(&isert_conn->conn_kref, isert_release_conn_kref); +} + +static void +isert_disconnect_work(struct work_struct *work) +{ + struct isert_conn *isert_conn = container_of(work, + struct isert_conn, conn_logout_work); + + pr_debug("isert_disconnect_work(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); + mutex_lock(&isert_conn->conn_mutex); + if (isert_conn->state == ISER_CONN_UP) + isert_conn->state = ISER_CONN_TERMINATING; + + if (isert_conn->post_recv_buf_count == 0 && + atomic_read(&isert_conn->post_send_buf_count) == 0) { + mutex_unlock(&isert_conn->conn_mutex); + goto wake_up; + } + if (!isert_conn->conn_cm_id) { + mutex_unlock(&isert_conn->conn_mutex); + isert_put_conn(isert_conn); + return; + } + + if (isert_conn->disconnect) { + /* Send DREQ/DREP towards our initiator */ + rdma_disconnect(isert_conn->conn_cm_id); + } + + mutex_unlock(&isert_conn->conn_mutex); + +wake_up: + complete(&isert_conn->conn_wait); + isert_put_conn(isert_conn); +} + +static void +isert_disconnected_handler(struct rdma_cm_id *cma_id, bool disconnect) +{ + struct isert_conn *isert_conn = (struct isert_conn *)cma_id->context; + + isert_conn->disconnect = disconnect; + INIT_WORK(&isert_conn->conn_logout_work, isert_disconnect_work); + schedule_work(&isert_conn->conn_logout_work); +} + +static int +isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + int ret = 0; + bool disconnect = false; + + pr_debug("isert_cma_handler: event %d status %d conn %p id %p\n", + event->event, event->status, cma_id->context, cma_id); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = isert_connect_request(cma_id, event); + break; + case RDMA_CM_EVENT_ESTABLISHED: + isert_connected_handler(cma_id); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: /* FALLTHRU */ + case RDMA_CM_EVENT_DISCONNECTED: /* FALLTHRU */ + case RDMA_CM_EVENT_DEVICE_REMOVAL: /* FALLTHRU */ + disconnect = true; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: /* FALLTHRU */ + isert_disconnected_handler(cma_id, disconnect); + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + default: + pr_err("Unhandled RDMA CMA event: %d\n", event->event); + break; + } + + if (ret != 0) { + pr_err("isert_cma_handler failed RDMA_CM_EVENT: 0x%08x %d\n", + event->event, ret); + dump_stack(); + } + + return ret; +} + +static int +isert_post_recv(struct isert_conn *isert_conn, u32 count) +{ + struct ib_recv_wr *rx_wr, *rx_wr_failed; + int i, ret; + unsigned int rx_head = isert_conn->conn_rx_desc_head; + struct iser_rx_desc *rx_desc; + + for (rx_wr = isert_conn->conn_rx_wr, i = 0; i < count; i++, rx_wr++) { + rx_desc = &isert_conn->conn_rx_descs[rx_head]; + rx_wr->wr_id = (unsigned long)rx_desc; + rx_wr->sg_list = &rx_desc->rx_sg; + rx_wr->num_sge = 1; + rx_wr->next = rx_wr + 1; + rx_head = (rx_head + 1) & (ISERT_QP_MAX_RECV_DTOS - 1); + } + + rx_wr--; + rx_wr->next = NULL; /* mark end of work requests list */ + + isert_conn->post_recv_buf_count += count; + ret = ib_post_recv(isert_conn->conn_qp, isert_conn->conn_rx_wr, + &rx_wr_failed); + if (ret) { + pr_err("ib_post_recv() failed with ret: %d\n", ret); + isert_conn->post_recv_buf_count -= count; + } else { + pr_debug("isert_post_recv(): Posted %d RX buffers\n", count); + isert_conn->conn_rx_desc_head = rx_head; + } + return ret; +} + +static int +isert_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_send_wr send_wr, *send_wr_failed; + int ret; + + ib_dma_sync_single_for_device(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + + send_wr.next = NULL; + send_wr.wr_id = (unsigned long)tx_desc; + send_wr.sg_list = tx_desc->tx_sg; + send_wr.num_sge = tx_desc->num_sge; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + atomic_inc(&isert_conn->post_send_buf_count); + + ret = ib_post_send(isert_conn->conn_qp, &send_wr, &send_wr_failed); + if (ret) { + pr_err("ib_post_send() failed, ret: %d\n", ret); + atomic_dec(&isert_conn->post_send_buf_count); + } + + return ret; +} + +static void +isert_create_send_desc(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, + struct iser_tx_desc *tx_desc) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + + ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + + memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); + tx_desc->iser_header.flags = ISER_VER; + + tx_desc->num_sge = 1; + tx_desc->isert_cmd = isert_cmd; + + if (tx_desc->tx_sg[0].lkey != isert_conn->conn_mr->lkey) { + tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey; + pr_debug("tx_desc %p lkey mismatch, fixing\n", tx_desc); + } +} + +static int +isert_init_tx_hdrs(struct isert_conn *isert_conn, + struct iser_tx_desc *tx_desc) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + u64 dma_addr; + + dma_addr = ib_dma_map_single(ib_dev, (void *)tx_desc, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + if (ib_dma_mapping_error(ib_dev, dma_addr)) { + pr_err("ib_dma_mapping_error() failed\n"); + return -ENOMEM; + } + + tx_desc->dma_addr = dma_addr; + tx_desc->tx_sg[0].addr = tx_desc->dma_addr; + tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; + tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey; + + pr_debug("isert_init_tx_hdrs: Setup tx_sg[0].addr: 0x%llx length: %u" + " lkey: 0x%08x\n", tx_desc->tx_sg[0].addr, + tx_desc->tx_sg[0].length, tx_desc->tx_sg[0].lkey); + + return 0; +} + +static void +isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct ib_send_wr *send_wr, bool coalesce) +{ + struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc; + + isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND; + send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; + send_wr->opcode = IB_WR_SEND; + send_wr->sg_list = &tx_desc->tx_sg[0]; + send_wr->num_sge = isert_cmd->tx_desc.num_sge; + /* + * Coalesce send completion interrupts by only setting IB_SEND_SIGNALED + * bit for every ISERT_COMP_BATCH_COUNT number of ib_post_send() calls. + */ + mutex_lock(&isert_conn->conn_mutex); + if (coalesce && isert_conn->state == ISER_CONN_UP && + ++isert_conn->conn_comp_batch < ISERT_COMP_BATCH_COUNT) { + tx_desc->llnode_active = true; + llist_add(&tx_desc->comp_llnode, &isert_conn->conn_comp_llist); + mutex_unlock(&isert_conn->conn_mutex); + return; + } + isert_conn->conn_comp_batch = 0; + tx_desc->comp_llnode_batch = llist_del_all(&isert_conn->conn_comp_llist); + mutex_unlock(&isert_conn->conn_mutex); + + send_wr->send_flags = IB_SEND_SIGNALED; +} + +static int +isert_rdma_post_recvl(struct isert_conn *isert_conn) +{ + struct ib_recv_wr rx_wr, *rx_wr_fail; + struct ib_sge sge; + int ret; + + memset(&sge, 0, sizeof(struct ib_sge)); + sge.addr = isert_conn->login_req_dma; + sge.length = ISER_RX_LOGIN_SIZE; + sge.lkey = isert_conn->conn_mr->lkey; + + pr_debug("Setup sge: addr: %llx length: %d 0x%08x\n", + sge.addr, sge.length, sge.lkey); + + memset(&rx_wr, 0, sizeof(struct ib_recv_wr)); + rx_wr.wr_id = (unsigned long)isert_conn->login_req_buf; + rx_wr.sg_list = &sge; + rx_wr.num_sge = 1; + + isert_conn->post_recv_buf_count++; + ret = ib_post_recv(isert_conn->conn_qp, &rx_wr, &rx_wr_fail); + if (ret) { + pr_err("ib_post_recv() failed: %d\n", ret); + isert_conn->post_recv_buf_count--; + } + + pr_debug("ib_post_recv(): returned success >>>>>>>>>>>>>>>>>>>>>>>>\n"); + return ret; +} + +static int +isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, + u32 length) +{ + struct isert_conn *isert_conn = conn->context; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iser_tx_desc *tx_desc = &isert_conn->conn_login_tx_desc; + int ret; + + isert_create_send_desc(isert_conn, NULL, tx_desc); + + memcpy(&tx_desc->iscsi_header, &login->rsp[0], + sizeof(struct iscsi_hdr)); + + isert_init_tx_hdrs(isert_conn, tx_desc); + + if (length > 0) { + struct ib_sge *tx_dsg = &tx_desc->tx_sg[1]; + + ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_rsp_dma, + length, DMA_TO_DEVICE); + + memcpy(isert_conn->login_rsp_buf, login->rsp_buf, length); + + ib_dma_sync_single_for_device(ib_dev, isert_conn->login_rsp_dma, + length, DMA_TO_DEVICE); + + tx_dsg->addr = isert_conn->login_rsp_dma; + tx_dsg->length = length; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + tx_desc->num_sge = 2; + } + if (!login->login_failed) { + if (login->login_complete) { + if (!conn->sess->sess_ops->SessionType && + isert_conn->conn_device->use_fastreg) { + /* Normal Session and fastreg is used */ + u8 pi_support = login->np->tpg_np->tpg->tpg_attrib.t10_pi; + + ret = isert_conn_create_fastreg_pool(isert_conn, + pi_support); + if (ret) { + pr_err("Conn: %p failed to create" + " fastreg pool\n", isert_conn); + return ret; + } + } + + ret = isert_alloc_rx_descriptors(isert_conn); + if (ret) + return ret; + + ret = isert_post_recv(isert_conn, ISERT_MIN_POSTED_RX); + if (ret) + return ret; + + isert_conn->state = ISER_CONN_UP; + goto post_send; + } + + ret = isert_rdma_post_recvl(isert_conn); + if (ret) + return ret; + } +post_send: + ret = isert_post_send(isert_conn, tx_desc); + if (ret) + return ret; + + return 0; +} + +static void +isert_rx_login_req(struct iser_rx_desc *rx_desc, int rx_buflen, + struct isert_conn *isert_conn) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_login *login = conn->conn_login; + int size; + + if (!login) { + pr_err("conn->conn_login is NULL\n"); + dump_stack(); + return; + } + + if (login->first_request) { + struct iscsi_login_req *login_req = + (struct iscsi_login_req *)&rx_desc->iscsi_header; + /* + * Setup the initial iscsi_login values from the leading + * login request PDU. + */ + login->leading_connection = (!login_req->tsih) ? 1 : 0; + login->current_stage = + (login_req->flags & ISCSI_FLAG_LOGIN_CURRENT_STAGE_MASK) + >> 2; + login->version_min = login_req->min_version; + login->version_max = login_req->max_version; + memcpy(login->isid, login_req->isid, 6); + login->cmd_sn = be32_to_cpu(login_req->cmdsn); + login->init_task_tag = login_req->itt; + login->initial_exp_statsn = be32_to_cpu(login_req->exp_statsn); + login->cid = be16_to_cpu(login_req->cid); + login->tsih = be16_to_cpu(login_req->tsih); + } + + memcpy(&login->req[0], (void *)&rx_desc->iscsi_header, ISCSI_HDR_LEN); + + size = min(rx_buflen, MAX_KEY_VALUE_PAIRS); + pr_debug("Using login payload size: %d, rx_buflen: %d MAX_KEY_VALUE_PAIRS: %d\n", + size, rx_buflen, MAX_KEY_VALUE_PAIRS); + memcpy(login->req_buf, &rx_desc->data[0], size); + + if (login->first_request) { + complete(&isert_conn->conn_login_comp); + return; + } + schedule_delayed_work(&conn->login_work, 0); +} + +static struct iscsi_cmd +*isert_allocate_cmd(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_cmd *isert_cmd; + struct iscsi_cmd *cmd; + + cmd = iscsit_allocate_cmd(conn, TASK_INTERRUPTIBLE); + if (!cmd) { + pr_err("Unable to allocate iscsi_cmd + isert_cmd\n"); + return NULL; + } + isert_cmd = iscsit_priv_cmd(cmd); + isert_cmd->conn = isert_conn; + isert_cmd->iscsi_cmd = cmd; + + return cmd; +} + +static int +isert_handle_scsi_cmd(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, struct iscsi_cmd *cmd, + struct iser_rx_desc *rx_desc, unsigned char *buf) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)buf; + struct scatterlist *sg; + int imm_data, imm_data_len, unsol_data, sg_nents, rc; + bool dump_payload = false; + + rc = iscsit_setup_scsi_cmd(conn, cmd, buf); + if (rc < 0) + return rc; + + imm_data = cmd->immediate_data; + imm_data_len = cmd->first_burst_len; + unsol_data = cmd->unsolicited_data; + + rc = iscsit_process_scsi_cmd(conn, cmd, hdr); + if (rc < 0) { + return 0; + } else if (rc > 0) { + dump_payload = true; + goto sequence_cmd; + } + + if (!imm_data) + return 0; + + sg = &cmd->se_cmd.t_data_sg[0]; + sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE)); + + pr_debug("Copying Immediate SG: %p sg_nents: %u from %p imm_data_len: %d\n", + sg, sg_nents, &rx_desc->data[0], imm_data_len); + + sg_copy_from_buffer(sg, sg_nents, &rx_desc->data[0], imm_data_len); + + cmd->write_data_done += imm_data_len; + + if (cmd->write_data_done == cmd->se_cmd.data_length) { + spin_lock_bh(&cmd->istate_lock); + cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; + cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; + spin_unlock_bh(&cmd->istate_lock); + } + +sequence_cmd: + rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn); + + if (!rc && dump_payload == false && unsol_data) + iscsit_set_unsoliticed_dataout(cmd); + else if (dump_payload && imm_data) + target_put_sess_cmd(conn->sess->se_sess, &cmd->se_cmd); + + return 0; +} + +static int +isert_handle_iscsi_dataout(struct isert_conn *isert_conn, + struct iser_rx_desc *rx_desc, unsigned char *buf) +{ + struct scatterlist *sg_start; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_cmd *cmd = NULL; + struct iscsi_data *hdr = (struct iscsi_data *)buf; + u32 unsol_data_len = ntoh24(hdr->dlength); + int rc, sg_nents, sg_off, page_off; + + rc = iscsit_check_dataout_hdr(conn, buf, &cmd); + if (rc < 0) + return rc; + else if (!cmd) + return 0; + /* + * FIXME: Unexpected unsolicited_data out + */ + if (!cmd->unsolicited_data) { + pr_err("Received unexpected solicited data payload\n"); + dump_stack(); + return -1; + } + + pr_debug("Unsolicited DataOut unsol_data_len: %u, write_data_done: %u, data_length: %u\n", + unsol_data_len, cmd->write_data_done, cmd->se_cmd.data_length); + + sg_off = cmd->write_data_done / PAGE_SIZE; + sg_start = &cmd->se_cmd.t_data_sg[sg_off]; + sg_nents = max(1UL, DIV_ROUND_UP(unsol_data_len, PAGE_SIZE)); + page_off = cmd->write_data_done % PAGE_SIZE; + /* + * FIXME: Non page-aligned unsolicited_data out + */ + if (page_off) { + pr_err("Received unexpected non-page aligned data payload\n"); + dump_stack(); + return -1; + } + pr_debug("Copying DataOut: sg_start: %p, sg_off: %u sg_nents: %u from %p %u\n", + sg_start, sg_off, sg_nents, &rx_desc->data[0], unsol_data_len); + + sg_copy_from_buffer(sg_start, sg_nents, &rx_desc->data[0], + unsol_data_len); + + rc = iscsit_check_dataout_payload(cmd, hdr, false); + if (rc < 0) + return rc; + + return 0; +} + +static int +isert_handle_nop_out(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, + unsigned char *buf) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_nopout *hdr = (struct iscsi_nopout *)buf; + int rc; + + rc = iscsit_setup_nop_out(conn, cmd, hdr); + if (rc < 0) + return rc; + /* + * FIXME: Add support for NOPOUT payload using unsolicited RDMA payload + */ + + return iscsit_process_nop_out(conn, cmd, hdr); +} + +static int +isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, + struct iscsi_text *hdr) +{ + struct iscsi_conn *conn = isert_conn->conn; + u32 payload_length = ntoh24(hdr->dlength); + int rc; + unsigned char *text_in; + + rc = iscsit_setup_text_cmd(conn, cmd, hdr); + if (rc < 0) + return rc; + + text_in = kzalloc(payload_length, GFP_KERNEL); + if (!text_in) { + pr_err("Unable to allocate text_in of payload_length: %u\n", + payload_length); + return -ENOMEM; + } + cmd->text_in_ptr = text_in; + + memcpy(cmd->text_in_ptr, &rx_desc->data[0], payload_length); + + return iscsit_process_text_cmd(conn, cmd, hdr); +} + +static int +isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, + uint32_t read_stag, uint64_t read_va, + uint32_t write_stag, uint64_t write_va) +{ + struct iscsi_hdr *hdr = &rx_desc->iscsi_header; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_session *sess = conn->sess; + struct iscsi_cmd *cmd; + struct isert_cmd *isert_cmd; + int ret = -EINVAL; + u8 opcode = (hdr->opcode & ISCSI_OPCODE_MASK); + + if (sess->sess_ops->SessionType && + (!(opcode & ISCSI_OP_TEXT) || !(opcode & ISCSI_OP_LOGOUT))) { + pr_err("Got illegal opcode: 0x%02x in SessionType=Discovery," + " ignoring\n", opcode); + return 0; + } + + switch (opcode) { + case ISCSI_OP_SCSI_CMD: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + isert_cmd->read_stag = read_stag; + isert_cmd->read_va = read_va; + isert_cmd->write_stag = write_stag; + isert_cmd->write_va = write_va; + + ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd, + rx_desc, (unsigned char *)hdr); + break; + case ISCSI_OP_NOOP_OUT: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + ret = isert_handle_nop_out(isert_conn, isert_cmd, cmd, + rx_desc, (unsigned char *)hdr); + break; + case ISCSI_OP_SCSI_DATA_OUT: + ret = isert_handle_iscsi_dataout(isert_conn, rx_desc, + (unsigned char *)hdr); + break; + case ISCSI_OP_SCSI_TMFUNC: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + ret = iscsit_handle_task_mgt_cmd(conn, cmd, + (unsigned char *)hdr); + break; + case ISCSI_OP_LOGOUT: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + ret = iscsit_handle_logout_cmd(conn, cmd, (unsigned char *)hdr); + if (ret > 0) + wait_for_completion_timeout(&conn->conn_logout_comp, + SECONDS_FOR_LOGOUT_COMP * + HZ); + break; + case ISCSI_OP_TEXT: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + ret = isert_handle_text_cmd(isert_conn, isert_cmd, cmd, + rx_desc, (struct iscsi_text *)hdr); + break; + default: + pr_err("Got unknown iSCSI OpCode: 0x%02x\n", opcode); + dump_stack(); + break; + } + + return ret; +} + +static void +isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn) +{ + struct iser_hdr *iser_hdr = &rx_desc->iser_header; + uint64_t read_va = 0, write_va = 0; + uint32_t read_stag = 0, write_stag = 0; + int rc; + + switch (iser_hdr->flags & 0xF0) { + case ISCSI_CTRL: + if (iser_hdr->flags & ISER_RSV) { + read_stag = be32_to_cpu(iser_hdr->read_stag); + read_va = be64_to_cpu(iser_hdr->read_va); + pr_debug("ISER_RSV: read_stag: 0x%08x read_va: 0x%16llx\n", + read_stag, (unsigned long long)read_va); + } + if (iser_hdr->flags & ISER_WSV) { + write_stag = be32_to_cpu(iser_hdr->write_stag); + write_va = be64_to_cpu(iser_hdr->write_va); + pr_debug("ISER_WSV: write__stag: 0x%08x write_va: 0x%16llx\n", + write_stag, (unsigned long long)write_va); + } + + pr_debug("ISER ISCSI_CTRL PDU\n"); + break; + case ISER_HELLO: + pr_err("iSER Hello message\n"); + break; + default: + pr_warn("Unknown iSER hdr flags: 0x%02x\n", iser_hdr->flags); + break; + } + + rc = isert_rx_opcode(isert_conn, rx_desc, + read_stag, read_va, write_stag, write_va); +} + +static void +isert_rx_completion(struct iser_rx_desc *desc, struct isert_conn *isert_conn, + unsigned long xfer_len) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iscsi_hdr *hdr; + u64 rx_dma; + int rx_buflen, outstanding; + + if ((char *)desc == isert_conn->login_req_buf) { + rx_dma = isert_conn->login_req_dma; + rx_buflen = ISER_RX_LOGIN_SIZE; + pr_debug("ISER login_buf: Using rx_dma: 0x%llx, rx_buflen: %d\n", + rx_dma, rx_buflen); + } else { + rx_dma = desc->dma_addr; + rx_buflen = ISER_RX_PAYLOAD_SIZE; + pr_debug("ISER req_buf: Using rx_dma: 0x%llx, rx_buflen: %d\n", + rx_dma, rx_buflen); + } + + ib_dma_sync_single_for_cpu(ib_dev, rx_dma, rx_buflen, DMA_FROM_DEVICE); + + hdr = &desc->iscsi_header; + pr_debug("iSCSI opcode: 0x%02x, ITT: 0x%08x, flags: 0x%02x dlen: %d\n", + hdr->opcode, hdr->itt, hdr->flags, + (int)(xfer_len - ISER_HEADERS_LEN)); + + if ((char *)desc == isert_conn->login_req_buf) + isert_rx_login_req(desc, xfer_len - ISER_HEADERS_LEN, + isert_conn); + else + isert_rx_do_work(desc, isert_conn); + + ib_dma_sync_single_for_device(ib_dev, rx_dma, rx_buflen, + DMA_FROM_DEVICE); + + isert_conn->post_recv_buf_count--; + pr_debug("iSERT: Decremented post_recv_buf_count: %d\n", + isert_conn->post_recv_buf_count); + + if ((char *)desc == isert_conn->login_req_buf) + return; + + outstanding = isert_conn->post_recv_buf_count; + if (outstanding + ISERT_MIN_POSTED_RX <= ISERT_QP_MAX_RECV_DTOS) { + int err, count = min(ISERT_QP_MAX_RECV_DTOS - outstanding, + ISERT_MIN_POSTED_RX); + err = isert_post_recv(isert_conn, count); + if (err) { + pr_err("isert_post_recv() count: %d failed, %d\n", + count, err); + } + } +} + +static int +isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct scatterlist *sg, u32 nents, u32 length, u32 offset, + enum iser_ib_op_code op, struct isert_data_buf *data) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + + data->dma_dir = op == ISER_IB_RDMA_WRITE ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + + data->len = length - offset; + data->offset = offset; + data->sg_off = data->offset / PAGE_SIZE; + + data->sg = &sg[data->sg_off]; + data->nents = min_t(unsigned int, nents - data->sg_off, + ISCSI_ISER_SG_TABLESIZE); + data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE * + PAGE_SIZE); + + data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents, + data->dma_dir); + if (unlikely(!data->dma_nents)) { + pr_err("Cmd: unable to dma map SGs %p\n", sg); + return -EINVAL; + } + + pr_debug("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n", + isert_cmd, data->dma_nents, data->sg, data->nents, data->len); + + return 0; +} + +static void +isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + + ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir); + memset(data, 0, sizeof(*data)); +} + + + +static void +isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + + pr_debug("isert_unmap_cmd: %p\n", isert_cmd); + + if (wr->data.sg) { + pr_debug("isert_unmap_cmd: %p unmap_sg op\n", isert_cmd); + isert_unmap_data_buf(isert_conn, &wr->data); + } + + if (wr->send_wr) { + pr_debug("isert_unmap_cmd: %p free send_wr\n", isert_cmd); + kfree(wr->send_wr); + wr->send_wr = NULL; + } + + if (wr->ib_sge) { + pr_debug("isert_unmap_cmd: %p free ib_sge\n", isert_cmd); + kfree(wr->ib_sge); + wr->ib_sge = NULL; + } +} + +static void +isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + LIST_HEAD(unmap_list); + + pr_debug("unreg_fastreg_cmd: %p\n", isert_cmd); + + if (wr->fr_desc) { + pr_debug("unreg_fastreg_cmd: %p free fr_desc %p\n", + isert_cmd, wr->fr_desc); + if (wr->fr_desc->ind & ISERT_PROTECTED) { + isert_unmap_data_buf(isert_conn, &wr->prot); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } + spin_lock_bh(&isert_conn->conn_lock); + list_add_tail(&wr->fr_desc->list, &isert_conn->conn_fr_pool); + spin_unlock_bh(&isert_conn->conn_lock); + wr->fr_desc = NULL; + } + + if (wr->data.sg) { + pr_debug("unreg_fastreg_cmd: %p unmap_sg op\n", isert_cmd); + isert_unmap_data_buf(isert_conn, &wr->data); + } + + wr->ib_sge = NULL; + wr->send_wr = NULL; +} + +static void +isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) +{ + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct iscsi_conn *conn = isert_conn->conn; + struct isert_device *device = isert_conn->conn_device; + + pr_debug("Entering isert_put_cmd: %p\n", isert_cmd); + + switch (cmd->iscsi_opcode) { + case ISCSI_OP_SCSI_CMD: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + if (cmd->data_direction == DMA_TO_DEVICE) { + iscsit_stop_dataout_timer(cmd); + /* + * Check for special case during comp_err where + * WRITE_PENDING has been handed off from core, + * but requires an extra target_put_sess_cmd() + * before transport_generic_free_cmd() below. + */ + if (comp_err && + cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) { + struct se_cmd *se_cmd = &cmd->se_cmd; + + target_put_sess_cmd(se_cmd->se_sess, se_cmd); + } + } + + device->unreg_rdma_mem(isert_cmd, isert_conn); + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + case ISCSI_OP_SCSI_TMFUNC: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + case ISCSI_OP_REJECT: + case ISCSI_OP_NOOP_OUT: + case ISCSI_OP_TEXT: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + /* + * Handle special case for REJECT when iscsi_add_reject*() has + * overwritten the original iscsi_opcode assignment, and the + * associated cmd->se_cmd needs to be released. + */ + if (cmd->se_cmd.se_tfo != NULL) { + pr_debug("Calling transport_generic_free_cmd from" + " isert_put_cmd for 0x%02x\n", + cmd->iscsi_opcode); + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + } + /* + * Fall-through + */ + default: + iscsit_release_cmd(cmd); + break; + } +} + +static void +isert_unmap_tx_desc(struct iser_tx_desc *tx_desc, struct ib_device *ib_dev) +{ + if (tx_desc->dma_addr != 0) { + pr_debug("Calling ib_dma_unmap_single for tx_desc->dma_addr\n"); + ib_dma_unmap_single(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + tx_desc->dma_addr = 0; + } +} + +static void +isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd, + struct ib_device *ib_dev, bool comp_err) +{ + if (isert_cmd->pdu_buf_dma != 0) { + pr_debug("Calling ib_dma_unmap_single for isert_cmd->pdu_buf_dma\n"); + ib_dma_unmap_single(ib_dev, isert_cmd->pdu_buf_dma, + isert_cmd->pdu_buf_len, DMA_TO_DEVICE); + isert_cmd->pdu_buf_dma = 0; + } + + isert_unmap_tx_desc(tx_desc, ib_dev); + isert_put_cmd(isert_cmd, comp_err); +} + +static int +isert_check_pi_status(struct se_cmd *se_cmd, struct ib_mr *sig_mr) +{ + struct ib_mr_status mr_status; + int ret; + + ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + goto fail_mr_status; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + u64 sec_offset_err; + u32 block_size = se_cmd->se_dev->dev_attrib.block_size + 8; + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + break; + case IB_SIG_BAD_REFTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + break; + case IB_SIG_BAD_APPTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + break; + } + sec_offset_err = mr_status.sig_err.sig_err_offset; + do_div(sec_offset_err, block_size); + se_cmd->bad_sector = sec_offset_err + se_cmd->t_task_lba; + + pr_err("isert: PI error found type %d at sector 0x%llx " + "expected 0x%x vs actual 0x%x\n", + mr_status.sig_err.err_type, + (unsigned long long)se_cmd->bad_sector, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + ret = 1; + } + +fail_mr_status: + return ret; +} + +static void +isert_completion_rdma_write(struct iser_tx_desc *tx_desc, + struct isert_cmd *isert_cmd) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct isert_device *device = isert_conn->conn_device; + int ret = 0; + + if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) { + ret = isert_check_pi_status(se_cmd, + wr->fr_desc->pi_ctx->sig_mr); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } + + device->unreg_rdma_mem(isert_cmd, isert_conn); + wr->send_wr_num = 0; + if (ret) + transport_send_check_condition_and_sense(se_cmd, + se_cmd->pi_err, 0); + else + isert_put_response(isert_conn->conn, cmd); +} + +static void +isert_completion_rdma_read(struct iser_tx_desc *tx_desc, + struct isert_cmd *isert_cmd) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct isert_device *device = isert_conn->conn_device; + int ret = 0; + + if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) { + ret = isert_check_pi_status(se_cmd, + wr->fr_desc->pi_ctx->sig_mr); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } + + iscsit_stop_dataout_timer(cmd); + device->unreg_rdma_mem(isert_cmd, isert_conn); + cmd->write_data_done = wr->data.len; + wr->send_wr_num = 0; + + pr_debug("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd); + spin_lock_bh(&cmd->istate_lock); + cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; + cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; + spin_unlock_bh(&cmd->istate_lock); + + if (ret) + transport_send_check_condition_and_sense(se_cmd, + se_cmd->pi_err, 0); + else + target_execute_cmd(se_cmd); +} + +static void +isert_do_control_comp(struct work_struct *work) +{ + struct isert_cmd *isert_cmd = container_of(work, + struct isert_cmd, comp_work); + struct isert_conn *isert_conn = isert_cmd->conn; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + + switch (cmd->i_state) { + case ISTATE_SEND_TASKMGTRSP: + pr_debug("Calling iscsit_tmr_post_handler >>>>>>>>>>>>>>>>>\n"); + + atomic_dec(&isert_conn->post_send_buf_count); + iscsit_tmr_post_handler(cmd, cmd->conn); + + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); + break; + case ISTATE_SEND_REJECT: + pr_debug("Got isert_do_control_comp ISTATE_SEND_REJECT: >>>\n"); + atomic_dec(&isert_conn->post_send_buf_count); + + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); + break; + case ISTATE_SEND_LOGOUTRSP: + pr_debug("Calling iscsit_logout_post_handler >>>>>>>>>>>>>>\n"); + + atomic_dec(&isert_conn->post_send_buf_count); + iscsit_logout_post_handler(cmd, cmd->conn); + break; + case ISTATE_SEND_TEXTRSP: + atomic_dec(&isert_conn->post_send_buf_count); + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); + break; + default: + pr_err("Unknown do_control_comp i_state %d\n", cmd->i_state); + dump_stack(); + break; + } +} + +static void +isert_response_completion(struct iser_tx_desc *tx_desc, + struct isert_cmd *isert_cmd, + struct isert_conn *isert_conn, + struct ib_device *ib_dev) +{ + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + + if (cmd->i_state == ISTATE_SEND_TASKMGTRSP || + cmd->i_state == ISTATE_SEND_LOGOUTRSP || + cmd->i_state == ISTATE_SEND_REJECT || + cmd->i_state == ISTATE_SEND_TEXTRSP) { + isert_unmap_tx_desc(tx_desc, ib_dev); + + INIT_WORK(&isert_cmd->comp_work, isert_do_control_comp); + queue_work(isert_comp_wq, &isert_cmd->comp_work); + return; + } + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + else + atomic_dec(&isert_conn->post_send_buf_count); + + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(tx_desc, isert_cmd, ib_dev, false); +} + +static void +__isert_send_completion(struct iser_tx_desc *tx_desc, + struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_cmd *isert_cmd = tx_desc->isert_cmd; + struct isert_rdma_wr *wr; + + if (!isert_cmd) { + atomic_dec(&isert_conn->post_send_buf_count); + isert_unmap_tx_desc(tx_desc, ib_dev); + return; + } + wr = &isert_cmd->rdma_wr; + + switch (wr->iser_ib_op) { + case ISER_IB_RECV: + pr_err("isert_send_completion: Got ISER_IB_RECV\n"); + dump_stack(); + break; + case ISER_IB_SEND: + pr_debug("isert_send_completion: Got ISER_IB_SEND\n"); + isert_response_completion(tx_desc, isert_cmd, + isert_conn, ib_dev); + break; + case ISER_IB_RDMA_WRITE: + pr_debug("isert_send_completion: Got ISER_IB_RDMA_WRITE\n"); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + isert_completion_rdma_write(tx_desc, isert_cmd); + break; + case ISER_IB_RDMA_READ: + pr_debug("isert_send_completion: Got ISER_IB_RDMA_READ:\n"); + + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + isert_completion_rdma_read(tx_desc, isert_cmd); + break; + default: + pr_err("Unknown wr->iser_ib_op: 0x%02x\n", wr->iser_ib_op); + dump_stack(); + break; + } +} + +static void +isert_send_completion(struct iser_tx_desc *tx_desc, + struct isert_conn *isert_conn) +{ + struct llist_node *llnode = tx_desc->comp_llnode_batch; + struct iser_tx_desc *t; + /* + * Drain coalesced completion llist starting from comp_llnode_batch + * setup in isert_init_send_wr(), and then complete trailing tx_desc. + */ + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + __isert_send_completion(t, isert_conn); + } + __isert_send_completion(tx_desc, isert_conn); +} + +static void +isert_cq_drain_comp_llist(struct isert_conn *isert_conn, struct ib_device *ib_dev) +{ + struct llist_node *llnode; + struct isert_rdma_wr *wr; + struct iser_tx_desc *t; + + mutex_lock(&isert_conn->conn_mutex); + llnode = llist_del_all(&isert_conn->conn_comp_llist); + isert_conn->conn_comp_batch = 0; + mutex_unlock(&isert_conn->conn_mutex); + + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + wr = &t->isert_cmd->rdma_wr; + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, + &isert_conn->post_send_buf_count); + else + atomic_dec(&isert_conn->post_send_buf_count); + + isert_completion_put(t, t->isert_cmd, ib_dev, true); + } +} + +static void +isert_cq_tx_comp_err(struct iser_tx_desc *tx_desc, struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_cmd *isert_cmd = tx_desc->isert_cmd; + struct llist_node *llnode = tx_desc->comp_llnode_batch; + struct isert_rdma_wr *wr; + struct iser_tx_desc *t; + + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + wr = &t->isert_cmd->rdma_wr; + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, + &isert_conn->post_send_buf_count); + else + atomic_dec(&isert_conn->post_send_buf_count); + + isert_completion_put(t, t->isert_cmd, ib_dev, true); + } + tx_desc->comp_llnode_batch = NULL; + + if (!isert_cmd) + isert_unmap_tx_desc(tx_desc, ib_dev); + else + isert_completion_put(tx_desc, isert_cmd, ib_dev, true); +} + +static void +isert_cq_rx_comp_err(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iscsi_conn *conn = isert_conn->conn; + + if (isert_conn->post_recv_buf_count) + return; + + isert_cq_drain_comp_llist(isert_conn, ib_dev); + + if (conn->sess) { + target_sess_cmd_list_set_waiting(conn->sess->se_sess); + target_wait_for_sess_cmds(conn->sess->se_sess); + } + + while (atomic_read(&isert_conn->post_send_buf_count)) + msleep(3000); + + mutex_lock(&isert_conn->conn_mutex); + isert_conn->state = ISER_CONN_DOWN; + mutex_unlock(&isert_conn->conn_mutex); + + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + + complete(&isert_conn->conn_wait_comp_err); +} + +static void +isert_cq_tx_work(struct work_struct *work) +{ + struct isert_cq_desc *cq_desc = container_of(work, + struct isert_cq_desc, cq_tx_work); + struct isert_device *device = cq_desc->device; + int cq_index = cq_desc->cq_index; + struct ib_cq *tx_cq = device->dev_tx_cq[cq_index]; + struct isert_conn *isert_conn; + struct iser_tx_desc *tx_desc; + struct ib_wc wc; + + while (ib_poll_cq(tx_cq, 1, &wc) == 1) { + tx_desc = (struct iser_tx_desc *)(unsigned long)wc.wr_id; + isert_conn = wc.qp->qp_context; + + if (wc.status == IB_WC_SUCCESS) { + isert_send_completion(tx_desc, isert_conn); + } else { + pr_debug("TX wc.status != IB_WC_SUCCESS >>>>>>>>>>>>>>\n"); + pr_debug("TX wc.status: 0x%08x\n", wc.status); + pr_debug("TX wc.vendor_err: 0x%08x\n", wc.vendor_err); + + if (wc.wr_id != ISER_FASTREG_LI_WRID) { + if (tx_desc->llnode_active) + continue; + + atomic_dec(&isert_conn->post_send_buf_count); + isert_cq_tx_comp_err(tx_desc, isert_conn); + } + } + } + + ib_req_notify_cq(tx_cq, IB_CQ_NEXT_COMP); +} + +static void +isert_cq_tx_callback(struct ib_cq *cq, void *context) +{ + struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; + + queue_work(isert_comp_wq, &cq_desc->cq_tx_work); +} + +static void +isert_cq_rx_work(struct work_struct *work) +{ + struct isert_cq_desc *cq_desc = container_of(work, + struct isert_cq_desc, cq_rx_work); + struct isert_device *device = cq_desc->device; + int cq_index = cq_desc->cq_index; + struct ib_cq *rx_cq = device->dev_rx_cq[cq_index]; + struct isert_conn *isert_conn; + struct iser_rx_desc *rx_desc; + struct ib_wc wc; + unsigned long xfer_len; + + while (ib_poll_cq(rx_cq, 1, &wc) == 1) { + rx_desc = (struct iser_rx_desc *)(unsigned long)wc.wr_id; + isert_conn = wc.qp->qp_context; + + if (wc.status == IB_WC_SUCCESS) { + xfer_len = (unsigned long)wc.byte_len; + isert_rx_completion(rx_desc, isert_conn, xfer_len); + } else { + pr_debug("RX wc.status != IB_WC_SUCCESS >>>>>>>>>>>>>>\n"); + if (wc.status != IB_WC_WR_FLUSH_ERR) { + pr_debug("RX wc.status: 0x%08x\n", wc.status); + pr_debug("RX wc.vendor_err: 0x%08x\n", + wc.vendor_err); + } + isert_conn->post_recv_buf_count--; + isert_cq_rx_comp_err(isert_conn); + } + } + + ib_req_notify_cq(rx_cq, IB_CQ_NEXT_COMP); +} + +static void +isert_cq_rx_callback(struct ib_cq *cq, void *context) +{ + struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; + + queue_work(isert_rx_wq, &cq_desc->cq_rx_work); +} + +static int +isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd) +{ + struct ib_send_wr *wr_failed; + int ret; + + atomic_inc(&isert_conn->post_send_buf_count); + + ret = ib_post_send(isert_conn->conn_qp, &isert_cmd->tx_desc.send_wr, + &wr_failed); + if (ret) { + pr_err("ib_post_send failed with %d\n", ret); + atomic_dec(&isert_conn->post_send_buf_count); + return ret; + } + return ret; +} + +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *) + &isert_cmd->tx_desc.iscsi_header; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_rsp_pdu(cmd, conn, true, hdr); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + /* + * Attach SENSE DATA payload to iSCSI Response PDU + */ + if (cmd->se_cmd.sense_buffer && + ((cmd->se_cmd.se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || + (cmd->se_cmd.se_cmd_flags & SCF_EMULATED_TASK_SENSE))) { + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + u32 padding, pdu_len; + + put_unaligned_be16(cmd->se_cmd.scsi_sense_length, + cmd->sense_buffer); + cmd->se_cmd.scsi_sense_length += sizeof(__be16); + + padding = -(cmd->se_cmd.scsi_sense_length) & 3; + hton24(hdr->dlength, (u32)cmd->se_cmd.scsi_sense_length); + pdu_len = cmd->se_cmd.scsi_sense_length + padding; + + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + (void *)cmd->sense_buffer, pdu_len, + DMA_TO_DEVICE); + + isert_cmd->pdu_buf_len = pdu_len; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = pdu_len; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + isert_cmd->tx_desc.num_sge = 2; + } + + isert_init_send_wr(isert_conn, isert_cmd, send_wr, true); + + pr_debug("Posting SCSI Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static void +isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + if (cmd->data_direction == DMA_TO_DEVICE) + iscsit_stop_dataout_timer(cmd); + + device->unreg_rdma_mem(isert_cmd, isert_conn); +} + +static enum target_prot_op +isert_get_sup_prot_ops(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + + if (device->pi_capable) + return TARGET_PROT_ALL; + + return TARGET_PROT_NORMAL; +} + +static int +isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn, + bool nopout_response) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_nopin_rsp(cmd, conn, (struct iscsi_nopin *) + &isert_cmd->tx_desc.iscsi_header, + nopout_response); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting NOPIN Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_logout_rsp(cmd, conn, (struct iscsi_logout_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Logout Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_task_mgt_rsp(cmd, conn, (struct iscsi_tm_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Task Management Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + struct iscsi_reject *hdr = + (struct iscsi_reject *)&isert_cmd->tx_desc.iscsi_header; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_reject(cmd, conn, hdr); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + + hton24(hdr->dlength, ISCSI_HDR_LEN); + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + (void *)cmd->buf_ptr, ISCSI_HDR_LEN, + DMA_TO_DEVICE); + isert_cmd->pdu_buf_len = ISCSI_HDR_LEN; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = ISCSI_HDR_LEN; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + isert_cmd->tx_desc.num_sge = 2; + + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Reject IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct iscsi_text_rsp *hdr = + (struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header; + u32 txt_rsp_len; + int rc; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + rc = iscsit_build_text_rsp(cmd, conn, hdr, ISCSI_INFINIBAND); + if (rc < 0) + return rc; + + txt_rsp_len = rc; + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + + if (txt_rsp_len) { + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + void *txt_rsp_buf = cmd->buf_ptr; + + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + txt_rsp_buf, txt_rsp_len, DMA_TO_DEVICE); + + isert_cmd->pdu_buf_len = txt_rsp_len; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = txt_rsp_len; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + isert_cmd->tx_desc.num_sge = 2; + } + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Text Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct ib_sge *ib_sge, struct ib_send_wr *send_wr, + u32 data_left, u32 offset) +{ + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct scatterlist *sg_start, *tmp_sg; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + u32 sg_off, page_off; + int i = 0, sg_nents; + + sg_off = offset / PAGE_SIZE; + sg_start = &cmd->se_cmd.t_data_sg[sg_off]; + sg_nents = min(cmd->se_cmd.t_data_nents - sg_off, isert_conn->max_sge); + page_off = offset % PAGE_SIZE; + + send_wr->sg_list = ib_sge; + send_wr->num_sge = sg_nents; + send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; + /* + * Perform mapping of TCM scatterlist memory ib_sge dma_addr. + */ + for_each_sg(sg_start, tmp_sg, sg_nents, i) { + pr_debug("ISER RDMA from SGL dma_addr: 0x%16llx dma_len: %u, page_off: %u\n", + (unsigned long long)tmp_sg->dma_address, + tmp_sg->length, page_off); + + ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off; + ib_sge->length = min_t(u32, data_left, + ib_sg_dma_len(ib_dev, tmp_sg) - page_off); + ib_sge->lkey = isert_conn->conn_mr->lkey; + + pr_debug("RDMA ib_sge: addr: 0x%16llx length: %u lkey: %08x\n", + ib_sge->addr, ib_sge->length, ib_sge->lkey); + page_off = 0; + data_left -= ib_sge->length; + ib_sge++; + pr_debug("Incrementing ib_sge pointer to %p\n", ib_sge); + } + + pr_debug("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n", + send_wr->sg_list, send_wr->num_sge); + + return sg_nents; +} + +static int +isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_data_buf *data = &wr->data; + struct ib_send_wr *send_wr; + struct ib_sge *ib_sge; + u32 offset, data_len, data_left, rdma_write_max, va_offset = 0; + int ret = 0, i, ib_sge_cnt; + + isert_cmd->tx_desc.isert_cmd = isert_cmd; + + offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0; + ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->data_length, + offset, wr->iser_ib_op, &wr->data); + if (ret) + return ret; + + data_left = data->len; + offset = data->offset; + + ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL); + if (!ib_sge) { + pr_warn("Unable to allocate ib_sge\n"); + ret = -ENOMEM; + goto unmap_cmd; + } + wr->ib_sge = ib_sge; + + wr->send_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge); + wr->send_wr = kzalloc(sizeof(struct ib_send_wr) * wr->send_wr_num, + GFP_KERNEL); + if (!wr->send_wr) { + pr_debug("Unable to allocate wr->send_wr\n"); + ret = -ENOMEM; + goto unmap_cmd; + } + + wr->isert_cmd = isert_cmd; + rdma_write_max = isert_conn->max_sge * PAGE_SIZE; + + for (i = 0; i < wr->send_wr_num; i++) { + send_wr = &isert_cmd->rdma_wr.send_wr[i]; + data_len = min(data_left, rdma_write_max); + + send_wr->send_flags = 0; + if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { + send_wr->opcode = IB_WR_RDMA_WRITE; + send_wr->wr.rdma.remote_addr = isert_cmd->read_va + offset; + send_wr->wr.rdma.rkey = isert_cmd->read_stag; + if (i + 1 == wr->send_wr_num) + send_wr->next = &isert_cmd->tx_desc.send_wr; + else + send_wr->next = &wr->send_wr[i + 1]; + } else { + send_wr->opcode = IB_WR_RDMA_READ; + send_wr->wr.rdma.remote_addr = isert_cmd->write_va + va_offset; + send_wr->wr.rdma.rkey = isert_cmd->write_stag; + if (i + 1 == wr->send_wr_num) + send_wr->send_flags = IB_SEND_SIGNALED; + else + send_wr->next = &wr->send_wr[i + 1]; + } + + ib_sge_cnt = isert_build_rdma_wr(isert_conn, isert_cmd, ib_sge, + send_wr, data_len, offset); + ib_sge += ib_sge_cnt; + + offset += data_len; + va_offset += data_len; + data_left -= data_len; + } + + return 0; +unmap_cmd: + isert_unmap_data_buf(isert_conn, data); + + return ret; +} + +static int +isert_map_fr_pagelist(struct ib_device *ib_dev, + struct scatterlist *sg_start, int sg_nents, u64 *fr_pl) +{ + u64 start_addr, end_addr, page, chunk_start = 0; + struct scatterlist *tmp_sg; + int i = 0, new_chunk, last_ent, n_pages; + + n_pages = 0; + new_chunk = 1; + last_ent = sg_nents - 1; + for_each_sg(sg_start, tmp_sg, sg_nents, i) { + start_addr = ib_sg_dma_address(ib_dev, tmp_sg); + if (new_chunk) + chunk_start = start_addr; + end_addr = start_addr + ib_sg_dma_len(ib_dev, tmp_sg); + + pr_debug("SGL[%d] dma_addr: 0x%16llx len: %u\n", + i, (unsigned long long)tmp_sg->dma_address, + tmp_sg->length); + + if ((end_addr & ~PAGE_MASK) && i < last_ent) { + new_chunk = 0; + continue; + } + new_chunk = 1; + + page = chunk_start & PAGE_MASK; + do { + fr_pl[n_pages++] = page; + pr_debug("Mapped page_list[%d] page_addr: 0x%16llx\n", + n_pages - 1, page); + page += PAGE_SIZE; + } while (page < end_addr); + } + + return n_pages; +} + +static int +isert_fast_reg_mr(struct isert_conn *isert_conn, + struct fast_reg_descriptor *fr_desc, + struct isert_data_buf *mem, + enum isert_indicator ind, + struct ib_sge *sge) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + struct ib_send_wr fr_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + int ret, pagelist_len; + u32 page_off; + u8 key; + + if (mem->dma_nents == 1) { + sge->lkey = isert_conn->conn_mr->lkey; + sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]); + sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]); + pr_debug("%s:%d sge: addr: 0x%llx length: %u lkey: %x\n", + __func__, __LINE__, sge->addr, sge->length, + sge->lkey); + return 0; + } + + if (ind == ISERT_DATA_KEY_VALID) { + /* Registering data buffer */ + mr = fr_desc->data_mr; + frpl = fr_desc->data_frpl; + } else { + /* Registering protection buffer */ + mr = fr_desc->pi_ctx->prot_mr; + frpl = fr_desc->pi_ctx->prot_frpl; + } + + page_off = mem->offset % PAGE_SIZE; + + pr_debug("Use fr_desc %p sg_nents %d offset %u\n", + fr_desc, mem->nents, mem->offset); + + pagelist_len = isert_map_fr_pagelist(ib_dev, mem->sg, mem->nents, + &frpl->page_list[0]); + + if (!(fr_desc->ind & ISERT_DATA_KEY_VALID)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.ex.invalidate_rkey = mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + } + + /* Prepare FASTREG WR */ + memset(&fr_wr, 0, sizeof(fr_wr)); + fr_wr.wr_id = ISER_FASTREG_LI_WRID; + fr_wr.opcode = IB_WR_FAST_REG_MR; + fr_wr.wr.fast_reg.iova_start = frpl->page_list[0] + page_off; + fr_wr.wr.fast_reg.page_list = frpl; + fr_wr.wr.fast_reg.page_list_len = pagelist_len; + fr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fr_wr.wr.fast_reg.length = mem->len; + fr_wr.wr.fast_reg.rkey = mr->rkey; + fr_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE; + + if (!wr) + wr = &fr_wr; + else + wr->next = &fr_wr; + + ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr); + if (ret) { + pr_err("fast registration failed, ret:%d\n", ret); + return ret; + } + fr_desc->ind &= ~ind; + + sge->lkey = mr->lkey; + sge->addr = frpl->page_list[0] + page_off; + sge->length = mem->len; + + pr_debug("%s:%d sge: addr: 0x%llx length: %u lkey: %x\n", + __func__, __LINE__, sge->addr, sge->length, + sge->lkey); + + return ret; +} + +static inline enum ib_t10_dif_type +se2ib_prot_type(enum target_prot_type prot_type) +{ + switch (prot_type) { + case TARGET_DIF_TYPE0_PROT: + return IB_T10DIF_NONE; + case TARGET_DIF_TYPE1_PROT: + return IB_T10DIF_TYPE1; + case TARGET_DIF_TYPE2_PROT: + return IB_T10DIF_TYPE2; + case TARGET_DIF_TYPE3_PROT: + return IB_T10DIF_TYPE3; + default: + return IB_T10DIF_NONE; + } +} + +static int +isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) +{ + enum ib_t10_dif_type ib_prot_type = se2ib_prot_type(se_cmd->prot_type); + + sig_attrs->mem.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->mem.sig.dif.pi_interval = + se_cmd->se_dev->dev_attrib.block_size; + sig_attrs->wire.sig.dif.pi_interval = + se_cmd->se_dev->dev_attrib.block_size; + + switch (se_cmd->prot_op) { + case TARGET_PROT_DIN_INSERT: + case TARGET_PROT_DOUT_STRIP: + sig_attrs->mem.sig.dif.type = IB_T10DIF_NONE; + sig_attrs->wire.sig.dif.type = ib_prot_type; + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = se_cmd->reftag_seed; + break; + case TARGET_PROT_DOUT_INSERT: + case TARGET_PROT_DIN_STRIP: + sig_attrs->mem.sig.dif.type = ib_prot_type; + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = se_cmd->reftag_seed; + sig_attrs->wire.sig.dif.type = IB_T10DIF_NONE; + break; + case TARGET_PROT_DIN_PASS: + case TARGET_PROT_DOUT_PASS: + sig_attrs->mem.sig.dif.type = ib_prot_type; + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = se_cmd->reftag_seed; + sig_attrs->wire.sig.dif.type = ib_prot_type; + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = se_cmd->reftag_seed; + break; + default: + pr_err("Unsupported PI operation %d\n", se_cmd->prot_op); + return -EINVAL; + } + + return 0; +} + +static inline u8 +isert_set_prot_checks(u8 prot_checks) +{ + return (prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | + (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | + (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0); +} + +static int +isert_reg_sig_mr(struct isert_conn *isert_conn, struct se_cmd *se_cmd, + struct fast_reg_descriptor *fr_desc, + struct ib_sge *data_sge, struct ib_sge *prot_sge, + struct ib_sge *sig_sge) +{ + struct ib_send_wr sig_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + struct pi_context *pi_ctx = fr_desc->pi_ctx; + struct ib_sig_attrs sig_attrs; + int ret; + u32 key; + + memset(&sig_attrs, 0, sizeof(sig_attrs)); + ret = isert_set_sig_attrs(se_cmd, &sig_attrs); + if (ret) + goto err; + + sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks); + + if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); + } + + memset(&sig_wr, 0, sizeof(sig_wr)); + sig_wr.opcode = IB_WR_REG_SIG_MR; + sig_wr.wr_id = ISER_FASTREG_LI_WRID; + sig_wr.sg_list = data_sge; + sig_wr.num_sge = 1; + sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE; + sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; + sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; + if (se_cmd->t_prot_sg) + sig_wr.wr.sig_handover.prot = prot_sge; + + if (!wr) + wr = &sig_wr; + else + wr->next = &sig_wr; + + ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr); + if (ret) { + pr_err("fast registration failed, ret:%d\n", ret); + goto err; + } + fr_desc->ind &= ~ISERT_SIG_KEY_VALID; + + sig_sge->lkey = pi_ctx->sig_mr->lkey; + sig_sge->addr = 0; + sig_sge->length = se_cmd->data_length; + if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP && + se_cmd->prot_op != TARGET_PROT_DOUT_INSERT) + /* + * We have protection guards on the wire + * so we need to set a larget transfer + */ + sig_sge->length += se_cmd->prot_length; + + pr_debug("sig_sge: addr: 0x%llx length: %u lkey: %x\n", + sig_sge->addr, sig_sge->length, + sig_sge->lkey); +err: + return ret; +} + +static int +isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_sge data_sge; + struct ib_send_wr *send_wr; + struct fast_reg_descriptor *fr_desc = NULL; + u32 offset; + int ret = 0; + unsigned long flags; + + isert_cmd->tx_desc.isert_cmd = isert_cmd; + + offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0; + ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->data_length, + offset, wr->iser_ib_op, &wr->data); + if (ret) + return ret; + + if (wr->data.dma_nents != 1 || + se_cmd->prot_op != TARGET_PROT_NORMAL) { + spin_lock_irqsave(&isert_conn->conn_lock, flags); + fr_desc = list_first_entry(&isert_conn->conn_fr_pool, + struct fast_reg_descriptor, list); + list_del(&fr_desc->list); + spin_unlock_irqrestore(&isert_conn->conn_lock, flags); + wr->fr_desc = fr_desc; + } + + ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->data, + ISERT_DATA_KEY_VALID, &data_sge); + if (ret) + goto unmap_cmd; + + if (se_cmd->prot_op != TARGET_PROT_NORMAL) { + struct ib_sge prot_sge, sig_sge; + + if (se_cmd->t_prot_sg) { + ret = isert_map_data_buf(isert_conn, isert_cmd, + se_cmd->t_prot_sg, + se_cmd->t_prot_nents, + se_cmd->prot_length, + 0, wr->iser_ib_op, &wr->prot); + if (ret) + goto unmap_cmd; + + ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->prot, + ISERT_PROT_KEY_VALID, &prot_sge); + if (ret) + goto unmap_prot_cmd; + } + + ret = isert_reg_sig_mr(isert_conn, se_cmd, fr_desc, + &data_sge, &prot_sge, &sig_sge); + if (ret) + goto unmap_prot_cmd; + + fr_desc->ind |= ISERT_PROTECTED; + memcpy(&wr->s_ib_sge, &sig_sge, sizeof(sig_sge)); + } else + memcpy(&wr->s_ib_sge, &data_sge, sizeof(data_sge)); + + wr->ib_sge = &wr->s_ib_sge; + wr->send_wr_num = 1; + memset(&wr->s_send_wr, 0, sizeof(*send_wr)); + wr->send_wr = &wr->s_send_wr; + wr->isert_cmd = isert_cmd; + + send_wr = &isert_cmd->rdma_wr.s_send_wr; + send_wr->sg_list = &wr->s_ib_sge; + send_wr->num_sge = 1; + send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; + if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { + send_wr->opcode = IB_WR_RDMA_WRITE; + send_wr->wr.rdma.remote_addr = isert_cmd->read_va; + send_wr->wr.rdma.rkey = isert_cmd->read_stag; + send_wr->send_flags = se_cmd->prot_op == TARGET_PROT_NORMAL ? + 0 : IB_SEND_SIGNALED; + } else { + send_wr->opcode = IB_WR_RDMA_READ; + send_wr->wr.rdma.remote_addr = isert_cmd->write_va; + send_wr->wr.rdma.rkey = isert_cmd->write_stag; + send_wr->send_flags = IB_SEND_SIGNALED; + } + + return 0; +unmap_prot_cmd: + if (se_cmd->t_prot_sg) + isert_unmap_data_buf(isert_conn, &wr->prot); +unmap_cmd: + if (fr_desc) { + spin_lock_irqsave(&isert_conn->conn_lock, flags); + list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); + spin_unlock_irqrestore(&isert_conn->conn_lock, flags); + } + isert_unmap_data_buf(isert_conn, &wr->data); + + return ret; +} + +static int +isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + struct ib_send_wr *wr_failed; + int rc; + + pr_debug("Cmd: %p RDMA_WRITE data_length: %u\n", + isert_cmd, se_cmd->data_length); + wr->iser_ib_op = ISER_IB_RDMA_WRITE; + rc = device->reg_rdma_mem(conn, cmd, wr); + if (rc) { + pr_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); + return rc; + } + + if (se_cmd->prot_op == TARGET_PROT_NORMAL) { + /* + * Build isert_conn->tx_desc for iSCSI response PDU and attach + */ + isert_create_send_desc(isert_conn, isert_cmd, + &isert_cmd->tx_desc); + iscsit_build_rsp_pdu(cmd, conn, true, (struct iscsi_scsi_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, + &isert_cmd->tx_desc.send_wr, true); + isert_cmd->rdma_wr.s_send_wr.next = &isert_cmd->tx_desc.send_wr; + wr->send_wr_num += 1; + } + + atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count); + + rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); + if (rc) { + pr_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n"); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + } + + if (se_cmd->prot_op == TARGET_PROT_NORMAL) + pr_debug("Cmd: %p posted RDMA_WRITE + Response for iSER Data " + "READ\n", isert_cmd); + else + pr_debug("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", + isert_cmd); + + return 1; +} + +static int +isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + struct ib_send_wr *wr_failed; + int rc; + + pr_debug("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n", + isert_cmd, se_cmd->data_length, cmd->write_data_done); + wr->iser_ib_op = ISER_IB_RDMA_READ; + rc = device->reg_rdma_mem(conn, cmd, wr); + if (rc) { + pr_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); + return rc; + } + + atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count); + + rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); + if (rc) { + pr_warn("ib_post_send() failed for IB_WR_RDMA_READ\n"); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + } + pr_debug("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n", + isert_cmd); + + return 0; +} + +static int +isert_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ + int ret; + + switch (state) { + case ISTATE_SEND_NOPIN_WANT_RESPONSE: + ret = isert_put_nopin(cmd, conn, false); + break; + default: + pr_err("Unknown immediate state: 0x%02x\n", state); + ret = -EINVAL; + break; + } + + return ret; +} + +static int +isert_response_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ + int ret; + + switch (state) { + case ISTATE_SEND_LOGOUTRSP: + ret = isert_put_logout_rsp(cmd, conn); + if (!ret) { + pr_debug("Returning iSER Logout -EAGAIN\n"); + ret = -EAGAIN; + } + break; + case ISTATE_SEND_NOPIN: + ret = isert_put_nopin(cmd, conn, true); + break; + case ISTATE_SEND_TASKMGTRSP: + ret = isert_put_tm_rsp(cmd, conn); + break; + case ISTATE_SEND_REJECT: + ret = isert_put_reject(cmd, conn); + break; + case ISTATE_SEND_TEXTRSP: + ret = isert_put_text_rsp(cmd, conn); + break; + case ISTATE_SEND_STATUS: + /* + * Special case for sending non GOOD SCSI status from TX thread + * context during pre se_cmd excecution failure. + */ + ret = isert_put_response(conn, cmd); + break; + default: + pr_err("Unknown response state: 0x%02x\n", state); + ret = -EINVAL; + break; + } + + return ret; +} + +static int +isert_setup_np(struct iscsi_np *np, + struct __kernel_sockaddr_storage *ksockaddr) +{ + struct isert_np *isert_np; + struct rdma_cm_id *isert_lid; + struct sockaddr *sa; + int ret; + + isert_np = kzalloc(sizeof(struct isert_np), GFP_KERNEL); + if (!isert_np) { + pr_err("Unable to allocate struct isert_np\n"); + return -ENOMEM; + } + sema_init(&isert_np->np_sem, 0); + mutex_init(&isert_np->np_accept_mutex); + INIT_LIST_HEAD(&isert_np->np_accept_list); + init_completion(&isert_np->np_login_comp); + + sa = (struct sockaddr *)ksockaddr; + pr_debug("ksockaddr: %p, sa: %p\n", ksockaddr, sa); + /* + * Setup the np->np_sockaddr from the passed sockaddr setup + * in iscsi_target_configfs.c code.. + */ + memcpy(&np->np_sockaddr, ksockaddr, + sizeof(struct __kernel_sockaddr_storage)); + + isert_lid = rdma_create_id(isert_cma_handler, np, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(isert_lid)) { + pr_err("rdma_create_id() for isert_listen_handler failed: %ld\n", + PTR_ERR(isert_lid)); + ret = PTR_ERR(isert_lid); + goto out; + } + + ret = rdma_bind_addr(isert_lid, sa); + if (ret) { + pr_err("rdma_bind_addr() for isert_lid failed: %d\n", ret); + goto out_lid; + } + + ret = rdma_listen(isert_lid, ISERT_RDMA_LISTEN_BACKLOG); + if (ret) { + pr_err("rdma_listen() for isert_lid failed: %d\n", ret); + goto out_lid; + } + + isert_np->np_cm_id = isert_lid; + np->np_context = isert_np; + pr_debug("Setup isert_lid->context: %p\n", isert_lid->context); + + return 0; + +out_lid: + rdma_destroy_id(isert_lid); +out: + kfree(isert_np); + return ret; +} + +static int +isert_rdma_accept(struct isert_conn *isert_conn) +{ + struct rdma_cm_id *cm_id = isert_conn->conn_cm_id; + struct rdma_conn_param cp; + int ret; + + memset(&cp, 0, sizeof(struct rdma_conn_param)); + cp.responder_resources = isert_conn->responder_resources; + cp.initiator_depth = isert_conn->initiator_depth; + cp.retry_count = 7; + cp.rnr_retry_count = 7; + + pr_debug("Before rdma_accept >>>>>>>>>>>>>>>>>>>>.\n"); + + ret = rdma_accept(cm_id, &cp); + if (ret) { + pr_err("rdma_accept() failed with: %d\n", ret); + return ret; + } + + pr_debug("After rdma_accept >>>>>>>>>>>>>>>>>>>>>.\n"); + + return 0; +} + +static int +isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login) +{ + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + int ret; + + pr_debug("isert_get_login_rx before conn_login_comp conn: %p\n", conn); + /* + * For login requests after the first PDU, isert_rx_login_req() will + * kick schedule_delayed_work(&conn->login_work) as the packet is + * received, which turns this callback from iscsi_target_do_login_rx() + * into a NOP. + */ + if (!login->first_request) + return 0; + + ret = wait_for_completion_interruptible(&isert_conn->conn_login_comp); + if (ret) + return ret; + + pr_debug("isert_get_login_rx processing login->req: %p\n", login->req); + return 0; +} + +static void +isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn, + struct isert_conn *isert_conn) +{ + struct rdma_cm_id *cm_id = isert_conn->conn_cm_id; + struct rdma_route *cm_route = &cm_id->route; + struct sockaddr_in *sock_in; + struct sockaddr_in6 *sock_in6; + + conn->login_family = np->np_sockaddr.ss_family; + + if (np->np_sockaddr.ss_family == AF_INET6) { + sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.dst_addr; + snprintf(conn->login_ip, sizeof(conn->login_ip), "%pI6c", + &sock_in6->sin6_addr.in6_u); + conn->login_port = ntohs(sock_in6->sin6_port); + + sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.src_addr; + snprintf(conn->local_ip, sizeof(conn->local_ip), "%pI6c", + &sock_in6->sin6_addr.in6_u); + conn->local_port = ntohs(sock_in6->sin6_port); + } else { + sock_in = (struct sockaddr_in *)&cm_route->addr.dst_addr; + sprintf(conn->login_ip, "%pI4", + &sock_in->sin_addr.s_addr); + conn->login_port = ntohs(sock_in->sin_port); + + sock_in = (struct sockaddr_in *)&cm_route->addr.src_addr; + sprintf(conn->local_ip, "%pI4", + &sock_in->sin_addr.s_addr); + conn->local_port = ntohs(sock_in->sin_port); + } +} + +static int +isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) +{ + struct isert_np *isert_np = (struct isert_np *)np->np_context; + struct isert_conn *isert_conn; + int max_accept = 0, ret; + +accept_wait: + ret = down_interruptible(&isert_np->np_sem); + if (max_accept > 5) + return -ENODEV; + + spin_lock_bh(&np->np_thread_lock); + if (np->np_thread_state >= ISCSI_NP_THREAD_RESET) { + spin_unlock_bh(&np->np_thread_lock); + pr_debug("np_thread_state %d for isert_accept_np\n", + np->np_thread_state); + /** + * No point in stalling here when np_thread + * is in state RESET/SHUTDOWN/EXIT - bail + **/ + return -ENODEV; + } + spin_unlock_bh(&np->np_thread_lock); + + mutex_lock(&isert_np->np_accept_mutex); + if (list_empty(&isert_np->np_accept_list)) { + mutex_unlock(&isert_np->np_accept_mutex); + max_accept++; + goto accept_wait; + } + isert_conn = list_first_entry(&isert_np->np_accept_list, + struct isert_conn, conn_accept_node); + list_del_init(&isert_conn->conn_accept_node); + mutex_unlock(&isert_np->np_accept_mutex); + + conn->context = isert_conn; + isert_conn->conn = conn; + max_accept = 0; + + ret = isert_rdma_post_recvl(isert_conn); + if (ret) + return ret; + + ret = isert_rdma_accept(isert_conn); + if (ret) + return ret; + + isert_set_conn_info(np, conn, isert_conn); + + pr_debug("Processing isert_accept_np: isert_conn: %p\n", isert_conn); + return 0; +} + +static void +isert_free_np(struct iscsi_np *np) +{ + struct isert_np *isert_np = (struct isert_np *)np->np_context; + + rdma_destroy_id(isert_np->np_cm_id); + + np->np_context = NULL; + kfree(isert_np); +} + +static void isert_wait_conn(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + + pr_debug("isert_wait_conn: Starting \n"); + + mutex_lock(&isert_conn->conn_mutex); + if (isert_conn->conn_cm_id) { + pr_debug("Calling rdma_disconnect from isert_wait_conn\n"); + rdma_disconnect(isert_conn->conn_cm_id); + } + /* + * Only wait for conn_wait_comp_err if the isert_conn made it + * into full feature phase.. + */ + if (isert_conn->state == ISER_CONN_INIT) { + mutex_unlock(&isert_conn->conn_mutex); + return; + } + if (isert_conn->state == ISER_CONN_UP) + isert_conn->state = ISER_CONN_TERMINATING; + mutex_unlock(&isert_conn->conn_mutex); + + wait_for_completion(&isert_conn->conn_wait_comp_err); + + wait_for_completion(&isert_conn->conn_wait); +} + +static void isert_free_conn(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + + isert_put_conn(isert_conn); +} + +static struct iscsit_transport iser_target_transport = { + .name = "IB/iSER", + .transport_type = ISCSI_INFINIBAND, + .priv_size = sizeof(struct isert_cmd), + .owner = THIS_MODULE, + .iscsit_setup_np = isert_setup_np, + .iscsit_accept_np = isert_accept_np, + .iscsit_free_np = isert_free_np, + .iscsit_wait_conn = isert_wait_conn, + .iscsit_free_conn = isert_free_conn, + .iscsit_get_login_rx = isert_get_login_rx, + .iscsit_put_login_tx = isert_put_login_tx, + .iscsit_immediate_queue = isert_immediate_queue, + .iscsit_response_queue = isert_response_queue, + .iscsit_get_dataout = isert_get_dataout, + .iscsit_queue_data_in = isert_put_datain, + .iscsit_queue_status = isert_put_response, + .iscsit_aborted_task = isert_aborted_task, + .iscsit_get_sup_prot_ops = isert_get_sup_prot_ops, +}; + +static int __init isert_init(void) +{ + int ret; + + isert_rx_wq = alloc_workqueue("isert_rx_wq", 0, 0); + if (!isert_rx_wq) { + pr_err("Unable to allocate isert_rx_wq\n"); + return -ENOMEM; + } + + isert_comp_wq = alloc_workqueue("isert_comp_wq", 0, 0); + if (!isert_comp_wq) { + pr_err("Unable to allocate isert_comp_wq\n"); + ret = -ENOMEM; + goto destroy_rx_wq; + } + + iscsit_register_transport(&iser_target_transport); + pr_debug("iSER_TARGET[0] - Loaded iser_target_transport\n"); + return 0; + +destroy_rx_wq: + destroy_workqueue(isert_rx_wq); + return ret; +} + +static void __exit isert_exit(void) +{ + flush_scheduled_work(); + destroy_workqueue(isert_comp_wq); + destroy_workqueue(isert_rx_wq); + iscsit_unregister_transport(&iser_target_transport); + pr_debug("iSER_TARGET[0] - Released iser_target_transport\n"); +} + +MODULE_DESCRIPTION("iSER-Target for mainline target infrastructure"); +MODULE_VERSION("0.1"); +MODULE_AUTHOR("nab@Linux-iSCSI.org"); +MODULE_LICENSE("GPL"); + +module_init(isert_init); +module_exit(isert_exit); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h new file mode 100644 index 00000000000..04f51f7bf61 --- /dev/null +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -0,0 +1,190 @@ +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> + +#define ISERT_RDMA_LISTEN_BACKLOG 10 +#define ISCSI_ISER_SG_TABLESIZE 256 +#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL + +enum isert_desc_type { + ISCSI_TX_CONTROL, + ISCSI_TX_DATAIN +}; + +enum iser_ib_op_code { + ISER_IB_RECV, + ISER_IB_SEND, + ISER_IB_RDMA_WRITE, + ISER_IB_RDMA_READ, +}; + +enum iser_conn_state { + ISER_CONN_INIT, + ISER_CONN_UP, + ISER_CONN_TERMINATING, + ISER_CONN_DOWN, +}; + +struct iser_rx_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + char data[ISER_RECV_DATA_SEG_LEN]; + u64 dma_addr; + struct ib_sge rx_sg; + char pad[ISER_RX_PAD_SIZE]; +} __packed; + +struct iser_tx_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + enum isert_desc_type type; + u64 dma_addr; + struct ib_sge tx_sg[2]; + int num_sge; + struct isert_cmd *isert_cmd; + struct llist_node *comp_llnode_batch; + struct llist_node comp_llnode; + bool llnode_active; + struct ib_send_wr send_wr; +} __packed; + +enum isert_indicator { + ISERT_PROTECTED = 1 << 0, + ISERT_DATA_KEY_VALID = 1 << 1, + ISERT_PROT_KEY_VALID = 1 << 2, + ISERT_SIG_KEY_VALID = 1 << 3, +}; + +struct pi_context { + struct ib_mr *prot_mr; + struct ib_fast_reg_page_list *prot_frpl; + struct ib_mr *sig_mr; +}; + +struct fast_reg_descriptor { + struct list_head list; + struct ib_mr *data_mr; + struct ib_fast_reg_page_list *data_frpl; + u8 ind; + struct pi_context *pi_ctx; +}; + +struct isert_data_buf { + struct scatterlist *sg; + int nents; + u32 sg_off; + u32 len; /* cur_rdma_length */ + u32 offset; + unsigned int dma_nents; + enum dma_data_direction dma_dir; +}; + +struct isert_rdma_wr { + struct list_head wr_list; + struct isert_cmd *isert_cmd; + enum iser_ib_op_code iser_ib_op; + struct ib_sge *ib_sge; + struct ib_sge s_ib_sge; + int send_wr_num; + struct ib_send_wr *send_wr; + struct ib_send_wr s_send_wr; + struct isert_data_buf data; + struct isert_data_buf prot; + struct fast_reg_descriptor *fr_desc; +}; + +struct isert_cmd { + uint32_t read_stag; + uint32_t write_stag; + uint64_t read_va; + uint64_t write_va; + u64 pdu_buf_dma; + u32 pdu_buf_len; + u32 read_va_off; + u32 write_va_off; + u32 rdma_wr_num; + struct isert_conn *conn; + struct iscsi_cmd *iscsi_cmd; + struct iser_tx_desc tx_desc; + struct isert_rdma_wr rdma_wr; + struct work_struct comp_work; +}; + +struct isert_device; + +struct isert_conn { + enum iser_conn_state state; + int post_recv_buf_count; + atomic_t post_send_buf_count; + u32 responder_resources; + u32 initiator_depth; + u32 max_sge; + char *login_buf; + char *login_req_buf; + char *login_rsp_buf; + u64 login_req_dma; + u64 login_rsp_dma; + unsigned int conn_rx_desc_head; + struct iser_rx_desc *conn_rx_descs; + struct ib_recv_wr conn_rx_wr[ISERT_MIN_POSTED_RX]; + struct iscsi_conn *conn; + struct list_head conn_accept_node; + struct completion conn_login_comp; + struct iser_tx_desc conn_login_tx_desc; + struct rdma_cm_id *conn_cm_id; + struct ib_pd *conn_pd; + struct ib_mr *conn_mr; + struct ib_qp *conn_qp; + struct isert_device *conn_device; + struct work_struct conn_logout_work; + struct mutex conn_mutex; + struct completion conn_wait; + struct completion conn_wait_comp_err; + struct kref conn_kref; + struct list_head conn_fr_pool; + int conn_fr_pool_size; + /* lock to protect fastreg pool */ + spinlock_t conn_lock; +#define ISERT_COMP_BATCH_COUNT 8 + int conn_comp_batch; + struct llist_head conn_comp_llist; + bool disconnect; +}; + +#define ISERT_MAX_CQ 64 + +struct isert_cq_desc { + struct isert_device *device; + int cq_index; + struct work_struct cq_rx_work; + struct work_struct cq_tx_work; +}; + +struct isert_device { + int use_fastreg; + bool pi_capable; + int cqs_used; + int refcount; + int cq_active_qps[ISERT_MAX_CQ]; + struct ib_device *ib_device; + struct ib_cq *dev_rx_cq[ISERT_MAX_CQ]; + struct ib_cq *dev_tx_cq[ISERT_MAX_CQ]; + struct isert_cq_desc *cq_desc; + struct list_head dev_node; + struct ib_device_attr dev_attr; + int (*reg_rdma_mem)(struct iscsi_conn *conn, + struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr); + void (*unreg_rdma_mem)(struct isert_cmd *isert_cmd, + struct isert_conn *isert_conn); +}; + +struct isert_np { + struct semaphore np_sem; + struct rdma_cm_id *np_cm_id; + struct mutex np_accept_mutex; + struct list_head np_accept_list; + struct completion np_login_comp; +}; diff --git a/drivers/infiniband/ulp/isert/isert_proto.h b/drivers/infiniband/ulp/isert/isert_proto.h new file mode 100644 index 00000000000..4dccd313b77 --- /dev/null +++ b/drivers/infiniband/ulp/isert/isert_proto.h @@ -0,0 +1,47 @@ +/* From iscsi_iser.h */ + +struct iser_hdr { + u8 flags; + u8 rsvd[3]; + __be32 write_stag; /* write rkey */ + __be64 write_va; + __be32 read_stag; /* read rkey */ + __be64 read_va; +} __packed; + +/*Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) + +#define ISER_RECV_DATA_SEG_LEN 8192 +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) +#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISERT_MAX_RX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * + * SCSI_TMFUNC(2), LOGOUT(1) */ + +#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */ + +#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) + +#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) + +#define ISERT_INFLIGHT_DATAOUTS 8 + +#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \ + (1 + ISERT_INFLIGHT_DATAOUTS) + \ + ISERT_MAX_TX_MISC_PDUS + \ + ISERT_MAX_RX_MISC_PDUS) + +#define ISER_RX_PAD_SIZE (ISER_RECV_DATA_SEG_LEN + 4096 - \ + (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge))) + +#define ISER_VER 0x10 +#define ISER_WSV 0x08 +#define ISER_RSV 0x04 +#define ISCSI_CTRL 0x10 +#define ISER_HELLO 0x20 +#define ISER_HELLORPLY 0x30 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 922d845f76b..e3c2c5b4297 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -30,7 +30,7 @@ * SOFTWARE. */ -#define pr_fmt(fmt) PFX fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/init.h> @@ -46,6 +46,7 @@ #include <scsi/scsi.h> #include <scsi/scsi_device.h> #include <scsi/scsi_dbg.h> +#include <scsi/scsi_tcq.h> #include <scsi/srp.h> #include <scsi/scsi_transport_srp.h> @@ -53,8 +54,8 @@ #define DRV_NAME "ib_srp" #define PFX DRV_NAME ": " -#define DRV_VERSION "0.2" -#define DRV_RELDATE "November 1, 2005" +#define DRV_VERSION "1.0" +#define DRV_RELDATE "July 1, 2013" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator " @@ -65,6 +66,8 @@ static unsigned int srp_sg_tablesize; static unsigned int cmd_sg_entries; static unsigned int indirect_sg_entries; static bool allow_ext_sg; +static bool prefer_fr; +static bool register_always; static int topspin_workarounds = 1; module_param(srp_sg_tablesize, uint, 0444); @@ -86,6 +89,40 @@ module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); +module_param(prefer_fr, bool, 0444); +MODULE_PARM_DESC(prefer_fr, +"Whether to use fast registration if both FMR and fast registration are supported"); + +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + +static struct kernel_param_ops srp_tmo_ops; + +static int srp_reconnect_delay = 10; +module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts"); + +static int srp_fast_io_fail_tmo = 15; +module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(fast_io_fail_tmo, + "Number of seconds between the observation of a transport" + " layer error and failing all I/O. \"off\" means that this" + " functionality is disabled."); + +static int srp_dev_loss_tmo = 600; +module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dev_loss_tmo, + "Maximum number of seconds that the SRP transport should" + " insulate transport layer errors. After this time has been" + " exceeded the SCSI host is removed. Should be" + " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + " if fast_io_fail_tmo has not been set. \"off\" means that" + " this functionality is disabled."); + static void srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device); static void srp_recv_completion(struct ib_cq *cq, void *target_ptr); @@ -102,6 +139,48 @@ static struct ib_client srp_client = { static struct ib_sa_client srp_sa_client; +static int srp_tmo_get(char *buffer, const struct kernel_param *kp) +{ + int tmo = *(int *)kp->arg; + + if (tmo >= 0) + return sprintf(buffer, "%d", tmo); + else + return sprintf(buffer, "off"); +} + +static int srp_tmo_set(const char *val, const struct kernel_param *kp) +{ + int tmo, res; + + if (strncmp(val, "off", 3) != 0) { + res = kstrtoint(val, 0, &tmo); + if (res) + goto out; + } else { + tmo = -1; + } + if (kp->arg == &srp_reconnect_delay) + res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo, + srp_dev_loss_tmo); + else if (kp->arg == &srp_fast_io_fail_tmo) + res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo); + else + res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo, + tmo); + if (res) + goto out; + *(int *)kp->arg = tmo; + +out: + return res; +} + +static struct kernel_param_ops srp_tmo_ops = { + .get = srp_tmo_get, + .set = srp_tmo_set, +}; + static inline struct srp_target_port *host_to_target(struct Scsi_Host *host) { return (struct srp_target_port *) host->hostdata; @@ -219,80 +298,288 @@ static int srp_new_cm_id(struct srp_target_port *target) return 0; } +static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_fmr_pool_param fmr_param; + + memset(&fmr_param, 0, sizeof(fmr_param)); + fmr_param.pool_size = target->scsi_host->can_queue; + fmr_param.dirty_watermark = fmr_param.pool_size / 4; + fmr_param.cache = 1; + fmr_param.max_pages_per_fmr = dev->max_pages_per_mr; + fmr_param.page_shift = ilog2(dev->mr_page_size); + fmr_param.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + return ib_create_fmr_pool(dev->pd, &fmr_param); +} + +/** + * srp_destroy_fr_pool() - free the resources owned by a pool + * @pool: Fast registration pool to be destroyed. + */ +static void srp_destroy_fr_pool(struct srp_fr_pool *pool) +{ + int i; + struct srp_fr_desc *d; + + if (!pool) + return; + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + if (d->frpl) + ib_free_fast_reg_page_list(d->frpl); + if (d->mr) + ib_dereg_mr(d->mr); + } + kfree(pool); +} + +/** + * srp_create_fr_pool() - allocate and initialize a pool for fast registration + * @device: IB device to allocate fast registration descriptors for. + * @pd: Protection domain associated with the FR descriptors. + * @pool_size: Number of descriptors to allocate. + * @max_page_list_len: Maximum fast registration work request page list length. + */ +static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, + struct ib_pd *pd, int pool_size, + int max_page_list_len) +{ + struct srp_fr_pool *pool; + struct srp_fr_desc *d; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + int i, ret = -EINVAL; + + if (pool_size <= 0) + goto err; + ret = -ENOMEM; + pool = kzalloc(sizeof(struct srp_fr_pool) + + pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL); + if (!pool) + goto err; + pool->size = pool_size; + pool->max_page_list_len = max_page_list_len; + spin_lock_init(&pool->lock); + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + mr = ib_alloc_fast_reg_mr(pd, max_page_list_len); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto destroy_pool; + } + d->mr = mr; + frpl = ib_alloc_fast_reg_page_list(device, max_page_list_len); + if (IS_ERR(frpl)) { + ret = PTR_ERR(frpl); + goto destroy_pool; + } + d->frpl = frpl; + list_add_tail(&d->entry, &pool->free_list); + } + +out: + return pool; + +destroy_pool: + srp_destroy_fr_pool(pool); + +err: + pool = ERR_PTR(ret); + goto out; +} + +/** + * srp_fr_pool_get() - obtain a descriptor suitable for fast registration + * @pool: Pool to obtain descriptor from. + */ +static struct srp_fr_desc *srp_fr_pool_get(struct srp_fr_pool *pool) +{ + struct srp_fr_desc *d = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + if (!list_empty(&pool->free_list)) { + d = list_first_entry(&pool->free_list, typeof(*d), entry); + list_del(&d->entry); + } + spin_unlock_irqrestore(&pool->lock, flags); + + return d; +} + +/** + * srp_fr_pool_put() - put an FR descriptor back in the free list + * @pool: Pool the descriptor was allocated from. + * @desc: Pointer to an array of fast registration descriptor pointers. + * @n: Number of descriptors to put back. + * + * Note: The caller must already have queued an invalidation request for + * desc->mr->rkey before calling this function. + */ +static void srp_fr_pool_put(struct srp_fr_pool *pool, struct srp_fr_desc **desc, + int n) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&pool->lock, flags); + for (i = 0; i < n; i++) + list_add(&desc[i]->entry, &pool->free_list); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + + return srp_create_fr_pool(dev->dev, dev->pd, + target->scsi_host->can_queue, + dev->max_pages_per_mr); +} + static int srp_create_target_ib(struct srp_target_port *target) { + struct srp_device *dev = target->srp_host->srp_dev; struct ib_qp_init_attr *init_attr; + struct ib_cq *recv_cq, *send_cq; + struct ib_qp *qp; + struct ib_fmr_pool *fmr_pool = NULL; + struct srp_fr_pool *fr_pool = NULL; + const int m = 1 + dev->use_fast_reg; int ret; init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); if (!init_attr) return -ENOMEM; - target->recv_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_recv_completion, NULL, target, SRP_RQ_SIZE, 0); - if (IS_ERR(target->recv_cq)) { - ret = PTR_ERR(target->recv_cq); + recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, target, + target->queue_size, target->comp_vector); + if (IS_ERR(recv_cq)) { + ret = PTR_ERR(recv_cq); goto err; } - target->send_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_send_completion, NULL, target, SRP_SQ_SIZE, 0); - if (IS_ERR(target->send_cq)) { - ret = PTR_ERR(target->send_cq); + send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, target, + m * target->queue_size, target->comp_vector); + if (IS_ERR(send_cq)) { + ret = PTR_ERR(send_cq); goto err_recv_cq; } - ib_req_notify_cq(target->recv_cq, IB_CQ_NEXT_COMP); + ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP); init_attr->event_handler = srp_qp_event; - init_attr->cap.max_send_wr = SRP_SQ_SIZE; - init_attr->cap.max_recv_wr = SRP_RQ_SIZE; + init_attr->cap.max_send_wr = m * target->queue_size; + init_attr->cap.max_recv_wr = target->queue_size; init_attr->cap.max_recv_sge = 1; init_attr->cap.max_send_sge = 1; - init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; init_attr->qp_type = IB_QPT_RC; - init_attr->send_cq = target->send_cq; - init_attr->recv_cq = target->recv_cq; + init_attr->send_cq = send_cq; + init_attr->recv_cq = recv_cq; - target->qp = ib_create_qp(target->srp_host->srp_dev->pd, init_attr); - if (IS_ERR(target->qp)) { - ret = PTR_ERR(target->qp); + qp = ib_create_qp(dev->pd, init_attr); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto err_send_cq; } - ret = srp_init_qp(target, target->qp); + ret = srp_init_qp(target, qp); if (ret) goto err_qp; + if (dev->use_fast_reg && dev->has_fr) { + fr_pool = srp_alloc_fr_pool(target); + if (IS_ERR(fr_pool)) { + ret = PTR_ERR(fr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FR pool allocation failed (%d)\n", ret); + goto err_qp; + } + if (target->fr_pool) + srp_destroy_fr_pool(target->fr_pool); + target->fr_pool = fr_pool; + } else if (!dev->use_fast_reg && dev->has_fmr) { + fmr_pool = srp_alloc_fmr_pool(target); + if (IS_ERR(fmr_pool)) { + ret = PTR_ERR(fmr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FMR pool allocation failed (%d)\n", ret); + goto err_qp; + } + if (target->fmr_pool) + ib_destroy_fmr_pool(target->fmr_pool); + target->fmr_pool = fmr_pool; + } + + if (target->qp) + ib_destroy_qp(target->qp); + if (target->recv_cq) + ib_destroy_cq(target->recv_cq); + if (target->send_cq) + ib_destroy_cq(target->send_cq); + + target->qp = qp; + target->recv_cq = recv_cq; + target->send_cq = send_cq; + kfree(init_attr); return 0; err_qp: - ib_destroy_qp(target->qp); + ib_destroy_qp(qp); err_send_cq: - ib_destroy_cq(target->send_cq); + ib_destroy_cq(send_cq); err_recv_cq: - ib_destroy_cq(target->recv_cq); + ib_destroy_cq(recv_cq); err: kfree(init_attr); return ret; } +/* + * Note: this function may be called without srp_alloc_iu_bufs() having been + * invoked. Hence the target->[rt]x_ring checks. + */ static void srp_free_target_ib(struct srp_target_port *target) { + struct srp_device *dev = target->srp_host->srp_dev; int i; + if (dev->use_fast_reg) { + if (target->fr_pool) + srp_destroy_fr_pool(target->fr_pool); + } else { + if (target->fmr_pool) + ib_destroy_fmr_pool(target->fmr_pool); + } ib_destroy_qp(target->qp); ib_destroy_cq(target->send_cq); ib_destroy_cq(target->recv_cq); - for (i = 0; i < SRP_RQ_SIZE; ++i) - srp_free_iu(target->srp_host, target->rx_ring[i]); - for (i = 0; i < SRP_SQ_SIZE; ++i) - srp_free_iu(target->srp_host, target->tx_ring[i]); + target->qp = NULL; + target->send_cq = target->recv_cq = NULL; + + if (target->rx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->rx_ring[i]); + kfree(target->rx_ring); + target->rx_ring = NULL; + } + if (target->tx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->tx_ring[i]); + kfree(target->tx_ring); + target->tx_ring = NULL; + } } static void srp_path_rec_completion(int status, @@ -312,6 +599,8 @@ static void srp_path_rec_completion(int status, static int srp_lookup_path(struct srp_target_port *target) { + int ret; + target->path.numb_path = 1; init_completion(&target->done); @@ -332,7 +621,9 @@ static int srp_lookup_path(struct srp_target_port *target) if (target->path_query_id < 0) return target->path_query_id; - wait_for_completion(&target->done); + ret = wait_for_completion_interruptible(&target->done); + if (ret < 0) + return ret; if (target->status < 0) shost_printk(KERN_WARNING, target->scsi_host, @@ -372,7 +663,7 @@ static int srp_send_req(struct srp_target_port *target) req->param.responder_resources = 4; req->param.remote_cm_response_timeout = 20; req->param.local_cm_response_timeout = 20; - req->param.retry_count = 7; + req->param.retry_count = target->tl_retry_count; req->param.rnr_retry_count = 7; req->param.max_cm_retries = 15; @@ -428,42 +719,66 @@ static int srp_send_req(struct srp_target_port *target) return status; } -static void srp_disconnect_target(struct srp_target_port *target) +static bool srp_queue_remove_work(struct srp_target_port *target) { - /* XXX should send SRP_I_LOGOUT request */ + bool changed = false; - init_completion(&target->done); - if (ib_send_cm_dreq(target->cm_id, NULL, 0)) { - shost_printk(KERN_DEBUG, target->scsi_host, - PFX "Sending CM DREQ failed\n"); - return; + spin_lock_irq(&target->lock); + if (target->state != SRP_TARGET_REMOVED) { + target->state = SRP_TARGET_REMOVED; + changed = true; } - wait_for_completion(&target->done); + spin_unlock_irq(&target->lock); + + if (changed) + queue_work(system_long_wq, &target->remove_work); + + return changed; } -static bool srp_change_state(struct srp_target_port *target, - enum srp_target_state old, - enum srp_target_state new) +static bool srp_change_conn_state(struct srp_target_port *target, + bool connected) { bool changed = false; spin_lock_irq(&target->lock); - if (target->state == old) { - target->state = new; + if (target->connected != connected) { + target->connected = connected; changed = true; } spin_unlock_irq(&target->lock); + return changed; } +static void srp_disconnect_target(struct srp_target_port *target) +{ + if (srp_change_conn_state(target, false)) { + /* XXX should send SRP_I_LOGOUT request */ + + if (ib_send_cm_dreq(target->cm_id, NULL, 0)) { + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM DREQ failed\n"); + } + } +} + static void srp_free_req_data(struct srp_target_port *target) { - struct ib_device *ibdev = target->srp_host->srp_dev->dev; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; struct srp_request *req; int i; - for (i = 0, req = target->req_ring; i < SRP_CMD_SQ_SIZE; ++i, ++req) { - kfree(req->fmr_list); + if (!target->req_ring) + return; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &target->req_ring[i]; + if (dev->use_fast_reg) + kfree(req->fr_list); + else + kfree(req->fmr_list); kfree(req->map_page); if (req->indirect_dma_addr) { ib_dma_unmap_single(ibdev, req->indirect_dma_addr, @@ -472,6 +787,59 @@ static void srp_free_req_data(struct srp_target_port *target) } kfree(req->indirect_desc); } + + kfree(target->req_ring); + target->req_ring = NULL; +} + +static int srp_alloc_req_data(struct srp_target_port *target) +{ + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + struct srp_request *req; + void *mr_list; + dma_addr_t dma_addr; + int i, ret = -ENOMEM; + + INIT_LIST_HEAD(&target->free_reqs); + + target->req_ring = kzalloc(target->req_ring_size * + sizeof(*target->req_ring), GFP_KERNEL); + if (!target->req_ring) + goto out; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &target->req_ring[i]; + mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), + GFP_KERNEL); + if (!mr_list) + goto out; + if (srp_dev->use_fast_reg) + req->fr_list = mr_list; + else + req->fmr_list = mr_list; + req->map_page = kmalloc(srp_dev->max_pages_per_mr * + sizeof(void *), GFP_KERNEL); + if (!req->map_page) + goto out; + req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); + if (!req->indirect_desc) + goto out; + + dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, + target->indirect_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, dma_addr)) + goto out; + + req->indirect_dma_addr = dma_addr; + req->index = i; + list_add_tail(&req->list, &target->free_reqs); + } + ret = 0; + +out: + return ret; } /** @@ -489,32 +857,55 @@ static void srp_del_scsi_host_attr(struct Scsi_Host *shost) device_remove_file(&shost->shost_dev, *attr); } -static void srp_remove_work(struct work_struct *work) +static void srp_remove_target(struct srp_target_port *target) { - struct srp_target_port *target = - container_of(work, struct srp_target_port, work); - - if (!srp_change_state(target, SRP_TARGET_DEAD, SRP_TARGET_REMOVED)) - return; - - spin_lock(&target->srp_host->target_lock); - list_del(&target->list); - spin_unlock(&target->srp_host->target_lock); + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); srp_del_scsi_host_attr(target->scsi_host); + srp_rport_get(target->rport); srp_remove_host(target->scsi_host); scsi_remove_host(target->scsi_host); + srp_stop_rport_timers(target->rport); + srp_disconnect_target(target); ib_destroy_cm_id(target->cm_id); srp_free_target_ib(target); + cancel_work_sync(&target->tl_err_work); + srp_rport_put(target->rport); srp_free_req_data(target); + + spin_lock(&target->srp_host->target_lock); + list_del(&target->list); + spin_unlock(&target->srp_host->target_lock); + scsi_host_put(target->scsi_host); } +static void srp_remove_work(struct work_struct *work) +{ + struct srp_target_port *target = + container_of(work, struct srp_target_port, remove_work); + + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + + srp_remove_target(target); +} + +static void srp_rport_delete(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + + srp_queue_remove_work(target); +} + static int srp_connect_target(struct srp_target_port *target) { int retries = 3; int ret; + WARN_ON_ONCE(target->connected); + + target->qp_in_error = false; + ret = srp_lookup_path(target); if (ret) return ret; @@ -524,7 +915,9 @@ static int srp_connect_target(struct srp_target_port *target) ret = srp_send_req(target); if (ret) return ret; - wait_for_completion(&target->done); + ret = wait_for_completion_interruptible(&target->done); + if (ret < 0) + return ret; /* * The CM event handling code will set status to @@ -534,6 +927,7 @@ static int srp_connect_target(struct srp_target_port *target) */ switch (target->status) { case 0: + srp_change_conn_state(target, true); return 0; case SRP_PORT_REDIRECT: @@ -566,21 +960,56 @@ static int srp_connect_target(struct srp_target_port *target) } } +static int srp_inv_rkey(struct srp_target_port *target, u32 rkey) +{ + struct ib_send_wr *bad_wr; + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .wr_id = LOCAL_INV_WR_ID_MASK, + .next = NULL, + .num_sge = 0, + .send_flags = 0, + .ex.invalidate_rkey = rkey, + }; + + return ib_post_send(target->qp, &wr, &bad_wr); +} + static void srp_unmap_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { - struct ib_device *ibdev = target->srp_host->srp_dev->dev; - struct ib_pool_fmr **pfmr; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + int i, res; if (!scsi_sglist(scmnd) || (scmnd->sc_data_direction != DMA_TO_DEVICE && scmnd->sc_data_direction != DMA_FROM_DEVICE)) return; - pfmr = req->fmr_list; - while (req->nfmr--) - ib_fmr_pool_unmap(*pfmr++); + if (dev->use_fast_reg) { + struct srp_fr_desc **pfr; + + for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) { + res = srp_inv_rkey(target, (*pfr)->mr->rkey); + if (res < 0) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "Queueing INV WR for rkey %#x failed (%d)\n", + (*pfr)->mr->rkey, res); + queue_work(system_long_wq, + &target->tl_err_work); + } + } + if (req->nmdesc) + srp_fr_pool_put(target->fr_pool, req->fr_list, + req->nmdesc); + } else { + struct ib_pool_fmr **pfmr; + + for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++) + ib_fmr_pool_unmap(*pfmr); + } ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), scmnd->sc_data_direction); @@ -590,6 +1019,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, * srp_claim_req - Take ownership of the scmnd associated with a request. * @target: SRP target port. * @req: SRP request. + * @sdev: If not NULL, only take ownership for this SCSI device. * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take * ownership of @req->scmnd if it equals @scmnd. * @@ -598,16 +1028,17 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, */ static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target, struct srp_request *req, + struct scsi_device *sdev, struct scsi_cmnd *scmnd) { unsigned long flags; spin_lock_irqsave(&target->lock, flags); - if (!scmnd) { + if (req->scmnd && + (!sdev || req->scmnd->device == sdev) && + (!scmnd || req->scmnd == scmnd)) { scmnd = req->scmnd; req->scmnd = NULL; - } else if (req->scmnd == scmnd) { - req->scmnd = NULL; } else { scmnd = NULL; } @@ -618,6 +1049,10 @@ static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target, /** * srp_free_req() - Unmap data and add request to the free request list. + * @target: SRP target port. + * @req: Request to be freed. + * @scmnd: SCSI command associated with @req. + * @req_lim_delta: Amount to be added to @target->req_lim. */ static void srp_free_req(struct srp_target_port *target, struct srp_request *req, struct scsi_cmnd *scmnd, @@ -633,89 +1068,83 @@ static void srp_free_req(struct srp_target_port *target, spin_unlock_irqrestore(&target->lock, flags); } -static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) +static void srp_finish_req(struct srp_target_port *target, + struct srp_request *req, struct scsi_device *sdev, + int result) { - struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL); + struct scsi_cmnd *scmnd = srp_claim_req(target, req, sdev, NULL); if (scmnd) { srp_free_req(target, req, scmnd, 0); - scmnd->result = DID_RESET << 16; + scmnd->result = result; scmnd->scsi_done(scmnd); } } -static int srp_reconnect_target(struct srp_target_port *target) +static void srp_terminate_io(struct srp_rport *rport) { - struct ib_qp_attr qp_attr; - struct ib_wc wc; - int i, ret; - - if (!srp_change_state(target, SRP_TARGET_LIVE, SRP_TARGET_CONNECTING)) - return -EAGAIN; + struct srp_target_port *target = rport->lld_data; + struct Scsi_Host *shost = target->scsi_host; + struct scsi_device *sdev; + int i; - srp_disconnect_target(target); /* - * Now get a new local CM ID so that we avoid confusing the - * target in case things are really fouled up. + * Invoking srp_terminate_io() while srp_queuecommand() is running + * is not safe. Hence the warning statement below. */ - ret = srp_new_cm_id(target); - if (ret) - goto err; + shost_for_each_device(sdev, shost) + WARN_ON_ONCE(sdev->request_queue->request_fn_active); - qp_attr.qp_state = IB_QPS_RESET; - ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); - if (ret) - goto err; + for (i = 0; i < target->req_ring_size; ++i) { + struct srp_request *req = &target->req_ring[i]; + srp_finish_req(target, req, NULL, DID_TRANSPORT_FAILFAST << 16); + } +} - ret = srp_init_qp(target, target->qp); - if (ret) - goto err; +/* + * It is up to the caller to ensure that srp_rport_reconnect() calls are + * serialized and that no concurrent srp_queuecommand(), srp_abort(), + * srp_reset_device() or srp_reset_host() calls will occur while this function + * is in progress. One way to realize that is not to call this function + * directly but to call srp_reconnect_rport() instead since that last function + * serializes calls of this function via rport->mutex and also blocks + * srp_queuecommand() calls before invoking this function. + */ +static int srp_rport_reconnect(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + int i, ret; - while (ib_poll_cq(target->recv_cq, 1, &wc) > 0) - ; /* nothing */ - while (ib_poll_cq(target->send_cq, 1, &wc) > 0) - ; /* nothing */ + srp_disconnect_target(target); + /* + * Now get a new local CM ID so that we avoid confusing the target in + * case things are really fouled up. Doing so also ensures that all CM + * callbacks will have finished before a new QP is allocated. + */ + ret = srp_new_cm_id(target); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + for (i = 0; i < target->req_ring_size; ++i) { struct srp_request *req = &target->req_ring[i]; - if (req->scmnd) - srp_reset_req(target, req); + srp_finish_req(target, req, NULL, DID_RESET << 16); } + /* + * Whether or not creating a new CM ID succeeded, create a new + * QP. This guarantees that all callback functions for the old QP have + * finished before any send requests are posted on the new QP. + */ + ret += srp_create_target_ib(target); + INIT_LIST_HEAD(&target->free_tx); - for (i = 0; i < SRP_SQ_SIZE; ++i) + for (i = 0; i < target->queue_size; ++i) list_add(&target->tx_ring[i]->list, &target->free_tx); - target->qp_in_error = 0; - ret = srp_connect_target(target); - if (ret) - goto err; + if (ret == 0) + ret = srp_connect_target(target); - if (!srp_change_state(target, SRP_TARGET_CONNECTING, SRP_TARGET_LIVE)) - ret = -EAGAIN; - - return ret; - -err: - shost_printk(KERN_ERR, target->scsi_host, - PFX "reconnect failed (%d), removing target port.\n", ret); - - /* - * We couldn't reconnect, so kill our target port off. - * However, we have to defer the real removal because we - * are in the context of the SCSI error handler now, which - * will deadlock if we call scsi_remove_host(). - * - * Schedule our work inside the lock to avoid a race with - * the flush_scheduled_work() in srp_remove_one(). - */ - spin_lock_irq(&target->lock); - if (target->state == SRP_TARGET_CONNECTING) { - target->state = SRP_TARGET_DEAD; - INIT_WORK(&target->work, srp_remove_work); - queue_work(ib_wq, &target->work); - } - spin_unlock_irq(&target->lock); + if (ret == 0) + shost_printk(KERN_INFO, target->scsi_host, + PFX "reconnect succeeded\n"); return ret; } @@ -737,33 +1166,87 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, static int srp_map_finish_fmr(struct srp_map_state *state, struct srp_target_port *target) { - struct srp_device *dev = target->srp_host->srp_dev; struct ib_pool_fmr *fmr; u64 io_addr = 0; - if (!state->npages) - return 0; - - if (state->npages == 1) { - srp_map_desc(state, state->base_dma_addr, state->fmr_len, - target->rkey); - state->npages = state->fmr_len = 0; - return 0; - } - - fmr = ib_fmr_pool_map_phys(dev->fmr_pool, state->pages, + fmr = ib_fmr_pool_map_phys(target->fmr_pool, state->pages, state->npages, io_addr); if (IS_ERR(fmr)) return PTR_ERR(fmr); *state->next_fmr++ = fmr; - state->nfmr++; + state->nmdesc++; + + srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey); - srp_map_desc(state, 0, state->fmr_len, fmr->fmr->rkey); - state->npages = state->fmr_len = 0; return 0; } +static int srp_map_finish_fr(struct srp_map_state *state, + struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_send_wr *bad_wr; + struct ib_send_wr wr; + struct srp_fr_desc *desc; + u32 rkey; + + desc = srp_fr_pool_get(target->fr_pool); + if (!desc) + return -ENOMEM; + + rkey = ib_inc_rkey(desc->mr->rkey); + ib_update_fast_reg_key(desc->mr, rkey); + + memcpy(desc->frpl->page_list, state->pages, + sizeof(state->pages[0]) * state->npages); + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_FAST_REG_MR; + wr.wr_id = FAST_REG_WR_ID_MASK; + wr.wr.fast_reg.iova_start = state->base_dma_addr; + wr.wr.fast_reg.page_list = desc->frpl; + wr.wr.fast_reg.page_list_len = state->npages; + wr.wr.fast_reg.page_shift = ilog2(dev->mr_page_size); + wr.wr.fast_reg.length = state->dma_len; + wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + wr.wr.fast_reg.rkey = desc->mr->lkey; + + *state->next_fr++ = desc; + state->nmdesc++; + + srp_map_desc(state, state->base_dma_addr, state->dma_len, + desc->mr->rkey); + + return ib_post_send(target->qp, &wr, &bad_wr); +} + +static int srp_finish_mapping(struct srp_map_state *state, + struct srp_target_port *target) +{ + int ret = 0; + + if (state->npages == 0) + return 0; + + if (state->npages == 1 && !register_always) + srp_map_desc(state, state->base_dma_addr, state->dma_len, + target->rkey); + else + ret = target->srp_host->srp_dev->use_fast_reg ? + srp_map_finish_fr(state, target) : + srp_map_finish_fmr(state, target); + + if (ret == 0) { + state->npages = 0; + state->dma_len = 0; + } + + return ret; +} + static void srp_map_update_start(struct srp_map_state *state, struct scatterlist *sg, int sg_index, dma_addr_t dma_addr) @@ -776,7 +1259,7 @@ static void srp_map_update_start(struct srp_map_state *state, static int srp_map_sg_entry(struct srp_map_state *state, struct srp_target_port *target, struct scatterlist *sg, int sg_index, - int use_fmr) + bool use_mr) { struct srp_device *dev = target->srp_host->srp_dev; struct ib_device *ibdev = dev->dev; @@ -788,23 +1271,25 @@ static int srp_map_sg_entry(struct srp_map_state *state, if (!dma_len) return 0; - if (use_fmr == SRP_MAP_NO_FMR) { - /* Once we're in direct map mode for a request, we don't - * go back to FMR mode, so no need to update anything + if (!use_mr) { + /* + * Once we're in direct map mode for a request, we don't + * go back to FMR or FR mode, so no need to update anything * other than the descriptor. */ srp_map_desc(state, dma_addr, dma_len, target->rkey); return 0; } - /* If we start at an offset into the FMR page, don't merge into - * the current FMR. Finish it out, and use the kernel's MR for this - * sg entry. This is to avoid potential bugs on some SRP targets - * that were never quite defined, but went away when the initiator - * avoided using FMR on such page fragments. + /* + * Since not all RDMA HW drivers support non-zero page offsets for + * FMR, if we start at an offset into a page, don't merge into the + * current FMR mapping. Finish it out, and use the kernel's MR for + * this sg entry. */ - if (dma_addr & ~dev->fmr_page_mask || dma_len > dev->fmr_max_size) { - ret = srp_map_finish_fmr(state, target); + if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) || + dma_len > dev->mr_max_size) { + ret = srp_finish_mapping(state, target); if (ret) return ret; @@ -813,52 +1298,106 @@ static int srp_map_sg_entry(struct srp_map_state *state, return 0; } - /* If this is the first sg to go into the FMR, save our position. - * We need to know the first unmapped entry, its index, and the - * first unmapped address within that entry to be able to restart - * mapping after an error. + /* + * If this is the first sg that will be mapped via FMR or via FR, save + * our position. We need to know the first unmapped entry, its index, + * and the first unmapped address within that entry to be able to + * restart mapping after an error. */ if (!state->unmapped_sg) srp_map_update_start(state, sg, sg_index, dma_addr); while (dma_len) { - if (state->npages == SRP_FMR_SIZE) { - ret = srp_map_finish_fmr(state, target); + unsigned offset = dma_addr & ~dev->mr_page_mask; + if (state->npages == dev->max_pages_per_mr || offset != 0) { + ret = srp_finish_mapping(state, target); if (ret) return ret; srp_map_update_start(state, sg, sg_index, dma_addr); } - len = min_t(unsigned int, dma_len, dev->fmr_page_size); + len = min_t(unsigned int, dma_len, dev->mr_page_size - offset); if (!state->npages) state->base_dma_addr = dma_addr; - state->pages[state->npages++] = dma_addr; - state->fmr_len += len; + state->pages[state->npages++] = dma_addr & dev->mr_page_mask; + state->dma_len += len; dma_addr += len; dma_len -= len; } - /* If the last entry of the FMR wasn't a full page, then we need to + /* + * If the last entry of the MR wasn't a full page, then we need to * close it out and start a new one -- we can only merge at page * boundries. */ ret = 0; - if (len != dev->fmr_page_size) { - ret = srp_map_finish_fmr(state, target); + if (len != dev->mr_page_size) { + ret = srp_finish_mapping(state, target); if (!ret) srp_map_update_start(state, NULL, 0, 0); } return ret; } +static int srp_map_sg(struct srp_map_state *state, + struct srp_target_port *target, struct srp_request *req, + struct scatterlist *scat, int count) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + struct scatterlist *sg; + int i; + bool use_mr; + + state->desc = req->indirect_desc; + state->pages = req->map_page; + if (dev->use_fast_reg) { + state->next_fr = req->fr_list; + use_mr = !!target->fr_pool; + } else { + state->next_fmr = req->fmr_list; + use_mr = !!target->fmr_pool; + } + + for_each_sg(scat, sg, count, i) { + if (srp_map_sg_entry(state, target, sg, i, use_mr)) { + /* + * Memory registration failed, so backtrack to the + * first unmapped entry and continue on without using + * memory registration. + */ + dma_addr_t dma_addr; + unsigned int dma_len; + +backtrack: + sg = state->unmapped_sg; + i = state->unmapped_index; + + dma_addr = ib_sg_dma_address(ibdev, sg); + dma_len = ib_sg_dma_len(ibdev, sg); + dma_len -= (state->unmapped_addr - dma_addr); + dma_addr = state->unmapped_addr; + use_mr = false; + srp_map_desc(state, dma_addr, dma_len, target->rkey); + } + } + + if (use_mr && srp_finish_mapping(state, target)) + goto backtrack; + + req->nmdesc = state->nmdesc; + + return 0; +} + static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { - struct scatterlist *scat, *sg; + struct scatterlist *scat; struct srp_cmd *cmd = req->cmd->buf; - int i, len, nents, count, use_fmr; + int len, nents, count; struct srp_device *dev; struct ib_device *ibdev; struct srp_map_state state; @@ -890,7 +1429,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, fmt = SRP_DATA_DESC_DIRECT; len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); - if (count == 1) { + if (count == 1 && !register_always) { /* * The midlayer only generated a single gather/scatter * entry, or DMA mapping coalesced everything to a @@ -903,13 +1442,13 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, buf->key = cpu_to_be32(target->rkey); buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); - req->nfmr = 0; + req->nmdesc = 0; goto map_complete; } - /* We have more than one scatter/gather entry, so build our indirect - * descriptor table, trying to merge as many entries with FMR as we - * can. + /* + * We have more than one scatter/gather entry, so build our indirect + * descriptor table, trying to merge as many entries as we can. */ indirect_hdr = (void *) cmd->add_data; @@ -917,35 +1456,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, target->indirect_size, DMA_TO_DEVICE); memset(&state, 0, sizeof(state)); - state.desc = req->indirect_desc; - state.pages = req->map_page; - state.next_fmr = req->fmr_list; - - use_fmr = dev->fmr_pool ? SRP_MAP_ALLOW_FMR : SRP_MAP_NO_FMR; - - for_each_sg(scat, sg, count, i) { - if (srp_map_sg_entry(&state, target, sg, i, use_fmr)) { - /* FMR mapping failed, so backtrack to the first - * unmapped entry and continue on without using FMR. - */ - dma_addr_t dma_addr; - unsigned int dma_len; - -backtrack: - sg = state.unmapped_sg; - i = state.unmapped_index; - - dma_addr = ib_sg_dma_address(ibdev, sg); - dma_len = ib_sg_dma_len(ibdev, sg); - dma_len -= (state.unmapped_addr - dma_addr); - dma_addr = state.unmapped_addr; - use_fmr = SRP_MAP_NO_FMR; - srp_map_desc(&state, dma_addr, dma_len, target->rkey); - } - } - - if (use_fmr == SRP_MAP_ALLOW_FMR && srp_map_finish_fmr(&state, target)) - goto backtrack; + srp_map_sg(&state, target, req, scat, count); /* We've mapped the request, now pull as much of the indirect * descriptor table as we can into the command buffer. If this @@ -953,9 +1464,9 @@ backtrack: * guaranteed to fit into the command, as the SCSI layer won't * give us more S/G entries than we allow. */ - req->nfmr = state.nfmr; if (state.ndesc == 1) { - /* FMR mapping was able to collapse this to one entry, + /* + * Memory registration collapsed the sg-list into one entry, * so use a direct descriptor. */ struct srp_direct_buf *buf = (void *) cmd->add_data; @@ -1111,7 +1622,7 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) complete(&target->tsk_mgmt_done); } else { req = &target->req_ring[rsp->tag]; - scmnd = srp_claim_req(target, req, NULL); + scmnd = srp_claim_req(target, req, NULL, NULL); if (!scmnd) { shost_printk(KERN_ERR, target->scsi_host, "Null scmnd for RSP w/tag %016llx\n", @@ -1262,6 +1773,45 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) PFX "Recv failed with error code %d\n", res); } +/** + * srp_tl_err_work() - handle a transport layer error + * @work: Work structure embedded in an SRP target port. + * + * Note: This function may get invoked before the rport has been created, + * hence the target->rport test. + */ +static void srp_tl_err_work(struct work_struct *work) +{ + struct srp_target_port *target; + + target = container_of(work, struct srp_target_port, tl_err_work); + if (target->rport) + srp_start_tl_fail_timers(target->rport); +} + +static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status, + bool send_err, struct srp_target_port *target) +{ + if (target->connected && !target->qp_in_error) { + if (wr_id & LOCAL_INV_WR_ID_MASK) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "LOCAL_INV failed with status %d\n", + wc_status); + } else if (wr_id & FAST_REG_WR_ID_MASK) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "FAST_REG_MR failed status %d\n", + wc_status); + } else { + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %d for iu %p\n", + send_err ? "send" : "receive", + wc_status, (void *)(uintptr_t)wr_id); + } + queue_work(system_long_wq, &target->tl_err_work); + } + target->qp_in_error = true; +} + static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) { struct srp_target_port *target = target_ptr; @@ -1269,15 +1819,11 @@ static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); while (ib_poll_cq(cq, 1, &wc) > 0) { - if (wc.status) { - shost_printk(KERN_ERR, target->scsi_host, - PFX "failed receive status %d\n", - wc.status); - target->qp_in_error = 1; - break; + if (likely(wc.status == IB_WC_SUCCESS)) { + srp_handle_recv(target, &wc); + } else { + srp_handle_qp_err(wc.wr_id, wc.status, false, target); } - - srp_handle_recv(target, &wc); } } @@ -1288,38 +1834,39 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) struct srp_iu *iu; while (ib_poll_cq(cq, 1, &wc) > 0) { - if (wc.status) { - shost_printk(KERN_ERR, target->scsi_host, - PFX "failed send status %d\n", - wc.status); - target->qp_in_error = 1; - break; + if (likely(wc.status == IB_WC_SUCCESS)) { + iu = (struct srp_iu *) (uintptr_t) wc.wr_id; + list_add(&iu->list, &target->free_tx); + } else { + srp_handle_qp_err(wc.wr_id, wc.status, true, target); } - - iu = (struct srp_iu *) (uintptr_t) wc.wr_id; - list_add(&iu->list, &target->free_tx); } } static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(shost); + struct srp_rport *rport = target->rport; struct srp_request *req; struct srp_iu *iu; struct srp_cmd *cmd; struct ib_device *dev; unsigned long flags; - int len; + int len, ret; + const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler; - if (target->state == SRP_TARGET_CONNECTING) - goto err; + /* + * The SCSI EH thread is the only context from which srp_queuecommand() + * can get invoked for blocked devices (SDEV_BLOCK / + * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by + * locking the rport mutex if invoked from inside the SCSI EH. + */ + if (in_scsi_eh) + mutex_lock(&rport->mutex); - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) { - scmnd->result = DID_BAD_TARGET << 16; - scmnd->scsi_done(scmnd); - return 0; - } + scmnd->result = srp_chkready(target->rport); + if (unlikely(scmnd->result)) + goto err; spin_lock_irqsave(&target->lock, flags); iu = __srp_get_tx_iu(target, SRP_IU_CMD); @@ -1334,7 +1881,6 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len, DMA_TO_DEVICE); - scmnd->result = 0; scmnd->host_scribble = (void *) req; cmd = iu->buf; @@ -1351,7 +1897,15 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) len = srp_map_data(scmnd, target, req); if (len < 0) { shost_printk(KERN_ERR, target->scsi_host, - PFX "Failed to map data\n"); + PFX "Failed to map data (%d)\n", len); + /* + * If we ran out of memory descriptors (-ENOMEM) because an + * application is queuing many requests with more than + * max_pages_per_mr sg-list elements, tell the SCSI mid-layer + * to reduce queue depth temporarily. + */ + scmnd->result = len == -ENOMEM ? + DID_OK << 16 | QUEUE_FULL << 1 : DID_ERROR << 16; goto err_iu; } @@ -1363,7 +1917,13 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) goto err_unmap; } - return 0; + ret = 0; + +unlock_rport: + if (in_scsi_eh) + mutex_unlock(&rport->mutex); + + return ret; err_unmap: srp_unmap_data(scmnd, target, req); @@ -1371,6 +1931,12 @@ err_unmap: err_iu: srp_put_tx_iu(target, iu, SRP_IU_CMD); + /* + * Avoid that the loops that iterate over the request ring can + * encounter a dangling SCSI command pointer. + */ + req->scmnd = NULL; + spin_lock_irqsave(&target->lock, flags); list_add(&req->list, &target->free_reqs); @@ -1378,14 +1944,34 @@ err_unlock: spin_unlock_irqrestore(&target->lock, flags); err: - return SCSI_MLQUEUE_HOST_BUSY; + if (scmnd->result) { + scmnd->scsi_done(scmnd); + ret = 0; + } else { + ret = SCSI_MLQUEUE_HOST_BUSY; + } + + goto unlock_rport; } +/* + * Note: the resources allocated in this function are freed in + * srp_free_target_ib(). + */ static int srp_alloc_iu_bufs(struct srp_target_port *target) { int i; - for (i = 0; i < SRP_RQ_SIZE; ++i) { + target->rx_ring = kzalloc(target->queue_size * sizeof(*target->rx_ring), + GFP_KERNEL); + if (!target->rx_ring) + goto err_no_ring; + target->tx_ring = kzalloc(target->queue_size * sizeof(*target->tx_ring), + GFP_KERNEL); + if (!target->tx_ring) + goto err_no_ring; + + for (i = 0; i < target->queue_size; ++i) { target->rx_ring[i] = srp_alloc_iu(target->srp_host, target->max_ti_iu_len, GFP_KERNEL, DMA_FROM_DEVICE); @@ -1393,7 +1979,7 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) goto err; } - for (i = 0; i < SRP_SQ_SIZE; ++i) { + for (i = 0; i < target->queue_size; ++i) { target->tx_ring[i] = srp_alloc_iu(target->srp_host, target->max_iu_len, GFP_KERNEL, DMA_TO_DEVICE); @@ -1406,19 +1992,48 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) return 0; err: - for (i = 0; i < SRP_RQ_SIZE; ++i) { + for (i = 0; i < target->queue_size; ++i) { srp_free_iu(target->srp_host, target->rx_ring[i]); - target->rx_ring[i] = NULL; - } - - for (i = 0; i < SRP_SQ_SIZE; ++i) { srp_free_iu(target->srp_host, target->tx_ring[i]); - target->tx_ring[i] = NULL; } + +err_no_ring: + kfree(target->tx_ring); + target->tx_ring = NULL; + kfree(target->rx_ring); + target->rx_ring = NULL; + return -ENOMEM; } +static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask) +{ + uint64_t T_tr_ns, max_compl_time_ms; + uint32_t rq_tmo_jiffies; + + /* + * According to section 11.2.4.2 in the IBTA spec (Modify Queue Pair, + * table 91), both the QP timeout and the retry count have to be set + * for RC QP's during the RTR to RTS transition. + */ + WARN_ON_ONCE((attr_mask & (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)) != + (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)); + + /* + * Set target->rq_tmo_jiffies to one second more than the largest time + * it can take before an error completion is generated. See also + * C9-140..142 in the IBTA spec for more information about how to + * convert the QP Local ACK Timeout value to nanoseconds. + */ + T_tr_ns = 4096 * (1ULL << qp_attr->timeout); + max_compl_time_ms = qp_attr->retry_cnt * 4 * T_tr_ns; + do_div(max_compl_time_ms, NSEC_PER_MSEC); + rq_tmo_jiffies = msecs_to_jiffies(max_compl_time_ms + 1000); + + return rq_tmo_jiffies; +} + static void srp_cm_rep_handler(struct ib_cm_id *cm_id, struct srp_login_rsp *lrsp, struct srp_target_port *target) @@ -1439,6 +2054,9 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, target->scsi_host->can_queue = min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, target->scsi_host->can_queue); + target->scsi_host->cmd_per_lun + = min_t(int, target->scsi_host->can_queue, + target->scsi_host->cmd_per_lun); } else { shost_printk(KERN_WARNING, target->scsi_host, PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); @@ -1446,7 +2064,7 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, goto error; } - if (!target->rx_ring[0]) { + if (!target->rx_ring) { ret = srp_alloc_iu_bufs(target); if (ret) goto error; @@ -1466,7 +2084,7 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, if (ret) goto error_free; - for (i = 0; i < SRP_RQ_SIZE; i++) { + for (i = 0; i < target->queue_size; i++) { struct srp_iu *iu = target->rx_ring[i]; ret = srp_post_recv(target, iu); if (ret) @@ -1478,6 +2096,8 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, if (ret) goto error_free; + target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); + ret = ib_modify_qp(target->qp, qp_attr, attr_mask); if (ret) goto error_free; @@ -1550,8 +2170,10 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, shost_printk(KERN_WARNING, shost, PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); else - shost_printk(KERN_WARNING, shost, - PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason); + shost_printk(KERN_WARNING, shost, PFX + "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n", + target->path.sgid.raw, + target->orig_dgid, reason); } else shost_printk(KERN_WARNING, shost, " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," @@ -1599,16 +2221,18 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) case IB_CM_DREQ_RECEIVED: shost_printk(KERN_WARNING, target->scsi_host, PFX "DREQ received - connection closed\n"); + srp_change_conn_state(target, false); if (ib_send_cm_drep(cm_id, NULL, 0)) shost_printk(KERN_ERR, target->scsi_host, PFX "Sending CM DREP failed\n"); + queue_work(system_long_wq, &target->tl_err_work); break; case IB_CM_TIMEWAIT_EXIT: shost_printk(KERN_ERR, target->scsi_host, PFX "connection closed\n"); - comp = 1; + target->status = 0; break; @@ -1629,25 +2253,84 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) return 0; } +/** + * srp_change_queue_type - changing device queue tag type + * @sdev: scsi device struct + * @tag_type: requested tag type + * + * Returns queue tag type. + */ +static int +srp_change_queue_type(struct scsi_device *sdev, int tag_type) +{ + if (sdev->tagged_supported) { + scsi_set_tag_type(sdev, tag_type); + if (tag_type) + scsi_activate_tcq(sdev, sdev->queue_depth); + else + scsi_deactivate_tcq(sdev, sdev->queue_depth); + } else + tag_type = 0; + + return tag_type; +} + +/** + * srp_change_queue_depth - setting device queue depth + * @sdev: scsi device struct + * @qdepth: requested queue depth + * @reason: SCSI_QDEPTH_DEFAULT/SCSI_QDEPTH_QFULL/SCSI_QDEPTH_RAMP_UP + * (see include/scsi/scsi_host.h for definition) + * + * Returns queue depth. + */ +static int +srp_change_queue_depth(struct scsi_device *sdev, int qdepth, int reason) +{ + struct Scsi_Host *shost = sdev->host; + int max_depth; + if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) { + max_depth = shost->can_queue; + if (!sdev->tagged_supported) + max_depth = 1; + if (qdepth > max_depth) + qdepth = max_depth; + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth); + } else if (reason == SCSI_QDEPTH_QFULL) + scsi_track_queue_full(sdev, qdepth); + else + return -EOPNOTSUPP; + + return sdev->queue_depth; +} + static int srp_send_tsk_mgmt(struct srp_target_port *target, u64 req_tag, unsigned int lun, u8 func) { + struct srp_rport *rport = target->rport; struct ib_device *dev = target->srp_host->srp_dev->dev; struct srp_iu *iu; struct srp_tsk_mgmt *tsk_mgmt; - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) + if (!target->connected || target->qp_in_error) return -1; init_completion(&target->tsk_mgmt_done); + /* + * Lock the rport mutex to avoid that srp_create_target_ib() is + * invoked while a task management function is being sent. + */ + mutex_lock(&rport->mutex); spin_lock_irq(&target->lock); iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); spin_unlock_irq(&target->lock); - if (!iu) + if (!iu) { + mutex_unlock(&rport->mutex); + return -1; + } ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, DMA_TO_DEVICE); @@ -1664,8 +2347,11 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, DMA_TO_DEVICE); if (srp_post_send(target, iu, sizeof *tsk_mgmt)) { srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT); + mutex_unlock(&rport->mutex); + return -1; } + mutex_unlock(&rport->mutex); if (!wait_for_completion_timeout(&target->tsk_mgmt_done, msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS))) @@ -1678,18 +2364,24 @@ static int srp_abort(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); struct srp_request *req = (struct srp_request *) scmnd->host_scribble; + int ret; shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); - if (!req || target->qp_in_error || !srp_claim_req(target, req, scmnd)) - return FAILED; - srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, - SRP_TSK_ABORT_TASK); + if (!req || !srp_claim_req(target, req, NULL, scmnd)) + return SUCCESS; + if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, + SRP_TSK_ABORT_TASK) == 0) + ret = SUCCESS; + else if (target->rport->state == SRP_RPORT_LOST) + ret = FAST_IO_FAIL; + else + ret = FAILED; srp_free_req(target, req, scmnd, 0); scmnd->result = DID_ABORT << 16; scmnd->scsi_done(scmnd); - return SUCCESS; + return ret; } static int srp_reset_device(struct scsi_cmnd *scmnd) @@ -1699,18 +2391,15 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); - if (target->qp_in_error) - return FAILED; if (srp_send_tsk_mgmt(target, SRP_TAG_NO_REQ, scmnd->device->lun, SRP_TSK_LUN_RESET)) return FAILED; if (target->tsk_mgmt_status) return FAILED; - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + for (i = 0; i < target->req_ring_size; ++i) { struct srp_request *req = &target->req_ring[i]; - if (req->scmnd && req->scmnd->device == scmnd->device) - srp_reset_req(target, req); + srp_finish_req(target, req, scmnd->device, DID_RESET << 16); } return SUCCESS; @@ -1719,14 +2408,25 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) static int srp_reset_host(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); - int ret = FAILED; shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); - if (!srp_reconnect_target(target)) - ret = SUCCESS; + return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED; +} - return ret; +static int srp_slave_configure(struct scsi_device *sdev) +{ + struct Scsi_Host *shost = sdev->host; + struct srp_target_port *target = host_to_target(shost); + struct request_queue *q = sdev->request_queue; + unsigned long timeout; + + if (sdev->type == TYPE_DISK) { + timeout = max_t(unsigned, 30 * HZ, target->rq_tmo_jiffies); + blk_queue_rq_timeout(q, timeout); + } + + return 0; } static ssize_t show_id_ext(struct device *dev, struct device_attribute *attr, @@ -1764,6 +2464,14 @@ static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, return sprintf(buf, "0x%04x\n", be16_to_cpu(target->path.pkey)); } +static ssize_t show_sgid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%pI6\n", target->path.sgid.raw); +} + static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1812,6 +2520,22 @@ static ssize_t show_local_ib_device(struct device *dev, return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); } +static ssize_t show_comp_vector(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->comp_vector); +} + +static ssize_t show_tl_retry_count(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->tl_retry_count); +} + static ssize_t show_cmd_sg_entries(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1832,12 +2556,15 @@ static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL); static DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL); static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL); static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); +static DEVICE_ATTR(sgid, S_IRUGO, show_sgid, NULL); static DEVICE_ATTR(dgid, S_IRUGO, show_dgid, NULL); static DEVICE_ATTR(orig_dgid, S_IRUGO, show_orig_dgid, NULL); static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); +static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL); +static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL); static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); static DEVICE_ATTR(allow_ext_sg, S_IRUGO, show_allow_ext_sg, NULL); @@ -1846,12 +2573,15 @@ static struct device_attribute *srp_host_attrs[] = { &dev_attr_ioc_guid, &dev_attr_service_id, &dev_attr_pkey, + &dev_attr_sgid, &dev_attr_dgid, &dev_attr_orig_dgid, &dev_attr_req_lim, &dev_attr_zero_req_lim, &dev_attr_local_ib_port, &dev_attr_local_ib_device, + &dev_attr_comp_vector, + &dev_attr_tl_retry_count, &dev_attr_cmd_sg_entries, &dev_attr_allow_ext_sg, NULL @@ -1861,15 +2591,19 @@ static struct scsi_host_template srp_template = { .module = THIS_MODULE, .name = "InfiniBand SRP initiator", .proc_name = DRV_NAME, + .slave_configure = srp_slave_configure, .info = srp_target_info, .queuecommand = srp_queuecommand, + .change_queue_depth = srp_change_queue_depth, + .change_queue_type = srp_change_queue_type, .eh_abort_handler = srp_abort, .eh_device_reset_handler = srp_reset_device, .eh_host_reset_handler = srp_reset_host, + .skip_settle_delay = true, .sg_tablesize = SRP_DEF_SG_TABLESIZE, - .can_queue = SRP_CMD_SQ_SIZE, + .can_queue = SRP_DEFAULT_CMD_SQ_SIZE, .this_id = -1, - .cmd_per_lun = SRP_CMD_SQ_SIZE, + .cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE, .use_clustering = ENABLE_CLUSTERING, .shost_attrs = srp_host_attrs }; @@ -1894,6 +2628,9 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) return PTR_ERR(rport); } + rport->lld_data = target; + target->rport = rport; + spin_lock(&host->target_lock); list_add_tail(&target->list, &host->target_list); spin_unlock(&host->target_lock); @@ -1919,6 +2656,38 @@ static struct class srp_class = { .dev_release = srp_release_dev }; +/** + * srp_conn_unique() - check whether the connection to a target is unique + * @host: SRP host. + * @target: SRP target port. + */ +static bool srp_conn_unique(struct srp_host *host, + struct srp_target_port *target) +{ + struct srp_target_port *t; + bool ret = false; + + if (target->state == SRP_TARGET_REMOVED) + goto out; + + ret = true; + + spin_lock(&host->target_lock); + list_for_each_entry(t, &host->target_list, list) { + if (t != target && + target->id_ext == t->id_ext && + target->ioc_guid == t->ioc_guid && + target->initiator_ext == t->initiator_ext) { + ret = false; + break; + } + } + spin_unlock(&host->target_lock); + +out: + return ret; +} + /* * Target ports are added by writing * @@ -1941,6 +2710,9 @@ enum { SRP_OPT_CMD_SG_ENTRIES = 1 << 9, SRP_OPT_ALLOW_EXT_SG = 1 << 10, SRP_OPT_SG_TABLESIZE = 1 << 11, + SRP_OPT_COMP_VECTOR = 1 << 12, + SRP_OPT_TL_RETRY_COUNT = 1 << 13, + SRP_OPT_QUEUE_SIZE = 1 << 14, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -1961,6 +2733,9 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_CMD_SG_ENTRIES, "cmd_sg_entries=%u" }, { SRP_OPT_ALLOW_EXT_SG, "allow_ext_sg=%u" }, { SRP_OPT_SG_TABLESIZE, "sg_tablesize=%u" }, + { SRP_OPT_COMP_VECTOR, "comp_vector=%u" }, + { SRP_OPT_TL_RETRY_COUNT, "tl_retry_count=%u" }, + { SRP_OPT_QUEUE_SIZE, "queue_size=%d" }, { SRP_OPT_ERR, NULL } }; @@ -2055,13 +2830,25 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->scsi_host->max_sectors = token; break; + case SRP_OPT_QUEUE_SIZE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad queue_size parameter '%s'\n", p); + goto out; + } + target->scsi_host->can_queue = token; + target->queue_size = token + SRP_RSP_SQ_SIZE + + SRP_TSK_MGMT_SQ_SIZE; + if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + target->scsi_host->cmd_per_lun = token; + break; + case SRP_OPT_MAX_CMD_PER_LUN: - if (match_int(args, &token)) { + if (match_int(args, &token) || token < 1) { pr_warn("bad max cmd_per_lun parameter '%s'\n", p); goto out; } - target->scsi_host->cmd_per_lun = min(token, SRP_CMD_SQ_SIZE); + target->scsi_host->cmd_per_lun = token; break; case SRP_OPT_IO_CLASS: @@ -2116,6 +2903,23 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->sg_tablesize = token; break; + case SRP_OPT_COMP_VECTOR: + if (match_int(args, &token) || token < 0) { + pr_warn("bad comp_vector parameter '%s'\n", p); + goto out; + } + target->comp_vector = token; + break; + + case SRP_OPT_TL_RETRY_COUNT: + if (match_int(args, &token) || token < 2 || token > 7) { + pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n", + p); + goto out; + } + target->tl_retry_count = token; + break; + default: pr_warn("unknown parameter or missing value '%s' in target creation request\n", p); @@ -2132,6 +2936,12 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) pr_warn("target creation request is missing parameter '%s'\n", srp_opt_tokens[i].pattern); + if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue + && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + pr_warn("cmd_per_lun = %d > queue_size = %d\n", + target->scsi_host->cmd_per_lun, + target->scsi_host->can_queue); + out: kfree(options); return ret; @@ -2145,9 +2955,9 @@ static ssize_t srp_create_target(struct device *dev, container_of(dev, struct srp_host, dev); struct Scsi_Host *target_host; struct srp_target_port *target; - struct ib_device *ibdev = host->srp_dev->dev; - dma_addr_t dma_addr; - int i, ret; + struct srp_device *srp_dev = host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + int ret; target_host = scsi_host_alloc(&srp_template, sizeof (struct srp_target_port)); @@ -2170,14 +2980,30 @@ static ssize_t srp_create_target(struct device *dev, target->cmd_sg_cnt = cmd_sg_entries; target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; target->allow_ext_sg = allow_ext_sg; + target->tl_retry_count = 7; + target->queue_size = SRP_DEFAULT_QUEUE_SIZE; + + mutex_lock(&host->add_target_mutex); ret = srp_parse_options(buf, target); if (ret) goto err; - if (!host->srp_dev->fmr_pool && !target->allow_ext_sg && - target->cmd_sg_cnt < target->sg_tablesize) { - pr_warn("No FMR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); + target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE; + + if (!srp_conn_unique(target->srp_host, target)) { + shost_printk(KERN_INFO, target->scsi_host, + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be64_to_cpu(target->initiator_ext)); + ret = -EEXIST; + goto err; + } + + if (!srp_dev->has_fmr && !srp_dev->has_fr && !target->allow_ext_sg && + target->cmd_sg_cnt < target->sg_tablesize) { + pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); target->sg_tablesize = target->cmd_sg_cnt; } @@ -2188,41 +3014,17 @@ static ssize_t srp_create_target(struct device *dev, sizeof (struct srp_indirect_buf) + target->cmd_sg_cnt * sizeof (struct srp_direct_buf); + INIT_WORK(&target->tl_err_work, srp_tl_err_work); + INIT_WORK(&target->remove_work, srp_remove_work); spin_lock_init(&target->lock); INIT_LIST_HEAD(&target->free_tx); - INIT_LIST_HEAD(&target->free_reqs); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { - struct srp_request *req = &target->req_ring[i]; - - req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof (void *), - GFP_KERNEL); - req->map_page = kmalloc(SRP_FMR_SIZE * sizeof (void *), - GFP_KERNEL); - req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); - if (!req->fmr_list || !req->map_page || !req->indirect_desc) - goto err_free_mem; - - dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, - target->indirect_size, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(ibdev, dma_addr)) - goto err_free_mem; - - req->indirect_dma_addr = dma_addr; - req->index = i; - list_add_tail(&req->list, &target->free_reqs); - } - - ib_query_gid(ibdev, host->port, 0, &target->path.sgid); + ret = srp_alloc_req_data(target); + if (ret) + goto err_free_mem; - shost_printk(KERN_DEBUG, target->scsi_host, PFX - "new target: id_ext %016llx ioc_guid %016llx pkey %04x " - "service_id %016llx dgid %pI6\n", - (unsigned long long) be64_to_cpu(target->id_ext), - (unsigned long long) be64_to_cpu(target->ioc_guid), - be16_to_cpu(target->path.pkey), - (unsigned long long) be64_to_cpu(target->service_id), - target->path.dgid.raw); + ret = ib_query_gid(ibdev, host->port, 0, &target->path.sgid); + if (ret) + goto err_free_mem; ret = srp_create_target_ib(target); if (ret) @@ -2232,7 +3034,6 @@ static ssize_t srp_create_target(struct device *dev, if (ret) goto err_free_ib; - target->qp_in_error = 0; ret = srp_connect_target(target); if (ret) { shost_printk(KERN_ERR, target->scsi_host, @@ -2244,7 +3045,19 @@ static ssize_t srp_create_target(struct device *dev, if (ret) goto err_disconnect; - return count; + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be16_to_cpu(target->path.pkey), + be64_to_cpu(target->service_id), + target->path.sgid.raw, target->path.dgid.raw); + + ret = count; + +out: + mutex_unlock(&host->add_target_mutex); + return ret; err_disconnect: srp_disconnect_target(target); @@ -2260,8 +3073,7 @@ err_free_mem: err: scsi_host_put(target_host); - - return ret; + goto out; } static DEVICE_ATTR(add_target, S_IWUSR, NULL, srp_create_target); @@ -2297,6 +3109,7 @@ static struct srp_host *srp_add_port(struct srp_device *device, u8 port) INIT_LIST_HEAD(&host->target_list); spin_lock_init(&host->target_lock); init_completion(&host->released); + mutex_init(&host->add_target_mutex); host->srp_dev = device; host->port = port; @@ -2328,9 +3141,9 @@ static void srp_add_one(struct ib_device *device) { struct srp_device *srp_dev; struct ib_device_attr *dev_attr; - struct ib_fmr_pool_param fmr_param; struct srp_host *host; - int max_pages_per_fmr, fmr_page_shift, s, e, p; + int mr_page_shift, s, e, p; + u64 max_pages_per_mr; dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); if (!dev_attr) @@ -2345,15 +3158,39 @@ static void srp_add_one(struct ib_device *device) if (!srp_dev) goto free_attr; + srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + srp_dev->has_fr = (dev_attr->device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (!srp_dev->has_fmr && !srp_dev->has_fr) + dev_warn(&device->dev, "neither FMR nor FR is supported\n"); + + srp_dev->use_fast_reg = (srp_dev->has_fr && + (!srp_dev->has_fmr || prefer_fr)); + /* * Use the smallest page size supported by the HCA, down to a * minimum of 4096 bytes. We're unlikely to build large sglists * out of smaller entries. */ - fmr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); - srp_dev->fmr_page_size = 1 << fmr_page_shift; - srp_dev->fmr_page_mask = ~((u64) srp_dev->fmr_page_size - 1); - srp_dev->fmr_max_size = srp_dev->fmr_page_size * SRP_FMR_SIZE; + mr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); + srp_dev->mr_page_size = 1 << mr_page_shift; + srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); + max_pages_per_mr = dev_attr->max_mr_size; + do_div(max_pages_per_mr, srp_dev->mr_page_size); + srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, + max_pages_per_mr); + if (srp_dev->use_fast_reg) { + srp_dev->max_pages_per_mr = + min_t(u32, srp_dev->max_pages_per_mr, + dev_attr->max_fast_reg_page_list_len); + } + srp_dev->mr_max_size = srp_dev->mr_page_size * + srp_dev->max_pages_per_mr; + pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", + device->name, mr_page_shift, dev_attr->max_mr_size, + dev_attr->max_fast_reg_page_list_len, + srp_dev->max_pages_per_mr, srp_dev->mr_max_size); INIT_LIST_HEAD(&srp_dev->dev_list); @@ -2369,27 +3206,6 @@ static void srp_add_one(struct ib_device *device) if (IS_ERR(srp_dev->mr)) goto err_pd; - for (max_pages_per_fmr = SRP_FMR_SIZE; - max_pages_per_fmr >= SRP_FMR_MIN_SIZE; - max_pages_per_fmr /= 2, srp_dev->fmr_max_size /= 2) { - memset(&fmr_param, 0, sizeof fmr_param); - fmr_param.pool_size = SRP_FMR_POOL_SIZE; - fmr_param.dirty_watermark = SRP_FMR_DIRTY_SIZE; - fmr_param.cache = 1; - fmr_param.max_pages_per_fmr = max_pages_per_fmr; - fmr_param.page_shift = fmr_page_shift; - fmr_param.access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); - - srp_dev->fmr_pool = ib_create_fmr_pool(srp_dev->pd, &fmr_param); - if (!IS_ERR(srp_dev->fmr_pool)) - break; - } - - if (IS_ERR(srp_dev->fmr_pool)) - srp_dev->fmr_pool = NULL; - if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; @@ -2422,10 +3238,11 @@ static void srp_remove_one(struct ib_device *device) { struct srp_device *srp_dev; struct srp_host *host, *tmp_host; - LIST_HEAD(target_list); - struct srp_target_port *target, *tmp_target; + struct srp_target_port *target; srp_dev = ib_get_client_data(device, &srp_client); + if (!srp_dev) + return; list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) { device_unregister(&host->dev); @@ -2436,41 +3253,21 @@ static void srp_remove_one(struct ib_device *device) wait_for_completion(&host->released); /* - * Mark all target ports as removed, so we stop queueing - * commands and don't try to reconnect. + * Remove all target ports. */ spin_lock(&host->target_lock); - list_for_each_entry(target, &host->target_list, list) { - spin_lock_irq(&target->lock); - target->state = SRP_TARGET_REMOVED; - spin_unlock_irq(&target->lock); - } + list_for_each_entry(target, &host->target_list, list) + srp_queue_remove_work(target); spin_unlock(&host->target_lock); /* - * Wait for any reconnection tasks that may have - * started before we marked our target ports as - * removed, and any target port removal tasks. + * Wait for target port removal tasks. */ - flush_workqueue(ib_wq); - - list_for_each_entry_safe(target, tmp_target, - &host->target_list, list) { - srp_del_scsi_host_attr(target->scsi_host); - srp_remove_host(target->scsi_host); - scsi_remove_host(target->scsi_host); - srp_disconnect_target(target); - ib_destroy_cm_id(target->cm_id); - srp_free_target_ib(target); - srp_free_req_data(target); - scsi_host_put(target->scsi_host); - } + flush_workqueue(system_long_wq); kfree(host); } - if (srp_dev->fmr_pool) - ib_destroy_fmr_pool(srp_dev->fmr_pool); ib_dereg_mr(srp_dev->mr); ib_dealloc_pd(srp_dev->pd); @@ -2478,6 +3275,14 @@ static void srp_remove_one(struct ib_device *device) } static struct srp_function_template ib_srp_transport_functions = { + .has_rport_state = true, + .reset_timer_if_blocked = true, + .reconnect_delay = &srp_reconnect_delay, + .fast_io_fail_tmo = &srp_fast_io_fail_tmo, + .dev_loss_tmo = &srp_dev_loss_tmo, + .reconnect = srp_rport_reconnect, + .rport_delete = srp_rport_delete, + .terminate_rport_io = srp_terminate_io, }; static int __init srp_init_module(void) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 020caf0c378..e46ecb15aa0 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -57,32 +57,24 @@ enum { SRP_MAX_LUN = 512, SRP_DEF_SG_TABLESIZE = 12, - SRP_RQ_SHIFT = 6, - SRP_RQ_SIZE = 1 << SRP_RQ_SHIFT, - - SRP_SQ_SIZE = SRP_RQ_SIZE, + SRP_DEFAULT_QUEUE_SIZE = 1 << 6, SRP_RSP_SQ_SIZE = 1, - SRP_REQ_SQ_SIZE = SRP_SQ_SIZE - SRP_RSP_SQ_SIZE, SRP_TSK_MGMT_SQ_SIZE = 1, - SRP_CMD_SQ_SIZE = SRP_REQ_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE, + SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE - + SRP_TSK_MGMT_SQ_SIZE, SRP_TAG_NO_REQ = ~0U, SRP_TAG_TSK_MGMT = 1U << 31, - SRP_FMR_SIZE = 512, - SRP_FMR_MIN_SIZE = 128, - SRP_FMR_POOL_SIZE = 1024, - SRP_FMR_DIRTY_SIZE = SRP_FMR_POOL_SIZE / 4, + SRP_MAX_PAGES_PER_MR = 512, - SRP_MAP_ALLOW_FMR = 0, - SRP_MAP_NO_FMR = 1, + LOCAL_INV_WR_ID_MASK = 1, + FAST_REG_WR_ID_MASK = 2, }; enum srp_target_state { SRP_TARGET_LIVE, - SRP_TARGET_CONNECTING, - SRP_TARGET_DEAD, - SRP_TARGET_REMOVED + SRP_TARGET_REMOVED, }; enum srp_iu_type { @@ -91,15 +83,24 @@ enum srp_iu_type { SRP_IU_RSP, }; +/* + * @mr_page_mask: HCA memory registration page mask. + * @mr_page_size: HCA memory registration page size. + * @mr_max_size: Maximum size in bytes of a single FMR / FR registration + * request. + */ struct srp_device { struct list_head dev_list; struct ib_device *dev; struct ib_pd *pd; struct ib_mr *mr; - struct ib_fmr_pool *fmr_pool; - u64 fmr_page_mask; - int fmr_page_size; - int fmr_max_size; + u64 mr_page_mask; + int mr_page_size; + int mr_max_size; + int max_pages_per_mr; + bool has_fmr; + bool has_fr; + bool use_fast_reg; }; struct srp_host { @@ -110,17 +111,21 @@ struct srp_host { spinlock_t target_lock; struct completion released; struct list_head list; + struct mutex add_target_mutex; }; struct srp_request { struct list_head list; struct scsi_cmnd *scmnd; struct srp_iu *cmd; - struct ib_pool_fmr **fmr_list; + union { + struct ib_pool_fmr **fmr_list; + struct srp_fr_desc **fr_list; + }; u64 *map_page; struct srp_direct_buf *indirect_desc; dma_addr_t indirect_dma_addr; - short nfmr; + short nmdesc; short index; }; @@ -135,6 +140,10 @@ struct srp_target_port { struct ib_cq *send_cq ____cacheline_aligned_in_smp; struct ib_cq *recv_cq; struct ib_qp *qp; + union { + struct ib_fmr_pool *fmr_pool; + struct srp_fr_pool *fr_pool; + }; u32 lkey; u32 rkey; enum srp_target_state state; @@ -154,31 +163,40 @@ struct srp_target_port { u16 io_class; struct srp_host *srp_host; struct Scsi_Host *scsi_host; + struct srp_rport *rport; char target_name[32]; unsigned int scsi_id; unsigned int sg_tablesize; + int queue_size; + int req_ring_size; + int comp_vector; + int tl_retry_count; struct ib_sa_path_rec path; __be16 orig_dgid[8]; struct ib_sa_query *path_query; int path_query_id; + u32 rq_tmo_jiffies; + bool connected; + struct ib_cm_id *cm_id; int max_ti_iu_len; int zero_req_lim; - struct srp_iu *tx_ring[SRP_SQ_SIZE]; - struct srp_iu *rx_ring[SRP_RQ_SIZE]; - struct srp_request req_ring[SRP_CMD_SQ_SIZE]; + struct srp_iu **tx_ring; + struct srp_iu **rx_ring; + struct srp_request *req_ring; - struct work_struct work; + struct work_struct tl_err_work; + struct work_struct remove_work; struct list_head list; struct completion done; int status; - int qp_in_error; + bool qp_in_error; struct completion tsk_mgmt_done; u8 tsk_mgmt_status; @@ -192,15 +210,66 @@ struct srp_iu { enum dma_data_direction direction; }; +/** + * struct srp_fr_desc - fast registration work request arguments + * @entry: Entry in srp_fr_pool.free_list. + * @mr: Memory region. + * @frpl: Fast registration page list. + */ +struct srp_fr_desc { + struct list_head entry; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; +}; + +/** + * struct srp_fr_pool - pool of fast registration descriptors + * + * An entry is available for allocation if and only if it occurs in @free_list. + * + * @size: Number of descriptors in this pool. + * @max_page_list_len: Maximum fast registration work request page list length. + * @lock: Protects free_list. + * @free_list: List of free descriptors. + * @desc: Fast registration descriptor pool. + */ +struct srp_fr_pool { + int size; + int max_page_list_len; + spinlock_t lock; + struct list_head free_list; + struct srp_fr_desc desc[0]; +}; + +/** + * struct srp_map_state - per-request DMA memory mapping state + * @desc: Pointer to the element of the SRP buffer descriptor array + * that is being filled in. + * @pages: Array with DMA addresses of pages being considered for + * memory registration. + * @base_dma_addr: DMA address of the first page that has not yet been mapped. + * @dma_len: Number of bytes that will be registered with the next + * FMR or FR memory registration call. + * @total_len: Total number of bytes in the sg-list being mapped. + * @npages: Number of page addresses in the pages[] array. + * @nmdesc: Number of FMR or FR memory descriptors used for mapping. + * @ndesc: Number of SRP buffer descriptors that have been filled in. + * @unmapped_sg: First element of the sg-list that is mapped via FMR or FR. + * @unmapped_index: Index of the first element mapped via FMR or FR. + * @unmapped_addr: DMA address of the first element mapped via FMR or FR. + */ struct srp_map_state { - struct ib_pool_fmr **next_fmr; + union { + struct ib_pool_fmr **next_fmr; + struct srp_fr_desc **next_fr; + }; struct srp_direct_buf *desc; u64 *pages; dma_addr_t base_dma_addr; - u32 fmr_len; + u32 dma_len; u32 total_len; unsigned int npages; - unsigned int nfmr; + unsigned int nmdesc; unsigned int ndesc; struct scatterlist *unmapped_sg; int unmapped_index; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index cf23c46185b..fe09f2788b1 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1078,6 +1078,7 @@ static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch, static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, struct srpt_send_ioctx *ioctx) { + struct ib_device *dev = ch->sport->sdev->device; struct se_cmd *cmd; struct scatterlist *sg, *sg_orig; int sg_cnt; @@ -1124,7 +1125,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, db = ioctx->rbufs; tsize = cmd->data_length; - dma_len = sg_dma_len(&sg[0]); + dma_len = ib_sg_dma_len(dev, &sg[0]); riu = ioctx->rdma_ius; /* @@ -1155,7 +1156,8 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, ++j; if (j < count) { sg = sg_next(sg); - dma_len = sg_dma_len(sg); + dma_len = ib_sg_dma_len( + dev, sg); } } } else { @@ -1192,8 +1194,8 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, tsize = cmd->data_length; riu = ioctx->rdma_ius; sg = sg_orig; - dma_len = sg_dma_len(&sg[0]); - dma_addr = sg_dma_address(&sg[0]); + dma_len = ib_sg_dma_len(dev, &sg[0]); + dma_addr = ib_sg_dma_address(dev, &sg[0]); /* this second loop is really mapped sg_addres to rdma_iu->ib_sge */ for (i = 0, j = 0; @@ -1216,8 +1218,10 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, ++j; if (j < count) { sg = sg_next(sg); - dma_len = sg_dma_len(sg); - dma_addr = sg_dma_address(sg); + dma_len = ib_sg_dma_len( + dev, sg); + dma_addr = ib_sg_dma_address( + dev, sg); } } } else { @@ -1269,7 +1273,6 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) return ioctx; BUG_ON(ioctx->ch != ch); - kref_init(&ioctx->kref); spin_lock_init(&ioctx->spinlock); ioctx->state = SRPT_STATE_NEW; ioctx->n_rbuf = 0; @@ -1291,39 +1294,6 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) } /** - * srpt_put_send_ioctx() - Free up resources. - */ -static void srpt_put_send_ioctx(struct srpt_send_ioctx *ioctx) -{ - struct srpt_rdma_ch *ch; - unsigned long flags; - - BUG_ON(!ioctx); - ch = ioctx->ch; - BUG_ON(!ch); - - WARN_ON(srpt_get_cmd_state(ioctx) != SRPT_STATE_DONE); - - srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); - transport_generic_free_cmd(&ioctx->cmd, 0); - - if (ioctx->n_rbuf > 1) { - kfree(ioctx->rbufs); - ioctx->rbufs = NULL; - ioctx->n_rbuf = 0; - } - - spin_lock_irqsave(&ch->spinlock, flags); - list_add(&ioctx->free_list, &ch->free_list); - spin_unlock_irqrestore(&ch->spinlock, flags); -} - -static void srpt_put_send_ioctx_kref(struct kref *kref) -{ - srpt_put_send_ioctx(container_of(kref, struct srpt_send_ioctx, kref)); -} - -/** * srpt_abort_cmd() - Abort a SCSI command. * @ioctx: I/O context associated with the SCSI command. * @context: Preferred execution context. @@ -1359,8 +1329,14 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) } spin_unlock_irqrestore(&ioctx->spinlock, flags); - if (state == SRPT_STATE_DONE) + if (state == SRPT_STATE_DONE) { + struct srpt_rdma_ch *ch = ioctx->ch; + + BUG_ON(ch->sess == NULL); + + target_put_sess_cmd(ch->sess, &ioctx->cmd); goto out; + } pr_debug("Aborting cmd with state %d and tag %lld\n", state, ioctx->tag); @@ -1380,11 +1356,8 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) /* XXX(hch): this is a horrible layering violation.. */ spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); - ioctx->cmd.transport_state |= CMD_T_LUN_STOP; ioctx->cmd.transport_state &= ~CMD_T_ACTIVE; spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); - - complete(&ioctx->cmd.transport_lun_stop_comp); break; case SRPT_STATE_CMD_RSP_SENT: /* @@ -1392,17 +1365,14 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) * not been received in time. */ srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); - spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); - ioctx->cmd.transport_state |= CMD_T_LUN_STOP; - spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); - kref_put(&ioctx->kref, srpt_put_send_ioctx_kref); + target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); break; case SRPT_STATE_MGMT_RSP_SENT: srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); - kref_put(&ioctx->kref, srpt_put_send_ioctx_kref); + target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); break; default: - WARN_ON("ERROR: unexpected command state"); + WARN(1, "Unexpected command state (%d)", state); break; } @@ -1457,11 +1427,13 @@ static void srpt_handle_send_comp(struct srpt_rdma_ch *ch, && state != SRPT_STATE_DONE)) pr_debug("state = %d\n", state); - if (state != SRPT_STATE_DONE) - kref_put(&ioctx->kref, srpt_put_send_ioctx_kref); - else + if (state != SRPT_STATE_DONE) { + srpt_unmap_sg_to_ib_sge(ch, ioctx); + transport_generic_free_cmd(&ioctx->cmd, 0); + } else { printk(KERN_ERR "IB completion has been received too late for" " wr_id = %u.\n", ioctx->ioctx.index); + } } /** @@ -1502,7 +1474,6 @@ static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, { struct se_cmd *cmd; enum srpt_command_state state; - unsigned long flags; cmd = &ioctx->cmd; state = srpt_get_cmd_state(ioctx); @@ -1522,9 +1493,6 @@ static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, __func__, __LINE__, state); break; case SRPT_RDMA_WRITE_LAST: - spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); - ioctx->cmd.transport_state |= CMD_T_LUN_STOP; - spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); break; default: printk(KERN_ERR "%s[%d]: opcode = %u\n", __func__, @@ -1614,7 +1582,7 @@ static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, int resp_data_len; int resp_len; - resp_data_len = (rsp_code == SRP_TSK_MGMT_SUCCESS) ? 0 : 4; + resp_data_len = 4; resp_len = sizeof(*srp_rsp) + resp_data_len; srp_rsp = ioctx->ioctx.buf; @@ -1626,11 +1594,9 @@ static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, + atomic_xchg(&ch->req_lim_delta, 0)); srp_rsp->tag = tag; - if (rsp_code != SRP_TSK_MGMT_SUCCESS) { - srp_rsp->flags |= SRP_RSP_FLAG_RSPVALID; - srp_rsp->resp_data_len = cpu_to_be32(resp_data_len); - srp_rsp->data[3] = rsp_code; - } + srp_rsp->flags |= SRP_RSP_FLAG_RSPVALID; + srp_rsp->resp_data_len = cpu_to_be32(resp_data_len); + srp_rsp->data[3] = rsp_code; return resp_len; } @@ -1712,10 +1678,10 @@ out_err: static int srpt_check_stop_free(struct se_cmd *cmd) { - struct srpt_send_ioctx *ioctx; + struct srpt_send_ioctx *ioctx = container_of(cmd, + struct srpt_send_ioctx, cmd); - ioctx = container_of(cmd, struct srpt_send_ioctx, cmd); - return kref_put(&ioctx->kref, srpt_put_send_ioctx_kref); + return target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); } /** @@ -1730,12 +1696,12 @@ static int srpt_handle_cmd(struct srpt_rdma_ch *ch, uint64_t unpacked_lun; u64 data_len; enum dma_data_direction dir; - int ret; + sense_reason_t ret; + int rc; BUG_ON(!send_ioctx); srp_cmd = recv_ioctx->ioctx.buf; - kref_get(&send_ioctx->kref); cmd = &send_ioctx->cmd; send_ioctx->tag = srp_cmd->tag; @@ -1755,40 +1721,26 @@ static int srpt_handle_cmd(struct srpt_rdma_ch *ch, break; } - ret = srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len); - if (ret) { + if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) { printk(KERN_ERR "0x%llx: parsing SRP descriptor table failed.\n", srp_cmd->tag); - cmd->se_cmd_flags |= SCF_SCSI_CDB_EXCEPTION; - cmd->scsi_sense_reason = TCM_INVALID_CDB_FIELD; - kref_put(&send_ioctx->kref, srpt_put_send_ioctx_kref); + ret = TCM_INVALID_CDB_FIELD; goto send_sense; } - cmd->data_length = data_len; - cmd->data_direction = dir; unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_cmd->lun, sizeof(srp_cmd->lun)); - if (transport_lookup_cmd_lun(cmd, unpacked_lun) < 0) { - kref_put(&send_ioctx->kref, srpt_put_send_ioctx_kref); + rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb, + &send_ioctx->sense_data[0], unpacked_lun, data_len, + MSG_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF); + if (rc != 0) { + ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto send_sense; } - ret = target_setup_cmd_from_cdb(cmd, srp_cmd->cdb); - if (ret < 0) { - kref_put(&send_ioctx->kref, srpt_put_send_ioctx_kref); - if (cmd->se_cmd_flags & SCF_SCSI_RESERVATION_CONFLICT) { - srpt_queue_status(cmd); - return 0; - } else - goto send_sense; - } - - transport_handle_cdb_direct(cmd); return 0; send_sense: - transport_send_check_condition_and_sense(cmd, cmd->scsi_sense_reason, - 0); + transport_send_check_condition_and_sense(cmd, ret, 0); return -1; } @@ -1865,9 +1817,11 @@ static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, { struct srp_tsk_mgmt *srp_tsk; struct se_cmd *cmd; + struct se_session *sess = ch->sess; uint64_t unpacked_lun; + uint32_t tag = 0; int tcm_tmr; - int res; + int rc; BUG_ON(!send_ioctx); @@ -1882,39 +1836,32 @@ static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, send_ioctx->tag = srp_tsk->tag; tcm_tmr = srp_tmr_to_tcm(srp_tsk->tsk_mgmt_func); if (tcm_tmr < 0) { - send_ioctx->cmd.se_cmd_flags |= SCF_SCSI_CDB_EXCEPTION; send_ioctx->cmd.se_tmr_req->response = TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED; - goto process_tmr; - } - res = core_tmr_alloc_req(cmd, NULL, tcm_tmr, GFP_KERNEL); - if (res < 0) { - send_ioctx->cmd.se_cmd_flags |= SCF_SCSI_CDB_EXCEPTION; - send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED; - goto process_tmr; + goto fail; } - unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_tsk->lun, sizeof(srp_tsk->lun)); - res = transport_lookup_tmr_lun(&send_ioctx->cmd, unpacked_lun); - if (res) { - pr_debug("rejecting TMR for LUN %lld\n", unpacked_lun); - send_ioctx->cmd.se_cmd_flags |= SCF_SCSI_CDB_EXCEPTION; - send_ioctx->cmd.se_tmr_req->response = TMR_LUN_DOES_NOT_EXIST; - goto process_tmr; - } - - if (srp_tsk->tsk_mgmt_func == SRP_TSK_ABORT_TASK) - srpt_rx_mgmt_fn_tag(send_ioctx, srp_tsk->task_tag); - -process_tmr: - kref_get(&send_ioctx->kref); - if (!(send_ioctx->cmd.se_cmd_flags & SCF_SCSI_CDB_EXCEPTION)) - transport_generic_handle_tmr(&send_ioctx->cmd); - else - transport_send_check_condition_and_sense(cmd, - cmd->scsi_sense_reason, 0); + if (srp_tsk->tsk_mgmt_func == SRP_TSK_ABORT_TASK) { + rc = srpt_rx_mgmt_fn_tag(send_ioctx, srp_tsk->task_tag); + if (rc < 0) { + send_ioctx->cmd.se_tmr_req->response = + TMR_TASK_DOES_NOT_EXIST; + goto fail; + } + tag = srp_tsk->task_tag; + } + rc = target_submit_tmr(&send_ioctx->cmd, sess, NULL, unpacked_lun, + srp_tsk, tcm_tmr, GFP_KERNEL, tag, + TARGET_SCF_ACK_KREF); + if (rc != 0) { + send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED; + goto fail; + } + return; +fail: + transport_send_check_condition_and_sense(cmd, 0, 0); // XXX: } /** @@ -1956,10 +1903,6 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, } } - transport_init_se_cmd(&send_ioctx->cmd, &srpt_target->tf_ops, ch->sess, - 0, DMA_NONE, MSG_SIMPLE_TAG, - send_ioctx->sense_data); - switch (srp_cmd->opcode) { case SRP_CMD: srpt_handle_cmd(ch, recv_ioctx, send_ioctx); @@ -2276,6 +2219,27 @@ static void srpt_close_ch(struct srpt_rdma_ch *ch) } /** + * srpt_shutdown_session() - Whether or not a session may be shut down. + */ +static int srpt_shutdown_session(struct se_session *se_sess) +{ + struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; + unsigned long flags; + + spin_lock_irqsave(&ch->spinlock, flags); + if (ch->in_shutdown) { + spin_unlock_irqrestore(&ch->spinlock, flags); + return true; + } + + ch->in_shutdown = true; + target_sess_cmd_list_set_waiting(se_sess); + spin_unlock_irqrestore(&ch->spinlock, flags); + + return true; +} + +/** * srpt_drain_channel() - Drain a channel by resetting the IB queue pair. * @cm_id: Pointer to the CM ID of the channel to be drained. * @@ -2313,6 +2277,9 @@ static void srpt_drain_channel(struct ib_cm_id *cm_id) spin_unlock_irq(&sdev->spinlock); if (do_reset) { + if (ch->sess) + srpt_shutdown_session(ch->sess); + ret = srpt_ch_qp_err(ch); if (ret < 0) printk(KERN_ERR "Setting queue pair in error state" @@ -2365,6 +2332,7 @@ static void srpt_release_channel_work(struct work_struct *w) { struct srpt_rdma_ch *ch; struct srpt_device *sdev; + struct se_session *se_sess; ch = container_of(w, struct srpt_rdma_ch, release_work); pr_debug("ch = %p; ch->sess = %p; release_done = %p\n", ch, ch->sess, @@ -2373,10 +2341,17 @@ static void srpt_release_channel_work(struct work_struct *w) sdev = ch->sport->sdev; BUG_ON(!sdev); - transport_deregister_session_configfs(ch->sess); - transport_deregister_session(ch->sess); + se_sess = ch->sess; + BUG_ON(!se_sess); + + target_wait_for_sess_cmds(se_sess); + + transport_deregister_session_configfs(se_sess); + transport_deregister_session(se_sess); ch->sess = NULL; + ib_destroy_cm_id(ch->cm_id); + srpt_destroy_ch_ib(ch); srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, @@ -2387,8 +2362,6 @@ static void srpt_release_channel_work(struct work_struct *w) list_del(&ch->list); spin_unlock_irq(&sdev->spinlock); - ib_destroy_cm_id(ch->cm_id); - if (ch->release_done) complete(ch->release_done); @@ -2611,7 +2584,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto destroy_ib; } - ch->sess = transport_init_session(); + ch->sess = transport_init_session(TARGET_PROT_NORMAL); if (IS_ERR(ch->sess)) { rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); @@ -3030,7 +3003,7 @@ static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) * Callback function called by the TCM core. Must not block since it can be * invoked on the context of the IB completion handler. */ -static int srpt_queue_response(struct se_cmd *cmd) +static void srpt_queue_response(struct se_cmd *cmd) { struct srpt_rdma_ch *ch; struct srpt_send_ioctx *ioctx; @@ -3041,8 +3014,6 @@ static int srpt_queue_response(struct se_cmd *cmd) int resp_len; u8 srp_tm_status; - ret = 0; - ioctx = container_of(cmd, struct srpt_send_ioctx, cmd); ch = ioctx->ch; BUG_ON(!ch); @@ -3068,7 +3039,7 @@ static int srpt_queue_response(struct se_cmd *cmd) || WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) { atomic_inc(&ch->req_lim_delta); srpt_abort_cmd(ioctx); - goto out; + return; } dir = ioctx->cmd.data_direction; @@ -3080,7 +3051,7 @@ static int srpt_queue_response(struct se_cmd *cmd) if (ret) { printk(KERN_ERR "xfer_data failed for tag %llu\n", ioctx->tag); - goto out; + return; } } @@ -3099,11 +3070,27 @@ static int srpt_queue_response(struct se_cmd *cmd) ioctx->tag); srpt_unmap_sg_to_ib_sge(ch, ioctx); srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); - kref_put(&ioctx->kref, srpt_put_send_ioctx_kref); + target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); } +} -out: - return ret; +static int srpt_queue_data_in(struct se_cmd *cmd) +{ + srpt_queue_response(cmd); + return 0; +} + +static void srpt_queue_tm_rsp(struct se_cmd *cmd) +{ + srpt_queue_response(cmd); +} + +static void srpt_aborted_task(struct se_cmd *cmd) +{ + struct srpt_send_ioctx *ioctx = container_of(cmd, + struct srpt_send_ioctx, cmd); + + srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); } static int srpt_queue_status(struct se_cmd *cmd) @@ -3116,7 +3103,8 @@ static int srpt_queue_status(struct se_cmd *cmd) (SCF_TRANSPORT_TASK_SENSE | SCF_EMULATED_TASK_SENSE)) WARN_ON(cmd->scsi_status != SAM_STAT_CHECK_CONDITION); ioctx->queue_status_only = true; - return srpt_queue_response(cmd); + srpt_queue_response(cmd); + return 0; } static void srpt_refresh_port_work(struct work_struct *work) @@ -3490,14 +3478,23 @@ static u32 srpt_tpg_get_inst_index(struct se_portal_group *se_tpg) static void srpt_release_cmd(struct se_cmd *se_cmd) { -} + struct srpt_send_ioctx *ioctx = container_of(se_cmd, + struct srpt_send_ioctx, cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + unsigned long flags; -/** - * srpt_shutdown_session() - Whether or not a session may be shut down. - */ -static int srpt_shutdown_session(struct se_session *se_sess) -{ - return true; + WARN_ON(ioctx->state != SRPT_STATE_DONE); + WARN_ON(ioctx->mapped_sg_count != 0); + + if (ioctx->n_rbuf > 1) { + kfree(ioctx->rbufs); + ioctx->rbufs = NULL; + ioctx->n_rbuf = 0; + } + + spin_lock_irqsave(&ch->spinlock, flags); + list_add(&ioctx->free_list, &ch->free_list); + spin_unlock_irqrestore(&ch->spinlock, flags); } /** @@ -3681,9 +3678,9 @@ static ssize_t srpt_tpg_attrib_store_srp_max_rdma_size( unsigned long val; int ret; - ret = strict_strtoul(page, 0, &val); + ret = kstrtoul(page, 0, &val); if (ret < 0) { - pr_err("strict_strtoul() failed with ret: %d\n", ret); + pr_err("kstrtoul() failed with ret: %d\n", ret); return -EINVAL; } if (val > MAX_SRPT_RDMA_SIZE) { @@ -3721,9 +3718,9 @@ static ssize_t srpt_tpg_attrib_store_srp_max_rsp_size( unsigned long val; int ret; - ret = strict_strtoul(page, 0, &val); + ret = kstrtoul(page, 0, &val); if (ret < 0) { - pr_err("strict_strtoul() failed with ret: %d\n", ret); + pr_err("kstrtoul() failed with ret: %d\n", ret); return -EINVAL; } if (val > MAX_SRPT_RSP_SIZE) { @@ -3761,9 +3758,9 @@ static ssize_t srpt_tpg_attrib_store_srp_sq_size( unsigned long val; int ret; - ret = strict_strtoul(page, 0, &val); + ret = kstrtoul(page, 0, &val); if (ret < 0) { - pr_err("strict_strtoul() failed with ret: %d\n", ret); + pr_err("kstrtoul() failed with ret: %d\n", ret); return -EINVAL; } if (val > MAX_SRPT_SRQ_SIZE) { @@ -3808,7 +3805,7 @@ static ssize_t srpt_tpg_store_enable( unsigned long tmp; int ret; - ret = strict_strtoul(page, 0, &tmp); + ret = kstrtoul(page, 0, &tmp); if (ret < 0) { printk(KERN_ERR "Unable to extract srpt_tpg_store_enable\n"); return -EINVAL; @@ -3940,9 +3937,10 @@ static struct target_core_fabric_ops srpt_template = { .set_default_node_attributes = srpt_set_default_node_attrs, .get_task_tag = srpt_get_task_tag, .get_cmd_state = srpt_get_tcm_cmd_state, - .queue_data_in = srpt_queue_response, + .queue_data_in = srpt_queue_data_in, .queue_status = srpt_queue_status, - .queue_tm_rsp = srpt_queue_response, + .queue_tm_rsp = srpt_queue_tm_rsp, + .aborted_task = srpt_aborted_task, /* * Setup function pointers for generic logic in * target_core_fabric_configfs.c diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 61e52b83081..3dae156905d 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -228,7 +228,6 @@ struct srpt_recv_ioctx { struct srpt_send_ioctx { struct srpt_ioctx ioctx; struct srpt_rdma_ch *ch; - struct kref kref; struct rdma_iu *rdma_ius; struct srp_direct_buf *rbufs; struct srp_direct_buf single_rbuf; @@ -326,6 +325,7 @@ struct srpt_rdma_ch { u8 sess_name[36]; struct work_struct release_work; struct completion *release_done; + bool in_shutdown; }; /** |
