diff options
Diffstat (limited to 'drivers/infiniband')
169 files changed, 18110 insertions, 3972 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 5ceda710f51..77089399359 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -3,6 +3,8 @@ menuconfig INFINIBAND depends on PCI || BROKEN depends on HAS_IOMEM depends on NET + depends on INET + depends on m || IPV6 != m ---help--- Core support for InfiniBand (IB). Make sure to also select any protocols you wish to use as well as drivers for your @@ -38,8 +40,7 @@ config INFINIBAND_USER_MEM config INFINIBAND_ADDR_TRANS bool - depends on INET - depends on !(INFINIBAND = y && IPV6 = m) + depends on INFINIBAND default y source "drivers/infiniband/hw/mthca/Kconfig" @@ -53,6 +54,7 @@ source "drivers/infiniband/hw/mlx4/Kconfig" source "drivers/infiniband/hw/mlx5/Kconfig" source "drivers/infiniband/hw/nes/Kconfig" source "drivers/infiniband/hw/ocrdma/Kconfig" +source "drivers/infiniband/hw/usnic/Kconfig" source "drivers/infiniband/ulp/ipoib/Kconfig" diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile index 1fe69888515..dc21836b5a8 100644 --- a/drivers/infiniband/Makefile +++ b/drivers/infiniband/Makefile @@ -1,17 +1,3 @@ obj-$(CONFIG_INFINIBAND) += core/ -obj-$(CONFIG_INFINIBAND_MTHCA) += hw/mthca/ -obj-$(CONFIG_INFINIBAND_IPATH) += hw/ipath/ -obj-$(CONFIG_INFINIBAND_QIB) += hw/qib/ -obj-$(CONFIG_INFINIBAND_EHCA) += hw/ehca/ -obj-$(CONFIG_INFINIBAND_AMSO1100) += hw/amso1100/ -obj-$(CONFIG_INFINIBAND_CXGB3) += hw/cxgb3/ -obj-$(CONFIG_INFINIBAND_CXGB4) += hw/cxgb4/ -obj-$(CONFIG_MLX4_INFINIBAND) += hw/mlx4/ -obj-$(CONFIG_MLX5_INFINIBAND) += hw/mlx5/ -obj-$(CONFIG_INFINIBAND_NES) += hw/nes/ -obj-$(CONFIG_INFINIBAND_OCRDMA) += hw/ocrdma/ -obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/ -obj-$(CONFIG_INFINIBAND_SRP) += ulp/srp/ -obj-$(CONFIG_INFINIBAND_SRPT) += ulp/srpt/ -obj-$(CONFIG_INFINIBAND_ISER) += ulp/iser/ -obj-$(CONFIG_INFINIBAND_ISERT) += ulp/isert/ +obj-$(CONFIG_INFINIBAND) += hw/ +obj-$(CONFIG_INFINIBAND) += ulp/ diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index c8bbaef1bec..ffd0af6734a 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -1,8 +1,9 @@ -infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o rdma_cm.o +infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \ - ib_cm.o iw_cm.o $(infiniband-y) + ib_cm.o iw_cm.o ib_addr.o \ + $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ $(user_access-y) @@ -17,7 +18,7 @@ ib_sa-y := sa_query.o multicast.o ib_cm-y := cm.o -iw_cm-y := iwcm.o +iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o rdma_cm-y := cma.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index e90f2b2eabd..8172d37f9ad 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -86,6 +86,8 @@ int rdma_addr_size(struct sockaddr *addr) } EXPORT_SYMBOL(rdma_addr_size); +static struct rdma_addr_client self; + void rdma_addr_register_client(struct rdma_addr_client *client) { atomic_set(&client->refcount, 1); @@ -119,7 +121,8 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, } EXPORT_SYMBOL(rdma_copy_addr); -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, + u16 *vlan_id) { struct net_device *dev; int ret = -EADDRNOTAVAIL; @@ -142,6 +145,8 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) return ret; ret = rdma_copy_addr(dev_addr, dev, NULL); + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); break; @@ -153,6 +158,8 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) &((struct sockaddr_in6 *) addr)->sin6_addr, dev, 1)) { ret = rdma_copy_addr(dev_addr, dev, NULL); + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); break; } } @@ -238,7 +245,7 @@ static int addr4_resolve(struct sockaddr_in *src_in, src_in->sin_addr.s_addr = fl4.saddr; if (rt->dst.dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *) dst_in, addr); + ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); if (!ret) memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); goto put; @@ -286,7 +293,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, } if (dst->dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *) dst_in, addr); + ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); if (!ret) memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); goto put; @@ -437,6 +444,88 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) } EXPORT_SYMBOL(rdma_addr_cancel); +struct resolve_cb_context { + struct rdma_dev_addr *addr; + struct completion comp; +}; + +static void resolve_cb(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context) +{ + memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct + rdma_dev_addr)); + complete(&((struct resolve_cb_context *)context)->comp); +} + +int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac, + u16 *vlan_id) +{ + int ret = 0; + struct rdma_dev_addr dev_addr; + struct resolve_cb_context ctx; + struct net_device *dev; + + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + + ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid); + if (ret) + return ret; + + ret = rdma_gid2ip(&dgid_addr._sockaddr, dgid); + if (ret) + return ret; + + memset(&dev_addr, 0, sizeof(dev_addr)); + + ctx.addr = &dev_addr; + init_completion(&ctx.comp); + ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, + &dev_addr, 1000, resolve_cb, &ctx); + if (ret) + return ret; + + wait_for_completion(&ctx.comp); + + memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); + dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); + if (!dev) + return -ENODEV; + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); + dev_put(dev); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); + +int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) +{ + int ret = 0; + struct rdma_dev_addr dev_addr; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } gid_addr; + + ret = rdma_gid2ip(&gid_addr._sockaddr, sgid); + + if (ret) + return ret; + memset(&dev_addr, 0, sizeof(dev_addr)); + ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); + if (ret) + return ret; + + memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); + static int netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { @@ -461,11 +550,13 @@ static int __init addr_init(void) return -ENOMEM; register_netevent_notifier(&nb); + rdma_addr_register_client(&self); return 0; } static void __exit addr_cleanup(void) { + rdma_addr_unregister_client(&self); unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); } diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 784b97cb05b..c3239170d8b 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -47,6 +47,7 @@ #include <linux/sysfs.h> #include <linux/workqueue.h> #include <linux/kdev_t.h> +#include <linux/etherdevice.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> @@ -177,6 +178,8 @@ struct cm_av { struct ib_ah_attr ah_attr; u16 pkey_index; u8 timeout; + u8 valid; + u8 smac[ETH_ALEN]; }; struct cm_work { @@ -376,6 +379,9 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, &av->ah_attr); av->timeout = path->packet_life_time + 1; + memcpy(av->smac, path->smac, sizeof(av->smac)); + + av->valid = 1; return 0; } @@ -383,14 +389,11 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv) { unsigned long flags; int id; - static int next_id; idr_preload(GFP_KERNEL); spin_lock_irqsave(&cm.lock, flags); - id = idr_alloc(&cm.local_id_table, cm_id_priv, next_id, 0, GFP_NOWAIT); - if (id >= 0) - next_id = max(id + 1, 0); + id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); spin_unlock_irqrestore(&cm.lock, flags); idr_preload_end(); @@ -1557,6 +1560,9 @@ static int cm_req_handler(struct cm_work *work) cm_process_routed_req(req_msg, work->mad_recv_wc->wc); cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); + + memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); + work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id; ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); if (ret) { ib_get_cached_gid(work->port->cm_dev->ib_device, @@ -3503,6 +3509,32 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN; qp_attr->ah_attr = cm_id_priv->av.ah_attr; + if (!cm_id_priv->av.valid) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return -EINVAL; + } + if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) { + qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id; + *qp_attr_mask |= IB_QP_VID; + } + if (!is_zero_ether_addr(cm_id_priv->av.smac)) { + memcpy(qp_attr->smac, cm_id_priv->av.smac, + sizeof(qp_attr->smac)); + *qp_attr_mask |= IB_QP_SMAC; + } + if (cm_id_priv->alt_av.valid) { + if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) { + qp_attr->alt_vlan_id = + cm_id_priv->alt_av.ah_attr.vlan_id; + *qp_attr_mask |= IB_QP_ALT_VID; + } + if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) { + memcpy(qp_attr->alt_smac, + cm_id_priv->alt_av.smac, + sizeof(qp_attr->alt_smac)); + *qp_attr_mask |= IB_QP_ALT_SMAC; + } + } qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index dab4b41f171..d570030d899 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -328,28 +328,6 @@ static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) return ret; } -static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) -{ - int i; - int err; - struct ib_port_attr props; - union ib_gid tmp; - - err = ib_query_port(device, port_num, &props); - if (err) - return err; - - for (i = 0; i < props.gid_tbl_len; ++i) { - err = ib_query_gid(device, port_num, i, &tmp); - if (err) - return err; - if (!memcmp(&tmp, gid, sizeof tmp)) - return 0; - } - - return -EADDRNOTAVAIL; -} - static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; @@ -362,7 +340,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a int ret; if (addr->sa_family != AF_IB) { - ret = rdma_translate_ip(addr, dev_addr); + ret = rdma_translate_ip(addr, dev_addr, NULL); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; @@ -371,13 +349,14 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a return ret; } -static int cma_acquire_dev(struct rdma_id_private *id_priv) +static int cma_acquire_dev(struct rdma_id_private *id_priv, + struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; union ib_gid gid, iboe_gid; int ret = -ENODEV; - u8 port; + u8 port, found_port; enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; @@ -386,20 +365,44 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) return -EINVAL; mutex_lock(&lock); - iboe_addr_get_sgid(dev_addr, &iboe_gid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &iboe_gid); + memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); + if (listen_id_priv && + rdma_port_get_link_layer(listen_id_priv->id.device, + listen_id_priv->id.port_num) == dev_ll) { + cma_dev = listen_id_priv->cma_dev; + port = listen_id_priv->id.port_num; + if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) + ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, + &found_port, NULL); + else + ret = ib_find_cached_gid(cma_dev->device, &gid, + &found_port, NULL); + + if (!ret && (port == found_port)) { + id_priv->id.port_num = found_port; + goto out; + } + } list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { + if (listen_id_priv && + listen_id_priv->cma_dev == cma_dev && + listen_id_priv->id.port_num == port) + continue; if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = find_gid_port(cma_dev->device, &iboe_gid, port); + ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL); else - ret = find_gid_port(cma_dev->device, &gid, port); + ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL); - if (!ret) { - id_priv->id.port_num = port; + if (!ret && (port == found_port)) { + id_priv->id.port_num = found_port; goto out; } } @@ -602,6 +605,7 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; + union ib_gid sgid; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { @@ -624,6 +628,20 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, if (ret) goto out; + ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, + qp_attr.ah_attr.grh.sgid_index, &sgid); + if (ret) + goto out; + + if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) + == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) + == IB_LINK_LAYER_ETHERNET) { + ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL); + + if (ret) + goto out; + } if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); @@ -724,6 +742,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); + if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; break; @@ -1292,7 +1311,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); - ret = cma_acquire_dev(conn_id); + ret = cma_acquire_dev(conn_id, listen_id); if (ret) goto err2; @@ -1308,13 +1327,13 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) goto err3; - /* * Acquire mutex to prevent user executing rdma_destroy_id() * while we're accessing the cm_id. */ mutex_lock(&lock); - if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) + if (cma_comp(conn_id, RDMA_CM_CONNECT) && + (conn_id->id.qp_type != IB_QPT_UD)) ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); mutex_unlock(&lock); mutex_unlock(&conn_id->handler_mutex); @@ -1451,7 +1470,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; - struct net_device *dev = NULL; struct rdma_cm_event event; int ret; struct ib_device_attr attr; @@ -1474,14 +1492,14 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; - ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } - ret = cma_acquire_dev(conn_id); + ret = cma_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); @@ -1529,8 +1547,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, cma_deref_id(conn_id); out: - if (dev) - dev_put(dev); mutex_unlock(&listen_id->handler_mutex); return ret; } @@ -1848,6 +1864,26 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) return 0; } +static int iboe_tos_to_sl(struct net_device *ndev, int tos) +{ + int prio; + struct net_device *dev; + + prio = rt_tos2priority(tos); + dev = ndev->priv_flags & IFF_802_1Q_VLAN ? + vlan_dev_real_dev(ndev) : ndev; + + if (dev->num_tc) + return netdev_get_prio_tc_map(dev, prio); + +#if IS_ENABLED(CONFIG_VLAN_8021Q) + if (ndev->priv_flags & IFF_802_1Q_VLAN) + return (vlan_dev_get_egress_qos_mask(ndev, prio) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +#endif + return 0; +} + static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; @@ -1855,7 +1891,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) struct cma_work *work; int ret; struct net_device *ndev = NULL; - u16 vid; + work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) @@ -1879,20 +1915,20 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } - vid = rdma_vlan_dev_vlan_id(ndev); + route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev); + memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); + memcpy(route->path_rec->smac, ndev->dev_addr, ndev->addr_len); - iboe_mac_vlan_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr, vid); - iboe_mac_vlan_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr, vid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &route->path_rec->sgid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, + &route->path_rec->dgid); route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; - route->path_rec->sl = netdev_get_prio_tc_map( - ndev->priv_flags & IFF_802_1Q_VLAN ? - vlan_dev_real_dev(ndev) : ndev, - rt_tos2priority(id_priv->tos)); - + route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos); route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); @@ -2049,8 +2085,9 @@ static void addr_handler(int status, struct sockaddr *src_addr, RDMA_CM_ADDR_RESOLVED)) goto out; + memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) - status = cma_acquire_dev(id_priv); + status = cma_acquire_dev(id_priv, NULL); if (status) { if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, @@ -2058,10 +2095,8 @@ static void addr_handler(int status, struct sockaddr *src_addr, goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; - } else { - memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); + } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; - } if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); @@ -2294,9 +2329,9 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) int low, high, remaining; unsigned int rover; - inet_get_local_port_range(&low, &high); + inet_get_local_port_range(&init_net, &low, &high); remaining = (high - low) + 1; - rover = net_random() % remaining + low; + rover = prandom_u32() % remaining + low; retry: if (last_used_port != rover && !idr_find(ps, (unsigned short) rover)) { @@ -2466,8 +2501,11 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, return 0; sin6 = (struct sockaddr_in6 *) addr; - if ((ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) && - !sin6->sin6_scope_id) + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return 0; + + if (!sin6->sin6_scope_id) return -EINVAL; dev_addr->bound_dev_if = sin6->sin6_scope_id; @@ -2542,17 +2580,17 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err1; + memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { ret = cma_translate_addr(addr, &id->route.addr.dev_addr); if (ret) goto err1; - ret = cma_acquire_dev(id_priv); + ret = cma_acquire_dev(id_priv, NULL); if (ret) goto err1; } - memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; @@ -3281,7 +3319,8 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, err = -EINVAL; goto out2; } - iboe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &mc->multicast.ib->rec.port_gid); work->id = id_priv; work->mc = mc; INIT_WORK(&work->work, iboe_mcast_work_handler); @@ -3568,7 +3607,8 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, sizeof *id_stats, RDMA_NL_RDMA_CM, - RDMA_NL_RDMA_CM_ID_STATS); + RDMA_NL_RDMA_CM_ID_STATS, + NLM_F_MULTI); if (!id_stats) goto out; diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index a565af5c2d2..87d1936f5c1 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -49,4 +49,6 @@ void ib_sysfs_cleanup(void); int ib_cache_setup(void); void ib_cache_cleanup(void); +int ib_resolve_eth_l2_attrs(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index c47c2034ca7..3d2e489ab73 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -181,9 +181,16 @@ static void add_ref(struct iw_cm_id *cm_id) static void rem_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; + int cb_destroy; + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); - if (iwcm_deref_id(cm_id_priv) && - test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)) { + + /* + * Test bit before deref in case the cm_id gets freed on another + * thread. + */ + cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); + if (iwcm_deref_id(cm_id_priv) && cb_destroy) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); } @@ -327,7 +334,6 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; unsigned long flags; - int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* @@ -343,7 +349,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); /* destroy the listening endpoint */ - ret = cm_id->device->iwcm->destroy_listen(cm_id); + cm_id->device->iwcm->destroy_listen(cm_id); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_ESTABLISHED: diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c new file mode 100644 index 00000000000..b85ddbc979e --- /dev/null +++ b/drivers/infiniband/core/iwpm_msg.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +static const char iwpm_ulib_name[] = "iWarpPortMapperUser"; +static int iwpm_ulib_version = 3; +static int iwpm_user_pid = IWPM_PID_UNDEFINED; +static atomic_t echo_nlmsg_seq; + +int iwpm_valid_pid(void) +{ + return iwpm_user_pid > 0; +} +EXPORT_SYMBOL(iwpm_valid_pid); + +/* + * iwpm_register_pid - Send a netlink query to user space + * for the iwarp port mapper pid + * + * nlmsg attributes: + * [IWPM_NLA_REG_PID_SEQ] + * [IWPM_NLA_REG_IF_NAME] + * [IWPM_NLA_REG_IBDEV_NAME] + * [IWPM_NLA_REG_ULIB_NAME] + */ +int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto pid_query_error; + } + if (iwpm_registered_client(nl_client)) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto pid_query_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto pid_query_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the pid request message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_IFNAME_SIZE, + pm_msg->if_name, IWPM_NLA_REG_IF_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE, + pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE, + (char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME); + if (ret) + goto pid_query_error; + + pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", + __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); + + ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_set_registered(nl_client, 1); + iwpm_user_pid = IWPM_PID_UNAVAILABLE; + err_str = "Unable to send a nlmsg"; + goto pid_query_error; + } + nlmsg_request->req_buffer = pm_msg; + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +pid_query_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_register_pid); + +/* + * iwpm_add_mapping - Send a netlink add mapping message + * to the port mapper + * nlmsg attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + */ +int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto add_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto add_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto add_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto add_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + /* fill in the add mapping message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto add_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto add_mapping_error; + nlmsg_request->req_buffer = pm_msg; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto add_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +add_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_add_mapping); + +/* + * iwpm_add_and_query_mapping - Send a netlink add and query + * mapping message to the port mapper + * nlmsg attributes: + * [IWPM_NLA_QUERY_MAPPING_SEQ] + * [IWPM_NLA_QUERY_LOCAL_ADDR] + * [IWPM_NLA_QUERY_REMOTE_ADDR] + */ +int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto query_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto query_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + ret = -ENOMEM; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto query_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, + nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto query_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the query message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_QUERY_MAPPING_SEQ); + if (ret) + goto query_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR); + if (ret) + goto query_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR); + if (ret) + goto query_mapping_error; + nlmsg_request->req_buffer = pm_msg; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + err_str = "Unable to send a nlmsg"; + goto query_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +query_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_add_and_query_mapping); + +/* + * iwpm_remove_mapping - Send a netlink remove mapping message + * to the port mapper + * nlmsg attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + */ +int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto remove_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto remove_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to create a nlmsg"; + goto remove_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto remove_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + local_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto remove_mapping_error; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto remove_mapping_error; + } + iwpm_print_sockaddr(local_addr, + "remove_mapping: Local sockaddr:"); + return 0; +remove_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb_any(skb); + return ret; +} +EXPORT_SYMBOL(iwpm_remove_mapping); + +/* netlink attribute policy for the received response to register pid request */ +static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { + [IWPM_NLA_RREG_PID_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RREG_IBDEV_NAME] = { .type = NLA_STRING, + .len = IWPM_DEVNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_VER] = { .type = NLA_U16 }, + [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_register_pid_cb - Process a port mapper response to + * iwpm_register_pid() + */ +int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX]; + struct iwpm_dev_data *pm_msg; + char *dev_name, *iwpm_name; + u32 msg_seq; + u8 nl_client; + u16 iwpm_version; + const char *msg_type = "Register Pid response"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX, + resp_reg_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + nl_client = nlmsg_request->nl_client; + dev_name = (char *)nla_data(nltb[IWPM_NLA_RREG_IBDEV_NAME]); + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_RREG_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]); + + /* check device name, ulib name and version */ + if (strcmp(pm_msg->dev_name, dev_name) || + strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version != iwpm_ulib_version) { + + pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n", + __func__, dev_name, iwpm_name, iwpm_version); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto register_pid_response_exit; + } + iwpm_user_pid = cb->nlh->nlmsg_pid; + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_user_pid); + if (iwpm_valid_client(nl_client)) + iwpm_set_registered(nl_client, 1); +register_pid_response_exit: + nlmsg_request->request_done = 1; + /* always for found nlmsg_request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_register_pid_cb); + +/* netlink attribute policy for the received response to add mapping request */ +static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { + [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_add_mapping_cb - Process a port mapper response to + * iwpm_add_mapping() + */ +int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr; + struct sockaddr_storage *mapped_sockaddr; + const char *msg_type; + u32 msg_seq; + + msg_type = "Add Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX, + resp_add_policy, nltb, msg_type)) + return -EINVAL; + + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + mapped_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]); + + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr, + sizeof(*mapped_sockaddr)); + iwpm_print_sockaddr(&pm_msg->loc_addr, + "add_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "add_mapping: Mapped local sockaddr:"); + +add_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_add_mapping_cb); + +/* netlink attribute policy for the response to add and query mapping request */ +static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { + [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_add_and_query_mapping_cb - Process a port mapper response to + * iwpm_add_and_query_mapping() + */ +int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr, *remote_sockaddr; + struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr; + const char *msg_type; + u32 msg_seq; + u16 err_code; + + msg_type = "Query Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX, + resp_query_policy, nltb, msg_type)) + return -EINVAL; + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + remote_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + mapped_loc_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); + mapped_rem_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]); + + err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]); + if (err_code == IWPM_REMOTE_QUERY_REJECT) { + pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n", + __func__, cb->nlh->nlmsg_pid, msg_seq); + nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT; + } + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) || + iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) { + pr_info("%s: Incorrect local sockaddr\n", __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family || + mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr, + sizeof(*mapped_loc_sockaddr)); + memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr, + sizeof(*mapped_rem_sockaddr)); + + iwpm_print_sockaddr(&pm_msg->loc_addr, + "query_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "query_mapping: Mapped local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->rem_addr, + "query_mapping: Remote sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_rem_addr, + "query_mapping: Mapped remote sockaddr:"); +query_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb); + +/* netlink attribute policy for the received request for mapping info */ +static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { + [IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 } +}; + +/* + * iwpm_mapping_info_cb - Process a port mapper request for mapping info + */ +int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX]; + const char *msg_type = "Mapping Info response"; + int iwpm_pid; + u8 nl_client; + char *iwpm_name; + u16 iwpm_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX, + resp_mapinfo_policy, nltb, msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); + if (strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version != iwpm_ulib_version) { + pr_info("%s: Invalid port mapper name = %s version = %d\n", + __func__, iwpm_name, iwpm_version); + return ret; + } + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + if (!iwpm_valid_client(nl_client)) { + pr_info("%s: Invalid port mapper client = %d\n", + __func__, nl_client); + return ret; + } + iwpm_set_registered(nl_client, 0); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + if (!iwpm_mapinfo_available()) + return 0; + iwpm_pid = cb->nlh->nlmsg_pid; + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_pid); + ret = iwpm_send_mapinfo(nl_client, iwpm_pid); + return ret; +} +EXPORT_SYMBOL(iwpm_mapping_info_cb); + +/* netlink attribute policy for the received mapping info ack */ +static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { + [IWPM_NLA_MAPINFO_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 } +}; + +/* + * iwpm_ack_mapping_info_cb - Process a port mapper ack for + * the provided mapping info records + */ +int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_NUM_MAX]; + u32 mapinfo_send, mapinfo_ack; + const char *msg_type = "Mapping Info Ack"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX, + ack_mapinfo_policy, nltb, msg_type)) + return -EINVAL; + mapinfo_send = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]); + mapinfo_ack = nla_get_u32(nltb[IWPM_NLA_MAPINFO_ACK_NUM]); + if (mapinfo_ack != mapinfo_send) + pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n", + __func__, mapinfo_send, mapinfo_ack); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + return 0; +} +EXPORT_SYMBOL(iwpm_ack_mapping_info_cb); + +/* netlink attribute policy for the received port mapper error message */ +static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { + [IWPM_NLA_ERR_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 }, +}; + +/* + * iwpm_mapping_error_cb - Process a port mapper error message + */ +int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + struct nlattr *nltb[IWPM_NLA_ERR_MAX]; + u32 msg_seq; + u16 err_code; + const char *msg_type = "Mapping Error Msg"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX, + map_error_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]); + err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]); + pr_info("%s: Received msg seq = %u err code = %u client = %d\n", + __func__, msg_seq, err_code, nl_client); + /* look for nlmsg_request */ + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + /* not all errors have associated requests */ + pr_debug("Could not find matching req (seq = %u)\n", msg_seq); + return 0; + } + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + nlmsg_request->err_code = err_code; + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_mapping_error_cb); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c new file mode 100644 index 00000000000..69e9f84c160 --- /dev/null +++ b/drivers/infiniband/core/iwpm_util.c @@ -0,0 +1,607 @@ +/* + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +#define IWPM_HASH_BUCKET_SIZE 512 +#define IWPM_HASH_BUCKET_MASK (IWPM_HASH_BUCKET_SIZE - 1) + +static LIST_HEAD(iwpm_nlmsg_req_list); +static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock); + +static struct hlist_head *iwpm_hash_bucket; +static DEFINE_SPINLOCK(iwpm_mapinfo_lock); + +static DEFINE_MUTEX(iwpm_admin_lock); +static struct iwpm_admin_data iwpm_admin; + +int iwpm_init(u8 nl_client) +{ + if (iwpm_valid_client(nl_client)) + return -EINVAL; + mutex_lock(&iwpm_admin_lock); + if (atomic_read(&iwpm_admin.refcount) == 0) { + iwpm_hash_bucket = kzalloc(IWPM_HASH_BUCKET_SIZE * + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_hash_bucket) { + mutex_unlock(&iwpm_admin_lock); + pr_err("%s Unable to create mapinfo hash table\n", __func__); + return -ENOMEM; + } + } + atomic_inc(&iwpm_admin.refcount); + mutex_unlock(&iwpm_admin_lock); + iwpm_set_valid(nl_client, 1); + return 0; +} +EXPORT_SYMBOL(iwpm_init); + +static void free_hash_bucket(void); + +int iwpm_exit(u8 nl_client) +{ + + if (!iwpm_valid_client(nl_client)) + return -EINVAL; + mutex_lock(&iwpm_admin_lock); + if (atomic_read(&iwpm_admin.refcount) == 0) { + mutex_unlock(&iwpm_admin_lock); + pr_err("%s Incorrect usage - negative refcount\n", __func__); + return -EINVAL; + } + if (atomic_dec_and_test(&iwpm_admin.refcount)) { + free_hash_bucket(); + pr_debug("%s: Mapinfo hash table is destroyed\n", __func__); + } + mutex_unlock(&iwpm_admin_lock); + iwpm_set_valid(nl_client, 0); + return 0; +} +EXPORT_SYMBOL(iwpm_exit); + +static struct hlist_head *get_hash_bucket_head(struct sockaddr_storage *, + struct sockaddr_storage *); + +int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, + struct sockaddr_storage *mapped_sockaddr, + u8 nl_client) +{ + struct hlist_head *hash_bucket_head; + struct iwpm_mapping_info *map_info; + unsigned long flags; + + if (!iwpm_valid_client(nl_client)) + return -EINVAL; + map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); + if (!map_info) { + pr_err("%s: Unable to allocate a mapping info\n", __func__); + return -ENOMEM; + } + memcpy(&map_info->local_sockaddr, local_sockaddr, + sizeof(struct sockaddr_storage)); + memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, + sizeof(struct sockaddr_storage)); + map_info->nl_client = nl_client; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_hash_bucket_head( + &map_info->local_sockaddr, + &map_info->mapped_sockaddr); + hlist_add_head(&map_info->hlist_node, hash_bucket_head); + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return 0; +} +EXPORT_SYMBOL(iwpm_create_mapinfo); + +int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, + struct sockaddr_storage *mapped_local_addr) +{ + struct hlist_node *tmp_hlist_node; + struct hlist_head *hash_bucket_head; + struct iwpm_mapping_info *map_info = NULL; + unsigned long flags; + int ret = -EINVAL; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_hash_bucket_head( + local_sockaddr, + mapped_local_addr); + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + hash_bucket_head, hlist_node) { + + if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr, + mapped_local_addr)) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + ret = 0; + break; + } + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return ret; +} +EXPORT_SYMBOL(iwpm_remove_mapinfo); + +static void free_hash_bucket(void) +{ + struct hlist_node *tmp_hlist_node; + struct iwpm_mapping_info *map_info; + unsigned long flags; + int i; + + /* remove all the mapinfo data from the list */ + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + &iwpm_hash_bucket[i], hlist_node) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + } + } + /* free the hash list */ + kfree(iwpm_hash_bucket); + iwpm_hash_bucket = NULL; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +} + +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + unsigned long flags; + + nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp); + if (!nlmsg_request) { + pr_err("%s Unable to allocate a nlmsg_request\n", __func__); + return NULL; + } + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + kref_init(&nlmsg_request->kref); + kref_get(&nlmsg_request->kref); + nlmsg_request->nlmsg_seq = nlmsg_seq; + nlmsg_request->nl_client = nl_client; + nlmsg_request->request_done = 0; + nlmsg_request->err_code = 0; + return nlmsg_request; +} + +void iwpm_free_nlmsg_request(struct kref *kref) +{ + struct iwpm_nlmsg_request *nlmsg_request; + unsigned long flags; + + nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref); + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_del_init(&nlmsg_request->inprocess_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + if (!nlmsg_request->request_done) + pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n", + __func__, nlmsg_request->nlmsg_seq); + kfree(nlmsg_request); +} + +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq) +{ + struct iwpm_nlmsg_request *nlmsg_request; + struct iwpm_nlmsg_request *found_request = NULL; + unsigned long flags; + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list, + inprocess_list) { + if (nlmsg_request->nlmsg_seq == echo_seq) { + found_request = nlmsg_request; + kref_get(&nlmsg_request->kref); + break; + } + } + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + return found_request; +} + +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request) +{ + int ret; + init_waitqueue_head(&nlmsg_request->waitq); + + ret = wait_event_timeout(nlmsg_request->waitq, + (nlmsg_request->request_done != 0), IWPM_NL_TIMEOUT); + if (!ret) { + ret = -EINVAL; + pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n", + __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq); + } else { + ret = nlmsg_request->err_code; + } + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + return ret; +} + +int iwpm_get_nlmsg_seq(void) +{ + return atomic_inc_return(&iwpm_admin.nlmsg_seq); +} + +int iwpm_valid_client(u8 nl_client) +{ + if (nl_client >= RDMA_NL_NUM_CLIENTS) + return 0; + return iwpm_admin.client_list[nl_client]; +} + +void iwpm_set_valid(u8 nl_client, int valid) +{ + if (nl_client >= RDMA_NL_NUM_CLIENTS) + return; + iwpm_admin.client_list[nl_client] = valid; +} + +/* valid client */ +int iwpm_registered_client(u8 nl_client) +{ + return iwpm_admin.reg_list[nl_client]; +} + +/* valid client */ +void iwpm_set_registered(u8 nl_client, int reg) +{ + iwpm_admin.reg_list[nl_client] = reg; +} + +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr) +{ + if (a_sockaddr->ss_family != b_sockaddr->ss_family) + return 1; + if (a_sockaddr->ss_family == AF_INET) { + struct sockaddr_in *a4_sockaddr = + (struct sockaddr_in *)a_sockaddr; + struct sockaddr_in *b4_sockaddr = + (struct sockaddr_in *)b_sockaddr; + if (!memcmp(&a4_sockaddr->sin_addr, + &b4_sockaddr->sin_addr, sizeof(struct in_addr)) + && a4_sockaddr->sin_port == b4_sockaddr->sin_port) + return 0; + + } else if (a_sockaddr->ss_family == AF_INET6) { + struct sockaddr_in6 *a6_sockaddr = + (struct sockaddr_in6 *)a_sockaddr; + struct sockaddr_in6 *b6_sockaddr = + (struct sockaddr_in6 *)b_sockaddr; + if (!memcmp(&a6_sockaddr->sin6_addr, + &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) + && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) + return 0; + + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + } + return 1; +} + +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client) +{ + struct sk_buff *skb = NULL; + + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + pr_err("%s Unable to allocate skb\n", __func__); + goto create_nlmsg_exit; + } + if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, + NLM_F_REQUEST))) { + pr_warn("%s: Unable to put the nlmsg header\n", __func__); + dev_kfree_skb(skb); + skb = NULL; + } +create_nlmsg_exit: + return skb; +} + +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type) +{ + int nlh_len = 0; + int ret; + const char *err_str = ""; + + ret = nlmsg_validate(cb->nlh, nlh_len, policy_max-1, nlmsg_policy); + if (ret) { + err_str = "Invalid attribute"; + goto parse_nlmsg_error; + } + ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max-1, nlmsg_policy); + if (ret) { + err_str = "Unable to parse the nlmsg"; + goto parse_nlmsg_error; + } + ret = iwpm_validate_nlmsg_attr(nltb, policy_max); + if (ret) { + err_str = "Invalid NULL attribute"; + goto parse_nlmsg_error; + } + return 0; +parse_nlmsg_error: + pr_warn("%s: %s (msg type %s ret = %d)\n", + __func__, err_str, msg_type, ret); + return ret; +} + +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) +{ + struct sockaddr_in6 *sockaddr_v6; + struct sockaddr_in *sockaddr_v4; + + switch (sockaddr->ss_family) { + case AF_INET: + sockaddr_v4 = (struct sockaddr_in *)sockaddr; + pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", + msg, &sockaddr_v4->sin_addr, + ntohs(sockaddr_v4->sin_port), + ntohs(sockaddr_v4->sin_port)); + break; + case AF_INET6: + sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; + pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", + msg, &sockaddr_v6->sin6_addr, + ntohs(sockaddr_v6->sin6_port), + ntohs(sockaddr_v6->sin6_port)); + break; + default: + break; + } +} + +static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr) +{ + u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0); + u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0); + return hash; +} + +static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr) +{ + u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0); + u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0); + return hash; +} + +static struct hlist_head *get_hash_bucket_head(struct sockaddr_storage + *local_sockaddr, + struct sockaddr_storage + *mapped_sockaddr) +{ + u32 local_hash, mapped_hash, hash; + + if (local_sockaddr->ss_family == AF_INET) { + local_hash = iwpm_ipv4_jhash((struct sockaddr_in *) local_sockaddr); + mapped_hash = iwpm_ipv4_jhash((struct sockaddr_in *) mapped_sockaddr); + + } else if (local_sockaddr->ss_family == AF_INET6) { + local_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) local_sockaddr); + mapped_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) mapped_sockaddr); + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + return NULL; + } + + if (local_hash == mapped_hash) /* if port mapper isn't available */ + hash = local_hash; + else + hash = jhash_2words(local_hash, mapped_hash, 0); + + return &iwpm_hash_bucket[hash & IWPM_HASH_BUCKET_MASK]; +} + +static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto mapinfo_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + msg_seq = 0; + err_str = "Unable to put attribute of mapinfo number nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ); + if (ret) + goto mapinfo_num_error; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); + if (ret) + goto mapinfo_num_error; + ret = ibnl_unicast(skb, nlh, iwpm_pid); + if (ret) { + skb = NULL; + err_str = "Unable to send a nlmsg"; + goto mapinfo_num_error; + } + pr_debug("%s: Sent mapping number = %d\n", __func__, mapping_num); + return 0; +mapinfo_num_error: + pr_info("%s: %s\n", __func__, err_str); + if (skb) + dev_kfree_skb(skb); + return ret; +} + +static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) +{ + struct nlmsghdr *nlh = NULL; + int ret = 0; + + if (!skb) + return ret; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + pr_warn("%s Unable to put NLMSG_DONE\n", __func__); + return -ENOMEM; + } + nlh->nlmsg_type = NLMSG_DONE; + ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid); + if (ret) + pr_warn("%s Unable to send a nlmsg\n", __func__); + return ret; +} + +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) +{ + struct iwpm_mapping_info *map_info; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + int skb_num = 0, mapping_num = 0; + int i = 0, nlmsg_bytes = 0; + unsigned long flags; + const char *err_str = ""; + int ret; + + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + skb_num++; + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], + hlist_node) { + if (map_info->nl_client != nl_client) + continue; + nlh = NULL; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + ret = -ENOMEM; + err_str = "Unable to put the nlmsg header"; + goto send_mapping_info_unlock; + } + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->local_sockaddr, + IWPM_NLA_MAPINFO_LOCAL_ADDR); + if (ret) + goto send_mapping_info_unlock; + + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->mapped_sockaddr, + IWPM_NLA_MAPINFO_MAPPED_ADDR); + if (ret) + goto send_mapping_info_unlock; + + iwpm_print_sockaddr(&map_info->local_sockaddr, + "send_mapping_info: Local sockaddr:"); + iwpm_print_sockaddr(&map_info->mapped_sockaddr, + "send_mapping_info: Mapped local sockaddr:"); + mapping_num++; + nlmsg_bytes += nlh->nlmsg_len; + + /* check if all mappings can fit in one skb */ + if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) { + /* and leave room for NLMSG_DONE */ + nlmsg_bytes = 0; + skb_num++; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, + flags); + /* send the skb */ + ret = send_nlmsg_done(skb, nl_client, iwpm_pid); + skb = NULL; + if (ret) { + err_str = "Unable to send map info"; + goto send_mapping_info_exit; + } + if (skb_num == IWPM_MAPINFO_SKB_COUNT) { + ret = -ENOMEM; + err_str = "Insufficient skbs for map info"; + goto send_mapping_info_exit; + } + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + } + } + } +send_mapping_info_unlock: + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +send_mapping_info_exit: + if (ret) { + pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); + if (skb) + dev_kfree_skb(skb); + return ret; + } + send_nlmsg_done(skb, nl_client, iwpm_pid); + return send_mapinfo_num(mapping_num, nl_client, iwpm_pid); +} + +int iwpm_mapinfo_available(void) +{ + unsigned long flags; + int full_bucket = 0, i = 0; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + if (!hlist_empty(&iwpm_hash_bucket[i])) { + full_bucket = 1; + break; + } + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return full_bucket; +} diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h new file mode 100644 index 00000000000..9777c869a14 --- /dev/null +++ b/drivers/infiniband/core/iwpm_util.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IWPM_UTIL_H +#define _IWPM_UTIL_H + +#include <linux/module.h> +#include <linux/io.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/delay.h> +#include <linux/workqueue.h> +#include <linux/mutex.h> +#include <linux/jhash.h> +#include <linux/kref.h> +#include <net/netlink.h> +#include <linux/errno.h> +#include <rdma/iw_portmap.h> +#include <rdma/rdma_netlink.h> + + +#define IWPM_NL_RETRANS 3 +#define IWPM_NL_TIMEOUT (10*HZ) +#define IWPM_MAPINFO_SKB_COUNT 20 + +#define IWPM_PID_UNDEFINED -1 +#define IWPM_PID_UNAVAILABLE -2 + +struct iwpm_nlmsg_request { + struct list_head inprocess_list; + __u32 nlmsg_seq; + void *req_buffer; + u8 nl_client; + u8 request_done; + u16 err_code; + wait_queue_head_t waitq; + struct kref kref; +}; + +struct iwpm_mapping_info { + struct hlist_node hlist_node; + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + u8 nl_client; +}; + +struct iwpm_admin_data { + atomic_t refcount; + atomic_t nlmsg_seq; + int client_list[RDMA_NL_NUM_CLIENTS]; + int reg_list[RDMA_NL_NUM_CLIENTS]; +}; + +/** + * iwpm_get_nlmsg_request - Allocate and initialize netlink message request + * @nlmsg_seq: Sequence number of the netlink message + * @nl_client: The index of the netlink client + * @gfp: Indicates how the memory for the request should be allocated + * + * Returns the newly allocated netlink request object if successful, + * otherwise returns NULL + */ +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp); + +/** + * iwpm_free_nlmsg_request - Deallocate netlink message request + * @kref: Holds reference of netlink message request + */ +void iwpm_free_nlmsg_request(struct kref *kref); + +/** + * iwpm_find_nlmsg_request - Find netlink message request in the request list + * @echo_seq: Sequence number of the netlink request to find + * + * Returns the found netlink message request, + * if not found, returns NULL + */ +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq); + +/** + * iwpm_wait_complete_req - Block while servicing the netlink request + * @nlmsg_request: Netlink message request to service + * + * Wakes up, after the request is completed or expired + * Returns 0 if the request is complete without error + */ +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request); + +/** + * iwpm_get_nlmsg_seq - Get the sequence number for a netlink + * message to send to the port mapper + * + * Returns the sequence number for the netlink message. + */ +int iwpm_get_nlmsg_seq(void); + +/** + * iwpm_valid_client - Check if the port mapper client is valid + * @nl_client: The index of the netlink client + * + * Valid clients need to call iwpm_init() before using + * the port mapper + */ +int iwpm_valid_client(u8 nl_client); + +/** + * iwpm_set_valid - Set the port mapper client to valid or not + * @nl_client: The index of the netlink client + * @valid: 1 if valid or 0 if invalid + */ +void iwpm_set_valid(u8 nl_client, int valid); + +/** + * iwpm_registered_client - Check if the port mapper client is registered + * @nl_client: The index of the netlink client + * + * Call iwpm_register_pid() to register a client + */ +int iwpm_registered_client(u8 nl_client); + +/** + * iwpm_set_registered - Set the port mapper client to registered or not + * @nl_client: The index of the netlink client + * @reg: 1 if registered or 0 if not + */ +void iwpm_set_registered(u8 nl_client, int reg); + +/** + * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of + * a client to the user space port mapper + * @nl_client: The index of the netlink client + * @iwpm_pid: The pid of the user space port mapper + * + * If successful, returns the number of sent mapping info records + */ +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid); + +/** + * iwpm_mapinfo_available - Check if any mapping info records is available + * in the hash table + * + * Returns 1 if mapping information is available, otherwise returns 0 + */ +int iwpm_mapinfo_available(void); + +/** + * iwpm_compare_sockaddr - Compare two sockaddr storage structs + * + * Returns 0 if they are holding the same ip/tcp address info, + * otherwise returns 1 + */ +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr); + +/** + * iwpm_validate_nlmsg_attr - Check for NULL netlink attributes + * @nltb: Holds address of each netlink message attributes + * @nla_count: Number of netlink message attributes + * + * Returns error if any of the nla_count attributes is NULL + */ +static inline int iwpm_validate_nlmsg_attr(struct nlattr *nltb[], + int nla_count) +{ + int i; + for (i = 1; i < nla_count; i++) { + if (!nltb[i]) + return -EINVAL; + } + return 0; +} + +/** + * iwpm_create_nlmsg - Allocate skb and form a netlink message + * @nl_op: Netlink message opcode + * @nlh: Holds address of the netlink message header in skb + * @nl_client: The index of the netlink client + * + * Returns the newly allcated skb, or NULL if the tailroom of the skb + * is insufficient to store the message header and payload + */ +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client); + +/** + * iwpm_parse_nlmsg - Validate and parse the received netlink message + * @cb: Netlink callback structure + * @policy_max: Maximum attribute type to be expected + * @nlmsg_policy: Validation policy + * @nltb: Array to store policy_max parsed elements + * @msg_type: Type of netlink message + * + * Returns 0 on success or a negative error code + */ +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type); + +/** + * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port + * @sockaddr: Socket address to print + * @msg: Message to print + */ +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); +#endif diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 4c837e66516..ab31f136d04 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -1022,12 +1022,21 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->send_buf.mad, sge[0].length, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) + return -ENOMEM; + mad_send_wr->header_mapping = sge[0].addr; sge[1].addr = ib_dma_map_single(mad_agent->device, ib_get_payload(mad_send_wr), sge[1].length, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) { + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[0].length, DMA_TO_DEVICE); + return -ENOMEM; + } mad_send_wr->payload_mapping = sge[1].addr; spin_lock_irqsave(&qp_info->send_queue.lock, flags); @@ -2590,6 +2599,11 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, sizeof *mad_priv - sizeof mad_priv->header, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, + sg_list.addr))) { + ret = -ENOMEM; + break; + } mad_priv->header.mapping = sg_list.addr; recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index da06abde9e0..23dd5a5c759 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -103,13 +103,13 @@ int ibnl_remove_client(int index) EXPORT_SYMBOL(ibnl_remove_client); void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, - int len, int client, int op) + int len, int client, int op, int flags) { unsigned char *prev_tail; prev_tail = skb_tail_pointer(skb); *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), - len, NLM_F_MULTI); + len, flags); if (!*nlh) goto out_nlmsg_trim; (*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail; @@ -148,7 +148,7 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) list_for_each_entry(client, &client_list, list) { if (client->index == index) { if (op < 0 || op >= client->nops || - !client->cb_table[RDMA_NL_GET_OP(op)].dump) + !client->cb_table[op].dump) return -EINVAL; { @@ -172,6 +172,20 @@ static void ibnl_rcv(struct sk_buff *skb) mutex_unlock(&ibnl_mutex); } +int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, + __u32 pid) +{ + return nlmsg_unicast(nls, skb, pid); +} +EXPORT_SYMBOL(ibnl_unicast); + +int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, + unsigned int group, gfp_t flags) +{ + return nlmsg_multicast(nls, skb, 0, group, flags); +} +EXPORT_SYMBOL(ibnl_multicast); + int __init ibnl_init(void) { struct netlink_kernel_cfg cfg = { diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 9838ca48438..233eaf541f5 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -42,7 +42,7 @@ #include <linux/kref.h> #include <linux/idr.h> #include <linux/workqueue.h> - +#include <uapi/linux/if_ether.h> #include <rdma/ib_pack.h> #include <rdma/ib_cache.h> #include "sa.h" @@ -556,6 +556,13 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->grh.hop_limit = rec->hop_limit; ah_attr->grh.traffic_class = rec->traffic_class; } + if (force_grh) { + memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); + ah_attr->vlan_id = rec->vlan_id; + } else { + ah_attr->vlan_id = 0xffff; + } + return 0; } EXPORT_SYMBOL(ib_init_ah_from_path); @@ -611,7 +618,7 @@ static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) { - bool preload = gfp_mask & __GFP_WAIT; + bool preload = !!(gfp_mask & __GFP_WAIT); unsigned long flags; int ret, id; @@ -670,6 +677,9 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), mad->data, &rec); + rec.vlan_id = 0xffff; + memset(rec.dmac, 0, ETH_ALEN); + memset(rec.smac, 0, ETH_ALEN); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index cde1e7b5b85..cbd0383f622 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -429,15 +429,19 @@ static void ib_port_release(struct kobject *kobj) struct attribute *a; int i; - for (i = 0; (a = p->gid_group.attrs[i]); ++i) - kfree(a); + if (p->gid_group.attrs) { + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); - kfree(p->gid_group.attrs); + kfree(p->gid_group.attrs); + } - for (i = 0; (a = p->pkey_group.attrs[i]); ++i) - kfree(a); + if (p->pkey_group.attrs) { + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); - kfree(p->pkey_group.attrs); + kfree(p->pkey_group.attrs); + } kfree(p); } @@ -534,10 +538,12 @@ static int add_port(struct ib_device *device, int port_num, p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - kobject_get(device->ports_parent), + device->ports_parent, "%d", port_num); - if (ret) - goto err_put; + if (ret) { + kfree(p); + return ret; + } ret = sysfs_create_group(&p->kobj, &pma_group); if (ret) @@ -585,6 +591,7 @@ err_free_pkey: kfree(p->pkey_group.attrs[i]); kfree(p->pkey_group.attrs); + p->pkey_group.attrs = NULL; err_remove_gid: sysfs_remove_group(&p->kobj, &p->gid_group); @@ -594,13 +601,13 @@ err_free_gid: kfree(p->gid_group.attrs[i]); kfree(p->gid_group.attrs); + p->gid_group.attrs = NULL; err_remove_pma: sysfs_remove_group(&p->kobj, &pma_group); err_put: - kobject_put(device->ports_parent); - kfree(p); + kobject_put(&p->kobj); return ret; } @@ -612,6 +619,8 @@ static ssize_t show_node_type(struct device *device, switch (dev->node_type) { case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); + case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); + case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type); case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); default: return sprintf(buf, "%d: <unknown>\n", dev->node_type); @@ -807,6 +816,22 @@ static struct attribute_group iw_stats_group = { .attrs = iw_proto_stats_attrs, }; +static void free_port_list_attributes(struct ib_device *device) +{ + struct kobject *p, *t; + + list_for_each_entry_safe(p, t, &device->port_list, entry) { + struct ib_port *port = container_of(p, struct ib_port, kobj); + list_del(&p->entry); + sysfs_remove_group(p, &pma_group); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + kobject_put(p); + } + + kobject_put(device->ports_parent); +} + int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)) @@ -833,7 +858,7 @@ int ib_device_register_sysfs(struct ib_device *device, } device->ports_parent = kobject_create_and_add("ports", - kobject_get(&class_dev->kobj)); + &class_dev->kobj); if (!device->ports_parent) { ret = -ENOMEM; goto err_put; @@ -860,21 +885,7 @@ int ib_device_register_sysfs(struct ib_device *device, return 0; err_put: - { - struct kobject *p, *t; - struct ib_port *port; - - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); - } - } - - kobject_put(&class_dev->kobj); + free_port_list_attributes(device); err_unregister: device_unregister(class_dev); @@ -885,22 +896,18 @@ err: void ib_device_unregister_sysfs(struct ib_device *device) { - struct kobject *p, *t; - struct ib_port *port; - /* Hold kobject until ib_dealloc_device() */ - kobject_get(&device->dev.kobj); + struct kobject *kobj_dev = kobject_get(&device->dev.kobj); + int i; - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); - } + if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) + sysfs_remove_group(kobj_dev, &iw_stats_group); + + free_port_list_attributes(device); + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) + device_remove_file(&device->dev, ib_class_attributes[i]); - kobject_put(device->ports_parent); device_unregister(&device->dev); } diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index b0f189be543..56a4b7ca7ee 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -57,7 +57,7 @@ MODULE_LICENSE("Dual BSD/GPL"); static unsigned int max_backlog = 1024; static struct ctl_table_header *ucma_ctl_table_hdr; -static ctl_table ucma_ctl_table[] = { +static struct ctl_table ucma_ctl_table[] = { { .procname = "max_backlog", .data = &max_backlog, @@ -271,7 +271,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, goto out; } ctx->backlog--; - } else if (!ctx->uid) { + } else if (!ctx->uid || ctx->cm_id != cm_id) { /* * We ignore events for new connections until userspace has set * their context. This can only happen if an error occurs on a @@ -655,24 +655,14 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { - struct rdma_dev_addr *dev_addr; - struct net_device *dev; - u16 vid = 0; resp->num_paths = route->num_paths; switch (route->num_paths) { case 0: - dev_addr = &route->addr.dev_addr; - dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (dev) { - vid = rdma_vlan_dev_vlan_id(dev); - dev_put(dev); - } - - iboe_mac_vlan_to_ll((union ib_gid *) &resp->ib_route[0].dgid, - dev_addr->dst_dev_addr, vid); - iboe_addr_get_sgid(dev_addr, - (union ib_gid *) &resp->ib_route[0].sgid); + rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, + (union ib_gid *)&resp->ib_route[0].dgid); + rdma_ip2gid((struct sockaddr *)&route->addr.src_addr, + (union ib_gid *)&resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(0xffff); break; case 2: diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index a8411232207..a3a2e9c1639 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -42,29 +42,29 @@ #include "uverbs.h" -#define IB_UMEM_MAX_PAGE_CHUNK \ - ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) / \ - ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] - \ - (void *) &((struct ib_umem_chunk *) 0)->page_list[0])) static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { - struct ib_umem_chunk *chunk, *tmp; + struct scatterlist *sg; + struct page *page; int i; - list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) { - ib_dma_unmap_sg(dev, chunk->page_list, - chunk->nents, DMA_BIDIRECTIONAL); - for (i = 0; i < chunk->nents; ++i) { - struct page *page = sg_page(&chunk->page_list[i]); + if (umem->nmap > 0) + ib_dma_unmap_sg(dev, umem->sg_head.sgl, + umem->nmap, + DMA_BIDIRECTIONAL); - if (umem->writable && dirty) - set_page_dirty_lock(page); - put_page(page); - } + for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { - kfree(chunk); + page = sg_page(sg); + if (umem->writable && dirty) + set_page_dirty_lock(page); + put_page(page); } + + sg_free_table(&umem->sg_head); + return; + } /** @@ -81,15 +81,15 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct ib_umem *umem; struct page **page_list; struct vm_area_struct **vma_list; - struct ib_umem_chunk *chunk; unsigned long locked; unsigned long lock_limit; unsigned long cur_base; unsigned long npages; int ret; - int off; int i; DEFINE_DMA_ATTRS(attrs); + struct scatterlist *sg, *sg_list_start; + int need_release = 0; if (dmasync) dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); @@ -97,7 +97,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (!can_do_mlock()) return ERR_PTR(-EPERM); - umem = kmalloc(sizeof *umem, GFP_KERNEL); + umem = kzalloc(sizeof *umem, GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); @@ -117,8 +117,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, /* We assume the memory is from hugetlb until proved otherwise */ umem->hugetlb = 1; - INIT_LIST_HEAD(&umem->chunk_list); - page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { kfree(umem); @@ -147,7 +145,18 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, cur_base = addr & PAGE_MASK; - ret = 0; + if (npages == 0) { + ret = -EINVAL; + goto out; + } + + ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); + if (ret) + goto out; + + need_release = 1; + sg_list_start = umem->sg_head.sgl; + while (npages) { ret = get_user_pages(current, current->mm, cur_base, min_t(unsigned long, npages, @@ -157,54 +166,38 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (ret < 0) goto out; + umem->npages += ret; cur_base += ret * PAGE_SIZE; npages -= ret; - off = 0; - - while (ret) { - chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) * - min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK), - GFP_KERNEL); - if (!chunk) { - ret = -ENOMEM; - goto out; - } - - chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK); - sg_init_table(chunk->page_list, chunk->nents); - for (i = 0; i < chunk->nents; ++i) { - if (vma_list && - !is_vm_hugetlb_page(vma_list[i + off])) - umem->hugetlb = 0; - sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0); - } - - chunk->nmap = ib_dma_map_sg_attrs(context->device, - &chunk->page_list[0], - chunk->nents, - DMA_BIDIRECTIONAL, - &attrs); - if (chunk->nmap <= 0) { - for (i = 0; i < chunk->nents; ++i) - put_page(sg_page(&chunk->page_list[i])); - kfree(chunk); - - ret = -ENOMEM; - goto out; - } - - ret -= chunk->nents; - off += chunk->nents; - list_add_tail(&chunk->list, &umem->chunk_list); + for_each_sg(sg_list_start, sg, ret, i) { + if (vma_list && !is_vm_hugetlb_page(vma_list[i])) + umem->hugetlb = 0; + + sg_set_page(sg, page_list[i], PAGE_SIZE, 0); } - ret = 0; + /* preparing for next loop */ + sg_list_start = sg; } + umem->nmap = ib_dma_map_sg_attrs(context->device, + umem->sg_head.sgl, + umem->npages, + DMA_BIDIRECTIONAL, + &attrs); + + if (umem->nmap <= 0) { + ret = -ENOMEM; + goto out; + } + + ret = 0; + out: if (ret < 0) { - __ib_umem_release(context->device, umem, 0); + if (need_release) + __ib_umem_release(context->device, umem, 0); kfree(umem); } else current->mm->pinned_vm = locked; @@ -278,17 +271,16 @@ EXPORT_SYMBOL(ib_umem_release); int ib_umem_page_count(struct ib_umem *umem) { - struct ib_umem_chunk *chunk; int shift; int i; int n; + struct scatterlist *sg; shift = ilog2(umem->page_size); n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - for (i = 0; i < chunk->nmap; ++i) - n += sg_dma_len(&chunk->page_list[i]) >> shift; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) + n += sg_dma_len(sg) >> shift; return n; } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index f0d588f8859..1acb9910055 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -98,7 +98,7 @@ struct ib_umad_port { struct ib_umad_device { int start_port, end_port; - struct kref ref; + struct kobject kobj; struct ib_umad_port port[0]; }; @@ -134,14 +134,18 @@ static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device); -static void ib_umad_release_dev(struct kref *ref) +static void ib_umad_release_dev(struct kobject *kobj) { struct ib_umad_device *dev = - container_of(ref, struct ib_umad_device, ref); + container_of(kobj, struct ib_umad_device, kobj); kfree(dev); } +static struct kobj_type ib_umad_dev_ktype = { + .release = ib_umad_release_dev, +}; + static int hdr_size(struct ib_umad_file *file) { return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) : @@ -780,27 +784,19 @@ static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_umad_file *file; - int ret; + int ret = -ENXIO; port = container_of(inode->i_cdev, struct ib_umad_port, cdev); - if (port) - kref_get(&port->umad_dev->ref); - else - return -ENXIO; mutex_lock(&port->file_mutex); - if (!port->ib_dev) { - ret = -ENXIO; + if (!port->ib_dev) goto out; - } + ret = -ENOMEM; file = kzalloc(sizeof *file, GFP_KERNEL); - if (!file) { - kref_put(&port->umad_dev->ref, ib_umad_release_dev); - ret = -ENOMEM; + if (!file) goto out; - } mutex_init(&file->mutex); spin_lock_init(&file->send_lock); @@ -814,6 +810,13 @@ static int ib_umad_open(struct inode *inode, struct file *filp) list_add_tail(&file->port_list, &port->file_list); ret = nonseekable_open(inode, filp); + if (ret) { + list_del(&file->port_list); + kfree(file); + goto out; + } + + kobject_get(&port->umad_dev->kobj); out: mutex_unlock(&port->file_mutex); @@ -852,7 +855,7 @@ static int ib_umad_close(struct inode *inode, struct file *filp) mutex_unlock(&file->port->file_mutex); kfree(file); - kref_put(&dev->ref, ib_umad_release_dev); + kobject_put(&dev->kobj); return 0; } @@ -880,10 +883,6 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) int ret; port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev); - if (port) - kref_get(&port->umad_dev->ref); - else - return -ENXIO; if (filp->f_flags & O_NONBLOCK) { if (down_trylock(&port->sm_sem)) { @@ -898,17 +897,27 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) } ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); - if (ret) { - up(&port->sm_sem); - goto fail; - } + if (ret) + goto err_up_sem; filp->private_data = port; - return nonseekable_open(inode, filp); + ret = nonseekable_open(inode, filp); + if (ret) + goto err_clr_sm_cap; + + kobject_get(&port->umad_dev->kobj); + + return 0; + +err_clr_sm_cap: + swap(props.set_port_cap_mask, props.clr_port_cap_mask); + ib_modify_port(port->ib_dev, port->port_num, 0, &props); + +err_up_sem: + up(&port->sm_sem); fail: - kref_put(&port->umad_dev->ref, ib_umad_release_dev); return ret; } @@ -927,7 +936,7 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp) up(&port->sm_sem); - kref_put(&port->umad_dev->ref, ib_umad_release_dev); + kobject_put(&port->umad_dev->kobj); return ret; } @@ -995,6 +1004,7 @@ static int find_overflow_devnum(void) } static int ib_umad_init_port(struct ib_device *device, int port_num, + struct ib_umad_device *umad_dev, struct ib_umad_port *port) { int devnum; @@ -1027,6 +1037,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, cdev_init(&port->cdev, &umad_fops); port->cdev.owner = THIS_MODULE; + port->cdev.kobj.parent = &umad_dev->kobj; kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); if (cdev_add(&port->cdev, base, 1)) goto err_cdev; @@ -1045,6 +1056,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, base += IB_UMAD_MAX_PORTS; cdev_init(&port->sm_cdev, &umad_sm_fops); port->sm_cdev.owner = THIS_MODULE; + port->sm_cdev.kobj.parent = &umad_dev->kobj; kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); if (cdev_add(&port->sm_cdev, base, 1)) goto err_sm_cdev; @@ -1138,7 +1150,7 @@ static void ib_umad_add_one(struct ib_device *device) if (!umad_dev) return; - kref_init(&umad_dev->ref); + kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype); umad_dev->start_port = s; umad_dev->end_port = e; @@ -1146,7 +1158,8 @@ static void ib_umad_add_one(struct ib_device *device) for (i = s; i <= e; ++i) { umad_dev->port[i - s].umad_dev = umad_dev; - if (ib_umad_init_port(device, i, &umad_dev->port[i - s])) + if (ib_umad_init_port(device, i, umad_dev, + &umad_dev->port[i - s])) goto err; } @@ -1158,7 +1171,7 @@ err: while (--i >= s) ib_umad_kill_port(&umad_dev->port[i - s]); - kref_put(&umad_dev->ref, ib_umad_release_dev); + kobject_put(&umad_dev->kobj); } static void ib_umad_remove_one(struct ib_device *device) @@ -1172,7 +1185,7 @@ static void ib_umad_remove_one(struct ib_device *device) for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) ib_umad_kill_port(&umad_dev->port[i]); - kref_put(&umad_dev->ref, ib_umad_release_dev); + kobject_put(&umad_dev->kobj); } static char *umad_devnode(struct device *dev, umode_t *mode) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index d040b877475..a283274a5a0 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -47,6 +47,22 @@ #include <rdma/ib_umem.h> #include <rdma/ib_user_verbs.h> +#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (const void __user *) (ibuf); \ + (udata)->outbuf = (void __user *) (obuf); \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + +#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \ + (udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL; \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + /* * Our lifetime rules for these structs are the following: * @@ -178,6 +194,22 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +struct ib_uverbs_flow_spec { + union { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_eth eth; + struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + }; +}; + #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ const char __user *buf, int in_len, \ @@ -217,7 +249,13 @@ IB_UVERBS_DECLARE_CMD(destroy_srq); IB_UVERBS_DECLARE_CMD(create_xsrq); IB_UVERBS_DECLARE_CMD(open_xrcd); IB_UVERBS_DECLARE_CMD(close_xrcd); -IB_UVERBS_DECLARE_CMD(create_flow); -IB_UVERBS_DECLARE_CMD(destroy_flow); + +#define IB_UVERBS_DECLARE_EX_CMD(name) \ + int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ + struct ib_udata *ucore, \ + struct ib_udata *uhw) + +IB_UVERBS_DECLARE_EX_CMD(create_flow); +IB_UVERBS_DECLARE_EX_CMD(destroy_flow); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index f2b81b9ee0d..ea6203ee7bc 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -40,6 +40,7 @@ #include <asm/uaccess.h> #include "uverbs.h" +#include "core_priv.h" struct uverbs_lock_class { struct lock_class_key key; @@ -56,14 +57,6 @@ static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; -#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ - do { \ - (udata)->inbuf = (void __user *) (ibuf); \ - (udata)->outbuf = (void __user *) (obuf); \ - (udata)->inlen = (ilen); \ - (udata)->outlen = (olen); \ - } while (0) - /* * The ib_uobject locking scheme is as follows: * @@ -937,13 +930,9 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; - /* - * Local write permission is required if remote write or - * remote atomic permission is also requested. - */ - if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && - !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE)) - return -EINVAL; + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + return ret; uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) @@ -1973,6 +1962,9 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; if (qp->real_qp == qp) { + ret = ib_resolve_eth_l2_attrs(qp, attr, &cmd.attr_mask); + if (ret) + goto out; ret = qp->device->modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata); } else { @@ -2126,6 +2118,9 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, } next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; + if (next->opcode == IB_WR_SEND_WITH_IMM) + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; } else { switch (next->opcode) { case IB_WR_RDMA_WRITE_WITH_IMM: @@ -2599,9 +2594,12 @@ out_put: return ret ? ret : in_len; } -static int kern_spec_to_ib_spec(struct ib_kern_spec *kern_spec, +static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec) { + if (kern_spec->reserved) + return -EINVAL; + ib_spec->type = kern_spec->type; switch (ib_spec->type) { @@ -2639,28 +2637,34 @@ static int kern_spec_to_ib_spec(struct ib_kern_spec *kern_spec, return 0; } -ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) { struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; struct ib_flow *flow_id; - struct ib_kern_flow_attr *kern_flow_attr; + struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; struct ib_qp *qp; int err = 0; void *kern_spec; void *ib_spec; int i; - int kern_attr_size; - if (out_len < sizeof(resp)) + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + if (ucore->outlen < sizeof(resp)) return -ENOSPC; - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + ucore->inbuf += sizeof(cmd); + ucore->inlen -= sizeof(cmd); if (cmd.comp_mask) return -EINVAL; @@ -2669,32 +2673,31 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW)) return -EPERM; - if (cmd.flow_attr.num_of_specs < 0 || - cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; - kern_attr_size = cmd.flow_attr.size - sizeof(cmd) - - sizeof(struct ib_uverbs_cmd_hdr_ex); + if (cmd.flow_attr.size > ucore->inlen || + cmd.flow_attr.size > + (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) + return -EINVAL; - if (cmd.flow_attr.size < 0 || cmd.flow_attr.size > in_len || - kern_attr_size < 0 || kern_attr_size > - (cmd.flow_attr.num_of_specs * sizeof(struct ib_kern_spec))) + if (cmd.flow_attr.reserved[0] || + cmd.flow_attr.reserved[1]) return -EINVAL; if (cmd.flow_attr.num_of_specs) { - kern_flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL); + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); if (!kern_flow_attr) return -ENOMEM; memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); - if (copy_from_user(kern_flow_attr + 1, buf + sizeof(cmd), - kern_attr_size)) { - err = -EFAULT; + err = ib_copy_from_udata(kern_flow_attr + 1, ucore, + cmd.flow_attr.size); + if (err) goto err_free_attr; - } } else { kern_flow_attr = &cmd.flow_attr; - kern_attr_size = sizeof(cmd.flow_attr); } uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); @@ -2711,7 +2714,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, goto err_uobj; } - flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL); + flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL); if (!flow_attr) { err = -ENOMEM; goto err_put; @@ -2726,19 +2729,23 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kern_spec = kern_flow_attr + 1; ib_spec = flow_attr + 1; - for (i = 0; i < flow_attr->num_of_specs && kern_attr_size > 0; i++) { + for (i = 0; i < flow_attr->num_of_specs && + cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) && + cmd.flow_attr.size >= + ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { err = kern_spec_to_ib_spec(kern_spec, ib_spec); if (err) goto err_free; flow_attr->size += ((union ib_flow_spec *) ib_spec)->size; - kern_attr_size -= ((struct ib_kern_spec *) kern_spec)->size; - kern_spec += ((struct ib_kern_spec *) kern_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size; + kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size; ib_spec += ((union ib_flow_spec *) ib_spec)->size; } - if (kern_attr_size) { - pr_warn("create flow failed, %d bytes left from uverb cmd\n", - kern_attr_size); + if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { + pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", + i, cmd.flow_attr.size); + err = -EINVAL; goto err_free; } flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); @@ -2757,11 +2764,10 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; - if (copy_to_user((void __user *)(unsigned long) cmd.response, - &resp, sizeof(resp))) { - err = -EFAULT; + err = ib_copy_to_udata(ucore, + &resp, sizeof(resp)); + if (err) goto err_copy; - } put_qp_read(qp); mutex_lock(&file->mutex); @@ -2774,7 +2780,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kfree(flow_attr); if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); - return in_len; + return 0; err_copy: idr_remove_uobj(&ib_uverbs_rule_idr, uobj); destroy_flow: @@ -2791,16 +2797,24 @@ err_free_attr: return err; } -ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) { +int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ struct ib_uverbs_destroy_flow cmd; struct ib_flow *flow_id; struct ib_uobject *uobj; int ret; - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EINVAL; uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, file->ucontext); @@ -2822,7 +2836,7 @@ ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file, put_uobj(uobj); - return ret ? ret : in_len; + return ret; } static int __uverbs_create_xsrq(struct ib_uverbs_file *file, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 75ad86c4abf..08219fb3338 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -115,8 +115,13 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, - [IB_USER_VERBS_CMD_CREATE_FLOW] = ib_uverbs_create_flow, - [IB_USER_VERBS_CMD_DESTROY_FLOW] = ib_uverbs_destroy_flow +}; + +static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) = { + [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, + [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow }; static void ib_uverbs_add_one(struct ib_device *device); @@ -587,6 +592,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, { struct ib_uverbs_file *file = filp->private_data; struct ib_uverbs_cmd_hdr hdr; + __u32 flags; if (count < sizeof hdr) return -EINVAL; @@ -594,41 +600,110 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof hdr)) return -EFAULT; - if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[hdr.command]) - return -EINVAL; + flags = (hdr.command & + IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; - if (!file->ucontext && - hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT) - return -EINVAL; + if (!flags) { + __u32 command; - if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) - return -ENOSYS; + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; - if (hdr.command >= IB_USER_VERBS_CMD_THRESHOLD) { - struct ib_uverbs_cmd_hdr_ex hdr_ex; + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; - if (copy_from_user(&hdr_ex, buf, sizeof(hdr_ex))) - return -EFAULT; + if (command >= ARRAY_SIZE(uverbs_cmd_table) || + !uverbs_cmd_table[command]) + return -EINVAL; - if (((hdr_ex.in_words + hdr_ex.provider_in_words) * 4) != count) + if (!file->ucontext && + command != IB_USER_VERBS_CMD_GET_CONTEXT) return -EINVAL; - return uverbs_cmd_table[hdr.command](file, - buf + sizeof(hdr_ex), - (hdr_ex.in_words + - hdr_ex.provider_in_words) * 4, - (hdr_ex.out_words + - hdr_ex.provider_out_words) * 4); - } else { + if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command))) + return -ENOSYS; + if (hdr.in_words * 4 != count) return -EINVAL; - return uverbs_cmd_table[hdr.command](file, - buf + sizeof(hdr), - hdr.in_words * 4, - hdr.out_words * 4); + return uverbs_cmd_table[command](file, + buf + sizeof(hdr), + hdr.in_words * 4, + hdr.out_words * 4); + + } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { + __u32 command; + + struct ib_uverbs_ex_cmd_hdr ex_hdr; + struct ib_udata ucore; + struct ib_udata uhw; + int err; + size_t written_count = count; + + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; + + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + + if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || + !uverbs_ex_cmd_table[command]) + return -ENOSYS; + + if (!file->ucontext) + return -EINVAL; + + if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command))) + return -ENOSYS; + + if (count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) + return -EFAULT; + + count -= sizeof(hdr) + sizeof(ex_hdr); + buf += sizeof(hdr) + sizeof(ex_hdr); + + if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) + return -EINVAL; + + if (ex_hdr.cmd_hdr_reserved) + return -EINVAL; + + if (ex_hdr.response) { + if (!hdr.out_words && !ex_hdr.provider_out_words) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, + (void __user *) (unsigned long) ex_hdr.response, + (hdr.out_words + ex_hdr.provider_out_words) * 8)) + return -EFAULT; + } else { + if (hdr.out_words || ex_hdr.provider_out_words) + return -EINVAL; + } + + INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response, + hdr.in_words * 8, hdr.out_words * 8); + + INIT_UDATA_BUF_OR_NULL(&uhw, + buf + ucore.inlen, + (unsigned long) ex_hdr.response + ucore.outlen, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); + + err = uverbs_ex_cmd_table[command](file, + &ucore, + &uhw); + + if (err) + return err; + + return written_count; } + + return -ENOSYS; } static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a321df28bab..c2b89cc5dbc 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -44,8 +44,11 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> +#include <rdma/ib_addr.h> -int ib_rate_to_mult(enum ib_rate rate) +#include "core_priv.h" + +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 1; @@ -62,7 +65,7 @@ int ib_rate_to_mult(enum ib_rate rate) } EXPORT_SYMBOL(ib_rate_to_mult); -enum ib_rate mult_to_ib_rate(int mult) +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult) { switch (mult) { case 1: return IB_RATE_2_5_GBPS; @@ -79,7 +82,7 @@ enum ib_rate mult_to_ib_rate(int mult) } EXPORT_SYMBOL(mult_to_ib_rate); -int ib_rate_to_mbps(enum ib_rate rate) +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 2500; @@ -104,7 +107,7 @@ int ib_rate_to_mbps(enum ib_rate rate) } EXPORT_SYMBOL(ib_rate_to_mbps); -enum rdma_transport_type +__attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) { switch (node_type) { @@ -114,6 +117,10 @@ rdma_node_get_transport(enum rdma_node_type node_type) return RDMA_TRANSPORT_IB; case RDMA_NODE_RNIC: return RDMA_TRANSPORT_IWARP; + case RDMA_NODE_USNIC: + return RDMA_TRANSPORT_USNIC; + case RDMA_NODE_USNIC_UDP: + return RDMA_TRANSPORT_USNIC_UDP; default: BUG(); return 0; @@ -130,6 +137,8 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_ case RDMA_TRANSPORT_IB: return IB_LINK_LAYER_INFINIBAND; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_USNIC: + case RDMA_TRANSPORT_USNIC_UDP: return IB_LINK_LAYER_ETHERNET; default: return IB_LINK_LAYER_UNSPECIFIED; @@ -189,8 +198,28 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, u32 flow_class; u16 gid_index; int ret; + int is_eth = (rdma_port_get_link_layer(device, port_num) == + IB_LINK_LAYER_ETHERNET); memset(ah_attr, 0, sizeof *ah_attr); + if (is_eth) { + if (!(wc->wc_flags & IB_WC_GRH)) + return -EPROTOTYPE; + + if (wc->wc_flags & IB_WC_WITH_SMAC && + wc->wc_flags & IB_WC_WITH_VLAN) { + memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); + ah_attr->vlan_id = wc->vlan_id; + } else { + ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid, + ah_attr->dmac, &ah_attr->vlan_id); + if (ret) + return ret; + } + } else { + ah_attr->vlan_id = 0xffff; + } + ah_attr->dlid = wc->slid; ah_attr->sl = wc->sl; ah_attr->src_path_bits = wc->dlid_path_bits; @@ -473,7 +502,9 @@ EXPORT_SYMBOL(ib_create_qp); static const struct { int valid; enum ib_qp_attr_mask req_param[IB_QPT_MAX]; + enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX]; enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; + enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX]; } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, @@ -554,6 +585,12 @@ static const struct { IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), }, + .req_param_add_eth = { + [IB_QPT_RC] = (IB_QP_SMAC), + [IB_QPT_UC] = (IB_QP_SMAC), + [IB_QPT_XRC_INI] = (IB_QP_SMAC), + [IB_QPT_XRC_TGT] = (IB_QP_SMAC) + }, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), @@ -573,7 +610,21 @@ static const struct { IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), - } + }, + .opt_param_add_eth = { + [IB_QPT_RC] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_UC] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID) + } } }, [IB_QPS_RTR] = { @@ -776,7 +827,8 @@ static const struct { }; int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask) + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll) { enum ib_qp_attr_mask req_param, opt_param; @@ -795,6 +847,13 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, req_param = qp_state_table[cur_state][next_state].req_param[type]; opt_param = qp_state_table[cur_state][next_state].opt_param[type]; + if (ll == IB_LINK_LAYER_ETHERNET) { + req_param |= qp_state_table[cur_state][next_state]. + req_param_add_eth[type]; + opt_param |= qp_state_table[cur_state][next_state]. + opt_param_add_eth[type]; + } + if ((mask & req_param) != req_param) return 0; @@ -805,10 +864,51 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, } EXPORT_SYMBOL(ib_modify_qp_is_ok); +int ib_resolve_eth_l2_attrs(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask) +{ + int ret = 0; + union ib_gid sgid; + + if ((*qp_attr_mask & IB_QP_AV) && + (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == IB_LINK_LAYER_ETHERNET)) { + ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num, + qp_attr->ah_attr.grh.sgid_index, &sgid); + if (ret) + goto out; + if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) { + rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac); + rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac); + qp_attr->vlan_id = rdma_get_vlan_id(&sgid); + } else { + ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid, + qp_attr->ah_attr.dmac, &qp_attr->vlan_id); + if (ret) + goto out; + ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr->smac, NULL); + if (ret) + goto out; + } + *qp_attr_mask |= IB_QP_SMAC; + if (qp_attr->vlan_id < 0xFFFF) + *qp_attr_mask |= IB_QP_VID; + } +out: + return ret; +} +EXPORT_SYMBOL(ib_resolve_eth_l2_attrs); + + int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) { + int ret; + + ret = ib_resolve_eth_l2_attrs(qp, qp_attr, &qp_attr_mask); + if (ret) + return ret; + return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); @@ -958,6 +1058,11 @@ EXPORT_SYMBOL(ib_resize_cq); struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); mr = pd->device->get_dma_mr(pd, mr_access_flags); @@ -980,6 +1085,11 @@ struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, u64 *iova_start) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); if (!pd->device->reg_phys_mr) return ERR_PTR(-ENOSYS); @@ -1010,6 +1120,10 @@ int ib_rereg_phys_mr(struct ib_mr *mr, struct ib_pd *old_pd; int ret; + ret = ib_check_mr_access(mr_access_flags); + if (ret) + return ret; + if (!mr->device->rereg_phys_mr) return -ENOSYS; @@ -1055,6 +1169,45 @@ int ib_dereg_mr(struct ib_mr *mr) } EXPORT_SYMBOL(ib_dereg_mr); +struct ib_mr *ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr) +{ + struct ib_mr *mr; + + if (!pd->device->create_mr) + return ERR_PTR(-ENOSYS); + + mr = pd->device->create_mr(pd, mr_init_attr); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_set(&mr->usecnt, 0); + } + + return mr; +} +EXPORT_SYMBOL(ib_create_mr); + +int ib_destroy_mr(struct ib_mr *mr) +{ + struct ib_pd *pd; + int ret; + + if (atomic_read(&mr->usecnt)) + return -EBUSY; + + pd = mr->pd; + ret = mr->device->destroy_mr(mr); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_destroy_mr); + struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) { struct ib_mr *mr; @@ -1284,3 +1437,11 @@ int ib_destroy_flow(struct ib_flow *flow_id) return err; } EXPORT_SYMBOL(ib_destroy_flow); + +int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + return mr->device->check_mr_status ? + mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; +} +EXPORT_SYMBOL(ib_check_mr_status); diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile new file mode 100644 index 00000000000..e900b03531a --- /dev/null +++ b/drivers/infiniband/hw/Makefile @@ -0,0 +1,12 @@ +obj-$(CONFIG_INFINIBAND_MTHCA) += mthca/ +obj-$(CONFIG_INFINIBAND_IPATH) += ipath/ +obj-$(CONFIG_INFINIBAND_QIB) += qib/ +obj-$(CONFIG_INFINIBAND_EHCA) += ehca/ +obj-$(CONFIG_INFINIBAND_AMSO1100) += amso1100/ +obj-$(CONFIG_INFINIBAND_CXGB3) += cxgb3/ +obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ +obj-$(CONFIG_INFINIBAND_NES) += nes/ +obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ +obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c index d53cf519f42..00400c352c1 100644 --- a/drivers/infiniband/hw/amso1100/c2.c +++ b/drivers/infiniband/hw/amso1100/c2.c @@ -1082,6 +1082,7 @@ static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) /* Initialize network device */ if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) { + ret = -ENOMEM; iounmap(mmio_regs); goto bail4; } @@ -1151,7 +1152,8 @@ static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) goto bail10; } - if (c2_register_device(c2dev)) + ret = c2_register_device(c2dev); + if (ret) goto bail10; return 0; diff --git a/drivers/infiniband/hw/amso1100/c2_ae.c b/drivers/infiniband/hw/amso1100/c2_ae.c index d5d1929753e..cedda25232b 100644 --- a/drivers/infiniband/hw/amso1100/c2_ae.c +++ b/drivers/infiniband/hw/amso1100/c2_ae.c @@ -141,7 +141,7 @@ static const char *to_qp_state_str(int state) return "C2_QP_STATE_ERROR"; default: return "<invalid QP state>"; - }; + } } void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) diff --git a/drivers/infiniband/hw/amso1100/c2_intr.c b/drivers/infiniband/hw/amso1100/c2_intr.c index 8951db4ae29..3a17d9b36db 100644 --- a/drivers/infiniband/hw/amso1100/c2_intr.c +++ b/drivers/infiniband/hw/amso1100/c2_intr.c @@ -169,7 +169,8 @@ static void handle_vq(struct c2_dev *c2dev, u32 mq_index) * We should never get here, as the adapter should * never send us a reply that we're not expecting. */ - vq_repbuf_free(c2dev, host_msg); + if (reply_msg != NULL) + vq_repbuf_free(c2dev, host_msg); pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n"); return; } diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index 07eb3a8067d..8af33cf1fc4 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -431,9 +431,9 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 *pages; u64 kva = 0; int shift, n, len; - int i, j, k; + int i, k, entry; int err = 0; - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct c2_pd *c2pd = to_c2pd(pd); struct c2_mr *c2mr; @@ -452,10 +452,7 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } shift = ffs(c2mr->umem->page_size) - 1; - - n = 0; - list_for_each_entry(chunk, &c2mr->umem->chunk_list, list) - n += chunk->nents; + n = c2mr->umem->nmap; pages = kmalloc(n * sizeof(u64), GFP_KERNEL); if (!pages) { @@ -464,14 +461,12 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } i = 0; - list_for_each_entry(chunk, &c2mr->umem->chunk_list, list) { - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = - sg_dma_address(&chunk->page_list[j]) + - (c2mr->umem->page_size * k); - } + for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = + sg_dma_address(sg) + + (c2mr->umem->page_size * k); } } diff --git a/drivers/infiniband/hw/amso1100/c2_rnic.c b/drivers/infiniband/hw/amso1100/c2_rnic.c index b7c98699005..d2a6d961344 100644 --- a/drivers/infiniband/hw/amso1100/c2_rnic.c +++ b/drivers/infiniband/hw/amso1100/c2_rnic.c @@ -576,7 +576,8 @@ int c2_rnic_init(struct c2_dev *c2dev) goto bail4; /* Initialize cached the adapter limits */ - if (c2_rnic_query(c2dev, &c2dev->props)) + err = c2_rnic_query(c2dev, &c2dev->props); + if (err) goto bail5; /* Initialize the PD pool */ diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index c3f5aca4ef0..de1c61b417d 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -735,14 +735,12 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, ((perm & TPT_MW_BIND) ? F_TPT_MW_BIND_ENABLE : 0) | V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) | V_TPT_PAGE_SIZE(page_size)); - tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : - cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); + tpt.rsvd_pbl_addr = cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); tpt.len = cpu_to_be32(len); tpt.va_hi = cpu_to_be32((u32) (to >> 32)); tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL)); tpt.rsvd_bind_cnt_or_pstag = 0; - tpt.rsvd_pbl_size = reset_tpt_entry ? 0 : - cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); + tpt.rsvd_pbl_size = cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); } err = cxio_hal_ctrl_qp_write_mem(rdev_p, stag_idx + diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 095bb046e2c..cb78b1e9bcd 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -418,6 +418,7 @@ static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) skb->priority = CPL_PRIORITY_DATA; set_arp_failure_handler(skb, abort_arp_failure); req = (struct cpl_abort_req *) skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index d2283837d45..811b24a539c 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -618,14 +618,13 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { __be64 *pages; int shift, n, len; - int i, j, k; + int i, k, entry; int err = 0; - struct ib_umem_chunk *chunk; struct iwch_dev *rhp; struct iwch_pd *php; struct iwch_mr *mhp; struct iwch_reg_user_mr_resp uresp; - + struct scatterlist *sg; PDBG("%s ib_pd %p\n", __func__, pd); php = to_iwch_pd(pd); @@ -645,9 +644,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, shift = ffs(mhp->umem->page_size) - 1; - n = 0; - list_for_each_entry(chunk, &mhp->umem->chunk_list, list) - n += chunk->nents; + n = mhp->umem->nmap; err = iwch_alloc_pbl(mhp, n); if (err) @@ -661,12 +658,10 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, i = n = 0; - list_for_each_entry(chunk, &mhp->umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> shift; + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; for (k = 0; k < len; ++k) { - pages[i++] = cpu_to_be64(sg_dma_address( - &chunk->page_list[j]) + + pages[i++] = cpu_to_be64(sg_dma_address(sg) + mhp->umem->page_size * k); if (i == PAGE_SIZE / sizeof *pages) { err = iwch_write_pbl(mhp, pages, i, n); @@ -676,7 +671,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, i = 0; } } - } + } if (i) err = iwch_write_pbl(mhp, pages, i, n); diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig index d4e8983fba5..23f38cf2c5c 100644 --- a/drivers/infiniband/hw/cxgb4/Kconfig +++ b/drivers/infiniband/hw/cxgb4/Kconfig @@ -1,10 +1,10 @@ config INFINIBAND_CXGB4 - tristate "Chelsio T4 RDMA Driver" + tristate "Chelsio T4/T5 RDMA Driver" depends on CHELSIO_T4 && INET && (IPV6 || IPV6=n) select GENERIC_ALLOCATOR ---help--- - This is an iWARP/RDMA driver for the Chelsio T4 1GbE and - 10GbE adapters. + This is an iWARP/RDMA driver for the Chelsio T4 and T5 + 1GbE, 10GbE adapters and T5 40GbE adapter. For general information about Chelsio and our products, visit our website at <http://www.chelsio.com>. diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 12fef76c791..768a0fb67dd 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * Copyright (c) 2009-2014 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -47,6 +47,8 @@ #include <net/ip6_route.h> #include <net/addrconf.h> +#include <rdma/ib_addr.h> + #include "iw_cxgb4.h" static char *states[] = { @@ -98,9 +100,9 @@ int c4iw_debug; module_param(c4iw_debug, int, 0644); MODULE_PARM_DESC(c4iw_debug, "Enable debug logging (default=0)"); -static int peer2peer; +static int peer2peer = 1; module_param(peer2peer, int, 0644); -MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)"); +MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=1)"); static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ; module_param(p2p_type, int, 0644); @@ -173,12 +175,15 @@ static void start_ep_timer(struct c4iw_ep *ep) add_timer(&ep->timer); } -static void stop_ep_timer(struct c4iw_ep *ep) +static int stop_ep_timer(struct c4iw_ep *ep) { PDBG("%s ep %p stopping\n", __func__, ep); del_timer_sync(&ep->timer); - if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) + if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { c4iw_put_ep(&ep->com); + return 0; + } + return 1; } static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb, @@ -229,12 +234,16 @@ static void release_tid(struct c4iw_rdev *rdev, u32 hwtid, struct sk_buff *skb) static void set_emss(struct c4iw_ep *ep, u16 opt) { - ep->emss = ep->com.dev->rdev.lldi.mtus[GET_TCPOPT_MSS(opt)] - 40; + ep->emss = ep->com.dev->rdev.lldi.mtus[GET_TCPOPT_MSS(opt)] - + sizeof(struct iphdr) - sizeof(struct tcphdr); ep->mss = ep->emss; if (GET_TCPOPT_TSTAMP(opt)) ep->emss -= 12; if (ep->emss < 128) ep->emss = 128; + if (ep->emss & 7) + PDBG("Warning: misaligned mtu idx %u mss %u emss=%u\n", + GET_TCPOPT_MSS(opt), ep->mss, ep->emss); PDBG("%s mss_idx %u mss %u emss=%u\n", __func__, GET_TCPOPT_MSS(opt), ep->mss, ep->emss); } @@ -291,6 +300,12 @@ void _c4iw_free_ep(struct kref *kref) dst_release(ep->dst); cxgb4_l2t_release(ep->l2t); } + if (test_bit(RELEASE_MAPINFO, &ep->com.flags)) { + print_addr(&ep->com, __func__, "remove_mapinfo/mapping"); + iwpm_remove_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr); + iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); + } kfree(ep); } @@ -338,10 +353,7 @@ static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) static struct net_device *get_real_dev(struct net_device *egress_dev) { - struct net_device *phys_dev = egress_dev; - if (egress_dev->priv_flags & IFF_802_1Q_VLAN) - phys_dev = vlan_dev_real_dev(egress_dev); - return phys_dev; + return rdma_vlan_dev_real_dev(egress_dev) ? : egress_dev; } static int our_interface(struct c4iw_dev *dev, struct net_device *egress_dev) @@ -400,7 +412,8 @@ static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip, n = dst_neigh_lookup(&rt->dst, &peer_ip); if (!n) return NULL; - if (!our_interface(dev, n->dev)) { + if (!our_interface(dev, n->dev) && + !(n->dev->flags & IFF_LOOPBACK)) { dst_release(&rt->dst); return NULL; } @@ -419,8 +432,17 @@ static void arp_failure_discard(void *handle, struct sk_buff *skb) */ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb) { + struct c4iw_ep *ep = handle; + printk(KERN_ERR MOD "ARP failure duing connect\n"); kfree_skb(skb); + connect_reply_upcall(ep, -EHOSTUNREACH); + state_set(&ep->com, DEAD); + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_put_ep(&ep->com); } /* @@ -464,7 +486,7 @@ static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT; flowc->mnemval[5].val = cpu_to_be32(ep->rcv_seq); flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF; - flowc->mnemval[6].val = cpu_to_be32(snd_win); + flowc->mnemval[6].val = cpu_to_be32(ep->snd_win); flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[7].val = cpu_to_be32(ep->emss); /* Pad WR to 16 byte boundary */ @@ -524,48 +546,47 @@ static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } -#define VLAN_NONE 0xfff -#define FILTER_SEL_VLAN_NONE 0xffff -#define FILTER_SEL_WIDTH_P_FC (3+1) /* port uses 3 bits, FCoE one bit */ -#define FILTER_SEL_WIDTH_VIN_P_FC \ - (6 + 7 + FILTER_SEL_WIDTH_P_FC) /* 6 bits are unused, VF uses 7 bits*/ -#define FILTER_SEL_WIDTH_TAG_P_FC \ - (3 + FILTER_SEL_WIDTH_VIN_P_FC) /* PF uses 3 bits */ -#define FILTER_SEL_WIDTH_VLD_TAG_P_FC (1 + FILTER_SEL_WIDTH_TAG_P_FC) +/* + * c4iw_form_pm_msg - Form a port mapper message with mapping info + */ +static void c4iw_form_pm_msg(struct c4iw_ep *ep, + struct iwpm_sa_data *pm_msg) +{ + memcpy(&pm_msg->loc_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&pm_msg->rem_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); +} -static unsigned int select_ntuple(struct c4iw_dev *dev, struct dst_entry *dst, - struct l2t_entry *l2t) +/* + * c4iw_form_reg_msg - Form a port mapper message with dev info + */ +static void c4iw_form_reg_msg(struct c4iw_dev *dev, + struct iwpm_dev_data *pm_msg) { - unsigned int ntuple = 0; - u32 viid; + memcpy(pm_msg->dev_name, dev->ibdev.name, IWPM_DEVNAME_SIZE); + memcpy(pm_msg->if_name, dev->rdev.lldi.ports[0]->name, + IWPM_IFNAME_SIZE); +} - switch (dev->rdev.lldi.filt_mode) { +static void c4iw_record_pm_msg(struct c4iw_ep *ep, + struct iwpm_sa_data *pm_msg) +{ + memcpy(&ep->com.mapped_local_addr, &pm_msg->mapped_loc_addr, + sizeof(ep->com.mapped_local_addr)); + memcpy(&ep->com.mapped_remote_addr, &pm_msg->mapped_rem_addr, + sizeof(ep->com.mapped_remote_addr)); +} - /* default filter mode */ - case HW_TPL_FR_MT_PR_IV_P_FC: - if (l2t->vlan == VLAN_NONE) - ntuple |= FILTER_SEL_VLAN_NONE << FILTER_SEL_WIDTH_P_FC; - else { - ntuple |= l2t->vlan << FILTER_SEL_WIDTH_P_FC; - ntuple |= 1 << FILTER_SEL_WIDTH_TAG_P_FC; - } - ntuple |= l2t->lport << S_PORT | IPPROTO_TCP << - FILTER_SEL_WIDTH_VLD_TAG_P_FC; - break; - case HW_TPL_FR_MT_PR_OV_P_FC: { - viid = cxgb4_port_viid(l2t->neigh->dev); - - ntuple |= FW_VIID_VIN_GET(viid) << FILTER_SEL_WIDTH_P_FC; - ntuple |= FW_VIID_PFN_GET(viid) << FILTER_SEL_WIDTH_VIN_P_FC; - ntuple |= FW_VIID_VIVLD_GET(viid) << FILTER_SEL_WIDTH_TAG_P_FC; - ntuple |= l2t->lport << S_PORT | IPPROTO_TCP << - FILTER_SEL_WIDTH_VLD_TAG_P_FC; - break; - } - default: - break; - } - return ntuple; +static void best_mtu(const unsigned short *mtus, unsigned short mtu, + unsigned int *idx, int use_ts) +{ + unsigned short hdr_size = sizeof(struct iphdr) + + sizeof(struct tcphdr) + + (use_ts ? 12 : 0); + unsigned short data_size = mtu - hdr_size; + + cxgb4_best_aligned_mtu(mtus, hdr_size, data_size, 8, idx); } static int send_connect(struct c4iw_ep *ep) @@ -586,10 +607,15 @@ static int send_connect(struct c4iw_ep *ep) int sizev6 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ? sizeof(struct cpl_act_open_req6) : sizeof(struct cpl_t5_act_open_req6); - struct sockaddr_in *la = (struct sockaddr_in *)&ep->com.local_addr; - struct sockaddr_in *ra = (struct sockaddr_in *)&ep->com.remote_addr; - struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&ep->com.local_addr; - struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&ep->com.remote_addr; + struct sockaddr_in *la = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + struct sockaddr_in *ra = (struct sockaddr_in *) + &ep->com.mapped_remote_addr; + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *) + &ep->com.mapped_remote_addr; + int win; wrlen = (ep->com.remote_addr.ss_family == AF_INET) ? roundup(sizev4, 16) : @@ -605,8 +631,18 @@ static int send_connect(struct c4iw_ep *ep) } set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); - cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx); + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps); wscale = compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_MASK) + win = RCV_BUFSIZ_MASK; + opt0 = (nocong ? NO_CONG(1) : 0) | KEEP_ALIVE(1) | DELACK(1) | @@ -617,7 +653,7 @@ static int send_connect(struct c4iw_ep *ep) SMAC_SEL(ep->smac_idx) | DSCP(ep->tos) | ULP_MODE(ULP_MODE_TCPDDP) | - RCV_BUFSIZ(rcv_win>>10); + RCV_BUFSIZ(win); opt2 = RX_CHANNEL(0) | CCTRL_ECN(enable_ecn) | RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid); @@ -627,7 +663,11 @@ static int send_connect(struct c4iw_ep *ep) opt2 |= SACK_EN(1); if (wscale && enable_tcp_window_scaling) opt2 |= WND_SCALE_EN(1); - t4_set_arp_err_handler(skb, NULL, act_open_req_arp_failure); + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + opt2 |= T5_OPT_2_VALID; + opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); + } + t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure); if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) { if (ep->com.remote_addr.ss_family == AF_INET) { @@ -641,8 +681,9 @@ static int send_connect(struct c4iw_ep *ep) req->local_ip = la->sin_addr.s_addr; req->peer_ip = ra->sin_addr.s_addr; req->opt0 = cpu_to_be64(opt0); - req->params = cpu_to_be32(select_ntuple(ep->com.dev, - ep->dst, ep->l2t)); + req->params = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); req->opt2 = cpu_to_be32(opt2); } else { req6 = (struct cpl_act_open_req6 *)skb_put(skb, wrlen); @@ -662,12 +703,19 @@ static int send_connect(struct c4iw_ep *ep) req6->peer_ip_lo = *((__be64 *) (ra6->sin6_addr.s6_addr + 8)); req6->opt0 = cpu_to_be64(opt0); - req6->params = cpu_to_be32( - select_ntuple(ep->com.dev, ep->dst, - ep->l2t)); + req6->params = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); req6->opt2 = cpu_to_be32(opt2); } } else { + u32 isn = (prandom_u32() & ~7UL) - 1; + + opt2 |= T5_OPT_2_VALID; + opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */ + if (peer2peer) + isn += 4; + if (ep->com.remote_addr.ss_family == AF_INET) { t5_req = (struct cpl_t5_act_open_req *) skb_put(skb, wrlen); @@ -681,8 +729,12 @@ static int send_connect(struct c4iw_ep *ep) t5_req->peer_ip = ra->sin_addr.s_addr; t5_req->opt0 = cpu_to_be64(opt0); t5_req->params = cpu_to_be64(V_FILTER_TUPLE( - select_ntuple(ep->com.dev, - ep->dst, ep->l2t))); + cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t))); + t5_req->rsvd = cpu_to_be32(isn); + PDBG("%s snd_isn %u\n", __func__, + be32_to_cpu(t5_req->rsvd)); t5_req->opt2 = cpu_to_be32(opt2); } else { t5_req6 = (struct cpl_t5_act_open_req6 *) @@ -703,7 +755,12 @@ static int send_connect(struct c4iw_ep *ep) (ra6->sin6_addr.s6_addr + 8)); t5_req6->opt0 = cpu_to_be64(opt0); t5_req6->params = (__force __be64)cpu_to_be32( - select_ntuple(ep->com.dev, ep->dst, ep->l2t)); + cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + t5_req6->rsvd = cpu_to_be32(isn); + PDBG("%s snd_isn %u\n", __func__, + be32_to_cpu(t5_req6->rsvd)); t5_req6->opt2 = cpu_to_be32(opt2); } } @@ -799,8 +856,9 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, ep->mpa_skb = skb; c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); start_ep_timer(ep); - state_set(&ep->com, MPA_REQ_SENT); + __state_set(&ep->com, MPA_REQ_SENT); ep->mpa_attr.initiator = 1; + ep->snd_seq += mpalen; return; } @@ -880,6 +938,7 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) t4_set_arp_err_handler(skb, NULL, arp_failure_discard); BUG_ON(ep->mpa_skb); ep->mpa_skb = skb; + ep->snd_seq += mpalen; return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } @@ -963,7 +1022,8 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) skb_get(skb); t4_set_arp_err_handler(skb, NULL, arp_failure_discard); ep->mpa_skb = skb; - state_set(&ep->com, MPA_REP_SENT); + __state_set(&ep->com, MPA_REP_SENT); + ep->snd_seq += mpalen; return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } @@ -980,6 +1040,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) PDBG("%s ep %p tid %u snd_isn %u rcv_isn %u\n", __func__, ep, tid, be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn)); + mutex_lock(&ep->com.mutex); dst_confirm(ep->dst); /* setup the hwtid for this connection */ @@ -1003,17 +1064,18 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) send_mpa_req(ep, skb, 1); else send_mpa_req(ep, skb, mpa_rev); - + mutex_unlock(&ep->com.mutex); return 0; } -static void close_complete_upcall(struct c4iw_ep *ep) +static void close_complete_upcall(struct c4iw_ep *ep, int status) { struct iw_cm_event event; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CLOSE; + event.status = status; if (ep->com.cm_id) { PDBG("close complete delivered ep %p cm_id %p tid %u\n", ep, ep->com.cm_id, ep->hwtid); @@ -1027,8 +1089,7 @@ static void close_complete_upcall(struct c4iw_ep *ep) static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) { PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - close_complete_upcall(ep); - state_set(&ep->com, ABORTING); + __state_set(&ep->com, ABORTING); set_bit(ABORT_CONN, &ep->com.history); return send_abort(ep, skb, gfp); } @@ -1106,9 +1167,10 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status) } } -static void connect_request_upcall(struct c4iw_ep *ep) +static int connect_request_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; + int ret; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); memset(&event, 0, sizeof(event)); @@ -1133,15 +1195,14 @@ static void connect_request_upcall(struct c4iw_ep *ep) event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); } - if (state_read(&ep->parent_ep->com) != DEAD) { - c4iw_get_ep(&ep->com); - ep->parent_ep->com.cm_id->event_handler( - ep->parent_ep->com.cm_id, - &event); - } + c4iw_get_ep(&ep->com); + ret = ep->parent_ep->com.cm_id->event_handler(ep->parent_ep->com.cm_id, + &event); + if (ret) + c4iw_put_ep(&ep->com); set_bit(CONNREQ_UPCALL, &ep->com.history); c4iw_put_ep(&ep->parent_ep->com); - ep->parent_ep = NULL; + return ret; } static void established_upcall(struct c4iw_ep *ep) @@ -1173,6 +1234,14 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits) return 0; } + /* + * If we couldn't specify the entire rcv window at connection setup + * due to the limit in the number of bits in the RCV_BUFSIZ field, + * then add the overage in to the credits returned. + */ + if (ep->rcv_win > RCV_BUFSIZ_MASK * 1024) + credits += ep->rcv_win - RCV_BUFSIZ_MASK * 1024; + req = (struct cpl_rx_data_ack *) skb_put(skb, wrlen); memset(req, 0, wrlen); INIT_TP_WR(req, ep->hwtid); @@ -1186,7 +1255,7 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits) return credits; } -static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) +static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) { struct mpa_message *mpa; struct mpa_v2_conn_params *mpa_v2_params; @@ -1196,17 +1265,17 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) struct c4iw_qp_attributes attrs; enum c4iw_qp_attr_mask mask; int err; + int disconnect = 0; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); /* - * Stop mpa timer. If it expired, then the state has - * changed and we bail since ep_timeout already aborted - * the connection. + * Stop mpa timer. If it expired, then + * we ignore the MPA reply. process_timeout() + * will abort the connection. */ - stop_ep_timer(ep); - if (state_read(&ep->com) != MPA_REQ_SENT) - return; + if (stop_ep_timer(ep)) + return 0; /* * If we get more than the supported amount of private data @@ -1228,7 +1297,7 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) * if we don't even have the mpa message, then bail. */ if (ep->mpa_pkt_len < sizeof(*mpa)) - return; + return 0; mpa = (struct mpa_message *) ep->mpa_pkt; /* Validate MPA header. */ @@ -1268,7 +1337,7 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) * We'll continue process when more data arrives. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) - return; + return 0; if (mpa->flags & MPA_REJECT) { err = -ECONNREFUSED; @@ -1280,7 +1349,7 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) * start reply message including private data. And * the MPA header is valid. */ - state_set(&ep->com, FPDU_MODE); + __state_set(&ep->com, FPDU_MODE); ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; @@ -1370,9 +1439,11 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) attrs.layer_etype = LAYER_MPA | DDP_LLP; attrs.ecode = MPA_NOMATCH_RTR; attrs.next_state = C4IW_QP_STATE_TERMINATE; + attrs.send_term = 1; err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, - C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); err = -ENOMEM; + disconnect = 1; goto out; } @@ -1388,18 +1459,20 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) attrs.layer_etype = LAYER_MPA | DDP_LLP; attrs.ecode = MPA_INSUFF_IRD; attrs.next_state = C4IW_QP_STATE_TERMINATE; + attrs.send_term = 1; err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, - C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); err = -ENOMEM; + disconnect = 1; goto out; } goto out; err: - state_set(&ep->com, ABORTING); + __state_set(&ep->com, ABORTING); send_abort(ep, skb, GFP_KERNEL); out: connect_reply_upcall(ep, err); - return; + return disconnect; } static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) @@ -1410,15 +1483,12 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - if (state_read(&ep->com) != MPA_REQ_WAIT) - return; - /* * If we get more than the supported amount of private data * then we must fail this connection. */ if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { - stop_ep_timer(ep); + (void)stop_ep_timer(ep); abort_connection(ep, skb, GFP_KERNEL); return; } @@ -1440,7 +1510,6 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) return; PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); - stop_ep_timer(ep); mpa = (struct mpa_message *) ep->mpa_pkt; /* @@ -1449,13 +1518,13 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) if (mpa->revision > mpa_rev) { printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d," " Received = %d\n", __func__, mpa_rev, mpa->revision); - stop_ep_timer(ep); + (void)stop_ep_timer(ep); abort_connection(ep, skb, GFP_KERNEL); return; } if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { - stop_ep_timer(ep); + (void)stop_ep_timer(ep); abort_connection(ep, skb, GFP_KERNEL); return; } @@ -1466,7 +1535,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) * Fail if there's too much private data. */ if (plen > MPA_MAX_PRIVATE_DATA) { - stop_ep_timer(ep); + (void)stop_ep_timer(ep); abort_connection(ep, skb, GFP_KERNEL); return; } @@ -1475,7 +1544,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) * If plen does not account for pkt size */ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { - stop_ep_timer(ep); + (void)stop_ep_timer(ep); abort_connection(ep, skb, GFP_KERNEL); return; } @@ -1532,10 +1601,24 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, ep->mpa_attr.p2p_type); - state_set(&ep->com, MPA_REQ_RCVD); - - /* drive upcall */ - connect_request_upcall(ep); + /* + * If the endpoint timer already expired, then we ignore + * the start request. process_timeout() will abort + * the connection. + */ + if (!stop_ep_timer(ep)) { + __state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + mutex_lock(&ep->parent_ep->com.mutex); + if (ep->parent_ep->com.state != DEAD) { + if (connect_request_upcall(ep)) + abort_connection(ep, skb, GFP_KERNEL); + } else { + abort_connection(ep, skb, GFP_KERNEL); + } + mutex_unlock(&ep->parent_ep->com.mutex); + } return; } @@ -1547,19 +1630,23 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) unsigned int tid = GET_TID(hdr); struct tid_info *t = dev->rdev.lldi.tids; __u8 status = hdr->status; + int disconnect = 0; ep = lookup_tid(t, tid); + if (!ep) + return 0; PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen); skb_pull(skb, sizeof(*hdr)); skb_trim(skb, dlen); + mutex_lock(&ep->com.mutex); /* update RX credits */ update_rx_credits(ep, dlen); - switch (state_read(&ep->com)) { + switch (ep->com.state) { case MPA_REQ_SENT: ep->rcv_seq += dlen; - process_mpa_reply(ep, skb); + disconnect = process_mpa_reply(ep, skb); break; case MPA_REQ_WAIT: ep->rcv_seq += dlen; @@ -1572,15 +1659,19 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) pr_err("%s Unexpected streaming data." \ " qpid %u ep %p state %d tid %u status %d\n", __func__, ep->com.qp->wq.sq.qid, ep, - state_read(&ep->com), ep->hwtid, status); + ep->com.state, ep->hwtid, status); attrs.next_state = C4IW_QP_STATE_TERMINATE; c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, - C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + disconnect = 1; break; } default: break; } + mutex_unlock(&ep->com.mutex); + if (disconnect) + c4iw_ep_disconnect(ep, 0, GFP_KERNEL); return 0; } @@ -1624,18 +1715,20 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) unsigned int mtu_idx; int wscale; struct sockaddr_in *sin; + int win; skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); req = (struct fw_ofld_connection_wr *)__skb_put(skb, sizeof(*req)); memset(req, 0, sizeof(*req)); req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR)); req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16))); - req->le.filter = cpu_to_be32(select_ntuple(ep->com.dev, ep->dst, + req->le.filter = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], ep->l2t)); - sin = (struct sockaddr_in *)&ep->com.local_addr; + sin = (struct sockaddr_in *)&ep->com.mapped_local_addr; req->le.lport = sin->sin_port; req->le.u.ipv4.lip = sin->sin_addr.s_addr; - sin = (struct sockaddr_in *)&ep->com.remote_addr; + sin = (struct sockaddr_in *)&ep->com.mapped_remote_addr; req->le.pport = sin->sin_port; req->le.u.ipv4.pip = sin->sin_addr.s_addr; req->tcb.t_state_to_astid = @@ -1645,8 +1738,18 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) htons(F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK); req->tcb.tx_max = (__force __be32) jiffies; req->tcb.rcv_adv = htons(1); - cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx); + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps); wscale = compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_MASK) + win = RCV_BUFSIZ_MASK; + req->tcb.opt0 = (__force __be64) (TCAM_BYPASS(1) | (nocong ? NO_CONG(1) : 0) | KEEP_ALIVE(1) | @@ -1658,7 +1761,7 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) SMAC_SEL(ep->smac_idx) | DSCP(ep->tos) | ULP_MODE(ULP_MODE_TCPDDP) | - RCV_BUFSIZ(rcv_win >> 10)); + RCV_BUFSIZ(win)); req->tcb.opt2 = (__force __be32) (PACE(1) | TX_QUEUE(ep->com.dev->rdev.lldi.tx_modq[ep->tx_chan]) | RX_CHANNEL(0) | @@ -1686,6 +1789,22 @@ static inline int act_open_has_tid(int status) status != CPL_ERR_ARP_MISS; } +/* Returns whether a CPL status conveys negative advice. + */ +static int is_neg_adv(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE || + status == CPL_ERR_KEEPALV_NEG_ADVICE; +} + +static void set_tcp_window(struct c4iw_ep *ep, struct port_info *pi) +{ + ep->snd_win = snd_win; + ep->rcv_win = rcv_win; + PDBG("%s snd_win %d rcv_win %d\n", __func__, ep->snd_win, ep->rcv_win); +} + #define ACT_OPEN_RETRY_COUNT 2 static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, @@ -1734,6 +1853,7 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, ep->ctrlq_idx = cxgb4_port_idx(pdev); ep->rss_qid = cdev->rdev.lldi.rxq_ids[ cxgb4_port_idx(pdev) * step]; + set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); dev_put(pdev); } else { pdev = get_real_dev(n->dev); @@ -1742,16 +1862,17 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, if (!ep->l2t) goto out; ep->mtu = dst_mtu(dst); - ep->tx_chan = cxgb4_port_chan(n->dev); - ep->smac_idx = (cxgb4_port_viid(n->dev) & 0x7F) << 1; + ep->tx_chan = cxgb4_port_chan(pdev); + ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1; step = cdev->rdev.lldi.ntxq / cdev->rdev.lldi.nchan; - ep->txq_idx = cxgb4_port_idx(n->dev) * step; - ep->ctrlq_idx = cxgb4_port_idx(n->dev); + ep->txq_idx = cxgb4_port_idx(pdev) * step; + ep->ctrlq_idx = cxgb4_port_idx(pdev); step = cdev->rdev.lldi.nrxq / cdev->rdev.lldi.nchan; ep->rss_qid = cdev->rdev.lldi.rxq_ids[ - cxgb4_port_idx(n->dev) * step]; + cxgb4_port_idx(pdev) * step]; + set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); if (clear_mpa_v1) { ep->retry_with_mpa_v1 = 0; @@ -1866,15 +1987,15 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct sockaddr_in6 *ra6; ep = lookup_atid(t, atid); - la = (struct sockaddr_in *)&ep->com.local_addr; - ra = (struct sockaddr_in *)&ep->com.remote_addr; - la6 = (struct sockaddr_in6 *)&ep->com.local_addr; - ra6 = (struct sockaddr_in6 *)&ep->com.remote_addr; + la = (struct sockaddr_in *)&ep->com.mapped_local_addr; + ra = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + la6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + ra6 = (struct sockaddr_in6 *)&ep->com.mapped_remote_addr; PDBG("%s ep %p atid %u status %u errno %d\n", __func__, ep, atid, status, status2errno(status)); - if (status == CPL_ERR_RTX_NEG_ADVICE) { + if (is_neg_adv(status)) { printk(KERN_WARNING MOD "Connection problems for atid %u\n", atid); return 0; @@ -1982,13 +2103,36 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, u64 opt0; u32 opt2; int wscale; + struct cpl_t5_pass_accept_rpl *rpl5 = NULL; + int win; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); BUG_ON(skb_cloned(skb)); - skb_trim(skb, sizeof(*rpl)); + skb_get(skb); - cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx); + rpl = cplhdr(skb); + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + skb_trim(skb, roundup(sizeof(*rpl5), 16)); + rpl5 = (void *)rpl; + INIT_TP_WR(rpl5, ep->hwtid); + } else { + skb_trim(skb, sizeof(*rpl)); + INIT_TP_WR(rpl, ep->hwtid); + } + OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, + ep->hwtid)); + + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps && req->tcpopt.tstamp); wscale = compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_MASK) + win = RCV_BUFSIZ_MASK; opt0 = (nocong ? NO_CONG(1) : 0) | KEEP_ALIVE(1) | DELACK(1) | @@ -1999,7 +2143,7 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, SMAC_SEL(ep->smac_idx) | DSCP(ep->tos >> 2) | ULP_MODE(ULP_MODE_TCPDDP) | - RCV_BUFSIZ(rcv_win>>10); + RCV_BUFSIZ(win); opt2 = RX_CHANNEL(0) | RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid); @@ -2018,11 +2162,19 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, if (tcph->ece && tcph->cwr) opt2 |= CCTRL_ECN(1); } + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + u32 isn = (prandom_u32() & ~7UL) - 1; + opt2 |= T5_OPT_2_VALID; + opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); + opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */ + rpl5 = (void *)rpl; + memset(&rpl5->iss, 0, roundup(sizeof(*rpl5)-sizeof(*rpl), 16)); + if (peer2peer) + isn += 4; + rpl5->iss = cpu_to_be32(isn); + PDBG("%s iss %u\n", __func__, be32_to_cpu(rpl5->iss)); + } - rpl = cplhdr(skb); - INIT_TP_WR(rpl, ep->hwtid); - OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, - ep->hwtid)); rpl->opt0 = cpu_to_be64(opt0); rpl->opt2 = cpu_to_be32(opt2); set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); @@ -2037,7 +2189,6 @@ static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb) PDBG("%s c4iw_dev %p tid %u\n", __func__, dev, hwtid); BUG_ON(skb_cloned(skb)); skb_trim(skb, sizeof(struct cpl_tid_release)); - skb_get(skb); release_tid(&dev->rdev, hwtid, skb); return; } @@ -2087,6 +2238,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) int err; u16 peer_mss = ntohs(req->tcpopt.mss); int iptype; + unsigned short hdrs; parent_ep = lookup_stid(t, stid); if (!parent_ep) { @@ -2144,8 +2296,10 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) goto reject; } - if (peer_mss && child_ep->mtu > (peer_mss + 40)) - child_ep->mtu = peer_mss + 40; + hdrs = sizeof(struct iphdr) + sizeof(struct tcphdr) + + ((enable_tcp_timestamps && req->tcpopt.tstamp) ? 12 : 0); + if (peer_mss && child_ep->mtu > (peer_mss + hdrs)) + child_ep->mtu = peer_mss + hdrs; state_set(&child_ep->com, CONNECTING); child_ep->com.dev = dev; @@ -2279,13 +2433,13 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) disconnect = 0; break; case MORIBUND: - stop_ep_timer(ep); + (void)stop_ep_timer(ep); if (ep->com.cm_id && ep->com.qp) { attrs.next_state = C4IW_QP_STATE_IDLE; c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } - close_complete_upcall(ep); + close_complete_upcall(ep, 0); __state_set(&ep->com, DEAD); release = 1; disconnect = 0; @@ -2304,15 +2458,6 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } -/* - * Returns whether an ABORT_REQ_RSS message is a negative advice. - */ -static int is_neg_adv_abort(unsigned int status) -{ - return status == CPL_ERR_RTX_NEG_ADVICE || - status == CPL_ERR_PERSIST_NEG_ADVICE; -} - static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_abort_req_rss *req = cplhdr(skb); @@ -2326,7 +2471,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) unsigned int tid = GET_TID(req); ep = lookup_tid(t, tid); - if (is_neg_adv_abort(req->status)) { + if (is_neg_adv(req->status)) { PDBG("%s neg_adv_abort ep %p tid %u\n", __func__, ep, ep->hwtid); return 0; @@ -2348,10 +2493,10 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) case CONNECTING: break; case MPA_REQ_WAIT: - stop_ep_timer(ep); + (void)stop_ep_timer(ep); break; case MPA_REQ_SENT: - stop_ep_timer(ep); + (void)stop_ep_timer(ep); if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1)) connect_reply_upcall(ep, -ECONNRESET); else { @@ -2456,7 +2601,7 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) __state_set(&ep->com, MORIBUND); break; case MORIBUND: - stop_ep_timer(ep); + (void)stop_ep_timer(ep); if ((ep->com.cm_id) && (ep->com.qp)) { attrs.next_state = C4IW_QP_STATE_IDLE; c4iw_modify_qp(ep->com.qp->rhp, @@ -2464,7 +2609,7 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } - close_complete_upcall(ep); + close_complete_upcall(ep, 0); __state_set(&ep->com, DEAD); release = 1; break; @@ -2539,22 +2684,28 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb) int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) { - int err; + int err = 0; + int disconnect = 0; struct c4iw_ep *ep = to_ep(cm_id); PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - if (state_read(&ep->com) == DEAD) { + mutex_lock(&ep->com.mutex); + if (ep->com.state == DEAD) { + mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return -ECONNRESET; } set_bit(ULP_REJECT, &ep->com.history); - BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + BUG_ON(ep->com.state != MPA_REQ_RCVD); if (mpa_rev == 0) abort_connection(ep, NULL, GFP_KERNEL); else { err = send_mpa_reject(ep, pdata, pdata_len); - err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + disconnect = 1; } + mutex_unlock(&ep->com.mutex); + if (disconnect) + err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL); c4iw_put_ep(&ep->com); return 0; } @@ -2569,12 +2720,14 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct c4iw_qp *qp = get_qhp(h, conn_param->qpn); PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - if (state_read(&ep->com) == DEAD) { + + mutex_lock(&ep->com.mutex); + if (ep->com.state == DEAD) { err = -ECONNRESET; goto err; } - BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + BUG_ON(ep->com.state != MPA_REQ_RCVD); BUG_ON(!qp); set_bit(ULP_ACCEPT, &ep->com.history); @@ -2643,14 +2796,16 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (err) goto err1; - state_set(&ep->com, FPDU_MODE); + __state_set(&ep->com, FPDU_MODE); established_upcall(ep); + mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return 0; err1: ep->com.cm_id = NULL; cm_id->rem_ref(cm_id); err: + mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return err; } @@ -2721,13 +2876,15 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_ep *ep; int err = 0; - struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; - struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; - struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *)&cm_id->local_addr; - struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *) - &cm_id->remote_addr; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct sockaddr_in6 *laddr6; + struct sockaddr_in6 *raddr6; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; __u8 *ra; int iptype; + int iwpm_err = 0; if ((conn_param->ord > c4iw_max_read_depth) || (conn_param->ird > c4iw_max_read_depth)) { @@ -2758,7 +2915,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (!ep->com.qp) { PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn); err = -EINVAL; - goto fail2; + goto fail1; } ref_qp(ep); PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn, @@ -2771,10 +2928,50 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (ep->atid == -1) { printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); err = -ENOMEM; - goto fail2; + goto fail1; } insert_handle(dev, &dev->atid_idr, ep, ep->atid); + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + memcpy(&ep->com.remote_addr, &cm_id->remote_addr, + sizeof(ep->com.remote_addr)); + + /* No port mapper available, go with the specified peer information */ + memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, + sizeof(ep->com.mapped_local_addr)); + memcpy(&ep->com.mapped_remote_addr, &cm_id->remote_addr, + sizeof(ep->com.mapped_remote_addr)); + + c4iw_form_reg_msg(dev, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); + if (iwpm_err) { + PDBG("%s: Port Mapper reg pid fail (err = %d).\n", + __func__, iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + c4iw_form_pm_msg(ep, &pm_msg); + iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_C4IW); + if (iwpm_err) + PDBG("%s: Port Mapper query fail (err = %d).\n", + __func__, iwpm_err); + else + c4iw_record_pm_msg(ep, &pm_msg); + } + if (iwpm_create_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { + iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); + err = -ENOMEM; + goto fail1; + } + print_addr(&ep->com, __func__, "add_query/create_mapinfo"); + set_bit(RELEASE_MAPINFO, &ep->com.flags); + + laddr = (struct sockaddr_in *)&ep->com.mapped_local_addr; + raddr = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + laddr6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + raddr6 = (struct sockaddr_in6 *) &ep->com.mapped_remote_addr; + if (cm_id->remote_addr.ss_family == AF_INET) { iptype = 4; ra = (__u8 *)&raddr->sin_addr; @@ -2785,7 +2982,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) { err = pick_local_ipaddrs(dev, cm_id); if (err) - goto fail2; + goto fail1; } /* find a route */ @@ -2805,7 +3002,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) { err = pick_local_ip6addrs(dev, cm_id); if (err) - goto fail2; + goto fail1; } /* find a route */ @@ -2821,13 +3018,13 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (!ep->dst) { printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); err = -EHOSTUNREACH; - goto fail3; + goto fail2; } err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true); if (err) { printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); - goto fail4; + goto fail3; } PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", @@ -2836,10 +3033,6 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) state_set(&ep->com, CONNECTING); ep->tos = 0; - memcpy(&ep->com.local_addr, &cm_id->local_addr, - sizeof(ep->com.local_addr)); - memcpy(&ep->com.remote_addr, &cm_id->remote_addr, - sizeof(ep->com.remote_addr)); /* send connect request to rnic */ err = send_connect(ep); @@ -2847,12 +3040,12 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) goto out; cxgb4_l2t_release(ep->l2t); -fail4: - dst_release(ep->dst); fail3: + dst_release(ep->dst); +fail2: remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); -fail2: +fail1: cm_id->rem_ref(cm_id); c4iw_put_ep(&ep->com); out: @@ -2862,7 +3055,8 @@ out: static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) { int err; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ep->com.local_addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; c4iw_init_wr_wait(&ep->com.wr_wait); err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0], @@ -2883,7 +3077,8 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) { int err; - struct sockaddr_in *sin = (struct sockaddr_in *)&ep->com.local_addr; + struct sockaddr_in *sin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; if (dev->rdev.lldi.enable_fw_ofld_conn) { do { @@ -2918,6 +3113,9 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) int err = 0; struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_listen_ep *ep; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int iwpm_err = 0; might_sleep(); @@ -2938,7 +3136,8 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) /* * Allocate a server TID. */ - if (dev->rdev.lldi.enable_fw_ofld_conn) + if (dev->rdev.lldi.enable_fw_ofld_conn && + ep->com.local_addr.ss_family == AF_INET) ep->stid = cxgb4_alloc_sftid(dev->rdev.lldi.tids, cm_id->local_addr.ss_family, ep); else @@ -2951,6 +3150,37 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) goto fail2; } insert_handle(dev, &dev->stid_idr, ep, ep->stid); + + /* No port mapper available, go with the specified info */ + memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, + sizeof(ep->com.mapped_local_addr)); + + c4iw_form_reg_msg(dev, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); + if (iwpm_err) { + PDBG("%s: Port Mapper reg pid fail (err = %d).\n", + __func__, iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + memcpy(&pm_msg.loc_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_C4IW); + if (iwpm_err) + PDBG("%s: Port Mapper query fail (err = %d).\n", + __func__, iwpm_err); + else + memcpy(&ep->com.mapped_local_addr, + &pm_msg.mapped_loc_addr, + sizeof(ep->com.mapped_local_addr)); + } + if (iwpm_create_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { + err = -ENOMEM; + goto fail3; + } + print_addr(&ep->com, __func__, "add_mapping/create_mapinfo"); + + set_bit(RELEASE_MAPINFO, &ep->com.flags); state_set(&ep->com, LISTEN); if (ep->com.local_addr.ss_family == AF_INET) err = create_server4(dev, ep); @@ -2960,6 +3190,8 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->provider_data = ep; goto out; } + +fail3: cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, ep->com.local_addr.ss_family); fail2: @@ -3018,7 +3250,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) rdev = &ep->com.dev->rdev; if (c4iw_fatal_error(rdev)) { fatal = 1; - close_complete_upcall(ep); + close_complete_upcall(ep, -EIO); ep->com.state = DEAD; } switch (ep->com.state) { @@ -3040,7 +3272,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) { close = 1; if (abrupt) { - stop_ep_timer(ep); + (void)stop_ep_timer(ep); ep->com.state = ABORTING; } else ep->com.state = MORIBUND; @@ -3060,7 +3292,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) if (close) { if (abrupt) { set_bit(EP_DISC_ABORT, &ep->com.history); - close_complete_upcall(ep); + close_complete_upcall(ep, -ECONNRESET); ret = send_abort(ep, NULL, gfp); } else { set_bit(EP_DISC_CLOSE, &ep->com.history); @@ -3241,6 +3473,7 @@ static void send_fw_pass_open_req(struct c4iw_dev *dev, struct sk_buff *skb, struct sk_buff *req_skb; struct fw_ofld_connection_wr *req; struct cpl_pass_accept_req *cpl = cplhdr(skb); + int ret; req_skb = alloc_skb(sizeof(struct fw_ofld_connection_wr), GFP_KERNEL); req = (struct fw_ofld_connection_wr *)__skb_put(req_skb, sizeof(*req)); @@ -3277,7 +3510,13 @@ static void send_fw_pass_open_req(struct c4iw_dev *dev, struct sk_buff *skb, req->cookie = (unsigned long)skb; set_wr_txq(req_skb, CPL_PRIORITY_CONTROL, port_id); - cxgb4_ofld_send(dev->rdev.lldi.ports[0], req_skb); + ret = cxgb4_ofld_send(dev->rdev.lldi.ports[0], req_skb); + if (ret < 0) { + pr_err("%s - cxgb4_ofld_send error %d - dropping\n", __func__, + ret); + kfree_skb(skb); + kfree_skb(req_skb); + } } /* @@ -3323,9 +3562,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) /* * Calculate the server tid from filter hit index from cpl_rx_pkt. */ - stid = (__force int) cpu_to_be32((__force u32) rss->hash_val) - - dev->rdev.lldi.tids->sftid_base - + dev->rdev.lldi.tids->nstids; + stid = (__force int) cpu_to_be32((__force u32) rss->hash_val); lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid); if (!lep) { @@ -3386,6 +3623,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) pi = (struct port_info *)netdev_priv(pdev); tx_chan = cxgb4_port_chan(pdev); } + neigh_release(neigh); if (!e) { pr_err("%s - failed to allocate l2t entry!\n", __func__); @@ -3397,7 +3635,9 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) window = (__force u16) htons((__force u16)tcph->window); /* Calcuate filter portion for LE region. */ - filter = (__force unsigned int) cpu_to_be32(select_ntuple(dev, dst, e)); + filter = (__force unsigned int) cpu_to_be32(cxgb4_select_ntuple( + dev->rdev.lldi.ports[0], + e)); /* * Synthesize the cpl_pass_accept_req. We have everything except the @@ -3464,15 +3704,26 @@ static void process_timeout(struct c4iw_ep *ep) &attrs, 1); } __state_set(&ep->com, ABORTING); + close_complete_upcall(ep, -ETIMEDOUT); + break; + case ABORTING: + case DEAD: + + /* + * These states are expected if the ep timed out at the same + * time as another thread was calling stop_ep_timer(). + * So we silently do nothing for these states. + */ + abort = 0; break; default: WARN(1, "%s unexpected state ep %p tid %u state %u\n", __func__, ep, ep->hwtid, ep->com.state); abort = 0; } - mutex_unlock(&ep->com.mutex); if (abort) abort_connection(ep, NULL, GFP_KERNEL); + mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); } @@ -3486,6 +3737,8 @@ static void process_timedout_eps(void) tmp = timeout_list.next; list_del(tmp); + tmp->next = NULL; + tmp->prev = NULL; spin_unlock_irq(&timeout_lock); ep = list_entry(tmp, struct c4iw_ep, entry); process_timeout(ep); @@ -3502,6 +3755,7 @@ static void process_work(struct work_struct *work) unsigned int opcode; int ret; + process_timedout_eps(); while ((skb = skb_dequeue(&rxq))) { rpl = cplhdr(skb); dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); @@ -3511,8 +3765,8 @@ static void process_work(struct work_struct *work) ret = work_handlers[opcode](dev, skb); if (!ret) kfree_skb(skb); + process_timedout_eps(); } - process_timedout_eps(); } static DECLARE_WORK(skb_work, process_work); @@ -3524,8 +3778,13 @@ static void ep_timeout(unsigned long arg) spin_lock(&timeout_lock); if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { - list_add_tail(&ep->entry, &timeout_list); - kickit = 1; + /* + * Only insert if it is not already on the list. + */ + if (!ep->entry.next) { + list_add_tail(&ep->entry, &timeout_list); + kickit = 1; + } } spin_unlock(&timeout_lock); if (kickit) @@ -3607,7 +3866,7 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) kfree_skb(skb); return 0; } - if (is_neg_adv_abort(req->status)) { + if (is_neg_adv(req->status)) { PDBG("%s neg_adv_abort ep %p tid %u\n", __func__, ep, ep->hwtid); kfree_skb(skb); @@ -3666,7 +3925,7 @@ int __init c4iw_cm_init(void) return 0; } -void __exit c4iw_cm_term(void) +void c4iw_cm_term(void) { WARN_ON(!list_empty(&timeout_list)); flush_workqueue(workq); diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 88de3aa9c5b..c04292c950f 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -134,7 +134,8 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, V_FW_RI_RES_WR_IQANUS(0) | V_FW_RI_RES_WR_IQANUD(1) | F_FW_RI_RES_WR_IQANDST | - V_FW_RI_RES_WR_IQANDSTINDEX(*rdev->lldi.rxq_ids)); + V_FW_RI_RES_WR_IQANDSTINDEX( + rdev->lldi.ciq_ids[cq->vector])); res->u.cq.iqdroprss_to_iqesize = cpu_to_be16( F_FW_RI_RES_WR_IQDROPRSS | V_FW_RI_RES_WR_IQPCIECH(2) | @@ -235,27 +236,21 @@ int c4iw_flush_sq(struct c4iw_qp *qhp) struct t4_cq *cq = &chp->cq; int idx; struct t4_swsqe *swsqe; - int error = (qhp->attr.state != C4IW_QP_STATE_CLOSING && - qhp->attr.state != C4IW_QP_STATE_IDLE); if (wq->sq.flush_cidx == -1) wq->sq.flush_cidx = wq->sq.cidx; idx = wq->sq.flush_cidx; BUG_ON(idx >= wq->sq.size); while (idx != wq->sq.pidx) { - if (error) { - swsqe = &wq->sq.sw_sq[idx]; - BUG_ON(swsqe->flushed); - swsqe->flushed = 1; - insert_sq_cqe(wq, cq, swsqe); - if (wq->sq.oldest_read == swsqe) { - BUG_ON(swsqe->opcode != FW_RI_READ_REQ); - advance_oldest_read(wq); - } - flushed++; - } else { - t4_sq_consume(wq); + swsqe = &wq->sq.sw_sq[idx]; + BUG_ON(swsqe->flushed); + swsqe->flushed = 1; + insert_sq_cqe(wq, cq, swsqe); + if (wq->sq.oldest_read == swsqe) { + BUG_ON(swsqe->opcode != FW_RI_READ_REQ); + advance_oldest_read(wq); } + flushed++; if (++idx == wq->sq.size) idx = 0; } @@ -365,8 +360,14 @@ void c4iw_flush_hw_cq(struct c4iw_cq *chp) if (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP) { - /* - * drop peer2peer RTR reads. + /* If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(hw_cqe) == 1) + goto next_cqe; + + /* drop peer2peer RTR reads. */ if (CQE_WRID_STAG(hw_cqe) == 1) goto next_cqe; @@ -511,8 +512,18 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, */ if (RQ_TYPE(hw_cqe) && (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP)) { - /* - * If this is an unsolicited read response, then the read + /* If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(hw_cqe) == 1) { + if (CQE_STATUS(hw_cqe)) + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* If this is an unsolicited read response, then the read * was generated by the kernel driver as part of peer-2-peer * connection setup. So ignore the completion. */ @@ -603,7 +614,7 @@ proc_cqe: */ if (SQ_TYPE(hw_cqe)) { int idx = CQE_WRID_SQ_IDX(hw_cqe); - BUG_ON(idx > wq->sq.size); + BUG_ON(idx >= wq->sq.size); /* * Account for any unsignaled completions completed by @@ -617,7 +628,7 @@ proc_cqe: wq->sq.in_use -= wq->sq.size + idx - wq->sq.cidx; else wq->sq.in_use -= idx - wq->sq.cidx; - BUG_ON(wq->sq.in_use < 0 && wq->sq.in_use < wq->sq.size); + BUG_ON(wq->sq.in_use <= 0 && wq->sq.in_use >= wq->sq.size); wq->sq.cidx = (uint16_t)idx; PDBG("%s completing sq idx %u\n", __func__, wq->sq.cidx); @@ -662,7 +673,7 @@ skip_cqe: static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) { struct c4iw_qp *qhp = NULL; - struct t4_cqe cqe = {0, 0}, *rd_cqe; + struct t4_cqe uninitialized_var(cqe), *rd_cqe; struct t4_wq *wq; u32 credit = 0; u8 cqe_flushed; @@ -860,6 +871,9 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, rhp = to_c4iw_dev(ibdev); + if (vector >= rhp->rdev.lldi.nciq) + return ERR_PTR(-EINVAL); + chp = kzalloc(sizeof(*chp), GFP_KERNEL); if (!chp) return ERR_PTR(-ENOMEM); @@ -881,7 +895,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, /* * Make actual HW queue 2x to avoid cdix_inc overflows. */ - hwentries = entries * 2; + hwentries = min(entries * 2, T4_MAX_IQ_SIZE); /* * Make HW queue at least 64 entries so GTS updates aren't too @@ -905,6 +919,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, } chp->cq.size = hwentries; chp->cq.memsize = memsize; + chp->cq.vector = vector; ret = create_cq(&rhp->rdev, &chp->cq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx); @@ -940,7 +955,8 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, uresp.gts_key = ucontext->key; ucontext->key += PAGE_SIZE; spin_unlock(&ucontext->mmap_lock); - ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); if (ret) goto err5; diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 33d2cc6ab56..7db82b24302 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -64,6 +64,10 @@ struct uld_ctx { static LIST_HEAD(uld_ctx_list); static DEFINE_MUTEX(dev_mutex); +#define DB_FC_RESUME_SIZE 64 +#define DB_FC_RESUME_DELAY 1 +#define DB_FC_DRAIN_THRESH 0 + static struct dentry *c4iw_debugfs_root; struct c4iw_debugfs_data { @@ -73,6 +77,16 @@ struct c4iw_debugfs_data { int pos; }; +/* registered cxgb4 netlink callbacks */ +static struct ibnl_client_cbs c4iw_nl_cb_table[] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; + static int count_idrs(int id, void *p, void *data) { int *countp = data; @@ -109,35 +123,49 @@ static int dump_qp(int id, void *p, void *data) &qp->ep->com.local_addr; struct sockaddr_in *rsin = (struct sockaddr_in *) &qp->ep->com.remote_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &qp->ep->com.mapped_local_addr; + struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) + &qp->ep->com.mapped_remote_addr; cc = snprintf(qpd->buf + qpd->pos, space, "rc qp sq id %u rq id %u state %u " "onchip %u ep tid %u state %u " - "%pI4:%u->%pI4:%u\n", + "%pI4:%u/%u->%pI4:%u/%u\n", qp->wq.sq.qid, qp->wq.rq.qid, (int)qp->attr.state, qp->wq.sq.flags & T4_SQ_ONCHIP, qp->ep->hwtid, (int)qp->ep->com.state, &lsin->sin_addr, ntohs(lsin->sin_port), - &rsin->sin_addr, ntohs(rsin->sin_port)); + ntohs(mapped_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(mapped_rsin->sin_port)); } else { struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) &qp->ep->com.local_addr; struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) &qp->ep->com.remote_addr; + struct sockaddr_in6 *mapped_lsin6 = + (struct sockaddr_in6 *) + &qp->ep->com.mapped_local_addr; + struct sockaddr_in6 *mapped_rsin6 = + (struct sockaddr_in6 *) + &qp->ep->com.mapped_remote_addr; cc = snprintf(qpd->buf + qpd->pos, space, "rc qp sq id %u rq id %u state %u " "onchip %u ep tid %u state %u " - "%pI6:%u->%pI6:%u\n", + "%pI6:%u/%u->%pI6:%u/%u\n", qp->wq.sq.qid, qp->wq.rq.qid, (int)qp->attr.state, qp->wq.sq.flags & T4_SQ_ONCHIP, qp->ep->hwtid, (int)qp->ep->com.state, &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port), &rsin6->sin6_addr, - ntohs(rsin6->sin6_port)); + ntohs(rsin6->sin6_port), + ntohs(mapped_rsin6->sin6_port)); } } else cc = snprintf(qpd->buf + qpd->pos, space, @@ -282,7 +310,7 @@ static const struct file_operations stag_debugfs_fops = { .llseek = default_llseek, }; -static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY"}; +static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY", "STOPPED"}; static int stats_show(struct seq_file *seq, void *v) { @@ -311,9 +339,10 @@ static int stats_show(struct seq_file *seq, void *v) seq_printf(seq, " DB FULL: %10llu\n", dev->rdev.stats.db_full); seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty); seq_printf(seq, " DB DROP: %10llu\n", dev->rdev.stats.db_drop); - seq_printf(seq, " DB State: %s Transitions %llu\n", + seq_printf(seq, " DB State: %s Transitions %llu FC Interruptions %llu\n", db_state_str[dev->db_state], - dev->rdev.stats.db_state_transitions); + dev->rdev.stats.db_state_transitions, + dev->rdev.stats.db_fc_interruptions); seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full); seq_printf(seq, "ACT_OFLD_CONN_FAILS: %10llu\n", dev->rdev.stats.act_ofld_conn_fails); @@ -381,31 +410,43 @@ static int dump_ep(int id, void *p, void *data) &ep->com.local_addr; struct sockaddr_in *rsin = (struct sockaddr_in *) &ep->com.remote_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) + &ep->com.mapped_remote_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p qp %p state %d flags 0x%lx " "history 0x%lx hwtid %d atid %d " - "%pI4:%d <-> %pI4:%d\n", + "%pI4:%d/%d <-> %pI4:%d/%d\n", ep, ep->com.cm_id, ep->com.qp, (int)ep->com.state, ep->com.flags, ep->com.history, ep->hwtid, ep->atid, &lsin->sin_addr, ntohs(lsin->sin_port), - &rsin->sin_addr, ntohs(rsin->sin_port)); + ntohs(mapped_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(mapped_rsin->sin_port)); } else { struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) &ep->com.local_addr; struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) &ep->com.remote_addr; + struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + struct sockaddr_in6 *mapped_rsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_remote_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p qp %p state %d flags 0x%lx " "history 0x%lx hwtid %d atid %d " - "%pI6:%d <-> %pI6:%d\n", + "%pI6:%d/%d <-> %pI6:%d/%d\n", ep, ep->com.cm_id, ep->com.qp, (int)ep->com.state, ep->com.flags, ep->com.history, ep->hwtid, ep->atid, &lsin6->sin6_addr, ntohs(lsin6->sin6_port), - &rsin6->sin6_addr, ntohs(rsin6->sin6_port)); + ntohs(mapped_lsin6->sin6_port), + &rsin6->sin6_addr, ntohs(rsin6->sin6_port), + ntohs(mapped_rsin6->sin6_port)); } if (cc < space) epd->pos += cc; @@ -426,23 +467,29 @@ static int dump_listen_ep(int id, void *p, void *data) if (ep->com.local_addr.ss_family == AF_INET) { struct sockaddr_in *lsin = (struct sockaddr_in *) &ep->com.local_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p state %d flags 0x%lx stid %d " - "backlog %d %pI4:%d\n", + "backlog %d %pI4:%d/%d\n", ep, ep->com.cm_id, (int)ep->com.state, ep->com.flags, ep->stid, ep->backlog, - &lsin->sin_addr, ntohs(lsin->sin_port)); + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(mapped_lsin->sin_port)); } else { struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) &ep->com.local_addr; + struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p state %d flags 0x%lx stid %d " - "backlog %d %pI6:%d\n", + "backlog %d %pI6:%d/%d\n", ep, ep->com.cm_id, (int)ep->com.state, ep->com.flags, ep->stid, ep->backlog, - &lsin6->sin6_addr, ntohs(lsin6->sin6_port)); + &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port)); } if (cc < space) epd->pos += cc; @@ -602,10 +649,10 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.start, rdev->lldi.vr->cq.size); - PDBG("udb len 0x%x udb base %p db_reg %p gts_reg %p qpshift %lu " + PDBG("udb len 0x%x udb base %llx db_reg %p gts_reg %p qpshift %lu " "qpmask 0x%x cqshift %lu cqmask 0x%x\n", (unsigned)pci_resource_len(rdev->lldi.pdev, 2), - (void *)(unsigned long)pci_resource_start(rdev->lldi.pdev, 2), + (u64)pci_resource_start(rdev->lldi.pdev, 2), rdev->lldi.db_reg, rdev->lldi.gts_reg, rdev->qpshift, rdev->qpmask, @@ -643,6 +690,13 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err); goto err4; } + rdev->status_page = (struct t4_dev_status_page *) + __get_free_page(GFP_KERNEL); + if (!rdev->status_page) { + pr_err(MOD "error allocating status page\n"); + goto err4; + } + rdev->status_page->db_off = 0; return 0; err4: c4iw_rqtpool_destroy(rdev); @@ -656,6 +710,7 @@ err1: static void c4iw_rdev_close(struct c4iw_rdev *rdev) { + free_page((unsigned long)rdev->status_page); c4iw_pblpool_destroy(rdev); c4iw_rqtpool_destroy(rdev); c4iw_destroy_resource(&rdev->resource); @@ -670,7 +725,10 @@ static void c4iw_dealloc(struct uld_ctx *ctx) idr_destroy(&ctx->dev->hwtid_idr); idr_destroy(&ctx->dev->stid_idr); idr_destroy(&ctx->dev->atid_idr); - iounmap(ctx->dev->rdev.oc_mw_kva); + if (ctx->dev->rdev.bar2_kva) + iounmap(ctx->dev->rdev.bar2_kva); + if (ctx->dev->rdev.oc_mw_kva) + iounmap(ctx->dev->rdev.oc_mw_kva); ib_dealloc_device(&ctx->dev->ibdev); ctx->dev = NULL; } @@ -703,18 +761,6 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) pr_info("%s: On-Chip Queues not supported on this device.\n", pci_name(infop->pdev)); - if (!is_t4(infop->adapter_type)) { - if (!allow_db_fc_on_t5) { - db_fc_threshold = 100000; - pr_info("DB Flow Control Disabled.\n"); - } - - if (!allow_db_coalescing_on_t5) { - db_coalescing_threshold = -1; - pr_info("DB Coalescing Disabled.\n"); - } - } - devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp)); if (!devp) { printk(KERN_ERR MOD "Cannot allocate ib device\n"); @@ -722,11 +768,33 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) } devp->rdev.lldi = *infop; - devp->rdev.oc_mw_pa = pci_resource_start(devp->rdev.lldi.pdev, 2) + - (pci_resource_len(devp->rdev.lldi.pdev, 2) - - roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size)); - devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa, - devp->rdev.lldi.vr->ocq.size); + /* + * For T5 devices, we map all of BAR2 with WC. + * For T4 devices with onchip qp mem, we map only that part + * of BAR2 with WC. + */ + devp->rdev.bar2_pa = pci_resource_start(devp->rdev.lldi.pdev, 2); + if (is_t5(devp->rdev.lldi.adapter_type)) { + devp->rdev.bar2_kva = ioremap_wc(devp->rdev.bar2_pa, + pci_resource_len(devp->rdev.lldi.pdev, 2)); + if (!devp->rdev.bar2_kva) { + pr_err(MOD "Unable to ioremap BAR2\n"); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(-EINVAL); + } + } else if (ocqp_supported(infop)) { + devp->rdev.oc_mw_pa = + pci_resource_start(devp->rdev.lldi.pdev, 2) + + pci_resource_len(devp->rdev.lldi.pdev, 2) - + roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size); + devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa, + devp->rdev.lldi.vr->ocq.size); + if (!devp->rdev.oc_mw_kva) { + pr_err(MOD "Unable to ioremap onchip mem\n"); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(-EINVAL); + } + } PDBG(KERN_INFO MOD "ocq memory: " "hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n", @@ -749,6 +817,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) spin_lock_init(&devp->lock); mutex_init(&devp->rdev.stats.lock); mutex_init(&devp->db_mutex); + INIT_LIST_HEAD(&devp->db_fc_list); if (c4iw_debugfs_root) { devp->debugfs_root = debugfs_create_dir( @@ -756,6 +825,8 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) c4iw_debugfs_root); setup_debugfs(devp); } + + return devp; } @@ -897,11 +968,13 @@ static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp, } opcode = *(u8 *)rsp; - if (c4iw_handlers[opcode]) + if (c4iw_handlers[opcode]) { c4iw_handlers[opcode](dev, skb); - else + } else { pr_info("%s no handler opcode 0x%x...\n", __func__, opcode); + kfree_skb(skb); + } return 0; nomem: @@ -977,13 +1050,16 @@ static int disable_qp_db(int id, void *p, void *data) static void stop_queues(struct uld_ctx *ctx) { - spin_lock_irq(&ctx->dev->lock); - if (ctx->dev->db_state == NORMAL) { - ctx->dev->rdev.stats.db_state_transitions++; - ctx->dev->db_state = FLOW_CONTROL; + unsigned long flags; + + spin_lock_irqsave(&ctx->dev->lock, flags); + ctx->dev->rdev.stats.db_state_transitions++; + ctx->dev->db_state = STOPPED; + if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL); - } - spin_unlock_irq(&ctx->dev->lock); + else + ctx->dev->rdev.status_page->db_off = 1; + spin_unlock_irqrestore(&ctx->dev->lock, flags); } static int enable_qp_db(int id, void *p, void *data) @@ -994,15 +1070,72 @@ static int enable_qp_db(int id, void *p, void *data) return 0; } +static void resume_rc_qp(struct c4iw_qp *qp) +{ + spin_lock(&qp->lock); + t4_ring_sq_db(&qp->wq, qp->wq.sq.wq_pidx_inc, + is_t5(qp->rhp->rdev.lldi.adapter_type), NULL); + qp->wq.sq.wq_pidx_inc = 0; + t4_ring_rq_db(&qp->wq, qp->wq.rq.wq_pidx_inc, + is_t5(qp->rhp->rdev.lldi.adapter_type), NULL); + qp->wq.rq.wq_pidx_inc = 0; + spin_unlock(&qp->lock); +} + +static void resume_a_chunk(struct uld_ctx *ctx) +{ + int i; + struct c4iw_qp *qp; + + for (i = 0; i < DB_FC_RESUME_SIZE; i++) { + qp = list_first_entry(&ctx->dev->db_fc_list, struct c4iw_qp, + db_fc_entry); + list_del_init(&qp->db_fc_entry); + resume_rc_qp(qp); + if (list_empty(&ctx->dev->db_fc_list)) + break; + } +} + static void resume_queues(struct uld_ctx *ctx) { spin_lock_irq(&ctx->dev->lock); - if (ctx->dev->qpcnt <= db_fc_threshold && - ctx->dev->db_state == FLOW_CONTROL) { - ctx->dev->db_state = NORMAL; - ctx->dev->rdev.stats.db_state_transitions++; - idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL); + if (ctx->dev->db_state != STOPPED) + goto out; + ctx->dev->db_state = FLOW_CONTROL; + while (1) { + if (list_empty(&ctx->dev->db_fc_list)) { + WARN_ON(ctx->dev->db_state != FLOW_CONTROL); + ctx->dev->db_state = NORMAL; + ctx->dev->rdev.stats.db_state_transitions++; + if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) { + idr_for_each(&ctx->dev->qpidr, enable_qp_db, + NULL); + } else { + ctx->dev->rdev.status_page->db_off = 0; + } + break; + } else { + if (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) + < (ctx->dev->rdev.lldi.dbfifo_int_thresh << + DB_FC_DRAIN_THRESH)) { + resume_a_chunk(ctx); + } + if (!list_empty(&ctx->dev->db_fc_list)) { + spin_unlock_irq(&ctx->dev->lock); + if (DB_FC_RESUME_DELAY) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(DB_FC_RESUME_DELAY); + } + spin_lock_irq(&ctx->dev->lock); + if (ctx->dev->db_state != FLOW_CONTROL) + break; + } + } } +out: + if (ctx->dev->db_state != NORMAL) + ctx->dev->rdev.stats.db_fc_interruptions++; spin_unlock_irq(&ctx->dev->lock); } @@ -1028,12 +1161,12 @@ static int count_qps(int id, void *p, void *data) return 0; } -static void deref_qps(struct qp_list qp_list) +static void deref_qps(struct qp_list *qp_list) { int idx; - for (idx = 0; idx < qp_list.idx; idx++) - c4iw_qp_rem_ref(&qp_list.qps[idx]->ibqp); + for (idx = 0; idx < qp_list->idx; idx++) + c4iw_qp_rem_ref(&qp_list->qps[idx]->ibqp); } static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list) @@ -1044,17 +1177,22 @@ static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list) for (idx = 0; idx < qp_list->idx; idx++) { struct c4iw_qp *qp = qp_list->qps[idx]; + spin_lock_irq(&qp->rhp->lock); + spin_lock(&qp->lock); ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0], qp->wq.sq.qid, t4_sq_host_wq_pidx(&qp->wq), t4_sq_wq_size(&qp->wq)); if (ret) { - printk(KERN_ERR MOD "%s: Fatal error - " + pr_err(KERN_ERR MOD "%s: Fatal error - " "DB overflow recovery failed - " "error syncing SQ qid %u\n", pci_name(ctx->lldi.pdev), qp->wq.sq.qid); + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); return; } + qp->wq.sq.wq_pidx_inc = 0; ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0], qp->wq.rq.qid, @@ -1062,12 +1200,17 @@ static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list) t4_rq_wq_size(&qp->wq)); if (ret) { - printk(KERN_ERR MOD "%s: Fatal error - " + pr_err(KERN_ERR MOD "%s: Fatal error - " "DB overflow recovery failed - " "error syncing RQ qid %u\n", pci_name(ctx->lldi.pdev), qp->wq.rq.qid); + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); return; } + qp->wq.rq.wq_pidx_inc = 0; + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); /* Wait for the dbfifo to drain */ while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) { @@ -1083,36 +1226,22 @@ static void recover_queues(struct uld_ctx *ctx) struct qp_list qp_list; int ret; - /* lock out kernel db ringers */ - mutex_lock(&ctx->dev->db_mutex); - - /* put all queues in to recovery mode */ - spin_lock_irq(&ctx->dev->lock); - ctx->dev->db_state = RECOVERY; - ctx->dev->rdev.stats.db_state_transitions++; - idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL); - spin_unlock_irq(&ctx->dev->lock); - /* slow everybody down */ set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(usecs_to_jiffies(1000)); - /* Wait for the dbfifo to completely drain. */ - while (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) > 0) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(usecs_to_jiffies(10)); - } - /* flush the SGE contexts */ ret = cxgb4_flush_eq_cache(ctx->dev->rdev.lldi.ports[0]); if (ret) { printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n", pci_name(ctx->lldi.pdev)); - goto out; + return; } /* Count active queues so we can build a list of queues to recover */ spin_lock_irq(&ctx->dev->lock); + WARN_ON(ctx->dev->db_state != STOPPED); + ctx->dev->db_state = RECOVERY; idr_for_each(&ctx->dev->qpidr, count_qps, &count); qp_list.qps = kzalloc(count * sizeof *qp_list.qps, GFP_ATOMIC); @@ -1120,7 +1249,7 @@ static void recover_queues(struct uld_ctx *ctx) printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n", pci_name(ctx->lldi.pdev)); spin_unlock_irq(&ctx->dev->lock); - goto out; + return; } qp_list.idx = 0; @@ -1133,29 +1262,13 @@ static void recover_queues(struct uld_ctx *ctx) recover_lost_dbs(ctx, &qp_list); /* we're almost done! deref the qps and clean up */ - deref_qps(qp_list); + deref_qps(&qp_list); kfree(qp_list.qps); - /* Wait for the dbfifo to completely drain again */ - while (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) > 0) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(usecs_to_jiffies(10)); - } - - /* resume the queues */ spin_lock_irq(&ctx->dev->lock); - if (ctx->dev->qpcnt > db_fc_threshold) - ctx->dev->db_state = FLOW_CONTROL; - else { - ctx->dev->db_state = NORMAL; - idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL); - } - ctx->dev->rdev.stats.db_state_transitions++; + WARN_ON(ctx->dev->db_state != RECOVERY); + ctx->dev->db_state = STOPPED; spin_unlock_irq(&ctx->dev->lock); - -out: - /* start up kernel db ringers again */ - mutex_unlock(&ctx->dev->db_mutex); } static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...) @@ -1165,9 +1278,7 @@ static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...) switch (control) { case CXGB4_CONTROL_DB_FULL: stop_queues(ctx); - mutex_lock(&ctx->dev->rdev.stats.lock); ctx->dev->rdev.stats.db_full++; - mutex_unlock(&ctx->dev->rdev.stats.lock); break; case CXGB4_CONTROL_DB_EMPTY: resume_queues(ctx); @@ -1210,6 +1321,20 @@ static int __init c4iw_init_module(void) printk(KERN_WARNING MOD "could not create debugfs entry, continuing\n"); + if (ibnl_add_client(RDMA_NL_C4IW, RDMA_NL_IWPM_NUM_OPS, + c4iw_nl_cb_table)) + pr_err("%s[%u]: Failed to add netlink callback\n" + , __func__, __LINE__); + + err = iwpm_init(RDMA_NL_C4IW); + if (err) { + pr_err("port mapper initialization failed with %d\n", err); + ibnl_remove_client(RDMA_NL_C4IW); + c4iw_cm_term(); + debugfs_remove_recursive(c4iw_debugfs_root); + return err; + } + cxgb4_register_uld(CXGB4_ULD_RDMA, &c4iw_uld_info); return 0; @@ -1227,6 +1352,8 @@ static void __exit c4iw_exit_module(void) } mutex_unlock(&dev_mutex); cxgb4_unregister_uld(CXGB4_ULD_RDMA); + iwpm_exit(RDMA_NL_C4IW); + ibnl_remove_client(RDMA_NL_C4IW); c4iw_cm_term(); debugfs_remove_recursive(c4iw_debugfs_root); } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 23eaeabab93..361fff7a074 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -52,6 +52,8 @@ #include <rdma/ib_verbs.h> #include <rdma/iw_cm.h> +#include <rdma/rdma_netlink.h> +#include <rdma/iw_portmap.h> #include "cxgb4.h" #include "cxgb4_uld.h" @@ -109,6 +111,7 @@ struct c4iw_dev_ucontext { enum c4iw_rdev_flags { T4_FATAL_ERROR = (1<<0), + T4_STATUS_PAGE_DISABLED = (1<<1), }; struct c4iw_stat { @@ -130,6 +133,7 @@ struct c4iw_stats { u64 db_empty; u64 db_drop; u64 db_state_transitions; + u64 db_fc_interruptions; u64 tcam_full; u64 act_ofld_conn_fails; u64 pas_ofld_conn_fails; @@ -147,9 +151,12 @@ struct c4iw_rdev { struct gen_pool *ocqp_pool; u32 flags; struct cxgb4_lld_info lldi; + unsigned long bar2_pa; + void __iomem *bar2_kva; unsigned long oc_mw_pa; void __iomem *oc_mw_kva; struct c4iw_stats stats; + struct t4_dev_status_page *status_page; }; static inline int c4iw_fatal_error(struct c4iw_rdev *rdev) @@ -211,7 +218,8 @@ static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev, enum db_state { NORMAL = 0, FLOW_CONTROL = 1, - RECOVERY = 2 + RECOVERY = 2, + STOPPED = 3 }; struct c4iw_dev { @@ -225,10 +233,10 @@ struct c4iw_dev { struct mutex db_mutex; struct dentry *debugfs_root; enum db_state db_state; - int qpcnt; struct idr hwtid_idr; struct idr atid_idr; struct idr stid_idr; + struct list_head db_fc_list; }; static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev) @@ -369,6 +377,7 @@ struct c4iw_fr_page_list { DEFINE_DMA_UNMAP_ADDR(mapping); dma_addr_t dma_addr; struct c4iw_dev *dev; + int pll_len; }; static inline struct c4iw_fr_page_list *to_c4iw_fr_page_list( @@ -428,10 +437,12 @@ struct c4iw_qp_attributes { u8 ecode; u16 sq_db_inc; u16 rq_db_inc; + u8 send_term; }; struct c4iw_qp { struct ib_qp ibqp; + struct list_head db_fc_entry; struct c4iw_dev *rhp; struct c4iw_ep *ep; struct c4iw_qp_attributes attr; @@ -441,6 +452,7 @@ struct c4iw_qp { atomic_t refcnt; wait_queue_head_t wait; struct timer_list timer; + int sq_sig_all; }; static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) @@ -718,6 +730,7 @@ enum c4iw_ep_flags { CLOSE_SENT = 3, TIMEOUT = 4, QP_REFERENCED = 5, + RELEASE_MAPINFO = 6, }; enum c4iw_ep_history { @@ -754,6 +767,8 @@ struct c4iw_ep_common { struct mutex mutex; struct sockaddr_storage local_addr; struct sockaddr_storage remote_addr; + struct sockaddr_storage mapped_local_addr; + struct sockaddr_storage mapped_remote_addr; struct c4iw_wr_wait wr_wait; unsigned long flags; unsigned long history; @@ -795,7 +810,48 @@ struct c4iw_ep { u8 retry_with_mpa_v1; u8 tried_with_mpa_v1; unsigned int retry_count; -}; + int snd_win; + int rcv_win; +}; + +static inline void print_addr(struct c4iw_ep_common *epc, const char *func, + const char *msg) +{ + +#define SINA(a) (&(((struct sockaddr_in *)(a))->sin_addr.s_addr)) +#define SINP(a) ntohs(((struct sockaddr_in *)(a))->sin_port) +#define SIN6A(a) (&(((struct sockaddr_in6 *)(a))->sin6_addr)) +#define SIN6P(a) ntohs(((struct sockaddr_in6 *)(a))->sin6_port) + + if (c4iw_debug) { + switch (epc->local_addr.ss_family) { + case AF_INET: + PDBG("%s %s %pI4:%u/%u <-> %pI4:%u/%u\n", + func, msg, SINA(&epc->local_addr), + SINP(&epc->local_addr), + SINP(&epc->mapped_local_addr), + SINA(&epc->remote_addr), + SINP(&epc->remote_addr), + SINP(&epc->mapped_remote_addr)); + break; + case AF_INET6: + PDBG("%s %s %pI6:%u/%u <-> %pI6:%u/%u\n", + func, msg, SIN6A(&epc->local_addr), + SIN6P(&epc->local_addr), + SIN6P(&epc->mapped_local_addr), + SIN6A(&epc->remote_addr), + SIN6P(&epc->remote_addr), + SIN6P(&epc->mapped_remote_addr)); + break; + default: + break; + } + } +#undef SINA +#undef SINP +#undef SIN6A +#undef SIN6P +} static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) { @@ -852,7 +908,7 @@ int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev); int c4iw_register_device(struct c4iw_dev *dev); void c4iw_unregister_device(struct c4iw_dev *dev); int __init c4iw_cm_init(void); -void __exit c4iw_cm_term(void); +void c4iw_cm_term(void); void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 4cb8eb24497..ec7a2988a70 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -37,9 +37,9 @@ #include "iw_cxgb4.h" -int use_dsgl = 1; +int use_dsgl = 0; module_param(use_dsgl, int, 0644); -MODULE_PARM_DESC(use_dsgl, "Use DSGL for PBL/FastReg (default=1)"); +MODULE_PARM_DESC(use_dsgl, "Use DSGL for PBL/FastReg (default=0)"); #define T4_ULPTX_MIN_IO 32 #define C4IW_MAX_INLINE_SIZE 96 @@ -76,7 +76,7 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, INIT_ULPTX_WR(req, wr_len, 0, 0); req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR) | (wait ? FW_WR_COMPL(1) : 0)); - req->wr.wr_lo = wait ? (__force __be64)&wr_wait : 0; + req->wr.wr_lo = wait ? (__force __be64)(unsigned long) &wr_wait : 0L; req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16(DIV_ROUND_UP(wr_len, 16))); req->cmd = cpu_to_be32(ULPTX_CMD(ULP_TX_MEM_WRITE)); req->cmd |= cpu_to_be32(V_T5_ULP_MEMIO_ORDER(1)); @@ -173,7 +173,7 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len, return ret; } -int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data) +static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data) { u32 remain = len; u32 dmalen; @@ -259,8 +259,12 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry, if ((!reset_tpt_entry) && (*stag == T4_STAG_UNSET)) { stag_idx = c4iw_get_resource(&rdev->resource.tpt_table); - if (!stag_idx) + if (!stag_idx) { + mutex_lock(&rdev->stats.lock); + rdev->stats.stag.fail++; + mutex_unlock(&rdev->stats.lock); return -ENOMEM; + } mutex_lock(&rdev->stats.lock); rdev->stats.stag.cur += 32; if (rdev->stats.stag.cur > rdev->stats.stag.max) @@ -678,9 +682,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { __be64 *pages; int shift, n, len; - int i, j, k; + int i, k, entry; int err = 0; - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct c4iw_dev *rhp; struct c4iw_pd *php; struct c4iw_mr *mhp; @@ -710,10 +714,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, shift = ffs(mhp->umem->page_size) - 1; - n = 0; - list_for_each_entry(chunk, &mhp->umem->chunk_list, list) - n += chunk->nents; - + n = mhp->umem->nmap; err = alloc_pbl(mhp, n); if (err) goto err; @@ -726,24 +727,22 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, i = n = 0; - list_for_each_entry(chunk, &mhp->umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = cpu_to_be64(sg_dma_address( - &chunk->page_list[j]) + - mhp->umem->page_size * k); - if (i == PAGE_SIZE / sizeof *pages) { - err = write_pbl(&mhp->rhp->rdev, - pages, - mhp->attr.pbl_addr + (n << 3), i); - if (err) - goto pbl_done; - n += i; - i = 0; - } + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address(sg) + + mhp->umem->page_size * k); + if (i == PAGE_SIZE / sizeof *pages) { + err = write_pbl(&mhp->rhp->rdev, + pages, + mhp->attr.pbl_addr + (n << 3), i); + if (err) + goto pbl_done; + n += i; + i = 0; } } + } if (i) err = write_pbl(&mhp->rhp->rdev, pages, @@ -903,7 +902,11 @@ struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(struct ib_device *device, dma_unmap_addr_set(c4pl, mapping, dma_addr); c4pl->dma_addr = dma_addr; c4pl->dev = dev; - c4pl->ibpl.max_page_list_len = pll_len; + c4pl->pll_len = pll_len; + + PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n", + __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list, + &c4pl->dma_addr); return &c4pl->ibpl; } @@ -912,8 +915,12 @@ void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *ibpl) { struct c4iw_fr_page_list *c4pl = to_c4iw_fr_page_list(ibpl); + PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n", + __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list, + &c4pl->dma_addr); + dma_free_coherent(&c4pl->dev->rdev.lldi.pdev->dev, - c4pl->ibpl.max_page_list_len, + c4pl->pll_len, c4pl->ibpl.page_list, dma_unmap_addr(c4pl, mapping)); kfree(c4pl); } diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 7e94c9a656a..b1d305338de 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -106,15 +106,57 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, { struct c4iw_ucontext *context; struct c4iw_dev *rhp = to_c4iw_dev(ibdev); + static int warned; + struct c4iw_alloc_ucontext_resp uresp; + int ret = 0; + struct c4iw_mm_entry *mm = NULL; PDBG("%s ibdev %p\n", __func__, ibdev); context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); + if (!context) { + ret = -ENOMEM; + goto err; + } + c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx); INIT_LIST_HEAD(&context->mmaps); spin_lock_init(&context->mmap_lock); + + if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) { + if (!warned++) + pr_err(MOD "Warning - downlevel libcxgb4 (non-fatal), device status page disabled."); + rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED; + } else { + mm = kmalloc(sizeof(*mm), GFP_KERNEL); + if (!mm) { + ret = -ENOMEM; + goto err_free; + } + + uresp.status_page_size = PAGE_SIZE; + + spin_lock(&context->mmap_lock); + uresp.status_page_key = context->key; + context->key += PAGE_SIZE; + spin_unlock(&context->mmap_lock); + + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); + if (ret) + goto err_mm; + + mm->key = uresp.status_page_key; + mm->addr = virt_to_phys(rhp->rdev.status_page); + mm->len = PAGE_SIZE; + insert_mmap(context, mm); + } return &context->ibucontext; +err_mm: + kfree(mm); +err_free: + kfree(context); +err: + return ERR_PTR(ret); } static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) @@ -287,7 +329,7 @@ static int c4iw_query_device(struct ib_device *ibdev, props->max_mr = c4iw_num_stags(&dev->rdev); props->max_pd = T4_MAX_NUM_PD; props->local_ca_ack_delay = 0; - props->max_fast_reg_page_list_len = T4_MAX_FR_DEPTH; + props->max_fast_reg_page_list_len = t4_max_fr_depth(use_dsgl); return 0; } @@ -458,7 +500,7 @@ int c4iw_register_device(struct c4iw_dev *dev) dev->ibdev.node_type = RDMA_NODE_RNIC; memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC)); dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports; - dev->ibdev.num_comp_vectors = 1; + dev->ibdev.num_comp_vectors = dev->rdev.lldi.nciq; dev->ibdev.dma_device = &(dev->rdev.lldi.pdev->dev); dev->ibdev.query_device = c4iw_query_device; dev->ibdev.query_port = c4iw_query_port; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 582936708e6..086f62f5dc9 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -212,13 +212,23 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, wq->db = rdev->lldi.db_reg; wq->gts = rdev->lldi.gts_reg; - if (user) { - wq->sq.udb = (u64)pci_resource_start(rdev->lldi.pdev, 2) + - (wq->sq.qid << rdev->qpshift); - wq->sq.udb &= PAGE_MASK; - wq->rq.udb = (u64)pci_resource_start(rdev->lldi.pdev, 2) + - (wq->rq.qid << rdev->qpshift); - wq->rq.udb &= PAGE_MASK; + if (user || is_t5(rdev->lldi.adapter_type)) { + u32 off; + + off = (wq->sq.qid << rdev->qpshift) & PAGE_MASK; + if (user) { + wq->sq.udb = (u64 __iomem *)(rdev->bar2_pa + off); + } else { + off += 128 * (wq->sq.qid & rdev->qpmask) + 8; + wq->sq.udb = (u64 __iomem *)(rdev->bar2_kva + off); + } + off = (wq->rq.qid << rdev->qpshift) & PAGE_MASK; + if (user) { + wq->rq.udb = (u64 __iomem *)(rdev->bar2_pa + off); + } else { + off += 128 * (wq->rq.qid & rdev->qpmask) + 8; + wq->rq.udb = (u64 __iomem *)(rdev->bar2_kva + off); + } } wq->rdev = rdev; wq->rq.msn = 1; @@ -299,9 +309,10 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, if (ret) goto free_dma; - PDBG("%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%llx rqudb 0x%llx\n", + PDBG("%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%lx rqudb 0x%lx\n", __func__, wq->sq.qid, wq->rq.qid, wq->db, - (unsigned long long)wq->sq.udb, (unsigned long long)wq->rq.udb); + (__force unsigned long) wq->sq.udb, + (__force unsigned long) wq->rq.udb); return 0; free_dma: @@ -425,6 +436,8 @@ static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, default: return -EINVAL; } + wqe->send.r3 = 0; + wqe->send.r4 = 0; plen = 0; if (wr->num_sge) { @@ -555,7 +568,8 @@ static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe, int pbllen = roundup(wr->wr.fast_reg.page_list_len * sizeof(u64), 32); int rem; - if (wr->wr.fast_reg.page_list_len > T4_MAX_FR_DEPTH) + if (wr->wr.fast_reg.page_list_len > + t4_max_fr_depth(use_dsgl)) return -EINVAL; wqe->fr.qpbinde_to_dcacpu = 0; @@ -638,6 +652,48 @@ void c4iw_qp_rem_ref(struct ib_qp *qp) wake_up(&(to_c4iw_qp(qp)->wait)); } +static void add_to_fc_list(struct list_head *head, struct list_head *entry) +{ + if (list_empty(entry)) + list_add_tail(entry, head); +} + +static int ring_kernel_sq_db(struct c4iw_qp *qhp, u16 inc) +{ + unsigned long flags; + + spin_lock_irqsave(&qhp->rhp->lock, flags); + spin_lock(&qhp->lock); + if (qhp->rhp->db_state == NORMAL) + t4_ring_sq_db(&qhp->wq, inc, + is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL); + else { + add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); + qhp->wq.sq.wq_pidx_inc += inc; + } + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&qhp->rhp->lock, flags); + return 0; +} + +static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc) +{ + unsigned long flags; + + spin_lock_irqsave(&qhp->rhp->lock, flags); + spin_lock(&qhp->lock); + if (qhp->rhp->db_state == NORMAL) + t4_ring_rq_db(&qhp->wq, inc, + is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL); + else { + add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); + qhp->wq.rq.wq_pidx_inc += inc; + } + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&qhp->rhp->lock, flags); + return 0; +} + int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { @@ -646,7 +702,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, enum fw_wr_opcodes fw_opcode = 0; enum fw_ri_wr_flags fw_flags; struct c4iw_qp *qhp; - union t4_wr *wqe; + union t4_wr *wqe = NULL; u32 num_wrs; struct t4_swsqe *swsqe; unsigned long flag; @@ -675,7 +731,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fw_flags = 0; if (wr->send_flags & IB_SEND_SOLICITED) fw_flags |= FW_RI_SOLICITED_EVENT_FLAG; - if (wr->send_flags & IB_SEND_SIGNALED) + if (wr->send_flags & IB_SEND_SIGNALED || qhp->sq_sig_all) fw_flags |= FW_RI_COMPLETION_FLAG; swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; switch (wr->opcode) { @@ -736,7 +792,8 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } swsqe->idx = qhp->wq.sq.pidx; swsqe->complete = 0; - swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED); + swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED) || + qhp->sq_sig_all; swsqe->flushed = 0; swsqe->wr_id = wr->wr_id; @@ -750,9 +807,14 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, t4_sq_produce(&qhp->wq, len16); idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); } - if (t4_wq_db_enabled(&qhp->wq)) - t4_ring_sq_db(&qhp->wq, idx); - spin_unlock_irqrestore(&qhp->lock, flag); + if (!qhp->rhp->rdev.status_page->db_off) { + t4_ring_sq_db(&qhp->wq, idx, + is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe); + spin_unlock_irqrestore(&qhp->lock, flag); + } else { + spin_unlock_irqrestore(&qhp->lock, flag); + ring_kernel_sq_db(qhp, idx); + } return err; } @@ -761,7 +823,7 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, { int err = 0; struct c4iw_qp *qhp; - union t4_recv_wr *wqe; + union t4_recv_wr *wqe = NULL; u32 num_wrs; u8 len16 = 0; unsigned long flag; @@ -812,9 +874,14 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, wr = wr->next; num_wrs--; } - if (t4_wq_db_enabled(&qhp->wq)) - t4_ring_rq_db(&qhp->wq, idx); - spin_unlock_irqrestore(&qhp->lock, flag); + if (!qhp->rhp->rdev.status_page->db_off) { + t4_ring_rq_db(&qhp->wq, idx, + is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe); + spin_unlock_irqrestore(&qhp->lock, flag); + } else { + spin_unlock_irqrestore(&qhp->lock, flag); + ring_kernel_rq_db(qhp, idx); + } return err; } @@ -1200,35 +1267,6 @@ out: return ret; } -/* - * Called by the library when the qp has user dbs disabled due to - * a DB_FULL condition. This function will single-thread all user - * DB rings to avoid overflowing the hw db-fifo. - */ -static int ring_kernel_db(struct c4iw_qp *qhp, u32 qid, u16 inc) -{ - int delay = db_delay_usecs; - - mutex_lock(&qhp->rhp->db_mutex); - do { - - /* - * The interrupt threshold is dbfifo_int_thresh << 6. So - * make sure we don't cross that and generate an interrupt. - */ - if (cxgb4_dbfifo_count(qhp->rhp->rdev.lldi.ports[0], 1) < - (qhp->rhp->rdev.lldi.dbfifo_int_thresh << 5)) { - writel(QID(qid) | PIDX(inc), qhp->wq.db); - break; - } - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(usecs_to_jiffies(delay)); - delay = min(delay << 1, 2000); - } while (1); - mutex_unlock(&qhp->rhp->db_mutex); - return 0; -} - int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, enum c4iw_qp_attr_mask mask, struct c4iw_qp_attributes *attrs, @@ -1278,11 +1316,11 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, } if (mask & C4IW_QP_ATTR_SQ_DB) { - ret = ring_kernel_db(qhp, qhp->wq.sq.qid, attrs->sq_db_inc); + ret = ring_kernel_sq_db(qhp, attrs->sq_db_inc); goto out; } if (mask & C4IW_QP_ATTR_RQ_DB) { - ret = ring_kernel_db(qhp, qhp->wq.rq.qid, attrs->rq_db_inc); + ret = ring_kernel_rq_db(qhp, attrs->rq_db_inc); goto out; } @@ -1332,6 +1370,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, switch (attrs->next_state) { case C4IW_QP_STATE_CLOSING: BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); + t4_set_wq_in_error(&qhp->wq); set_state(qhp, C4IW_QP_STATE_CLOSING); ep = qhp->ep; if (!internal) { @@ -1339,30 +1378,30 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, disconnect = 1; c4iw_get_ep(&qhp->ep->com); } - t4_set_wq_in_error(&qhp->wq); ret = rdma_fini(rhp, qhp, ep); if (ret) goto err; break; case C4IW_QP_STATE_TERMINATE: + t4_set_wq_in_error(&qhp->wq); set_state(qhp, C4IW_QP_STATE_TERMINATE); qhp->attr.layer_etype = attrs->layer_etype; qhp->attr.ecode = attrs->ecode; - t4_set_wq_in_error(&qhp->wq); ep = qhp->ep; - disconnect = 1; - if (!internal) + if (!internal) { + c4iw_get_ep(&qhp->ep->com); terminate = 1; - else { + disconnect = 1; + } else { + terminate = qhp->attr.send_term; ret = rdma_fini(rhp, qhp, ep); if (ret) goto err; } - c4iw_get_ep(&qhp->ep->com); break; case C4IW_QP_STATE_ERROR: - set_state(qhp, C4IW_QP_STATE_ERROR); t4_set_wq_in_error(&qhp->wq); + set_state(qhp, C4IW_QP_STATE_ERROR); if (!internal) { abort = 1; disconnect = 1; @@ -1465,14 +1504,6 @@ out: return ret; } -static int enable_qp_db(int id, void *p, void *data) -{ - struct c4iw_qp *qp = p; - - t4_enable_wq_db(&qp->wq); - return 0; -} - int c4iw_destroy_qp(struct ib_qp *ib_qp) { struct c4iw_dev *rhp; @@ -1490,22 +1521,15 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp) c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); wait_event(qhp->wait, !qhp->ep); - spin_lock_irq(&rhp->lock); - remove_handle_nolock(rhp, &rhp->qpidr, qhp->wq.sq.qid); - rhp->qpcnt--; - BUG_ON(rhp->qpcnt < 0); - if (rhp->qpcnt <= db_fc_threshold && rhp->db_state == FLOW_CONTROL) { - rhp->rdev.stats.db_state_transitions++; - rhp->db_state = NORMAL; - idr_for_each(&rhp->qpidr, enable_qp_db, NULL); - } - if (db_coalescing_threshold >= 0) - if (rhp->qpcnt <= db_coalescing_threshold) - cxgb4_enable_db_coalescing(rhp->rdev.lldi.ports[0]); - spin_unlock_irq(&rhp->lock); + remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); atomic_dec(&qhp->refcnt); wait_event(qhp->wait, !atomic_read(&qhp->refcnt)); + spin_lock_irq(&rhp->lock); + if (!list_empty(&qhp->db_fc_entry)) + list_del_init(&qhp->db_fc_entry); + spin_unlock_irq(&rhp->lock); + ucontext = ib_qp->uobject ? to_c4iw_ucontext(ib_qp->uobject->context) : NULL; destroy_qp(&rhp->rdev, &qhp->wq, @@ -1516,14 +1540,6 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp) return 0; } -static int disable_qp_db(int id, void *p, void *data) -{ - struct c4iw_qp *qp = p; - - t4_disable_wq_db(&qp->wq); - return 0; -} - struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, struct ib_udata *udata) { @@ -1533,7 +1549,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, struct c4iw_cq *schp; struct c4iw_cq *rchp; struct c4iw_create_qp_resp uresp; - int sqsize, rqsize; + unsigned int sqsize, rqsize; struct c4iw_ucontext *ucontext; int ret; struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4, *mm5 = NULL; @@ -1605,25 +1621,13 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, qhp->attr.enable_bind = 1; qhp->attr.max_ord = 1; qhp->attr.max_ird = 1; + qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR; spin_lock_init(&qhp->lock); mutex_init(&qhp->mutex); init_waitqueue_head(&qhp->wait); atomic_set(&qhp->refcnt, 1); - spin_lock_irq(&rhp->lock); - if (rhp->db_state != NORMAL) - t4_disable_wq_db(&qhp->wq); - rhp->qpcnt++; - if (rhp->qpcnt > db_fc_threshold && rhp->db_state == NORMAL) { - rhp->rdev.stats.db_state_transitions++; - rhp->db_state = FLOW_CONTROL; - idr_for_each(&rhp->qpidr, disable_qp_db, NULL); - } - if (db_coalescing_threshold >= 0) - if (rhp->qpcnt > db_coalescing_threshold) - cxgb4_disable_db_coalescing(rhp->rdev.lldi.ports[0]); - ret = insert_handle_nolock(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid); - spin_unlock_irq(&rhp->lock); + ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid); if (ret) goto err2; @@ -1692,11 +1696,11 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, mm2->len = PAGE_ALIGN(qhp->wq.rq.memsize); insert_mmap(ucontext, mm2); mm3->key = uresp.sq_db_gts_key; - mm3->addr = qhp->wq.sq.udb; + mm3->addr = (__force unsigned long) qhp->wq.sq.udb; mm3->len = PAGE_SIZE; insert_mmap(ucontext, mm3); mm4->key = uresp.rq_db_gts_key; - mm4->addr = qhp->wq.rq.udb; + mm4->addr = (__force unsigned long) qhp->wq.rq.udb; mm4->len = PAGE_SIZE; insert_mmap(ucontext, mm4); if (mm5) { @@ -1709,6 +1713,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, } qhp->ibqp.qp_num = qhp->wq.sq.qid; init_timer(&(qhp->timer)); + INIT_LIST_HEAD(&qhp->db_fc_entry); PDBG("%s qhp %p sq_num_entries %d, rq_num_entries %d qpid 0x%0x\n", __func__, qhp, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, qhp->wq.sq.qid); @@ -1772,11 +1777,15 @@ int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, /* * Use SQ_PSN and RQ_PSN to pass in IDX_INC values for * ringing the queue db when we're in DB_FULL mode. + * Only allow this on T4 devices. */ attrs.sq_db_inc = attr->sq_psn; attrs.rq_db_inc = attr->rq_psn; mask |= (attr_mask & IB_QP_SQ_PSN) ? C4IW_QP_ATTR_SQ_DB : 0; mask |= (attr_mask & IB_QP_RQ_PSN) ? C4IW_QP_ATTR_RQ_DB : 0; + if (is_t5(to_c4iw_qp(ibqp)->rhp->rdev.lldi.adapter_type) && + (mask & (C4IW_QP_ATTR_SQ_DB|C4IW_QP_ATTR_RQ_DB))) + return -EINVAL; return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0); } diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c index cdef4d7fb6d..67df71a7012 100644 --- a/drivers/infiniband/hw/cxgb4/resource.c +++ b/drivers/infiniband/hw/cxgb4/resource.c @@ -179,8 +179,12 @@ u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) kfree(entry); } else { qid = c4iw_get_resource(&rdev->resource.qid_table); - if (!qid) + if (!qid) { + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.fail++; + mutex_unlock(&rdev->stats.lock); goto out; + } mutex_lock(&rdev->stats.lock); rdev->stats.qid.cur += rdev->qpmask + 1; mutex_unlock(&rdev->stats.lock); @@ -322,8 +326,8 @@ u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size) unsigned long addr = gen_pool_alloc(rdev->rqt_pool, size << 6); PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6); if (!addr) - printk_ratelimited(KERN_WARNING MOD "%s: Out of RQT memory\n", - pci_name(rdev->lldi.pdev)); + pr_warn_ratelimited(MOD "%s: Out of RQT memory\n", + pci_name(rdev->lldi.pdev)); mutex_lock(&rdev->stats.lock); if (addr) { rdev->stats.rqt.cur += roundup(size << 6, 1 << MIN_RQT_SHIFT); diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index e73ace73918..68b0a6bf4eb 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -84,7 +84,14 @@ struct t4_status_page { sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge)) #define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - \ sizeof(struct fw_ri_immd)) & ~31UL) -#define T4_MAX_FR_DEPTH (1024 / sizeof(u64)) +#define T4_MAX_FR_IMMD_DEPTH (T4_MAX_FR_IMMD / sizeof(u64)) +#define T4_MAX_FR_DSGL 1024 +#define T4_MAX_FR_DSGL_DEPTH (T4_MAX_FR_DSGL / sizeof(u64)) + +static inline int t4_max_fr_depth(int use_dsgl) +{ + return use_dsgl ? T4_MAX_FR_DSGL_DEPTH : T4_MAX_FR_IMMD_DEPTH; +} #define T4_RQ_NUM_SLOTS 2 #define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS) @@ -292,7 +299,7 @@ struct t4_sq { unsigned long phys_addr; struct t4_swsqe *sw_sq; struct t4_swsqe *oldest_read; - u64 udb; + u64 __iomem *udb; size_t memsize; u32 qid; u16 in_use; @@ -300,6 +307,7 @@ struct t4_sq { u16 cidx; u16 pidx; u16 wq_pidx; + u16 wq_pidx_inc; u16 flags; short flush_cidx; }; @@ -313,7 +321,7 @@ struct t4_rq { dma_addr_t dma_addr; DEFINE_DMA_UNMAP_ADDR(mapping); struct t4_swrqe *sw_rq; - u64 udb; + u64 __iomem *udb; size_t memsize; u32 qid; u32 msn; @@ -324,6 +332,7 @@ struct t4_rq { u16 cidx; u16 pidx; u16 wq_pidx; + u16 wq_pidx_inc; }; struct t4_wq { @@ -433,15 +442,67 @@ static inline u16 t4_sq_wq_size(struct t4_wq *wq) return wq->sq.size * T4_SQ_NUM_SLOTS; } -static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc) +/* This function copies 64 byte coalesced work request to memory + * mapped BAR2 space. For coalesced WRs, the SGE fetches data + * from the FIFO instead of from Host. + */ +static inline void pio_copy(u64 __iomem *dst, u64 *src) +{ + int count = 8; + + while (count) { + writeq(*src, dst); + src++; + dst++; + count--; + } +} + +static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, u8 t5, + union t4_wr *wqe) { + + /* Flush host queue memory writes. */ wmb(); + if (t5) { + if (inc == 1 && wqe) { + PDBG("%s: WC wq->sq.pidx = %d\n", + __func__, wq->sq.pidx); + pio_copy(wq->sq.udb + 7, (void *)wqe); + } else { + PDBG("%s: DB wq->sq.pidx = %d\n", + __func__, wq->sq.pidx); + writel(PIDX_T5(inc), wq->sq.udb); + } + + /* Flush user doorbell area writes. */ + wmb(); + return; + } writel(QID(wq->sq.qid) | PIDX(inc), wq->db); } -static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc) +static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, u8 t5, + union t4_recv_wr *wqe) { + + /* Flush host queue memory writes. */ wmb(); + if (t5) { + if (inc == 1 && wqe) { + PDBG("%s: WC wq->rq.pidx = %d\n", + __func__, wq->rq.pidx); + pio_copy(wq->rq.udb + 7, (void *)wqe); + } else { + PDBG("%s: DB wq->rq.pidx = %d\n", + __func__, wq->rq.pidx); + writel(PIDX_T5(inc), wq->rq.udb); + } + + /* Flush user doorbell area writes. */ + wmb(); + return; + } writel(QID(wq->rq.qid) | PIDX(inc), wq->db); } @@ -481,6 +542,7 @@ struct t4_cq { size_t memsize; __be64 bits_type_ts; u32 cqid; + int vector; u16 size; /* including status page */ u16 cidx; u16 sw_pidx; @@ -566,6 +628,9 @@ static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe) printk(KERN_ERR MOD "cq overflow cqid %u\n", cq->cqid); BUG_ON(1); } else if (t4_valid_cqe(cq, &cq->queue[cq->cidx])) { + + /* Ensure CQE is flushed to memory */ + rmb(); *cqe = &cq->queue[cq->cidx]; ret = 0; } else @@ -609,3 +674,7 @@ static inline void t4_set_cq_in_error(struct t4_cq *cq) ((struct t4_status_page *)&cq->queue[cq->size])->qp_err = 1; } #endif + +struct t4_dev_status_page { + u8 db_off; +}; diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h index dc193c29267..91289a051af 100644 --- a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h +++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h @@ -836,4 +836,19 @@ struct ulptx_idata { #define V_RX_DACK_CHANGE(x) ((x) << S_RX_DACK_CHANGE) #define F_RX_DACK_CHANGE V_RX_DACK_CHANGE(1U) +enum { /* TCP congestion control algorithms */ + CONG_ALG_RENO, + CONG_ALG_TAHOE, + CONG_ALG_NEWRENO, + CONG_ALG_HIGHSPEED +}; + +#define S_CONG_CNTRL 14 +#define M_CONG_CNTRL 0x3 +#define V_CONG_CNTRL(x) ((x) << S_CONG_CNTRL) +#define G_CONG_CNTRL(x) (((x) >> S_CONG_CNTRL) & M_CONG_CNTRL) + +#define CONG_CNTRL_VALID (1 << 18) +#define T5_OPT_2_VALID (1 << 31) + #endif /* _T4FW_RI_API_H_ */ diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h index 32b754c35ab..cbd0ce17072 100644 --- a/drivers/infiniband/hw/cxgb4/user.h +++ b/drivers/infiniband/hw/cxgb4/user.h @@ -48,6 +48,7 @@ struct c4iw_create_cq_resp { __u32 cqid; __u32 size; __u32 qid_mask; + __u32 reserved; /* explicit padding (optional for i386) */ }; @@ -70,4 +71,10 @@ struct c4iw_create_qp_resp { __u32 qid_mask; __u32 flags; }; + +struct c4iw_alloc_ucontext_resp { + __u64 status_page_key; + __u32 status_page_size; + __u32 reserved; /* explicit padding (optional for i386) */ +}; #endif diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index f08f6eaf3fa..bd45e0f3923 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -322,7 +322,7 @@ struct ehca_mr_pginfo { } phy; struct { /* type EHCA_MR_PGI_USER section */ struct ib_umem *region; - struct ib_umem_chunk *next_chunk; + struct scatterlist *next_sg; u64 next_nmap; } usr; struct { /* type EHCA_MR_PGI_FMR section */ diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c index 212150c25ea..8cc83753776 100644 --- a/drivers/infiniband/hw/ehca/ehca_cq.c +++ b/drivers/infiniband/hw/ehca/ehca_cq.c @@ -283,6 +283,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1)); if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { ehca_err(device, "Copy to udata failed."); + cq = ERR_PTR(-EFAULT); goto create_cq_exit4; } } diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index bcfb0c18362..3488e8c9fcb 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -400,10 +400,7 @@ reg_user_mr_fallback: pginfo.num_hwpages = num_hwpages; pginfo.u.usr.region = e_mr->umem; pginfo.next_hwpage = e_mr->umem->offset / hwpage_size; - pginfo.u.usr.next_chunk = list_prepare_entry(pginfo.u.usr.next_chunk, - (&e_mr->umem->chunk_list), - list); - + pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl; ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags, e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, &e_mr->ib.ib_mr.rkey, EHCA_REG_MR); @@ -1858,61 +1855,39 @@ static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo, u64 *kpage) { int ret = 0; - struct ib_umem_chunk *prev_chunk; - struct ib_umem_chunk *chunk; u64 pgaddr; - u32 i = 0; u32 j = 0; int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size; - - /* loop over desired chunk entries */ - chunk = pginfo->u.usr.next_chunk; - prev_chunk = pginfo->u.usr.next_chunk; - list_for_each_entry_continue( - chunk, (&(pginfo->u.usr.region->chunk_list)), list) { - for (i = pginfo->u.usr.next_nmap; i < chunk->nmap; ) { - pgaddr = page_to_pfn(sg_page(&chunk->page_list[i])) - << PAGE_SHIFT ; - *kpage = pgaddr + (pginfo->next_hwpage * - pginfo->hwpage_size); - if ( !(*kpage) ) { - ehca_gen_err("pgaddr=%llx " - "chunk->page_list[i]=%llx " - "i=%x next_hwpage=%llx", - pgaddr, (u64)sg_dma_address( - &chunk->page_list[i]), - i, pginfo->next_hwpage); - return -EFAULT; - } - (pginfo->hwpage_cnt)++; - (pginfo->next_hwpage)++; - kpage++; - if (pginfo->next_hwpage % hwpages_per_kpage == 0) { - (pginfo->kpage_cnt)++; - (pginfo->u.usr.next_nmap)++; - pginfo->next_hwpage = 0; - i++; - } - j++; - if (j >= number) break; + struct scatterlist **sg = &pginfo->u.usr.next_sg; + + while (*sg != NULL) { + pgaddr = page_to_pfn(sg_page(*sg)) + << PAGE_SHIFT; + *kpage = pgaddr + (pginfo->next_hwpage * + pginfo->hwpage_size); + if (!(*kpage)) { + ehca_gen_err("pgaddr=%llx " + "sg_dma_address=%llx " + "entry=%llx next_hwpage=%llx", + pgaddr, (u64)sg_dma_address(*sg), + pginfo->u.usr.next_nmap, + pginfo->next_hwpage); + return -EFAULT; } - if ((pginfo->u.usr.next_nmap >= chunk->nmap) && - (j >= number)) { - pginfo->u.usr.next_nmap = 0; - prev_chunk = chunk; - break; - } else if (pginfo->u.usr.next_nmap >= chunk->nmap) { - pginfo->u.usr.next_nmap = 0; - prev_chunk = chunk; - } else if (j >= number) + (pginfo->hwpage_cnt)++; + (pginfo->next_hwpage)++; + kpage++; + if (pginfo->next_hwpage % hwpages_per_kpage == 0) { + (pginfo->kpage_cnt)++; + (pginfo->u.usr.next_nmap)++; + pginfo->next_hwpage = 0; + *sg = sg_next(*sg); + } + j++; + if (j >= number) break; - else - prev_chunk = chunk; } - pginfo->u.usr.next_chunk = - list_prepare_entry(prev_chunk, - (&(pginfo->u.usr.region->chunk_list)), - list); + return ret; } @@ -1920,20 +1895,19 @@ static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo, * check given pages for contiguous layout * last page addr is returned in prev_pgaddr for further check */ -static int ehca_check_kpages_per_ate(struct scatterlist *page_list, - int start_idx, int end_idx, +static int ehca_check_kpages_per_ate(struct scatterlist **sg, + int num_pages, u64 *prev_pgaddr) { - int t; - for (t = start_idx; t <= end_idx; t++) { - u64 pgaddr = page_to_pfn(sg_page(&page_list[t])) << PAGE_SHIFT; + for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) { + u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT; if (ehca_debug_level >= 3) ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr, *(u64 *)__va(pgaddr)); if (pgaddr - PAGE_SIZE != *prev_pgaddr) { ehca_gen_err("uncontiguous page found pgaddr=%llx " - "prev_pgaddr=%llx page_list_i=%x", - pgaddr, *prev_pgaddr, t); + "prev_pgaddr=%llx entries_left_in_hwpage=%x", + pgaddr, *prev_pgaddr, num_pages); return -EINVAL; } *prev_pgaddr = pgaddr; @@ -1947,111 +1921,80 @@ static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo, u64 *kpage) { int ret = 0; - struct ib_umem_chunk *prev_chunk; - struct ib_umem_chunk *chunk; u64 pgaddr, prev_pgaddr; - u32 i = 0; u32 j = 0; int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE; int nr_kpages = kpages_per_hwpage; + struct scatterlist **sg = &pginfo->u.usr.next_sg; + + while (*sg != NULL) { - /* loop over desired chunk entries */ - chunk = pginfo->u.usr.next_chunk; - prev_chunk = pginfo->u.usr.next_chunk; - list_for_each_entry_continue( - chunk, (&(pginfo->u.usr.region->chunk_list)), list) { - for (i = pginfo->u.usr.next_nmap; i < chunk->nmap; ) { - if (nr_kpages == kpages_per_hwpage) { - pgaddr = ( page_to_pfn(sg_page(&chunk->page_list[i])) - << PAGE_SHIFT ); - *kpage = pgaddr; - if ( !(*kpage) ) { - ehca_gen_err("pgaddr=%llx i=%x", - pgaddr, i); + if (nr_kpages == kpages_per_hwpage) { + pgaddr = (page_to_pfn(sg_page(*sg)) + << PAGE_SHIFT); + *kpage = pgaddr; + if (!(*kpage)) { + ehca_gen_err("pgaddr=%llx entry=%llx", + pgaddr, pginfo->u.usr.next_nmap); + ret = -EFAULT; + return ret; + } + /* + * The first page in a hwpage must be aligned; + * the first MR page is exempt from this rule. + */ + if (pgaddr & (pginfo->hwpage_size - 1)) { + if (pginfo->hwpage_cnt) { + ehca_gen_err( + "invalid alignment " + "pgaddr=%llx entry=%llx " + "mr_pgsize=%llx", + pgaddr, pginfo->u.usr.next_nmap, + pginfo->hwpage_size); ret = -EFAULT; return ret; } - /* - * The first page in a hwpage must be aligned; - * the first MR page is exempt from this rule. - */ - if (pgaddr & (pginfo->hwpage_size - 1)) { - if (pginfo->hwpage_cnt) { - ehca_gen_err( - "invalid alignment " - "pgaddr=%llx i=%x " - "mr_pgsize=%llx", - pgaddr, i, - pginfo->hwpage_size); - ret = -EFAULT; - return ret; - } - /* first MR page */ - pginfo->kpage_cnt = - (pgaddr & - (pginfo->hwpage_size - 1)) >> - PAGE_SHIFT; - nr_kpages -= pginfo->kpage_cnt; - *kpage = pgaddr & - ~(pginfo->hwpage_size - 1); - } - if (ehca_debug_level >= 3) { - u64 val = *(u64 *)__va(pgaddr); - ehca_gen_dbg("kpage=%llx chunk_page=%llx " - "value=%016llx", - *kpage, pgaddr, val); - } - prev_pgaddr = pgaddr; - i++; - pginfo->kpage_cnt++; - pginfo->u.usr.next_nmap++; - nr_kpages--; - if (!nr_kpages) - goto next_kpage; - continue; + /* first MR page */ + pginfo->kpage_cnt = + (pgaddr & + (pginfo->hwpage_size - 1)) >> + PAGE_SHIFT; + nr_kpages -= pginfo->kpage_cnt; + *kpage = pgaddr & + ~(pginfo->hwpage_size - 1); } - if (i + nr_kpages > chunk->nmap) { - ret = ehca_check_kpages_per_ate( - chunk->page_list, i, - chunk->nmap - 1, &prev_pgaddr); - if (ret) return ret; - pginfo->kpage_cnt += chunk->nmap - i; - pginfo->u.usr.next_nmap += chunk->nmap - i; - nr_kpages -= chunk->nmap - i; - break; + if (ehca_debug_level >= 3) { + u64 val = *(u64 *)__va(pgaddr); + ehca_gen_dbg("kpage=%llx page=%llx " + "value=%016llx", + *kpage, pgaddr, val); } + prev_pgaddr = pgaddr; + *sg = sg_next(*sg); + pginfo->kpage_cnt++; + pginfo->u.usr.next_nmap++; + nr_kpages--; + if (!nr_kpages) + goto next_kpage; + continue; + } + + ret = ehca_check_kpages_per_ate(sg, nr_kpages, + &prev_pgaddr); + if (ret) + return ret; + pginfo->kpage_cnt += nr_kpages; + pginfo->u.usr.next_nmap += nr_kpages; - ret = ehca_check_kpages_per_ate(chunk->page_list, i, - i + nr_kpages - 1, - &prev_pgaddr); - if (ret) return ret; - i += nr_kpages; - pginfo->kpage_cnt += nr_kpages; - pginfo->u.usr.next_nmap += nr_kpages; next_kpage: - nr_kpages = kpages_per_hwpage; - (pginfo->hwpage_cnt)++; - kpage++; - j++; - if (j >= number) break; - } - if ((pginfo->u.usr.next_nmap >= chunk->nmap) && - (j >= number)) { - pginfo->u.usr.next_nmap = 0; - prev_chunk = chunk; - break; - } else if (pginfo->u.usr.next_nmap >= chunk->nmap) { - pginfo->u.usr.next_nmap = 0; - prev_chunk = chunk; - } else if (j >= number) + nr_kpages = kpages_per_hwpage; + (pginfo->hwpage_cnt)++; + kpage++; + j++; + if (j >= number) break; - else - prev_chunk = chunk; } - pginfo->u.usr.next_chunk = - list_prepare_entry(prev_chunk, - (&(pginfo->u.usr.region->chunk_list)), - list); + return ret; } @@ -2591,16 +2534,6 @@ static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, /* This is only a stub; nothing to be done here */ } -static u64 ehca_dma_address(struct ib_device *dev, struct scatterlist *sg) -{ - return sg->dma_address; -} - -static unsigned int ehca_dma_len(struct ib_device *dev, struct scatterlist *sg) -{ - return sg->length; -} - static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) @@ -2653,8 +2586,6 @@ struct ib_dma_mapping_ops ehca_dma_mapping_ops = { .unmap_page = ehca_dma_unmap_page, .map_sg = ehca_dma_map_sg, .unmap_sg = ehca_dma_unmap_sg, - .dma_address = ehca_dma_address, - .dma_len = ehca_dma_len, .sync_single_for_cpu = ehca_dma_sync_single_for_cpu, .sync_single_for_device = ehca_dma_sync_single_for_device, .alloc_coherent = ehca_dma_alloc_coherent, diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 00d6861a6a1..2e89356c46f 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -1329,7 +1329,7 @@ static int internal_modify_qp(struct ib_qp *ibqp, qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state; if (!smi_reset2init && !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type, - attr_mask)) { + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) { ret = -EINVAL; ehca_err(ibqp->device, "Invalid qp transition new_state=%x cur_state=%x " diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c index 714293b7851..45802e97332 100644 --- a/drivers/infiniband/hw/ipath/ipath_diag.c +++ b/drivers/infiniband/hw/ipath/ipath_diag.c @@ -326,7 +326,7 @@ static ssize_t ipath_diagpkt_write(struct file *fp, size_t count, loff_t *off) { u32 __iomem *piobuf; - u32 plen, clen, pbufn; + u32 plen, pbufn, maxlen_reserve; struct ipath_diag_pkt odp; struct ipath_diag_xpkt dp; u32 *tmpbuf = NULL; @@ -335,42 +335,24 @@ static ssize_t ipath_diagpkt_write(struct file *fp, u64 val; u32 l_state, lt_state; /* LinkState, LinkTrainingState */ - if (count < sizeof(odp)) { - ret = -EINVAL; - goto bail; - } if (count == sizeof(dp)) { if (copy_from_user(&dp, data, sizeof(dp))) { ret = -EFAULT; goto bail; } - } else if (copy_from_user(&odp, data, sizeof(odp))) { - ret = -EFAULT; - goto bail; - } - - /* - * Due to padding/alignment issues (lessened with new struct) - * the old and new structs are the same length. We need to - * disambiguate them, which we can do because odp.len has never - * been less than the total of LRH+BTH+DETH so far, while - * dp.unit (same offset) unit is unlikely to get that high. - * Similarly, dp.data, the pointer to user at the same offset - * as odp.unit, is almost certainly at least one (512byte)page - * "above" NULL. The if-block below can be omitted if compatibility - * between a new driver and older diagnostic code is unimportant. - * compatibility the other direction (new diags, old driver) is - * handled in the diagnostic code, with a warning. - */ - if (dp.unit >= 20 && dp.data < 512) { - /* very probable version mismatch. Fix it up */ - memcpy(&odp, &dp, sizeof(odp)); - /* We got a legacy dp, copy elements to dp */ + } else if (count == sizeof(odp)) { + if (copy_from_user(&odp, data, sizeof(odp))) { + ret = -EFAULT; + goto bail; + } + dp.len = odp.len; dp.unit = odp.unit; dp.data = odp.data; - dp.len = odp.len; - dp.pbc_wd = 0; /* Indicate we need to compute PBC wd */ + dp.pbc_wd = 0; + } else { + ret = -EINVAL; + goto bail; } /* send count must be an exact number of dwords */ @@ -379,7 +361,7 @@ static ssize_t ipath_diagpkt_write(struct file *fp, goto bail; } - clen = dp.len >> 2; + plen = dp.len >> 2; dd = ipath_lookup(dp.unit); if (!dd || !(dd->ipath_flags & IPATH_PRESENT) || @@ -422,16 +404,22 @@ static ssize_t ipath_diagpkt_write(struct file *fp, goto bail; } - /* need total length before first word written */ - /* +1 word is for the qword padding */ - plen = sizeof(u32) + dp.len; - - if ((plen + 4) > dd->ipath_ibmaxlen) { + /* + * need total length before first word written, plus 2 Dwords. One Dword + * is for padding so we get the full user data when not aligned on + * a word boundary. The other Dword is to make sure we have room for the + * ICRC which gets tacked on later. + */ + maxlen_reserve = 2 * sizeof(u32); + if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) { ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n", - plen - 4, dd->ipath_ibmaxlen); + dp.len, dd->ipath_ibmaxlen); ret = -EINVAL; - goto bail; /* before writing pbc */ + goto bail; } + + plen = sizeof(u32) + dp.len; + tmpbuf = vmalloc(plen); if (!tmpbuf) { dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, " @@ -473,11 +461,11 @@ static ssize_t ipath_diagpkt_write(struct file *fp, */ if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) { ipath_flush_wc(); - __iowrite32_copy(piobuf + 2, tmpbuf, clen - 1); + __iowrite32_copy(piobuf + 2, tmpbuf, plen - 1); ipath_flush_wc(); - __raw_writel(tmpbuf[clen - 1], piobuf + clen + 1); + __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1); } else - __iowrite32_copy(piobuf + 2, tmpbuf, clen); + __iowrite32_copy(piobuf + 2, tmpbuf, plen); ipath_flush_wc(); diff --git a/drivers/infiniband/hw/ipath/ipath_dma.c b/drivers/infiniband/hw/ipath/ipath_dma.c index 644c2c74e05..123a8c05353 100644 --- a/drivers/infiniband/hw/ipath/ipath_dma.c +++ b/drivers/infiniband/hw/ipath/ipath_dma.c @@ -115,6 +115,10 @@ static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl, ret = 0; break; } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif } return ret; } @@ -126,21 +130,6 @@ static void ipath_unmap_sg(struct ib_device *dev, BUG_ON(!valid_dma_direction(direction)); } -static u64 ipath_sg_dma_address(struct ib_device *dev, struct scatterlist *sg) -{ - u64 addr = (u64) page_address(sg_page(sg)); - - if (addr) - addr += sg->offset; - return addr; -} - -static unsigned int ipath_sg_dma_len(struct ib_device *dev, - struct scatterlist *sg) -{ - return sg->length; -} - static void ipath_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, @@ -176,17 +165,15 @@ static void ipath_dma_free_coherent(struct ib_device *dev, size_t size, } struct ib_dma_mapping_ops ipath_dma_mapping_ops = { - ipath_mapping_error, - ipath_dma_map_single, - ipath_dma_unmap_single, - ipath_dma_map_page, - ipath_dma_unmap_page, - ipath_map_sg, - ipath_unmap_sg, - ipath_sg_dma_address, - ipath_sg_dma_len, - ipath_sync_single_for_cpu, - ipath_sync_single_for_device, - ipath_dma_alloc_coherent, - ipath_dma_free_coherent + .mapping_error = ipath_mapping_error, + .map_single = ipath_dma_map_single, + .unmap_single = ipath_dma_unmap_single, + .map_page = ipath_dma_map_page, + .unmap_page = ipath_dma_unmap_page, + .map_sg = ipath_map_sg, + .unmap_sg = ipath_unmap_sg, + .sync_single_for_cpu = ipath_sync_single_for_cpu, + .sync_single_for_device = ipath_sync_single_for_device, + .alloc_coherent = ipath_dma_alloc_coherent, + .free_coherent = ipath_dma_free_coherent }; diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 26dfbc8ee0f..01ba792791a 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -70,7 +70,7 @@ void ipath_disarm_senderrbufs(struct ipath_devdata *dd) if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { int i; if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) && - dd->ipath_lastcancel > jiffies) { + time_after(dd->ipath_lastcancel, jiffies)) { __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG, "SendbufErrs %lx %lx", sbuf[0], sbuf[1]); @@ -755,7 +755,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) /* likely due to cancel; so suppress message unless verbose */ if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) && - dd->ipath_lastcancel > jiffies) { + time_after(dd->ipath_lastcancel, jiffies)) { /* armlaunch takes precedence; it often causes both. */ ipath_cdbg(VERBOSE, "Suppressed %s error (%llx) after sendbuf cancel\n", diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c index e346d3890a0..5e61e9bff69 100644 --- a/drivers/infiniband/hw/ipath/ipath_mr.c +++ b/drivers/infiniband/hw/ipath/ipath_mr.c @@ -188,8 +188,8 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { struct ipath_mr *mr; struct ib_umem *umem; - struct ib_umem_chunk *chunk; - int n, m, i; + int n, m, entry; + struct scatterlist *sg; struct ib_mr *ret; if (length == 0) { @@ -202,10 +202,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ERR(umem)) return (void *) umem; - n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - n += chunk->nents; - + n = umem->nmap; mr = alloc_mr(n, &to_idev(pd->device)->lk_table); if (!mr) { ret = ERR_PTR(-ENOMEM); @@ -224,22 +221,20 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, m = 0; n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) { - for (i = 0; i < chunk->nents; i++) { - void *vaddr; - - vaddr = page_address(sg_page(&chunk->page_list[i])); - if (!vaddr) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - mr->mr.map[m]->segs[n].vaddr = vaddr; - mr->mr.map[m]->segs[n].length = umem->page_size; - n++; - if (n == IPATH_SEGSZ) { - m++; - n = 0; - } + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + void *vaddr; + + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; + mr->mr.map[m]->segs[n].length = umem->page_size; + n++; + if (n == IPATH_SEGSZ) { + m++; + n = 0; } } ret = &mr->ibmr; diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index 0857a9c3cd3..face87602dc 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -463,7 +463,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask)) + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) goto inval; if (attr_mask & IB_QP_AV) { diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c index 98ac18ec977..17a517766ad 100644 --- a/drivers/infiniband/hw/ipath/ipath_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -247,7 +247,7 @@ static void sdma_abort_task(unsigned long opaque) /* ipath_sdma_abort() is done, waiting for interrupt */ if (status == IPATH_SDMA_ABORT_DISARMED) { - if (jiffies < dd->ipath_sdma_abort_intr_timeout) + if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout)) goto resched_noprint; /* give up, intr got lost somewhere */ ipath_dbg("give up waiting for SDMADISABLED intr\n"); @@ -341,7 +341,7 @@ resched: * JAG - this is bad to just have default be a loop without * state change */ - if (jiffies > dd->ipath_sdma_abort_jiffies) { + if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) { ipath_dbg("looping with status 0x%08lx\n", dd->ipath_sdma_status); dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ; diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c index f5cb13b2144..cc04b7ba348 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c @@ -280,9 +280,7 @@ static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd, int j; int ret; - ret = get_user_pages(current, current->mm, addr, - npages, 0, 1, pages, NULL); - + ret = get_user_pages_fast(addr, npages, 0, pages); if (ret != npages) { int i; @@ -811,10 +809,7 @@ int ipath_user_sdma_writev(struct ipath_devdata *dd, while (dim) { const int mxp = 8; - down_write(¤t->mm->mmap_sem); ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp); - up_write(¤t->mm->mmap_sem); - if (ret <= 0) goto done_unlock; else { diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig index 24ab11a9ad1..fc01deac1d3 100644 --- a/drivers/infiniband/hw/mlx4/Kconfig +++ b/drivers/infiniband/hw/mlx4/Kconfig @@ -1,6 +1,6 @@ config MLX4_INFINIBAND tristate "Mellanox ConnectX HCA support" - depends on NETDEVICES && ETHERNET && PCI + depends on NETDEVICES && ETHERNET && PCI && INET select NET_VENDOR_MELLANOX select MLX4_CORE ---help--- diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index a251becdaa9..2d8c3397774 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -39,25 +39,6 @@ #include "mlx4_ib.h" -int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, - u8 *mac, int *is_mcast, u8 port) -{ - struct in6_addr in6; - - *is_mcast = 0; - - memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6); - if (rdma_link_local_addr(&in6)) - rdma_get_ll_mac(&in6, mac); - else if (rdma_is_multicast_addr(&in6)) { - rdma_get_mcast_mac(&in6, mac); - *is_mcast = 1; - } else - return -EINVAL; - - return 0; -} - static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, struct mlx4_ib_ah *ah) { @@ -92,21 +73,18 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr { struct mlx4_ib_dev *ibdev = to_mdev(pd->device); struct mlx4_dev *dev = ibdev->dev; - union ib_gid sgid; - u8 mac[6]; - int err; - int is_mcast; + int is_mcast = 0; + struct in6_addr in6; u16 vlan_tag; - err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num); - if (err) - return ERR_PTR(err); - - memcpy(ah->av.eth.mac, mac, 6); - err = ib_get_cached_gid(pd->device, ah_attr->port_num, ah_attr->grh.sgid_index, &sgid); - if (err) - return ERR_PTR(err); - vlan_tag = rdma_get_vlan_id(&sgid); + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) { + is_mcast = 1; + rdma_get_mcast_mac(&in6, ah->av.eth.mac); + } else { + memcpy(ah->av.eth.mac, ah_attr->dmac, ETH_ALEN); + } + vlan_tag = ah_attr->vlan_id; if (vlan_tag < 0x1000) vlan_tag |= (ah_attr->sl & 7) << 13; ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 2f215b93db6..0eb141c4141 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -154,7 +154,7 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, continue; slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; - if (slave_id >= dev->dev->num_slaves) + if (slave_id >= dev->dev->num_vfs + 1) return; tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE]; form_cache_ag = get_cached_alias_guid(dev, port_num, diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c index d1f5f1dd77b..56a593e0ae5 100644 --- a/drivers/infiniband/hw/mlx4/cm.c +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -61,6 +61,11 @@ struct cm_generic_msg { __be32 remote_comm_id; }; +struct cm_sidr_generic_msg { + struct ib_mad_hdr hdr; + __be32 request_id; +}; + struct cm_req_msg { unsigned char unused[0x60]; union ib_gid primary_path_sgid; @@ -69,28 +74,62 @@ struct cm_req_msg { static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - msg->local_comm_id = cpu_to_be32(cm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->local_comm_id = cpu_to_be32(cm_id); + } } static u32 get_local_comm_id(struct ib_mad *mad) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - - return be32_to_cpu(msg->local_comm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->local_comm_id); + } } static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - msg->remote_comm_id = cpu_to_be32(cm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->remote_comm_id = cpu_to_be32(cm_id); + } } static u32 get_remote_comm_id(struct ib_mad *mad) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - - return be32_to_cpu(msg->remote_comm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->remote_comm_id); + } } static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) @@ -282,19 +321,21 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id u32 sl_cm_id; int pv_cm_id = -1; - sl_cm_id = get_local_comm_id(mad); - if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || - mad->mad_hdr.attr_id == CM_REP_ATTR_ID) { + mad->mad_hdr.attr_id == CM_REP_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + sl_cm_id = get_local_comm_id(mad); id = id_map_alloc(ibdev, slave_id, sl_cm_id); if (IS_ERR(id)) { mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", __func__, slave_id, sl_cm_id); return PTR_ERR(id); } - } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID) { + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { return 0; } else { + sl_cm_id = get_local_comm_id(mad); id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); } @@ -315,14 +356,18 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id } int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, - struct ib_mad *mad) + struct ib_mad *mad) { u32 pv_cm_id; struct id_map_entry *id; - if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID) { + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { union ib_gid gid; + if (!slave) + return 0; + gid = gid_from_req_msg(ibdev, mad); *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); if (*slave < 0) { @@ -341,7 +386,8 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, return -ENOENT; } - *slave = id->slave_id; + if (slave) + *slave = id->slave_id; set_remote_comm_id(mad, id->sl_cm_id); if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index d5e60f44ba5..1066eec854a 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -102,7 +102,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * int err; err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, - PAGE_SIZE * 2, &buf->buf); + PAGE_SIZE * 2, &buf->buf, GFP_KERNEL); if (err) goto out; @@ -113,7 +113,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf); + err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL); if (err) goto err_mtt; @@ -209,7 +209,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector uar = &to_mucontext(context)->uar; } else { - err = mlx4_db_alloc(dev->dev, &cq->db, 1); + err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL); if (err) goto err_cq; @@ -324,7 +324,7 @@ static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) u32 i; i = cq->mcq.cons_index; - while (get_sw_cqe(cq, i & cq->ibcq.cqe)) + while (get_sw_cqe(cq, i)) ++i; return i - cq->mcq.cons_index; @@ -365,7 +365,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) mutex_lock(&cq->resize_mutex); - if (entries < 1 || entries > dev->dev->caps.max_cqes) { + if (entries < 1) { err = -EINVAL; goto out; } @@ -376,6 +376,11 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) goto out; } + if (entries > dev->dev->caps.max_cqes) { + err = -EINVAL; + goto out; + } + if (ibcq->uobject) { err = mlx4_alloc_resize_umem(dev, cq, entries, udata); if (err) @@ -559,7 +564,7 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) } static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, - unsigned tail, struct mlx4_cqe *cqe) + unsigned tail, struct mlx4_cqe *cqe, int is_eth) { struct mlx4_ib_proxy_sqp_hdr *hdr; @@ -569,12 +574,20 @@ static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct DMA_FROM_DEVICE); hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index); - wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); - wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF; wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0; wc->dlid_path_bits = 0; + if (is_eth) { + wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid); + memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4); + memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); + wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); + } + return 0; } @@ -589,6 +602,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_srq *msrq = NULL; int is_send; int is_error; + int is_eth; u32 g_mlpath_rqpn; u16 wqe_ctr; unsigned tail = 0; @@ -773,11 +787,15 @@ repoll: break; } + is_eth = (rdma_port_get_link_layer(wc->qp->device, + (*cur_qp)->port) == + IB_LINK_LAYER_ETHERNET); if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) { if ((*cur_qp)->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) - return use_tunnel_data(*cur_qp, cq, wc, tail, cqe); + return use_tunnel_data(*cur_qp, cq, wc, tail, + cqe, is_eth); } wc->slid = be16_to_cpu(cqe->rlid); @@ -788,11 +806,21 @@ repoll: wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum) ? IB_WC_IP_CSUM_OK : 0; - if (rdma_port_get_link_layer(wc->qp->device, - (*cur_qp)->port) == IB_LINK_LAYER_ETHERNET) + if (is_eth) { wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; - else + if (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK) { + wc->vlan_id = be16_to_cpu(cqe->sl_vid) & + MLX4_CQE_VID_MASK; + } else { + wc->vlan_id = 0xffff; + } + memcpy(wc->smac, cqe->smac, ETH_ALEN); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; + wc->vlan_id = 0xffff; + } } return 0; diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c index 8aee4233b38..c5174098636 100644 --- a/drivers/infiniband/hw/mlx4/doorbell.c +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -45,7 +45,6 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, struct mlx4_db *db) { struct mlx4_ib_user_db_page *page; - struct ib_umem_chunk *chunk; int err = 0; mutex_lock(&context->db_page_mutex); @@ -73,8 +72,7 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, list_add(&page->list, &context->db_page_list); found: - chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list); - db->dma = sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK); + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); db->u.user_page = page; ++page->refcnt; diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index f2a3f48107e..287ad0564ac 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -467,6 +467,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, int ret = 0; u16 tun_pkey_ix; u16 cached_pkey; + u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; if (dest_qpt > IB_QPT_GSI) return -EINVAL; @@ -477,10 +478,6 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; - /* QP0 forwarding only for Dom0 */ - if (!dest_qpt && (mlx4_master_func_num(dev->dev) != slave)) - return -EINVAL; - if (!dest_qpt) tun_qp = &tun_ctx->qp[0]; else @@ -509,6 +506,10 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, * The driver will set the force loopback bit in post_send */ memset(&attr, 0, sizeof attr); attr.port_num = port; + if (is_eth) { + memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16); + attr.ah_flags = IB_AH_GRH; + } ah = ib_create_ah(tun_ctx->pd, &attr); if (IS_ERR(ah)) return -ENOMEM; @@ -540,11 +541,36 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, /* adjust tunnel data */ tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); - tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); - tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; + if (is_eth) { + u16 vlan = 0; + if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, + NULL)) { + /* VST mode */ + if (vlan != wc->vlan_id) + /* Packet vlan is not the VST-assigned vlan. + * Drop the packet. + */ + goto out; + else + /* Remove the vlan tag before forwarding + * the packet to the VF. + */ + vlan = 0xffff; + } else { + vlan = wc->vlan_id; + } + + tun_mad->hdr.sl_vid = cpu_to_be16(vlan); + memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); + memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); + } else { + tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); + tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + } + ib_dma_sync_single_for_device(&dev->ib_dev, tun_qp->tx_ring[tun_tx_ix].buf.map, sizeof (struct mlx4_rcv_tunnel_mad), @@ -580,6 +606,41 @@ static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, int err; int slave; u8 *slave_id; + int is_eth = 0; + + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + is_eth = 0; + else + is_eth = 1; + + if (is_eth) { + if (!(wc->wc_flags & IB_WC_GRH)) { + mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); + return -EINVAL; + } + if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { + mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); + return -EINVAL; + } + if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad)) + return 0; + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; + } /* Initially assume that this mad is for us */ slave = mlx4_master_func_num(dev->dev); @@ -602,6 +663,21 @@ static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, } /* Class-specific handling */ switch (mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + /* 255 indicates the dom0 */ + if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) { + if (!mlx4_vf_smi_enabled(dev->dev, slave, port)) + return -EPERM; + /* for a VF. drop unsolicited MADs */ + if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) { + mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n", + slave, mad->mad_hdr.mgmt_class, + mad->mad_hdr.method); + return -EINVAL; + } + } + break; case IB_MGMT_CLASS_SUBN_ADM: if (mlx4_ib_demux_sa_handler(ibdev, port, slave, (struct ib_sa_mad *) mad)) @@ -1076,8 +1152,9 @@ static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, - enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, - u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad) + enum ib_qp_type dest_qpt, u16 pkey_index, + u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, + u8 *s_mac, struct ib_mad *mad) { struct ib_sge list; struct ib_send_wr wr, *bad_wr; @@ -1099,10 +1176,6 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; - /* QP0 forwarding only for Dom0 */ - if (dest_qpt == IB_QPT_SMI && (mlx4_master_func_num(dev->dev) != slave)) - return -EINVAL; - if (dest_qpt == IB_QPT_SMI) { src_qpnum = 0; sqp = &sqp_ctx->qp[0]; @@ -1166,6 +1239,9 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, wr.num_sge = 1; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; + if (s_mac) + memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); + ret = ib_post_send(send_qp, &wr, &bad_wr); out: @@ -1174,6 +1250,22 @@ out: return ret; } +static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + return slave; + return mlx4_get_base_gid_ix(dev->dev, slave, port); +} + +static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, + struct ib_ah_attr *ah_attr) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + ah_attr->grh.sgid_index = slave; + else + ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port); +} + static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) { struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); @@ -1184,6 +1276,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc struct ib_ah_attr ah_attr; u8 *slave_id; int slave; + int port; /* Get slave that sent this packet */ if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn || @@ -1199,11 +1292,6 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc "belongs to another slave\n", wc->src_qp); return; } - if (slave != mlx4_master_func_num(dev->dev) && !(wc->src_qp & 0x2)) { - mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " - "non-master trying to send QP0 packets\n", wc->src_qp); - return; - } /* Map transaction ID */ ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, @@ -1231,6 +1319,12 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc /* Class-specific handling */ switch (tunnel->mad.mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + if (slave != mlx4_master_func_num(dev->dev) && + !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port)) + return; + break; case IB_MGMT_CLASS_SUBN_ADM: if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, (struct ib_sa_mad *) &tunnel->mad)) @@ -1260,12 +1354,18 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av)); ah.ibah.device = ctx->ib_dev; mlx4_ib_query_ah(&ah.ibah, &ah_attr); - if ((ah_attr.ah_flags & IB_AH_GRH) && - (ah_attr.grh.sgid_index != slave)) { - mlx4_ib_warn(ctx->ib_dev, "slave:%d accessed invalid sgid_index:%d\n", - slave, ah_attr.grh.sgid_index); + if (ah_attr.ah_flags & IB_AH_GRH) + fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr); + + port = mlx4_slave_convert_port(dev->dev, slave, ah_attr.port_num); + if (port < 0) return; - } + ah_attr.port_num = port; + memcpy(ah_attr.dmac, tunnel->hdr.mac, 6); + ah_attr.vlan_id = be16_to_cpu(tunnel->hdr.vlan); + /* if slave have default vlan use it */ + mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, + &ah_attr.vlan_id, &ah_attr.sl); mlx4_ib_send_to_wire(dev, slave, ctx->port, is_proxy_qp0(dev, wc->src_qp, slave) ? @@ -1273,7 +1373,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc be16_to_cpu(tunnel->hdr.pkey_index), be32_to_cpu(tunnel->hdr.remote_qpn), be32_to_cpu(tunnel->hdr.qkey), - &ah_attr, &tunnel->mad); + &ah_attr, wc->smac, &tunnel->mad); } static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, @@ -1657,9 +1757,9 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port, return -EEXIST; ctx->state = DEMUX_PV_STATE_STARTING; - /* have QP0 only on port owner, and only if link layer is IB */ - if (ctx->slave == mlx4_master_func_num(to_mdev(ctx->ib_dev)->dev) && - rdma_port_get_link_layer(ibdev, ctx->port) == IB_LINK_LAYER_INFINIBAND) + /* have QP0 only if link layer is IB */ + if (rdma_port_get_link_layer(ibdev, ctx->port) == + IB_LINK_LAYER_INFINIBAND) ctx->has_smi = 1; if (ctx->has_smi) { @@ -1850,7 +1950,15 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, ctx->port = port; ctx->ib_dev = &dev->ib_dev; - for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + for (i = 0; + i < min(dev->dev->caps.sqp_demux, (u16)(dev->dev->num_vfs + 1)); + i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev->dev, i); + + if (!test_bit(port - 1, actv_ports.ports)) + continue; + ret = alloc_pv_object(dev, i, port, &ctx->tun[i]); if (ret) { ret = -ENOMEM; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d6c5a73becf..0f7027e7db1 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -39,6 +39,8 @@ #include <linux/inetdevice.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> +#include <net/ipv6.h> +#include <net/addrconf.h> #include <rdma/ib_smi.h> #include <rdma/ib_user_verbs.h> @@ -46,15 +48,17 @@ #include <linux/mlx4/driver.h> #include <linux/mlx4/cmd.h> +#include <linux/mlx4/qp.h> #include "mlx4_ib.h" #include "user.h" #define DRV_NAME MLX4_IB_DRV_NAME -#define DRV_VERSION "1.0" -#define DRV_RELDATE "April 4, 2008" +#define DRV_VERSION "2.2-1" +#define DRV_RELDATE "Feb 2014" #define MLX4_IB_FLOW_MAX_PRIO 0xFFF +#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); @@ -92,21 +96,27 @@ static union ib_gid zgid; static int check_flow_steering_support(struct mlx4_dev *dev) { + int eth_num_ports = 0; int ib_num_ports = 0; - int i; - mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) - ib_num_ports++; - - if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { - if (ib_num_ports || mlx4_is_mfunc(dev)) { - pr_warn("Device managed flow steering is unavailable " - "for IB ports or in multifunction env.\n"); - return 0; + int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; + + if (dmfs) { + int i; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + eth_num_ports++; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + dmfs &= (!ib_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && + (!eth_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); + if (ib_num_ports && mlx4_is_mfunc(dev)) { + pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n"); + dmfs = 0; } - return 1; } - return 0; + return dmfs; } static int mlx4_ib_query_device(struct ib_device *ibdev, @@ -165,7 +175,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; else props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; - if (check_flow_steering_support(dev->dev)) + if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; } @@ -177,18 +187,18 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; - props->max_qp = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps; + props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); - props->max_cq = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs; + props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; - props->max_mr = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws; + props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; - props->max_srq = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs; + props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; @@ -338,7 +348,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = IB_SPEED_QDR; - props->port_cap_flags = IB_PORT_CM_SUP; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; @@ -526,7 +536,6 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, if (IS_ERR(mailbox)) return 0; - memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); @@ -536,19 +545,16 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, return 0; } -static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, - u32 cap_mask) +static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, + u32 cap_mask) { struct mlx4_cmd_mailbox *mailbox; int err; - u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - memset(mailbox->buf, 0, 256); - if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); @@ -557,8 +563,8 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } - err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + err = mlx4_cmd(dev->dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; @@ -567,11 +573,20 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; struct ib_port_attr attr; u32 cap_mask; int err; - mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); + /* return OK if this is RoCE. CM calls ib_modify_port() regardless + * of whether port link layer is ETH or IB. For ETH ports, qkey + * violations and port capabilities are not meaningful. + */ + if (is_eth) + return 0; + + mutex_lock(&mdev->cap_mask_mutex); err = mlx4_ib_query_port(ibdev, port, &attr); if (err) @@ -580,9 +595,9 @@ static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; - err = mlx4_SET_PORT(to_mdev(ibdev), port, - !!(mask & IB_PORT_RESET_QKEY_CNTR), - cap_mask); + err = mlx4_ib_SET_PORT(mdev, port, + !!(mask & IB_PORT_RESET_QKEY_CNTR), + cap_mask); out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); @@ -790,7 +805,6 @@ static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid) { - u8 mac[6]; struct net_device *ndev; int ret = 0; @@ -804,11 +818,7 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, spin_unlock(&mdev->iboe.lock); if (ndev) { - rdma_get_mcast_mac((struct in6_addr *)gid, mac); - rtnl_lock(); - dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac); ret = 1; - rtnl_unlock(); dev_put(ndev); } @@ -822,6 +832,7 @@ struct mlx4_ib_steering { }; static int parse_flow_attr(struct mlx4_dev *dev, + u32 qp_num, union ib_flow_spec *ib_spec, struct _rule_hw *mlx4_spec) { @@ -837,6 +848,14 @@ static int parse_flow_attr(struct mlx4_dev *dev, mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; break; + case IB_FLOW_SPEC_IB: + type = MLX4_NET_TRANS_RULE_ID_IB; + mlx4_spec->ib.l3_qpn = + cpu_to_be32(qp_num); + mlx4_spec->ib.qpn_mask = + cpu_to_be32(MLX4_IB_FLOW_QPN_MASK); + break; + case IB_FLOW_SPEC_IPV4: type = MLX4_NET_TRANS_RULE_ID_IPV4; @@ -868,6 +887,115 @@ static int parse_flow_attr(struct mlx4_dev *dev, return mlx4_hw_rule_sz(dev, type); } +struct default_rules { + __u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u8 link_layer; +}; +static const struct default_rules default_table[] = { + { + .mandatory_fields = {IB_FLOW_SPEC_IPV4}, + .mandatory_not_fields = {IB_FLOW_SPEC_ETH}, + .rules_create_list = {IB_FLOW_SPEC_IB}, + .link_layer = IB_LINK_LAYER_INFINIBAND + } +}; + +static int __mlx4_ib_default_rules_match(struct ib_qp *qp, + struct ib_flow_attr *flow_attr) +{ + int i, j, k; + void *ib_flow; + const struct default_rules *pdefault_rules = default_table; + u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port); + + for (i = 0; i < sizeof(default_table)/sizeof(default_table[0]); i++, + pdefault_rules++) { + __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS]; + memset(&field_types, 0, sizeof(field_types)); + + if (link_layer != pdefault_rules->link_layer) + continue; + + ib_flow = flow_attr + 1; + /* we assume the specs are sorted */ + for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS && + j < flow_attr->num_of_specs; k++) { + union ib_flow_spec *current_flow = + (union ib_flow_spec *)ib_flow; + + /* same layer but different type */ + if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) == + (pdefault_rules->mandatory_fields[k] & + IB_FLOW_SPEC_LAYER_MASK)) && + (current_flow->type != + pdefault_rules->mandatory_fields[k])) + goto out; + + /* same layer, try match next one */ + if (current_flow->type == + pdefault_rules->mandatory_fields[k]) { + j++; + ib_flow += + ((union ib_flow_spec *)ib_flow)->size; + } + } + + ib_flow = flow_attr + 1; + for (j = 0; j < flow_attr->num_of_specs; + j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size) + for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++) + /* same layer and same type */ + if (((union ib_flow_spec *)ib_flow)->type == + pdefault_rules->mandatory_not_fields[k]) + goto out; + + return i; + } +out: + return -1; +} + +static int __mlx4_ib_create_default_rules( + struct mlx4_ib_dev *mdev, + struct ib_qp *qp, + const struct default_rules *pdefault_rules, + struct _rule_hw *mlx4_spec) { + int size = 0; + int i; + + for (i = 0; i < sizeof(pdefault_rules->rules_create_list)/ + sizeof(pdefault_rules->rules_create_list[0]); i++) { + int ret; + union ib_flow_spec ib_spec; + switch (pdefault_rules->rules_create_list[i]) { + case 0: + /* no rule */ + continue; + case IB_FLOW_SPEC_IB: + ib_spec.type = IB_FLOW_SPEC_IB; + ib_spec.size = sizeof(struct ib_flow_spec_ib); + + break; + default: + /* invalid rule */ + return -EINVAL; + } + /* We must put empty rule, qpn is being ignored */ + ret = parse_flow_attr(mdev->dev, 0, &ib_spec, + mlx4_spec); + if (ret < 0) { + pr_info("invalid parsing\n"); + return -EINVAL; + } + + mlx4_spec = (void *)mlx4_spec + ret; + size += ret; + } + return size; +} + static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain, enum mlx4_net_trans_promisc_mode flow_type, @@ -879,8 +1007,7 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att struct mlx4_ib_dev *mdev = to_mdev(qp->device); struct mlx4_cmd_mailbox *mailbox; struct mlx4_net_trans_rule_hw_ctrl *ctrl; - size_t rule_size = sizeof(struct mlx4_net_trans_rule_hw_ctrl) + - (sizeof(struct _rule_hw) * flow_attr->num_of_specs); + int default_flow; static const u16 __mlx4_domain[] = { [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, @@ -905,7 +1032,6 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - memset(mailbox->buf, 0, rule_size); ctrl = mailbox->buf; ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | @@ -916,8 +1042,21 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att ib_flow = flow_attr + 1; size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); + /* Add default flows */ + default_flow = __mlx4_ib_default_rules_match(qp, flow_attr); + if (default_flow >= 0) { + ret = __mlx4_ib_create_default_rules( + mdev, qp, default_table + default_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + size += ret; + } for (i = 0; i < flow_attr->num_of_specs; i++) { - ret = parse_flow_attr(mdev->dev, ib_flow, mailbox->buf + size); + ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow, + mailbox->buf + size); if (ret < 0) { mlx4_free_cmd_mailbox(mdev->dev, mailbox); return -EINVAL; @@ -1031,6 +1170,8 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct mlx4_ib_qp *mqp = to_mqp(ibqp); u64 reg_id; struct mlx4_ib_steering *ib_steering = NULL; + enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? + MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { @@ -1042,7 +1183,7 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), - MLX4_PROT_IB_IPV6, ®_id); + prot, ®_id); if (err) goto err_malloc; @@ -1061,7 +1202,7 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) err_add: mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, - MLX4_PROT_IB_IPV6, reg_id); + prot, reg_id); err_malloc: kfree(ib_steering); @@ -1089,10 +1230,11 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); - u8 mac[6]; struct net_device *ndev; struct mlx4_ib_gid_entry *ge; u64 reg_id = 0; + enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? + MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { @@ -1115,7 +1257,7 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) } err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, - MLX4_PROT_IB_IPV6, reg_id); + prot, reg_id); if (err) return err; @@ -1127,13 +1269,8 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); - rdma_get_mcast_mac((struct in6_addr *)gid, mac); - if (ndev) { - rtnl_lock(); - dev_mc_del(mdev->iboe.netdevs[ge->port - 1], mac); - rtnl_unlock(); + if (ndev) dev_put(ndev); - } list_del(&ge->list); kfree(ge); } else @@ -1229,7 +1366,8 @@ static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_board_id }; -static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev) +static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, + struct net_device *dev) { memcpy(eui, dev->dev_addr, 3); memcpy(eui + 5, dev->dev_addr + 3, 3); @@ -1265,162 +1403,437 @@ static void update_gids_task(struct work_struct *work) MLX4_CMD_WRAPPED); if (err) pr_warn("set port command failed\n"); - else { - memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids); + else mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); + + mlx4_free_cmd_mailbox(dev, mailbox); + kfree(gw); +} + +static void reset_gids_task(struct work_struct *work) +{ + struct update_gid_work *gw = + container_of(work, struct update_gid_work, work); + struct mlx4_cmd_mailbox *mailbox; + union ib_gid *gids; + int err; + struct mlx4_dev *dev = gw->dev->dev; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + pr_warn("reset gid table failed\n"); + goto free; + } + + gids = mailbox->buf; + memcpy(gids, gw->gids, sizeof(gw->gids)); + + if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) == + IB_LINK_LAYER_ETHERNET) { + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | gw->port, + 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + pr_warn(KERN_WARNING + "set port %d command failed\n", gw->port); } mlx4_free_cmd_mailbox(dev, mailbox); +free: kfree(gw); } -static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) +static int update_gid_table(struct mlx4_ib_dev *dev, int port, + union ib_gid *gid, int clear, + int default_gid) { - struct net_device *ndev = dev->iboe.netdevs[port - 1]; struct update_gid_work *work; - struct net_device *tmp; int i; - u8 *hits; - int ret; - union ib_gid gid; - int free; - int found; int need_update = 0; - u16 vid; - - work = kzalloc(sizeof *work, GFP_ATOMIC); - if (!work) - return -ENOMEM; - - hits = kzalloc(128, GFP_ATOMIC); - if (!hits) { - ret = -ENOMEM; - goto out; - } + int free = -1; + int found = -1; + int max_gids; - rcu_read_lock(); - for_each_netdev_rcu(&init_net, tmp) { - if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) { - gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); - vid = rdma_vlan_dev_vlan_id(tmp); - mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev); - found = 0; - free = -1; - for (i = 0; i < 128; ++i) { - if (free < 0 && - !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) - free = i; - if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) { - hits[i] = 1; - found = 1; + if (default_gid) { + free = 0; + } else { + max_gids = dev->dev->caps.gid_table_len[port]; + for (i = 1; i < max_gids; ++i) { + if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid, + sizeof(*gid))) + found = i; + + if (clear) { + if (found >= 0) { + need_update = 1; + dev->iboe.gid_table[port - 1][found] = + zgid; break; } - } + } else { + if (found >= 0) + break; - if (!found) { - if (tmp == ndev && - (memcmp(&dev->iboe.gid_table[port - 1][0], - &gid, sizeof gid) || - !memcmp(&dev->iboe.gid_table[port - 1][0], - &zgid, sizeof gid))) { - dev->iboe.gid_table[port - 1][0] = gid; - ++need_update; - hits[0] = 1; - } else if (free >= 0) { - dev->iboe.gid_table[port - 1][free] = gid; - hits[free] = 1; - ++need_update; - } + if (free < 0 && + !memcmp(&dev->iboe.gid_table[port - 1][i], + &zgid, sizeof(*gid))) + free = i; } } } - rcu_read_unlock(); - for (i = 0; i < 128; ++i) - if (!hits[i]) { - if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) - ++need_update; - dev->iboe.gid_table[port - 1][i] = zgid; - } + if (found == -1 && !clear && free >= 0) { + dev->iboe.gid_table[port - 1][free] = *gid; + need_update = 1; + } - if (need_update) { - memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids); - INIT_WORK(&work->work, update_gids_task); - work->port = port; - work->dev = dev; - queue_work(wq, &work->work); - } else - kfree(work); + if (!need_update) + return 0; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids)); + INIT_WORK(&work->work, update_gids_task); + work->port = port; + work->dev = dev; + queue_work(wq, &work->work); - kfree(hits); return 0; +} -out: - kfree(work); - return ret; +static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev); } -static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event) + +static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port) { - switch (event) { - case NETDEV_UP: - case NETDEV_CHANGEADDR: - update_ipv6_gids(dev, port, 0); - break; + struct update_gid_work *work; - case NETDEV_DOWN: - update_ipv6_gids(dev, port, 1); - dev->iboe.netdevs[port - 1] = NULL; - } + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids)); + memset(work->gids, 0, sizeof(work->gids)); + INIT_WORK(&work->work, reset_gids_task); + work->dev = dev; + work->port = port; + queue_work(wq, &work->work); + return 0; } -static void netdev_added(struct mlx4_ib_dev *dev, int port) +static int mlx4_ib_addr_event(int event, struct net_device *event_netdev, + struct mlx4_ib_dev *ibdev, union ib_gid *gid) { - update_ipv6_gids(dev, port, 0); + struct mlx4_ib_iboe *iboe; + int port = 0; + struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ? + rdma_vlan_dev_real_dev(event_netdev) : + event_netdev; + union ib_gid default_gid; + + mlx4_make_default_gid(real_dev, &default_gid); + + if (!memcmp(gid, &default_gid, sizeof(*gid))) + return 0; + + if (event != NETDEV_DOWN && event != NETDEV_UP) + return 0; + + if ((real_dev != event_netdev) && + (event == NETDEV_DOWN) && + rdma_link_local_addr((struct in6_addr *)gid)) + return 0; + + iboe = &ibdev->iboe; + spin_lock(&iboe->lock); + + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) + if ((netif_is_bond_master(real_dev) && + (real_dev == iboe->masters[port - 1])) || + (!netif_is_bond_master(real_dev) && + (real_dev == iboe->netdevs[port - 1]))) + update_gid_table(ibdev, port, gid, + event == NETDEV_DOWN, 0); + + spin_unlock(&iboe->lock); + return 0; + } -static void netdev_removed(struct mlx4_ib_dev *dev, int port) +static u8 mlx4_ib_get_dev_port(struct net_device *dev, + struct mlx4_ib_dev *ibdev) { - update_ipv6_gids(dev, port, 1); + u8 port = 0; + struct mlx4_ib_iboe *iboe; + struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ? + rdma_vlan_dev_real_dev(dev) : dev; + + iboe = &ibdev->iboe; + + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) + if ((netif_is_bond_master(real_dev) && + (real_dev == iboe->masters[port - 1])) || + (!netif_is_bond_master(real_dev) && + (real_dev == iboe->netdevs[port - 1]))) + break; + + if ((port == 0) || (port > ibdev->dev->caps.num_ports)) + return 0; + else + return port; } -static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, +static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct mlx4_ib_dev *ibdev; + struct in_ifaddr *ifa = ptr; + union ib_gid gid; + struct net_device *event_netdev = ifa->ifa_dev->dev; + + ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid); + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet); + + mlx4_ib_addr_event(event, event_netdev, ibdev, &gid); + return NOTIFY_DONE; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct mlx4_ib_dev *ibdev; - struct net_device *oldnd; + struct inet6_ifaddr *ifa = ptr; + union ib_gid *gid = (union ib_gid *)&ifa->addr; + struct net_device *event_netdev = ifa->idev->dev; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6); + + mlx4_ib_addr_event(event, event_netdev, ibdev, gid); + return NOTIFY_DONE; +} +#endif + +#define MLX4_IB_INVALID_MAC ((u64)-1) +static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + int port) +{ + u64 new_smac = 0; + u64 release_mac = MLX4_IB_INVALID_MAC; + struct mlx4_ib_qp *qp; + + read_lock(&dev_base_lock); + new_smac = mlx4_mac_to_u64(dev->dev_addr); + read_unlock(&dev_base_lock); + + mutex_lock(&ibdev->qp1_proxy_lock[port - 1]); + qp = ibdev->qp1_proxy[port - 1]; + if (qp) { + int new_smac_index; + u64 old_smac = qp->pri.smac; + struct mlx4_update_qp_params update_params; + + if (new_smac == old_smac) + goto unlock; + + new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac); + + if (new_smac_index < 0) + goto unlock; + + update_params.smac_index = new_smac_index; + if (mlx4_update_qp(ibdev->dev, &qp->mqp, MLX4_UPDATE_QP_SMAC, + &update_params)) { + release_mac = new_smac; + goto unlock; + } + + qp->pri.smac = new_smac; + qp->pri.smac_index = new_smac_index; + + release_mac = old_smac; + } + +unlock: + mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); + if (release_mac != MLX4_IB_INVALID_MAC) + mlx4_unregister_mac(ibdev->dev, port, release_mac); +} + +static void mlx4_ib_get_dev_addr(struct net_device *dev, + struct mlx4_ib_dev *ibdev, u8 port) +{ + struct in_device *in_dev; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev; + union ib_gid *pgid; + struct inet6_ifaddr *ifp; +#endif + union ib_gid gid; + + + if ((port == 0) || (port > ibdev->dev->caps.num_ports)) + return; + + /* IPv4 gids */ + in_dev = in_dev_get(dev); + if (in_dev) { + for_ifa(in_dev) { + /*ifa->ifa_address;*/ + ipv6_addr_set_v4mapped(ifa->ifa_address, + (struct in6_addr *)&gid); + update_gid_table(ibdev, port, &gid, 0, 0); + } + endfor_ifa(in_dev); + in_dev_put(in_dev); + } +#if IS_ENABLED(CONFIG_IPV6) + /* IPv6 gids */ + in6_dev = in6_dev_get(dev); + if (in6_dev) { + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + pgid = (union ib_gid *)&ifp->addr; + update_gid_table(ibdev, port, pgid, 0, 0); + } + read_unlock_bh(&in6_dev->lock); + in6_dev_put(in6_dev); + } +#endif +} + +static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev, + struct net_device *dev, u8 port) +{ + union ib_gid gid; + mlx4_make_default_gid(dev, &gid); + update_gid_table(ibdev, port, &gid, 0, 1); +} + +static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) +{ + struct net_device *dev; + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + int i; + + for (i = 1; i <= ibdev->num_ports; ++i) + if (reset_gid_table(ibdev, i)) + return -1; + + read_lock(&dev_base_lock); + spin_lock(&iboe->lock); + + for_each_netdev(&init_net, dev) { + u8 port = mlx4_ib_get_dev_port(dev, ibdev); + if (port) + mlx4_ib_get_dev_addr(dev, ibdev, port); + } + + spin_unlock(&iboe->lock); + read_unlock(&dev_base_lock); + + return 0; +} + +static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + unsigned long event) + +{ struct mlx4_ib_iboe *iboe; + int update_qps_port = -1; int port; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); iboe = &ibdev->iboe; spin_lock(&iboe->lock); mlx4_foreach_ib_transport_port(port, ibdev->dev) { - oldnd = iboe->netdevs[port - 1]; + enum ib_port_state port_state = IB_PORT_NOP; + struct net_device *old_master = iboe->masters[port - 1]; + struct net_device *curr_netdev; + struct net_device *curr_master; + iboe->netdevs[port - 1] = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); - if (oldnd != iboe->netdevs[port - 1]) { - if (iboe->netdevs[port - 1]) - netdev_added(ibdev, port); - else - netdev_removed(ibdev, port); + if (iboe->netdevs[port - 1]) + mlx4_ib_set_default_gid(ibdev, + iboe->netdevs[port - 1], port); + curr_netdev = iboe->netdevs[port - 1]; + + if (iboe->netdevs[port - 1] && + netif_is_bond_slave(iboe->netdevs[port - 1])) { + iboe->masters[port - 1] = netdev_master_upper_dev_get( + iboe->netdevs[port - 1]); + } else { + iboe->masters[port - 1] = NULL; + } + curr_master = iboe->masters[port - 1]; + + if (dev == iboe->netdevs[port - 1] && + (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || + event == NETDEV_UP || event == NETDEV_CHANGE)) + update_qps_port = port; + + if (curr_netdev) { + port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + } else { + reset_gid_table(ibdev, port); + } + /* if using bonding/team and a slave port is down, we don't the bond IP + * based gids in the table since flows that select port by gid may get + * the down port. + */ + if (curr_master && (port_state == IB_PORT_DOWN)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + } + /* if bonding is used it is possible that we add it to masters + * only after IP address is assigned to the net bonding + * interface. + */ + if (curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + mlx4_ib_get_dev_addr(curr_master, ibdev, port); } - } - if (dev == iboe->netdevs[0] || - (iboe->netdevs[0] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[0])) - handle_en_event(ibdev, 1, event); - else if (dev == iboe->netdevs[1] - || (iboe->netdevs[1] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[1])) - handle_en_event(ibdev, 2, event); + if (!curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + mlx4_ib_get_dev_addr(curr_netdev, ibdev, port); + } + } spin_unlock(&iboe->lock); + if (update_qps_port > 0) + mlx4_ib_update_qps(ibdev, dev, update_qps_port); +} + +static int mlx4_ib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mlx4_ib_dev *ibdev; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); + mlx4_ib_scan_netdevs(ibdev, dev, event); + return NOTIFY_DONE; } @@ -1458,7 +1871,7 @@ static void init_pkeys(struct mlx4_ib_dev *ibdev) static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { - char name[32]; + char name[80]; int eq_per_port = 0; int added_eqs = 0; int total_eqs = 0; @@ -1488,8 +1901,8 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) eq = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { - sprintf(name, "mlx4-ib-%d-%d@%s", - i, j, dev->pdev->bus->name); + snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s", + i, j, dev->pdev->bus->name); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, NULL, &ibdev->eq_table[eq])) { @@ -1539,17 +1952,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) int i, j; int err; struct mlx4_ib_iboe *iboe; + int ib_num_ports = 0; pr_info_once("%s", mlx4_ib_version); - mlx4_foreach_non_ib_transport_port(i, dev) - num_ports++; - - if (mlx4_is_mfunc(dev) && num_ports) { - dev_err(&dev->pdev->dev, "RoCE is not supported over SRIOV as yet\n"); - return NULL; - } - num_ports = 0; mlx4_foreach_ib_transport_port(i, dev) num_ports++; @@ -1688,12 +2094,13 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) } if (check_flow_steering_support(dev)) { + ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED; ibdev->ib_dev.create_flow = mlx4_ib_create_flow; ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; - ibdev->ib_dev.uverbs_cmd_mask |= - (1ull << IB_USER_VERBS_CMD_CREATE_FLOW) | - (1ull << IB_USER_VERBS_CMD_DESTROY_FLOW); + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); } mlx4_ib_alloc_eqs(dev, ibdev); @@ -1704,20 +2111,53 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_map; for (i = 0; i < ibdev->num_ports; ++i) { + mutex_init(&ibdev->qp1_proxy_lock[i]); if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[i]); if (err) ibdev->counters[i] = -1; - } else - ibdev->counters[i] = -1; + } else { + ibdev->counters[i] = -1; + } } + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && + ib_num_ports) { + ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; + err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, + MLX4_IB_UC_STEER_QPN_ALIGN, + &ibdev->steer_qpn_base); + if (err) + goto err_counter; + + ibdev->ib_uc_qpns_bitmap = + kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * + sizeof(long), + GFP_KERNEL); + if (!ibdev->ib_uc_qpns_bitmap) { + dev_err(&dev->pdev->dev, "bit map alloc failed\n"); + goto err_steer_qp_release; + } + + bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); + + err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE( + dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_base + + ibdev->steer_qpn_count - 1); + if (err) + goto err_steer_free_bitmap; + } + if (ib_register_device(&ibdev->ib_dev, NULL)) - goto err_counter; + goto err_steer_free_bitmap; if (mlx4_ib_mad_init(ibdev)) goto err_reg; @@ -1725,11 +2165,39 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_init_sriov(ibdev)) goto err_mad; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) { - iboe->nb.notifier_call = mlx4_ib_netdev_event; - err = register_netdevice_notifier(&iboe->nb); - if (err) - goto err_sriov; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { + if (!iboe->nb.notifier_call) { + iboe->nb.notifier_call = mlx4_ib_netdev_event; + err = register_netdevice_notifier(&iboe->nb); + if (err) { + iboe->nb.notifier_call = NULL; + goto err_notif; + } + } + if (!iboe->nb_inet.notifier_call) { + iboe->nb_inet.notifier_call = mlx4_ib_inet_event; + err = register_inetaddr_notifier(&iboe->nb_inet); + if (err) { + iboe->nb_inet.notifier_call = NULL; + goto err_notif; + } + } +#if IS_ENABLED(CONFIG_IPV6) + if (!iboe->nb_inet6.notifier_call) { + iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event; + err = register_inet6addr_notifier(&iboe->nb_inet6); + if (err) { + iboe->nb_inet6.notifier_call = NULL; + goto err_notif; + } + } +#endif + for (i = 1 ; i <= ibdev->num_ports ; ++i) + reset_gid_table(ibdev, i); + rtnl_lock(); + mlx4_ib_scan_netdevs(ibdev, NULL, 0); + rtnl_unlock(); + mlx4_ib_init_gid_table(ibdev); } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { @@ -1755,11 +2223,25 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) return ibdev; err_notif: - if (unregister_netdevice_notifier(&ibdev->iboe.nb)) - pr_warn("failure unregistering notifier\n"); + if (ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } +#if IS_ENABLED(CONFIG_IPV6) + if (ibdev->iboe.nb_inet6.notifier_call) { + if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet6.notifier_call = NULL; + } +#endif flush_workqueue(wq); -err_sriov: mlx4_ib_close_sriov(ibdev); err_mad: @@ -1768,6 +2250,13 @@ err_mad: err_reg: ib_unregister_device(&ibdev->ib_dev); +err_steer_free_bitmap: + kfree(ibdev->ib_uc_qpns_bitmap); + +err_steer_qp_release: + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); err_counter: for (; i; --i) if (ibdev->counters[i - 1] != -1) @@ -1788,6 +2277,69 @@ err_dealloc: return NULL; } +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) +{ + int offset; + + WARN_ON(!dev->ib_uc_qpns_bitmap); + + offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, + dev->steer_qpn_count, + get_count_order(count)); + if (offset < 0) + return offset; + + *qpn = dev->steer_qpn_base + offset; + return 0; +} + +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) +{ + if (!qpn || + dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED) + return; + + BUG_ON(qpn < dev->steer_qpn_base); + + bitmap_release_region(dev->ib_uc_qpns_bitmap, + qpn - dev->steer_qpn_base, + get_count_order(count)); +} + +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach) +{ + int err; + size_t flow_size; + struct ib_flow_attr *flow = NULL; + struct ib_flow_spec_ib *ib_spec; + + if (is_attach) { + flow_size = sizeof(struct ib_flow_attr) + + sizeof(struct ib_flow_spec_ib); + flow = kzalloc(flow_size, GFP_KERNEL); + if (!flow) + return -ENOMEM; + flow->port = mqp->port; + flow->num_of_specs = 1; + flow->size = flow_size; + ib_spec = (struct ib_flow_spec_ib *)(flow + 1); + ib_spec->type = IB_FLOW_SPEC_IB; + ib_spec->size = sizeof(struct ib_flow_spec_ib); + /* Add an empty rule for IB L2 */ + memset(&ib_spec->mask, 0, sizeof(ib_spec->mask)); + + err = __mlx4_ib_create_flow(&mqp->ibqp, flow, + IB_FLOW_DOMAIN_NIC, + MLX4_FS_REGULAR, + &mqp->reg_id); + } else { + err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); + } + kfree(flow); + return err; +} + static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; @@ -1801,6 +2353,26 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } + + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); + kfree(ibdev->ib_uc_qpns_bitmap); + } + + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } +#if IS_ENABLED(CONFIG_IPV6) + if (ibdev->iboe.nb_inet6.notifier_call) { + if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet6.notifier_call = NULL; + } +#endif + iounmap(ibdev->uar_map); for (p = 0; p < ibdev->num_ports; ++p) if (ibdev->counters[p] != -1) @@ -1821,17 +2393,24 @@ static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) struct mlx4_dev *dev = ibdev->dev; int i; unsigned long flags; + struct mlx4_active_ports actv_ports; + unsigned int ports; + unsigned int first_port; if (!mlx4_is_master(dev)) return; - dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC); + actv_ports = mlx4_get_active_ports(dev, slave); + ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports); + first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); + + dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC); if (!dm) { pr_err("failed to allocate memory for tunneling qp update\n"); goto out; } - for (i = 0; i < dev->caps.num_ports; i++) { + for (i = 0; i < ports; i++) { dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); if (!dm[i]) { pr_err("failed to allocate memory for tunneling qp update work struct\n"); @@ -1843,9 +2422,9 @@ static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) } } /* initialize or tear down tunnel QPs for the slave */ - for (i = 0; i < dev->caps.num_ports; i++) { + for (i = 0; i < ports; i++) { INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); - dm[i]->port = i + 1; + dm[i]->port = first_port + i + 1; dm[i]->slave = slave; dm[i]->do_init = do_init; dm[i]->dev = ibdev; diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c index 25b2cdff00f..ed327e6c8fd 100644 --- a/drivers/infiniband/hw/mlx4/mcg.c +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -215,8 +215,9 @@ static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) } mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); spin_unlock(&dev->sm_lock); - return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port, - IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, mad); + return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), + ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY, + &ah_attr, NULL, mad); } static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 036b663dd26..369da3ca5d6 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -68,6 +68,8 @@ enum { /*module param to indicate if SM assigns the alias_GUID*/ extern int mlx4_ib_sm_guid_assign; +#define MLX4_IB_UC_STEER_QPN_ALIGN 1 +#define MLX4_IB_UC_MAX_NUM_QPS 256 struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; struct mlx4_uar uar; @@ -153,6 +155,8 @@ struct mlx4_ib_wq { enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, + MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, MLX4_IB_SRIOV_SQP = 1 << 31, }; @@ -238,6 +242,22 @@ struct mlx4_ib_proxy_sqp_hdr { struct mlx4_rcv_tunnel_hdr tun; } __packed; +struct mlx4_roce_smac_vlan_info { + u64 smac; + int smac_index; + int smac_port; + u64 candidate_smac; + int candidate_smac_index; + int candidate_smac_port; + u16 vid; + int vlan_index; + int vlan_port; + u16 candidate_vid; + int candidate_vlan_index; + int candidate_vlan_port; + int update_vid; +}; + struct mlx4_ib_qp { struct ib_qp ibqp; struct mlx4_qp mqp; @@ -270,7 +290,9 @@ struct mlx4_ib_qp { struct list_head gid_list; struct list_head steering_rules; struct mlx4_ib_buf *sqp_proxy_rcv; - + struct mlx4_roce_smac_vlan_info pri; + struct mlx4_roce_smac_vlan_info alt; + u64 reg_id; }; struct mlx4_ib_srq { @@ -428,7 +450,10 @@ struct mlx4_ib_sriov { struct mlx4_ib_iboe { spinlock_t lock; struct net_device *netdevs[MLX4_MAX_PORTS]; + struct net_device *masters[MLX4_MAX_PORTS]; struct notifier_block nb; + struct notifier_block nb_inet; + struct notifier_block nb_inet6; union ib_gid gid_table[MLX4_MAX_PORTS][128]; }; @@ -494,6 +519,13 @@ struct mlx4_ib_dev { struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; struct pkey_mgt pkeys; + unsigned long *ib_uc_qpns_bitmap; + int steer_qpn_count; + int steer_qpn_base; + int steering_support; + struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS]; + /* lock when destroying qp1_proxy and getting netdev events */ + struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; }; struct ib_event_work { @@ -675,9 +707,6 @@ int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view); -int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, - u8 *mac, int *is_mcast, u8 port); - static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) { u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; @@ -712,9 +741,12 @@ void mlx4_ib_tunnels_update_work(struct work_struct *work); int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type qpt, struct ib_wc *wc, struct ib_grh *grh, struct ib_mad *mad); + int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, - u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad); + u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, + struct ib_mad *mad); + __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, @@ -752,5 +784,9 @@ void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); __be64 mlx4_ib_gen_node_guid(void); +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach); #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index e471f089ff0..cb2a8727f3f 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -90,11 +90,11 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem) { u64 *pages; - struct ib_umem_chunk *chunk; - int i, j, k; + int i, k, entry; int n; int len; int err = 0; + struct scatterlist *sg; pages = (u64 *) __get_free_page(GFP_KERNEL); if (!pages) @@ -102,26 +102,25 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, i = n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> mtt->page_shift; - for (k = 0; k < len; ++k) { - pages[i++] = sg_dma_address(&chunk->page_list[j]) + - umem->page_size * k; - /* - * Be friendly to mlx4_write_mtt() and - * pass it chunks of appropriate size. - */ - if (i == PAGE_SIZE / sizeof (u64)) { - err = mlx4_write_mtt(dev->dev, mtt, n, - i, pages); - if (err) - goto out; - n += i; - i = 0; - } + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> mtt->page_shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(sg) + + umem->page_size * k; + /* + * Be friendly to mlx4_write_mtt() and + * pass it chunks of appropriate size. + */ + if (i == PAGE_SIZE / sizeof (u64)) { + err = mlx4_write_mtt(dev->dev, mtt, n, + i, pages); + if (err) + goto out; + n += i; + i = 0; } } + } if (i) err = mlx4_write_mtt(dev->dev, mtt, n, i, pages); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 4f10af2905b..67780452f0c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -90,6 +90,21 @@ enum { MLX4_RAW_QP_MSGMAX = 31, }; +#ifndef ETH_ALEN +#define ETH_ALEN 6 +#endif +static inline u64 mlx4_mac_to_u64(u8 *addr) +{ + u64 mac = 0; + int i; + + for (i = 0; i < ETH_ALEN; i++) { + mac <<= 8; + mac |= addr[i]; + } + return mac; +} + static const __be32 mlx4_ib_opcode[] = { [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), @@ -593,9 +608,20 @@ static int qp_has_rq(struct ib_qp_init_attr *attr) return !attr->srq; } +static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i]) + return !!dev->caps.qp0_qkey[i]; + } + return 0; +} + static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp, + gfp_t gfp) { int qpn; int err; @@ -610,10 +636,13 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { if (init_attr->qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_PROXY_GSI; - else if (mlx4_is_master(dev->dev)) - qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; - else - qp_type = MLX4_IB_QPT_PROXY_SMI; + else { + if (mlx4_is_master(dev->dev) || + qp0_enabled_vf(dev->dev, sqpn)) + qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_PROXY_SMI; + } } qpn = sqpn; /* add extra sg entry for tunneling */ @@ -628,7 +657,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, return -EINVAL; if (tnl_init->proxy_qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_TUN_GSI; - else if (tnl_init->slave == mlx4_master_func_num(dev->dev)) + else if (tnl_init->slave == mlx4_master_func_num(dev->dev) || + mlx4_vf_smi_enabled(dev->dev, tnl_init->slave, + tnl_init->port)) qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; else qp_type = MLX4_IB_QPT_TUN_SMI; @@ -643,14 +674,18 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { - sqp = kzalloc(sizeof (struct mlx4_ib_sqp), GFP_KERNEL); + sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp); if (!sqp) return -ENOMEM; qp = &sqp->qp; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; } else { - qp = kzalloc(sizeof (struct mlx4_ib_qp), GFP_KERNEL); + qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp); if (!qp) return -ENOMEM; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; } } else qp = *caller_qp; @@ -716,19 +751,27 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) qp->flags |= MLX4_IB_QP_LSO; + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (dev->steering_support == + MLX4_STEERING_MODE_DEVICE_MANAGED) + qp->flags |= MLX4_IB_QP_NETIF; + else + goto err; + } + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); if (err) goto err; if (qp_has_rq(init_attr)) { - err = mlx4_db_alloc(dev->dev, &qp->db, 0); + err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp); if (err) goto err; *qp->db.db = 0; } - if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) { err = -ENOMEM; goto err_db; } @@ -738,13 +781,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp); if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL); - + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -765,12 +807,16 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->qp_type == IB_QPT_RAW_PACKET) err = mlx4_qp_reserve_range(dev->dev, 1, 1 << 8, &qpn); else - err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn); + if (qp->flags & MLX4_IB_QP_NETIF) + err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); + else + err = mlx4_qp_reserve_range(dev->dev, 1, 1, + &qpn); if (err) goto err_proxy; } - err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp); if (err) goto err_qpn; @@ -790,8 +836,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, return 0; err_qpn: - if (!sqpn) - mlx4_qp_release_range(dev->dev, qpn, 1); + if (!sqpn) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qpn, 1); + else + mlx4_qp_release_range(dev->dev, qpn, 1); + } err_proxy: if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) free_proxy_bufs(pd->device, qp); @@ -909,11 +959,32 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, { struct mlx4_ib_cq *send_cq, *recv_cq; - if (qp->state != IB_QPS_RESET) + if (qp->state != IB_QPS_RESET) { if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) pr_warn("modify QP %06x to RESET failed.\n", qp->mqp.qpn); + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } get_cqs(qp, &send_cq, &recv_cq); @@ -932,8 +1003,12 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_qp_free(dev->dev, &qp->mqp); - if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) - mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); + else + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + } mlx4_mtt_cleanup(dev->dev, &qp->mtt); @@ -980,19 +1055,30 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct mlx4_ib_qp *qp = NULL; int err; u16 xrcdn = 0; + gfp_t gfp; + gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ? + GFP_NOIO : GFP_KERNEL; /* * We only support LSO, vendor flag1, and multicast loopback blocking, * and only for kernel UD QPs. */ if (init_attr->create_flags & ~(MLX4_IB_QP_LSO | MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | - MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP)) + MLX4_IB_SRIOV_TUNNEL_QP | + MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_NETIF | + MLX4_IB_QP_CREATE_USE_GFP_NOIO)) return ERR_PTR(-EINVAL); + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (init_attr->qp_type != IB_QPT_UD) + return ERR_PTR(-EINVAL); + } + if (init_attr->create_flags && (udata || - ((init_attr->create_flags & ~MLX4_IB_SRIOV_SQP) && + ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) && init_attr->qp_type != IB_QPT_UD) || ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) && init_attr->qp_type > IB_QPT_GSI))) @@ -1012,14 +1098,16 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_RAW_PACKET: - qp = kzalloc(sizeof *qp, GFP_KERNEL); + qp = kzalloc(sizeof *qp, gfp); if (!qp) return ERR_PTR(-ENOMEM); + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; /* fall through */ case IB_QPT_UD: { err = create_qp_common(to_mdev(pd->device), pd, init_attr, - udata, 0, &qp); + udata, 0, &qp, gfp); if (err) return ERR_PTR(err); @@ -1037,7 +1125,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, get_sqp_num(to_mdev(pd->device), init_attr), - &qp); + &qp, gfp); if (err) return ERR_PTR(err); @@ -1063,6 +1151,12 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp) if (is_qp0(dev, mqp)) mlx4_CLOSE_PORT(dev->dev, mqp->port); + if (dev->qp1_proxy[mqp->port - 1] == mqp) { + mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]); + dev->qp1_proxy[mqp->port - 1] = NULL; + mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]); + } + pd = get_pd(mqp); destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); @@ -1144,16 +1238,16 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6); } -static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, - struct mlx4_qp_path *path, u8 port) +static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, + u64 smac, u16 vlan_tag, struct mlx4_qp_path *path, + struct mlx4_roce_smac_vlan_info *smac_info, u8 port) { - int err; int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_ETHERNET; - u8 mac[6]; - int is_mcast; - u16 vlan_tag; int vidx; + int smac_index; + int err; + path->grh_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); @@ -1182,36 +1276,105 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, } if (is_eth) { - path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | - ((port - 1) << 6) | ((ah->sl & 7) << 3); - if (!(ah->ah_flags & IB_AH_GRH)) return -1; - err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port); - if (err) - return err; - - memcpy(path->dmac, mac, 6); - path->ackto = MLX4_IB_LINK_TYPE_ETH; - /* use index 0 into MAC table for IBoE */ - path->grh_mylmc &= 0x80; + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 7) << 3); - vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]); + path->feup |= MLX4_FEUP_FORCE_ETH_UP; if (vlan_tag < 0x1000) { - if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx)) - return -ENOENT; - - path->vlan_index = vidx; + if (smac_info->vid < 0x1000) { + /* both valid vlan ids */ + if (smac_info->vid != vlan_tag) { + /* different VIDs. unreg old and reg new */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } else { + path->vlan_index = smac_info->vlan_index; + } + } else { + /* no current vlan tag in qp */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } + path->feup |= MLX4_FVL_FORCE_ETH_VLAN; path->fl = 1 << 6; + } else { + /* have current vlan tag. unregister it at modify-qp success */ + if (smac_info->vid < 0x1000) { + smac_info->candidate_vid = 0xFFFF; + smac_info->update_vid = 1; + } } - } else + + /* get smac_index for RoCE use. + * If no smac was yet assigned, register one. + * If one was already assigned, but the new mac differs, + * unregister the old one and register the new one. + */ + if (!smac_info->smac || smac_info->smac != smac) { + /* register candidate now, unreg if needed, after success */ + smac_index = mlx4_register_mac(dev->dev, port, smac); + if (smac_index >= 0) { + smac_info->candidate_smac_index = smac_index; + smac_info->candidate_smac = smac; + smac_info->candidate_smac_port = port; + } else { + return -EINVAL; + } + } else { + smac_index = smac_info->smac_index; + } + + memcpy(path->dmac, ah->dmac, 6); + path->ackto = MLX4_IB_LINK_TYPE_ETH; + /* put MAC table smac index for IBoE */ + path->grh_mylmc = (u8) (smac_index) | 0x80; + } else { path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6) | ((ah->sl & 0xf) << 2); + } return 0; } +static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->ah_attr, + mlx4_mac_to_u64((u8 *)qp->smac), + (qp_attr_mask & IB_QP_VID) ? qp->vlan_id : 0xffff, + path, &mqp->pri, port); +} + +static int mlx4_set_alt_path(struct mlx4_ib_dev *dev, + const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->alt_ah_attr, + mlx4_mac_to_u64((u8 *)qp->alt_smac), + (qp_attr_mask & IB_QP_ALT_VID) ? + qp->alt_vlan_id : 0xffff, + path, &mqp->alt, port); +} + static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { struct mlx4_ib_gid_entry *ge, *tmp; @@ -1224,6 +1387,37 @@ static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) } } +static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac, + struct mlx4_qp_context *context) +{ + struct net_device *ndev; + u64 u64_mac; + int smac_index; + + + ndev = dev->iboe.netdevs[qp->port - 1]; + if (ndev) { + smac = ndev->dev_addr; + u64_mac = mlx4_mac_to_u64(smac); + } else { + u64_mac = dev->dev->caps.def_mac[qp->port]; + } + + context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); + if (!qp->pri.smac) { + smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); + if (smac_index >= 0) { + qp->pri.candidate_smac_index = smac_index; + qp->pri.candidate_smac = u64_mac; + qp->pri.candidate_smac_port = qp->port; + context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; + } else { + return -ENOENT; + } + } + return 0; +} + static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) @@ -1235,6 +1429,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, struct mlx4_qp_context *context; enum mlx4_qp_optpar optpar = 0; int sqd_event; + int steer_qp = 0; int err = -EINVAL; context = kzalloc(sizeof *context, GFP_KERNEL); @@ -1319,6 +1514,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; } else context->pri_path.counter_index = 0xff; + + if (qp->flags & MLX4_IB_QP_NETIF) { + mlx4_ib_steer_qp_reg(dev, qp, 1); + steer_qp = 1; + } } if (attr_mask & IB_QP_PKEY_INDEX) { @@ -1329,7 +1529,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } if (attr_mask & IB_QP_AV) { - if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path, + if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) goto out; @@ -1352,8 +1552,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, dev->dev->caps.pkey_table_len[attr->alt_port_num]) goto out; - if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path, - attr->alt_port_num)) + if (mlx4_set_alt_path(dev, attr, attr_mask, qp, + &context->alt_path, + attr->alt_port_num)) goto out; context->alt_path.pkey_index = attr->alt_pkey_index; @@ -1458,12 +1659,39 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.fl = 0x80; context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; } + if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) { + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) + context->pri_path.feup = 1 << 7; /* don't fsm */ + /* handle smac_index */ + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { + err = handle_eth_ud_smac_index(dev, qp, (u8 *)attr->smac, context); + if (err) + return -EINVAL; + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + dev->qp1_proxy[qp->port - 1] = qp; + } + } } if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | MLX4_IB_LINK_TYPE_ETH; + if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { + int is_eth = rdma_port_get_link_layer( + &dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) { + context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH; + optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH; + } + } + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) sqd_event = 1; @@ -1534,23 +1762,113 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ - if (new_state == IB_QPS_RESET && !ibqp->uobject) { - mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, - ibqp->srq ? to_msrq(ibqp->srq): NULL); - if (send_cq != recv_cq) - mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + if (new_state == IB_QPS_RESET) { + if (!ibqp->uobject) { + mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq_next_wqe = 0; + if (qp->rq.wqe_cnt) + *qp->db.db = 0; + + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_reg(dev, qp, 0); + } + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } - qp->rq.head = 0; - qp->rq.tail = 0; - qp->sq.head = 0; - qp->sq.tail = 0; - qp->sq_next_wqe = 0; - if (qp->rq.wqe_cnt) - *qp->db.db = 0; + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } } - out: + if (err && steer_qp) + mlx4_ib_steer_qp_reg(dev, qp, 0); kfree(context); + if (qp->pri.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); + } else { + if (qp->pri.smac) + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = qp->pri.candidate_smac; + qp->pri.smac_index = qp->pri.candidate_smac_index; + qp->pri.smac_port = qp->pri.candidate_smac_port; + } + qp->pri.candidate_smac = 0; + qp->pri.candidate_smac_index = 0; + qp->pri.candidate_smac_port = 0; + } + if (qp->alt.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac); + } else { + if (qp->alt.smac) + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = qp->alt.candidate_smac; + qp->alt.smac_index = qp->alt.candidate_smac_index; + qp->alt.smac_port = qp->alt.candidate_smac_port; + } + qp->alt.candidate_smac = 0; + qp->alt.candidate_smac_index = 0; + qp->alt.candidate_smac_port = 0; + } + + if (qp->pri.update_vid) { + if (err) { + if (qp->pri.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, + qp->pri.candidate_vid); + } else { + if (qp->pri.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, + qp->pri.vid); + qp->pri.vid = qp->pri.candidate_vid; + qp->pri.vlan_port = qp->pri.candidate_vlan_port; + qp->pri.vlan_index = qp->pri.candidate_vlan_index; + } + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.update_vid) { + if (err) { + if (qp->alt.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, + qp->alt.candidate_vid); + } else { + if (qp->alt.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, + qp->alt.vid); + qp->alt.vid = qp->alt.candidate_vid; + qp->alt.vlan_port = qp->alt.candidate_vlan_port; + qp->alt.vlan_index = qp->alt.candidate_vlan_index; + } + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + return err; } @@ -1561,13 +1879,21 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct mlx4_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; int err = -EINVAL; - + int ll; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + ll = IB_LINK_LAYER_UNSPECIFIED; + } else { + int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + ll = rdma_port_get_link_layer(&dev->ib_dev, port); + } + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, ll)) { pr_debug("qpn 0x%x: invalid attribute mask specified " "for transition %d to %d. qp_type %d," " attr_mask 0x%x\n", @@ -1631,6 +1957,19 @@ out: return err; } +static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i] || + qpn == dev->caps.qp0_tunnel[i]) { + *qkey = dev->caps.qp0_qkey[i]; + return 0; + } + } + return -EINVAL; +} + static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) @@ -1688,8 +2027,13 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]); sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); - if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) - return -EINVAL; + if (mlx4_is_master(mdev->dev)) { + if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } else { + if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } sqp->ud_header.deth.qkey = cpu_to_be32(qkey); sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); @@ -1744,9 +2088,9 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, { struct ib_device *ib_dev = sqp->qp.ibqp.device; struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_ctrl_seg *ctrl = wqe; struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); - struct net_device *ndev; union ib_gid sgid; u16 pkey; int send_size; @@ -1770,12 +2114,11 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so * we must use our own cache */ - sgid.global.subnet_prefix = - to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. - subnet_prefix; - sgid.global.interface_id = - to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. - guid_cache[ah->av.ib.gid_index]; + err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid.raw[0]); + if (err) + return err; } else { err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, @@ -1784,8 +2127,10 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, return err; } - vlan = rdma_get_vlan_id(&sgid); - is_vlan = vlan < 0x1000; + if (ah->av.eth.vlan != cpu_to_be16(0xffff)) { + vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff; + is_vlan = 1; + } } ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); @@ -1802,6 +2147,9 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, sqp->ud_header.grh.flow_label = ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + if (is_eth) + memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); + else { if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so @@ -1817,6 +2165,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); + } memcpy(sqp->ud_header.grh.destination_gid.raw, ah->av.ib.dgid, 16); } @@ -1849,16 +2198,23 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, if (is_eth) { u8 *smac; + struct in6_addr in6; + u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; mlx->sched_prio = cpu_to_be16(pcp); memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); /* FIXME: cache smac value? */ - ndev = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]; - if (!ndev) - return -ENODEV; - smac = ndev->dev_addr; + memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); + memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); + memcpy(&in6, sgid.raw, sizeof(in6)); + + if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) + smac = to_mdev(sqp->qp.ibqp.device)-> + iboe.netdevs[sqp->qp.port - 1]->dev_addr; + else /* use the src mac of the tunnel */ + smac = ah->av.eth.s_mac; memcpy(sqp->ud_header.eth.smac_h, smac, 6); if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); @@ -2059,7 +2415,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, struct mlx4_wqe_datagram_seg *dseg, - struct ib_send_wr *wr, enum ib_qp_type qpt) + struct ib_send_wr *wr, + enum mlx4_ib_qp_type qpt) { union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av; struct mlx4_av sqp_av = {0}; @@ -2072,8 +2429,10 @@ static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, cpu_to_be32(0xf0000000); memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); - /* This function used only for sending on QP1 proxies */ - dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + if (qpt == MLX4_IB_QPT_PROXY_GSI) + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + else + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]); /* Use QKEY from the QP context, which is set by master */ dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); } @@ -2090,6 +2449,8 @@ static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_ hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index); hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + memcpy(hdr.mac, ah->av.eth.mac, 6); + hdr.vlan = ah->av.eth.vlan; spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); @@ -2366,11 +2727,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, break; case MLX4_IB_QPT_PROXY_SMI_OWNER: - if (unlikely(!mlx4_is_master(to_mdev(ibqp->device)->dev))) { - err = -ENOSYS; - *bad_wr = wr; - goto out; - } err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; @@ -2387,16 +2743,13 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += seglen / 16; break; case MLX4_IB_QPT_PROXY_SMI: - /* don't allow QP0 sends on guests */ - err = -ENOSYS; - *bad_wr = wr; - goto out; case MLX4_IB_QPT_PROXY_GSI: /* If we are tunneling special qps, this is a UD qp. * In this case we first add a UD segment targeting * the tunnel qp, and then add a header with address * information */ - set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, ibqp->qp_type); + set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, + qp->mlx4_ib_qp_type); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; build_tunnel_header(wr, wqe, &seglen); @@ -2762,6 +3115,9 @@ done: if (qp->flags & MLX4_IB_QP_LSO) qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + if (qp->flags & MLX4_IB_QP_NETIF) + qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP; + qp_init_attr->sq_sig_type = qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 60c5fb025fc..62d9285300a 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -134,13 +134,14 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; } else { - err = mlx4_db_alloc(dev->dev, &srq->db, 0); + err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL); if (err) goto err_srq; *srq->db.db = 0; - if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf, + GFP_KERNEL)) { err = -ENOMEM; goto err_db; } @@ -165,7 +166,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf); + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL); if (err) goto err_mtt; diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index 97516eb363b..cb4c66e723b 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -389,8 +389,10 @@ struct mlx4_port { struct mlx4_ib_dev *dev; struct attribute_group pkey_group; struct attribute_group gid_group; - u8 port_num; + struct device_attribute enable_smi_admin; + struct device_attribute smi_enabled; int slave; + u8 port_num; }; @@ -558,6 +560,101 @@ err: return NULL; } +static ssize_t sysfs_show_smi_enabled(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, smi_enabled); + ssize_t len = 0; + + if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_show_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + ssize_t len = 0; + + if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_store_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + int enable; + + if (sscanf(buf, "%i", &enable) != 1 || + enable < 0 || enable > 1) + return -EINVAL; + + if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable)) + return -EINVAL; + return count; +} + +static int add_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + int ret; + + /* do not display entries if eth transport, or if master */ + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return 0; + + sysfs_attr_init(&p->smi_enabled.attr); + p->smi_enabled.show = sysfs_show_smi_enabled; + p->smi_enabled.store = NULL; + p->smi_enabled.attr.name = "smi_enabled"; + p->smi_enabled.attr.mode = 0444; + ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr); + if (ret) { + pr_err("failed to create smi_enabled\n"); + return ret; + } + + sysfs_attr_init(&p->enable_smi_admin.attr); + p->enable_smi_admin.show = sysfs_show_enable_smi_admin; + p->enable_smi_admin.store = sysfs_store_enable_smi_admin; + p->enable_smi_admin.attr.name = "enable_smi_admin"; + p->enable_smi_admin.attr.mode = 0644; + ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr); + if (ret) { + pr_err("failed to create enable_smi_admin\n"); + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + return ret; + } + return 0; +} + +static void remove_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return; + + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr); +} + static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) { struct mlx4_port *p; @@ -582,8 +679,10 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, store_port_pkey, dev->dev->caps.pkey_table_len[port_num]); - if (!p->pkey_group.attrs) + if (!p->pkey_group.attrs) { + ret = -ENOMEM; goto err_alloc; + } ret = sysfs_create_group(&p->kobj, &p->pkey_group); if (ret) @@ -591,13 +690,19 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) p->gid_group.name = "gid_idx"; p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1); - if (!p->gid_group.attrs) + if (!p->gid_group.attrs) { + ret = -ENOMEM; goto err_free_pkey; + } ret = sysfs_create_group(&p->kobj, &p->gid_group); if (ret) goto err_free_gid; + ret = add_vf_smi_entries(p); + if (ret) + goto err_free_gid; + list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]); return 0; @@ -623,6 +728,7 @@ static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave) int port; struct kobject *p, *t; struct mlx4_port *mport; + struct mlx4_active_ports actv_ports; get_name(dev, name, slave, sizeof name); @@ -645,7 +751,11 @@ static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave) goto err_ports; } + actv_ports = mlx4_get_active_ports(dev->dev, slave); + for (port = 1; port <= dev->dev->caps.num_ports; ++port) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; err = add_port(dev, port, slave); if (err) goto err_add; @@ -660,6 +770,7 @@ err_add: mport = container_of(p, struct mlx4_port, kobj); sysfs_remove_group(p, &mport->pkey_group); sysfs_remove_group(p, &mport->gid_group); + remove_vf_smi_entries(mport); kobject_put(p); } kobject_put(dev->dev_ports_parent[slave]); @@ -704,6 +815,7 @@ static void unregister_pkey_tree(struct mlx4_ib_dev *device) port = container_of(p, struct mlx4_port, kobj); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); + remove_vf_smi_entries(port); kobject_put(p); kobject_put(device->dev_ports_parent[slave]); } diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig index 8e6aebfaf8a..10df386c634 100644 --- a/drivers/infiniband/hw/mlx5/Kconfig +++ b/drivers/infiniband/hw/mlx5/Kconfig @@ -1,6 +1,6 @@ config MLX5_INFINIBAND tristate "Mellanox Connect-IB HCA support" - depends on NETDEVICES && ETHERNET && PCI && X86 + depends on NETDEVICES && ETHERNET && PCI select NET_VENDOR_MELLANOX select MLX5_CORE ---help--- diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 344ab03948a..8ae4f896cb4 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -32,6 +32,7 @@ #include <linux/kref.h> #include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> #include "mlx5_ib.h" #include "user.h" @@ -73,14 +74,24 @@ static void *get_cqe(struct mlx5_ib_cq *cq, int n) return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz); } +static u8 sw_ownership_bit(int n, int nent) +{ + return (n & nent) ? 1 : 0; +} + static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n) { void *cqe = get_cqe(cq, n & cq->ibcq.cqe); struct mlx5_cqe64 *cqe64; cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; - return ((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ - !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; + + if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } } static void *next_cqe_sw(struct mlx5_ib_cq *cq) @@ -351,6 +362,43 @@ static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, qp->sq.last_poll = tail; } +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ + mlx5_buf_free(&dev->mdev, &buf->buf); +} + +static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, + struct ib_sig_err *item) +{ + u16 syndrome = be16_to_cpu(cqe->syndrome); + +#define GUARD_ERR (1 << 13) +#define APPTAG_ERR (1 << 12) +#define REFTAG_ERR (1 << 11) + + if (syndrome & GUARD_ERR) { + item->err_type = IB_SIG_BAD_GUARD; + item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16; + item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16; + } else + if (syndrome & REFTAG_ERR) { + item->err_type = IB_SIG_BAD_REFTAG; + item->expected = be32_to_cpu(cqe->expected_reftag); + item->actual = be32_to_cpu(cqe->actual_reftag); + } else + if (syndrome & APPTAG_ERR) { + item->err_type = IB_SIG_BAD_APPTAG; + item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff; + item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff; + } else { + pr_err("Got signature completion error with bad syndrome %04x\n", + syndrome); + } + + item->sig_err_offset = be64_to_cpu(cqe->err_offset); + item->key = be32_to_cpu(cqe->mkey); +} + static int mlx5_poll_one(struct mlx5_ib_cq *cq, struct mlx5_ib_qp **cur_qp, struct ib_wc *wc) @@ -360,12 +408,16 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, struct mlx5_cqe64 *cqe64; struct mlx5_core_qp *mqp; struct mlx5_ib_wq *wq; + struct mlx5_sig_err_cqe *sig_err_cqe; + struct mlx5_core_mr *mmr; + struct mlx5_ib_mr *mr; uint8_t opcode; uint32_t qpn; u16 wqe_ctr; void *cqe; int idx; +repoll: cqe = next_cqe_sw(cq); if (!cqe) return -EAGAIN; @@ -379,7 +431,18 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, */ rmb(); - /* TBD: resize CQ */ + opcode = cqe64->op_own >> 4; + if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { + if (likely(cq->resize_buf)) { + free_cq_buf(dev, &cq->buf); + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + goto repoll; + } else { + mlx5_ib_warn(dev, "unexpected resize cqe\n"); + } + } qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { @@ -398,7 +461,6 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, } wc->qp = &(*cur_qp)->ibqp; - opcode = cqe64->op_own >> 4; switch (opcode) { case MLX5_CQE_REQ: wq = &(*cur_qp)->sq; @@ -449,6 +511,33 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, } } break; + case MLX5_CQE_SIG_ERR: + sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; + + read_lock(&dev->mdev.priv.mr_table.lock); + mmr = __mlx5_mr_lookup(&dev->mdev, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + if (unlikely(!mmr)) { + read_unlock(&dev->mdev.priv.mr_table.lock); + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", + cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); + return -EINVAL; + } + + mr = to_mibmr(mmr); + get_sig_err_item(sig_err_cqe, &mr->sig->err_item); + mr->sig->sig_err_exists = true; + mr->sig->sigerr_count++; + + mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", + cq->mcq.cqn, mr->sig->err_item.key, + mr->sig->err_item.err_type, + mr->sig->err_item.sig_err_offset, + mr->sig->err_item.expected, + mr->sig->err_item.actual); + + read_unlock(&dev->mdev.priv.mr_table.lock); + goto repoll; } return 0; @@ -503,29 +592,35 @@ static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, return err; buf->cqe_size = cqe_size; + buf->nent = nent; return 0; } -static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) -{ - mlx5_buf_free(&dev->mdev, &buf->buf); -} - static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, struct ib_ucontext *context, struct mlx5_ib_cq *cq, int entries, struct mlx5_create_cq_mbox_in **cqb, int *cqe_size, int *index, int *inlen) { struct mlx5_ib_create_cq ucmd; + size_t ucmdlen; int page_shift; int npages; int ncont; int err; - if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) return -EFAULT; + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) return -EINVAL; @@ -556,7 +651,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, goto err_db; } mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); - (*cqb)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; + (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; *index = to_mucontext(context)->uuari.uars[0].index; @@ -576,16 +671,16 @@ static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) ib_umem_release(cq->buf.umem); } -static void init_cq_buf(struct mlx5_ib_cq *cq, int nent) +static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf) { int i; void *cqe; struct mlx5_cqe64 *cqe64; - for (i = 0; i < nent; i++) { - cqe = get_cqe(cq, i); - cqe64 = (cq->buf.cqe_size == 64) ? cqe : cqe + 64; - cqe64->op_own = 0xf1; + for (i = 0; i < buf->nent; i++) { + cqe = get_cqe_from_buf(buf, i, buf->cqe_size); + cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; } } @@ -610,7 +705,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, if (err) goto err_db; - init_cq_buf(cq, entries); + init_cq_buf(cq, &cq->buf); *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages; *cqb = mlx5_vzalloc(*inlen); @@ -620,7 +715,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, } mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); - (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - PAGE_SHIFT; + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; *index = dev->mdev.priv.uuari.uars[0].index; return 0; @@ -653,8 +748,11 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, int eqn; int err; + if (entries < 0) + return ERR_PTR(-EINVAL); + entries = roundup_pow_of_two(entries + 1); - if (entries < 1 || entries > dev->mdev.caps.max_cqes) + if (entries > dev->mdev.caps.max_cqes) return ERR_PTR(-EINVAL); cq = kzalloc(sizeof(*cq), GFP_KERNEL); @@ -747,17 +845,9 @@ int mlx5_ib_destroy_cq(struct ib_cq *cq) return 0; } -static int is_equal_rsn(struct mlx5_cqe64 *cqe64, struct mlx5_ib_srq *srq, - u32 rsn) +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) { - u32 lrsn; - - if (srq) - lrsn = be32_to_cpu(cqe64->srqn) & 0xffffff; - else - lrsn = be32_to_cpu(cqe64->sop_drop_qpn) & 0xffffff; - - return rsn == lrsn; + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); } void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) @@ -787,8 +877,8 @@ void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; - if (is_equal_rsn(cqe64, srq, rsn)) { - if (srq) + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); ++nfreed; } else if (nfreed) { @@ -823,12 +913,266 @@ void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) { - return -ENOSYS; + struct mlx5_modify_cq_mbox_in *in; + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + int err; + u32 fsel; + + if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_CQ_MODER)) + return -ENOSYS; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->cqn = cpu_to_be32(mcq->mcq.cqn); + fsel = (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); + in->ctx.cq_period = cpu_to_be16(cq_period); + in->ctx.cq_max_count = cpu_to_be16(cq_count); + in->field_select = cpu_to_be32(fsel); + err = mlx5_core_modify_cq(&dev->mdev, &mcq->mcq, in, sizeof(*in)); + kfree(in); + + if (err) + mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); + + return err; +} + +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, struct ib_udata *udata, int *npas, + int *page_shift, int *cqe_size) +{ + struct mlx5_ib_resize_cq ucmd; + struct ib_umem *umem; + int err; + int npages; + struct ib_ucontext *context = cq->buf.umem->context; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; + } + + mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift, + npas, NULL); + + cq->resize_umem = umem; + *cqe_size = ucmd.cqe_size; + + return 0; +} + +static void un_resize_user(struct mlx5_ib_cq *cq) +{ + ib_umem_release(cq->resize_umem); +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size) +{ + int err; + + cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size); + if (err) + goto ex; + + init_cq_buf(cq, cq->resize_buf); + + return 0; + +ex: + kfree(cq->resize_buf); + return err; +} + +static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + u8 sw_own; + + ssize = cq->buf.cqe_size; + dsize = cq->resize_buf->cqe_size; + if (ssize != dsize) { + mlx5_ib_warn(dev, "resize from different cqe size is not supported\n"); + return -EINVAL; + } + + i = cq->mcq.cons_index; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { + dcqe = get_cqe_from_buf(cq->resize_buf, + (i + 1) & (cq->resize_buf->nent), + dsize); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); + memcpy(dcqe, scqe, dsize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + if (scqe == start_cqe) { + pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", + cq->mcq.cqn); + return -ENOMEM; + } + } + ++cq->mcq.cons_index; + return 0; } int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) { - return -ENOSYS; + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_modify_cq_mbox_in *in; + int err; + int npas; + int page_shift; + int inlen; + int uninitialized_var(cqe_size); + unsigned long flags; + + if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { + pr_info("Firmware does not support resize CQ\n"); + return -ENOSYS; + } + + if (entries < 1) + return -EINVAL; + + entries = roundup_pow_of_two(entries + 1); + if (entries > dev->mdev.caps.max_cqes + 1) + return -EINVAL; + + if (entries == ibcq->cqe + 1) + return 0; + + mutex_lock(&cq->resize_mutex); + if (udata) { + err = resize_user(dev, cq, entries, udata, &npas, &page_shift, + &cqe_size); + } else { + cqe_size = 64; + err = resize_kernel(dev, cq, entries, cqe_size); + if (!err) { + npas = cq->resize_buf->buf.npages; + page_shift = cq->resize_buf->buf.page_shift; + } + } + + if (err) + goto ex; + + inlen = sizeof(*in) + npas * sizeof(in->pas[0]); + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto ex_resize; + } + + if (udata) + mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, + in->pas, 0); + else + mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); + + in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + in->ctx.page_offset = 0; + in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); + in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); + in->cqn = cpu_to_be32(cq->mcq.cqn); + + err = mlx5_core_modify_cq(&dev->mdev, &cq->mcq, in, inlen); + if (err) + goto ex_alloc; + + if (udata) { + cq->ibcq.cqe = entries - 1; + ib_umem_release(cq->buf.umem); + cq->buf.umem = cq->resize_umem; + cq->resize_umem = NULL; + } else { + struct mlx5_ib_cq_buf tbuf; + int resized = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (cq->resize_buf) { + err = copy_resize_cqes(cq); + if (!err) { + tbuf = cq->buf; + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + resized = 1; + } + } + cq->ibcq.cqe = entries - 1; + spin_unlock_irqrestore(&cq->lock, flags); + if (resized) + free_cq_buf(dev, &tbuf); + } + mutex_unlock(&cq->resize_mutex); + + mlx5_vfree(in); + return 0; + +ex_alloc: + mlx5_vfree(in); + +ex_resize: + if (udata) + un_resize_user(cq); + else + un_resize_kernel(dev, cq); +ex: + mutex_unlock(&cq->resize_mutex); + return err; } int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c index 256a23344f2..ece028fc47d 100644 --- a/drivers/infiniband/hw/mlx5/doorbell.c +++ b/drivers/infiniband/hw/mlx5/doorbell.c @@ -47,7 +47,6 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, struct mlx5_db *db) { struct mlx5_ib_user_db_page *page; - struct ib_umem_chunk *chunk; int err = 0; mutex_lock(&context->db_page_mutex); @@ -75,8 +74,7 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, list_add(&page->list, &context->db_page_list); found: - chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list); - db->dma = sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK); + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); db->u.user_page = page; ++page->refcnt; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 3f831de9a4d..364d4b6937f 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -46,8 +46,8 @@ #include "mlx5_ib.h" #define DRIVER_NAME "mlx5_ib" -#define DRIVER_VERSION "1.0" -#define DRIVER_RELDATE "June 2013" +#define DRIVER_VERSION "2.2-1" +#define DRIVER_RELDATE "Feb 2014" MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>"); MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); @@ -164,6 +164,7 @@ int mlx5_vector2eqn(struct mlx5_ib_dev *dev, int vector, int *eqn, int *irqn) static int alloc_comp_eqs(struct mlx5_ib_dev *dev) { struct mlx5_eq_table *table = &dev->mdev.priv.eq_table; + char name[MLX5_MAX_EQ_NAME]; struct mlx5_eq *eq, *n; int ncomp_vec; int nent; @@ -180,11 +181,10 @@ static int alloc_comp_eqs(struct mlx5_ib_dev *dev) goto clean; } - snprintf(eq->name, MLX5_MAX_EQ_NAME, "mlx5_comp%d", i); + snprintf(name, MLX5_MAX_EQ_NAME, "mlx5_comp%d", i); err = mlx5_create_map_eq(&dev->mdev, eq, i + MLX5_EQ_VEC_COMP_BASE, nent, 0, - eq->name, - &dev->mdev.priv.uuari.uars[0]); + name, &dev->mdev.priv.uuari.uars[0]); if (err) { kfree(eq); goto clean; @@ -261,8 +261,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | - IB_DEVICE_RC_RNR_NAK_GEN | - IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + IB_DEVICE_RC_RNR_NAK_GEN; flags = dev->mdev.caps.flags; if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; @@ -274,6 +273,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (flags & MLX5_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (flags & MLX5_DEV_CAP_FLAG_SIG_HAND_OVER) { + props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + /* At this stage no support for signature handover */ + props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | + IB_PROT_T10DIF_TYPE_2 | + IB_PROT_T10DIF_TYPE_3; + props->sig_guard_cap = IB_GUARD_T10DIF_CRC | + IB_GUARD_T10DIF_CSUM; + } + if (flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST) + props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; props->vendor_id = be32_to_cpup((__be32 *)(out_mad->data + 36)) & 0xffffff; @@ -301,9 +311,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = (unsigned int)-1; props->local_ca_ack_delay = dev->mdev.caps.local_ca_ack_delay; - props->atomic_cap = dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_ATOMIC ? - IB_ATOMIC_HCA : IB_ATOMIC_NONE; - props->masked_atomic_cap = IB_ATOMIC_HCA; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); props->max_mcast_grp = 1 << dev->mdev.caps.log_max_mcg; props->max_mcast_qp_attach = dev->mdev.caps.max_qp_mcg; @@ -537,34 +546,51 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_alloc_ucontext_req req; + struct mlx5_ib_alloc_ucontext_req_v2 req; struct mlx5_ib_alloc_ucontext_resp resp; struct mlx5_ib_ucontext *context; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; + int gross_uuars; int num_uars; + int ver; int uuarn; int err; int i; + int reqlen; if (!dev->ib_active) return ERR_PTR(-EAGAIN); - err = ib_copy_from_udata(&req, udata, sizeof(req)); + memset(&req, 0, sizeof(req)); + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); + if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + ver = 2; + else + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&req, udata, reqlen); if (err) return ERR_PTR(err); + if (req.flags || req.reserved) + return ERR_PTR(-EINVAL); + if (req.total_num_uuars > MLX5_MAX_UUARS) return ERR_PTR(-ENOMEM); if (req.total_num_uuars == 0) return ERR_PTR(-EINVAL); - req.total_num_uuars = ALIGN(req.total_num_uuars, MLX5_BF_REGS_PER_PAGE); + req.total_num_uuars = ALIGN(req.total_num_uuars, + MLX5_NON_FP_BF_REGS_PER_PAGE); if (req.num_low_latency_uuars > req.total_num_uuars - 1) return ERR_PTR(-EINVAL); - num_uars = req.total_num_uuars / MLX5_BF_REGS_PER_PAGE; + num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; + gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; resp.qp_tab_size = 1 << dev->mdev.caps.log_max_qp; resp.bf_reg_size = dev->mdev.caps.bf_reg_size; resp.cache_line_size = L1_CACHE_BYTES; @@ -586,7 +612,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, goto out_ctx; } - uuari->bitmap = kcalloc(BITS_TO_LONGS(req.total_num_uuars), + uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), sizeof(*uuari->bitmap), GFP_KERNEL); if (!uuari->bitmap) { @@ -596,13 +622,13 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, /* * clear all fast path uuars */ - for (i = 0; i < req.total_num_uuars; i++) { + for (i = 0; i < gross_uuars; i++) { uuarn = i & 3; if (uuarn == 2 || uuarn == 3) set_bit(i, uuari->bitmap); } - uuari->count = kcalloc(req.total_num_uuars, sizeof(*uuari->count), GFP_KERNEL); + uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); if (!uuari->count) { err = -ENOMEM; goto out_bitmap; @@ -624,6 +650,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (err) goto out_uars; + uuari->ver = ver; uuari->num_low_latency_uuars = req.num_low_latency_uuars; uuari->uars = uars; uuari->num_uars = num_uars; @@ -746,7 +773,8 @@ static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); seg->start_addr = 0; - err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in)); + err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in), + NULL, NULL, NULL); if (err) { mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); goto err_in; @@ -1006,6 +1034,11 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, ibev.device = &ibdev->ib_dev; ibev.element.port_num = port; + if (port < 1 || port > ibdev->num_ports) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); + return; + } + if (ibdev->ib_active) ib_dispatch_event(&ibev); } @@ -1401,12 +1434,15 @@ static int init_one(struct pci_dev *pdev, dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; + dev->ib_dev.destroy_mr = mlx5_ib_destroy_mr; dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; dev->ib_dev.process_mad = mlx5_ib_process_mad; + dev->ib_dev.create_mr = mlx5_ib_create_mr; dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr; dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list; dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; + dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; if (mdev->caps.flags & MLX5_DEV_CAP_FLAG_XRC) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 3a5322870b9..8499aec94db 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -44,16 +44,17 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, int *ncont, int *order) { - struct ib_umem_chunk *chunk; unsigned long tmp; unsigned long m; - int i, j, k; + int i, k; u64 base = 0; int p = 0; int skip; int mask; u64 len; u64 pfn; + struct scatterlist *sg; + int entry; addr = addr >> PAGE_SHIFT; tmp = (unsigned long)addr; @@ -61,32 +62,31 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, skip = 1 << m; mask = skip - 1; i = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - for (j = 0; j < chunk->nmap; j++) { - len = sg_dma_len(&chunk->page_list[j]) >> PAGE_SHIFT; - pfn = sg_dma_address(&chunk->page_list[j]) >> PAGE_SHIFT; - for (k = 0; k < len; k++) { - if (!(i & mask)) { - tmp = (unsigned long)pfn; - m = min(m, find_first_bit(&tmp, sizeof(tmp))); + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> PAGE_SHIFT; + pfn = sg_dma_address(sg) >> PAGE_SHIFT; + for (k = 0; k < len; k++) { + if (!(i & mask)) { + tmp = (unsigned long)pfn; + m = min(m, find_first_bit(&tmp, sizeof(tmp))); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } else { + if (base + p != pfn) { + tmp = (unsigned long)p; + m = find_first_bit(&tmp, sizeof(tmp)); skip = 1 << m; mask = skip - 1; base = pfn; p = 0; - } else { - if (base + p != pfn) { - tmp = (unsigned long)p; - m = find_first_bit(&tmp, sizeof(tmp)); - skip = 1 << m; - mask = skip - 1; - base = pfn; - p = 0; - } } - p++; - i++; } + p++; + i++; } + } if (i) { m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); @@ -112,32 +112,32 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, { int shift = page_shift - PAGE_SHIFT; int mask = (1 << shift) - 1; - struct ib_umem_chunk *chunk; - int i, j, k; + int i, k; u64 cur = 0; u64 base; int len; + struct scatterlist *sg; + int entry; i = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - for (j = 0; j < chunk->nmap; j++) { - len = sg_dma_len(&chunk->page_list[j]) >> PAGE_SHIFT; - base = sg_dma_address(&chunk->page_list[j]); - for (k = 0; k < len; k++) { - if (!(i & mask)) { - cur = base + (k << PAGE_SHIFT); - if (umr) - cur |= 3; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> PAGE_SHIFT; + base = sg_dma_address(sg); + for (k = 0; k < len; k++) { + if (!(i & mask)) { + cur = base + (k << PAGE_SHIFT); + if (umr) + cur |= 3; - pas[i >> shift] = cpu_to_be64(cur); - mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", - i >> shift, be64_to_cpu(pas[i >> shift])); - } else - mlx5_ib_dbg(dev, "=====> 0x%llx\n", - base + (k << PAGE_SHIFT)); - i++; - } + pas[i >> shift] = cpu_to_be64(cur); + mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", + i >> shift, be64_to_cpu(pas[i >> shift])); + } else + mlx5_ib_dbg(dev, "=====> 0x%llx\n", + base + (k << PAGE_SHIFT)); + i++; } + } } int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 836be915724..f2ccf1a5a29 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -189,12 +189,16 @@ struct mlx5_ib_qp { int create_type; u32 pa_lkey; + + /* Store signature errors */ + bool signature_en; }; struct mlx5_ib_cq_buf { struct mlx5_buf buf; struct ib_umem *umem; int cqe_size; + int nent; }; enum mlx5_ib_qp_flags { @@ -220,7 +224,7 @@ struct mlx5_ib_cq { /* protect resize cq */ struct mutex resize_mutex; - struct mlx5_ib_cq_resize *resize_buf; + struct mlx5_ib_cq_buf *resize_buf; struct ib_umem *resize_umem; int cqe_size; }; @@ -260,8 +264,9 @@ struct mlx5_ib_mr { __be64 *pas; dma_addr_t dma; int npages; - struct completion done; - enum ib_wc_status status; + struct mlx5_ib_dev *dev; + struct mlx5_create_mkey_mbox_out out; + struct mlx5_core_sig_ctx *sig; }; struct mlx5_ib_fast_reg_page_list { @@ -270,6 +275,17 @@ struct mlx5_ib_fast_reg_page_list { dma_addr_t map; }; +struct mlx5_ib_umr_context { + enum ib_wc_status status; + struct completion done; +}; + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->status = -1; + init_completion(&context->done); +} + struct umr_common { struct ib_pd *pd; struct ib_cq *cq; @@ -323,6 +339,7 @@ struct mlx5_cache_ent { struct mlx5_ib_dev *dev; struct work_struct work; struct delayed_work dwork; + int pending; }; struct mlx5_mr_cache { @@ -358,6 +375,8 @@ struct mlx5_ib_dev { spinlock_t mr_lock; struct mlx5_ib_resources devr; struct mlx5_mr_cache cache; + struct timer_list delay_timer; + int fill_delay; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -390,6 +409,11 @@ static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) return container_of(mqp, struct mlx5_ib_qp, mqp); } +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) +{ + return container_of(mmr, struct mlx5_ib_mr, mmr); +} + static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) { return container_of(ibpd, struct mlx5_ib_pd, ibpd); @@ -489,6 +513,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); int mlx5_ib_dereg_mr(struct ib_mr *ibmr); +int mlx5_ib_destroy_mr(struct ib_mr *ibmr); +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr); struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, @@ -524,6 +551,8 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status); static inline void init_query_mad(struct ib_smp *mad) { diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index bd41df95b6f..afa873bd028 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -35,11 +35,16 @@ #include <linux/random.h> #include <linux/debugfs.h> #include <linux/export.h> +#include <linux/delay.h> #include <rdma/ib_umem.h> #include "mlx5_ib.h" enum { - DEF_CACHE_SIZE = 10, + MAX_PENDING_REG_MR = 8, +}; + +enum { + MLX5_UMR_ALIGN = 2048 }; static __be64 *mr_align(__be64 *ptr, int align) @@ -59,15 +64,67 @@ static int order2idx(struct mlx5_ib_dev *dev, int order) return order - cache->ent[0].order; } +static void reg_mr_callback(int status, void *context) +{ + struct mlx5_ib_mr *mr = context; + struct mlx5_ib_dev *dev = mr->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int c = order2idx(dev, mr->order); + struct mlx5_cache_ent *ent = &cache->ent[c]; + u8 key; + unsigned long flags; + struct mlx5_mr_table *table = &dev->mdev.priv.mr_table; + int err; + + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); + if (status) { + mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + if (mr->out.hdr.status) { + mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", + mr->out.hdr.status, + be32_to_cpu(mr->out.hdr.syndrome)); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + spin_lock_irqsave(&dev->mdev.priv.mkey_lock, flags); + key = dev->mdev.priv.mkey_key++; + spin_unlock_irqrestore(&dev->mdev.priv.mkey_lock, flags); + mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + + cache->last_add = jiffies; + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + ent->size++; + spin_unlock_irqrestore(&ent->lock, flags); + + write_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key), + &mr->mmr); + if (err) + pr_err("Error inserting to mr tree. 0x%x\n", -err); + write_unlock_irqrestore(&table->lock, flags); +} + static int add_keys(struct mlx5_ib_dev *dev, int c, int num) { - struct device *ddev = dev->ib_dev.dma_device; struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; struct mlx5_create_mkey_mbox_in *in; struct mlx5_ib_mr *mr; int npages = 1 << ent->order; - int size = sizeof(u64) * npages; int err = 0; int i; @@ -76,87 +133,66 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) return -ENOMEM; for (i = 0; i < num; i++) { + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + break; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { err = -ENOMEM; - goto out; + break; } mr->order = ent->order; mr->umred = 1; - mr->pas = kmalloc(size + 0x3f, GFP_KERNEL); - if (!mr->pas) { - kfree(mr); - err = -ENOMEM; - goto out; - } - mr->dma = dma_map_single(ddev, mr_align(mr->pas, 0x40), size, - DMA_TO_DEVICE); - if (dma_mapping_error(ddev, mr->dma)) { - kfree(mr->pas); - kfree(mr); - err = -ENOMEM; - goto out; - } - + mr->dev = dev; in->seg.status = 1 << 6; in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; in->seg.log2_page_size = 12; + spin_lock_irq(&ent->lock); + ent->pending++; + spin_unlock_irq(&ent->lock); err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, - sizeof(*in)); + sizeof(*in), reg_mr_callback, + mr, &mr->out); if (err) { mlx5_ib_warn(dev, "create mkey failed %d\n", err); - dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); - kfree(mr->pas); kfree(mr); - goto out; + break; } - cache->last_add = jiffies; - - spin_lock(&ent->lock); - list_add_tail(&mr->list, &ent->head); - ent->cur++; - ent->size++; - spin_unlock(&ent->lock); } -out: kfree(in); return err; } static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) { - struct device *ddev = dev->ib_dev.dma_device; struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; struct mlx5_ib_mr *mr; - int size; int err; int i; for (i = 0; i < num; i++) { - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); return; } mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; ent->size--; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); - if (err) { + if (err) mlx5_ib_warn(dev, "failed destroy mkey\n"); - } else { - size = ALIGN(sizeof(u64) * (1 << mr->order), 0x40); - dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); - kfree(mr->pas); + else kfree(mr); - } } } @@ -183,9 +219,13 @@ static ssize_t size_write(struct file *filp, const char __user *buf, return -EINVAL; if (var > ent->size) { - err = add_keys(dev, c, var - ent->size); - if (err) - return err; + do { + err = add_keys(dev, c, var - ent->size); + if (err && err != -EAGAIN) + return err; + + usleep_range(3000, 5000); + } while (err); } else if (var < ent->size) { remove_keys(dev, c, ent->size - var); } @@ -301,23 +341,37 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) struct mlx5_ib_dev *dev = ent->dev; struct mlx5_mr_cache *cache = &dev->cache; int i = order2idx(dev, ent->order); + int err; if (cache->stopped) return; ent = &dev->cache.ent[i]; - if (ent->cur < 2 * ent->limit) { - add_keys(dev, i, 1); - if (ent->cur < 2 * ent->limit) - queue_work(cache->wq, &ent->work); + if (ent->cur < 2 * ent->limit && !dev->fill_delay) { + err = add_keys(dev, i, 1); + if (ent->cur < 2 * ent->limit) { + if (err == -EAGAIN) { + mlx5_ib_dbg(dev, "returned eagain, order %d\n", + i + 2); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3)); + } else if (err) { + mlx5_ib_warn(dev, "command failed order %d, err %d\n", + i + 2, err); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); + } else { + queue_work(cache->wq, &ent->work); + } + } } else if (ent->cur > 2 * ent->limit) { if (!someone_adding(cache) && - time_after(jiffies, cache->last_add + 60 * HZ)) { + time_after(jiffies, cache->last_add + 300 * HZ)) { remove_keys(dev, i, 1); if (ent->cur > ent->limit) queue_work(cache->wq, &ent->work); } else { - queue_delayed_work(cache->wq, &ent->dwork, 60 * HZ); + queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); } } } @@ -357,18 +411,18 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); if (!list_empty(&ent->head)) { mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); if (ent->cur < ent->limit) queue_work(cache->wq, &ent->work); break; } - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); queue_work(cache->wq, &ent->work); @@ -395,12 +449,12 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) return; } ent = &cache->ent[c]; - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); list_add_tail(&mr->list, &ent->head); ent->cur++; if (ent->cur > 2 * ent->limit) shrink = 1; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); if (shrink) queue_work(cache->wq, &ent->work); @@ -408,33 +462,28 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) static void clean_keys(struct mlx5_ib_dev *dev, int c) { - struct device *ddev = dev->ib_dev.dma_device; struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; struct mlx5_ib_mr *mr; - int size; int err; + cancel_delayed_work(&ent->dwork); while (1) { - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); return; } mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; ent->size--; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); - if (err) { + if (err) mlx5_ib_warn(dev, "failed destroy mkey\n"); - } else { - size = ALIGN(sizeof(u64) * (1 << mr->order), 0x40); - dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); - kfree(mr->pas); + else kfree(mr); - } } } @@ -490,12 +539,18 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) debugfs_remove_recursive(dev->cache.root); } +static void delay_time_func(unsigned long ctx) +{ + struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; + + dev->fill_delay = 0; +} + int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; int limit; - int size; int err; int i; @@ -505,6 +560,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) return -ENOMEM; } + setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { INIT_LIST_HEAD(&cache->ent[i].head); spin_lock_init(&cache->ent[i].lock); @@ -515,13 +571,11 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->order = i + 2; ent->dev = dev; - if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) { - size = dev->mdev.profile->mr_cache[i].size; + if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) limit = dev->mdev.profile->mr_cache[i].limit; - } else { - size = DEF_CACHE_SIZE; + else limit = 0; - } + INIT_WORK(&ent->work, cache_work_func); INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); ent->limit = limit; @@ -540,13 +594,16 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) int i; dev->cache.stopped = 1; - destroy_workqueue(dev->cache.wq); + flush_workqueue(dev->cache.wq); mlx5_mr_cache_debugfs_cleanup(dev); for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) clean_keys(dev, i); + destroy_workqueue(dev->cache.wq); + del_timer_sync(&dev->delay_timer); + return 0; } @@ -575,7 +632,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); seg->start_addr = 0; - err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in)); + err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + NULL); if (err) goto err_in; @@ -650,7 +708,7 @@ static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) { - struct mlx5_ib_mr *mr; + struct mlx5_ib_umr_context *context; struct ib_wc wc; int err; @@ -663,9 +721,9 @@ void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) if (err == 0) break; - mr = (struct mlx5_ib_mr *)(unsigned long)wc.wr_id; - mr->status = wc.status; - complete(&mr->done); + context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id; + context->status = wc.status; + complete(&context->done); } ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); } @@ -675,21 +733,24 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, int page_shift, int order, int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct device *ddev = dev->ib_dev.dma_device; struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; struct ib_send_wr wr, *bad; struct mlx5_ib_mr *mr; struct ib_sge sg; - int err; + int size = sizeof(u64) * npages; + int err = 0; int i; - for (i = 0; i < 10; i++) { + for (i = 0; i < 1; i++) { mr = alloc_cached_mr(dev, order); if (mr) break; err = add_keys(dev, order2idx(dev, order), 1); - if (err) { - mlx5_ib_warn(dev, "add_keys failed\n"); + if (err && err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); break; } } @@ -697,38 +758,58 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, if (!mr) return ERR_PTR(-EAGAIN); - mlx5_ib_populate_pas(dev, umem, page_shift, mr_align(mr->pas, 0x40), 1); + mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); + if (!mr->pas) { + err = -ENOMEM; + goto free_mr; + } + + mlx5_ib_populate_pas(dev, umem, page_shift, + mr_align(mr->pas, MLX5_UMR_ALIGN), 1); + + mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size, + DMA_TO_DEVICE); + if (dma_mapping_error(ddev, mr->dma)) { + err = -ENOMEM; + goto free_pas; + } memset(&wr, 0, sizeof(wr)); - wr.wr_id = (u64)(unsigned long)mr; + wr.wr_id = (u64)(unsigned long)&umr_context; prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags); - /* We serialize polls so one process does not kidnap another's - * completion. This is not a problem since wr is completed in - * around 1 usec - */ + mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); - init_completion(&mr->done); err = ib_post_send(umrc->qp, &wr, &bad); if (err) { mlx5_ib_warn(dev, "post send failed, err %d\n", err); - up(&umrc->sem); - goto error; + goto unmap_dma; + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed\n"); + err = -EFAULT; + } } - wait_for_completion(&mr->done); + + mr->mmr.iova = virt_addr; + mr->mmr.size = len; + mr->mmr.pd = to_mpd(pd)->pdn; + +unmap_dma: up(&umrc->sem); + dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); - if (mr->status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "reg umr failed\n"); - err = -EFAULT; - goto error; +free_pas: + kfree(mr->pas); + +free_mr: + if (err) { + free_cached_mr(dev, mr); + return ERR_PTR(err); } return mr; - -error: - free_cached_mr(dev, mr); - return ERR_PTR(err); } static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, @@ -763,8 +844,10 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); in->seg.log2_page_size = page_shift; in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); - err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen); + in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, + 1 << page_shift)); + err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen, NULL, + NULL, NULL); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); goto err_2; @@ -855,24 +938,26 @@ error: static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; struct ib_send_wr wr, *bad; int err; memset(&wr, 0, sizeof(wr)); - wr.wr_id = (u64)(unsigned long)mr; + wr.wr_id = (u64)(unsigned long)&umr_context; prep_umr_unreg_wqe(dev, &wr, mr->mmr.key); + mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); - init_completion(&mr->done); err = ib_post_send(umrc->qp, &wr, &bad); if (err) { up(&umrc->sem); mlx5_ib_dbg(dev, "err %d\n", err); goto error; + } else { + wait_for_completion(&umr_context.done); + up(&umrc->sem); } - wait_for_completion(&mr->done); - up(&umrc->sem); - if (mr->status != IB_WC_SUCCESS) { + if (umr_context.status != IB_WC_SUCCESS) { mlx5_ib_warn(dev, "unreg umr failed\n"); err = -EFAULT; goto error; @@ -921,6 +1006,122 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) return 0; } +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int access_mode, err; + int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = 1 << 6; /* free */ + in->seg.xlt_oct_size = cpu_to_be32(ndescs); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + access_mode = MLX5_ACCESS_MODE_MTT; + + if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) { + u32 psv_index[2]; + + in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) | + MLX5_MKEY_BSF_EN); + in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) { + err = -ENOMEM; + goto err_free_in; + } + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(&dev->mdev, to_mpd(pd)->pdn, + 2, psv_index); + if (err) + goto err_free_sig; + + access_mode = MLX5_ACCESS_MODE_KLM; + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + } + + in->seg.flags = MLX5_PERM_UMR_EN | access_mode; + err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) + goto err_destroy_psv; + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + kfree(in); + + return &mr->ibmr; + +err_destroy_psv: + if (mr->sig) { + if (mlx5_core_destroy_psv(&dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(&dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + } +err_free_sig: + kfree(mr->sig); +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int err; + + if (mr->sig) { + if (mlx5_core_destroy_psv(&dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(&dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + kfree(mr->sig); + } + + err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + + kfree(mr); + + return err; +} + struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) { @@ -948,7 +1149,8 @@ struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, * TBD not needed - issue 197292 */ in->seg.log2_page_size = PAGE_SHIFT; - err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in)); + err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in), NULL, + NULL, NULL); kfree(in); if (err) goto err_free; @@ -1005,3 +1207,44 @@ void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) kfree(mfrpl->ibfrpl.page_list); kfree(mfrpl); } + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + int ret = 0; + + if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { + pr_err("Invalid status check mask\n"); + ret = -EINVAL; + goto done; + } + + mr_status->fail_status = 0; + if (check_mask & IB_MR_CHECK_SIG_STATUS) { + if (!mmr->sig) { + ret = -EINVAL; + pr_err("signature status check requested on a non-signature enabled MR\n"); + goto done; + } + + mmr->sig->sig_status_checked = true; + if (!mmr->sig->sig_err_exists) + goto done; + + if (ibmr->lkey == mmr->sig->err_item.key) + memcpy(&mr_status->sig_err, &mmr->sig->err_item, + sizeof(mr_status->sig_err)); + else { + mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; + mr_status->sig_err.sig_err_offset = 0; + mr_status->sig_err.key = mmr->sig->err_item.key; + } + + mmr->sig->sig_err_exists = false; + mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; + } + +done: + return ret; +} diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 045f8cdbd30..bbbcf389272 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -203,7 +203,7 @@ static int sq_overhead(enum ib_qp_type qp_type) switch (qp_type) { case IB_QPT_XRC_INI: - size = sizeof(struct mlx5_wqe_xrc_seg); + size += sizeof(struct mlx5_wqe_xrc_seg); /* fall through */ case IB_QPT_RC: size += sizeof(struct mlx5_wqe_ctrl_seg) + @@ -211,20 +211,25 @@ static int sq_overhead(enum ib_qp_type qp_type) sizeof(struct mlx5_wqe_raddr_seg); break; + case IB_QPT_XRC_TGT: + return 0; + case IB_QPT_UC: - size = sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); break; case IB_QPT_UD: case IB_QPT_SMI: case IB_QPT_GSI: - size = sizeof(struct mlx5_wqe_ctrl_seg) + + size += sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_datagram_seg); break; case MLX5_IB_QPT_REG_UMR: - size = sizeof(struct mlx5_wqe_ctrl_seg) + + size += sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_umr_ctrl_seg) + sizeof(struct mlx5_mkey_seg); break; @@ -251,8 +256,11 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr) } size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); - - return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && + ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) + return MLX5_SIG_WQE_SIZE; + else + return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); } static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, @@ -270,7 +278,8 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, return wqe_size; if (wqe_size > dev->mdev.caps.max_sq_desc_sz) { - mlx5_ib_dbg(dev, "\n"); + mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", + wqe_size, dev->mdev.caps.max_sq_desc_sz); return -EINVAL; } @@ -278,11 +287,20 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, sizeof(struct mlx5_wqe_inline_seg); attr->cap.max_inline_data = qp->max_inline_data; + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + qp->signature_en = true; + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + if (qp->sq.wqe_cnt > dev->mdev.caps.max_wqes) { + mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n", + qp->sq.wqe_cnt, dev->mdev.caps.max_wqes); + return -ENOMEM; + } qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); qp->sq.max_gs = attr->cap.max_send_sge; - qp->sq.max_post = 1 << ilog2(wq_size / wqe_size); + qp->sq.max_post = wq_size / wqe_size; + attr->cap.max_send_wr = qp->sq.max_post; return wq_size; } @@ -330,14 +348,57 @@ static int qp_has_rq(struct ib_qp_init_attr *attr) return 1; } +static int first_med_uuar(void) +{ + return 1; +} + +static int next_uuar(int n) +{ + n++; + + while (((n % 4) & 2)) + n++; + + return n; +} + +static int num_med_uuar(struct mlx5_uuar_info *uuari) +{ + int n; + + n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE - + uuari->num_low_latency_uuars - 1; + + return n >= 0 ? n : 0; +} + +static int max_uuari(struct mlx5_uuar_info *uuari) +{ + return uuari->num_uars * 4; +} + +static int first_hi_uuar(struct mlx5_uuar_info *uuari) +{ + int med; + int i; + int t; + + med = num_med_uuar(uuari); + for (t = 0, i = first_med_uuar();; i = next_uuar(i)) { + t++; + if (t == med) + return next_uuar(i); + } + + return 0; +} + static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) { - int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; - int start_uuar; int i; - start_uuar = nuuars - uuari->num_low_latency_uuars; - for (i = start_uuar; i < nuuars; i++) { + for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) { if (!test_bit(i, uuari->bitmap)) { set_bit(i, uuari->bitmap); uuari->count[i]++; @@ -350,19 +411,10 @@ static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari) { - int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; - int minidx = 1; - int uuarn; - int end; + int minidx = first_med_uuar(); int i; - end = nuuars - uuari->num_low_latency_uuars; - - for (i = 1; i < end; i++) { - uuarn = i & 3; - if (uuarn == 2 || uuarn == 3) - continue; - + for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) { if (uuari->count[i] < uuari->count[minidx]) minidx = i; } @@ -384,11 +436,17 @@ static int alloc_uuar(struct mlx5_uuar_info *uuari, break; case MLX5_IB_LATENCY_CLASS_MEDIUM: - uuarn = alloc_med_class_uuar(uuari); + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_med_class_uuar(uuari); break; case MLX5_IB_LATENCY_CLASS_HIGH: - uuarn = alloc_high_class_uuar(uuari); + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_high_class_uuar(uuari); break; case MLX5_IB_LATENCY_CLASS_FAST_PATH: @@ -479,12 +537,12 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, { struct mlx5_ib_ucontext *context; struct mlx5_ib_create_qp ucmd; - int page_shift; + int page_shift = 0; int uar_index; int npages; - u32 offset; + u32 offset = 0; int uuarn; - int ncont; + int ncont = 0; int err; err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); @@ -500,38 +558,53 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); if (uuarn < 0) { mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); - mlx5_ib_dbg(dev, "reverting to high latency\n"); - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + mlx5_ib_dbg(dev, "reverting to medium latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); if (uuarn < 0) { - mlx5_ib_dbg(dev, "uuar allocation failed\n"); - return uuarn; + mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to high latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "uuar allocation failed\n"); + return uuarn; + } } } uar_index = uuarn_to_uar_index(&context->uuari, uuarn); mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); + qp->rq.offset = 0; + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + err = set_user_buf_size(dev, qp, &ucmd); if (err) goto err_uuar; - qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, - qp->buf_size, 0, 0); - if (IS_ERR(qp->umem)) { - mlx5_ib_dbg(dev, "umem_get failed\n"); - err = PTR_ERR(qp->umem); - goto err_uuar; + if (ucmd.buf_addr && qp->buf_size) { + qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + qp->buf_size, 0, 0); + if (IS_ERR(qp->umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + err = PTR_ERR(qp->umem); + goto err_uuar; + } + } else { + qp->umem = NULL; } - mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift, - &ncont, NULL); - err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset); - if (err) { - mlx5_ib_warn(dev, "bad offset\n"); - goto err_umem; + if (qp->umem) { + mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", + ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset); } - mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", - ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset); *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; *in = mlx5_vzalloc(*inlen); @@ -539,9 +612,10 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = -ENOMEM; goto err_umem; } - mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); + if (qp->umem) + mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); (*in)->ctx.log_pg_sz_remote_qpn = - cpu_to_be32((page_shift - PAGE_SHIFT) << 24); + cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); (*in)->ctx.params2 = cpu_to_be32(offset << 6); (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); @@ -570,7 +644,8 @@ err_free: mlx5_vfree(*in); err_umem: - ib_umem_release(qp->umem); + if (qp->umem) + ib_umem_release(qp->umem); err_uuar: free_uuar(&context->uuari, uuarn); @@ -583,7 +658,8 @@ static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) context = to_mucontext(pd->uobject->context); mlx5_ib_db_unmap_user(context, &qp->db); - ib_umem_release(qp->umem); + if (qp->umem) + ib_umem_release(qp->umem); free_uuar(&context->uuari, qp->uuarn); } @@ -599,8 +675,8 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, int err; uuari = &dev->mdev.priv.uuari; - if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) - qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + return -EINVAL; if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) lc = MLX5_IB_LATENCY_CLASS_FAST_PATH; @@ -638,7 +714,8 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, goto err_buf; } (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); - (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - PAGE_SHIFT) << 24); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); /* Set "fast registration enabled" for all kernel QPs */ (*in)->ctx.params1 |= cpu_to_be32(1 << 11); (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); @@ -734,6 +811,15 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) { + mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); + return -EINVAL; + } else { + qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; + } + } + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; @@ -805,6 +891,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (qp->wq_sig) in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG); + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { int rcqe_sz; int scqe_sz; @@ -1280,6 +1369,11 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_Q_KEY, [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, }, }, [MLX5_QP_STATE_RTR] = { @@ -1302,9 +1396,11 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RNR_TIMEOUT | - MLX5_QP_OPTPAR_PM_STATE, + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | - MLX5_QP_OPTPAR_PM_STATE, + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | MLX5_QP_OPTPAR_SRQN | MLX5_QP_OPTPAR_CQN_RCV, @@ -1314,6 +1410,11 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q [MLX5_QP_STATE_RTS] = { [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, }, }, }; @@ -1530,7 +1631,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); mlx5_st = to_mlx5_st(ibqp->qp_type); - if (mlx5_cur < 0 || mlx5_new < 0 || mlx5_st < 0) + if (mlx5_st < 0) goto out; optpar = ib_mask_to_mlx5_opt(attr_mask); @@ -1593,7 +1694,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && - !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) + !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_UNSPECIFIED)) goto out; if ((attr_mask & IB_QP_PORT) && @@ -1651,29 +1753,6 @@ static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, rseg->reserved = 0; } -static void set_atomic_seg(struct mlx5_wqe_atomic_seg *aseg, struct ib_send_wr *wr) -{ - if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { - aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); - aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); - } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) { - aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); - aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add_mask); - } else { - aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); - aseg->compare = 0; - } -} - -static void set_masked_atomic_seg(struct mlx5_wqe_masked_atomic_seg *aseg, - struct ib_send_wr *wr) -{ - aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); - aseg->swap_add_mask = cpu_to_be64(wr->wr.atomic.swap_mask); - aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); - aseg->compare_mask = cpu_to_be64(wr->wr.atomic.compare_add_mask); -} - static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, struct ib_send_wr *wr) { @@ -1714,6 +1793,27 @@ static __be64 frwr_mkey_mask(void) return cpu_to_be64(result); } +static __be64 sig_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_SIGERR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE | + MLX5_MKEY_MASK_BSF_EN; + + return cpu_to_be64(result); +} + static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, struct ib_send_wr *wr, int li) { @@ -1747,6 +1847,7 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, MLX5_MKEY_MASK_PD | MLX5_MKEY_MASK_LR | MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_KEY | MLX5_MKEY_MASK_RR | MLX5_MKEY_MASK_RW | MLX5_MKEY_MASK_A | @@ -1768,7 +1869,7 @@ static u8 get_umr_flags(int acc) (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | - MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; + MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; } static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, @@ -1780,7 +1881,8 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, return; } - seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags); + seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) | + MLX5_ACCESS_MODE_MTT; *writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE); seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00); seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); @@ -1803,7 +1905,8 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); seg->len = cpu_to_be64(wr->wr.fast_reg.length); seg->log2_page_size = wr->wr.fast_reg.page_shift; - seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(wr->wr.fast_reg.rkey)); } static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, @@ -1895,6 +1998,342 @@ static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, return 0; } +static u16 prot_field_size(enum ib_signature_type type) +{ + switch (type) { + case IB_SIG_TYPE_T10_DIF: + return MLX5_DIF_SIZE; + default: + return 0; + } +} + +static u8 bs_selector(int block_size) +{ + switch (block_size) { + case 512: return 0x1; + case 520: return 0x2; + case 4096: return 0x3; + case 4160: return 0x4; + case 1073741824: return 0x5; + default: return 0; + } +} + +static int format_selector(struct ib_sig_attrs *attr, + struct ib_sig_domain *domain, + int *selector) +{ + +#define FORMAT_DIF_NONE 0 +#define FORMAT_DIF_CRC_INC 8 +#define FORMAT_DIF_CRC_NO_INC 12 +#define FORMAT_DIF_CSUM_INC 13 +#define FORMAT_DIF_CSUM_NO_INC 14 + + switch (domain->sig.dif.type) { + case IB_T10DIF_NONE: + /* No DIF */ + *selector = FORMAT_DIF_NONE; + break; + case IB_T10DIF_TYPE1: /* Fall through */ + case IB_T10DIF_TYPE2: + switch (domain->sig.dif.bg_type) { + case IB_T10DIF_CRC: + *selector = FORMAT_DIF_CRC_INC; + break; + case IB_T10DIF_CSUM: + *selector = FORMAT_DIF_CSUM_INC; + break; + default: + return 1; + } + break; + case IB_T10DIF_TYPE3: + switch (domain->sig.dif.bg_type) { + case IB_T10DIF_CRC: + *selector = domain->sig.dif.type3_inc_reftag ? + FORMAT_DIF_CRC_INC : + FORMAT_DIF_CRC_NO_INC; + break; + case IB_T10DIF_CSUM: + *selector = domain->sig.dif.type3_inc_reftag ? + FORMAT_DIF_CSUM_INC : + FORMAT_DIF_CSUM_NO_INC; + break; + default: + return 1; + } + break; + default: + return 1; + } + + return 0; +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_bsf *bsf, u32 data_size) +{ + struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; + struct mlx5_bsf_basic *basic = &bsf->basic; + struct ib_sig_domain *mem = &sig_attrs->mem; + struct ib_sig_domain *wire = &sig_attrs->wire; + int ret, selector; + + memset(bsf, 0, sizeof(*bsf)); + switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_T10_DIF: + if (sig_attrs->wire.sig_type != IB_SIG_TYPE_T10_DIF) + return -EINVAL; + + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && + mem->sig.dif.type == wire->sig.dif.type) { + /* Same block structure */ + basic->bsf_size_sbs = 1 << 4; + if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) + basic->wire.copy_byte_mask |= 0xc0; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= 0x30; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= 0x0f; + } else + basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); + + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->raw_data_size = cpu_to_be32(data_size); + + ret = format_selector(sig_attrs, mem, &selector); + if (ret) + return -EINVAL; + basic->m_bfs_psv = cpu_to_be32(selector << 24 | + msig->psv_memory.psv_idx); + + ret = format_selector(sig_attrs, wire, &selector); + if (ret) + return -EINVAL; + basic->w_bfs_psv = cpu_to_be32(selector << 24 | + msig->psv_wire.psv_idx); + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int set_sig_data_segment(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct ib_sig_attrs *sig_attrs = wr->wr.sig_handover.sig_attrs; + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + struct mlx5_bsf *bsf; + u32 data_len = wr->sg_list->length; + u32 data_key = wr->sg_list->lkey; + u64 data_va = wr->sg_list->addr; + int ret; + int wqe_size; + + if (!wr->wr.sig_handover.prot || + (data_key == wr->wr.sig_handover.prot->lkey && + data_va == wr->wr.sig_handover.prot->addr && + data_len == wr->wr.sig_handover.prot->length)) { + /** + * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. + * So need construct: + * ------------------ + * | data_klm | + * ------------------ + * | BSF | + * ------------------ + **/ + struct mlx5_klm *data_klm = *seg; + + data_klm->bcount = cpu_to_be32(data_len); + data_klm->key = cpu_to_be32(data_key); + data_klm->va = cpu_to_be64(data_va); + wqe_size = ALIGN(sizeof(*data_klm), 64); + } else { + /** + * Source domain contains signature information + * So need construct a strided block format: + * --------------------------- + * | stride_block_ctrl | + * --------------------------- + * | data_klm | + * --------------------------- + * | prot_klm | + * --------------------------- + * | BSF | + * --------------------------- + **/ + struct mlx5_stride_block_ctrl_seg *sblock_ctrl; + struct mlx5_stride_block_entry *data_sentry; + struct mlx5_stride_block_entry *prot_sentry; + u32 prot_key = wr->wr.sig_handover.prot->lkey; + u64 prot_va = wr->wr.sig_handover.prot->addr; + u16 block_size = sig_attrs->mem.sig.dif.pi_interval; + int prot_size; + + sblock_ctrl = *seg; + data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); + prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + + prot_size = prot_field_size(sig_attrs->mem.sig_type); + if (!prot_size) { + pr_err("Bad block size given: %u\n", block_size); + return -EINVAL; + } + sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + + prot_size); + sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); + sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); + sblock_ctrl->num_entries = cpu_to_be16(2); + + data_sentry->bcount = cpu_to_be16(block_size); + data_sentry->key = cpu_to_be32(data_key); + data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + + prot_sentry->bcount = cpu_to_be16(prot_size); + prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); + + wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + + sizeof(*prot_sentry), 64); + } + + *seg += wqe_size; + *size += wqe_size / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + bsf = *seg; + ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); + if (ret) + return -EINVAL; + + *seg += sizeof(*bsf); + *size += sizeof(*bsf) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, + struct ib_send_wr *wr, u32 nelements, + u32 length, u32 pdn) +{ + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + u32 sig_key = sig_mr->rkey; + u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(wr->wr.sig_handover.access_flags) | + MLX5_ACCESS_MODE_KLM; + seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | + MLX5_MKEY_BSF_EN | pdn); + seg->len = cpu_to_be64(length); + seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements))); + seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, u32 nelements) +{ + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; + umr->klm_octowords = get_klm_octo(nelements); + umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); + umr->mkey_mask = sig_mkey_mask(); +} + + +static int set_sig_umr_wr(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct mlx5_ib_mr *sig_mr = to_mmr(wr->wr.sig_handover.sig_mr); + u32 pdn = get_pd(qp)->pdn; + u32 klm_oct_size; + int region_len, ret; + + if (unlikely(wr->num_sge != 1) || + unlikely(wr->wr.sig_handover.access_flags & + IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = wr->sg_list->length; + if (wr->wr.sig_handover.prot && + (wr->wr.sig_handover.prot->lkey != wr->sg_list->lkey || + wr->wr.sig_handover.prot->addr != wr->sg_list->addr || + wr->wr.sig_handover.prot->length != wr->sg_list->length)) + region_len += wr->wr.sig_handover.prot->length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + klm_oct_size = wr->wr.sig_handover.prot ? 3 : 1; + + set_sig_umr_segment(*seg, wr, klm_oct_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + ret = set_sig_data_segment(wr, qp, seg, size); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} + +static int set_psv_wr(struct ib_sig_domain *domain, + u32 psv_idx, void **seg, int *size) +{ + struct mlx5_seg_set_psv *psv_seg = *seg; + + memset(psv_seg, 0, sizeof(*psv_seg)); + psv_seg->psv_num = cpu_to_be32(psv_idx); + switch (domain->sig_type) { + case IB_SIG_TYPE_T10_DIF: + psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | + domain->sig.dif.app_tag); + psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + break; + + default: + pr_err("Bad signature type given.\n"); + return 1; + } + + return 0; +} + static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size, struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp) { @@ -1916,6 +2355,10 @@ static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size, if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); if (!li) { + if (unlikely(wr->wr.fast_reg.page_list_len > + wr->wr.fast_reg.page_list->max_page_list_len)) + return -ENOMEM; + set_frwr_pages(*seg, wr, mdev, pd, writ); *seg += sizeof(struct mlx5_wqe_data_seg); *size += (sizeof(struct mlx5_wqe_data_seg) / 16); @@ -1978,6 +2421,59 @@ static u8 get_fence(u8 fence, struct ib_send_wr *wr) } } +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + struct ib_send_wr *wr, int *idx, + int *size, int nreq) +{ + int err = 0; + + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) { + err = -ENOMEM; + return err; + } + + *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + *seg = mlx5_get_send_wqe(qp, *idx); + *ctrl = *seg; + *(uint32_t *)(*seg + 8) = 0; + (*ctrl)->imm = send_ieth(wr); + (*ctrl)->fm_ce_se = qp->sq_signal_bits | + (wr->send_flags & IB_SEND_SIGNALED ? + MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + MLX5_WQE_CTRL_SOLICITED : 0); + + *seg += sizeof(**ctrl); + *size = sizeof(**ctrl) / 16; + + return err; +} + +static void finish_wqe(struct mlx5_ib_qp *qp, + struct mlx5_wqe_ctrl_seg *ctrl, + u8 size, unsigned idx, u64 wr_id, + int nreq, u8 fence, u8 next_fence, + u32 mlx5_opcode) +{ + u8 opmod = 0; + + ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | + mlx5_opcode | ((u32)opmod << 24)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); + ctrl->fm_ce_se |= fence; + qp->fm_cache = next_fence; + if (unlikely(qp->wq_sig)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.wrid[idx] = wr_id; + qp->sq.w_list[idx].opcode = mlx5_opcode; + qp->sq.wqe_head[idx] = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + qp->sq.w_list[idx].next = qp->sq.cur_post; +} + + int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { @@ -1985,13 +2481,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_core_dev *mdev = &dev->mdev; struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_mr *mr; struct mlx5_wqe_data_seg *dpseg; struct mlx5_wqe_xrc_seg *xrc; struct mlx5_bf *bf = qp->bf; int uninitialized_var(size); void *qend = qp->sq.qend; unsigned long flags; - u32 mlx5_opcode; unsigned idx; int err = 0; int inl = 0; @@ -2000,7 +2496,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, int nreq; int i; u8 next_fence = 0; - u8 opmod = 0; u8 fence; spin_lock_irqsave(&qp->sq.lock, flags); @@ -2013,36 +2508,23 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, goto out; } - if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) { + fence = qp->fm_cache; + num_sge = wr->num_sge; + if (unlikely(num_sge > qp->sq.max_gs)) { mlx5_ib_warn(dev, "\n"); err = -ENOMEM; *bad_wr = wr; goto out; } - fence = qp->fm_cache; - num_sge = wr->num_sge; - if (unlikely(num_sge > qp->sq.max_gs)) { + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); + if (err) { mlx5_ib_warn(dev, "\n"); err = -ENOMEM; *bad_wr = wr; goto out; } - idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); - seg = mlx5_get_send_wqe(qp, idx); - ctrl = seg; - *(uint32_t *)(seg + 8) = 0; - ctrl->imm = send_ieth(wr); - ctrl->fm_ce_se = qp->sq_signal_bits | - (wr->send_flags & IB_SEND_SIGNALED ? - MLX5_WQE_CTRL_CQ_UPDATE : 0) | - (wr->send_flags & IB_SEND_SOLICITED ? - MLX5_WQE_CTRL_SOLICITED : 0); - - seg += sizeof(*ctrl); - size = sizeof(*ctrl) / 16; - switch (ibqp->qp_type) { case IB_QPT_XRC_INI: xrc = seg; @@ -2063,28 +2545,11 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: - set_raddr_seg(seg, wr->wr.atomic.remote_addr, - wr->wr.atomic.rkey); - seg += sizeof(struct mlx5_wqe_raddr_seg); - - set_atomic_seg(seg, wr); - seg += sizeof(struct mlx5_wqe_atomic_seg); - - size += (sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_atomic_seg)) / 16; - break; - case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: - set_raddr_seg(seg, wr->wr.atomic.remote_addr, - wr->wr.atomic.rkey); - seg += sizeof(struct mlx5_wqe_raddr_seg); - - set_masked_atomic_seg(seg, wr); - seg += sizeof(struct mlx5_wqe_masked_atomic_seg); - - size += (sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_masked_atomic_seg)) / 16; - break; + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -ENOSYS; + *bad_wr = wr; + goto out; case IB_WR_LOCAL_INV: next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; @@ -2112,6 +2577,73 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, num_sge = 0; break; + case IB_WR_REG_SIG_MR: + qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; + mr = to_mmr(wr->wr.sig_handover.sig_mr); + + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + err = set_sig_umr_wr(wr, qp, &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_UMR); + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + wr->send_flags &= ~IB_SEND_SIGNALED; + wr->send_flags |= IB_SEND_SOLICITED; + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->mem, + mr->sig->psv_memory.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->wire, + mr->sig->psv_wire.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + num_sge = 0; + goto skip_psv; + default: break; } @@ -2192,22 +2724,10 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } } - mlx5_opcode = mlx5_ib_opcode[wr->opcode]; - ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | - mlx5_opcode | - ((u32)opmod << 24)); - ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); - ctrl->fm_ce_se |= get_fence(fence, wr); - qp->fm_cache = next_fence; - if (unlikely(qp->wq_sig)) - ctrl->signature = wq_sig(ctrl); - - qp->sq.wrid[idx] = wr->wr_id; - qp->sq.w_list[idx].opcode = mlx5_opcode; - qp->sq.wqe_head[idx] = qp->sq.head + nreq; - qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); - qp->sq.w_list[idx].next = qp->sq.cur_post; - + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + get_fence(fence, wr), next_fence, + mlx5_ib_opcode[wr->opcode]); +skip_psv: if (0) dump_wqe(qp, idx, size); } @@ -2223,6 +2743,10 @@ out: qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell */ + wmb(); + if (bf->need_lock) spin_lock(&bf->lock); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 84d297afd6a..384af6dec5e 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -35,6 +35,7 @@ #include <linux/mlx5/srq.h> #include <linux/slab.h> #include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> #include "mlx5_ib.h" #include "user.h" @@ -78,16 +79,27 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_create_srq ucmd; + size_t ucmdlen; int err; int npages; int page_shift; int ncont; u32 offset; - if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { mlx5_ib_dbg(dev, "failed copy udata\n"); return -EFAULT; } + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, @@ -123,7 +135,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, goto err_in; } - (*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); return 0; @@ -192,7 +204,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, } srq->wq_sig = !!srq_signature; - (*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; return 0; @@ -295,7 +307,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, mlx5_vfree(in); if (err) { mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err); - goto err_srq; + goto err_usr_kern_srq; } mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn); @@ -316,6 +328,8 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, err_core: mlx5_core_destroy_srq(&dev->mdev, &srq->msrq); + +err_usr_kern_srq: if (pd->uobject) destroy_srq_user(pd, srq); else @@ -388,9 +402,7 @@ int mlx5_ib_destroy_srq(struct ib_srq *srq) mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); ib_umem_release(msrq->umem); } else { - kfree(msrq->wrid); - mlx5_buf_free(&dev->mdev, &msrq->buf); - mlx5_db_free(&dev->mdev, &msrq->db); + destroy_srq_kernel(dev, msrq); } kfree(srq); diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index a886de3e593..d0ba264ac1e 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -62,6 +62,13 @@ struct mlx5_ib_alloc_ucontext_req { __u32 num_low_latency_uuars; }; +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; + __u32 flags; + __u32 reserved; +}; + struct mlx5_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 bf_reg_size; @@ -84,6 +91,7 @@ struct mlx5_ib_create_cq { __u64 buf_addr; __u64 db_addr; __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ }; struct mlx5_ib_create_cq_resp { @@ -93,12 +101,16 @@ struct mlx5_ib_create_cq_resp { struct mlx5_ib_resize_cq { __u64 buf_addr; + __u16 cqe_size; + __u16 reserved0; + __u32 reserved1; }; struct mlx5_ib_create_srq { __u64 buf_addr; __u64 db_addr; __u32 flags; + __u32 reserved; /* explicit padding (optional on i386) */ }; struct mlx5_ib_create_srq_resp { diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c index 7c9d35f39d7..69020173899 100644 --- a/drivers/infiniband/hw/mthca/mthca_eq.c +++ b/drivers/infiniband/hw/mthca/mthca_eq.c @@ -357,7 +357,7 @@ static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq) mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n", eqe->type, eqe->subtype, eq->eqn); break; - }; + } set_eqe_hw(eqe); ++eq->cons_index; diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 87897b95666..ded76c101dd 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -858,13 +858,9 @@ static int mthca_enable_msi_x(struct mthca_dev *mdev) entries[1].entry = 1; entries[2].entry = 2; - err = pci_enable_msix(mdev->pdev, entries, ARRAY_SIZE(entries)); - if (err) { - if (err > 0) - mthca_info(mdev, "Only %d MSI-X vectors available, " - "not using MSI-X\n", err); + err = pci_enable_msix_exact(mdev->pdev, entries, ARRAY_SIZE(entries)); + if (err) return err; - } mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector; mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 5b71d43bd89..415f8e1a54d 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -695,6 +695,7 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries, if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) { mthca_free_cq(to_mdev(ibdev), cq); + err = -EFAULT; goto err_free; } @@ -976,12 +977,12 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { struct mthca_dev *dev = to_mdev(pd->device); - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct mthca_mr *mr; struct mthca_reg_mr ucmd; u64 *pages; int shift, n, len; - int i, j, k; + int i, k, entry; int err = 0; int write_mtt_size; @@ -1009,10 +1010,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } shift = ffs(mr->umem->page_size) - 1; - - n = 0; - list_for_each_entry(chunk, &mr->umem->chunk_list, list) - n += chunk->nents; + n = mr->umem->nmap; mr->mtt = mthca_alloc_mtt(dev, n); if (IS_ERR(mr->mtt)) { @@ -1030,25 +1028,24 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); - list_for_each_entry(chunk, &mr->umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = sg_dma_address(&chunk->page_list[j]) + - mr->umem->page_size * k; - /* - * Be friendly to write_mtt and pass it chunks - * of appropriate size. - */ - if (i == write_mtt_size) { - err = mthca_write_mtt(dev, mr->mtt, n, pages, i); - if (err) - goto mtt_done; - n += i; - i = 0; - } + for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(sg) + + mr->umem->page_size * k; + /* + * Be friendly to write_mtt and pass it chunks + * of appropriate size. + */ + if (i == write_mtt_size) { + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); + if (err) + goto mtt_done; + n += i; + i = 0; } } + } if (i) err = mthca_write_mtt(dev, mr->mtt, n, pages, i); diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 26a68453610..e354b2f04ad 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -860,7 +860,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_UNSPECIFIED)) { mthca_dbg(dev, "Bad QP transition (transport %d) " "%d->%d with attr 0x%08x\n", qp->transport, cur_state, new_state, diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 429141078ee..3b2a6dc8ea9 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -68,7 +68,6 @@ MODULE_VERSION(DRV_VERSION); int max_mtu = 9000; int interrupt_mod_interval = 0; - /* Interoperability */ int mpa_version = 1; module_param(mpa_version, int, 0644); @@ -112,6 +111,16 @@ static struct pci_device_id nes_pci_table[] = { MODULE_DEVICE_TABLE(pci, nes_pci_table); +/* registered nes netlink callbacks */ +static struct ibnl_client_cbs nes_nl_cb_table[] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; + static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *); static int nes_net_event(struct notifier_block *, unsigned long, void *); static int nes_notifiers_registered; @@ -672,11 +681,25 @@ static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) } nes_notifiers_registered++; + if (ibnl_add_client(RDMA_NL_NES, RDMA_NL_IWPM_NUM_OPS, nes_nl_cb_table)) + printk(KERN_ERR PFX "%s[%u]: Failed to add netlink callback\n", + __func__, __LINE__); + + ret = iwpm_init(RDMA_NL_NES); + if (ret) { + printk(KERN_ERR PFX "%s: port mapper initialization failed\n", + pci_name(pcidev)); + goto bail7; + } + INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status); /* Initialize network devices */ - if ((netdev = nes_netdev_init(nesdev, mmio_regs)) == NULL) + netdev = nes_netdev_init(nesdev, mmio_regs); + if (netdev == NULL) { + ret = -ENOMEM; goto bail7; + } /* Register network device */ ret = register_netdev(netdev); @@ -707,6 +730,7 @@ static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n", nesdev->netdev_count, nesdev->nesadapter->netdev_count); + ibnl_remove_client(RDMA_NL_NES); nes_notifiers_registered--; if (nes_notifiers_registered == 0) { @@ -770,6 +794,8 @@ static void nes_remove(struct pci_dev *pcidev) nesdev->nesadapter->netdev_count--; } } + ibnl_remove_client(RDMA_NL_NES); + iwpm_exit(RDMA_NL_NES); nes_notifiers_registered--; if (nes_notifiers_registered == 0) { diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index 33cc58941a3..bd9d132f11c 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -51,6 +51,8 @@ #include <rdma/ib_pack.h> #include <rdma/rdma_cm.h> #include <rdma/iw_cm.h> +#include <rdma/rdma_netlink.h> +#include <rdma/iw_portmap.h> #define NES_SEND_FIRST_WRITE @@ -130,6 +132,7 @@ #define NES_DBG_IW_TX 0x00040000 #define NES_DBG_SHUTDOWN 0x00080000 #define NES_DBG_PAU 0x00100000 +#define NES_DBG_NLMSG 0x00200000 #define NES_DBG_RSVD1 0x10000000 #define NES_DBG_RSVD2 0x20000000 #define NES_DBG_RSVD3 0x40000000 diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 6b29249aa85..6f09a72e78d 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -59,6 +59,7 @@ #include <net/route.h> #include <net/ip_fib.h> #include <net/tcp.h> +#include <linux/fcntl.h> #include "nes.h" @@ -128,6 +129,7 @@ static void build_mpa_v1(struct nes_cm_node *, void *, u8); static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **); static void print_core(struct nes_cm_core *core); +static void record_ird_ord(struct nes_cm_node *, u16, u16); /* External CM API Interface */ /* instance of function pointers for client API */ @@ -165,7 +167,6 @@ int nes_rem_ref_cm_node(struct nes_cm_node *cm_node) { return rem_ref_cm_node(cm_node->cm_core, cm_node); } - /** * create_event */ @@ -317,7 +318,6 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, } } - if (priv_data_len + mpa_hdr_len != len) { nes_debug(NES_DBG_CM, "The received ietf buffer was not right" " complete (%x + %x != %x)\n", @@ -356,25 +356,57 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, /* send reset */ return -EINVAL; } + if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD) + cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD; - if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + if (cm_node->mpav2_ird_ord != IETF_NO_IRD_ORD) { /* responder */ - if (cm_node->ord_size > ird_size) - cm_node->ord_size = ird_size; - } else { - /* initiator */ - if (cm_node->ord_size > ird_size) - cm_node->ord_size = ird_size; - - if (cm_node->ird_size < ord_size) { - /* no resources available */ - /* send terminate message */ - return -EINVAL; + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + /* we are still negotiating */ + if (ord_size > NES_MAX_IRD) { + cm_node->ird_size = NES_MAX_IRD; + } else { + cm_node->ird_size = ord_size; + if (ord_size == 0 && + (rtr_ctrl_ord & IETF_RDMA0_READ)) { + cm_node->ird_size = 1; + nes_debug(NES_DBG_CM, + "%s: Remote peer doesn't support RDMA0_READ (ord=%u)\n", + __func__, ord_size); + } + } + if (ird_size > NES_MAX_ORD) + cm_node->ord_size = NES_MAX_ORD; + else + cm_node->ord_size = ird_size; + } else { /* initiator */ + if (ord_size > NES_MAX_IRD) { + nes_debug(NES_DBG_CM, + "%s: Unable to support the requested (ord =%u)\n", + __func__, ord_size); + return -EINVAL; + } + cm_node->ird_size = ord_size; + + if (ird_size > NES_MAX_ORD) { + cm_node->ord_size = NES_MAX_ORD; + } else { + if (ird_size == 0 && + (rtr_ctrl_ord & IETF_RDMA0_READ)) { + nes_debug(NES_DBG_CM, + "%s: Remote peer doesn't support RDMA0_READ (ird=%u)\n", + __func__, ird_size); + return -EINVAL; + } else { + cm_node->ord_size = ird_size; + } + } } } if (rtr_ctrl_ord & IETF_RDMA0_READ) { cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + } else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) { cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO; } else { /* Not supported RDMA0 operation */ @@ -450,11 +482,11 @@ static void form_cm_frame(struct sk_buff *skb, iph->ttl = 0x40; iph->protocol = 0x06; /* IPPROTO_TCP */ - iph->saddr = htonl(cm_node->loc_addr); - iph->daddr = htonl(cm_node->rem_addr); + iph->saddr = htonl(cm_node->mapped_loc_addr); + iph->daddr = htonl(cm_node->mapped_rem_addr); - tcph->source = htons(cm_node->loc_port); - tcph->dest = htons(cm_node->rem_port); + tcph->source = htons(cm_node->mapped_loc_port); + tcph->dest = htons(cm_node->mapped_rem_port); tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num); if (flags & SET_ACK) { @@ -493,6 +525,100 @@ static void form_cm_frame(struct sk_buff *skb, cm_packets_created++; } +/* + * nes_create_sockaddr - Record ip addr and tcp port in a sockaddr struct + */ +static void nes_create_sockaddr(__be32 ip_addr, __be16 port, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *nes_sockaddr = (struct sockaddr_in *)addr; + nes_sockaddr->sin_family = AF_INET; + memcpy(&nes_sockaddr->sin_addr.s_addr, &ip_addr, sizeof(__be32)); + nes_sockaddr->sin_port = port; +} + +/* + * nes_create_mapinfo - Create a mapinfo object in the port mapper data base + */ +static int nes_create_mapinfo(struct nes_cm_info *cm_info) +{ + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + + nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), + &local_sockaddr); + nes_create_sockaddr(htonl(cm_info->mapped_loc_addr), + htons(cm_info->mapped_loc_port), &mapped_sockaddr); + + return iwpm_create_mapinfo(&local_sockaddr, + &mapped_sockaddr, RDMA_NL_NES); +} + +/* + * nes_remove_mapinfo - Remove a mapinfo object from the port mapper data base + * and send a remove mapping op message to + * the userspace port mapper + */ +static int nes_remove_mapinfo(u32 loc_addr, u16 loc_port, + u32 mapped_loc_addr, u16 mapped_loc_port) +{ + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + + nes_create_sockaddr(htonl(loc_addr), htons(loc_port), &local_sockaddr); + nes_create_sockaddr(htonl(mapped_loc_addr), htons(mapped_loc_port), + &mapped_sockaddr); + + iwpm_remove_mapinfo(&local_sockaddr, &mapped_sockaddr); + return iwpm_remove_mapping(&local_sockaddr, RDMA_NL_NES); +} + +/* + * nes_form_pm_msg - Form a port mapper message with mapping info + */ +static void nes_form_pm_msg(struct nes_cm_info *cm_info, + struct iwpm_sa_data *pm_msg) +{ + nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), + &pm_msg->loc_addr); + nes_create_sockaddr(htonl(cm_info->rem_addr), htons(cm_info->rem_port), + &pm_msg->rem_addr); +} + +/* + * nes_form_reg_msg - Form a port mapper message with dev info + */ +static void nes_form_reg_msg(struct nes_vnic *nesvnic, + struct iwpm_dev_data *pm_msg) +{ + memcpy(pm_msg->dev_name, nesvnic->nesibdev->ibdev.name, + IWPM_DEVNAME_SIZE); + memcpy(pm_msg->if_name, nesvnic->netdev->name, IWPM_IFNAME_SIZE); +} + +/* + * nes_record_pm_msg - Save the received mapping info + */ +static void nes_record_pm_msg(struct nes_cm_info *cm_info, + struct iwpm_sa_data *pm_msg) +{ + struct sockaddr_in *mapped_loc_addr = + (struct sockaddr_in *)&pm_msg->mapped_loc_addr; + struct sockaddr_in *mapped_rem_addr = + (struct sockaddr_in *)&pm_msg->mapped_rem_addr; + + if (mapped_loc_addr->sin_family == AF_INET) { + cm_info->mapped_loc_addr = + ntohl(mapped_loc_addr->sin_addr.s_addr); + cm_info->mapped_loc_port = ntohs(mapped_loc_addr->sin_port); + } + if (mapped_rem_addr->sin_family == AF_INET) { + cm_info->mapped_rem_addr = + ntohl(mapped_rem_addr->sin_addr.s_addr); + cm_info->mapped_rem_port = ntohs(mapped_rem_addr->sin_port); + } +} + /** * print_core - dump a cm core */ @@ -514,6 +640,19 @@ static void print_core(struct nes_cm_core *core) nes_debug(NES_DBG_CM, "-------------- end core ---------------\n"); } +static void record_ird_ord(struct nes_cm_node *cm_node, + u16 conn_ird, u16 conn_ord) +{ + if (conn_ird > NES_MAX_IRD) + conn_ird = NES_MAX_IRD; + + if (conn_ord > NES_MAX_ORD) + conn_ord = NES_MAX_ORD; + + cm_node->ird_size = conn_ird; + cm_node->ord_size = conn_ord; +} + /** * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame */ @@ -557,11 +696,13 @@ static void build_mpa_v2(struct nes_cm_node *cm_node, mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE); /* initialize RTR msg */ - ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ? - IETF_NO_IRD_ORD : cm_node->ird_size; - ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ? - IETF_NO_IRD_ORD : cm_node->ord_size; - + if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { + ctrl_ird = IETF_NO_IRD_ORD; + ctrl_ord = IETF_NO_IRD_ORD; + } else { + ctrl_ird = cm_node->ird_size & IETF_NO_IRD_ORD; + ctrl_ord = cm_node->ord_size & IETF_NO_IRD_ORD; + } ctrl_ird |= IETF_PEER_TO_PEER; ctrl_ird |= IETF_FLPDU_ZERO_LEN; @@ -610,7 +751,7 @@ static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_a struct nes_qp *nesqp = *nesqp_addr; struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0]; - u64temp = (unsigned long)nesqp; + u64temp = (unsigned long)nesqp->nesuqp_addr; u64temp |= NES_SW_CONTEXT_ALIGN >> 1; set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp); @@ -1100,8 +1241,11 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, loc_addr, loc_port, cm_node->rem_addr, cm_node->rem_port, rem_addr, rem_port); - if ((cm_node->loc_addr == loc_addr) && (cm_node->loc_port == loc_port) && - (cm_node->rem_addr == rem_addr) && (cm_node->rem_port == rem_port)) { + if ((cm_node->mapped_loc_addr == loc_addr) && + (cm_node->mapped_loc_port == loc_port) && + (cm_node->mapped_rem_addr == rem_addr) && + (cm_node->mapped_rem_port == rem_port)) { + add_ref_cm_node(cm_node); spin_unlock_irqrestore(&cm_core->ht_lock, flags); return cm_node; @@ -1118,18 +1262,28 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, * find_listener - find a cm node listening on this addr-port pair */ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, - nes_addr_t dst_addr, u16 dst_port, enum nes_cm_listener_state listener_state) + nes_addr_t dst_addr, u16 dst_port, + enum nes_cm_listener_state listener_state, int local) { unsigned long flags; struct nes_cm_listener *listen_node; + nes_addr_t listen_addr; + u16 listen_port; /* walk list and find cm_node associated with this session ID */ spin_lock_irqsave(&cm_core->listen_list_lock, flags); list_for_each_entry(listen_node, &cm_core->listen_list.list, list) { + if (local) { + listen_addr = listen_node->loc_addr; + listen_port = listen_node->loc_port; + } else { + listen_addr = listen_node->mapped_loc_addr; + listen_port = listen_node->mapped_loc_port; + } /* compare node pair, return node handle if a match */ - if (((listen_node->loc_addr == dst_addr) || - listen_node->loc_addr == 0x00000000) && - (listen_node->loc_port == dst_port) && + if (((listen_addr == dst_addr) || + listen_addr == 0x00000000) && + (listen_port == dst_port) && (listener_state & listen_node->listener_state)) { atomic_inc(&listen_node->ref_count); spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); @@ -1142,7 +1296,6 @@ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, return NULL; } - /** * add_hte_node - add a cm node to the hash table */ @@ -1263,9 +1416,20 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - if (listener->nesvnic) - nes_manage_apbvt(listener->nesvnic, listener->loc_port, - PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); + if (listener->nesvnic) { + nes_manage_apbvt(listener->nesvnic, + listener->mapped_loc_port, + PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + + nes_remove_mapinfo(listener->loc_addr, + listener->loc_port, + listener->mapped_loc_addr, + listener->mapped_loc_port); + nes_debug(NES_DBG_NLMSG, + "Delete APBVT mapped_loc_port = %04X\n", + listener->mapped_loc_port); + } nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener); @@ -1354,8 +1518,7 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi neigh->ha, ntohl(rt->rt_gateway)); if (arpindex >= 0) { - if (!memcmp(nesadapter->arp_table[arpindex].mac_addr, - neigh->ha, ETH_ALEN)) { + if (ether_addr_equal(nesadapter->arp_table[arpindex].mac_addr, neigh->ha)) { /* Mac address same as in nes_arp_table */ goto out; } @@ -1408,10 +1571,16 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->loc_port = cm_info->loc_port; cm_node->rem_port = cm_info->rem_port; + cm_node->mapped_loc_addr = cm_info->mapped_loc_addr; + cm_node->mapped_rem_addr = cm_info->mapped_rem_addr; + cm_node->mapped_loc_port = cm_info->mapped_loc_port; + cm_node->mapped_rem_port = cm_info->mapped_rem_port; + cm_node->mpa_frame_rev = mpa_version; cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; - cm_node->ird_size = IETF_NO_IRD_ORD; - cm_node->ord_size = IETF_NO_IRD_ORD; + cm_node->mpav2_ird_ord = 0; + cm_node->ird_size = 0; + cm_node->ord_size = 0; nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n", &cm_node->loc_addr, cm_node->loc_port, @@ -1453,8 +1622,10 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->loopbackpartner = NULL; /* get the mac addr for the remote node */ - oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); - arpindex = nes_addr_resolve_neigh(nesvnic, cm_info->rem_addr, oldarpindex); + oldarpindex = nes_arp_table(nesdev, cm_node->mapped_rem_addr, + NULL, NES_ARP_RESOLVE); + arpindex = nes_addr_resolve_neigh(nesvnic, + cm_node->mapped_rem_addr, oldarpindex); if (arpindex < 0) { kfree(cm_node); return NULL; @@ -1516,11 +1687,14 @@ static int rem_ref_cm_node(struct nes_cm_core *cm_core, mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0); } else { if (cm_node->apbvt_set && cm_node->nesvnic) { - nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port, - PCI_FUNC( - cm_node->nesvnic->nesdev->pcidev->devfn), + nes_manage_apbvt(cm_node->nesvnic, cm_node->mapped_loc_port, + PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); } + nes_debug(NES_DBG_NLMSG, "Delete APBVT mapped_loc_port = %04X\n", + cm_node->mapped_loc_port); + nes_remove_mapinfo(cm_node->loc_addr, cm_node->loc_port, + cm_node->mapped_loc_addr, cm_node->mapped_loc_port); } atomic_dec(&cm_core->node_cnt); @@ -2188,17 +2362,21 @@ static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb, * mini_cm_listen - create a listen node with params */ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, - struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) { struct nes_cm_listener *listener; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; unsigned long flags; + int iwpm_err = 0; nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n", cm_info->loc_addr, cm_info->loc_port); /* cannot have multiple matching listeners */ - listener = find_listener(cm_core, htonl(cm_info->loc_addr), - htons(cm_info->loc_port), NES_CM_LISTENER_EITHER_STATE); + listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port, + NES_CM_LISTENER_EITHER_STATE, 1); + if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) { /* find automatically incs ref count ??? */ atomic_dec(&listener->ref_count); @@ -2207,6 +2385,22 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, } if (!listener) { + nes_form_reg_msg(nesvnic, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); + if (iwpm_err) { + nes_debug(NES_DBG_NLMSG, + "Port Mapper reg pid fail (err = %d).\n", iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + nes_form_pm_msg(cm_info, &pm_msg); + iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_NES); + if (iwpm_err) + nes_debug(NES_DBG_NLMSG, + "Port Mapper query fail (err = %d).\n", iwpm_err); + else + nes_record_pm_msg(cm_info, &pm_msg); + } + /* create a CM listen node (1/2 node to compare incoming traffic to) */ listener = kzalloc(sizeof(*listener), GFP_ATOMIC); if (!listener) { @@ -2214,8 +2408,10 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, return NULL; } - listener->loc_addr = htonl(cm_info->loc_addr); - listener->loc_port = htons(cm_info->loc_port); + listener->loc_addr = cm_info->loc_addr; + listener->loc_port = cm_info->loc_port; + listener->mapped_loc_addr = cm_info->mapped_loc_addr; + listener->mapped_loc_port = cm_info->mapped_loc_port; listener->reused_node = 0; atomic_set(&listener->ref_count, 1); @@ -2277,14 +2473,18 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, if (cm_info->loc_addr == cm_info->rem_addr) { loopbackremotelistener = find_listener(cm_core, - ntohl(nesvnic->local_ipaddr), cm_node->rem_port, - NES_CM_LISTENER_ACTIVE_STATE); + cm_node->mapped_loc_addr, cm_node->mapped_rem_port, + NES_CM_LISTENER_ACTIVE_STATE, 0); if (loopbackremotelistener == NULL) { create_event(cm_node, NES_CM_EVENT_ABORTED); } else { loopback_cm_info = *cm_info; loopback_cm_info.loc_port = cm_info->rem_port; loopback_cm_info.rem_port = cm_info->loc_port; + loopback_cm_info.mapped_loc_port = + cm_info->mapped_rem_port; + loopback_cm_info.mapped_rem_port = + cm_info->mapped_loc_port; loopback_cm_info.cm_id = loopbackremotelistener->cm_id; loopbackremotenode = make_cm_node(cm_core, nesvnic, &loopback_cm_info, loopbackremotelistener); @@ -2513,6 +2713,12 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, nfo.rem_addr = ntohl(iph->saddr); nfo.rem_port = ntohs(tcph->source); + /* If port mapper is available these should be mapped address info */ + nfo.mapped_loc_addr = ntohl(iph->daddr); + nfo.mapped_loc_port = ntohs(tcph->dest); + nfo.mapped_rem_addr = ntohl(iph->saddr); + nfo.mapped_rem_port = ntohs(tcph->source); + tmp_daddr = cpu_to_be32(iph->daddr); tmp_saddr = cpu_to_be32(iph->saddr); @@ -2521,8 +2727,8 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, do { cm_node = find_node(cm_core, - nfo.rem_port, nfo.rem_addr, - nfo.loc_port, nfo.loc_addr); + nfo.mapped_rem_port, nfo.mapped_rem_addr, + nfo.mapped_loc_port, nfo.mapped_loc_addr); if (!cm_node) { /* Only type of packet accepted are for */ @@ -2531,9 +2737,9 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, skb_handled = 0; break; } - listener = find_listener(cm_core, nfo.loc_addr, - nfo.loc_port, - NES_CM_LISTENER_ACTIVE_STATE); + listener = find_listener(cm_core, nfo.mapped_loc_addr, + nfo.mapped_loc_port, + NES_CM_LISTENER_ACTIVE_STATE, 0); if (!listener) { nfo.cm_id = NULL; nfo.conn_type = 0; @@ -3028,11 +3234,11 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) rem_ref_cm_node(cm_node->cm_core, cm_node); return -ECONNRESET; } - /* associate the node with the QP */ nesqp->cm_node = (void *)cm_node; cm_node->nesqp = nesqp; + nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu listener = %p\n", nesqp->hwqp.qp_id, cm_node, jiffies, cm_node->listener); atomic_inc(&cm_accepts); @@ -3055,6 +3261,11 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (cm_node->mpa_frame_rev == IETF_MPA_V1) mpa_frame_offset = 4; + if (cm_node->mpa_frame_rev == IETF_MPA_V1 || + cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { + record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); + } + memcpy(mpa_v2_frame->priv_data, conn_param->private_data, conn_param->private_data_len); @@ -3118,7 +3329,6 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) } nesqp->skip_lsmm = 1; - /* Cache the cm_id in the qp */ nesqp->cm_id = cm_id; cm_node->cm_id = cm_id; @@ -3133,10 +3343,12 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nes_cm_init_tsa_conn(nesqp, cm_node); - nesqp->nesqp_context->tcpPorts[0] = cpu_to_le16(ntohs(laddr->sin_port)); - nesqp->nesqp_context->tcpPorts[1] = cpu_to_le16(ntohs(raddr->sin_port)); + nesqp->nesqp_context->tcpPorts[0] = + cpu_to_le16(cm_node->mapped_loc_port); + nesqp->nesqp_context->tcpPorts[1] = + cpu_to_le16(cm_node->mapped_rem_port); - nesqp->nesqp_context->ip0 = cpu_to_le32(ntohl(raddr->sin_addr.s_addr)); + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); nesqp->nesqp_context->misc2 |= cpu_to_le32( (u32)PCI_FUNC(nesdev->pcidev->devfn) << @@ -3155,14 +3367,14 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32( ((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT)); nesqp->nesqp_context->ird_ord_sizes |= - cpu_to_le32((u32)conn_param->ord); + cpu_to_le32((u32)cm_node->ord_size); memset(&nes_quad, 0, sizeof(nes_quad)); nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); - nes_quad.SrcIpadr = raddr->sin_addr.s_addr; - nes_quad.TcpPorts[0] = raddr->sin_port; - nes_quad.TcpPorts[1] = laddr->sin_port; + nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); /* Produce hash key */ crc_value = get_crc_value(&nes_quad); @@ -3195,6 +3407,9 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_event.remote_addr = cm_id->remote_addr; cm_event.private_data = NULL; cm_event.private_data_len = 0; + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; + ret = cm_id->event_handler(cm_id, &cm_event); attr.qp_state = IB_QPS_RTS; nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); @@ -3261,6 +3476,9 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) int apbvt_set = 0; struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int iwpm_err = 0; if (cm_id->remote_addr.ss_family != AF_INET) return -ENOSYS; @@ -3291,33 +3509,51 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) /* cache the cm_id in the qp */ nesqp->cm_id = cm_id; - cm_id->provider_data = nesqp; - nesqp->private_data_len = conn_param->private_data_len; - nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((u32)conn_param->ord); - /* space for rdma0 read msg */ - if (conn_param->ord == 0) - nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(1); nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord); nes_debug(NES_DBG_CM, "mpa private data len =%u\n", conn_param->private_data_len); + /* set up the connection params for the node */ + cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr); + cm_info.loc_port = ntohs(laddr->sin_port); + cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr); + cm_info.rem_port = ntohs(raddr->sin_port); + cm_info.cm_id = cm_id; + cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + + /* No port mapper available, go with the specified peer information */ + cm_info.mapped_loc_addr = cm_info.loc_addr; + cm_info.mapped_loc_port = cm_info.loc_port; + cm_info.mapped_rem_addr = cm_info.rem_addr; + cm_info.mapped_rem_port = cm_info.rem_port; + + nes_form_reg_msg(nesvnic, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); + if (iwpm_err) { + nes_debug(NES_DBG_NLMSG, + "Port Mapper reg pid fail (err = %d).\n", iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + nes_form_pm_msg(&cm_info, &pm_msg); + iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_NES); + if (iwpm_err) + nes_debug(NES_DBG_NLMSG, + "Port Mapper query fail (err = %d).\n", iwpm_err); + else + nes_record_pm_msg(&cm_info, &pm_msg); + } + if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) { - nes_manage_apbvt(nesvnic, ntohs(laddr->sin_port), - PCI_FUNC(nesdev->pcidev->devfn), - NES_MANAGE_APBVT_ADD); + nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, + PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); apbvt_set = 1; } - /* set up the connection params for the node */ - cm_info.loc_addr = htonl(laddr->sin_addr.s_addr); - cm_info.loc_port = htons(laddr->sin_port); - cm_info.rem_addr = htonl(raddr->sin_addr.s_addr); - cm_info.rem_port = htons(raddr->sin_port); - cm_info.cm_id = cm_id; - cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + if (nes_create_mapinfo(&cm_info)) + return -ENOMEM; cm_id->add_ref(cm_id); @@ -3327,14 +3563,23 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) &cm_info); if (!cm_node) { if (apbvt_set) - nes_manage_apbvt(nesvnic, ntohs(laddr->sin_port), + nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); + nes_debug(NES_DBG_NLMSG, "Delete mapped_loc_port = %04X\n", + cm_info.mapped_loc_port); + nes_remove_mapinfo(cm_info.loc_addr, cm_info.loc_port, + cm_info.mapped_loc_addr, cm_info.mapped_loc_port); cm_id->rem_ref(cm_id); return -ENOMEM; } + record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); + if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && + cm_node->ord_size == 0) + cm_node->ord_size = 1; + cm_node->apbvt_set = apbvt_set; nesqp->cm_node = cm_node; cm_node->nesqp = nesqp; @@ -3371,13 +3616,16 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) nesvnic->local_ipaddr, laddr->sin_addr.s_addr); /* setup listen params in our api call struct */ - cm_info.loc_addr = nesvnic->local_ipaddr; - cm_info.loc_port = laddr->sin_port; + cm_info.loc_addr = ntohl(nesvnic->local_ipaddr); + cm_info.loc_port = ntohs(laddr->sin_port); cm_info.backlog = backlog; cm_info.cm_id = cm_id; cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + /* No port mapper available, go with the specified info */ + cm_info.mapped_loc_addr = cm_info.loc_addr; + cm_info.mapped_loc_port = cm_info.loc_port; cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info); if (!cm_node) { @@ -3389,7 +3637,10 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->provider_data = cm_node; if (!cm_node->reused_node) { - err = nes_manage_apbvt(nesvnic, ntohs(laddr->sin_port), + if (nes_create_mapinfo(&cm_info)) + return -ENOMEM; + + err = nes_manage_apbvt(nesvnic, cm_node->mapped_loc_port, PCI_FUNC(nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); if (err) { @@ -3514,9 +3765,11 @@ static void cm_event_connected(struct nes_cm_event *event) nes_cm_init_tsa_conn(nesqp, cm_node); /* set the QP tsa context */ - nesqp->nesqp_context->tcpPorts[0] = cpu_to_le16(ntohs(laddr->sin_port)); - nesqp->nesqp_context->tcpPorts[1] = cpu_to_le16(ntohs(raddr->sin_port)); - nesqp->nesqp_context->ip0 = cpu_to_le32(ntohl(raddr->sin_addr.s_addr)); + nesqp->nesqp_context->tcpPorts[0] = + cpu_to_le16(cm_node->mapped_loc_port); + nesqp->nesqp_context->tcpPorts[1] = + cpu_to_le16(cm_node->mapped_rem_port); + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); nesqp->nesqp_context->misc2 |= cpu_to_le32( (u32)PCI_FUNC(nesdev->pcidev->devfn) << @@ -3531,6 +3784,8 @@ static void cm_event_connected(struct nes_cm_event *event) nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)cm_node->ord_size); /* Adjust tail for not having a LSMM */ /*nesqp->hwqp.sq_tail = 1;*/ @@ -3544,9 +3799,9 @@ static void cm_event_connected(struct nes_cm_event *event) nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); - nes_quad.SrcIpadr = raddr->sin_addr.s_addr; - nes_quad.TcpPorts[0] = raddr->sin_port; - nes_quad.TcpPorts[1] = laddr->sin_port; + nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); /* Produce hash key */ crc_value = get_crc_value(&nes_quad); @@ -3574,7 +3829,7 @@ static void cm_event_connected(struct nes_cm_event *event) cm_event.ird = cm_node->ird_size; cm_event.ord = cm_node->ord_size; - cm_event_laddr->sin_addr.s_addr = event->cm_info.rem_addr; + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); ret = cm_id->event_handler(cm_id, &cm_event); nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); @@ -3743,8 +3998,13 @@ static void cm_event_mpa_req(struct nes_cm_event *event) cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); cm_event.private_data = cm_node->mpa_frame_buf; cm_event.private_data_len = (u8)cm_node->mpa_frame_size; + if (cm_node->mpa_frame_rev == IETF_MPA_V1) { + cm_event.ird = NES_MAX_IRD; + cm_event.ord = NES_MAX_ORD; + } else { cm_event.ird = cm_node->ird_size; cm_event.ord = cm_node->ord_size; + } ret = cm_id->event_handler(cm_id, &cm_event); if (ret) diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 4646e666608..f522cf63978 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -58,6 +58,8 @@ #define IETF_RDMA0_WRITE 0x8000 #define IETF_RDMA0_READ 0x4000 #define IETF_NO_IRD_ORD 0x3FFF +#define NES_MAX_IRD 0x40 +#define NES_MAX_ORD 0x7F enum ietf_mpa_flags { IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ @@ -291,8 +293,8 @@ struct nes_cm_listener { struct list_head list; struct nes_cm_core *cm_core; u8 loc_mac[ETH_ALEN]; - nes_addr_t loc_addr; - u16 loc_port; + nes_addr_t loc_addr, mapped_loc_addr; + u16 loc_port, mapped_loc_port; struct iw_cm_id *cm_id; enum nes_cm_conn_type conn_type; atomic_t ref_count; @@ -306,7 +308,9 @@ struct nes_cm_listener { /* per connection node and node state information */ struct nes_cm_node { nes_addr_t loc_addr, rem_addr; + nes_addr_t mapped_loc_addr, mapped_rem_addr; u16 loc_port, rem_port; + u16 mapped_loc_port, mapped_rem_port; u8 loc_mac[ETH_ALEN]; u8 rem_mac[ETH_ALEN]; @@ -333,6 +337,7 @@ struct nes_cm_node { enum mpa_frame_version mpa_frame_rev; u16 ird_size; u16 ord_size; + u16 mpav2_ird_ord; u16 mpa_frame_size; struct iw_cm_id *cm_id; @@ -361,6 +366,10 @@ struct nes_cm_info { u16 rem_port; nes_addr_t loc_addr; nes_addr_t rem_addr; + u16 mapped_loc_port; + u16 mapped_rem_port; + nes_addr_t mapped_loc_addr; + nes_addr_t mapped_rem_addr; enum nes_cm_conn_type conn_type; int backlog; diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h index 4926de74448..529c421bb15 100644 --- a/drivers/infiniband/hw/nes/nes_user.h +++ b/drivers/infiniband/hw/nes/nes_user.h @@ -39,8 +39,8 @@ #include <linux/types.h> -#define NES_ABI_USERSPACE_VER 1 -#define NES_ABI_KERNEL_VER 1 +#define NES_ABI_USERSPACE_VER 2 +#define NES_ABI_KERNEL_VER 2 /* * Make sure that all structs defined in this file remain laid out so @@ -78,6 +78,7 @@ struct nes_create_cq_req { struct nes_create_qp_req { __u64 user_wqe_buffers; + __u64 user_qp_buffer; }; enum iwnes_memreg_type { diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 5b53ca5a228..218dd357428 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1186,11 +1186,13 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); kfree(nesqp->allocated_buffer); nes_debug(NES_DBG_QP, "ib_copy_from_udata() Failed \n"); - return NULL; + return ERR_PTR(-EFAULT); } if (req.user_wqe_buffers) { virt_wqs = 1; } + if (req.user_qp_buffer) + nesqp->nesuqp_addr = req.user_qp_buffer; if ((ibpd->uobject) && (ibpd->uobject->context)) { nesqp->user_mode = 1; nes_ucontext = to_nesucontext(ibpd->uobject->context); @@ -2307,7 +2309,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; struct ib_mr *ibmr = ERR_PTR(-EINVAL); - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct nes_ucontext *nes_ucontext; struct nes_pbl *nespbl; struct nes_mr *nesmr; @@ -2315,7 +2317,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct nes_mem_reg_req req; struct nes_vpbl vpbl; struct nes_root_vpbl root_vpbl; - int nmap_index, page_index; + int entry, page_index; int page_count = 0; int err, pbl_depth = 0; int chunk_pages; @@ -2330,6 +2332,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u16 pbl_count; u8 single_page = 1; u8 stag_key; + int first_page = 1; region = ib_umem_get(pd->uobject->context, start, length, acc, 0); if (IS_ERR(region)) { @@ -2380,128 +2383,125 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } nesmr->region = region; - list_for_each_entry(chunk, ®ion->chunk_list, list) { - nes_debug(NES_DBG_MR, "Chunk: nents = %u, nmap = %u .\n", - chunk->nents, chunk->nmap); - for (nmap_index = 0; nmap_index < chunk->nmap; ++nmap_index) { - if (sg_dma_address(&chunk->page_list[nmap_index]) & ~PAGE_MASK) { - ib_umem_release(region); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n", - (unsigned int) sg_dma_address(&chunk->page_list[nmap_index])); - ibmr = ERR_PTR(-EINVAL); - kfree(nesmr); - goto reg_user_mr_err; - } + for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { + if (sg_dma_address(sg) & ~PAGE_MASK) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n", + (unsigned int) sg_dma_address(sg)); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } - if (!sg_dma_len(&chunk->page_list[nmap_index])) { - ib_umem_release(region); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, - stag_index); - nes_debug(NES_DBG_MR, "Invalid Buffer Size\n"); - ibmr = ERR_PTR(-EINVAL); - kfree(nesmr); - goto reg_user_mr_err; - } + if (!sg_dma_len(sg)) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + nes_debug(NES_DBG_MR, "Invalid Buffer Size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } - region_length += sg_dma_len(&chunk->page_list[nmap_index]); - chunk_pages = sg_dma_len(&chunk->page_list[nmap_index]) >> 12; - region_length -= skip_pages << 12; - for (page_index=skip_pages; page_index < chunk_pages; page_index++) { - skip_pages = 0; - if ((page_count!=0)&&(page_count<<12)-(region->offset&(4096-1))>=region->length) - goto enough_pages; - if ((page_count&0x01FF) == 0) { - if (page_count >= 1024 * 512) { + region_length += sg_dma_len(sg); + chunk_pages = sg_dma_len(sg) >> 12; + region_length -= skip_pages << 12; + for (page_index = skip_pages; page_index < chunk_pages; page_index++) { + skip_pages = 0; + if ((page_count != 0) && (page_count<<12)-(region->offset&(4096-1)) >= region->length) + goto enough_pages; + if ((page_count&0x01FF) == 0) { + if (page_count >= 1024 * 512) { + ib_umem_release(region); + nes_free_resource(nesadapter, + nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-E2BIG); + goto reg_user_mr_err; + } + if (root_pbl_index == 1) { + root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, + 8192, &root_vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", + root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); + if (!root_vpbl.pbl_vbase) { ib_umem_release(region); - nes_free_resource(nesadapter, - nesadapter->allocated_mrs, stag_index); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); kfree(nesmr); - ibmr = ERR_PTR(-E2BIG); + ibmr = ERR_PTR(-ENOMEM); goto reg_user_mr_err; } - if (root_pbl_index == 1) { - root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, - 8192, &root_vpbl.pbl_pbase); - nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", - root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); - if (!root_vpbl.pbl_vbase) { - ib_umem_release(region); - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, - vpbl.pbl_pbase); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, - stag_index); - kfree(nesmr); - ibmr = ERR_PTR(-ENOMEM); - goto reg_user_mr_err; - } - root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, - GFP_KERNEL); - if (!root_vpbl.leaf_vpbl) { - ib_umem_release(region); - pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, - root_vpbl.pbl_pbase); - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, - vpbl.pbl_pbase); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, - stag_index); - kfree(nesmr); - ibmr = ERR_PTR(-ENOMEM); - goto reg_user_mr_err; - } - root_vpbl.pbl_vbase[0].pa_low = - cpu_to_le32((u32)vpbl.pbl_pbase); - root_vpbl.pbl_vbase[0].pa_high = - cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); - root_vpbl.leaf_vpbl[0] = vpbl; - } - vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, - &vpbl.pbl_pbase); - nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n", - vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase); - if (!vpbl.pbl_vbase) { + root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, + GFP_KERNEL); + if (!root_vpbl.leaf_vpbl) { ib_umem_release(region); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - ibmr = ERR_PTR(-ENOMEM); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); goto reg_user_mr_err; } - if (1 <= root_pbl_index) { - root_vpbl.pbl_vbase[root_pbl_index].pa_low = - cpu_to_le32((u32)vpbl.pbl_pbase); - root_vpbl.pbl_vbase[root_pbl_index].pa_high = - cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); - root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; - } - root_pbl_index++; - cur_pbl_index = 0; + root_vpbl.pbl_vbase[0].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[0].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); + root_vpbl.leaf_vpbl[0] = vpbl; } - if (single_page) { - if (page_count != 0) { - if ((last_dma_addr+4096) != - (sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096))) - single_page = 0; - last_dma_addr = sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096); - } else { - first_dma_addr = sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096); - last_dma_addr = first_dma_addr; - } + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n", + vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_user_mr_err; + } + if (1 <= root_pbl_index) { + root_vpbl.pbl_vbase[root_pbl_index].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[root_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); + root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; + } + root_pbl_index++; + cur_pbl_index = 0; + } + if (single_page) { + if (page_count != 0) { + if ((last_dma_addr+4096) != + (sg_dma_address(sg)+ + (page_index*4096))) + single_page = 0; + last_dma_addr = sg_dma_address(sg)+ + (page_index*4096); + } else { + first_dma_addr = sg_dma_address(sg)+ + (page_index*4096); + last_dma_addr = first_dma_addr; } - - vpbl.pbl_vbase[cur_pbl_index].pa_low = - cpu_to_le32((u32)(sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096))); - vpbl.pbl_vbase[cur_pbl_index].pa_high = - cpu_to_le32((u32)((((u64)(sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096))) >> 32))); - cur_pbl_index++; - page_count++; } + + vpbl.pbl_vbase[cur_pbl_index].pa_low = + cpu_to_le32((u32)(sg_dma_address(sg)+ + (page_index*4096))); + vpbl.pbl_vbase[cur_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)(sg_dma_address(sg)+ + (page_index*4096))) >> 32))); + cur_pbl_index++; + page_count++; } } + enough_pages: nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x," " stag_key=0x%08x\n", @@ -2613,25 +2613,28 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase, (void *) nespbl->pbl_vbase, nespbl->user_base); - list_for_each_entry(chunk, ®ion->chunk_list, list) { - for (nmap_index = 0; nmap_index < chunk->nmap; ++nmap_index) { - chunk_pages = sg_dma_len(&chunk->page_list[nmap_index]) >> 12; - chunk_pages += (sg_dma_len(&chunk->page_list[nmap_index]) & (4096-1)) ? 1 : 0; - nespbl->page = sg_page(&chunk->page_list[0]); - for (page_index=0; page_index<chunk_pages; page_index++) { - ((__le32 *)pbl)[0] = cpu_to_le32((u32) - (sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096))); - ((__le32 *)pbl)[1] = cpu_to_le32(((u64) - (sg_dma_address(&chunk->page_list[nmap_index])+ - (page_index*4096)))>>32); - nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl, - (unsigned long long)*pbl, - le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0])); - pbl++; - } + for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { + chunk_pages = sg_dma_len(sg) >> 12; + chunk_pages += (sg_dma_len(sg) & (4096-1)) ? 1 : 0; + if (first_page) { + nespbl->page = sg_page(sg); + first_page = 0; + } + + for (page_index = 0; page_index < chunk_pages; page_index++) { + ((__le32 *)pbl)[0] = cpu_to_le32((u32) + (sg_dma_address(sg)+ + (page_index*4096))); + ((__le32 *)pbl)[1] = cpu_to_le32(((u64) + (sg_dma_address(sg)+ + (page_index*4096)))>>32); + nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl, + (unsigned long long)*pbl, + le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0])); + pbl++; } } + if (req.reg_type == IWNES_MEMREG_TYPE_QP) { list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list); } else { @@ -2834,7 +2837,7 @@ static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->qp_context = nesqp->ibqp.qp_context; init_attr->send_cq = nesqp->ibqp.send_cq; init_attr->recv_cq = nesqp->ibqp.recv_cq; - init_attr->srq = nesqp->ibqp.srq = nesqp->ibqp.srq; + init_attr->srq = nesqp->ibqp.srq; init_attr->cap = attr->cap; return 0; @@ -3134,9 +3137,7 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), original_last_aeq, nesqp->last_aeq); - if ((!ret) || - ((original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) && - (ret))) { + if (!ret || original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { if (dont_wait) { if (nesqp->cm_id && nesqp->hw_tcp_state != 0) { nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d)," diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 0eff7c44d76..309b31c31ae 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -184,5 +184,6 @@ struct nes_qp { u8 pau_busy; u8 pau_pending; u8 pau_state; + __u64 nesuqp_addr; }; #endif /* NES_VERBS_H */ diff --git a/drivers/infiniband/hw/ocrdma/Kconfig b/drivers/infiniband/hw/ocrdma/Kconfig index b5b6056c851..c0cddc0192d 100644 --- a/drivers/infiniband/hw/ocrdma/Kconfig +++ b/drivers/infiniband/hw/ocrdma/Kconfig @@ -1,6 +1,6 @@ config INFINIBAND_OCRDMA tristate "Emulex One Connect HCA support" - depends on ETHERNET && NETDEVICES && PCI && (IPV6 || IPV6=n) + depends on ETHERNET && NETDEVICES && PCI && INET && (IPV6 || IPV6=n) select NET_VENDOR_EMULEX select BE2NET ---help--- diff --git a/drivers/infiniband/hw/ocrdma/Makefile b/drivers/infiniband/hw/ocrdma/Makefile index 06a5bed12e4..d1bfd4f4cdd 100644 --- a/drivers/infiniband/hw/ocrdma/Makefile +++ b/drivers/infiniband/hw/ocrdma/Makefile @@ -2,4 +2,4 @@ ccflags-y := -Idrivers/net/ethernet/emulex/benet obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma.o -ocrdma-y := ocrdma_main.o ocrdma_verbs.o ocrdma_hw.o ocrdma_ah.o +ocrdma-y := ocrdma_main.o ocrdma_verbs.o ocrdma_hw.o ocrdma_ah.o ocrdma_stats.o diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index adc11d14f87..19011dbb930 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -35,17 +35,27 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> #include <be_roce.h> #include "ocrdma_sli.h" -#define OCRDMA_ROCE_DEV_VERSION "1.0.0" +#define OCRDMA_ROCE_DRV_VERSION "10.2.145.0u" + +#define OCRDMA_ROCE_DRV_DESC "Emulex OneConnect RoCE Driver" #define OCRDMA_NODE_DESC "Emulex OneConnect RoCE HCA" +#define OC_NAME_SH OCRDMA_NODE_DESC "(Skyhawk)" +#define OC_NAME_UNKNOWN OCRDMA_NODE_DESC "(Unknown)" + +#define OC_SKH_DEVICE_PF 0x720 +#define OC_SKH_DEVICE_VF 0x728 #define OCRDMA_MAX_AH 512 #define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) +#define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo) + struct ocrdma_dev_attr { u8 fw_ver[32]; u32 vendor_id; @@ -65,6 +75,7 @@ struct ocrdma_dev_attr { int max_mr; u64 max_mr_size; u32 max_num_mr_pbl; + int max_mw; int max_fmr; int max_map_per_fmr; int max_pages_per_frmr; @@ -83,6 +94,12 @@ struct ocrdma_dev_attr { u8 num_ird_pages; }; +struct ocrdma_dma_mem { + void *va; + dma_addr_t pa; + u32 size; +}; + struct ocrdma_pbl { void *va; dma_addr_t pa; @@ -122,6 +139,52 @@ struct mqe_ctx { bool cmd_done; }; +struct ocrdma_hw_mr { + u32 lkey; + u8 fr_mr; + u8 remote_atomic; + u8 remote_rd; + u8 remote_wr; + u8 local_rd; + u8 local_wr; + u8 mw_bind; + u8 rsvd; + u64 len; + struct ocrdma_pbl *pbl_table; + u32 num_pbls; + u32 num_pbes; + u32 pbl_size; + u32 pbe_size; + u64 fbo; + u64 va; +}; + +struct ocrdma_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct ocrdma_hw_mr hwmr; +}; + +struct ocrdma_stats { + u8 type; + struct ocrdma_dev *dev; +}; + +struct stats_mem { + struct ocrdma_mqe mqe; + void *va; + dma_addr_t pa; + u32 size; + char *debugfs_mem; +}; + +struct phy_info { + u16 auto_speeds_supported; + u16 fixed_speeds_supported; + u16 phy_type; + u16 interface_type; +}; + struct ocrdma_dev { struct ib_device ibdev; struct ocrdma_dev_attr attr; @@ -165,12 +228,30 @@ struct ocrdma_dev { struct mqe_ctx mqe_ctx; struct be_dev_info nic_info; + struct phy_info phy; + char model_number[32]; + u32 hba_port_num; struct list_head entry; struct rcu_head rcu; int id; u64 stag_arr[OCRDMA_MAX_STAG]; u16 pvid; + u32 asic_id; + + ulong last_stats_time; + struct mutex stats_lock; /* provide synch for debugfs operations */ + struct stats_mem stats_mem; + struct ocrdma_stats rsrc_stats; + struct ocrdma_stats rx_stats; + struct ocrdma_stats wqe_stats; + struct ocrdma_stats tx_stats; + struct ocrdma_stats db_err_stats; + struct ocrdma_stats tx_qp_err_stats; + struct ocrdma_stats rx_qp_err_stats; + struct ocrdma_stats tx_dbg_stats; + struct ocrdma_stats rx_dbg_stats; + struct dentry *dir; }; struct ocrdma_cq { @@ -183,8 +264,8 @@ struct ocrdma_cq { */ u32 max_hw_cqe; bool phase_change; - bool armed, solicited; - bool arm_needed; + bool deferred_arm, deferred_sol; + bool first_arm; spinlock_t cq_lock ____cacheline_aligned; /* provide synchronization * to cq polling @@ -197,6 +278,7 @@ struct ocrdma_cq { struct ocrdma_ucontext *ucontext; dma_addr_t pa; u32 len; + u32 cqe_cnt; /* head of all qp's sq and rq for which cqes need to be flushed * by the software. @@ -206,7 +288,6 @@ struct ocrdma_cq { struct ocrdma_pd { struct ib_pd ibpd; - struct ocrdma_dev *dev; struct ocrdma_ucontext *uctx; u32 id; int num_dpp_qp; @@ -291,33 +372,6 @@ struct ocrdma_qp { bool dpp_enabled; u8 *ird_q_va; bool signaled; - u16 db_cache; -}; - -struct ocrdma_hw_mr { - u32 lkey; - u8 fr_mr; - u8 remote_atomic; - u8 remote_rd; - u8 remote_wr; - u8 local_rd; - u8 local_wr; - u8 mw_bind; - u8 rsvd; - u64 len; - struct ocrdma_pbl *pbl_table; - u32 num_pbls; - u32 num_pbes; - u32 pbl_size; - u32 pbe_size; - u64 fbo; - u64 va; -}; - -struct ocrdma_mr { - struct ib_mr ibmr; - struct ib_umem *umem; - struct ocrdma_hw_mr hwmr; }; struct ocrdma_ucontext { @@ -384,13 +438,6 @@ static inline struct ocrdma_srq *get_ocrdma_srq(struct ib_srq *ibsrq) return container_of(ibsrq, struct ocrdma_srq, ibsrq); } - -static inline int ocrdma_get_num_posted_shift(struct ocrdma_qp *qp) -{ - return ((qp->dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY && - qp->id < 128) ? 24 : 16); -} - static inline int is_cqe_valid(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe) { int cqe_valid; @@ -422,5 +469,53 @@ static inline int is_cqe_wr_imm(struct ocrdma_cqe *cqe) OCRDMA_CQE_WRITE_IMM) ? 1 : 0; } +static inline int ocrdma_resolve_dmac(struct ocrdma_dev *dev, + struct ib_ah_attr *ah_attr, u8 *mac_addr) +{ + struct in6_addr in6; + + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) + rdma_get_mcast_mac(&in6, mac_addr); + else + memcpy(mac_addr, ah_attr->dmac, ETH_ALEN); + return 0; +} + +static inline char *hca_name(struct ocrdma_dev *dev) +{ + switch (dev->nic_info.pdev->device) { + case OC_SKH_DEVICE_PF: + case OC_SKH_DEVICE_VF: + return OC_NAME_SH; + default: + return OC_NAME_UNKNOWN; + } +} + +static inline int ocrdma_get_eq_table_index(struct ocrdma_dev *dev, + int eqid) +{ + int indx; + + for (indx = 0; indx < dev->eq_cnt; indx++) { + if (dev->eq_tbl[indx].q.id == eqid) + return indx; + } + + return -EINVAL; +} + +static inline u8 ocrdma_get_asic_type(struct ocrdma_dev *dev) +{ + if (dev->nic_info.dev_family == 0xF && !dev->asic_id) { + pci_read_config_dword( + dev->nic_info.pdev, + OCRDMA_SLI_ASIC_ID_OFFSET, &dev->asic_id); + } + + return (dev->asic_id & OCRDMA_SLI_ASIC_GEN_NUM_MASK) >> + OCRDMA_SLI_ASIC_GEN_NUM_SHIFT; +} #endif diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h index fbac8eb4403..1554cca5712 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h @@ -28,7 +28,8 @@ #ifndef __OCRDMA_ABI_H__ #define __OCRDMA_ABI_H__ -#define OCRDMA_ABI_VERSION 1 +#define OCRDMA_ABI_VERSION 2 +#define OCRDMA_BE_ROCE_ABI_VERSION 1 /* user kernel communication data structures. */ struct ocrdma_alloc_ucontext_resp { @@ -107,9 +108,7 @@ struct ocrdma_create_qp_uresp { u32 db_sq_offset; u32 db_rq_offset; u32 db_shift; - u64 rsvd1; - u64 rsvd2; - u64 rsvd3; + u64 rsvd[11]; } __packed; struct ocrdma_create_srq_uresp { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index ee499d94225..d4cc01f10c0 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -49,7 +49,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, ah->sgid_index = attr->grh.sgid_index; - vlan_tag = rdma_get_vlan_id(&attr->grh.dgid); + vlan_tag = attr->vlan_id; if (!vlan_tag || (vlan_tag > 0xFFF)) vlan_tag = dev->pvid; if (vlan_tag && (vlan_tag < 0x1000)) { @@ -64,7 +64,8 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, eth_sz = sizeof(struct ocrdma_eth_basic); } memcpy(ð.smac[0], &dev->nic_info.mac_addr[0], ETH_ALEN); - status = ocrdma_resolve_dgid(dev, &attr->grh.dgid, ð.dmac[0]); + memcpy(ð.dmac[0], attr->dmac, ETH_ALEN); + status = ocrdma_resolve_dmac(dev, attr, ð.dmac[0]); if (status) return status; status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index, @@ -84,6 +85,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh)); if (vlan_enabled) ah->av->valid |= OCRDMA_AV_VLAN_VALID; + ah->av->valid = cpu_to_le32(ah->av->valid); return status; } @@ -98,7 +100,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) if (!(attr->ah_flags & IB_AH_GRH)) return ERR_PTR(-EINVAL); - ah = kzalloc(sizeof *ah, GFP_ATOMIC); + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 4ed8235d2d3..3bbf2010a82 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -32,7 +32,6 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> -#include <rdma/ib_addr.h> #include "ocrdma.h" #include "ocrdma_hw.h" @@ -150,7 +149,7 @@ enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps) return IB_QPS_SQE; case OCRDMA_QPS_ERR: return IB_QPS_ERR; - }; + } return IB_QPS_ERR; } @@ -171,7 +170,7 @@ static enum ocrdma_qp_state get_ocrdma_qp_state(enum ib_qp_state qps) return OCRDMA_QPS_SQE; case IB_QPS_ERR: return OCRDMA_QPS_ERR; - }; + } return OCRDMA_QPS_ERR; } @@ -243,6 +242,23 @@ static int ocrdma_get_mbx_errno(u32 status) return err_num; } +char *port_speed_string(struct ocrdma_dev *dev) +{ + char *str = ""; + u16 speeds_supported; + + speeds_supported = dev->phy.fixed_speeds_supported | + dev->phy.auto_speeds_supported; + if (speeds_supported & OCRDMA_PHY_SPEED_40GBPS) + str = "40Gbps "; + else if (speeds_supported & OCRDMA_PHY_SPEED_10GBPS) + str = "10Gbps "; + else if (speeds_supported & OCRDMA_PHY_SPEED_1GBPS) + str = "1Gbps "; + + return str; +} + static int ocrdma_get_mbx_cqe_errno(u16 cqe_status) { int err_num = -EINVAL; @@ -332,6 +348,11 @@ static void *ocrdma_init_emb_mqe(u8 opcode, u32 cmd_len) return mqe; } +static void *ocrdma_alloc_mqe(void) +{ + return kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL); +} + static void ocrdma_free_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q) { dma_free_coherent(&dev->nic_info.pdev->dev, q->size, q->va, q->dma); @@ -364,8 +385,8 @@ static void ocrdma_build_q_pages(struct ocrdma_pa *q_pa, int cnt, } } -static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q, - int queue_type) +static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev, + struct ocrdma_queue_info *q, int queue_type) { u8 opcode = 0; int status; @@ -444,7 +465,7 @@ mbx_err: return status; } -static int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) { int irq; @@ -574,6 +595,7 @@ static int ocrdma_create_mq(struct ocrdma_dev *dev) if (status) goto alloc_err; + dev->eq_tbl[0].cq_cnt++; status = ocrdma_mbx_mq_cq_create(dev, &dev->mq.cq, &dev->eq_tbl[0].q); if (status) goto mbx_cq_free; @@ -639,7 +661,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, { struct ocrdma_qp *qp = NULL; struct ocrdma_cq *cq = NULL; - struct ib_event ib_evt; + struct ib_event ib_evt = { 0 }; int cq_event = 0; int qp_event = 1; int srq_event = 0; @@ -664,6 +686,8 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, case OCRDMA_CQ_OVERRUN_ERROR: ib_evt.element.cq = &cq->ibcq; ib_evt.event = IB_EVENT_CQ_ERR; + cq_event = 1; + qp_event = 0; break; case OCRDMA_CQ_QPCAT_ERROR: ib_evt.element.qp = &qp->ibqp; @@ -725,6 +749,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, qp->srq->ibsrq. srq_context); } else if (dev_event) { + pr_err("%s: Fatal event received\n", dev->ibdev.name); ib_dispatch_event(&ib_evt); } @@ -752,7 +777,6 @@ static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev, } } - static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe) { /* async CQE processing */ @@ -799,8 +823,6 @@ static int ocrdma_mq_cq_handler(struct ocrdma_dev *dev, u16 cq_id) ocrdma_process_acqe(dev, cqe); else if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_CMPL_MASK) ocrdma_process_mcqe(dev, cqe); - else - pr_err("%s() cqe->compl is not set.\n", __func__); memset(cqe, 0, sizeof(struct ocrdma_mcqe)); ocrdma_mcq_inc_tail(dev); } @@ -858,16 +880,8 @@ static void ocrdma_qp_cq_handler(struct ocrdma_dev *dev, u16 cq_idx) BUG(); cq = dev->cq_tbl[cq_idx]; - if (cq == NULL) { - pr_err("%s%d invalid id=0x%x\n", __func__, dev->id, cq_idx); + if (cq == NULL) return; - } - spin_lock_irqsave(&cq->cq_lock, flags); - cq->armed = false; - cq->solicited = false; - spin_unlock_irqrestore(&cq->cq_lock, flags); - - ocrdma_ring_cq_db(dev, cq->id, false, false, 0); if (cq->ibcq.comp_handler) { spin_lock_irqsave(&cq->comp_handler_lock, flags); @@ -892,27 +906,35 @@ static irqreturn_t ocrdma_irq_handler(int irq, void *handle) struct ocrdma_dev *dev = eq->dev; struct ocrdma_eqe eqe; struct ocrdma_eqe *ptr; - u16 eqe_popped = 0; u16 cq_id; - while (1) { + int budget = eq->cq_cnt; + + do { ptr = ocrdma_get_eqe(eq); eqe = *ptr; ocrdma_le32_to_cpu(&eqe, sizeof(eqe)); if ((eqe.id_valid & OCRDMA_EQE_VALID_MASK) == 0) break; - eqe_popped += 1; + ptr->id_valid = 0; + /* ring eq doorbell as soon as its consumed. */ + ocrdma_ring_eq_db(dev, eq->q.id, false, true, 1); /* check whether its CQE or not. */ if ((eqe.id_valid & OCRDMA_EQE_FOR_CQE_MASK) == 0) { cq_id = eqe.id_valid >> OCRDMA_EQE_RESOURCE_ID_SHIFT; ocrdma_cq_handler(dev, cq_id); } ocrdma_eq_inc_tail(eq); - } - ocrdma_ring_eq_db(dev, eq->q.id, true, true, eqe_popped); - /* Ring EQ doorbell with num_popped to 0 to enable interrupts again. */ - if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) - ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0); + + /* There can be a stale EQE after the last bound CQ is + * destroyed. EQE valid and budget == 0 implies this. + */ + if (budget) + budget--; + + } while (budget); + + ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0); return IRQ_HANDLED; } @@ -949,7 +971,8 @@ static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe) { int status = 0; u16 cqe_status, ext_status; - struct ocrdma_mqe *rsp; + struct ocrdma_mqe *rsp_mqe; + struct ocrdma_mbx_rsp *rsp = NULL; mutex_lock(&dev->mqe_ctx.lock); ocrdma_post_mqe(dev, mqe); @@ -958,23 +981,61 @@ static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe) goto mbx_err; cqe_status = dev->mqe_ctx.cqe_status; ext_status = dev->mqe_ctx.ext_status; - rsp = ocrdma_get_mqe_rsp(dev); - ocrdma_copy_le32_to_cpu(mqe, rsp, (sizeof(*mqe))); + rsp_mqe = ocrdma_get_mqe_rsp(dev); + ocrdma_copy_le32_to_cpu(mqe, rsp_mqe, (sizeof(*mqe))); + if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> + OCRDMA_MQE_HDR_EMB_SHIFT) + rsp = &mqe->u.rsp; + if (cqe_status || ext_status) { - pr_err("%s() opcode=0x%x, cqe_status=0x%x, ext_status=0x%x\n", - __func__, - (rsp->u.rsp.subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> - OCRDMA_MBX_RSP_OPCODE_SHIFT, cqe_status, ext_status); + pr_err("%s() cqe_status=0x%x, ext_status=0x%x,", + __func__, cqe_status, ext_status); + if (rsp) { + /* This is for embedded cmds. */ + pr_err("opcode=0x%x, subsystem=0x%x\n", + (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> + OCRDMA_MBX_RSP_OPCODE_SHIFT, + (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> + OCRDMA_MBX_RSP_SUBSYS_SHIFT); + } status = ocrdma_get_mbx_cqe_errno(cqe_status); goto mbx_err; } - if (mqe->u.rsp.status & OCRDMA_MBX_RSP_STATUS_MASK) + /* For non embedded, rsp errors are handled in ocrdma_nonemb_mbx_cmd */ + if (rsp && (mqe->u.rsp.status & OCRDMA_MBX_RSP_STATUS_MASK)) status = ocrdma_get_mbx_errno(mqe->u.rsp.status); mbx_err: mutex_unlock(&dev->mqe_ctx.lock); return status; } +static int ocrdma_nonemb_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe, + void *payload_va) +{ + int status = 0; + struct ocrdma_mbx_rsp *rsp = payload_va; + + if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> + OCRDMA_MQE_HDR_EMB_SHIFT) + BUG(); + + status = ocrdma_mbx_cmd(dev, mqe); + if (!status) + /* For non embedded, only CQE failures are handled in + * ocrdma_mbx_cmd. We need to check for RSP errors. + */ + if (rsp->status & OCRDMA_MBX_RSP_STATUS_MASK) + status = ocrdma_get_mbx_errno(rsp->status); + + if (status) + pr_err("opcode=0x%x, subsystem=0x%x\n", + (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> + OCRDMA_MBX_RSP_OPCODE_SHIFT, + (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> + OCRDMA_MBX_RSP_SUBSYS_SHIFT); + return status; +} + static void ocrdma_get_attr(struct ocrdma_dev *dev, struct ocrdma_dev_attr *attr, struct ocrdma_mbx_query_config *rsp) @@ -985,6 +1046,9 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->max_qp = (rsp->qp_srq_cq_ird_ord & OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT; + attr->max_srq = + (rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET; attr->max_send_sge = ((rsp->max_write_send_sge & OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT); @@ -1000,9 +1064,6 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->max_ord_per_qp = (rsp->max_ird_ord_per_qp & OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT; - attr->max_srq = - (rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >> - OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET; attr->max_ird_per_qp = (rsp->max_ird_ord_per_qp & OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT; @@ -1015,6 +1076,7 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->local_ca_ack_delay = (rsp->max_pd_ca_ack_delay & OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK) >> OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT; + attr->max_mw = rsp->max_mw; attr->max_mr = rsp->max_mr; attr->max_mr_size = ~0ull; attr->max_fmr = 0; @@ -1036,7 +1098,7 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->max_inline_data = attr->wqe_size - (sizeof(struct ocrdma_hdr_wqe) + sizeof(struct ocrdma_sge)); - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { attr->ird = 1; attr->ird_page_size = OCRDMA_MIN_Q_PAGE_SIZE; attr->num_ird_pages = MAX_OCRDMA_IRD_PAGES; @@ -1110,6 +1172,96 @@ mbx_err: return status; } +int ocrdma_mbx_rdma_stats(struct ocrdma_dev *dev, bool reset) +{ + struct ocrdma_rdma_stats_req *req = dev->stats_mem.va; + struct ocrdma_mqe *mqe = &dev->stats_mem.mqe; + struct ocrdma_rdma_stats_resp *old_stats = NULL; + int status; + + old_stats = kzalloc(sizeof(*old_stats), GFP_KERNEL); + if (old_stats == NULL) + return -ENOMEM; + + memset(mqe, 0, sizeof(*mqe)); + mqe->hdr.pyld_len = dev->stats_mem.size; + mqe->hdr.spcl_sge_cnt_emb |= + (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dev->stats_mem.pa & 0xffffffff); + mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dev->stats_mem.pa); + mqe->u.nonemb_req.sge[0].len = dev->stats_mem.size; + + /* Cache the old stats */ + memcpy(old_stats, req, sizeof(struct ocrdma_rdma_stats_resp)); + memset(req, 0, dev->stats_mem.size); + + ocrdma_init_mch((struct ocrdma_mbx_hdr *)req, + OCRDMA_CMD_GET_RDMA_STATS, + OCRDMA_SUBSYS_ROCE, + dev->stats_mem.size); + if (reset) + req->reset_stats = reset; + + status = ocrdma_nonemb_mbx_cmd(dev, mqe, dev->stats_mem.va); + if (status) + /* Copy from cache, if mbox fails */ + memcpy(req, old_stats, sizeof(struct ocrdma_rdma_stats_resp)); + else + ocrdma_le32_to_cpu(req, dev->stats_mem.size); + + kfree(old_stats); + return status; +} + +static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_dma_mem dma; + struct ocrdma_mqe *mqe; + struct ocrdma_get_ctrl_attribs_rsp *ctrl_attr_rsp; + struct mgmt_hba_attribs *hba_attribs; + + mqe = ocrdma_alloc_mqe(); + if (!mqe) + return status; + memset(mqe, 0, sizeof(*mqe)); + + dma.size = sizeof(struct ocrdma_get_ctrl_attribs_rsp); + dma.va = dma_alloc_coherent(&dev->nic_info.pdev->dev, + dma.size, &dma.pa, GFP_KERNEL); + if (!dma.va) + goto free_mqe; + + mqe->hdr.pyld_len = dma.size; + mqe->hdr.spcl_sge_cnt_emb |= + (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dma.pa & 0xffffffff); + mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dma.pa); + mqe->u.nonemb_req.sge[0].len = dma.size; + + memset(dma.va, 0, dma.size); + ocrdma_init_mch((struct ocrdma_mbx_hdr *)dma.va, + OCRDMA_CMD_GET_CTRL_ATTRIBUTES, + OCRDMA_SUBSYS_COMMON, + dma.size); + + status = ocrdma_nonemb_mbx_cmd(dev, mqe, dma.va); + if (!status) { + ctrl_attr_rsp = (struct ocrdma_get_ctrl_attribs_rsp *)dma.va; + hba_attribs = &ctrl_attr_rsp->ctrl_attribs.hba_attribs; + + dev->hba_port_num = hba_attribs->phy_port; + strncpy(dev->model_number, + hba_attribs->controller_model_number, 31); + } + dma_free_coherent(&dev->nic_info.pdev->dev, dma.size, dma.va, dma.pa); +free_mqe: + kfree(mqe); + return status; +} + static int ocrdma_mbx_query_dev(struct ocrdma_dev *dev) { int status = -ENOMEM; @@ -1157,6 +1309,35 @@ mbx_err: return status; } +static int ocrdma_mbx_get_phy_info(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mqe *cmd; + struct ocrdma_get_phy_info_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_PHY_DETAILS, sizeof(*cmd)); + if (!cmd) + return status; + + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_PHY_DETAILS, OCRDMA_SUBSYS_COMMON, + sizeof(*cmd)); + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_phy_info_rsp *)cmd; + dev->phy.phy_type = le16_to_cpu(rsp->phy_type); + dev->phy.auto_speeds_supported = + le16_to_cpu(rsp->auto_speeds_supported); + dev->phy.fixed_speeds_supported = + le16_to_cpu(rsp->fixed_speeds_supported); +mbx_err: + kfree(cmd); + return status; +} + int ocrdma_mbx_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) { int status = -ENOMEM; @@ -1226,7 +1407,7 @@ static int ocrdma_build_q_conf(u32 *num_entries, int entry_size, static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev) { - int i ; + int i; int status = 0; int max_ah; struct ocrdma_create_ah_tbl *cmd; @@ -1357,12 +1538,10 @@ static void ocrdma_unbind_eq(struct ocrdma_dev *dev, u16 eq_id) int i; mutex_lock(&dev->dev_lock); - for (i = 0; i < dev->eq_cnt; i++) { - if (dev->eq_tbl[i].q.id != eq_id) - continue; - dev->eq_tbl[i].cq_cnt -= 1; - break; - } + i = ocrdma_get_eq_table_index(dev, eq_id); + if (i == -EINVAL) + BUG(); + dev->eq_tbl[i].cq_cnt -= 1; mutex_unlock(&dev->dev_lock); } @@ -1380,7 +1559,7 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, __func__, dev->id, dev->attr.max_cqe, entries); return -EINVAL; } - if (dpp_cq && (dev->nic_info.dev_family != OCRDMA_GEN2_FAMILY)) + if (dpp_cq && (ocrdma_get_asic_type(dev) != OCRDMA_ASIC_GEN_SKH_R)) return -EINVAL; if (dpp_cq) { @@ -1417,6 +1596,7 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, cq->eqn = ocrdma_bind_eq(dev); cmd->cmd.req.rsvd_version = OCRDMA_CREATE_CQ_VER3; cqe_count = cq->len / cqe_size; + cq->cqe_cnt = cqe_count; if (cqe_count > 1024) { /* Set cnt to 3 to indicate more than 1024 cq entries */ cmd->cmd.ev_cnt_flags |= (0x3 << OCRDMA_CREATE_CQ_CNT_SHIFT); @@ -1439,7 +1619,7 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, } /* shared eq between all the consumer cqs. */ cmd->cmd.eqn = cq->eqn; - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { if (dpp_cq) cmd->cmd.pgsz_pgcnt |= OCRDMA_CREATE_CQ_DPP << OCRDMA_CREATE_CQ_TYPE_SHIFT; @@ -1484,12 +1664,9 @@ int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq) (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) & OCRDMA_DESTROY_CQ_QID_MASK; - ocrdma_unbind_eq(dev, cq->eqn); status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); - if (status) - goto mbx_err; + ocrdma_unbind_eq(dev, cq->eqn); dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa); -mbx_err: kfree(cmd); return status; } @@ -1783,7 +1960,7 @@ static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd, u32 max_sges = attrs->cap.max_send_sge; /* QP1 may exceed 127 */ - max_wqe_allocated = min_t(int, attrs->cap.max_send_wr + 1, + max_wqe_allocated = min_t(u32, attrs->cap.max_send_wr + 1, dev->attr.max_wqe); status = ocrdma_build_q_conf(&max_wqe_allocated, @@ -1982,7 +2159,7 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs, break; default: return -EINVAL; - }; + } cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_QP, sizeof(*cmd)); if (!cmd) @@ -2029,8 +2206,7 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs, OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK; qp->rq_cq = cq; - if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp && - (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) { + if (pd->dpp_enabled && pd->num_dpp_qp) { ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq, dpp_cq_id); } @@ -2076,23 +2252,6 @@ mbx_err: return status; } -int ocrdma_resolve_dgid(struct ocrdma_dev *dev, union ib_gid *dgid, - u8 *mac_addr) -{ - struct in6_addr in6; - - memcpy(&in6, dgid, sizeof in6); - if (rdma_is_multicast_addr(&in6)) { - rdma_get_mcast_mac(&in6, mac_addr); - } else if (rdma_link_local_addr(&in6)) { - rdma_get_ll_mac(&in6, mac_addr); - } else { - pr_err("%s() fail to resolve mac_addr.\n", __func__); - return -EINVAL; - } - return 0; -} - static int ocrdma_set_av_params(struct ocrdma_qp *qp, struct ocrdma_modify_qp *cmd, struct ib_qp_attr *attrs) @@ -2116,7 +2275,7 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0], sizeof(cmd->params.dgid)); status = ocrdma_query_gid(&qp->dev->ibdev, 1, - ah_attr->grh.sgid_index, &sgid); + ah_attr->grh.sgid_index, &sgid); if (status) return status; @@ -2126,14 +2285,14 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, qp->sgid_idx = ah_attr->grh.sgid_index; memcpy(&cmd->params.sgid[0], &sgid.raw[0], sizeof(cmd->params.sgid)); - ocrdma_resolve_dgid(qp->dev, &ah_attr->grh.dgid, &mac_addr[0]); + ocrdma_resolve_dmac(qp->dev, ah_attr, &mac_addr[0]); cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) | (mac_addr[2] << 16) | (mac_addr[3] << 24); /* convert them to LE format. */ ocrdma_cpu_to_le32(&cmd->params.dgid[0], sizeof(cmd->params.dgid)); ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid)); cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8); - vlan_id = rdma_get_vlan_id(&sgid); + vlan_id = ah_attr->vlan_id; if (vlan_id && (vlan_id < 0x1000)) { cmd->params.vlan_dmac_b4_to_b5 |= vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT; @@ -2144,8 +2303,7 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, static int ocrdma_set_qp_params(struct ocrdma_qp *qp, struct ocrdma_modify_qp *cmd, - struct ib_qp_attr *attrs, int attr_mask, - enum ib_qp_state old_qps) + struct ib_qp_attr *attrs, int attr_mask) { int status = 0; @@ -2250,8 +2408,7 @@ pmtu_err: } int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp, - struct ib_qp_attr *attrs, int attr_mask, - enum ib_qp_state old_qps) + struct ib_qp_attr *attrs, int attr_mask) { int status = -ENOMEM; struct ocrdma_modify_qp *cmd; @@ -2274,7 +2431,7 @@ int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp, OCRDMA_QP_PARAMS_STATE_MASK; } - status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask, old_qps); + status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask); if (status) goto mbx_err; status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); @@ -2505,7 +2662,7 @@ static int ocrdma_create_eqs(struct ocrdma_dev *dev) for (i = 0; i < num_eq; i++) { status = ocrdma_create_eq(dev, &dev->eq_tbl[i], - OCRDMA_EQ_LEN); + OCRDMA_EQ_LEN); if (status) { status = -EINVAL; break; @@ -2550,6 +2707,13 @@ int ocrdma_init_hw(struct ocrdma_dev *dev) status = ocrdma_mbx_create_ah_tbl(dev); if (status) goto conf_err; + status = ocrdma_mbx_get_phy_info(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_get_ctrl_attribs(dev); + if (status) + goto conf_err; + return 0; conf_err: diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h index f2a89d4cc7c..e513f729314 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h @@ -94,7 +94,6 @@ void ocrdma_ring_cq_db(struct ocrdma_dev *, u16 cq_id, bool armed, int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed); int ocrdma_query_config(struct ocrdma_dev *, struct ocrdma_mbx_query_config *config); -int ocrdma_resolve_dgid(struct ocrdma_dev *, union ib_gid *dgid, u8 *mac_addr); int ocrdma_mbx_alloc_pd(struct ocrdma_dev *, struct ocrdma_pd *); int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *, struct ocrdma_pd *); @@ -113,8 +112,7 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs, u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset, u16 *dpp_credit_lmt); int ocrdma_mbx_modify_qp(struct ocrdma_dev *, struct ocrdma_qp *, - struct ib_qp_attr *attrs, int attr_mask, - enum ib_qp_state old_qps); + struct ib_qp_attr *attrs, int attr_mask); int ocrdma_mbx_query_qp(struct ocrdma_dev *, struct ocrdma_qp *, struct ocrdma_qp_params *param); int ocrdma_mbx_destroy_qp(struct ocrdma_dev *, struct ocrdma_qp *); @@ -133,5 +131,8 @@ int ocrdma_qp_state_change(struct ocrdma_qp *, enum ib_qp_state new_state, bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *); bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *); void ocrdma_flush_qp(struct ocrdma_qp *); +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq); +int ocrdma_mbx_rdma_stats(struct ocrdma_dev *, bool reset); +char *port_speed_string(struct ocrdma_dev *dev); #endif /* __OCRDMA_HW_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 56e004940f1..7c504e07974 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -39,10 +39,11 @@ #include "ocrdma_ah.h" #include "be_roce.h" #include "ocrdma_hw.h" +#include "ocrdma_stats.h" #include "ocrdma_abi.h" -MODULE_VERSION(OCRDMA_ROCE_DEV_VERSION); -MODULE_DESCRIPTION("Emulex RoCE HCA Driver"); +MODULE_VERSION(OCRDMA_ROCE_DRV_VERSION); +MODULE_DESCRIPTION(OCRDMA_ROCE_DRV_DESC " " OCRDMA_ROCE_DRV_VERSION); MODULE_AUTHOR("Emulex Corporation"); MODULE_LICENSE("GPL"); @@ -67,46 +68,24 @@ void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid) guid[7] = mac_addr[5]; } -static void ocrdma_build_sgid_mac(union ib_gid *sgid, unsigned char *mac_addr, - bool is_vlan, u16 vlan_id) -{ - sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); - sgid->raw[8] = mac_addr[0] ^ 2; - sgid->raw[9] = mac_addr[1]; - sgid->raw[10] = mac_addr[2]; - if (is_vlan) { - sgid->raw[11] = vlan_id >> 8; - sgid->raw[12] = vlan_id & 0xff; - } else { - sgid->raw[11] = 0xff; - sgid->raw[12] = 0xfe; - } - sgid->raw[13] = mac_addr[3]; - sgid->raw[14] = mac_addr[4]; - sgid->raw[15] = mac_addr[5]; -} - -static bool ocrdma_add_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr, - bool is_vlan, u16 vlan_id) +static bool ocrdma_add_sgid(struct ocrdma_dev *dev, union ib_gid *new_sgid) { int i; - union ib_gid new_sgid; unsigned long flags; memset(&ocrdma_zero_sgid, 0, sizeof(union ib_gid)); - ocrdma_build_sgid_mac(&new_sgid, mac_addr, is_vlan, vlan_id); spin_lock_irqsave(&dev->sgid_lock, flags); for (i = 0; i < OCRDMA_MAX_SGID; i++) { if (!memcmp(&dev->sgid_tbl[i], &ocrdma_zero_sgid, sizeof(union ib_gid))) { /* found free entry */ - memcpy(&dev->sgid_tbl[i], &new_sgid, + memcpy(&dev->sgid_tbl[i], new_sgid, sizeof(union ib_gid)); spin_unlock_irqrestore(&dev->sgid_lock, flags); return true; - } else if (!memcmp(&dev->sgid_tbl[i], &new_sgid, + } else if (!memcmp(&dev->sgid_tbl[i], new_sgid, sizeof(union ib_gid))) { /* entry already present, no addition is required. */ spin_unlock_irqrestore(&dev->sgid_lock, flags); @@ -117,20 +96,17 @@ static bool ocrdma_add_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr, return false; } -static bool ocrdma_del_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr, - bool is_vlan, u16 vlan_id) +static bool ocrdma_del_sgid(struct ocrdma_dev *dev, union ib_gid *sgid) { int found = false; int i; - union ib_gid sgid; unsigned long flags; - ocrdma_build_sgid_mac(&sgid, mac_addr, is_vlan, vlan_id); spin_lock_irqsave(&dev->sgid_lock, flags); /* first is default sgid, which cannot be deleted. */ for (i = 1; i < OCRDMA_MAX_SGID; i++) { - if (!memcmp(&dev->sgid_tbl[i], &sgid, sizeof(union ib_gid))) { + if (!memcmp(&dev->sgid_tbl[i], sgid, sizeof(union ib_gid))) { /* found matching entry */ memset(&dev->sgid_tbl[i], 0, sizeof(union ib_gid)); found = true; @@ -141,75 +117,18 @@ static bool ocrdma_del_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr, return found; } -static void ocrdma_add_default_sgid(struct ocrdma_dev *dev) -{ - /* GID Index 0 - Invariant manufacturer-assigned EUI-64 */ - union ib_gid *sgid = &dev->sgid_tbl[0]; - - sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); - ocrdma_get_guid(dev, &sgid->raw[8]); -} - -#if IS_ENABLED(CONFIG_VLAN_8021Q) -static void ocrdma_add_vlan_sgids(struct ocrdma_dev *dev) -{ - struct net_device *netdev, *tmp; - u16 vlan_id; - bool is_vlan; - - netdev = dev->nic_info.netdev; - - rcu_read_lock(); - for_each_netdev_rcu(&init_net, tmp) { - if (netdev == tmp || vlan_dev_real_dev(tmp) == netdev) { - if (!netif_running(tmp) || !netif_oper_up(tmp)) - continue; - if (netdev != tmp) { - vlan_id = vlan_dev_vlan_id(tmp); - is_vlan = true; - } else { - is_vlan = false; - vlan_id = 0; - tmp = netdev; - } - ocrdma_add_sgid(dev, tmp->dev_addr, is_vlan, vlan_id); - } - } - rcu_read_unlock(); -} -#else -static void ocrdma_add_vlan_sgids(struct ocrdma_dev *dev) -{ - -} -#endif /* VLAN */ - -static int ocrdma_build_sgid_tbl(struct ocrdma_dev *dev) -{ - ocrdma_add_default_sgid(dev); - ocrdma_add_vlan_sgids(dev); - return 0; -} - -#if IS_ENABLED(CONFIG_IPV6) - -static int ocrdma_inet6addr_event(struct notifier_block *notifier, - unsigned long event, void *ptr) +static int ocrdma_addr_event(unsigned long event, struct net_device *netdev, + union ib_gid *gid) { - struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; - struct net_device *netdev = ifa->idev->dev; struct ib_event gid_event; struct ocrdma_dev *dev; bool found = false; bool updated = false; bool is_vlan = false; - u16 vid = 0; is_vlan = netdev->priv_flags & IFF_802_1Q_VLAN; - if (is_vlan) { - vid = vlan_dev_vlan_id(netdev); - netdev = vlan_dev_real_dev(netdev); - } + if (is_vlan) + netdev = rdma_vlan_dev_real_dev(netdev); rcu_read_lock(); list_for_each_entry_rcu(dev, &ocrdma_dev_list, entry) { @@ -222,16 +141,14 @@ static int ocrdma_inet6addr_event(struct notifier_block *notifier, if (!found) return NOTIFY_DONE; - if (!rdma_link_local_addr((struct in6_addr *)&ifa->addr)) - return NOTIFY_DONE; mutex_lock(&dev->dev_lock); switch (event) { case NETDEV_UP: - updated = ocrdma_add_sgid(dev, netdev->dev_addr, is_vlan, vid); + updated = ocrdma_add_sgid(dev, gid); break; case NETDEV_DOWN: - updated = ocrdma_del_sgid(dev, netdev->dev_addr, is_vlan, vid); + updated = ocrdma_del_sgid(dev, gid); break; default: break; @@ -247,6 +164,32 @@ static int ocrdma_inet6addr_event(struct notifier_block *notifier, return NOTIFY_OK; } +static int ocrdma_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + union ib_gid gid; + struct net_device *netdev = ifa->ifa_dev->dev; + + ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid); + return ocrdma_addr_event(event, netdev, &gid); +} + +static struct notifier_block ocrdma_inetaddr_notifier = { + .notifier_call = ocrdma_inetaddr_event +}; + +#if IS_ENABLED(CONFIG_IPV6) + +static int ocrdma_inet6addr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + union ib_gid *gid = (union ib_gid *)&ifa->addr; + struct net_device *netdev = ifa->idev->dev; + return ocrdma_addr_event(event, netdev, gid); +} + static struct notifier_block ocrdma_inet6addr_notifier = { .notifier_call = ocrdma_inet6addr_event }; @@ -344,7 +287,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.process_mad = ocrdma_process_mad; - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { dev->ibdev.uverbs_cmd_mask |= OCRDMA_UVERBS(CREATE_SRQ) | OCRDMA_UVERBS(MODIFY_SRQ) | @@ -396,9 +339,42 @@ static void ocrdma_free_resources(struct ocrdma_dev *dev) kfree(dev->sgid_tbl); } +/* OCRDMA sysfs interface */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "%s", &dev->attr.fw_ver[0]); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); + +static struct device_attribute *ocrdma_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver +}; + +static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) + device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]); +} + static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) { - int status = 0; + int status = 0, i; struct ocrdma_dev *dev; dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev)); @@ -423,19 +399,29 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) if (status) goto alloc_err; - status = ocrdma_build_sgid_tbl(dev); - if (status) - goto alloc_err; - status = ocrdma_register_device(dev); if (status) goto alloc_err; + for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) + if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i])) + goto sysfs_err; spin_lock(&ocrdma_devlist_lock); list_add_tail_rcu(&dev->entry, &ocrdma_dev_list); spin_unlock(&ocrdma_devlist_lock); + /* Init stats */ + ocrdma_add_port_stats(dev); + + pr_info("%s %s: %s \"%s\" port %d\n", + dev_name(&dev->nic_info.pdev->dev), hca_name(dev), + port_speed_string(dev), dev->model_number, + dev->hba_port_num); + pr_info("%s ocrdma%d driver loaded successfully\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); return dev; +sysfs_err: + ocrdma_remove_sysfiles(dev); alloc_err: ocrdma_free_resources(dev); ocrdma_cleanup_hw(dev); @@ -452,9 +438,6 @@ static void ocrdma_remove_free(struct rcu_head *rcu) { struct ocrdma_dev *dev = container_of(rcu, struct ocrdma_dev, rcu); - ocrdma_free_resources(dev); - ocrdma_cleanup_hw(dev); - idr_remove(&ocrdma_dev_id, dev->id); kfree(dev->mbx_cmd); ib_dealloc_device(&dev->ibdev); @@ -465,11 +448,18 @@ static void ocrdma_remove(struct ocrdma_dev *dev) /* first unregister with stack to stop all the active traffic * of the registered clients. */ + ocrdma_rem_port_stats(dev); + ocrdma_remove_sysfiles(dev); + ib_unregister_device(&dev->ibdev); spin_lock(&ocrdma_devlist_lock); list_del_rcu(&dev->entry); spin_unlock(&ocrdma_devlist_lock); + + ocrdma_free_resources(dev); + ocrdma_cleanup_hw(dev); + call_rcu(&dev->rcu, ocrdma_remove_free); } @@ -498,7 +488,7 @@ static int ocrdma_close(struct ocrdma_dev *dev) cur_qp = dev->qp_tbl; for (i = 0; i < OCRDMA_MAX_QP; i++) { qp = cur_qp[i]; - if (qp) { + if (qp && qp->ibqp.qp_type != IB_QPT_GSI) { /* change the QP state to ERROR */ _ocrdma_modify_qp(&qp->ibqp, &attrs, attr_mask); @@ -531,7 +521,7 @@ static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event) case BE_DEV_DOWN: ocrdma_close(dev); break; - }; + } } static struct ocrdma_driver ocrdma_drv = { @@ -539,6 +529,7 @@ static struct ocrdma_driver ocrdma_drv = { .add = ocrdma_add, .remove = ocrdma_remove, .state_change_handler = ocrdma_event_handler, + .be_abi_version = OCRDMA_BE_ROCE_ABI_VERSION, }; static void ocrdma_unregister_inet6addr_notifier(void) @@ -548,20 +539,37 @@ static void ocrdma_unregister_inet6addr_notifier(void) #endif } +static void ocrdma_unregister_inetaddr_notifier(void) +{ + unregister_inetaddr_notifier(&ocrdma_inetaddr_notifier); +} + static int __init ocrdma_init_module(void) { int status; + ocrdma_init_debugfs(); + + status = register_inetaddr_notifier(&ocrdma_inetaddr_notifier); + if (status) + return status; + #if IS_ENABLED(CONFIG_IPV6) status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier); if (status) - return status; + goto err_notifier6; #endif status = be_roce_register_driver(&ocrdma_drv); if (status) - ocrdma_unregister_inet6addr_notifier(); + goto err_be_reg; + return 0; + +err_be_reg: + ocrdma_unregister_inet6addr_notifier(); +err_notifier6: + ocrdma_unregister_inetaddr_notifier(); return status; } @@ -569,6 +577,8 @@ static void __exit ocrdma_exit_module(void) { be_roce_unregister_driver(&ocrdma_drv); ocrdma_unregister_inet6addr_notifier(); + ocrdma_unregister_inetaddr_notifier(); + ocrdma_rem_debugfs(); } module_init(ocrdma_init_module); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 9f9570ec3c2..96c9ee602ba 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -30,8 +30,16 @@ #define Bit(_b) (1 << (_b)) -#define OCRDMA_GEN1_FAMILY 0xB -#define OCRDMA_GEN2_FAMILY 0x2 +enum { + OCRDMA_ASIC_GEN_SKH_R = 0x04, + OCRDMA_ASIC_GEN_LANCER = 0x0B +}; + +enum { + OCRDMA_ASIC_REV_A0 = 0x00, + OCRDMA_ASIC_REV_B0 = 0x10, + OCRDMA_ASIC_REV_C0 = 0x20 +}; #define OCRDMA_SUBSYS_ROCE 10 enum { @@ -64,6 +72,7 @@ enum { OCRDMA_CMD_ATTACH_MCAST, OCRDMA_CMD_DETACH_MCAST, + OCRDMA_CMD_GET_RDMA_STATS, OCRDMA_CMD_MAX }; @@ -74,12 +83,14 @@ enum { OCRDMA_CMD_CREATE_CQ = 12, OCRDMA_CMD_CREATE_EQ = 13, OCRDMA_CMD_CREATE_MQ = 21, + OCRDMA_CMD_GET_CTRL_ATTRIBUTES = 32, OCRDMA_CMD_GET_FW_VER = 35, OCRDMA_CMD_DELETE_MQ = 53, OCRDMA_CMD_DELETE_CQ = 54, OCRDMA_CMD_DELETE_EQ = 55, OCRDMA_CMD_GET_FW_CONFIG = 58, - OCRDMA_CMD_CREATE_MQ_EXT = 90 + OCRDMA_CMD_CREATE_MQ_EXT = 90, + OCRDMA_CMD_PHY_DETAILS = 102 }; enum { @@ -103,7 +114,10 @@ enum { OCRDMA_DB_GEN2_SRQ_OFFSET = OCRDMA_DB_GEN2_RQ_OFFSET, OCRDMA_DB_CQ_OFFSET = 0x120, OCRDMA_DB_EQ_OFFSET = OCRDMA_DB_CQ_OFFSET, - OCRDMA_DB_MQ_OFFSET = 0x140 + OCRDMA_DB_MQ_OFFSET = 0x140, + + OCRDMA_DB_SQ_SHIFT = 16, + OCRDMA_DB_RQ_SHIFT = 24 }; #define OCRDMA_DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ @@ -138,6 +152,10 @@ enum { #define OCRDMA_MIN_Q_PAGE_SIZE (4096) #define OCRDMA_MAX_Q_PAGES (8) +#define OCRDMA_SLI_ASIC_ID_OFFSET 0x9C +#define OCRDMA_SLI_ASIC_REV_MASK 0x000000FF +#define OCRDMA_SLI_ASIC_GEN_NUM_MASK 0x0000FF00 +#define OCRDMA_SLI_ASIC_GEN_NUM_SHIFT 0x08 /* # 0: 4K Bytes # 1: 8K Bytes @@ -562,6 +580,30 @@ enum { OCRDMA_FN_MODE_RDMA = 0x4 }; +struct ocrdma_get_phy_info_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u16 phy_type; + u16 interface_type; + u32 misc_params; + u16 ext_phy_details; + u16 rsvd; + u16 auto_speeds_supported; + u16 fixed_speeds_supported; + u32 future_use[2]; +}; + +enum { + OCRDMA_PHY_SPEED_ZERO = 0x0, + OCRDMA_PHY_SPEED_10MBPS = 0x1, + OCRDMA_PHY_SPEED_100MBPS = 0x2, + OCRDMA_PHY_SPEED_1GBPS = 0x4, + OCRDMA_PHY_SPEED_10GBPS = 0x8, + OCRDMA_PHY_SPEED_40GBPS = 0x20 +}; + + struct ocrdma_get_link_speed_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; @@ -590,7 +632,7 @@ enum { enum { OCRDMA_CREATE_CQ_VER2 = 2, - OCRDMA_CREATE_CQ_VER3 = 3, + OCRDMA_CREATE_CQ_VER3 = 3, OCRDMA_CREATE_CQ_PAGE_CNT_MASK = 0xFFFF, OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT = 16, @@ -1050,6 +1092,7 @@ enum { OCRDMA_MODIFY_QP_RSP_MAX_ORD_MASK = 0xFFFF << OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT }; + struct ocrdma_modify_qp_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; @@ -1062,8 +1105,8 @@ struct ocrdma_query_qp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_hdr req; -#define OCRDMA_QUERY_UP_QP_ID_SHIFT 0 -#define OCRDMA_QUERY_UP_QP_ID_MASK 0xFFFFFF +#define OCRDMA_QUERY_UP_QP_ID_SHIFT 0 +#define OCRDMA_QUERY_UP_QP_ID_MASK 0xFFFFFF u32 qp_id; }; @@ -1694,7 +1737,7 @@ struct ocrdma_grh { u16 rsvd; } __packed; -#define OCRDMA_AV_VALID Bit(0) +#define OCRDMA_AV_VALID Bit(7) #define OCRDMA_AV_VLAN_VALID Bit(1) struct ocrdma_av { @@ -1703,4 +1746,208 @@ struct ocrdma_av { u32 valid; } __packed; +struct ocrdma_rsrc_stats { + u32 dpp_pds; + u32 non_dpp_pds; + u32 rc_dpp_qps; + u32 uc_dpp_qps; + u32 ud_dpp_qps; + u32 rc_non_dpp_qps; + u32 rsvd; + u32 uc_non_dpp_qps; + u32 ud_non_dpp_qps; + u32 rsvd1; + u32 srqs; + u32 rbqs; + u32 r64K_nsmr; + u32 r64K_to_2M_nsmr; + u32 r2M_to_44M_nsmr; + u32 r44M_to_1G_nsmr; + u32 r1G_to_4G_nsmr; + u32 nsmr_count_4G_to_32G; + u32 r32G_to_64G_nsmr; + u32 r64G_to_128G_nsmr; + u32 r128G_to_higher_nsmr; + u32 embedded_nsmr; + u32 frmr; + u32 prefetch_qps; + u32 ondemand_qps; + u32 phy_mr; + u32 mw; + u32 rsvd2[7]; +}; + +struct ocrdma_db_err_stats { + u32 sq_doorbell_errors; + u32 cq_doorbell_errors; + u32 rq_srq_doorbell_errors; + u32 cq_overflow_errors; + u32 rsvd[4]; +}; + +struct ocrdma_wqe_stats { + u32 large_send_rc_wqes_lo; + u32 large_send_rc_wqes_hi; + u32 large_write_rc_wqes_lo; + u32 large_write_rc_wqes_hi; + u32 rsvd[4]; + u32 read_wqes_lo; + u32 read_wqes_hi; + u32 frmr_wqes_lo; + u32 frmr_wqes_hi; + u32 mw_bind_wqes_lo; + u32 mw_bind_wqes_hi; + u32 invalidate_wqes_lo; + u32 invalidate_wqes_hi; + u32 rsvd1[2]; + u32 dpp_wqe_drops; + u32 rsvd2[5]; +}; + +struct ocrdma_tx_stats { + u32 send_pkts_lo; + u32 send_pkts_hi; + u32 write_pkts_lo; + u32 write_pkts_hi; + u32 read_pkts_lo; + u32 read_pkts_hi; + u32 read_rsp_pkts_lo; + u32 read_rsp_pkts_hi; + u32 ack_pkts_lo; + u32 ack_pkts_hi; + u32 send_bytes_lo; + u32 send_bytes_hi; + u32 write_bytes_lo; + u32 write_bytes_hi; + u32 read_req_bytes_lo; + u32 read_req_bytes_hi; + u32 read_rsp_bytes_lo; + u32 read_rsp_bytes_hi; + u32 ack_timeouts; + u32 rsvd[5]; +}; + + +struct ocrdma_tx_qp_err_stats { + u32 local_length_errors; + u32 local_protection_errors; + u32 local_qp_operation_errors; + u32 retry_count_exceeded_errors; + u32 rnr_retry_count_exceeded_errors; + u32 rsvd[3]; +}; + +struct ocrdma_rx_stats { + u32 roce_frame_bytes_lo; + u32 roce_frame_bytes_hi; + u32 roce_frame_icrc_drops; + u32 roce_frame_payload_len_drops; + u32 ud_drops; + u32 qp1_drops; + u32 psn_error_request_packets; + u32 psn_error_resp_packets; + u32 rnr_nak_timeouts; + u32 rnr_nak_receives; + u32 roce_frame_rxmt_drops; + u32 nak_count_psn_sequence_errors; + u32 rc_drop_count_lookup_errors; + u32 rq_rnr_naks; + u32 srq_rnr_naks; + u32 roce_frames_lo; + u32 roce_frames_hi; + u32 rsvd; +}; + +struct ocrdma_rx_qp_err_stats { + u32 nak_invalid_requst_errors; + u32 nak_remote_operation_errors; + u32 nak_count_remote_access_errors; + u32 local_length_errors; + u32 local_protection_errors; + u32 local_qp_operation_errors; + u32 rsvd[2]; +}; + +struct ocrdma_tx_dbg_stats { + u32 data[100]; +}; + +struct ocrdma_rx_dbg_stats { + u32 data[200]; +}; + +struct ocrdma_rdma_stats_req { + struct ocrdma_mbx_hdr hdr; + u8 reset_stats; + u8 rsvd[3]; +} __packed; + +struct ocrdma_rdma_stats_resp { + struct ocrdma_mbx_hdr hdr; + struct ocrdma_rsrc_stats act_rsrc_stats; + struct ocrdma_rsrc_stats th_rsrc_stats; + struct ocrdma_db_err_stats db_err_stats; + struct ocrdma_wqe_stats wqe_stats; + struct ocrdma_tx_stats tx_stats; + struct ocrdma_tx_qp_err_stats tx_qp_err_stats; + struct ocrdma_rx_stats rx_stats; + struct ocrdma_rx_qp_err_stats rx_qp_err_stats; + struct ocrdma_tx_dbg_stats tx_dbg_stats; + struct ocrdma_rx_dbg_stats rx_dbg_stats; +} __packed; + + +struct mgmt_hba_attribs { + u8 flashrom_version_string[32]; + u8 manufacturer_name[32]; + u32 supported_modes; + u32 rsvd0[3]; + u8 ncsi_ver_string[12]; + u32 default_extended_timeout; + u8 controller_model_number[32]; + u8 controller_description[64]; + u8 controller_serial_number[32]; + u8 ip_version_string[32]; + u8 firmware_version_string[32]; + u8 bios_version_string[32]; + u8 redboot_version_string[32]; + u8 driver_version_string[32]; + u8 fw_on_flash_version_string[32]; + u32 functionalities_supported; + u16 max_cdblength; + u8 asic_revision; + u8 generational_guid[16]; + u8 hba_port_count; + u16 default_link_down_timeout; + u8 iscsi_ver_min_max; + u8 multifunction_device; + u8 cache_valid; + u8 hba_status; + u8 max_domains_supported; + u8 phy_port; + u32 firmware_post_status; + u32 hba_mtu[8]; + u32 rsvd1[4]; +}; + +struct mgmt_controller_attrib { + struct mgmt_hba_attribs hba_attribs; + u16 pci_vendor_id; + u16 pci_device_id; + u16 pci_sub_vendor_id; + u16 pci_sub_system_id; + u8 pci_bus_number; + u8 pci_device_number; + u8 pci_function_number; + u8 interface_type; + u64 unique_identifier; + u32 rsvd0[5]; +}; + +struct ocrdma_get_ctrl_attribs_rsp { + struct ocrdma_mbx_hdr hdr; + struct mgmt_controller_attrib ctrl_attribs; +}; + + #endif /* __OCRDMA_SLI_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c new file mode 100644 index 00000000000..41a9aec9998 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -0,0 +1,616 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2014 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include <rdma/ib_addr.h> +#include "ocrdma_stats.h" + +static struct dentry *ocrdma_dbgfs_dir; + +static int ocrdma_add_stat(char *start, char *pcur, + char *name, u64 count) +{ + char buff[128] = {0}; + int cpy_len = 0; + + snprintf(buff, 128, "%s: %llu\n", name, count); + cpy_len = strlen(buff); + + if (pcur + cpy_len > start + OCRDMA_MAX_DBGFS_MEM) { + pr_err("%s: No space in stats buff\n", __func__); + return 0; + } + + memcpy(pcur, buff, cpy_len); + return cpy_len; +} + +static bool ocrdma_alloc_stats_mem(struct ocrdma_dev *dev) +{ + struct stats_mem *mem = &dev->stats_mem; + + /* Alloc mbox command mem*/ + mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req), + sizeof(struct ocrdma_rdma_stats_resp)); + + mem->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, mem->size, + &mem->pa, GFP_KERNEL); + if (!mem->va) { + pr_err("%s: stats mbox allocation failed\n", __func__); + return false; + } + + memset(mem->va, 0, mem->size); + + /* Alloc debugfs mem */ + mem->debugfs_mem = kzalloc(OCRDMA_MAX_DBGFS_MEM, GFP_KERNEL); + if (!mem->debugfs_mem) { + pr_err("%s: stats debugfs mem allocation failed\n", __func__); + return false; + } + + return true; +} + +static void ocrdma_release_stats_mem(struct ocrdma_dev *dev) +{ + struct stats_mem *mem = &dev->stats_mem; + + if (mem->va) + dma_free_coherent(&dev->nic_info.pdev->dev, mem->size, + mem->va, mem->pa); + kfree(mem->debugfs_mem); +} + +static char *ocrdma_resource_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "active_dpp_pds", + (u64)rsrc_stats->dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "active_non_dpp_pds", + (u64)rsrc_stats->non_dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "active_rc_dpp_qps", + (u64)rsrc_stats->rc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_uc_dpp_qps", + (u64)rsrc_stats->uc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ud_dpp_qps", + (u64)rsrc_stats->ud_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_rc_non_dpp_qps", + (u64)rsrc_stats->rc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_uc_non_dpp_qps", + (u64)rsrc_stats->uc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ud_non_dpp_qps", + (u64)rsrc_stats->ud_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_srqs", + (u64)rsrc_stats->srqs); + pcur += ocrdma_add_stat(stats, pcur, "active_rbqs", + (u64)rsrc_stats->rbqs); + pcur += ocrdma_add_stat(stats, pcur, "active_64K_nsmr", + (u64)rsrc_stats->r64K_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_64K_to_2M_nsmr", + (u64)rsrc_stats->r64K_to_2M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_2M_to_44M_nsmr", + (u64)rsrc_stats->r2M_to_44M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_44M_to_1G_nsmr", + (u64)rsrc_stats->r44M_to_1G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_1G_to_4G_nsmr", + (u64)rsrc_stats->r1G_to_4G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_nsmr_count_4G_to_32G", + (u64)rsrc_stats->nsmr_count_4G_to_32G); + pcur += ocrdma_add_stat(stats, pcur, "active_32G_to_64G_nsmr", + (u64)rsrc_stats->r32G_to_64G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_64G_to_128G_nsmr", + (u64)rsrc_stats->r64G_to_128G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_128G_to_higher_nsmr", + (u64)rsrc_stats->r128G_to_higher_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_embedded_nsmr", + (u64)rsrc_stats->embedded_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_frmr", + (u64)rsrc_stats->frmr); + pcur += ocrdma_add_stat(stats, pcur, "active_prefetch_qps", + (u64)rsrc_stats->prefetch_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ondemand_qps", + (u64)rsrc_stats->ondemand_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_phy_mr", + (u64)rsrc_stats->phy_mr); + pcur += ocrdma_add_stat(stats, pcur, "active_mw", + (u64)rsrc_stats->mw); + + /* Print the threshold stats */ + rsrc_stats = &rdma_stats->th_rsrc_stats; + + pcur += ocrdma_add_stat(stats, pcur, "threshold_dpp_pds", + (u64)rsrc_stats->dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "threshold_non_dpp_pds", + (u64)rsrc_stats->non_dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_dpp_qps", + (u64)rsrc_stats->rc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_dpp_qps", + (u64)rsrc_stats->uc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_dpp_qps", + (u64)rsrc_stats->ud_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_non_dpp_qps", + (u64)rsrc_stats->rc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_non_dpp_qps", + (u64)rsrc_stats->uc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_non_dpp_qps", + (u64)rsrc_stats->ud_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_srqs", + (u64)rsrc_stats->srqs); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rbqs", + (u64)rsrc_stats->rbqs); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_nsmr", + (u64)rsrc_stats->r64K_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_to_2M_nsmr", + (u64)rsrc_stats->r64K_to_2M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_2M_to_44M_nsmr", + (u64)rsrc_stats->r2M_to_44M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_44M_to_1G_nsmr", + (u64)rsrc_stats->r44M_to_1G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_1G_to_4G_nsmr", + (u64)rsrc_stats->r1G_to_4G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_nsmr_count_4G_to_32G", + (u64)rsrc_stats->nsmr_count_4G_to_32G); + pcur += ocrdma_add_stat(stats, pcur, "threshold_32G_to_64G_nsmr", + (u64)rsrc_stats->r32G_to_64G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64G_to_128G_nsmr", + (u64)rsrc_stats->r64G_to_128G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_128G_to_higher_nsmr", + (u64)rsrc_stats->r128G_to_higher_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_embedded_nsmr", + (u64)rsrc_stats->embedded_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_frmr", + (u64)rsrc_stats->frmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_prefetch_qps", + (u64)rsrc_stats->prefetch_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ondemand_qps", + (u64)rsrc_stats->ondemand_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_phy_mr", + (u64)rsrc_stats->phy_mr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_mw", + (u64)rsrc_stats->mw); + return stats; +} + +static char *ocrdma_rx_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat + (stats, pcur, "roce_frame_bytes", + convert_to_64bit(rx_stats->roce_frame_bytes_lo, + rx_stats->roce_frame_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_icrc_drops", + (u64)rx_stats->roce_frame_icrc_drops); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_payload_len_drops", + (u64)rx_stats->roce_frame_payload_len_drops); + pcur += ocrdma_add_stat(stats, pcur, "ud_drops", + (u64)rx_stats->ud_drops); + pcur += ocrdma_add_stat(stats, pcur, "qp1_drops", + (u64)rx_stats->qp1_drops); + pcur += ocrdma_add_stat(stats, pcur, "psn_error_request_packets", + (u64)rx_stats->psn_error_request_packets); + pcur += ocrdma_add_stat(stats, pcur, "psn_error_resp_packets", + (u64)rx_stats->psn_error_resp_packets); + pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_timeouts", + (u64)rx_stats->rnr_nak_timeouts); + pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_receives", + (u64)rx_stats->rnr_nak_receives); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_rxmt_drops", + (u64)rx_stats->roce_frame_rxmt_drops); + pcur += ocrdma_add_stat(stats, pcur, "nak_count_psn_sequence_errors", + (u64)rx_stats->nak_count_psn_sequence_errors); + pcur += ocrdma_add_stat(stats, pcur, "rc_drop_count_lookup_errors", + (u64)rx_stats->rc_drop_count_lookup_errors); + pcur += ocrdma_add_stat(stats, pcur, "rq_rnr_naks", + (u64)rx_stats->rq_rnr_naks); + pcur += ocrdma_add_stat(stats, pcur, "srq_rnr_naks", + (u64)rx_stats->srq_rnr_naks); + pcur += ocrdma_add_stat(stats, pcur, "roce_frames", + convert_to_64bit(rx_stats->roce_frames_lo, + rx_stats->roce_frames_hi)); + + return stats; +} + +static char *ocrdma_tx_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "send_pkts", + convert_to_64bit(tx_stats->send_pkts_lo, + tx_stats->send_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "write_pkts", + convert_to_64bit(tx_stats->write_pkts_lo, + tx_stats->write_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_pkts", + convert_to_64bit(tx_stats->read_pkts_lo, + tx_stats->read_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_rsp_pkts", + convert_to_64bit(tx_stats->read_rsp_pkts_lo, + tx_stats->read_rsp_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "ack_pkts", + convert_to_64bit(tx_stats->ack_pkts_lo, + tx_stats->ack_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "send_bytes", + convert_to_64bit(tx_stats->send_bytes_lo, + tx_stats->send_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "write_bytes", + convert_to_64bit(tx_stats->write_bytes_lo, + tx_stats->write_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_req_bytes", + convert_to_64bit(tx_stats->read_req_bytes_lo, + tx_stats->read_req_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_rsp_bytes", + convert_to_64bit(tx_stats->read_rsp_bytes_lo, + tx_stats->read_rsp_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "ack_timeouts", + (u64)tx_stats->ack_timeouts); + + return stats; +} + +static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_wqe_stats *wqe_stats = &rdma_stats->wqe_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "large_send_rc_wqes", + convert_to_64bit(wqe_stats->large_send_rc_wqes_lo, + wqe_stats->large_send_rc_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "large_write_rc_wqes", + convert_to_64bit(wqe_stats->large_write_rc_wqes_lo, + wqe_stats->large_write_rc_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_wqes", + convert_to_64bit(wqe_stats->read_wqes_lo, + wqe_stats->read_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "frmr_wqes", + convert_to_64bit(wqe_stats->frmr_wqes_lo, + wqe_stats->frmr_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "mw_bind_wqes", + convert_to_64bit(wqe_stats->mw_bind_wqes_lo, + wqe_stats->mw_bind_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "invalidate_wqes", + convert_to_64bit(wqe_stats->invalidate_wqes_lo, + wqe_stats->invalidate_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "dpp_wqe_drops", + (u64)wqe_stats->dpp_wqe_drops); + return stats; +} + +static char *ocrdma_db_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_db_err_stats *db_err_stats = &rdma_stats->db_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "sq_doorbell_errors", + (u64)db_err_stats->sq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "cq_doorbell_errors", + (u64)db_err_stats->cq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "rq_srq_doorbell_errors", + (u64)db_err_stats->rq_srq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "cq_overflow_errors", + (u64)db_err_stats->cq_overflow_errors); + return stats; +} + +static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_qp_err_stats *rx_qp_err_stats = + &rdma_stats->rx_qp_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "nak_invalid_requst_errors", + (u64)rx_qp_err_stats->nak_invalid_requst_errors); + pcur += ocrdma_add_stat(stats, pcur, "nak_remote_operation_errors", + (u64)rx_qp_err_stats->nak_remote_operation_errors); + pcur += ocrdma_add_stat(stats, pcur, "nak_count_remote_access_errors", + (u64)rx_qp_err_stats->nak_count_remote_access_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", + (u64)rx_qp_err_stats->local_length_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", + (u64)rx_qp_err_stats->local_protection_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", + (u64)rx_qp_err_stats->local_qp_operation_errors); + return stats; +} + +static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_qp_err_stats *tx_qp_err_stats = + &rdma_stats->tx_qp_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", + (u64)tx_qp_err_stats->local_length_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", + (u64)tx_qp_err_stats->local_protection_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", + (u64)tx_qp_err_stats->local_qp_operation_errors); + pcur += ocrdma_add_stat(stats, pcur, "retry_count_exceeded_errors", + (u64)tx_qp_err_stats->retry_count_exceeded_errors); + pcur += ocrdma_add_stat(stats, pcur, "rnr_retry_count_exceeded_errors", + (u64)tx_qp_err_stats->rnr_retry_count_exceeded_errors); + return stats; +} + +static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) +{ + int i; + char *pstats = dev->stats_mem.debugfs_mem; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_dbg_stats *tx_dbg_stats = + &rdma_stats->tx_dbg_stats; + + memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + for (i = 0; i < 100; i++) + pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, + tx_dbg_stats->data[i]); + + return dev->stats_mem.debugfs_mem; +} + +static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) +{ + int i; + char *pstats = dev->stats_mem.debugfs_mem; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_dbg_stats *rx_dbg_stats = + &rdma_stats->rx_dbg_stats; + + memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + for (i = 0; i < 200; i++) + pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, + rx_dbg_stats->data[i]); + + return dev->stats_mem.debugfs_mem; +} + +static void ocrdma_update_stats(struct ocrdma_dev *dev) +{ + ulong now = jiffies, secs; + int status = 0; + + secs = jiffies_to_msecs(now - dev->last_stats_time) / 1000U; + if (secs) { + /* update */ + status = ocrdma_mbx_rdma_stats(dev, false); + if (status) + pr_err("%s: stats mbox failed with status = %d\n", + __func__, status); + dev->last_stats_time = jiffies; + } +} + +static ssize_t ocrdma_dbgfs_ops_read(struct file *filp, char __user *buffer, + size_t usr_buf_len, loff_t *ppos) +{ + struct ocrdma_stats *pstats = filp->private_data; + struct ocrdma_dev *dev = pstats->dev; + ssize_t status = 0; + char *data = NULL; + + /* No partial reads */ + if (*ppos != 0) + return 0; + + mutex_lock(&dev->stats_lock); + + ocrdma_update_stats(dev); + + switch (pstats->type) { + case OCRDMA_RSRC_STATS: + data = ocrdma_resource_stats(dev); + break; + case OCRDMA_RXSTATS: + data = ocrdma_rx_stats(dev); + break; + case OCRDMA_WQESTATS: + data = ocrdma_wqe_stats(dev); + break; + case OCRDMA_TXSTATS: + data = ocrdma_tx_stats(dev); + break; + case OCRDMA_DB_ERRSTATS: + data = ocrdma_db_errstats(dev); + break; + case OCRDMA_RXQP_ERRSTATS: + data = ocrdma_rxqp_errstats(dev); + break; + case OCRDMA_TXQP_ERRSTATS: + data = ocrdma_txqp_errstats(dev); + break; + case OCRDMA_TX_DBG_STATS: + data = ocrdma_tx_dbg_stats(dev); + break; + case OCRDMA_RX_DBG_STATS: + data = ocrdma_rx_dbg_stats(dev); + break; + + default: + status = -EFAULT; + goto exit; + } + + if (usr_buf_len < strlen(data)) { + status = -ENOSPC; + goto exit; + } + + status = simple_read_from_buffer(buffer, usr_buf_len, ppos, data, + strlen(data)); +exit: + mutex_unlock(&dev->stats_lock); + return status; +} + +static const struct file_operations ocrdma_dbg_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = ocrdma_dbgfs_ops_read, +}; + +void ocrdma_add_port_stats(struct ocrdma_dev *dev) +{ + if (!ocrdma_dbgfs_dir) + return; + + /* Create post stats base dir */ + dev->dir = debugfs_create_dir(dev->ibdev.name, ocrdma_dbgfs_dir); + if (!dev->dir) + goto err; + + dev->rsrc_stats.type = OCRDMA_RSRC_STATS; + dev->rsrc_stats.dev = dev; + if (!debugfs_create_file("resource_stats", S_IRUSR, dev->dir, + &dev->rsrc_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_stats.type = OCRDMA_RXSTATS; + dev->rx_stats.dev = dev; + if (!debugfs_create_file("rx_stats", S_IRUSR, dev->dir, + &dev->rx_stats, &ocrdma_dbg_ops)) + goto err; + + dev->wqe_stats.type = OCRDMA_WQESTATS; + dev->wqe_stats.dev = dev; + if (!debugfs_create_file("wqe_stats", S_IRUSR, dev->dir, + &dev->wqe_stats, &ocrdma_dbg_ops)) + goto err; + + dev->tx_stats.type = OCRDMA_TXSTATS; + dev->tx_stats.dev = dev; + if (!debugfs_create_file("tx_stats", S_IRUSR, dev->dir, + &dev->tx_stats, &ocrdma_dbg_ops)) + goto err; + + dev->db_err_stats.type = OCRDMA_DB_ERRSTATS; + dev->db_err_stats.dev = dev; + if (!debugfs_create_file("db_err_stats", S_IRUSR, dev->dir, + &dev->db_err_stats, &ocrdma_dbg_ops)) + goto err; + + + dev->tx_qp_err_stats.type = OCRDMA_TXQP_ERRSTATS; + dev->tx_qp_err_stats.dev = dev; + if (!debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir, + &dev->tx_qp_err_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_qp_err_stats.type = OCRDMA_RXQP_ERRSTATS; + dev->rx_qp_err_stats.dev = dev; + if (!debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir, + &dev->rx_qp_err_stats, &ocrdma_dbg_ops)) + goto err; + + + dev->tx_dbg_stats.type = OCRDMA_TX_DBG_STATS; + dev->tx_dbg_stats.dev = dev; + if (!debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir, + &dev->tx_dbg_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_dbg_stats.type = OCRDMA_RX_DBG_STATS; + dev->rx_dbg_stats.dev = dev; + if (!debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir, + &dev->rx_dbg_stats, &ocrdma_dbg_ops)) + goto err; + + /* Now create dma_mem for stats mbx command */ + if (!ocrdma_alloc_stats_mem(dev)) + goto err; + + mutex_init(&dev->stats_lock); + + return; +err: + ocrdma_release_stats_mem(dev); + debugfs_remove_recursive(dev->dir); + dev->dir = NULL; +} + +void ocrdma_rem_port_stats(struct ocrdma_dev *dev) +{ + if (!dev->dir) + return; + mutex_destroy(&dev->stats_lock); + ocrdma_release_stats_mem(dev); + debugfs_remove(dev->dir); +} + +void ocrdma_init_debugfs(void) +{ + /* Create base dir in debugfs root dir */ + ocrdma_dbgfs_dir = debugfs_create_dir("ocrdma", NULL); +} + +void ocrdma_rem_debugfs(void) +{ + debugfs_remove_recursive(ocrdma_dbgfs_dir); +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.h b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h new file mode 100644 index 00000000000..5f5e20c46d7 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h @@ -0,0 +1,54 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2014 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_STATS_H__ +#define __OCRDMA_STATS_H__ + +#include <linux/debugfs.h> +#include "ocrdma.h" +#include "ocrdma_hw.h" + +#define OCRDMA_MAX_DBGFS_MEM 4096 + +enum OCRDMA_STATS_TYPE { + OCRDMA_RSRC_STATS, + OCRDMA_RXSTATS, + OCRDMA_WQESTATS, + OCRDMA_TXSTATS, + OCRDMA_DB_ERRSTATS, + OCRDMA_RXQP_ERRSTATS, + OCRDMA_TXQP_ERRSTATS, + OCRDMA_TX_DBG_STATS, + OCRDMA_RX_DBG_STATS +}; + +void ocrdma_rem_debugfs(void); +void ocrdma_init_debugfs(void); +void ocrdma_rem_port_stats(struct ocrdma_dev *dev); +void ocrdma_add_port_stats(struct ocrdma_dev *dev); + +#endif /* __OCRDMA_STATS_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 6e982bb43c3..edf6211d84b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -53,7 +53,7 @@ int ocrdma_query_gid(struct ib_device *ibdev, u8 port, dev = get_ocrdma_dev(ibdev); memset(sgid, 0, sizeof(*sgid)); - if (index >= OCRDMA_MAX_SGID) + if (index > OCRDMA_MAX_SGID) return -EINVAL; memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid)); @@ -89,7 +89,7 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) attr->max_cq = dev->attr.max_cq; attr->max_cqe = dev->attr.max_cqe; attr->max_mr = dev->attr.max_mr; - attr->max_mw = 0; + attr->max_mw = dev->attr.max_mw; attr->max_pd = dev->attr.max_pd; attr->atomic_cap = 0; attr->max_fmr = 0; @@ -141,10 +141,9 @@ static inline void get_link_speed_and_width(struct ocrdma_dev *dev, /* Unsupported */ *ib_speed = IB_SPEED_SDR; *ib_width = IB_WIDTH_1X; - }; + } } - int ocrdma_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { @@ -176,7 +175,7 @@ int ocrdma_query_port(struct ib_device *ibdev, props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | - IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP; + IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS; props->gid_tbl_len = OCRDMA_MAX_SGID; props->pkey_tbl_len = 1; props->bad_pkey_cntr = 0; @@ -267,7 +266,7 @@ static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, if (udata && uctx) { pd->dpp_enabled = - dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY; + ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R; pd->num_dpp_qp = pd->dpp_enabled ? OCRDMA_PD_MAX_DPP_ENABLED_QP : 0; } @@ -726,10 +725,10 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr, u32 num_pbes) { struct ocrdma_pbe *pbe; - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table; struct ib_umem *umem = mr->umem; - int i, shift, pg_cnt, pages, pbe_cnt, total_num_pbes = 0; + int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0; if (!mr->hwmr.num_pbes) return; @@ -739,39 +738,37 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr, shift = ilog2(umem->page_size); - list_for_each_entry(chunk, &umem->chunk_list, list) { - /* get all the dma regions from the chunk. */ - for (i = 0; i < chunk->nmap; i++) { - pages = sg_dma_len(&chunk->page_list[i]) >> shift; - for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { - /* store the page address in pbe */ - pbe->pa_lo = - cpu_to_le32(sg_dma_address - (&chunk->page_list[i]) + - (umem->page_size * pg_cnt)); - pbe->pa_hi = - cpu_to_le32(upper_32_bits - ((sg_dma_address - (&chunk->page_list[i]) + - umem->page_size * pg_cnt))); - pbe_cnt += 1; - total_num_pbes += 1; - pbe++; - - /* if done building pbes, issue the mbx cmd. */ - if (total_num_pbes == num_pbes) - return; - - /* if the given pbl is full storing the pbes, - * move to next pbl. - */ - if (pbe_cnt == - (mr->hwmr.pbl_size / sizeof(u64))) { - pbl_tbl++; - pbe = (struct ocrdma_pbe *)pbl_tbl->va; - pbe_cnt = 0; - } + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + pages = sg_dma_len(sg) >> shift; + for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { + /* store the page address in pbe */ + pbe->pa_lo = + cpu_to_le32(sg_dma_address + (sg) + + (umem->page_size * pg_cnt)); + pbe->pa_hi = + cpu_to_le32(upper_32_bits + ((sg_dma_address + (sg) + + umem->page_size * pg_cnt))); + pbe_cnt += 1; + total_num_pbes += 1; + pbe++; + + /* if done building pbes, issue the mbx cmd. */ + if (total_num_pbes == num_pbes) + return; + + /* if the given pbl is full storing the pbes, + * move to next pbl. + */ + if (pbe_cnt == + (mr->hwmr.pbl_size / sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + pbe_cnt = 0; } + } } } @@ -840,8 +837,7 @@ int ocrdma_dereg_mr(struct ib_mr *ib_mr) status = ocrdma_mbx_dealloc_lkey(dev, mr->hwmr.fr_mr, mr->hwmr.lkey); - if (mr->hwmr.fr_mr == 0) - ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); /* it could be user registered memory. */ if (mr->umem) @@ -910,6 +906,7 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, int entries, int vector, spin_lock_init(&cq->comp_handler_lock); INIT_LIST_HEAD(&cq->sq_head); INIT_LIST_HEAD(&cq->rq_head); + cq->first_arm = true; if (ib_ctx) { uctx = get_ocrdma_ucontext(ib_ctx); @@ -927,9 +924,7 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, int entries, int vector, goto ctx_err; } cq->phase = OCRDMA_CQE_VALID; - cq->arm_needed = true; dev->cq_tbl[cq->id] = cq; - return &cq->ibcq; ctx_err: @@ -952,15 +947,52 @@ int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt, return status; } +static void ocrdma_flush_cq(struct ocrdma_cq *cq) +{ + int cqe_cnt; + int valid_count = 0; + unsigned long flags; + + struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device); + struct ocrdma_cqe *cqe = NULL; + + cqe = cq->va; + cqe_cnt = cq->cqe_cnt; + + /* Last irq might have scheduled a polling thread + * sync-up with it before hard flushing. + */ + spin_lock_irqsave(&cq->cq_lock, flags); + while (cqe_cnt) { + if (is_cqe_valid(cq, cqe)) + valid_count++; + cqe++; + cqe_cnt--; + } + ocrdma_ring_cq_db(dev, cq->id, false, false, valid_count); + spin_unlock_irqrestore(&cq->cq_lock, flags); +} + int ocrdma_destroy_cq(struct ib_cq *ibcq) { int status; struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_eq *eq = NULL; struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); int pdid = 0; + u32 irq, indx; - status = ocrdma_mbx_destroy_cq(dev, cq); + dev->cq_tbl[cq->id] = NULL; + indx = ocrdma_get_eq_table_index(dev, cq->eqn); + if (indx == -EINVAL) + BUG(); + eq = &dev->eq_tbl[indx]; + irq = ocrdma_get_irq(dev, eq); + synchronize_irq(irq); + ocrdma_flush_cq(cq); + + status = ocrdma_mbx_destroy_cq(dev, cq); if (cq->ucontext) { pdid = cq->ucontext->cntxt_pd->id; ocrdma_del_mmap(cq->ucontext, (u64) cq->pa, @@ -969,7 +1001,6 @@ int ocrdma_destroy_cq(struct ib_cq *ibcq) ocrdma_get_db_addr(dev, pdid), dev->nic_info.db_page_size); } - dev->cq_tbl[cq->id] = NULL; kfree(cq); return status; @@ -1092,15 +1123,9 @@ static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp, } uresp.db_page_addr = usr_db; uresp.db_page_size = dev->nic_info.db_page_size; - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { - uresp.db_sq_offset = OCRDMA_DB_GEN2_SQ_OFFSET; - uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; - uresp.db_shift = 24; - } else { - uresp.db_sq_offset = OCRDMA_DB_SQ_OFFSET; - uresp.db_rq_offset = OCRDMA_DB_RQ_OFFSET; - uresp.db_shift = 16; - } + uresp.db_sq_offset = OCRDMA_DB_GEN2_SQ_OFFSET; + uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; + uresp.db_shift = OCRDMA_DB_RQ_SHIFT; if (qp->dpp_enabled) { uresp.dpp_credit = dpp_credit_lmt; @@ -1132,7 +1157,7 @@ err: static void ocrdma_set_qp_db(struct ocrdma_dev *dev, struct ocrdma_qp *qp, struct ocrdma_pd *pd) { - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { qp->sq_db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size) + OCRDMA_DB_GEN2_SQ_OFFSET; @@ -1182,7 +1207,6 @@ static void ocrdma_set_qp_init_params(struct ocrdma_qp *qp, qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? true : false; } - static void ocrdma_store_gsi_qp_cq(struct ocrdma_dev *dev, struct ib_qp_init_attr *attrs) { @@ -1268,17 +1292,6 @@ gen_err: return ERR_PTR(status); } - -static void ocrdma_flush_rq_db(struct ocrdma_qp *qp) -{ - if (qp->db_cache) { - u32 val = qp->rq.dbid | (qp->db_cache << - ocrdma_get_num_posted_shift(qp)); - iowrite32(val, qp->rq_db); - qp->db_cache = 0; - } -} - int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) { @@ -1296,9 +1309,7 @@ int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, */ if (status < 0) return status; - status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask, old_qps); - if (!status && attr_mask & IB_QP_STATE && attr->qp_state == IB_QPS_RTR) - ocrdma_flush_rq_db(qp); + status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask); return status; } @@ -1326,7 +1337,8 @@ int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_qps = old_qps; spin_unlock_irqrestore(&qp->q_lock, flags); - if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask)) { + if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_ETHERNET)) { pr_err("%s(%d) invalid attribute mask=0x%x specified for\n" "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n", __func__, dev->id, attr_mask, qp->id, ibqp->qp_type, @@ -1415,7 +1427,7 @@ int ocrdma_query_qp(struct ib_qp *ibqp, OCRDMA_QP_PARAMS_HOP_LMT_MASK) >> OCRDMA_QP_PARAMS_HOP_LMT_SHIFT; qp_attr->ah_attr.grh.traffic_class = (params.tclass_sq_psn & - OCRDMA_QP_PARAMS_SQ_PSN_MASK) >> + OCRDMA_QP_PARAMS_TCLASS_MASK) >> OCRDMA_QP_PARAMS_TCLASS_SHIFT; qp_attr->ah_attr.ah_flags = IB_AH_GRH; @@ -1509,7 +1521,7 @@ static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq) int discard_cnt = 0; u32 cur_getp, stop_getp; struct ocrdma_cqe *cqe; - u32 qpn = 0; + u32 qpn = 0, wqe_idx = 0; spin_lock_irqsave(&cq->cq_lock, cq_flags); @@ -1538,24 +1550,29 @@ static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq) if (qpn == 0 || qpn != qp->id) goto skip_cqe; - /* mark cqe discarded so that it is not picked up later - * in the poll_cq(). - */ - discard_cnt += 1; - cqe->cmn.qpn = 0; if (is_cqe_for_sq(cqe)) { ocrdma_hwq_inc_tail(&qp->sq); } else { if (qp->srq) { + wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >> + OCRDMA_CQE_BUFTAG_SHIFT) & + qp->srq->rq.max_wqe_idx; + if (wqe_idx < 1) + BUG(); spin_lock_irqsave(&qp->srq->q_lock, flags); ocrdma_hwq_inc_tail(&qp->srq->rq); - ocrdma_srq_toggle_bit(qp->srq, cur_getp); + ocrdma_srq_toggle_bit(qp->srq, wqe_idx - 1); spin_unlock_irqrestore(&qp->srq->q_lock, flags); } else { ocrdma_hwq_inc_tail(&qp->rq); } } + /* mark cqe discarded so that it is not picked up later + * in the poll_cq(). + */ + discard_cnt += 1; + cqe->cmn.qpn = 0; skip_cqe: cur_getp = (cur_getp + 1) % cq->max_hw_cqe; } while (cur_getp != stop_getp); @@ -1658,7 +1675,7 @@ static int ocrdma_copy_srq_uresp(struct ocrdma_dev *dev, struct ocrdma_srq *srq, (srq->pd->id * dev->nic_info.db_page_size); uresp.db_page_size = dev->nic_info.db_page_size; uresp.num_rqe_allocated = srq->rq.max_cnt; - if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; uresp.db_shift = 24; } else { @@ -1981,9 +1998,7 @@ static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, wqe_size = roundup(wqe_size, OCRDMA_WQE_ALIGN_BYTES); - if ((wr->wr.fast_reg.page_list_len > - qp->dev->attr.max_pages_per_frmr) || - (wr->wr.fast_reg.length > 0xffffffffULL)) + if (wr->wr.fast_reg.page_list_len > qp->dev->attr.max_pages_per_frmr) return -EINVAL; hdr->cw |= (OCRDMA_FR_MR << OCRDMA_WQE_OPCODE_SHIFT); @@ -2010,15 +2025,15 @@ static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, fast_reg->num_sges = wr->wr.fast_reg.page_list_len; fast_reg->size_sge = get_encoded_page_size(1 << wr->wr.fast_reg.page_shift); - mr = (struct ocrdma_mr *) (unsigned long) qp->dev->stag_arr[(hdr->lkey >> 8) & - (OCRDMA_MAX_STAG - 1)]; + mr = (struct ocrdma_mr *) (unsigned long) + qp->dev->stag_arr[(hdr->lkey >> 8) & (OCRDMA_MAX_STAG - 1)]; build_frmr_pbes(wr, mr->hwmr.pbl_table, &mr->hwmr); return 0; } static void ocrdma_ring_sq_db(struct ocrdma_qp *qp) { - u32 val = qp->sq.dbid | (1 << 16); + u32 val = qp->sq.dbid | (1 << OCRDMA_DB_SQ_SHIFT); iowrite32(val, qp->sq_db); } @@ -2123,12 +2138,9 @@ int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, static void ocrdma_ring_rq_db(struct ocrdma_qp *qp) { - u32 val = qp->rq.dbid | (1 << ocrdma_get_num_posted_shift(qp)); + u32 val = qp->rq.dbid | (1 << OCRDMA_DB_RQ_SHIFT); - if (qp->state != OCRDMA_QPS_INIT) - iowrite32(val, qp->rq_db); - else - qp->db_cache++; + iowrite32(val, qp->rq_db); } static void ocrdma_build_rqe(struct ocrdma_hdr_wqe *rqe, struct ib_recv_wr *wr, @@ -2214,7 +2226,7 @@ static int ocrdma_srq_get_idx(struct ocrdma_srq *srq) if (row == srq->bit_fields_len) BUG(); - return indx; + return indx + 1; /* Use from index 1 */ } static void ocrdma_ring_srq_db(struct ocrdma_srq *srq) @@ -2331,7 +2343,7 @@ static enum ib_wc_status ocrdma_to_ibwc_err(u16 status) default: ibwc_status = IB_WC_GENERAL_ERR; break; - }; + } return ibwc_status; } @@ -2370,7 +2382,7 @@ static void ocrdma_update_wc(struct ocrdma_qp *qp, struct ib_wc *ibwc, pr_err("%s() invalid opcode received = 0x%x\n", __func__, hdr->cw & OCRDMA_WQE_OPCODE_MASK); break; - }; + } } static void ocrdma_set_cqe_status_flushed(struct ocrdma_qp *qp, @@ -2551,10 +2563,13 @@ static void ocrdma_update_free_srq_cqe(struct ib_wc *ibwc, srq = get_ocrdma_srq(qp->ibqp.srq); wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >> - OCRDMA_CQE_BUFTAG_SHIFT) & srq->rq.max_wqe_idx; + OCRDMA_CQE_BUFTAG_SHIFT) & srq->rq.max_wqe_idx; + if (wqe_idx < 1) + BUG(); + ibwc->wr_id = srq->rqe_wr_id_tbl[wqe_idx]; spin_lock_irqsave(&srq->q_lock, flags); - ocrdma_srq_toggle_bit(srq, wqe_idx); + ocrdma_srq_toggle_bit(srq, wqe_idx - 1); spin_unlock_irqrestore(&srq->q_lock, flags); ocrdma_hwq_inc_tail(&srq->rq); } @@ -2706,10 +2721,18 @@ expand_cqe: } stop_cqe: cq->getp = cur_getp; - if (polled_hw_cqes || expand || stop) { - ocrdma_ring_cq_db(dev, cq->id, cq->armed, cq->solicited, + if (cq->deferred_arm) { + ocrdma_ring_cq_db(dev, cq->id, true, cq->deferred_sol, polled_hw_cqes); + cq->deferred_arm = false; + cq->deferred_sol = false; + } else { + /* We need to pop the CQE. No need to arm */ + ocrdma_ring_cq_db(dev, cq->id, false, cq->deferred_sol, + polled_hw_cqes); + cq->deferred_sol = false; } + return i; } @@ -2781,30 +2804,28 @@ int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags) struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); u16 cq_id; - u16 cur_getp; - struct ocrdma_cqe *cqe; unsigned long flags; + bool arm_needed = false, sol_needed = false; cq_id = cq->id; spin_lock_irqsave(&cq->cq_lock, flags); if (cq_flags & IB_CQ_NEXT_COMP || cq_flags & IB_CQ_SOLICITED) - cq->armed = true; + arm_needed = true; if (cq_flags & IB_CQ_SOLICITED) - cq->solicited = true; - - cur_getp = cq->getp; - cqe = cq->va + cur_getp; + sol_needed = true; - /* check whether any valid cqe exist or not, if not then safe to - * arm. If cqe is not yet consumed, then let it get consumed and then - * we arm it to avoid false interrupts. - */ - if (!is_cqe_valid(cq, cqe) || cq->arm_needed) { - cq->arm_needed = false; - ocrdma_ring_cq_db(dev, cq_id, cq->armed, cq->solicited, 0); + if (cq->first_arm) { + ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0); + cq->first_arm = false; + goto skip_defer; } + cq->deferred_arm = true; + +skip_defer: + cq->deferred_sol = sol_needed; spin_unlock_irqrestore(&cq->cq_lock, flags); + return 0; } @@ -2839,7 +2860,8 @@ struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len) goto mbx_err; mr->ibmr.rkey = mr->hwmr.lkey; mr->ibmr.lkey = mr->hwmr.lkey; - dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = (unsigned long) mr; + dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = + (unsigned long) mr; return &mr->ibmr; mbx_err: ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 1946101419a..c00ae093b6f 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -868,8 +868,10 @@ struct qib_devdata { /* last buffer for user use */ u32 lastctxt_piobuf; - /* saturating counter of (non-port-specific) device interrupts */ - u32 int_counter; + /* reset value */ + u64 z_int_counter; + /* percpu intcounter */ + u64 __percpu *int_counter; /* pio bufs allocated per ctxt */ u32 pbufsctxt; @@ -1184,7 +1186,7 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *); void qib_set_ctxtcnt(struct qib_devdata *); int qib_create_ctxts(struct qib_devdata *dd); struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32, int); -void qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8); +int qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8); void qib_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *); u32 qib_kreceive(struct qib_ctxtdata *, u32 *, u32 *); @@ -1449,6 +1451,10 @@ void qib_nomsi(struct qib_devdata *); void qib_nomsix(struct qib_devdata *); void qib_pcie_getcmd(struct qib_devdata *, u16 *, u8 *, u8 *); void qib_pcie_reenable(struct qib_devdata *, u16, u8, u8); +/* interrupts for device */ +u64 qib_int_counter(struct qib_devdata *); +/* interrupt for all devices */ +u64 qib_sps_ints(void); /* * dma_addr wrappers - all 0's invalid for hw diff --git a/drivers/infiniband/hw/qib/qib_diag.c b/drivers/infiniband/hw/qib/qib_diag.c index 1686fd4bda8..5dfda4c5cc9 100644 --- a/drivers/infiniband/hw/qib/qib_diag.c +++ b/drivers/infiniband/hw/qib/qib_diag.c @@ -546,7 +546,7 @@ static ssize_t qib_diagpkt_write(struct file *fp, size_t count, loff_t *off) { u32 __iomem *piobuf; - u32 plen, clen, pbufn; + u32 plen, pbufn, maxlen_reserve; struct qib_diag_xpkt dp; u32 *tmpbuf = NULL; struct qib_devdata *dd; @@ -590,15 +590,20 @@ static ssize_t qib_diagpkt_write(struct file *fp, } ppd = &dd->pport[dp.port - 1]; - /* need total length before first word written */ - /* +1 word is for the qword padding */ - plen = sizeof(u32) + dp.len; - clen = dp.len >> 2; - - if ((plen + 4) > ppd->ibmaxlen) { + /* + * need total length before first word written, plus 2 Dwords. One Dword + * is for padding so we get the full user data when not aligned on + * a word boundary. The other Dword is to make sure we have room for the + * ICRC which gets tacked on later. + */ + maxlen_reserve = 2 * sizeof(u32); + if (dp.len > ppd->ibmaxlen - maxlen_reserve) { ret = -EINVAL; - goto bail; /* before writing pbc */ + goto bail; } + + plen = sizeof(u32) + dp.len; + tmpbuf = vmalloc(plen); if (!tmpbuf) { qib_devinfo(dd->pcidev, @@ -638,11 +643,11 @@ static ssize_t qib_diagpkt_write(struct file *fp, */ if (dd->flags & QIB_PIO_FLUSH_WC) { qib_flush_wc(); - qib_pio_copy(piobuf + 2, tmpbuf, clen - 1); + qib_pio_copy(piobuf + 2, tmpbuf, plen - 1); qib_flush_wc(); - __raw_writel(tmpbuf[clen - 1], piobuf + clen + 1); + __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1); } else - qib_pio_copy(piobuf + 2, tmpbuf, clen); + qib_pio_copy(piobuf + 2, tmpbuf, plen); if (dd->flags & QIB_USE_SPCL_TRIG) { u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; @@ -689,28 +694,23 @@ int qib_register_observer(struct qib_devdata *dd, const struct diag_observer *op) { struct diag_observer_list_elt *olp; - int ret = -EINVAL; + unsigned long flags; if (!dd || !op) - goto bail; - ret = -ENOMEM; + return -EINVAL; olp = vmalloc(sizeof *olp); if (!olp) { pr_err("vmalloc for observer failed\n"); - goto bail; + return -ENOMEM; } - if (olp) { - unsigned long flags; - spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); - olp->op = op; - olp->next = dd->diag_observer_list; - dd->diag_observer_list = olp; - spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); - ret = 0; - } -bail: - return ret; + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + olp->op = op; + olp->next = dd->diag_observer_list; + dd->diag_observer_list = olp; + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + + return 0; } /* Remove all registered observers when device is closed */ diff --git a/drivers/infiniband/hw/qib/qib_dma.c b/drivers/infiniband/hw/qib/qib_dma.c index 2920bb39a65..59fe092b4b0 100644 --- a/drivers/infiniband/hw/qib/qib_dma.c +++ b/drivers/infiniband/hw/qib/qib_dma.c @@ -108,6 +108,10 @@ static int qib_map_sg(struct ib_device *dev, struct scatterlist *sgl, ret = 0; break; } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif } return ret; } @@ -119,21 +123,6 @@ static void qib_unmap_sg(struct ib_device *dev, BUG_ON(!valid_dma_direction(direction)); } -static u64 qib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg) -{ - u64 addr = (u64) page_address(sg_page(sg)); - - if (addr) - addr += sg->offset; - return addr; -} - -static unsigned int qib_sg_dma_len(struct ib_device *dev, - struct scatterlist *sg) -{ - return sg->length; -} - static void qib_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { @@ -173,8 +162,6 @@ struct ib_dma_mapping_ops qib_dma_mapping_ops = { .unmap_page = qib_dma_unmap_page, .map_sg = qib_map_sg, .unmap_sg = qib_unmap_sg, - .dma_address = qib_sg_dma_address, - .dma_len = qib_sg_dma_len, .sync_single_for_cpu = qib_sync_single_for_cpu, .sync_single_for_device = qib_sync_single_for_device, .alloc_coherent = qib_dma_alloc_coherent, diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 275f247f9fc..b15e34eeef6 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -1459,7 +1459,7 @@ static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo, cused++; else cfree++; - if (pusable && cfree && cused < inuse) { + if (cfree && cused < inuse) { udd = dd; inuse = cused; } @@ -1578,7 +1578,7 @@ static int do_qib_user_sdma_queue_create(struct file *fp) struct qib_ctxtdata *rcd = fd->rcd; struct qib_devdata *dd = rcd->dd; - if (dd->flags & QIB_HAS_SEND_DMA) + if (dd->flags & QIB_HAS_SEND_DMA) { fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev, dd->unit, @@ -1586,6 +1586,7 @@ static int do_qib_user_sdma_queue_create(struct file *fp) fd->subctxt); if (!fd->pq) return -ENOMEM; + } return 0; } diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index f247fc6e618..cab610ccd50 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -105,6 +105,7 @@ static int create_file(const char *name, umode_t mode, static ssize_t driver_stats_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + qib_stats.sps_ints = qib_sps_ints(); return simple_read_from_buffer(buf, count, ppos, &qib_stats, sizeof qib_stats); } @@ -456,13 +457,13 @@ static int remove_file(struct dentry *parent, char *name) spin_lock(&tmp->d_lock); if (!(d_unhashed(tmp) && tmp->d_inode)) { - dget_dlock(tmp); __d_drop(tmp); spin_unlock(&tmp->d_lock); simple_unlink(parent->d_inode, tmp); } else { spin_unlock(&tmp->d_lock); } + dput(tmp); ret = 0; bail: @@ -491,6 +492,7 @@ static int remove_device_files(struct super_block *sb, goto bail; } + mutex_lock(&dir->d_inode->i_mutex); remove_file(dir, "counters"); remove_file(dir, "counter_names"); remove_file(dir, "portcounter_names"); @@ -505,8 +507,10 @@ static int remove_device_files(struct super_block *sb, } } remove_file(dir, "flash"); - d_delete(dir); + mutex_unlock(&dir->d_inode->i_mutex); ret = simple_rmdir(root->d_inode, dir); + d_delete(dir); + dput(dir); bail: mutex_unlock(&root->d_inode->i_mutex); diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c index 84e593d6007..d68266ac761 100644 --- a/drivers/infiniband/hw/qib/qib_iba6120.c +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -1634,9 +1634,7 @@ static irqreturn_t qib_6120intr(int irq, void *data) goto bail; } - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT | QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR))) @@ -1808,7 +1806,8 @@ static int qib_6120_setup_reset(struct qib_devdata *dd) * isn't set. */ dd->flags &= ~(QIB_INITTED | QIB_PRESENT); - dd->int_counter = 0; /* so we check interrupts work again */ + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); val = dd->control | QLOGIC_IB_C_RESET; writeq(val, &dd->kregbase[kr_control]); mb(); /* prevent compiler re-ordering around actual reset */ @@ -3266,7 +3265,9 @@ static int init_6120_variables(struct qib_devdata *dd) dd->eep_st_masks[2].errs_to_log = ERR_MASK(ResetNegated); - qib_init_pportdata(ppd, dd, 0, 1); + ret = qib_init_pportdata(ppd, dd, 0, 1); + if (ret) + goto bail; ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; ppd->link_speed_supported = QIB_IB_SDR; ppd->link_width_enabled = IB_WIDTH_4X; diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index 454c2e7668f..7dec89fdc12 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -1962,10 +1962,7 @@ static irqreturn_t qib_7220intr(int irq, void *data) goto bail; } - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; - + this_cpu_inc(*dd->int_counter); if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT | QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR))) unlikely_7220_intr(dd, istat); @@ -2120,7 +2117,8 @@ static int qib_setup_7220_reset(struct qib_devdata *dd) * isn't set. */ dd->flags &= ~(QIB_INITTED | QIB_PRESENT); - dd->int_counter = 0; /* so we check interrupts work again */ + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); val = dd->control | QLOGIC_IB_C_RESET; writeq(val, &dd->kregbase[kr_control]); mb(); /* prevent compiler reordering around actual reset */ @@ -4061,7 +4059,9 @@ static int qib_init_7220_variables(struct qib_devdata *dd) init_waitqueue_head(&cpspec->autoneg_wait); INIT_DELAYED_WORK(&cpspec->autoneg_work, autoneg_7220_work); - qib_init_pportdata(ppd, dd, 0, 1); + ret = qib_init_pportdata(ppd, dd, 0, 1); + if (ret) + goto bail; ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; ppd->link_speed_supported = QIB_IB_SDR | QIB_IB_DDR; diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 016e7429adf..a7eb32517a0 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2395,6 +2395,11 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); qib_write_kreg(dd, kr_scratch, 0ULL); + /* ensure previous Tx parameters are not still forced */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + if (qib_compat_ddr_negotiate) { ppd->cpspec->ibdeltainprog = 1; ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd, @@ -3110,9 +3115,7 @@ static irqreturn_t qib_7322intr(int irq, void *data) goto bail; } - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* handle "errors" of various kinds first, device ahead of port */ if (unlikely(istat & (~QIB_I_BITSEXTANT | QIB_I_GPIO | @@ -3181,9 +3184,7 @@ static irqreturn_t qib_7322pintr(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, ((1ULL << QIB_I_RCVAVAIL_LSB) | @@ -3210,9 +3211,7 @@ static irqreturn_t qib_7322bufavail(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, QIB_I_SPIOBUFAVAIL); @@ -3243,9 +3242,7 @@ static irqreturn_t sdma_intr(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? @@ -3272,9 +3269,7 @@ static irqreturn_t sdma_idle_intr(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? @@ -3301,9 +3296,7 @@ static irqreturn_t sdma_progress_intr(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? @@ -3331,9 +3324,7 @@ static irqreturn_t sdma_cleanup_intr(int irq, void *data) */ return IRQ_HANDLED; - qib_stats.sps_ints++; - if (dd->int_counter != (u32) -1) - dd->int_counter++; + this_cpu_inc(*dd->int_counter); /* Clear the interrupt bit we expect to be set. */ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? @@ -3718,7 +3709,8 @@ static int qib_do_7322_reset(struct qib_devdata *dd) dd->pport->cpspec->ibsymdelta = 0; dd->pport->cpspec->iblnkerrdelta = 0; dd->pport->cpspec->ibmalfdelta = 0; - dd->int_counter = 0; /* so we check interrupts work again */ + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); /* * Keep chip from being accessed until we are ready. Use @@ -6190,21 +6182,20 @@ static int setup_txselect(const char *str, struct kernel_param *kp) { struct qib_devdata *dd; unsigned long val; - int ret; - + char *n; if (strlen(str) >= MAX_ATTEN_LEN) { pr_info("txselect_values string too long\n"); return -ENOSPC; } - ret = kstrtoul(str, 0, &val); - if (ret || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + val = simple_strtoul(str, &n, 0); + if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ)) { pr_info("txselect_values must start with a number < %d\n", TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ); - return ret ? ret : -EINVAL; + return -EINVAL; } - strcpy(txselect_list, str); + list_for_each_entry(dd, &qib_dev_list, list) if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322) set_no_qsfp_atten(dd, 1); @@ -6553,7 +6544,11 @@ static int qib_init_7322_variables(struct qib_devdata *dd) } dd->num_pports++; - qib_init_pportdata(ppd, dd, pidx, dd->num_pports); + ret = qib_init_pportdata(ppd, dd, pidx, dd->num_pports); + if (ret) { + dd->num_pports--; + goto bail; + } ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; ppd->link_width_enabled = IB_WIDTH_4X; diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 24e802f4ea2..8d3c78ddc90 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -130,7 +130,6 @@ void qib_set_ctxtcnt(struct qib_devdata *dd) int qib_create_ctxts(struct qib_devdata *dd) { unsigned i; - int ret; int local_node_id = pcibus_to_node(dd->pcidev->bus); if (local_node_id < 0) @@ -145,8 +144,7 @@ int qib_create_ctxts(struct qib_devdata *dd) if (!dd->rcd) { qib_dev_err(dd, "Unable to allocate ctxtdata array, failing\n"); - ret = -ENOMEM; - goto done; + return -ENOMEM; } /* create (one or more) kctxt */ @@ -163,15 +161,14 @@ int qib_create_ctxts(struct qib_devdata *dd) if (!rcd) { qib_dev_err(dd, "Unable to allocate ctxtdata for Kernel ctxt, failing\n"); - ret = -ENOMEM; - goto done; + kfree(dd->rcd); + dd->rcd = NULL; + return -ENOMEM; } rcd->pkeys[0] = QIB_DEFAULT_P_KEY; rcd->seq_cnt = 1; } - ret = 0; -done: - return ret; + return 0; } /* @@ -233,7 +230,7 @@ struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt, /* * Common code for initializing the physical port structure. */ -void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, +int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, u8 hw_pidx, u8 port) { int size; @@ -243,6 +240,7 @@ void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, spin_lock_init(&ppd->sdma_lock); spin_lock_init(&ppd->lflags_lock); + spin_lock_init(&ppd->cc_shadow_lock); init_waitqueue_head(&ppd->state_wait); init_timer(&ppd->symerr_clear_timer); @@ -250,8 +248,10 @@ void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, ppd->symerr_clear_timer.data = (unsigned long)ppd; ppd->qib_wq = NULL; - - spin_lock_init(&ppd->cc_shadow_lock); + ppd->ibport_data.pmastats = + alloc_percpu(struct qib_pma_counters); + if (!ppd->ibport_data.pmastats) + return -ENOMEM; if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) goto bail; @@ -299,7 +299,7 @@ void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, goto bail_3; } - return; + return 0; bail_3: kfree(ppd->ccti_entries_shadow); @@ -313,7 +313,7 @@ bail_1: bail: /* User is intentionally disabling the congestion control agent */ if (!qib_cc_table_size) - return; + return 0; if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) { qib_cc_table_size = 0; @@ -324,7 +324,7 @@ bail: qib_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port); - return; + return 0; } static int init_pioavailregs(struct qib_devdata *dd) @@ -525,6 +525,7 @@ static void enable_chip(struct qib_devdata *dd) static void verify_interrupt(unsigned long opaque) { struct qib_devdata *dd = (struct qib_devdata *) opaque; + u64 int_counter; if (!dd) return; /* being torn down */ @@ -533,7 +534,8 @@ static void verify_interrupt(unsigned long opaque) * If we don't have a lid or any interrupts, let the user know and * don't bother checking again. */ - if (dd->int_counter == 0) { + int_counter = qib_int_counter(dd) - dd->z_int_counter; + if (int_counter == 0) { if (!dd->f_intr_fallback(dd)) dev_err(&dd->pcidev->dev, "No interrupts detected, not usable.\n"); @@ -633,6 +635,12 @@ wq_error: return -ENOMEM; } +static void qib_free_pportdata(struct qib_pportdata *ppd) +{ + free_percpu(ppd->ibport_data.pmastats); + ppd->ibport_data.pmastats = NULL; +} + /** * qib_init - do the actual initialization sequence on the chip * @dd: the qlogic_ib device @@ -920,6 +928,7 @@ static void qib_shutdown_device(struct qib_devdata *dd) destroy_workqueue(ppd->qib_wq); ppd->qib_wq = NULL; } + qib_free_pportdata(ppd); } qib_update_eeprom_log(dd); @@ -1079,9 +1088,34 @@ void qib_free_devdata(struct qib_devdata *dd) #ifdef CONFIG_DEBUG_FS qib_dbg_ibdev_exit(&dd->verbs_dev); #endif + free_percpu(dd->int_counter); ib_dealloc_device(&dd->verbs_dev.ibdev); } +u64 qib_int_counter(struct qib_devdata *dd) +{ + int cpu; + u64 int_counter = 0; + + for_each_possible_cpu(cpu) + int_counter += *per_cpu_ptr(dd->int_counter, cpu); + return int_counter; +} + +u64 qib_sps_ints(void) +{ + unsigned long flags; + struct qib_devdata *dd; + u64 sps_ints = 0; + + spin_lock_irqsave(&qib_devs_lock, flags); + list_for_each_entry(dd, &qib_dev_list, list) { + sps_ints += qib_int_counter(dd); + } + spin_unlock_irqrestore(&qib_devs_lock, flags); + return sps_ints; +} + /* * Allocate our primary per-unit data structure. Must be done via verbs * allocator, because the verbs cleanup process both does cleanup and @@ -1097,14 +1131,10 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) int ret; dd = (struct qib_devdata *) ib_alloc_device(sizeof(*dd) + extra); - if (!dd) { - dd = ERR_PTR(-ENOMEM); - goto bail; - } + if (!dd) + return ERR_PTR(-ENOMEM); -#ifdef CONFIG_DEBUG_FS - qib_dbg_ibdev_init(&dd->verbs_dev); -#endif + INIT_LIST_HEAD(&dd->list); idr_preload(GFP_KERNEL); spin_lock_irqsave(&qib_devs_lock, flags); @@ -1121,11 +1151,13 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) if (ret < 0) { qib_early_err(&pdev->dev, "Could not allocate unit ID: error %d\n", -ret); -#ifdef CONFIG_DEBUG_FS - qib_dbg_ibdev_exit(&dd->verbs_dev); -#endif - ib_dealloc_device(&dd->verbs_dev.ibdev); - dd = ERR_PTR(ret); + goto bail; + } + dd->int_counter = alloc_percpu(u64); + if (!dd->int_counter) { + ret = -ENOMEM; + qib_early_err(&pdev->dev, + "Could not allocate per-cpu int_counter\n"); goto bail; } @@ -1139,9 +1171,15 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) qib_early_err(&pdev->dev, "Could not alloc cpulist info, cpu affinity might be wrong\n"); } - -bail: +#ifdef CONFIG_DEBUG_FS + qib_dbg_ibdev_init(&dd->verbs_dev); +#endif return dd; +bail: + if (!list_empty(&dd->list)) + list_del_init(&dd->list); + ib_dealloc_device(&dd->verbs_dev.ibdev); + return ERR_PTR(ret);; } /* @@ -1234,7 +1272,7 @@ static int qib_notify_dca(struct notifier_block *nb, unsigned long event, * Do all the generic driver unit- and chip-independent memory * allocation and initialization. */ -static int __init qlogic_ib_init(void) +static int __init qib_ib_init(void) { int ret; @@ -1278,12 +1316,12 @@ bail: return ret; } -module_init(qlogic_ib_init); +module_init(qib_ib_init); /* * Do the non-unit driver cleanup, memory free, etc. at unload. */ -static void __exit qlogic_ib_cleanup(void) +static void __exit qib_ib_cleanup(void) { int ret; @@ -1308,7 +1346,7 @@ static void __exit qlogic_ib_cleanup(void) qib_dev_cleanup(); } -module_exit(qlogic_ib_cleanup); +module_exit(qib_ib_cleanup); /* this can only be called after a successful initialization */ static void cleanup_device_data(struct qib_devdata *dd) diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index ccb119143d2..22c720e5740 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -1028,7 +1028,7 @@ static int set_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys) event.event = IB_EVENT_PKEY_CHANGE; event.device = &dd->verbs_dev.ibdev; - event.element.port_num = 1; + event.element.port_num = port; ib_dispatch_event(&event); } return 0; @@ -1634,6 +1634,23 @@ static int pma_get_portcounters_cong(struct ib_pma_mad *pmp, return reply((struct ib_smp *)pmp); } +static void qib_snapshot_pmacounters( + struct qib_ibport *ibp, + struct qib_pma_counters *pmacounters) +{ + struct qib_pma_counters *p; + int cpu; + + memset(pmacounters, 0, sizeof(*pmacounters)); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(ibp->pmastats, cpu); + pmacounters->n_unicast_xmit += p->n_unicast_xmit; + pmacounters->n_unicast_rcv += p->n_unicast_rcv; + pmacounters->n_multicast_xmit += p->n_multicast_xmit; + pmacounters->n_multicast_rcv += p->n_multicast_rcv; + } +} + static int pma_get_portcounters_ext(struct ib_pma_mad *pmp, struct ib_device *ibdev, u8 port) { @@ -1642,6 +1659,7 @@ static int pma_get_portcounters_ext(struct ib_pma_mad *pmp, struct qib_ibport *ibp = to_iport(ibdev, port); struct qib_pportdata *ppd = ppd_from_ibp(ibp); u64 swords, rwords, spkts, rpkts, xwait; + struct qib_pma_counters pma; u8 port_select = p->port_select; memset(pmp->data, 0, sizeof(pmp->data)); @@ -1664,10 +1682,17 @@ static int pma_get_portcounters_ext(struct ib_pma_mad *pmp, p->port_rcv_data = cpu_to_be64(rwords); p->port_xmit_packets = cpu_to_be64(spkts); p->port_rcv_packets = cpu_to_be64(rpkts); - p->port_unicast_xmit_packets = cpu_to_be64(ibp->n_unicast_xmit); - p->port_unicast_rcv_packets = cpu_to_be64(ibp->n_unicast_rcv); - p->port_multicast_xmit_packets = cpu_to_be64(ibp->n_multicast_xmit); - p->port_multicast_rcv_packets = cpu_to_be64(ibp->n_multicast_rcv); + + qib_snapshot_pmacounters(ibp, &pma); + + p->port_unicast_xmit_packets = cpu_to_be64(pma.n_unicast_xmit + - ibp->z_unicast_xmit); + p->port_unicast_rcv_packets = cpu_to_be64(pma.n_unicast_rcv + - ibp->z_unicast_rcv); + p->port_multicast_xmit_packets = cpu_to_be64(pma.n_multicast_xmit + - ibp->z_multicast_xmit); + p->port_multicast_rcv_packets = cpu_to_be64(pma.n_multicast_rcv + - ibp->z_multicast_rcv); bail: return reply((struct ib_smp *) pmp); @@ -1795,6 +1820,7 @@ static int pma_set_portcounters_ext(struct ib_pma_mad *pmp, struct qib_ibport *ibp = to_iport(ibdev, port); struct qib_pportdata *ppd = ppd_from_ibp(ibp); u64 swords, rwords, spkts, rpkts, xwait; + struct qib_pma_counters pma; qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait); @@ -1810,17 +1836,19 @@ static int pma_set_portcounters_ext(struct ib_pma_mad *pmp, if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS) ibp->z_port_rcv_packets = rpkts; + qib_snapshot_pmacounters(ibp, &pma); + if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS) - ibp->n_unicast_xmit = 0; + ibp->z_unicast_xmit = pma.n_unicast_xmit; if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS) - ibp->n_unicast_rcv = 0; + ibp->z_unicast_rcv = pma.n_unicast_rcv; if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS) - ibp->n_multicast_xmit = 0; + ibp->z_multicast_xmit = pma.n_multicast_xmit; if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS) - ibp->n_multicast_rcv = 0; + ibp->z_multicast_rcv = pma.n_multicast_rcv; return pma_get_portcounters_ext(pmp, ibdev, port); } diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h index 28874f8606f..941d4d50d8e 100644 --- a/drivers/infiniband/hw/qib/qib_mad.h +++ b/drivers/infiniband/hw/qib/qib_mad.h @@ -54,7 +54,7 @@ struct ib_node_info { __be32 revision; u8 local_port_num; u8 vendor_id[3]; -} __attribute__ ((packed)); +} __packed; struct ib_mad_notice_attr { u8 generic_type; @@ -73,7 +73,7 @@ struct ib_mad_notice_attr { __be16 reserved; __be16 lid; /* where violation happened */ u8 port_num; /* where violation happened */ - } __attribute__ ((packed)) ntc_129_131; + } __packed ntc_129_131; struct { __be16 reserved; @@ -83,14 +83,14 @@ struct ib_mad_notice_attr { __be32 new_cap_mask; /* new capability mask */ u8 reserved3; u8 change_flags; /* low 3 bits only */ - } __attribute__ ((packed)) ntc_144; + } __packed ntc_144; struct { __be16 reserved; __be16 lid; /* lid where sys guid changed */ __be16 reserved2; __be64 new_sys_guid; - } __attribute__ ((packed)) ntc_145; + } __packed ntc_145; struct { __be16 reserved; @@ -104,7 +104,7 @@ struct ib_mad_notice_attr { u8 reserved3; u8 dr_trunc_hop; u8 dr_rtn_path[30]; - } __attribute__ ((packed)) ntc_256; + } __packed ntc_256; struct { __be16 reserved; @@ -115,7 +115,7 @@ struct ib_mad_notice_attr { __be32 qp2; /* high 8 bits reserved */ union ib_gid gid1; union ib_gid gid2; - } __attribute__ ((packed)) ntc_257_258; + } __packed ntc_257_258; } details; }; @@ -209,7 +209,7 @@ struct ib_pma_portcounters_cong { __be64 port_rcv_packets; __be64 port_xmit_wait; __be64 port_adr_events; -} __attribute__ ((packed)); +} __packed; #define IB_PMA_CONG_HW_CONTROL_TIMER 0x00 #define IB_PMA_CONG_HW_CONTROL_SAMPLE 0x01 diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c index e6687ded821..9bbb55347cc 100644 --- a/drivers/infiniband/hw/qib/qib_mr.c +++ b/drivers/infiniband/hw/qib/qib_mr.c @@ -232,8 +232,8 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { struct qib_mr *mr; struct ib_umem *umem; - struct ib_umem_chunk *chunk; - int n, m, i; + struct scatterlist *sg; + int n, m, entry; struct ib_mr *ret; if (length == 0) { @@ -246,9 +246,7 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ERR(umem)) return (void *) umem; - n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - n += chunk->nents; + n = umem->nmap; mr = alloc_mr(n, pd); if (IS_ERR(mr)) { @@ -268,11 +266,10 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->mr.page_shift = ilog2(umem->page_size); m = 0; n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) { - for (i = 0; i < chunk->nents; i++) { + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { void *vaddr; - vaddr = page_address(sg_page(&chunk->page_list[i])); + vaddr = page_address(sg_page(sg)); if (!vaddr) { ret = ERR_PTR(-EINVAL); goto bail; @@ -284,7 +281,6 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, m++; n = 0; } - } } ret = &mr->ibmr; diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 3f14009fb66..61a0046efb7 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -51,8 +51,8 @@ * file calls, even though this violates some * expectations of harmlessness. */ -static int qib_tune_pcie_caps(struct qib_devdata *); -static int qib_tune_pcie_coalesce(struct qib_devdata *); +static void qib_tune_pcie_caps(struct qib_devdata *); +static void qib_tune_pcie_coalesce(struct qib_devdata *); /* * Do all the common PCIe setup and initialization. @@ -197,46 +197,47 @@ static void qib_msix_setup(struct qib_devdata *dd, int pos, u32 *msixcnt, struct qib_msix_entry *qib_msix_entry) { int ret; - u32 tabsize = 0; - u16 msix_flags; + int nvec = *msixcnt; struct msix_entry *msix_entry; int i; + ret = pci_msix_vec_count(dd->pcidev); + if (ret < 0) + goto do_intx; + + nvec = min(nvec, ret); + /* We can't pass qib_msix_entry array to qib_msix_setup * so use a dummy msix_entry array and copy the allocated * irq back to the qib_msix_entry array. */ - msix_entry = kmalloc(*msixcnt * sizeof(*msix_entry), GFP_KERNEL); - if (!msix_entry) { - ret = -ENOMEM; + msix_entry = kmalloc(nvec * sizeof(*msix_entry), GFP_KERNEL); + if (!msix_entry) goto do_intx; - } - for (i = 0; i < *msixcnt; i++) + + for (i = 0; i < nvec; i++) msix_entry[i] = qib_msix_entry[i].msix; - pci_read_config_word(dd->pcidev, pos + PCI_MSIX_FLAGS, &msix_flags); - tabsize = 1 + (msix_flags & PCI_MSIX_FLAGS_QSIZE); - if (tabsize > *msixcnt) - tabsize = *msixcnt; - ret = pci_enable_msix(dd->pcidev, msix_entry, tabsize); - if (ret > 0) { - tabsize = ret; - ret = pci_enable_msix(dd->pcidev, msix_entry, tabsize); - } -do_intx: - if (ret) { - qib_dev_err(dd, - "pci_enable_msix %d vectors failed: %d, falling back to INTx\n", - tabsize, ret); - tabsize = 0; - } - for (i = 0; i < tabsize; i++) + ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec); + if (ret < 0) + goto free_msix_entry; + else + nvec = ret; + + for (i = 0; i < nvec; i++) qib_msix_entry[i].msix = msix_entry[i]; + kfree(msix_entry); - *msixcnt = tabsize; + *msixcnt = nvec; + return; - if (ret) - qib_enable_intx(dd->pcidev); +free_msix_entry: + kfree(msix_entry); +do_intx: + qib_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, " + "falling back to INTx\n", nvec, ret); + *msixcnt = 0; + qib_enable_intx(dd->pcidev); } /** @@ -476,30 +477,6 @@ void qib_pcie_reenable(struct qib_devdata *dd, u16 cmd, u8 iline, u8 cline) "pci_enable_device failed after reset: %d\n", r); } -/* code to adjust PCIe capabilities. */ - -static int fld2val(int wd, int mask) -{ - int lsbmask; - - if (!mask) - return 0; - wd &= mask; - lsbmask = mask ^ (mask & (mask - 1)); - wd /= lsbmask; - return wd; -} - -static int val2fld(int wd, int mask) -{ - int lsbmask; - - if (!mask) - return 0; - lsbmask = mask ^ (mask & (mask - 1)); - wd *= lsbmask; - return wd; -} static int qib_pcie_coalesce; module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO); @@ -511,7 +488,7 @@ MODULE_PARM_DESC(pcie_coalesce, "tune PCIe colescing on some Intel chipsets"); * of these chipsets, with some BIOS settings, and enabling it on those * systems may result in the system crashing, and/or data corruption. */ -static int qib_tune_pcie_coalesce(struct qib_devdata *dd) +static void qib_tune_pcie_coalesce(struct qib_devdata *dd) { int r; struct pci_dev *parent; @@ -519,18 +496,18 @@ static int qib_tune_pcie_coalesce(struct qib_devdata *dd) u32 mask, bits, val; if (!qib_pcie_coalesce) - return 0; + return; /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; if (parent->bus->parent) { qib_devinfo(dd->pcidev, "Parent not root\n"); - return 1; + return; } if (!pci_is_pcie(parent)) - return 1; + return; if (parent->vendor != 0x8086) - return 1; + return; /* * - bit 12: Max_rdcmp_Imt_EN: need to set to 1 @@ -563,13 +540,12 @@ static int qib_tune_pcie_coalesce(struct qib_devdata *dd) mask = (3U << 24) | (7U << 10); } else { /* not one of the chipsets that we know about */ - return 1; + return; } pci_read_config_dword(parent, 0x48, &val); val &= ~mask; val |= bits; r = pci_write_config_dword(parent, 0x48, val); - return 0; } /* @@ -580,55 +556,44 @@ static int qib_pcie_caps; module_param_named(pcie_caps, qib_pcie_caps, int, S_IRUGO); MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); -static int qib_tune_pcie_caps(struct qib_devdata *dd) +static void qib_tune_pcie_caps(struct qib_devdata *dd) { - int ret = 1; /* Assume the worst */ struct pci_dev *parent; - u16 pcaps, pctl, ecaps, ectl; - int rc_sup, ep_sup; - int rc_cur, ep_cur; + u16 rc_mpss, rc_mps, ep_mpss, ep_mps; + u16 rc_mrrs, ep_mrrs, max_mrrs; /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; - if (parent->bus->parent) { + if (!pci_is_root_bus(parent->bus)) { qib_devinfo(dd->pcidev, "Parent not root\n"); - goto bail; + return; } if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) - goto bail; - pcie_capability_read_word(parent, PCI_EXP_DEVCAP, &pcaps); - pcie_capability_read_word(parent, PCI_EXP_DEVCTL, &pctl); + return; + + rc_mpss = parent->pcie_mpss; + rc_mps = ffs(pcie_get_mps(parent)) - 8; /* Find out supported and configured values for endpoint (us) */ - pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCAP, &ecaps); - pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); + ep_mpss = dd->pcidev->pcie_mpss; + ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8; - ret = 0; /* Find max payload supported by root, endpoint */ - rc_sup = fld2val(pcaps, PCI_EXP_DEVCAP_PAYLOAD); - ep_sup = fld2val(ecaps, PCI_EXP_DEVCAP_PAYLOAD); - if (rc_sup > ep_sup) - rc_sup = ep_sup; - - rc_cur = fld2val(pctl, PCI_EXP_DEVCTL_PAYLOAD); - ep_cur = fld2val(ectl, PCI_EXP_DEVCTL_PAYLOAD); + if (rc_mpss > ep_mpss) + rc_mpss = ep_mpss; /* If Supported greater than limit in module param, limit it */ - if (rc_sup > (qib_pcie_caps & 7)) - rc_sup = qib_pcie_caps & 7; + if (rc_mpss > (qib_pcie_caps & 7)) + rc_mpss = qib_pcie_caps & 7; /* If less than (allowed, supported), bump root payload */ - if (rc_sup > rc_cur) { - rc_cur = rc_sup; - pctl = (pctl & ~PCI_EXP_DEVCTL_PAYLOAD) | - val2fld(rc_cur, PCI_EXP_DEVCTL_PAYLOAD); - pcie_capability_write_word(parent, PCI_EXP_DEVCTL, pctl); + if (rc_mpss > rc_mps) { + rc_mps = rc_mpss; + pcie_set_mps(parent, 128 << rc_mps); } /* If less than (allowed, supported), bump endpoint payload */ - if (rc_sup > ep_cur) { - ep_cur = rc_sup; - ectl = (ectl & ~PCI_EXP_DEVCTL_PAYLOAD) | - val2fld(ep_cur, PCI_EXP_DEVCTL_PAYLOAD); - pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); + if (rc_mpss > ep_mps) { + ep_mps = rc_mpss; + pcie_set_mps(dd->pcidev, 128 << ep_mps); } /* @@ -636,26 +601,22 @@ static int qib_tune_pcie_caps(struct qib_devdata *dd) * No field for max supported, but PCIe spec limits it to 4096, * which is code '5' (log2(4096) - 7) */ - rc_sup = 5; - if (rc_sup > ((qib_pcie_caps >> 4) & 7)) - rc_sup = (qib_pcie_caps >> 4) & 7; - rc_cur = fld2val(pctl, PCI_EXP_DEVCTL_READRQ); - ep_cur = fld2val(ectl, PCI_EXP_DEVCTL_READRQ); - - if (rc_sup > rc_cur) { - rc_cur = rc_sup; - pctl = (pctl & ~PCI_EXP_DEVCTL_READRQ) | - val2fld(rc_cur, PCI_EXP_DEVCTL_READRQ); - pcie_capability_write_word(parent, PCI_EXP_DEVCTL, pctl); + max_mrrs = 5; + if (max_mrrs > ((qib_pcie_caps >> 4) & 7)) + max_mrrs = (qib_pcie_caps >> 4) & 7; + + max_mrrs = 128 << max_mrrs; + rc_mrrs = pcie_get_readrq(parent); + ep_mrrs = pcie_get_readrq(dd->pcidev); + + if (max_mrrs > rc_mrrs) { + rc_mrrs = max_mrrs; + pcie_set_readrq(parent, rc_mrrs); } - if (rc_sup > ep_cur) { - ep_cur = rc_sup; - ectl = (ectl & ~PCI_EXP_DEVCTL_READRQ) | - val2fld(ep_cur, PCI_EXP_DEVCTL_READRQ); - pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); + if (max_mrrs > ep_mrrs) { + ep_mrrs = max_mrrs; + pcie_set_readrq(dd->pcidev, ep_mrrs); } -bail: - return ret; } /* End of PCIe capability tuning */ diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 3cca55b51e5..7fcc150d603 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -585,7 +585,7 @@ int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask)) + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) goto inval; if (attr_mask & IB_QP_AV) { @@ -985,7 +985,8 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, struct ib_qp *ret; if (init_attr->cap.max_send_sge > ib_qib_max_sges || - init_attr->cap.max_send_wr > ib_qib_max_qp_wrs) { + init_attr->cap.max_send_wr > ib_qib_max_qp_wrs || + init_attr->create_flags) { ret = ERR_PTR(-EINVAL); goto bail; } diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 3ab341320ea..2f2501890c4 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -752,7 +752,7 @@ void qib_send_rc_ack(struct qib_qp *qp) qib_flush_wc(); qib_sendbuf_done(dd, pbufn); - ibp->n_unicast_xmit++; + this_cpu_inc(ibp->pmastats->n_unicast_xmit); goto done; queue_ack: diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index 357b6cfcd46..4c07a8b34ff 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -703,6 +703,7 @@ void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr, ohdr->bth[0] = cpu_to_be32(bth0); ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); ohdr->bth[2] = cpu_to_be32(bth2); + this_cpu_inc(ibp->pmastats->n_unicast_xmit); } /** diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index d6c7fe7f88d..aaf7039f8ed 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -57,13 +57,20 @@ static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) struct qib_sge *sge; struct ib_wc wc; u32 length; + enum ib_qp_type sqptype, dqptype; qp = qib_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn); if (!qp) { ibp->n_pkt_drops++; return; } - if (qp->ibqp.qp_type != sqp->ibqp.qp_type || + + sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : sqp->ibqp.qp_type; + dqptype = qp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : qp->ibqp.qp_type; + + if (dqptype != sqptype || !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) { ibp->n_pkt_drops++; goto drop; @@ -273,11 +280,11 @@ int qib_make_ud_req(struct qib_qp *qp) ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr; if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE) { if (ah_attr->dlid != QIB_PERMISSIVE_LID) - ibp->n_multicast_xmit++; + this_cpu_inc(ibp->pmastats->n_multicast_xmit); else - ibp->n_unicast_xmit++; + this_cpu_inc(ibp->pmastats->n_unicast_xmit); } else { - ibp->n_unicast_xmit++; + this_cpu_inc(ibp->pmastats->n_unicast_xmit); lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1); if (unlikely(lid == ppd->lid)) { /* diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index d0a0ea0c14d..d2806cae234 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -52,6 +52,17 @@ /* attempt to drain the queue for 5secs */ #define QIB_USER_SDMA_DRAIN_TIMEOUT 500 +/* + * track how many times a process open this driver. + */ +static struct rb_root qib_user_sdma_rb_root = RB_ROOT; + +struct qib_user_sdma_rb_node { + struct rb_node node; + int refcount; + pid_t pid; +}; + struct qib_user_sdma_pkt { struct list_head list; /* list element */ @@ -120,15 +131,60 @@ struct qib_user_sdma_queue { /* dma page table */ struct rb_root dma_pages_root; + struct qib_user_sdma_rb_node *sdma_rb_node; + /* protect everything above... */ struct mutex lock; }; +static struct qib_user_sdma_rb_node * +qib_user_sdma_rb_search(struct rb_root *root, pid_t pid) +{ + struct qib_user_sdma_rb_node *sdma_rb_node; + struct rb_node *node = root->rb_node; + + while (node) { + sdma_rb_node = container_of(node, + struct qib_user_sdma_rb_node, node); + if (pid < sdma_rb_node->pid) + node = node->rb_left; + else if (pid > sdma_rb_node->pid) + node = node->rb_right; + else + return sdma_rb_node; + } + return NULL; +} + +static int +qib_user_sdma_rb_insert(struct rb_root *root, struct qib_user_sdma_rb_node *new) +{ + struct rb_node **node = &(root->rb_node); + struct rb_node *parent = NULL; + struct qib_user_sdma_rb_node *got; + + while (*node) { + got = container_of(*node, struct qib_user_sdma_rb_node, node); + parent = *node; + if (new->pid < got->pid) + node = &((*node)->rb_left); + else if (new->pid > got->pid) + node = &((*node)->rb_right); + else + return 0; + } + + rb_link_node(&new->node, parent, node); + rb_insert_color(&new->node, root); + return 1; +} + struct qib_user_sdma_queue * qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt) { struct qib_user_sdma_queue *pq = kmalloc(sizeof(struct qib_user_sdma_queue), GFP_KERNEL); + struct qib_user_sdma_rb_node *sdma_rb_node; if (!pq) goto done; @@ -138,6 +194,7 @@ qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt) pq->num_pending = 0; pq->num_sending = 0; pq->added = 0; + pq->sdma_rb_node = NULL; INIT_LIST_HEAD(&pq->sent); spin_lock_init(&pq->sent_lock); @@ -163,8 +220,30 @@ qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt) pq->dma_pages_root = RB_ROOT; + sdma_rb_node = qib_user_sdma_rb_search(&qib_user_sdma_rb_root, + current->pid); + if (sdma_rb_node) { + sdma_rb_node->refcount++; + } else { + int ret; + sdma_rb_node = kmalloc(sizeof( + struct qib_user_sdma_rb_node), GFP_KERNEL); + if (!sdma_rb_node) + goto err_rb; + + sdma_rb_node->refcount = 1; + sdma_rb_node->pid = current->pid; + + ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root, + sdma_rb_node); + BUG_ON(ret == 0); + } + pq->sdma_rb_node = sdma_rb_node; + goto done; +err_rb: + dma_pool_destroy(pq->header_cache); err_slab: kmem_cache_destroy(pq->pkt_slab); err_kfree: @@ -594,8 +673,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, else j = npages; - ret = get_user_pages(current, current->mm, addr, - j, 0, 1, pages, NULL); + ret = get_user_pages_fast(addr, j, 0, pages); if (ret != j) { i = 0; j = ret; @@ -1021,8 +1099,13 @@ void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq) if (!pq) return; - kmem_cache_destroy(pq->pkt_slab); + pq->sdma_rb_node->refcount--; + if (pq->sdma_rb_node->refcount == 0) { + rb_erase(&pq->sdma_rb_node->node, &qib_user_sdma_rb_root); + kfree(pq->sdma_rb_node); + } dma_pool_destroy(pq->header_cache); + kmem_cache_destroy(pq->pkt_slab); kfree(pq); } @@ -1242,26 +1325,52 @@ static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, struct qib_user_sdma_queue *pq, struct list_head *pktlist, int count) { - int ret = 0; unsigned long flags; if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE))) return -ECOMM; - spin_lock_irqsave(&ppd->sdma_lock, flags); - - if (unlikely(!__qib_sdma_running(ppd))) { - ret = -ECOMM; - goto unlock; + /* non-blocking mode */ + if (pq->sdma_rb_node->refcount > 1) { + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (unlikely(!__qib_sdma_running(ppd))) { + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return -ECOMM; + } + pq->num_pending += count; + list_splice_tail_init(pktlist, &ppd->sdma_userpending); + qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return 0; } + /* In this case, descriptors from this process are not + * linked to ppd pending queue, interrupt handler + * won't update this process, it is OK to directly + * modify without sdma lock. + */ + + pq->num_pending += count; - list_splice_tail_init(pktlist, &ppd->sdma_userpending); - qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); + /* + * Blocking mode for single rail process, we must + * release/regain sdma_lock to give other process + * chance to make progress. This is important for + * performance. + */ + do { + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (unlikely(!__qib_sdma_running(ppd))) { + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return -ECOMM; + } + qib_user_sdma_send_desc(ppd, pktlist); + if (!list_empty(pktlist)) + qib_sdma_make_progress(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } while (!list_empty(pktlist)); -unlock: - spin_unlock_irqrestore(&ppd->sdma_lock, flags); - return ret; + return 0; } int qib_user_sdma_writev(struct qib_ctxtdata *rcd, @@ -1291,14 +1400,11 @@ int qib_user_sdma_writev(struct qib_ctxtdata *rcd, qib_user_sdma_queue_clean(ppd, pq); while (dim) { - int mxp = 8; + int mxp = 1; int ndesc = 0; - down_write(¤t->mm->mmap_sem); ret = qib_user_sdma_queue_pkts(dd, ppd, pq, iov, dim, &list, &mxp, &ndesc); - up_write(¤t->mm->mmap_sem); - if (ret < 0) goto done_unlock; else { diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 092b0bb1bb7..9bcfbd84298 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -662,7 +662,7 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) mcast = qib_mcast_find(ibp, &hdr->u.l.grh.dgid); if (mcast == NULL) goto drop; - ibp->n_multicast_rcv++; + this_cpu_inc(ibp->pmastats->n_multicast_rcv); list_for_each_entry_rcu(p, &mcast->qp_list, list) qib_qp_rcv(rcd, hdr, 1, data, tlen, p->qp); /* @@ -678,8 +678,8 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) &rcd->lookaside_qp->refcount)) wake_up( &rcd->lookaside_qp->wait); - rcd->lookaside_qp = NULL; - } + rcd->lookaside_qp = NULL; + } } if (!rcd->lookaside_qp) { qp = qib_lookup_qpn(ibp, qp_num); @@ -689,7 +689,7 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) rcd->lookaside_qpn = qp_num; } else qp = rcd->lookaside_qp; - ibp->n_unicast_rcv++; + this_cpu_inc(ibp->pmastats->n_unicast_rcv); qib_qp_rcv(rcd, hdr, lnh == QIB_LRH_GRH, data, tlen, qp); } return; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 012e2c7575a..bfc8948fdd3 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -150,14 +150,14 @@ struct ib_reth { __be64 vaddr; __be32 rkey; __be32 length; -} __attribute__ ((packed)); +} __packed; struct ib_atomic_eth { __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ __be32 rkey; __be64 swap_data; __be64 compare_data; -} __attribute__ ((packed)); +} __packed; struct qib_other_headers { __be32 bth[3]; @@ -178,7 +178,7 @@ struct qib_other_headers { __be32 aeth; struct ib_atomic_eth atomic_eth; } u; -} __attribute__ ((packed)); +} __packed; /* * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes @@ -195,12 +195,12 @@ struct qib_ib_header { } l; struct qib_other_headers oth; } u; -} __attribute__ ((packed)); +} __packed; struct qib_pio_header { __le32 pbc[2]; struct qib_ib_header hdr; -} __attribute__ ((packed)); +} __packed; /* * There is one struct qib_mcast for each multicast GID. @@ -664,6 +664,13 @@ struct qib_opcode_stats_perctx { struct qib_opcode_stats stats[128]; }; +struct qib_pma_counters { + u64 n_unicast_xmit; /* total unicast packets sent */ + u64 n_unicast_rcv; /* total unicast packets received */ + u64 n_multicast_xmit; /* total multicast packets sent */ + u64 n_multicast_rcv; /* total multicast packets received */ +}; + struct qib_ibport { struct qib_qp __rcu *qp0; struct qib_qp __rcu *qp1; @@ -680,10 +687,11 @@ struct qib_ibport { __be64 mkey; __be64 guids[QIB_GUIDS_PER_PORT - 1]; /* writable GUIDs */ u64 tid; /* TID for traps */ - u64 n_unicast_xmit; /* total unicast packets sent */ - u64 n_unicast_rcv; /* total unicast packets received */ - u64 n_multicast_xmit; /* total multicast packets sent */ - u64 n_multicast_rcv; /* total multicast packets received */ + struct qib_pma_counters __percpu *pmastats; + u64 z_unicast_xmit; /* starting count for PMA */ + u64 z_unicast_rcv; /* starting count for PMA */ + u64 z_multicast_xmit; /* starting count for PMA */ + u64 z_multicast_rcv; /* starting count for PMA */ u64 z_symbol_error_counter; /* starting count for PMA */ u64 z_link_error_recovery_counter; /* starting count for PMA */ u64 z_link_downed_counter; /* starting count for PMA */ diff --git a/drivers/infiniband/hw/usnic/Kconfig b/drivers/infiniband/hw/usnic/Kconfig new file mode 100644 index 00000000000..29ab11c34f3 --- /dev/null +++ b/drivers/infiniband/hw/usnic/Kconfig @@ -0,0 +1,10 @@ +config INFINIBAND_USNIC + tristate "Verbs support for Cisco VIC" + depends on NETDEVICES && ETHERNET && INET && PCI && INTEL_IOMMU + select ENIC + select NET_VENDOR_CISCO + select PCI_IOV + select INFINIBAND_USER_ACCESS + ---help--- + This is a low-level driver for Cisco's Virtual Interface + Cards (VICs), including the VIC 1240 and 1280 cards. diff --git a/drivers/infiniband/hw/usnic/Makefile b/drivers/infiniband/hw/usnic/Makefile new file mode 100644 index 00000000000..99fb2db47cd --- /dev/null +++ b/drivers/infiniband/hw/usnic/Makefile @@ -0,0 +1,15 @@ +ccflags-y := -Idrivers/net/ethernet/cisco/enic + +obj-$(CONFIG_INFINIBAND_USNIC)+= usnic_verbs.o + +usnic_verbs-y=\ +usnic_fwd.o \ +usnic_transport.o \ +usnic_uiom.o \ +usnic_uiom_interval_tree.o \ +usnic_vnic.o \ +usnic_ib_main.o \ +usnic_ib_qp_grp.o \ +usnic_ib_sysfs.o \ +usnic_ib_verbs.o \ +usnic_debugfs.o \ diff --git a/drivers/infiniband/hw/usnic/usnic.h b/drivers/infiniband/hw/usnic/usnic.h new file mode 100644 index 00000000000..5be13d8991b --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_H_ +#define USNIC_H_ + +#define DRV_NAME "usnic_verbs" + +#define PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC 0x00cf /* User space NIC */ + +#define DRV_VERSION "1.0.3" +#define DRV_RELDATE "December 19, 2013" + +#endif /* USNIC_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_abi.h b/drivers/infiniband/hw/usnic/usnic_abi.h new file mode 100644 index 00000000000..04a66229584 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_abi.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#ifndef USNIC_ABI_H +#define USNIC_ABI_H + +/* ABI between userspace and kernel */ +#define USNIC_UVERBS_ABI_VERSION 4 + +#define USNIC_QP_GRP_MAX_WQS 8 +#define USNIC_QP_GRP_MAX_RQS 8 +#define USNIC_QP_GRP_MAX_CQS 16 + +enum usnic_transport_type { + USNIC_TRANSPORT_UNKNOWN = 0, + USNIC_TRANSPORT_ROCE_CUSTOM = 1, + USNIC_TRANSPORT_IPV4_UDP = 2, + USNIC_TRANSPORT_MAX = 3, +}; + +struct usnic_transport_spec { + enum usnic_transport_type trans_type; + union { + struct { + uint16_t port_num; + } usnic_roce; + struct { + uint32_t sock_fd; + } udp; + }; +}; + +struct usnic_ib_create_qp_cmd { + struct usnic_transport_spec spec; +}; + +/*TODO: Future - usnic_modify_qp needs to pass in generic filters */ +struct usnic_ib_create_qp_resp { + u32 vfid; + u32 qp_grp_id; + u64 bar_bus_addr; + u32 bar_len; +/* + * WQ, RQ, CQ are explicity specified bc exposing a generic resources inteface + * expands the scope of ABI to many files. + */ + u32 wq_cnt; + u32 rq_cnt; + u32 cq_cnt; + u32 wq_idx[USNIC_QP_GRP_MAX_WQS]; + u32 rq_idx[USNIC_QP_GRP_MAX_RQS]; + u32 cq_idx[USNIC_QP_GRP_MAX_CQS]; + u32 transport; + u32 reserved[9]; +}; + +#endif /* USNIC_ABI_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h new file mode 100644 index 00000000000..39356726614 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_CMN_PKT_HDR_H +#define USNIC_CMN_PKT_HDR_H + +#define USNIC_ROCE_ETHERTYPE (0x8915) +#define USNIC_ROCE_GRH_VER (8) +#define USNIC_PROTO_VER (1) +#define USNIC_ROCE_GRH_VER_SHIFT (4) + +#endif /* USNIC_COMMON_PKT_HDR_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_common_util.h b/drivers/infiniband/hw/usnic/usnic_common_util.h new file mode 100644 index 00000000000..9d737ed5e55 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_common_util.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_CMN_UTIL_H +#define USNIC_CMN_UTIL_H + +static inline void +usnic_mac_to_gid(const char *const mac, char *raw_gid) +{ + raw_gid[0] = 0xfe; + raw_gid[1] = 0x80; + memset(&raw_gid[2], 0, 6); + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +static inline void +usnic_mac_ip_to_gid(const char *const mac, const __be32 inaddr, char *raw_gid) +{ + raw_gid[0] = 0xfe; + raw_gid[1] = 0x80; + memset(&raw_gid[2], 0, 2); + memcpy(&raw_gid[4], &inaddr, 4); + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +static inline void +usnic_write_gid_if_id_from_mac(char *mac, char *raw_gid) +{ + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +#endif /* USNIC_COMMON_UTIL_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c new file mode 100644 index 00000000000..5d13860161a --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/debugfs.h> +#include <linux/module.h> + +#include "usnic.h" +#include "usnic_log.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_transport.h" + +static struct dentry *debugfs_root; +static struct dentry *flows_dentry; + +static ssize_t usnic_debugfs_buildinfo_read(struct file *f, char __user *data, + size_t count, loff_t *ppos) +{ + char buf[500]; + int res; + + if (*ppos > 0) + return 0; + + res = scnprintf(buf, sizeof(buf), + "version: %s\n" + "build date: %s\n", + DRV_VERSION, DRV_RELDATE); + + return simple_read_from_buffer(data, count, ppos, buf, res); +} + +static const struct file_operations usnic_debugfs_buildinfo_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = usnic_debugfs_buildinfo_read +}; + +static ssize_t flowinfo_read(struct file *f, char __user *data, + size_t count, loff_t *ppos) +{ + struct usnic_ib_qp_grp_flow *qp_flow; + int n; + int left; + char *ptr; + char buf[512]; + + qp_flow = f->private_data; + ptr = buf; + left = count; + + if (*ppos > 0) + return 0; + + spin_lock(&qp_flow->qp_grp->lock); + n = scnprintf(ptr, left, + "QP Grp ID: %d Transport: %s ", + qp_flow->qp_grp->grp_id, + usnic_transport_to_str(qp_flow->trans_type)); + UPDATE_PTR_LEFT(n, ptr, left); + if (qp_flow->trans_type == USNIC_TRANSPORT_ROCE_CUSTOM) { + n = scnprintf(ptr, left, "Port_Num:%hu\n", + qp_flow->usnic_roce.port_num); + UPDATE_PTR_LEFT(n, ptr, left); + } else if (qp_flow->trans_type == USNIC_TRANSPORT_IPV4_UDP) { + n = usnic_transport_sock_to_str(ptr, left, + qp_flow->udp.sock); + UPDATE_PTR_LEFT(n, ptr, left); + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + } + spin_unlock(&qp_flow->qp_grp->lock); + + return simple_read_from_buffer(data, count, ppos, buf, ptr - buf); +} + +static const struct file_operations flowinfo_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = flowinfo_read, +}; + +void usnic_debugfs_init(void) +{ + debugfs_root = debugfs_create_dir(DRV_NAME, NULL); + if (IS_ERR(debugfs_root)) { + usnic_err("Failed to create debugfs root dir, check if debugfs is enabled in kernel configuration\n"); + goto out_clear_root; + } + + flows_dentry = debugfs_create_dir("flows", debugfs_root); + if (IS_ERR_OR_NULL(flows_dentry)) { + usnic_err("Failed to create debugfs flow dir with err %ld\n", + PTR_ERR(flows_dentry)); + goto out_free_root; + } + + debugfs_create_file("build-info", S_IRUGO, debugfs_root, + NULL, &usnic_debugfs_buildinfo_ops); + return; + +out_free_root: + debugfs_remove_recursive(debugfs_root); +out_clear_root: + debugfs_root = NULL; +} + +void usnic_debugfs_exit(void) +{ + if (!debugfs_root) + return; + + debugfs_remove_recursive(debugfs_root); + debugfs_root = NULL; +} + +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow) +{ + if (IS_ERR_OR_NULL(flows_dentry)) + return; + + scnprintf(qp_flow->dentry_name, sizeof(qp_flow->dentry_name), + "%u", qp_flow->flow->flow_id); + qp_flow->dbgfs_dentry = debugfs_create_file(qp_flow->dentry_name, + S_IRUGO, + flows_dentry, + qp_flow, + &flowinfo_ops); + if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) { + usnic_err("Failed to create dbg fs entry for flow %u\n", + qp_flow->flow->flow_id); + } +} + +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow) +{ + if (!IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) + debugfs_remove(qp_flow->dbgfs_dentry); +} diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.h b/drivers/infiniband/hw/usnic/usnic_debugfs.h new file mode 100644 index 00000000000..4087d24a88f --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#ifndef USNIC_DEBUGFS_H_ +#define USNIC_DEBUGFS_H_ + +#include "usnic_ib_qp_grp.h" + +void usnic_debugfs_init(void); + +void usnic_debugfs_exit(void); +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow); +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow); + +#endif /*!USNIC_DEBUGFS_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.c b/drivers/infiniband/hw/usnic/usnic_fwd.c new file mode 100644 index 00000000000..e3c9bd9d3ba --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_fwd.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/netdevice.h> +#include <linux/pci.h> + +#include "enic_api.h" +#include "usnic_common_pkt_hdr.h" +#include "usnic_fwd.h" +#include "usnic_log.h" + +static int usnic_fwd_devcmd_locked(struct usnic_fwd_dev *ufdev, int vnic_idx, + enum vnic_devcmd_cmd cmd, u64 *a0, + u64 *a1) +{ + int status; + struct net_device *netdev = ufdev->netdev; + + lockdep_assert_held(&ufdev->lock); + + status = enic_api_devcmd_proxy_by_index(netdev, + vnic_idx, + cmd, + a0, a1, + 1000); + if (status) { + if (status == ERR_EINVAL && cmd == CMD_DEL_FILTER) { + usnic_dbg("Dev %s vnic idx %u cmd %u already deleted", + ufdev->name, vnic_idx, cmd); + } else { + usnic_err("Dev %s vnic idx %u cmd %u failed with status %d\n", + ufdev->name, vnic_idx, cmd, + status); + } + } else { + usnic_dbg("Dev %s vnic idx %u cmd %u success", + ufdev->name, vnic_idx, cmd); + } + + return status; +} + +static int usnic_fwd_devcmd(struct usnic_fwd_dev *ufdev, int vnic_idx, + enum vnic_devcmd_cmd cmd, u64 *a0, u64 *a1) +{ + int status; + + spin_lock(&ufdev->lock); + status = usnic_fwd_devcmd_locked(ufdev, vnic_idx, cmd, a0, a1); + spin_unlock(&ufdev->lock); + + return status; +} + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev) +{ + struct usnic_fwd_dev *ufdev; + + ufdev = kzalloc(sizeof(*ufdev), GFP_KERNEL); + if (!ufdev) + return NULL; + + ufdev->pdev = pdev; + ufdev->netdev = pci_get_drvdata(pdev); + spin_lock_init(&ufdev->lock); + strncpy(ufdev->name, netdev_name(ufdev->netdev), + sizeof(ufdev->name) - 1); + + return ufdev; +} + +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev) +{ + kfree(ufdev); +} + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]) +{ + spin_lock(&ufdev->lock); + memcpy(&ufdev->mac, mac, sizeof(ufdev->mac)); + spin_unlock(&ufdev->lock); +} + +int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr) +{ + int status; + + spin_lock(&ufdev->lock); + if (ufdev->inaddr == 0) { + ufdev->inaddr = inaddr; + status = 0; + } else { + status = -EFAULT; + } + spin_unlock(&ufdev->lock); + + return status; +} + +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->inaddr = 0; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->link_up = 1; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->link_up = 0; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu) +{ + spin_lock(&ufdev->lock); + ufdev->mtu = mtu; + spin_unlock(&ufdev->lock); +} + +static int usnic_fwd_dev_ready_locked(struct usnic_fwd_dev *ufdev) +{ + lockdep_assert_held(&ufdev->lock); + + if (!ufdev->link_up) + return -EPERM; + + return 0; +} + +static int validate_filter_locked(struct usnic_fwd_dev *ufdev, + struct filter *filter) +{ + + lockdep_assert_held(&ufdev->lock); + + if (filter->type == FILTER_IPV4_5TUPLE) { + if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_AD)) + return -EACCES; + if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_PT)) + return -EBUSY; + else if (ufdev->inaddr == 0) + return -EINVAL; + else if (filter->u.ipv4.dst_port == 0) + return -ERANGE; + else if (ntohl(ufdev->inaddr) != filter->u.ipv4.dst_addr) + return -EFAULT; + else + return 0; + } + + return 0; +} + +static void fill_tlv(struct filter_tlv *tlv, struct filter *filter, + struct filter_action *action) +{ + tlv->type = CLSF_TLV_FILTER; + tlv->length = sizeof(struct filter); + *((struct filter *)&tlv->val) = *filter; + + tlv = (struct filter_tlv *)((char *)tlv + sizeof(struct filter_tlv) + + sizeof(struct filter)); + tlv->type = CLSF_TLV_ACTION; + tlv->length = sizeof(struct filter_action); + *((struct filter_action *)&tlv->val) = *action; +} + +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, + struct usnic_filter_action *uaction) +{ + struct filter_tlv *tlv; + struct pci_dev *pdev; + struct usnic_fwd_flow *flow; + uint64_t a0, a1; + uint64_t tlv_size; + dma_addr_t tlv_pa; + int status; + + pdev = ufdev->pdev; + tlv_size = (2*sizeof(struct filter_tlv) + sizeof(struct filter) + + sizeof(struct filter_action)); + + flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + if (!flow) + return ERR_PTR(-ENOMEM); + + tlv = pci_alloc_consistent(pdev, tlv_size, &tlv_pa); + if (!tlv) { + usnic_err("Failed to allocate memory\n"); + status = -ENOMEM; + goto out_free_flow; + } + + fill_tlv(tlv, filter, &uaction->action); + + spin_lock(&ufdev->lock); + status = usnic_fwd_dev_ready_locked(ufdev); + if (status) { + usnic_err("Forwarding dev %s not ready with status %d\n", + ufdev->name, status); + goto out_free_tlv; + } + + status = validate_filter_locked(ufdev, filter); + if (status) { + usnic_err("Failed to validate filter with status %d\n", + status); + goto out_free_tlv; + } + + /* Issue Devcmd */ + a0 = tlv_pa; + a1 = tlv_size; + status = usnic_fwd_devcmd_locked(ufdev, uaction->vnic_idx, + CMD_ADD_FILTER, &a0, &a1); + if (status) { + usnic_err("VF %s Filter add failed with status:%d", + ufdev->name, status); + status = -EFAULT; + goto out_free_tlv; + } else { + usnic_dbg("VF %s FILTER ID:%llu", ufdev->name, a0); + } + + flow->flow_id = (uint32_t) a0; + flow->vnic_idx = uaction->vnic_idx; + flow->ufdev = ufdev; + +out_free_tlv: + spin_unlock(&ufdev->lock); + pci_free_consistent(pdev, tlv_size, tlv, tlv_pa); + if (!status) + return flow; +out_free_flow: + kfree(flow); + return ERR_PTR(status); +} + +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow) +{ + int status; + u64 a0, a1; + + a0 = flow->flow_id; + + status = usnic_fwd_devcmd(flow->ufdev, flow->vnic_idx, + CMD_DEL_FILTER, &a0, &a1); + if (status) { + if (status == ERR_EINVAL) { + usnic_dbg("Filter %u already deleted for VF Idx %u pf: %s status: %d", + flow->flow_id, flow->vnic_idx, + flow->ufdev->name, status); + } else { + usnic_err("PF %s VF Idx %u Filter: %u FILTER DELETE failed with status %d", + flow->ufdev->name, flow->vnic_idx, + flow->flow_id, status); + } + status = 0; + /* + * Log the error and fake success to the caller because if + * a flow fails to be deleted in the firmware, it is an + * unrecoverable error. + */ + } else { + usnic_dbg("PF %s VF Idx %u Filter: %u FILTER DELETED", + flow->ufdev->name, flow->vnic_idx, + flow->flow_id); + } + + kfree(flow); + return status; +} + +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ + int status; + struct net_device *pf_netdev; + u64 a0, a1; + + pf_netdev = ufdev->netdev; + a0 = qp_idx; + a1 = CMD_QP_RQWQ; + + status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_ENABLE, + &a0, &a1); + if (status) { + usnic_err("PF %s VNIC Index %u RQ Index: %u ENABLE Failed with status %d", + netdev_name(pf_netdev), + vnic_idx, + qp_idx, + status); + } else { + usnic_dbg("PF %s VNIC Index %u RQ Index: %u ENABLED", + netdev_name(pf_netdev), + vnic_idx, qp_idx); + } + + return status; +} + +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ + int status; + u64 a0, a1; + struct net_device *pf_netdev; + + pf_netdev = ufdev->netdev; + a0 = qp_idx; + a1 = CMD_QP_RQWQ; + + status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_DISABLE, + &a0, &a1); + if (status) { + usnic_err("PF %s VNIC Index %u RQ Index: %u DISABLE Failed with status %d", + netdev_name(pf_netdev), + vnic_idx, + qp_idx, + status); + } else { + usnic_dbg("PF %s VNIC Index %u RQ Index: %u DISABLED", + netdev_name(pf_netdev), + vnic_idx, + qp_idx); + } + + return status; +} diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.h b/drivers/infiniband/hw/usnic/usnic_fwd.h new file mode 100644 index 00000000000..93713a2230b --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_fwd.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_FWD_H_ +#define USNIC_FWD_H_ + +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/pci.h> +#include <linux/in.h> + +#include "usnic_abi.h" +#include "usnic_common_pkt_hdr.h" +#include "vnic_devcmd.h" + +struct usnic_fwd_dev { + struct pci_dev *pdev; + struct net_device *netdev; + spinlock_t lock; + /* + * The following fields can be read directly off the device. + * However, they should be set by a accessor function, except name, + * which cannot be changed. + */ + bool link_up; + char mac[ETH_ALEN]; + unsigned int mtu; + __be32 inaddr; + char name[IFNAMSIZ+1]; +}; + +struct usnic_fwd_flow { + uint32_t flow_id; + struct usnic_fwd_dev *ufdev; + unsigned int vnic_idx; +}; + +struct usnic_filter_action { + int vnic_idx; + struct filter_action action; +}; + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev); +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev); + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]); +int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr); +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev); +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu); + +/* + * Allocate a flow on this forwarding device. Whoever calls this function, + * must monitor netdev events on ufdev's netdevice. If NETDEV_REBOOT or + * NETDEV_DOWN is seen, flow will no longer function and must be + * immediately freed by calling usnic_dealloc_flow. + */ +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, + struct usnic_filter_action *action); +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow); +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); + +static inline void usnic_fwd_init_usnic_filter(struct filter *filter, + uint32_t usnic_id) +{ + filter->type = FILTER_USNIC_ID; + filter->u.usnic.ethtype = USNIC_ROCE_ETHERTYPE; + filter->u.usnic.flags = FILTER_FIELD_USNIC_ETHTYPE | + FILTER_FIELD_USNIC_ID | + FILTER_FIELD_USNIC_PROTO; + filter->u.usnic.proto_version = (USNIC_ROCE_GRH_VER << + USNIC_ROCE_GRH_VER_SHIFT) | + USNIC_PROTO_VER; + filter->u.usnic.usnic_id = usnic_id; +} + +static inline void usnic_fwd_init_udp_filter(struct filter *filter, + uint32_t daddr, uint16_t dport) +{ + filter->type = FILTER_IPV4_5TUPLE; + filter->u.ipv4.flags = FILTER_FIELD_5TUP_PROTO; + filter->u.ipv4.protocol = PROTO_UDP; + + if (daddr) { + filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_AD; + filter->u.ipv4.dst_addr = daddr; + } + + if (dport) { + filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_PT; + filter->u.ipv4.dst_port = dport; + } +} + +#endif /* !USNIC_FWD_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h new file mode 100644 index 00000000000..e5a9297dd1b --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_H_ +#define USNIC_IB_H_ + +#include <linux/iommu.h> +#include <linux/netdevice.h> + +#include <rdma/ib_verbs.h> + + +#include "usnic.h" +#include "usnic_abi.h" +#include "usnic_vnic.h" + +#define USNIC_IB_PORT_CNT 1 +#define USNIC_IB_NUM_COMP_VECTORS 1 + +extern unsigned int usnic_ib_share_vf; + +struct usnic_ib_ucontext { + struct ib_ucontext ibucontext; + /* Protected by usnic_ib_dev->usdev_lock */ + struct list_head qp_grp_list; + struct list_head link; +}; + +struct usnic_ib_pd { + struct ib_pd ibpd; + struct usnic_uiom_pd *umem_pd; +}; + +struct usnic_ib_mr { + struct ib_mr ibmr; + struct usnic_uiom_reg *umem; +}; + +struct usnic_ib_dev { + struct ib_device ib_dev; + struct pci_dev *pdev; + struct net_device *netdev; + struct usnic_fwd_dev *ufdev; + struct list_head ib_dev_link; + struct list_head vf_dev_list; + struct list_head ctx_list; + struct mutex usdev_lock; + + /* provisioning information */ + struct kref vf_cnt; + unsigned int vf_res_cnt[USNIC_VNIC_RES_TYPE_MAX]; + + /* sysfs vars for QPN reporting */ + struct kobject *qpn_kobj; +}; + +struct usnic_ib_vf { + struct usnic_ib_dev *pf; + spinlock_t lock; + struct usnic_vnic *vnic; + unsigned int qp_grp_ref_cnt; + struct usnic_ib_pd *pd; + struct list_head link; +}; + +static inline +struct usnic_ib_dev *to_usdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct usnic_ib_dev, ib_dev); +} + +static inline +struct usnic_ib_ucontext *to_ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_pd *to_upd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct usnic_ib_pd, ibpd); +} + +static inline +struct usnic_ib_ucontext *to_uucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_mr *to_umr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct usnic_ib_mr, ibmr); +} +void usnic_ib_log_vf(struct usnic_ib_vf *vf); + +#define UPDATE_PTR_LEFT(N, P, L) \ +do { \ + L -= (N); \ + P += (N); \ +} while (0) + +#endif /* USNIC_IB_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c new file mode 100644 index 00000000000..fb6d026f92c --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -0,0 +1,682 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Author: Upinder Malhi <umalhi@cisco.com> + * Author: Anant Deepak <anadeepa@cisco.com> + * Author: Cesare Cantu' <cantuc@cisco.com> + * Author: Jeff Squyres <jsquyres@cisco.com> + * Author: Kiran Thirumalai <kithirum@cisco.com> + * Author: Xuyang Wang <xuywang@cisco.com> + * Author: Reese Faucette <rfaucett@cisco.com> + * + */ + +#include <linux/module.h> +#include <linux/inetdevice.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/errno.h> +#include <linux/pci.h> +#include <linux/netdevice.h> + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> + +#include "usnic_abi.h" +#include "usnic_common_util.h" +#include "usnic_ib.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_log.h" +#include "usnic_fwd.h" +#include "usnic_debugfs.h" +#include "usnic_ib_verbs.h" +#include "usnic_transport.h" +#include "usnic_uiom.h" +#include "usnic_ib_sysfs.h" + +unsigned int usnic_log_lvl = USNIC_LOG_LVL_ERR; +unsigned int usnic_ib_share_vf = 1; + +static const char usnic_version[] = + DRV_NAME ": Cisco VIC (USNIC) Verbs Driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +static DEFINE_MUTEX(usnic_ib_ibdev_list_lock); +static LIST_HEAD(usnic_ib_ibdev_list); + +/* Callback dump funcs */ +static int usnic_ib_dump_vf_hdr(void *obj, char *buf, int buf_sz) +{ + struct usnic_ib_vf *vf = obj; + return scnprintf(buf, buf_sz, "PF: %s ", vf->pf->ib_dev.name); +} +/* End callback dump funcs */ + +static void usnic_ib_dump_vf(struct usnic_ib_vf *vf, char *buf, int buf_sz) +{ + usnic_vnic_dump(vf->vnic, buf, buf_sz, vf, + usnic_ib_dump_vf_hdr, + usnic_ib_qp_grp_dump_hdr, usnic_ib_qp_grp_dump_rows); +} + +void usnic_ib_log_vf(struct usnic_ib_vf *vf) +{ + char buf[1000]; + usnic_ib_dump_vf(vf, buf, sizeof(buf)); + usnic_dbg("%s\n", buf); +} + +/* Start of netdev section */ +static inline const char *usnic_ib_netdev_event_to_string(unsigned long event) +{ + const char *event2str[] = {"NETDEV_NONE", "NETDEV_UP", "NETDEV_DOWN", + "NETDEV_REBOOT", "NETDEV_CHANGE", + "NETDEV_REGISTER", "NETDEV_UNREGISTER", "NETDEV_CHANGEMTU", + "NETDEV_CHANGEADDR", "NETDEV_GOING_DOWN", "NETDEV_FEAT_CHANGE", + "NETDEV_BONDING_FAILOVER", "NETDEV_PRE_UP", + "NETDEV_PRE_TYPE_CHANGE", "NETDEV_POST_TYPE_CHANGE", + "NETDEV_POST_INT", "NETDEV_UNREGISTER_FINAL", "NETDEV_RELEASE", + "NETDEV_NOTIFY_PEERS", "NETDEV_JOIN" + }; + + if (event >= ARRAY_SIZE(event2str)) + return "UNKNOWN_NETDEV_EVENT"; + else + return event2str[event]; +} + +static void usnic_ib_qp_grp_modify_active_to_err(struct usnic_ib_dev *us_ibdev) +{ + struct usnic_ib_ucontext *ctx; + struct usnic_ib_qp_grp *qp_grp; + enum ib_qp_state cur_state; + int status; + + BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + + list_for_each_entry(ctx, &us_ibdev->ctx_list, link) { + list_for_each_entry(qp_grp, &ctx->qp_grp_list, link) { + cur_state = qp_grp->state; + if (cur_state == IB_QPS_INIT || + cur_state == IB_QPS_RTR || + cur_state == IB_QPS_RTS) { + status = usnic_ib_qp_grp_modify(qp_grp, + IB_QPS_ERR, + NULL); + if (status) { + usnic_err("Failed to transistion qp grp %u from %s to %s\n", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string + (cur_state), + usnic_ib_qp_grp_state_to_string + (IB_QPS_ERR)); + } + } + } + } +} + +static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, + unsigned long event) +{ + struct net_device *netdev; + struct ib_event ib_event; + + memset(&ib_event, 0, sizeof(ib_event)); + + mutex_lock(&us_ibdev->usdev_lock); + netdev = us_ibdev->netdev; + switch (event) { + case NETDEV_REBOOT: + usnic_info("PF Reset on %s\n", us_ibdev->ib_dev.name); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + if (!us_ibdev->ufdev->link_up && + netif_carrier_ok(netdev)) { + usnic_fwd_carrier_up(us_ibdev->ufdev); + usnic_info("Link UP on %s\n", us_ibdev->ib_dev.name); + ib_event.event = IB_EVENT_PORT_ACTIVE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else if (us_ibdev->ufdev->link_up && + !netif_carrier_ok(netdev)) { + usnic_fwd_carrier_down(us_ibdev->ufdev); + usnic_info("Link DOWN on %s\n", us_ibdev->ib_dev.name); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else { + usnic_dbg("Ignoring %s on %s\n", + usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + break; + case NETDEV_CHANGEADDR: + if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr, + sizeof(us_ibdev->ufdev->mac))) { + usnic_dbg("Ignoring addr change on %s\n", + us_ibdev->ib_dev.name); + } else { + usnic_info(" %s old mac: %pM new mac: %pM\n", + us_ibdev->ib_dev.name, + us_ibdev->ufdev->mac, + netdev->dev_addr); + usnic_fwd_set_mac(us_ibdev->ufdev, netdev->dev_addr); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } + + break; + case NETDEV_CHANGEMTU: + if (us_ibdev->ufdev->mtu != netdev->mtu) { + usnic_info("MTU Change on %s old: %u new: %u\n", + us_ibdev->ib_dev.name, + us_ibdev->ufdev->mtu, netdev->mtu); + usnic_fwd_set_mtu(us_ibdev->ufdev, netdev->mtu); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + } else { + usnic_dbg("Ignoring MTU change on %s\n", + us_ibdev->ib_dev.name); + } + break; + default: + usnic_dbg("Ignoring event %s on %s", + usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + mutex_unlock(&us_ibdev->usdev_lock); +} + +static int usnic_ib_netdevice_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct usnic_ib_dev *us_ibdev; + + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->netdev == netdev) { + usnic_ib_handle_usdev_event(us_ibdev, event); + break; + } + } + mutex_unlock(&usnic_ib_ibdev_list_lock); + + return NOTIFY_DONE; +} + +static struct notifier_block usnic_ib_netdevice_notifier = { + .notifier_call = usnic_ib_netdevice_event +}; +/* End of netdev section */ + +/* Start of inet section */ +static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct ib_event ib_event; + + mutex_lock(&us_ibdev->usdev_lock); + + switch (event) { + case NETDEV_DOWN: + usnic_info("%s via ip notifiers", + usnic_ib_netdev_event_to_string(event)); + usnic_fwd_del_ipaddr(us_ibdev->ufdev); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + case NETDEV_UP: + usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address); + usnic_info("%s via ip notifiers: ip %pI4", + usnic_ib_netdev_event_to_string(event), + &us_ibdev->ufdev->inaddr); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + default: + usnic_info("Ignoring event %s on %s", + usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + mutex_unlock(&us_ibdev->usdev_lock); + + return NOTIFY_DONE; +} + +static int usnic_ib_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct usnic_ib_dev *us_ibdev; + struct in_ifaddr *ifa = ptr; + struct net_device *netdev = ifa->ifa_dev->dev; + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->netdev == netdev) { + usnic_ib_handle_inet_event(us_ibdev, event, ptr); + break; + } + } + mutex_unlock(&usnic_ib_ibdev_list_lock); + + return NOTIFY_DONE; +} +static struct notifier_block usnic_ib_inetaddr_notifier = { + .notifier_call = usnic_ib_inetaddr_event +}; +/* End of inet section*/ + +/* Start of PF discovery section */ +static void *usnic_ib_device_add(struct pci_dev *dev) +{ + struct usnic_ib_dev *us_ibdev; + union ib_gid gid; + struct in_ifaddr *in; + struct net_device *netdev; + + usnic_dbg("\n"); + netdev = pci_get_drvdata(dev); + + us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev)); + if (IS_ERR_OR_NULL(us_ibdev)) { + usnic_err("Device %s context alloc failed\n", + netdev_name(pci_get_drvdata(dev))); + return ERR_PTR(us_ibdev ? PTR_ERR(us_ibdev) : -EFAULT); + } + + us_ibdev->ufdev = usnic_fwd_dev_alloc(dev); + if (IS_ERR_OR_NULL(us_ibdev->ufdev)) { + usnic_err("Failed to alloc ufdev for %s with err %ld\n", + pci_name(dev), PTR_ERR(us_ibdev->ufdev)); + goto err_dealloc; + } + + mutex_init(&us_ibdev->usdev_lock); + INIT_LIST_HEAD(&us_ibdev->vf_dev_list); + INIT_LIST_HEAD(&us_ibdev->ctx_list); + + us_ibdev->pdev = dev; + us_ibdev->netdev = pci_get_drvdata(dev); + us_ibdev->ib_dev.owner = THIS_MODULE; + us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP; + us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT; + us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; + us_ibdev->ib_dev.dma_device = &dev->dev; + us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; + strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX); + + us_ibdev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + us_ibdev->ib_dev.query_device = usnic_ib_query_device; + us_ibdev->ib_dev.query_port = usnic_ib_query_port; + us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey; + us_ibdev->ib_dev.query_gid = usnic_ib_query_gid; + us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer; + us_ibdev->ib_dev.alloc_pd = usnic_ib_alloc_pd; + us_ibdev->ib_dev.dealloc_pd = usnic_ib_dealloc_pd; + us_ibdev->ib_dev.create_qp = usnic_ib_create_qp; + us_ibdev->ib_dev.modify_qp = usnic_ib_modify_qp; + us_ibdev->ib_dev.query_qp = usnic_ib_query_qp; + us_ibdev->ib_dev.destroy_qp = usnic_ib_destroy_qp; + us_ibdev->ib_dev.create_cq = usnic_ib_create_cq; + us_ibdev->ib_dev.destroy_cq = usnic_ib_destroy_cq; + us_ibdev->ib_dev.reg_user_mr = usnic_ib_reg_mr; + us_ibdev->ib_dev.dereg_mr = usnic_ib_dereg_mr; + us_ibdev->ib_dev.alloc_ucontext = usnic_ib_alloc_ucontext; + us_ibdev->ib_dev.dealloc_ucontext = usnic_ib_dealloc_ucontext; + us_ibdev->ib_dev.mmap = usnic_ib_mmap; + us_ibdev->ib_dev.create_ah = usnic_ib_create_ah; + us_ibdev->ib_dev.destroy_ah = usnic_ib_destroy_ah; + us_ibdev->ib_dev.post_send = usnic_ib_post_send; + us_ibdev->ib_dev.post_recv = usnic_ib_post_recv; + us_ibdev->ib_dev.poll_cq = usnic_ib_poll_cq; + us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq; + us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr; + + + if (ib_register_device(&us_ibdev->ib_dev, NULL)) + goto err_fwd_dealloc; + + usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu); + usnic_fwd_set_mac(us_ibdev->ufdev, us_ibdev->netdev->dev_addr); + if (netif_carrier_ok(us_ibdev->netdev)) + usnic_fwd_carrier_up(us_ibdev->ufdev); + + in = ((struct in_device *)(netdev->ip_ptr))->ifa_list; + if (in != NULL) + usnic_fwd_add_ipaddr(us_ibdev->ufdev, in->ifa_address); + + usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr, + us_ibdev->ufdev->inaddr, &gid.raw[0]); + memcpy(&us_ibdev->ib_dev.node_guid, &gid.global.interface_id, + sizeof(gid.global.interface_id)); + kref_init(&us_ibdev->vf_cnt); + + usnic_info("Added ibdev: %s netdev: %s with mac %pM Link: %u MTU: %u\n", + us_ibdev->ib_dev.name, netdev_name(us_ibdev->netdev), + us_ibdev->ufdev->mac, us_ibdev->ufdev->link_up, + us_ibdev->ufdev->mtu); + return us_ibdev; + +err_fwd_dealloc: + usnic_fwd_dev_free(us_ibdev->ufdev); +err_dealloc: + usnic_err("failed -- deallocing device\n"); + ib_dealloc_device(&us_ibdev->ib_dev); + return NULL; +} + +static void usnic_ib_device_remove(struct usnic_ib_dev *us_ibdev) +{ + usnic_info("Unregistering %s\n", us_ibdev->ib_dev.name); + usnic_ib_sysfs_unregister_usdev(us_ibdev); + usnic_fwd_dev_free(us_ibdev->ufdev); + ib_unregister_device(&us_ibdev->ib_dev); + ib_dealloc_device(&us_ibdev->ib_dev); +} + +static void usnic_ib_undiscover_pf(struct kref *kref) +{ + struct usnic_ib_dev *us_ibdev, *tmp; + struct pci_dev *dev; + bool found = false; + + dev = container_of(kref, struct usnic_ib_dev, vf_cnt)->pdev; + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry_safe(us_ibdev, tmp, + &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->pdev == dev) { + list_del(&us_ibdev->ib_dev_link); + usnic_ib_device_remove(us_ibdev); + found = true; + break; + } + } + + WARN(!found, "Failed to remove PF %s\n", pci_name(dev)); + + mutex_unlock(&usnic_ib_ibdev_list_lock); +} + +static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic) +{ + struct usnic_ib_dev *us_ibdev; + struct pci_dev *parent_pci, *vf_pci; + int err; + + vf_pci = usnic_vnic_get_pdev(vnic); + parent_pci = pci_physfn(vf_pci); + + BUG_ON(!parent_pci); + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->pdev == parent_pci) { + kref_get(&us_ibdev->vf_cnt); + goto out; + } + } + + us_ibdev = usnic_ib_device_add(parent_pci); + if (IS_ERR_OR_NULL(us_ibdev)) { + us_ibdev = us_ibdev ? us_ibdev : ERR_PTR(-EFAULT); + goto out; + } + + err = usnic_ib_sysfs_register_usdev(us_ibdev); + if (err) { + usnic_ib_device_remove(us_ibdev); + us_ibdev = ERR_PTR(err); + goto out; + } + + list_add(&us_ibdev->ib_dev_link, &usnic_ib_ibdev_list); +out: + mutex_unlock(&usnic_ib_ibdev_list_lock); + return us_ibdev; +} +/* End of PF discovery section */ + +/* Start of PCI section */ + +static DEFINE_PCI_DEVICE_TABLE(usnic_ib_pci_ids) = { + {PCI_DEVICE(PCI_VENDOR_ID_CISCO, PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC)}, + {0,} +}; + +static int usnic_ib_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int err; + struct usnic_ib_dev *pf; + struct usnic_ib_vf *vf; + enum usnic_vnic_res_type res_type; + + vf = kzalloc(sizeof(*vf), GFP_KERNEL); + if (!vf) + return -ENOMEM; + + err = pci_enable_device(pdev); + if (err) { + usnic_err("Failed to enable %s with err %d\n", + pci_name(pdev), err); + goto out_clean_vf; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + usnic_err("Failed to request region for %s with err %d\n", + pci_name(pdev), err); + goto out_disable_device; + } + + pci_set_master(pdev); + pci_set_drvdata(pdev, vf); + + vf->vnic = usnic_vnic_alloc(pdev); + if (IS_ERR_OR_NULL(vf->vnic)) { + err = vf->vnic ? PTR_ERR(vf->vnic) : -ENOMEM; + usnic_err("Failed to alloc vnic for %s with err %d\n", + pci_name(pdev), err); + goto out_release_regions; + } + + pf = usnic_ib_discover_pf(vf->vnic); + if (IS_ERR_OR_NULL(pf)) { + usnic_err("Failed to discover pf of vnic %s with err%ld\n", + pci_name(pdev), PTR_ERR(pf)); + err = pf ? PTR_ERR(pf) : -EFAULT; + goto out_clean_vnic; + } + + vf->pf = pf; + spin_lock_init(&vf->lock); + mutex_lock(&pf->usdev_lock); + list_add_tail(&vf->link, &pf->vf_dev_list); + /* + * Save max settings (will be same for each VF, easier to re-write than + * to say "if (!set) { set_values(); set=1; } + */ + for (res_type = USNIC_VNIC_RES_TYPE_EOL+1; + res_type < USNIC_VNIC_RES_TYPE_MAX; + res_type++) { + pf->vf_res_cnt[res_type] = usnic_vnic_res_cnt(vf->vnic, + res_type); + } + + mutex_unlock(&pf->usdev_lock); + + usnic_info("Registering usnic VF %s into PF %s\n", pci_name(pdev), + pf->ib_dev.name); + usnic_ib_log_vf(vf); + return 0; + +out_clean_vnic: + usnic_vnic_free(vf->vnic); +out_release_regions: + pci_set_drvdata(pdev, NULL); + pci_clear_master(pdev); + pci_release_regions(pdev); +out_disable_device: + pci_disable_device(pdev); +out_clean_vf: + kfree(vf); + return err; +} + +static void usnic_ib_pci_remove(struct pci_dev *pdev) +{ + struct usnic_ib_vf *vf = pci_get_drvdata(pdev); + struct usnic_ib_dev *pf = vf->pf; + + mutex_lock(&pf->usdev_lock); + list_del(&vf->link); + mutex_unlock(&pf->usdev_lock); + + kref_put(&pf->vf_cnt, usnic_ib_undiscover_pf); + usnic_vnic_free(vf->vnic); + pci_set_drvdata(pdev, NULL); + pci_clear_master(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); + kfree(vf); + + usnic_info("Removed VF %s\n", pci_name(pdev)); +} + +/* PCI driver entry points */ +static struct pci_driver usnic_ib_pci_driver = { + .name = DRV_NAME, + .id_table = usnic_ib_pci_ids, + .probe = usnic_ib_pci_probe, + .remove = usnic_ib_pci_remove, +}; +/* End of PCI section */ + +/* Start of module section */ +static int __init usnic_ib_init(void) +{ + int err; + + printk_once(KERN_INFO "%s", usnic_version); + + err = usnic_uiom_init(DRV_NAME); + if (err) { + usnic_err("Unable to initalize umem with err %d\n", err); + return err; + } + + if (pci_register_driver(&usnic_ib_pci_driver)) { + usnic_err("Unable to register with PCI\n"); + goto out_umem_fini; + } + + err = register_netdevice_notifier(&usnic_ib_netdevice_notifier); + if (err) { + usnic_err("Failed to register netdev notifier\n"); + goto out_pci_unreg; + } + + err = register_inetaddr_notifier(&usnic_ib_inetaddr_notifier); + if (err) { + usnic_err("Failed to register inet addr notifier\n"); + goto out_unreg_netdev_notifier; + } + + err = usnic_transport_init(); + if (err) { + usnic_err("Failed to initialize transport\n"); + goto out_unreg_inetaddr_notifier; + } + + usnic_debugfs_init(); + + return 0; + +out_unreg_inetaddr_notifier: + unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); +out_unreg_netdev_notifier: + unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); +out_pci_unreg: + pci_unregister_driver(&usnic_ib_pci_driver); +out_umem_fini: + usnic_uiom_fini(); + + return err; +} + +static void __exit usnic_ib_destroy(void) +{ + usnic_dbg("\n"); + usnic_debugfs_exit(); + usnic_transport_fini(); + unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); + unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); + pci_unregister_driver(&usnic_ib_pci_driver); + usnic_uiom_fini(); +} + +MODULE_DESCRIPTION("Cisco VIC (usNIC) Verbs Driver"); +MODULE_AUTHOR("Upinder Malhi <umalhi@cisco.com>"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); +module_param(usnic_log_lvl, uint, S_IRUGO | S_IWUSR); +module_param(usnic_ib_share_vf, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(usnic_log_lvl, " Off=0, Err=1, Info=2, Debug=3"); +MODULE_PARM_DESC(usnic_ib_share_vf, "Off=0, On=1 VF sharing amongst QPs"); +MODULE_DEVICE_TABLE(pci, usnic_ib_pci_ids); + +module_init(usnic_ib_init); +module_exit(usnic_ib_destroy); +/* End of module section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c new file mode 100644 index 00000000000..f8dfd76be89 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/bug.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/spinlock.h> + +#include "usnic_log.h" +#include "usnic_vnic.h" +#include "usnic_fwd.h" +#include "usnic_uiom.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_ib_sysfs.h" +#include "usnic_transport.h" + +#define DFLT_RQ_IDX 0 + +const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return "Rst"; + case IB_QPS_INIT: + return "Init"; + case IB_QPS_RTR: + return "RTR"; + case IB_QPS_RTS: + return "RTS"; + case IB_QPS_SQD: + return "SQD"; + case IB_QPS_SQE: + return "SQE"; + case IB_QPS_ERR: + return "ERR"; + default: + return "UNKOWN STATE"; + + } +} + +int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz) +{ + return scnprintf(buf, buf_sz, "|QPN\t|State\t|PID\t|VF Idx\t|Fil ID"); +} + +int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz) +{ + struct usnic_ib_qp_grp *qp_grp = obj; + struct usnic_ib_qp_grp_flow *default_flow; + if (obj) { + default_flow = list_first_entry(&qp_grp->flows_lst, + struct usnic_ib_qp_grp_flow, link); + return scnprintf(buf, buf_sz, "|%d\t|%s\t|%d\t|%hu\t|%d", + qp_grp->ibqp.qp_num, + usnic_ib_qp_grp_state_to_string( + qp_grp->state), + qp_grp->owner_pid, + usnic_vnic_get_index(qp_grp->vf->vnic), + default_flow->flow->flow_id); + } else { + return scnprintf(buf, buf_sz, "|N/A\t|N/A\t|N/A\t|N/A\t|N/A"); + } +} + +static struct usnic_vnic_res_chunk * +get_qp_res_chunk(struct usnic_ib_qp_grp *qp_grp) +{ + lockdep_assert_held(&qp_grp->lock); + /* + * The QP res chunk, used to derive qp indices, + * are just indices of the RQs + */ + return usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); +} + +static int enable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ + + int status; + int i, vnic_idx; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *res; + + lockdep_assert_held(&qp_grp->lock); + + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + + res_chunk = get_qp_res_chunk(qp_grp); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get qp res with err %ld\n", + PTR_ERR(res_chunk)); + return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; + } + + for (i = 0; i < res_chunk->cnt; i++) { + res = res_chunk->res[i]; + status = usnic_fwd_enable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + if (status) { + usnic_err("Failed to enable qp %d of %s:%d\n with err %d\n", + res->vnic_idx, qp_grp->ufdev->name, + vnic_idx, status); + goto out_err; + } + } + + return 0; + +out_err: + for (i--; i >= 0; i--) { + res = res_chunk->res[i]; + usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + } + + return status; +} + +static int disable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ + int i, vnic_idx; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *res; + int status = 0; + + lockdep_assert_held(&qp_grp->lock); + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + + res_chunk = get_qp_res_chunk(qp_grp); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get qp res with err %ld\n", + PTR_ERR(res_chunk)); + return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; + } + + for (i = 0; i < res_chunk->cnt; i++) { + res = res_chunk->res[i]; + status = usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + if (status) { + usnic_err("Failed to disable rq %d of %s:%d\n with err %d\n", + res->vnic_idx, + qp_grp->ufdev->name, + vnic_idx, status); + } + } + + return status; + +} + +static int init_filter_action(struct usnic_ib_qp_grp *qp_grp, + struct usnic_filter_action *uaction) +{ + struct usnic_vnic_res_chunk *res_chunk; + + res_chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get %s with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), + PTR_ERR(res_chunk)); + return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; + } + + uaction->vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + uaction->action.type = FILTER_ACTION_RQ_STEERING; + uaction->action.u.rq_idx = res_chunk->res[DFLT_RQ_IDX]->vnic_idx; + + return 0; +} + +static struct usnic_ib_qp_grp_flow* +create_roce_custom_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + uint16_t port_num; + int err; + struct filter filter; + struct usnic_filter_action uaction; + struct usnic_ib_qp_grp_flow *qp_flow; + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + + trans_type = trans_spec->trans_type; + port_num = trans_spec->usnic_roce.port_num; + + /* Reserve Port */ + port_num = usnic_transport_rsrv_port(trans_type, port_num); + if (port_num == 0) + return ERR_PTR(-EINVAL); + + /* Create Flow */ + usnic_fwd_init_usnic_filter(&filter, port_num); + err = init_filter_action(qp_grp, &uaction); + if (err) + goto out_unreserve_port; + + flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); + if (IS_ERR_OR_NULL(flow)) { + usnic_err("Unable to alloc flow failed with err %ld\n", + PTR_ERR(flow)); + err = flow ? PTR_ERR(flow) : -EFAULT; + goto out_unreserve_port; + } + + /* Create Flow Handle */ + qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); + if (IS_ERR_OR_NULL(qp_flow)) { + err = qp_flow ? PTR_ERR(qp_flow) : -ENOMEM; + goto out_dealloc_flow; + } + qp_flow->flow = flow; + qp_flow->trans_type = trans_type; + qp_flow->usnic_roce.port_num = port_num; + qp_flow->qp_grp = qp_grp; + return qp_flow; + +out_dealloc_flow: + usnic_fwd_dealloc_flow(flow); +out_unreserve_port: + usnic_transport_unrsrv_port(trans_type, port_num); + return ERR_PTR(err); +} + +static void release_roce_custom_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_fwd_dealloc_flow(qp_flow->flow); + usnic_transport_unrsrv_port(qp_flow->trans_type, + qp_flow->usnic_roce.port_num); + kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_udp_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + struct socket *sock; + int sock_fd; + int err; + struct filter filter; + struct usnic_filter_action uaction; + struct usnic_ib_qp_grp_flow *qp_flow; + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + uint32_t addr; + uint16_t port_num; + int proto; + + trans_type = trans_spec->trans_type; + sock_fd = trans_spec->udp.sock_fd; + + /* Get and check socket */ + sock = usnic_transport_get_socket(sock_fd); + if (IS_ERR_OR_NULL(sock)) + return ERR_CAST(sock); + + err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port_num); + if (err) + goto out_put_sock; + + if (proto != IPPROTO_UDP) { + usnic_err("Protocol for fd %d is not UDP", sock_fd); + err = -EPERM; + goto out_put_sock; + } + + /* Create flow */ + usnic_fwd_init_udp_filter(&filter, addr, port_num); + err = init_filter_action(qp_grp, &uaction); + if (err) + goto out_put_sock; + + flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); + if (IS_ERR_OR_NULL(flow)) { + usnic_err("Unable to alloc flow failed with err %ld\n", + PTR_ERR(flow)); + err = flow ? PTR_ERR(flow) : -EFAULT; + goto out_put_sock; + } + + /* Create qp_flow */ + qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); + if (IS_ERR_OR_NULL(qp_flow)) { + err = qp_flow ? PTR_ERR(qp_flow) : -ENOMEM; + goto out_dealloc_flow; + } + qp_flow->flow = flow; + qp_flow->trans_type = trans_type; + qp_flow->udp.sock = sock; + qp_flow->qp_grp = qp_grp; + return qp_flow; + +out_dealloc_flow: + usnic_fwd_dealloc_flow(flow); +out_put_sock: + usnic_transport_put_socket(sock); + return ERR_PTR(err); +} + +static void release_udp_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_fwd_dealloc_flow(qp_flow->flow); + usnic_transport_put_socket(qp_flow->udp.sock); + kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_and_add_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + struct usnic_ib_qp_grp_flow *qp_flow; + enum usnic_transport_type trans_type; + + trans_type = trans_spec->trans_type; + switch (trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + qp_flow = create_roce_custom_flow(qp_grp, trans_spec); + break; + case USNIC_TRANSPORT_IPV4_UDP: + qp_flow = create_udp_flow(qp_grp, trans_spec); + break; + default: + usnic_err("Unsupported transport %u\n", + trans_spec->trans_type); + return ERR_PTR(-EINVAL); + } + + if (!IS_ERR_OR_NULL(qp_flow)) { + list_add_tail(&qp_flow->link, &qp_grp->flows_lst); + usnic_debugfs_flow_add(qp_flow); + } + + + return qp_flow; +} + +static void release_and_remove_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_debugfs_flow_remove(qp_flow); + list_del(&qp_flow->link); + + switch (qp_flow->trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + release_roce_custom_flow(qp_flow); + break; + case USNIC_TRANSPORT_IPV4_UDP: + release_udp_flow(qp_flow); + break; + default: + WARN(1, "Unsupported transport %u\n", + qp_flow->trans_type); + break; + } +} + +static void release_and_remove_all_flows(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_qp_grp_flow *qp_flow, *tmp; + list_for_each_entry_safe(qp_flow, tmp, &qp_grp->flows_lst, link) + release_and_remove_flow(qp_flow); +} + +int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, + enum ib_qp_state new_state, + void *data) +{ + int status = 0; + int vnic_idx; + struct ib_event ib_event; + enum ib_qp_state old_state; + struct usnic_transport_spec *trans_spec; + struct usnic_ib_qp_grp_flow *qp_flow; + + old_state = qp_grp->state; + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + trans_spec = (struct usnic_transport_spec *) data; + + spin_lock(&qp_grp->lock); + switch (new_state) { + case IB_QPS_RESET: + switch (old_state) { + case IB_QPS_RESET: + /* NO-OP */ + break; + case IB_QPS_INIT: + release_and_remove_all_flows(qp_grp); + status = 0; + break; + case IB_QPS_RTR: + case IB_QPS_RTS: + case IB_QPS_ERR: + status = disable_qp_grp(qp_grp); + release_and_remove_all_flows(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_INIT: + switch (old_state) { + case IB_QPS_RESET: + if (trans_spec) { + qp_flow = create_and_add_flow(qp_grp, + trans_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + break; + } + } else { + /* + * Optional to specify filters. + */ + status = 0; + } + break; + case IB_QPS_INIT: + if (trans_spec) { + qp_flow = create_and_add_flow(qp_grp, + trans_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + break; + } + } else { + /* + * Doesn't make sense to go into INIT state + * from INIT state w/o adding filters. + */ + status = -EINVAL; + } + break; + case IB_QPS_RTR: + status = disable_qp_grp(qp_grp); + break; + case IB_QPS_RTS: + status = disable_qp_grp(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_RTR: + switch (old_state) { + case IB_QPS_INIT: + status = enable_qp_grp(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_RTS: + switch (old_state) { + case IB_QPS_RTR: + /* NO-OP FOR NOW */ + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_ERR: + ib_event.device = &qp_grp->vf->pf->ib_dev; + ib_event.element.qp = &qp_grp->ibqp; + ib_event.event = IB_EVENT_QP_FATAL; + + switch (old_state) { + case IB_QPS_RESET: + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + case IB_QPS_INIT: + release_and_remove_all_flows(qp_grp); + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + case IB_QPS_RTR: + case IB_QPS_RTS: + status = disable_qp_grp(qp_grp); + release_and_remove_all_flows(qp_grp); + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + default: + status = -EINVAL; + } + break; + default: + status = -EINVAL; + } + spin_unlock(&qp_grp->lock); + + if (!status) { + qp_grp->state = new_state; + usnic_info("Transistioned %u from %s to %s", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string(old_state), + usnic_ib_qp_grp_state_to_string(new_state)); + } else { + usnic_err("Failed to transistion %u from %s to %s", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string(old_state), + usnic_ib_qp_grp_state_to_string(new_state)); + } + + return status; +} + +static struct usnic_vnic_res_chunk** +alloc_res_chunk_list(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec, void *owner_obj) +{ + enum usnic_vnic_res_type res_type; + struct usnic_vnic_res_chunk **res_chunk_list; + int err, i, res_cnt, res_lst_sz; + + for (res_lst_sz = 0; + res_spec->resources[res_lst_sz].type != USNIC_VNIC_RES_TYPE_EOL; + res_lst_sz++) { + /* Do Nothing */ + } + + res_chunk_list = kzalloc(sizeof(*res_chunk_list)*(res_lst_sz+1), + GFP_ATOMIC); + if (!res_chunk_list) + return ERR_PTR(-ENOMEM); + + for (i = 0; res_spec->resources[i].type != USNIC_VNIC_RES_TYPE_EOL; + i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + + res_chunk_list[i] = usnic_vnic_get_resources(vnic, res_type, + res_cnt, owner_obj); + if (IS_ERR_OR_NULL(res_chunk_list[i])) { + err = res_chunk_list[i] ? + PTR_ERR(res_chunk_list[i]) : -ENOMEM; + usnic_err("Failed to get %s from %s with err %d\n", + usnic_vnic_res_type_to_str(res_type), + usnic_vnic_pci_name(vnic), + err); + goto out_free_res; + } + } + + return res_chunk_list; + +out_free_res: + for (i--; i > 0; i--) + usnic_vnic_put_resources(res_chunk_list[i]); + kfree(res_chunk_list); + return ERR_PTR(err); +} + +static void free_qp_grp_res(struct usnic_vnic_res_chunk **res_chunk_list) +{ + int i; + for (i = 0; res_chunk_list[i]; i++) + usnic_vnic_put_resources(res_chunk_list[i]); + kfree(res_chunk_list); +} + +static int qp_grp_and_vf_bind(struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_ib_qp_grp *qp_grp) +{ + int err; + struct pci_dev *pdev; + + lockdep_assert_held(&vf->lock); + + pdev = usnic_vnic_get_pdev(vf->vnic); + if (vf->qp_grp_ref_cnt == 0) { + err = usnic_uiom_attach_dev_to_pd(pd->umem_pd, &pdev->dev); + if (err) { + usnic_err("Failed to attach %s to domain\n", + pci_name(pdev)); + return err; + } + vf->pd = pd; + } + vf->qp_grp_ref_cnt++; + + WARN_ON(vf->pd != pd); + qp_grp->vf = vf; + + return 0; +} + +static void qp_grp_and_vf_unbind(struct usnic_ib_qp_grp *qp_grp) +{ + struct pci_dev *pdev; + struct usnic_ib_pd *pd; + + lockdep_assert_held(&qp_grp->vf->lock); + + pd = qp_grp->vf->pd; + pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); + if (--qp_grp->vf->qp_grp_ref_cnt == 0) { + qp_grp->vf->pd = NULL; + usnic_uiom_detach_dev_from_pd(pd->umem_pd, &pdev->dev); + } + qp_grp->vf = NULL; +} + +static void log_spec(struct usnic_vnic_res_spec *res_spec) +{ + char buf[512]; + usnic_vnic_spec_dump(buf, sizeof(buf), res_spec); + usnic_dbg("%s\n", buf); +} + +static int qp_grp_id_from_flow(struct usnic_ib_qp_grp_flow *qp_flow, + uint32_t *id) +{ + enum usnic_transport_type trans_type = qp_flow->trans_type; + int err; + uint16_t port_num = 0; + + switch (trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + *id = qp_flow->usnic_roce.port_num; + break; + case USNIC_TRANSPORT_IPV4_UDP: + err = usnic_transport_sock_get_addr(qp_flow->udp.sock, + NULL, NULL, + &port_num); + if (err) + return err; + /* + * Copy port_num to stack first and then to *id, + * so that the short to int cast works for little + * and big endian systems. + */ + *id = port_num; + break; + default: + usnic_err("Unsupported transport %u\n", trans_type); + return -EINVAL; + } + + return 0; +} + +struct usnic_ib_qp_grp * +usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_vnic_res_spec *res_spec, + struct usnic_transport_spec *transport_spec) +{ + struct usnic_ib_qp_grp *qp_grp; + int err; + enum usnic_transport_type transport = transport_spec->trans_type; + struct usnic_ib_qp_grp_flow *qp_flow; + + lockdep_assert_held(&vf->lock); + + err = usnic_vnic_res_spec_satisfied(&min_transport_spec[transport], + res_spec); + if (err) { + usnic_err("Spec does not meet miniumum req for transport %d\n", + transport); + log_spec(res_spec); + return ERR_PTR(err); + } + + qp_grp = kzalloc(sizeof(*qp_grp), GFP_ATOMIC); + if (!qp_grp) { + usnic_err("Unable to alloc qp_grp - Out of memory\n"); + return NULL; + } + + qp_grp->res_chunk_list = alloc_res_chunk_list(vf->vnic, res_spec, + qp_grp); + if (IS_ERR_OR_NULL(qp_grp->res_chunk_list)) { + err = qp_grp->res_chunk_list ? + PTR_ERR(qp_grp->res_chunk_list) : -ENOMEM; + usnic_err("Unable to alloc res for %d with err %d\n", + qp_grp->grp_id, err); + goto out_free_qp_grp; + } + + err = qp_grp_and_vf_bind(vf, pd, qp_grp); + if (err) + goto out_free_res; + + INIT_LIST_HEAD(&qp_grp->flows_lst); + spin_lock_init(&qp_grp->lock); + qp_grp->ufdev = ufdev; + qp_grp->state = IB_QPS_RESET; + qp_grp->owner_pid = current->pid; + + qp_flow = create_and_add_flow(qp_grp, transport_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + usnic_err("Unable to create and add flow with err %ld\n", + PTR_ERR(qp_flow)); + err = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + goto out_qp_grp_vf_unbind; + } + + err = qp_grp_id_from_flow(qp_flow, &qp_grp->grp_id); + if (err) + goto out_release_flow; + qp_grp->ibqp.qp_num = qp_grp->grp_id; + + usnic_ib_sysfs_qpn_add(qp_grp); + + return qp_grp; + +out_release_flow: + release_and_remove_flow(qp_flow); +out_qp_grp_vf_unbind: + qp_grp_and_vf_unbind(qp_grp); +out_free_res: + free_qp_grp_res(qp_grp->res_chunk_list); +out_free_qp_grp: + kfree(qp_grp); + + return ERR_PTR(err); +} + +void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ + + WARN_ON(qp_grp->state != IB_QPS_RESET); + lockdep_assert_held(&qp_grp->vf->lock); + + release_and_remove_all_flows(qp_grp); + usnic_ib_sysfs_qpn_remove(qp_grp); + qp_grp_and_vf_unbind(qp_grp); + free_qp_grp_res(qp_grp->res_chunk_list); + kfree(qp_grp); +} + +struct usnic_vnic_res_chunk* +usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp, + enum usnic_vnic_res_type res_type) +{ + int i; + + for (i = 0; qp_grp->res_chunk_list[i]; i++) { + if (qp_grp->res_chunk_list[i]->type == res_type) + return qp_grp->res_chunk_list[i]; + } + + return ERR_PTR(-EINVAL); +} diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h new file mode 100644 index 00000000000..b0aafe8db0c --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_QP_GRP_H_ +#define USNIC_IB_QP_GRP_H_ + +#include <linux/debugfs.h> +#include <rdma/ib_verbs.h> + +#include "usnic_ib.h" +#include "usnic_abi.h" +#include "usnic_fwd.h" +#include "usnic_vnic.h" + +/* + * The qp group struct represents all the hw resources needed to present a ib_qp + */ +struct usnic_ib_qp_grp { + struct ib_qp ibqp; + enum ib_qp_state state; + int grp_id; + + struct usnic_fwd_dev *ufdev; + struct usnic_ib_ucontext *ctx; + struct list_head flows_lst; + + struct usnic_vnic_res_chunk **res_chunk_list; + + pid_t owner_pid; + struct usnic_ib_vf *vf; + struct list_head link; + + spinlock_t lock; + + struct kobject kobj; +}; + +struct usnic_ib_qp_grp_flow { + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + union { + struct { + uint16_t port_num; + } usnic_roce; + struct { + struct socket *sock; + } udp; + }; + struct usnic_ib_qp_grp *qp_grp; + struct list_head link; + + /* Debug FS */ + struct dentry *dbgfs_dentry; + char dentry_name[32]; +}; + +static const struct +usnic_vnic_res_spec min_transport_spec[USNIC_TRANSPORT_MAX] = { + { /*USNIC_TRANSPORT_UNKNOWN*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, + { /*USNIC_TRANSPORT_ROCE_CUSTOM*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, + { /*USNIC_TRANSPORT_IPV4_UDP*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, +}; + +const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state); +int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz); +int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz); +struct usnic_ib_qp_grp * +usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_vnic_res_spec *res_spec, + struct usnic_transport_spec *trans_spec); +void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp); +int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, + enum ib_qp_state new_state, + void *data); +struct usnic_vnic_res_chunk +*usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp, + enum usnic_vnic_res_type type); +static inline +struct usnic_ib_qp_grp *to_uqp_grp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct usnic_ib_qp_grp, ibqp); +} +#endif /* USNIC_IB_QP_GRP_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c new file mode 100644 index 00000000000..27dc67c1689 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> + +#include "usnic_common_util.h" +#include "usnic_ib.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_vnic.h" +#include "usnic_ib_verbs.h" +#include "usnic_log.h" + +static ssize_t usnic_ib_show_fw_ver(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev = + container_of(device, struct usnic_ib_dev, ib_dev.dev); + struct ethtool_drvinfo info; + + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); + mutex_unlock(&us_ibdev->usdev_lock); + + return scnprintf(buf, PAGE_SIZE, "%s\n", info.fw_version); +} + +static ssize_t usnic_ib_show_board(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev = + container_of(device, struct usnic_ib_dev, ib_dev.dev); + unsigned short subsystem_device_id; + + mutex_lock(&us_ibdev->usdev_lock); + subsystem_device_id = us_ibdev->pdev->subsystem_device; + mutex_unlock(&us_ibdev->usdev_lock); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", subsystem_device_id); +} + +/* + * Report the configuration for this PF + */ +static ssize_t +usnic_ib_show_config(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + char *ptr; + unsigned left; + unsigned n; + enum usnic_vnic_res_type res_type; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + /* Buffer space limit is 1 page */ + ptr = buf; + left = PAGE_SIZE; + + mutex_lock(&us_ibdev->usdev_lock); + if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) { + char *busname; + + /* + * bus name seems to come with annoying prefix. + * Remove it if it is predictable + */ + busname = us_ibdev->pdev->bus->name; + if (strncmp(busname, "PCI Bus ", 8) == 0) + busname += 8; + + n = scnprintf(ptr, left, + "%s: %s:%d.%d, %s, %pM, %u VFs\n Per VF:", + us_ibdev->ib_dev.name, + busname, + PCI_SLOT(us_ibdev->pdev->devfn), + PCI_FUNC(us_ibdev->pdev->devfn), + netdev_name(us_ibdev->netdev), + us_ibdev->ufdev->mac, + atomic_read(&us_ibdev->vf_cnt.refcount)); + UPDATE_PTR_LEFT(n, ptr, left); + + for (res_type = USNIC_VNIC_RES_TYPE_EOL; + res_type < USNIC_VNIC_RES_TYPE_MAX; + res_type++) { + if (us_ibdev->vf_res_cnt[res_type] == 0) + continue; + n = scnprintf(ptr, left, " %d %s%s", + us_ibdev->vf_res_cnt[res_type], + usnic_vnic_res_type_to_str(res_type), + (res_type < (USNIC_VNIC_RES_TYPE_MAX - 1)) ? + "," : ""); + UPDATE_PTR_LEFT(n, ptr, left); + } + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + } else { + n = scnprintf(ptr, left, "%s: no VFs\n", + us_ibdev->ib_dev.name); + UPDATE_PTR_LEFT(n, ptr, left); + } + mutex_unlock(&us_ibdev->usdev_lock); + + return ptr - buf; +} + +static ssize_t +usnic_ib_show_iface(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", + netdev_name(us_ibdev->netdev)); +} + +static ssize_t +usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + atomic_read(&us_ibdev->vf_cnt.refcount)); +} + +static ssize_t +usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + int qp_per_vf; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); + + return scnprintf(buf, PAGE_SIZE, + "%d\n", qp_per_vf); +} + +static ssize_t +usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]); +} + +static DEVICE_ATTR(fw_ver, S_IRUGO, usnic_ib_show_fw_ver, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL); +static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL); +static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL); +static DEVICE_ATTR(max_vf, S_IRUGO, usnic_ib_show_max_vf, NULL); +static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL); +static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL); + +static struct device_attribute *usnic_class_attributes[] = { + &dev_attr_fw_ver, + &dev_attr_board_id, + &dev_attr_config, + &dev_attr_iface, + &dev_attr_max_vf, + &dev_attr_qp_per_vf, + &dev_attr_cq_per_vf, +}; + +struct qpn_attribute { + struct attribute attr; + ssize_t (*show)(struct usnic_ib_qp_grp *, char *buf); +}; + +/* + * Definitions for supporting QPN entries in sysfs + */ +static ssize_t +usnic_ib_qpn_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct usnic_ib_qp_grp *qp_grp; + struct qpn_attribute *qpn_attr; + + qp_grp = container_of(kobj, struct usnic_ib_qp_grp, kobj); + qpn_attr = container_of(attr, struct qpn_attribute, attr); + + return qpn_attr->show(qp_grp, buf); +} + +static const struct sysfs_ops usnic_ib_qpn_sysfs_ops = { + .show = usnic_ib_qpn_attr_show +}; + +#define QPN_ATTR_RO(NAME) \ +struct qpn_attribute qpn_attr_##NAME = __ATTR_RO(NAME) + +static ssize_t context_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "0x%p\n", qp_grp->ctx); +} + +static ssize_t summary_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ + int i, j, n; + int left; + char *ptr; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *vnic_res; + + left = PAGE_SIZE; + ptr = buf; + + n = scnprintf(ptr, left, + "QPN: %d State: (%s) PID: %u VF Idx: %hu ", + qp_grp->ibqp.qp_num, + usnic_ib_qp_grp_state_to_string(qp_grp->state), + qp_grp->owner_pid, + usnic_vnic_get_index(qp_grp->vf->vnic)); + UPDATE_PTR_LEFT(n, ptr, left); + + for (i = 0; qp_grp->res_chunk_list[i]; i++) { + res_chunk = qp_grp->res_chunk_list[i]; + for (j = 0; j < res_chunk->cnt; j++) { + vnic_res = res_chunk->res[j]; + n = scnprintf(ptr, left, "%s[%d] ", + usnic_vnic_res_type_to_str(vnic_res->type), + vnic_res->vnic_idx); + UPDATE_PTR_LEFT(n, ptr, left); + } + } + + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + + return ptr - buf; +} + +static QPN_ATTR_RO(context); +static QPN_ATTR_RO(summary); + +static struct attribute *usnic_ib_qpn_default_attrs[] = { + &qpn_attr_context.attr, + &qpn_attr_summary.attr, + NULL +}; + +static struct kobj_type usnic_ib_qpn_type = { + .sysfs_ops = &usnic_ib_qpn_sysfs_ops, + .default_attrs = usnic_ib_qpn_default_attrs +}; + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev) +{ + int i; + int err; + for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { + err = device_create_file(&us_ibdev->ib_dev.dev, + usnic_class_attributes[i]); + if (err) { + usnic_err("Failed to create device file %d for %s eith err %d", + i, us_ibdev->ib_dev.name, err); + return -EINVAL; + } + } + + /* create kernel object for looking at individual QPs */ + kobject_get(&us_ibdev->ib_dev.dev.kobj); + us_ibdev->qpn_kobj = kobject_create_and_add("qpn", + &us_ibdev->ib_dev.dev.kobj); + if (us_ibdev->qpn_kobj == NULL) { + kobject_put(&us_ibdev->ib_dev.dev.kobj); + return -ENOMEM; + } + + return 0; +} + +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev) +{ + int i; + for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { + device_remove_file(&us_ibdev->ib_dev.dev, + usnic_class_attributes[i]); + } + + kobject_put(us_ibdev->qpn_kobj); +} + +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_dev *us_ibdev; + int err; + + us_ibdev = qp_grp->vf->pf; + + err = kobject_init_and_add(&qp_grp->kobj, &usnic_ib_qpn_type, + kobject_get(us_ibdev->qpn_kobj), + "%d", qp_grp->grp_id); + if (err) { + kobject_put(us_ibdev->qpn_kobj); + return; + } +} + +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = qp_grp->vf->pf; + + kobject_put(&qp_grp->kobj); + kobject_put(us_ibdev->qpn_kobj); +} diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h new file mode 100644 index 00000000000..0d09b493cd0 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_SYSFS_H_ +#define USNIC_IB_SYSFS_H_ + +#include "usnic_ib.h" + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp); +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp); + +#endif /* !USNIC_IB_SYSFS_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c new file mode 100644 index 00000000000..53bd6a2d9cd --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -0,0 +1,768 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/errno.h> + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> + +#include "usnic_abi.h" +#include "usnic_ib.h" +#include "usnic_common_util.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_fwd.h" +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_transport.h" + +#define USNIC_DEFAULT_TRANSPORT USNIC_TRANSPORT_ROCE_CUSTOM + +static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver) +{ + *fw_ver = (u64) *fw_ver_str; +} + +static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp, + struct ib_udata *udata) +{ + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_create_qp_resp resp; + struct pci_dev *pdev; + struct vnic_dev_bar *bar; + struct usnic_vnic_res_chunk *chunk; + struct usnic_ib_qp_grp_flow *default_flow; + int i, err; + + memset(&resp, 0, sizeof(resp)); + + us_ibdev = qp_grp->vf->pf; + pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); + if (!pdev) { + usnic_err("Failed to get pdev of qp_grp %d\n", + qp_grp->grp_id); + return -EFAULT; + } + + bar = usnic_vnic_get_bar(qp_grp->vf->vnic, 0); + if (!bar) { + usnic_err("Failed to get bar0 of qp_grp %d vf %s", + qp_grp->grp_id, pci_name(pdev)); + return -EFAULT; + } + + resp.vfid = usnic_vnic_get_index(qp_grp->vf->vnic); + resp.bar_bus_addr = bar->bus_addr; + resp.bar_len = bar->len; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_RQ); + resp.rq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.rq_idx[i] = chunk->res[i]->vnic_idx; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_WQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_WQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_WQ); + resp.wq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.wq_idx[i] = chunk->res[i]->vnic_idx; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_CQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_CQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_CQ); + resp.cq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.cq_idx[i] = chunk->res[i]->vnic_idx; + + default_flow = list_first_entry(&qp_grp->flows_lst, + struct usnic_ib_qp_grp_flow, link); + resp.transport = default_flow->trans_type; + + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { + usnic_err("Failed to copy udata for %s", us_ibdev->ib_dev.name); + return err; + } + + return 0; +} + +static struct usnic_ib_qp_grp* +find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, + struct usnic_ib_pd *pd, + struct usnic_transport_spec *trans_spec, + struct usnic_vnic_res_spec *res_spec) +{ + struct usnic_ib_vf *vf; + struct usnic_vnic *vnic; + struct usnic_ib_qp_grp *qp_grp; + struct device *dev, **dev_list; + int i, found = 0; + + BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + + if (list_empty(&us_ibdev->vf_dev_list)) { + usnic_info("No vfs to allocate\n"); + return NULL; + } + + if (usnic_ib_share_vf) { + /* Try to find resouces on a used vf which is in pd */ + dev_list = usnic_uiom_get_dev_list(pd->umem_pd); + for (i = 0; dev_list[i]; i++) { + dev = dev_list[i]; + vf = pci_get_drvdata(to_pci_dev(dev)); + spin_lock(&vf->lock); + vnic = vf->vnic; + if (!usnic_vnic_check_room(vnic, res_spec)) { + usnic_dbg("Found used vnic %s from %s\n", + us_ibdev->ib_dev.name, + pci_name(usnic_vnic_get_pdev( + vnic))); + found = 1; + break; + } + spin_unlock(&vf->lock); + + } + usnic_uiom_free_dev_list(dev_list); + } + + if (!found) { + /* Try to find resources on an unused vf */ + list_for_each_entry(vf, &us_ibdev->vf_dev_list, link) { + spin_lock(&vf->lock); + vnic = vf->vnic; + if (vf->qp_grp_ref_cnt == 0 && + usnic_vnic_check_room(vnic, res_spec) == 0) { + found = 1; + break; + } + spin_unlock(&vf->lock); + } + } + + if (!found) { + usnic_info("No free qp grp found on %s\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-ENOMEM); + } + + qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev, vf, pd, res_spec, + trans_spec); + spin_unlock(&vf->lock); + if (IS_ERR_OR_NULL(qp_grp)) { + usnic_err("Failed to allocate qp_grp\n"); + return ERR_PTR(qp_grp ? PTR_ERR(qp_grp) : -ENOMEM); + } + + return qp_grp; +} + +static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_vf *vf = qp_grp->vf; + + WARN_ON(qp_grp->state != IB_QPS_RESET); + + spin_lock(&vf->lock); + usnic_ib_qp_grp_destroy(qp_grp); + spin_unlock(&vf->lock); +} + +static void eth_speed_to_ib_speed(int speed, u8 *active_speed, + u8 *active_width) +{ + if (speed <= 10000) { + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_FDR10; + } else if (speed <= 20000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_DDR; + } else if (speed <= 30000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + } else if (speed <= 40000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_FDR10; + } else { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + } +} + +static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd) +{ + if (cmd.spec.trans_type <= USNIC_TRANSPORT_UNKNOWN || + cmd.spec.trans_type >= USNIC_TRANSPORT_MAX) + return -EINVAL; + + return 0; +} + +/* Start of ib callback functions */ + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +int usnic_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + union ib_gid gid; + struct ethtool_drvinfo info; + struct ethtool_cmd cmd; + int qp_per_vf; + + usnic_dbg("\n"); + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); + us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd); + memset(props, 0, sizeof(*props)); + usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, + &gid.raw[0]); + memcpy(&props->sys_image_guid, &gid.global.interface_id, + sizeof(gid.global.interface_id)); + usnic_ib_fw_string_to_u64(&info.fw_version[0], &props->fw_ver); + props->max_mr_size = USNIC_UIOM_MAX_MR_SIZE; + props->page_size_cap = USNIC_UIOM_PAGE_SIZE; + props->vendor_id = PCI_VENDOR_ID_CISCO; + props->vendor_part_id = PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC; + props->hw_ver = us_ibdev->pdev->subsystem_device; + qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); + props->max_qp = qp_per_vf * + atomic_read(&us_ibdev->vf_cnt.refcount); + props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] * + atomic_read(&us_ibdev->vf_cnt.refcount); + props->max_pd = USNIC_UIOM_MAX_PD_CNT; + props->max_mr = USNIC_UIOM_MAX_MR_CNT; + props->local_ca_ack_delay = 0; + props->max_pkeys = 0; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + props->max_qp_rd_atom = 0; + props->max_qp_init_rd_atom = 0; + props->max_res_rd_atom = 0; + props->max_srq = 0; + props->max_srq_wr = 0; + props->max_srq_sge = 0; + props->max_fast_reg_page_list_len = 0; + props->max_mcast_grp = 0; + props->max_mcast_qp_attach = 0; + props->max_total_mcast_qp_attach = 0; + props->max_map_per_fmr = 0; + /* Owned by Userspace + * max_qp_wr, max_sge, max_sge_rd, max_cqe */ + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + struct ethtool_cmd cmd; + + usnic_dbg("\n"); + + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd); + memset(props, 0, sizeof(*props)); + + props->lid = 0; + props->lmc = 1; + props->sm_lid = 0; + props->sm_sl = 0; + + if (!us_ibdev->ufdev->link_up) { + props->state = IB_PORT_DOWN; + props->phys_state = 3; + } else if (!us_ibdev->ufdev->inaddr) { + props->state = IB_PORT_INIT; + props->phys_state = 4; + } else { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + props->port_cap_flags = 0; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->bad_pkey_cntr = 0; + props->qkey_viol_cntr = 0; + eth_speed_to_ib_speed(cmd.speed, &props->active_speed, + &props->active_width); + props->max_mtu = IB_MTU_4096; + props->active_mtu = iboe_get_mtu(us_ibdev->ufdev->mtu); + /* Userspace will adjust for hdrs */ + props->max_msg_sz = us_ibdev->ufdev->mtu; + props->max_vl_num = 1; + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + int err; + + usnic_dbg("\n"); + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + qp_grp = to_uqp_grp(qp); + vf = qp_grp->vf; + mutex_lock(&vf->pf->usdev_lock); + usnic_dbg("\n"); + qp_attr->qp_state = qp_grp->state; + qp_attr->cur_qp_state = qp_grp->state; + + switch (qp_grp->ibqp.qp_type) { + case IB_QPT_UD: + qp_attr->qkey = 0; + break; + default: + usnic_err("Unexpected qp_type %d\n", qp_grp->ibqp.qp_type); + err = -EINVAL; + goto err_out; + } + + mutex_unlock(&vf->pf->usdev_lock); + return 0; + +err_out: + mutex_unlock(&vf->pf->usdev_lock); + return err; +} + +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + usnic_dbg("\n"); + + if (index > 1) + return -EINVAL; + + mutex_lock(&us_ibdev->usdev_lock); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, + &gid->raw[0]); + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + if (index > 1) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct usnic_ib_pd *pd; + void *umem_pd; + + usnic_dbg("\n"); + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + umem_pd = pd->umem_pd = usnic_uiom_alloc_pd(); + if (IS_ERR_OR_NULL(umem_pd)) { + kfree(pd); + return ERR_PTR(umem_pd ? PTR_ERR(umem_pd) : -ENOMEM); + } + + usnic_info("domain 0x%p allocated for context 0x%p and device %s\n", + pd, context, ibdev->name); + return &pd->ibpd; +} + +int usnic_ib_dealloc_pd(struct ib_pd *pd) +{ + usnic_info("freeing domain 0x%p\n", pd); + + usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd); + kfree(pd); + return 0; +} + +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + int err; + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_ucontext *ucontext; + int cq_cnt; + struct usnic_vnic_res_spec res_spec; + struct usnic_ib_create_qp_cmd cmd; + struct usnic_transport_spec trans_spec; + + usnic_dbg("\n"); + + ucontext = to_uucontext(pd->uobject->context); + us_ibdev = to_usdev(pd->device); + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (err) { + usnic_err("%s: cannot copy udata for create_qp\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-EINVAL); + } + + err = create_qp_validate_user_data(cmd); + if (err) { + usnic_err("%s: Failed to validate user data\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-EINVAL); + } + + if (init_attr->qp_type != IB_QPT_UD) { + usnic_err("%s asked to make a non-UD QP: %d\n", + us_ibdev->ib_dev.name, init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + trans_spec = cmd.spec; + mutex_lock(&us_ibdev->usdev_lock); + cq_cnt = (init_attr->send_cq == init_attr->recv_cq) ? 1 : 2; + res_spec = min_transport_spec[trans_spec.trans_type]; + usnic_vnic_res_spec_update(&res_spec, USNIC_VNIC_RES_TYPE_CQ, cq_cnt); + qp_grp = find_free_vf_and_create_qp_grp(us_ibdev, to_upd(pd), + &trans_spec, + &res_spec); + if (IS_ERR_OR_NULL(qp_grp)) { + err = qp_grp ? PTR_ERR(qp_grp) : -ENOMEM; + goto out_release_mutex; + } + + err = usnic_ib_fill_create_qp_resp(qp_grp, udata); + if (err) { + err = -EBUSY; + goto out_release_qp_grp; + } + + qp_grp->ctx = ucontext; + list_add_tail(&qp_grp->link, &ucontext->qp_grp_list); + usnic_ib_log_vf(qp_grp->vf); + mutex_unlock(&us_ibdev->usdev_lock); + return &qp_grp->ibqp; + +out_release_qp_grp: + qp_grp_destroy(qp_grp); +out_release_mutex: + mutex_unlock(&us_ibdev->usdev_lock); + return ERR_PTR(err); +} + +int usnic_ib_destroy_qp(struct ib_qp *qp) +{ + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + + usnic_dbg("\n"); + + qp_grp = to_uqp_grp(qp); + vf = qp_grp->vf; + mutex_lock(&vf->pf->usdev_lock); + if (usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RESET, NULL)) { + usnic_err("Failed to move qp grp %u to reset\n", + qp_grp->grp_id); + } + + list_del(&qp_grp->link); + qp_grp_destroy(qp_grp); + mutex_unlock(&vf->pf->usdev_lock); + + return 0; +} + +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct usnic_ib_qp_grp *qp_grp; + int status; + usnic_dbg("\n"); + + qp_grp = to_uqp_grp(ibqp); + + /* TODO: Future Support All States */ + mutex_lock(&qp_grp->vf->pf->usdev_lock); + if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_INIT, NULL); + } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTR) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTR, NULL); + } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTS) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTS, NULL); + } else { + usnic_err("Unexpected combination mask: %u state: %u\n", + attr_mask & IB_QP_STATE, attr->qp_state); + status = -EINVAL; + } + + mutex_unlock(&qp_grp->vf->pf->usdev_lock); + return status; +} + +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ib_cq *cq; + + usnic_dbg("\n"); + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-EBUSY); + + return cq; +} + +int usnic_ib_destroy_cq(struct ib_cq *cq) +{ + usnic_dbg("\n"); + kfree(cq); + return 0; +} + +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct usnic_ib_mr *mr; + int err; + + usnic_dbg("start 0x%llx va 0x%llx length 0x%llx\n", start, + virt_addr, length); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (IS_ERR_OR_NULL(mr)) + return ERR_PTR(mr ? PTR_ERR(mr) : -ENOMEM); + + mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length, + access_flags, 0); + if (IS_ERR_OR_NULL(mr->umem)) { + err = mr->umem ? PTR_ERR(mr->umem) : -EFAULT; + goto err_free; + } + + mr->ibmr.lkey = mr->ibmr.rkey = 0; + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int usnic_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct usnic_ib_mr *mr = to_umr(ibmr); + + usnic_dbg("va 0x%lx length 0x%zx\n", mr->umem->va, mr->umem->length); + + usnic_uiom_reg_release(mr->umem, ibmr->pd->uobject->context->closing); + kfree(mr); + return 0; +} + +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct usnic_ib_ucontext *context; + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + usnic_dbg("\n"); + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&context->qp_grp_list); + mutex_lock(&us_ibdev->usdev_lock); + list_add_tail(&context->link, &us_ibdev->ctx_list); + mutex_unlock(&us_ibdev->usdev_lock); + + return &context->ibucontext; +} + +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct usnic_ib_ucontext *context = to_uucontext(ibcontext); + struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device); + usnic_dbg("\n"); + + mutex_lock(&us_ibdev->usdev_lock); + BUG_ON(!list_empty(&context->qp_grp_list)); + list_del(&context->link); + mutex_unlock(&us_ibdev->usdev_lock); + kfree(context); + return 0; +} + +int usnic_ib_mmap(struct ib_ucontext *context, + struct vm_area_struct *vma) +{ + struct usnic_ib_ucontext *uctx = to_ucontext(context); + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + struct vnic_dev_bar *bar; + dma_addr_t bus_addr; + unsigned int len; + unsigned int vfid; + + usnic_dbg("\n"); + + us_ibdev = to_usdev(context->device); + vma->vm_flags |= VM_IO; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vfid = vma->vm_pgoff; + usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n", + vma->vm_pgoff, PAGE_SHIFT, vfid); + + mutex_lock(&us_ibdev->usdev_lock); + list_for_each_entry(qp_grp, &uctx->qp_grp_list, link) { + vf = qp_grp->vf; + if (usnic_vnic_get_index(vf->vnic) == vfid) { + bar = usnic_vnic_get_bar(vf->vnic, 0); + if ((vma->vm_end - vma->vm_start) != bar->len) { + usnic_err("Bar0 Len %lu - Request map %lu\n", + bar->len, + vma->vm_end - vma->vm_start); + mutex_unlock(&us_ibdev->usdev_lock); + return -EINVAL; + } + bus_addr = bar->bus_addr; + len = bar->len; + usnic_dbg("bus: %pa vaddr: %p size: %ld\n", + &bus_addr, bar->vaddr, bar->len); + mutex_unlock(&us_ibdev->usdev_lock); + + return remap_pfn_range(vma, + vma->vm_start, + bus_addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + } + + mutex_unlock(&us_ibdev->usdev_lock); + usnic_err("No VF %u found\n", vfid); + return -EINVAL; +} + +/* In ib callbacks section - Start of stub funcs */ +struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + usnic_dbg("\n"); + return ERR_PTR(-EPERM); +} + +int usnic_ib_destroy_ah(struct ib_ah *ah) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + usnic_dbg("\n"); + return ERR_PTR(-ENOMEM); +} + + +/* In ib callbacks section - End of stub funcs */ +/* End of ib callbacks section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h new file mode 100644 index 00000000000..bb864f5aed7 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_VERBS_H_ +#define USNIC_IB_VERBS_H_ + +#include "usnic_ib.h" + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, + u8 port_num); +int usnic_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props); +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int usnic_ib_dealloc_pd(struct ib_pd *pd); +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int usnic_ib_destroy_qp(struct ib_qp *qp); +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata); +int usnic_ib_destroy_cq(struct ib_cq *cq); +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int usnic_ib_dereg_mr(struct ib_mr *ibmr); +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata); +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext); +int usnic_ib_mmap(struct ib_ucontext *context, + struct vm_area_struct *vma); +struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr); +int usnic_ib_destroy_ah(struct ib_ah *ah); +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc); +int usnic_ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags); +struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc); +#endif /* !USNIC_IB_VERBS_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_log.h b/drivers/infiniband/hw/usnic/usnic_log.h new file mode 100644 index 00000000000..75777a66c68 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_log.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_LOG_H_ +#define USNIC_LOG_H_ + +#include "usnic.h" + +extern unsigned int usnic_log_lvl; + +#define USNIC_LOG_LVL_NONE (0) +#define USNIC_LOG_LVL_ERR (1) +#define USNIC_LOG_LVL_INFO (2) +#define USNIC_LOG_LVL_DBG (3) + +#define usnic_printk(lvl, args...) \ + do { \ + printk(lvl "%s:%s:%d: ", DRV_NAME, __func__, \ + __LINE__); \ + printk(args); \ + } while (0) + +#define usnic_dbg(args...) \ + do { \ + if (unlikely(usnic_log_lvl >= USNIC_LOG_LVL_DBG)) { \ + usnic_printk(KERN_INFO, args); \ + } \ +} while (0) + +#define usnic_info(args...) \ +do { \ + if (usnic_log_lvl >= USNIC_LOG_LVL_INFO) { \ + usnic_printk(KERN_INFO, args); \ + } \ +} while (0) + +#define usnic_err(args...) \ + do { \ + if (usnic_log_lvl >= USNIC_LOG_LVL_ERR) { \ + usnic_printk(KERN_ERR, args); \ + } \ + } while (0) +#endif /* !USNIC_LOG_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c new file mode 100644 index 00000000000..ddef6f77a78 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_transport.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/bitmap.h> +#include <linux/file.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <net/inet_sock.h> + +#include "usnic_transport.h" +#include "usnic_log.h" + +/* ROCE */ +static unsigned long *roce_bitmap; +static u16 roce_next_port = 1; +#define ROCE_BITMAP_SZ ((1 << (8 /*CHAR_BIT*/ * sizeof(u16)))/8 /*CHAR BIT*/) +static DEFINE_SPINLOCK(roce_bitmap_lock); + +const char *usnic_transport_to_str(enum usnic_transport_type type) +{ + switch (type) { + case USNIC_TRANSPORT_UNKNOWN: + return "Unknown"; + case USNIC_TRANSPORT_ROCE_CUSTOM: + return "roce custom"; + case USNIC_TRANSPORT_IPV4_UDP: + return "IPv4 UDP"; + case USNIC_TRANSPORT_MAX: + return "Max?"; + default: + return "Not known"; + } +} + +int usnic_transport_sock_to_str(char *buf, int buf_sz, + struct socket *sock) +{ + int err; + uint32_t addr; + uint16_t port; + int proto; + + memset(buf, 0, buf_sz); + err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port); + if (err) + return 0; + + return scnprintf(buf, buf_sz, "Proto:%u Addr:%pI4h Port:%hu", + proto, &addr, port); +} + +/* + * reserve a port number. if "0" specified, we will try to pick one + * starting at roce_next_port. roce_next_port will take on the values + * 1..4096 + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num) +{ + if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { + spin_lock(&roce_bitmap_lock); + if (!port_num) { + port_num = bitmap_find_next_zero_area(roce_bitmap, + ROCE_BITMAP_SZ, + roce_next_port /* start */, + 1 /* nr */, + 0 /* align */); + roce_next_port = (port_num & 4095) + 1; + } else if (test_bit(port_num, roce_bitmap)) { + usnic_err("Failed to allocate port for %s\n", + usnic_transport_to_str(type)); + spin_unlock(&roce_bitmap_lock); + goto out_fail; + } + bitmap_set(roce_bitmap, port_num, 1); + spin_unlock(&roce_bitmap_lock); + } else { + usnic_err("Failed to allocate port - transport %s unsupported\n", + usnic_transport_to_str(type)); + goto out_fail; + } + + usnic_dbg("Allocating port %hu for %s\n", port_num, + usnic_transport_to_str(type)); + return port_num; + +out_fail: + return 0; +} + +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num) +{ + if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { + spin_lock(&roce_bitmap_lock); + if (!port_num) { + usnic_err("Unreserved unvalid port num 0 for %s\n", + usnic_transport_to_str(type)); + goto out_roce_custom; + } + + if (!test_bit(port_num, roce_bitmap)) { + usnic_err("Unreserving invalid %hu for %s\n", + port_num, + usnic_transport_to_str(type)); + goto out_roce_custom; + } + bitmap_clear(roce_bitmap, port_num, 1); + usnic_dbg("Freeing port %hu for %s\n", port_num, + usnic_transport_to_str(type)); +out_roce_custom: + spin_unlock(&roce_bitmap_lock); + } else { + usnic_err("Freeing invalid port %hu for %d\n", port_num, type); + } +} + +struct socket *usnic_transport_get_socket(int sock_fd) +{ + struct socket *sock; + int err; + char buf[25]; + + /* sockfd_lookup will internally do a fget */ + sock = sockfd_lookup(sock_fd, &err); + if (!sock) { + usnic_err("Unable to lookup socket for fd %d with err %d\n", + sock_fd, err); + return ERR_PTR(-ENOENT); + } + + usnic_transport_sock_to_str(buf, sizeof(buf), sock); + usnic_dbg("Get sock %s\n", buf); + + return sock; +} + +void usnic_transport_put_socket(struct socket *sock) +{ + char buf[100]; + + usnic_transport_sock_to_str(buf, sizeof(buf), sock); + usnic_dbg("Put sock %s\n", buf); + sockfd_put(sock); +} + +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, + uint32_t *addr, uint16_t *port) +{ + int len; + int err; + struct sockaddr_in sock_addr; + + err = sock->ops->getname(sock, + (struct sockaddr *)&sock_addr, + &len, 0); + if (err) + return err; + + if (sock_addr.sin_family != AF_INET) + return -EINVAL; + + if (proto) + *proto = sock->sk->sk_protocol; + if (port) + *port = ntohs(((struct sockaddr_in *)&sock_addr)->sin_port); + if (addr) + *addr = ntohl(((struct sockaddr_in *) + &sock_addr)->sin_addr.s_addr); + + return 0; +} + +int usnic_transport_init(void) +{ + roce_bitmap = kzalloc(ROCE_BITMAP_SZ, GFP_KERNEL); + if (!roce_bitmap) { + usnic_err("Failed to allocate bit map"); + return -ENOMEM; + } + + /* Do not ever allocate bit 0, hence set it here */ + bitmap_set(roce_bitmap, 0, 1); + return 0; +} + +void usnic_transport_fini(void) +{ + kfree(roce_bitmap); +} diff --git a/drivers/infiniband/hw/usnic/usnic_transport.h b/drivers/infiniband/hw/usnic/usnic_transport.h new file mode 100644 index 00000000000..7e5dc6d9f46 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_transport.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_TRANSPORT_H_ +#define USNIC_TRANSPORT_H_ + +#include "usnic_abi.h" + +const char *usnic_transport_to_str(enum usnic_transport_type trans_type); +/* + * Returns number of bytes written, excluding null terminator. If + * nothing was written, the function returns 0. + */ +int usnic_transport_sock_to_str(char *buf, int buf_sz, + struct socket *sock); +/* + * Reserve a port. If "port_num" is set, then the function will try + * to reserve that particular port. + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num); +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num); +/* + * Do a fget on the socket refered to by sock_fd and returns the socket. + * Socket will not be destroyed before usnic_transport_put_socket has + * been called. + */ +struct socket *usnic_transport_get_socket(int sock_fd); +void usnic_transport_put_socket(struct socket *sock); +/* + * Call usnic_transport_get_socket before calling *_sock_get_addr + */ +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, + uint32_t *addr, uint16_t *port); +int usnic_transport_init(void); +void usnic_transport_fini(void); +#endif /* !USNIC_TRANSPORT_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c new file mode 100644 index 00000000000..801a1d6937e --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -0,0 +1,604 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mm.h> +#include <linux/dma-mapping.h> +#include <linux/sched.h> +#include <linux/hugetlb.h> +#include <linux/dma-attrs.h> +#include <linux/iommu.h> +#include <linux/workqueue.h> +#include <linux/list.h> +#include <linux/pci.h> + +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_uiom_interval_tree.h" + +static struct workqueue_struct *usnic_uiom_wq; + +#define USNIC_UIOM_PAGE_CHUNK \ + ((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list)) /\ + ((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] - \ + (void *) &((struct usnic_uiom_chunk *) 0)->page_list[0])) + +static void usnic_uiom_reg_account(struct work_struct *work) +{ + struct usnic_uiom_reg *umem = container_of(work, + struct usnic_uiom_reg, work); + + down_write(&umem->mm->mmap_sem); + umem->mm->locked_vm -= umem->diff; + up_write(&umem->mm->mmap_sem); + mmput(umem->mm); + kfree(umem); +} + +static int usnic_uiom_dma_fault(struct iommu_domain *domain, + struct device *dev, + unsigned long iova, int flags, + void *token) +{ + usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n", + dev_name(dev), + domain, iova, flags); + return -ENOSYS; +} + +static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) +{ + struct usnic_uiom_chunk *chunk, *tmp; + struct page *page; + struct scatterlist *sg; + int i; + dma_addr_t pa; + + list_for_each_entry_safe(chunk, tmp, chunk_list, list) { + for_each_sg(chunk->page_list, sg, chunk->nents, i) { + page = sg_page(sg); + pa = sg_phys(sg); + if (dirty) + set_page_dirty_lock(page); + put_page(page); + usnic_dbg("pa: %pa\n", &pa); + } + kfree(chunk); + } +} + +static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, + int dmasync, struct list_head *chunk_list) +{ + struct page **page_list; + struct scatterlist *sg; + struct usnic_uiom_chunk *chunk; + unsigned long locked; + unsigned long lock_limit; + unsigned long cur_base; + unsigned long npages; + int ret; + int off; + int i; + int flags; + dma_addr_t pa; + DEFINE_DMA_ATTRS(attrs); + + if (dmasync) + dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); + + if (!can_do_mlock()) + return -EPERM; + + INIT_LIST_HEAD(chunk_list); + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + locked = npages + current->mm->locked_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -ENOMEM; + goto out; + } + + flags = IOMMU_READ | IOMMU_CACHE; + flags |= (writable) ? IOMMU_WRITE : 0; + cur_base = addr & PAGE_MASK; + ret = 0; + + while (npages) { + ret = get_user_pages(current, current->mm, cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + 1, !writable, page_list, NULL); + + if (ret < 0) + goto out; + + npages -= ret; + off = 0; + + while (ret) { + chunk = kmalloc(sizeof(*chunk) + + sizeof(struct scatterlist) * + min_t(int, ret, USNIC_UIOM_PAGE_CHUNK), + GFP_KERNEL); + if (!chunk) { + ret = -ENOMEM; + goto out; + } + + chunk->nents = min_t(int, ret, USNIC_UIOM_PAGE_CHUNK); + sg_init_table(chunk->page_list, chunk->nents); + for_each_sg(chunk->page_list, sg, chunk->nents, i) { + sg_set_page(sg, page_list[i + off], + PAGE_SIZE, 0); + pa = sg_phys(sg); + usnic_dbg("va: 0x%lx pa: %pa\n", + cur_base + i*PAGE_SIZE, &pa); + } + cur_base += chunk->nents * PAGE_SIZE; + ret -= chunk->nents; + off += chunk->nents; + list_add_tail(&chunk->list, chunk_list); + } + + ret = 0; + } + +out: + if (ret < 0) + usnic_uiom_put_pages(chunk_list, 0); + else + current->mm->locked_vm = locked; + + up_write(¤t->mm->mmap_sem); + free_page((unsigned long) page_list); + return ret; +} + +static void usnic_uiom_unmap_sorted_intervals(struct list_head *intervals, + struct usnic_uiom_pd *pd) +{ + struct usnic_uiom_interval_node *interval, *tmp; + long unsigned va, size; + + list_for_each_entry_safe(interval, tmp, intervals, link) { + va = interval->start << PAGE_SHIFT; + size = ((interval->last - interval->start) + 1) << PAGE_SHIFT; + while (size > 0) { + /* Workaround for RH 970401 */ + usnic_dbg("va 0x%lx size 0x%lx", va, PAGE_SIZE); + iommu_unmap(pd->domain, va, PAGE_SIZE); + va += PAGE_SIZE; + size -= PAGE_SIZE; + } + } +} + +static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd, + struct usnic_uiom_reg *uiomr, + int dirty) +{ + int npages; + unsigned long vpn_start, vpn_last; + struct usnic_uiom_interval_node *interval, *tmp; + int writable = 0; + LIST_HEAD(rm_intervals); + + npages = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + vpn_start = (uiomr->va & PAGE_MASK) >> PAGE_SHIFT; + vpn_last = vpn_start + npages - 1; + + spin_lock(&pd->lock); + usnic_uiom_remove_interval(&pd->rb_root, vpn_start, + vpn_last, &rm_intervals); + usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd); + + list_for_each_entry_safe(interval, tmp, &rm_intervals, link) { + if (interval->flags & IOMMU_WRITE) + writable = 1; + list_del(&interval->link); + kfree(interval); + } + + usnic_uiom_put_pages(&uiomr->chunk_list, dirty & writable); + spin_unlock(&pd->lock); +} + +static int usnic_uiom_map_sorted_intervals(struct list_head *intervals, + struct usnic_uiom_reg *uiomr) +{ + int i, err; + size_t size; + struct usnic_uiom_chunk *chunk; + struct usnic_uiom_interval_node *interval_node; + dma_addr_t pa; + dma_addr_t pa_start = 0; + dma_addr_t pa_end = 0; + long int va_start = -EINVAL; + struct usnic_uiom_pd *pd = uiomr->pd; + long int va = uiomr->va & PAGE_MASK; + int flags = IOMMU_READ | IOMMU_CACHE; + + flags |= (uiomr->writable) ? IOMMU_WRITE : 0; + chunk = list_first_entry(&uiomr->chunk_list, struct usnic_uiom_chunk, + list); + list_for_each_entry(interval_node, intervals, link) { +iter_chunk: + for (i = 0; i < chunk->nents; i++, va += PAGE_SIZE) { + pa = sg_phys(&chunk->page_list[i]); + if ((va >> PAGE_SHIFT) < interval_node->start) + continue; + + if ((va >> PAGE_SHIFT) == interval_node->start) { + /* First page of the interval */ + va_start = va; + pa_start = pa; + pa_end = pa; + } + + WARN_ON(va_start == -EINVAL); + + if ((pa_end + PAGE_SIZE != pa) && + (pa != pa_start)) { + /* PAs are not contiguous */ + size = pa_end - pa_start + PAGE_SIZE; + usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x", + va_start, &pa_start, size, flags); + err = iommu_map(pd->domain, va_start, pa_start, + size, flags); + if (err) { + usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", + va_start, &pa_start, size, err); + goto err_out; + } + va_start = va; + pa_start = pa; + pa_end = pa; + } + + if ((va >> PAGE_SHIFT) == interval_node->last) { + /* Last page of the interval */ + size = pa - pa_start + PAGE_SIZE; + usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n", + va_start, &pa_start, size, flags); + err = iommu_map(pd->domain, va_start, pa_start, + size, flags); + if (err) { + usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", + va_start, &pa_start, size, err); + goto err_out; + } + break; + } + + if (pa != pa_start) + pa_end += PAGE_SIZE; + } + + if (i == chunk->nents) { + /* + * Hit last entry of the chunk, + * hence advance to next chunk + */ + chunk = list_first_entry(&chunk->list, + struct usnic_uiom_chunk, + list); + goto iter_chunk; + } + } + + return 0; + +err_out: + usnic_uiom_unmap_sorted_intervals(intervals, pd); + return err; +} + +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, + unsigned long addr, size_t size, + int writable, int dmasync) +{ + struct usnic_uiom_reg *uiomr; + unsigned long va_base, vpn_start, vpn_last; + unsigned long npages; + int offset, err; + LIST_HEAD(sorted_diff_intervals); + + /* + * Intel IOMMU map throws an error if a translation entry is + * changed from read to write. This module may not unmap + * and then remap the entry after fixing the permission + * b/c this open up a small windows where hw DMA may page fault + * Hence, make all entries to be writable. + */ + writable = 1; + + va_base = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + npages = PAGE_ALIGN(size + offset) >> PAGE_SHIFT; + vpn_start = (addr & PAGE_MASK) >> PAGE_SHIFT; + vpn_last = vpn_start + npages - 1; + + uiomr = kmalloc(sizeof(*uiomr), GFP_KERNEL); + if (!uiomr) + return ERR_PTR(-ENOMEM); + + uiomr->va = va_base; + uiomr->offset = offset; + uiomr->length = size; + uiomr->writable = writable; + uiomr->pd = pd; + + err = usnic_uiom_get_pages(addr, size, writable, dmasync, + &uiomr->chunk_list); + if (err) { + usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_free_uiomr; + } + + spin_lock(&pd->lock); + err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last, + (writable) ? IOMMU_WRITE : 0, + IOMMU_WRITE, + &pd->rb_root, + &sorted_diff_intervals); + if (err) { + usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_put_pages; + } + + err = usnic_uiom_map_sorted_intervals(&sorted_diff_intervals, uiomr); + if (err) { + usnic_err("Failed map interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_put_intervals; + + } + + err = usnic_uiom_insert_interval(&pd->rb_root, vpn_start, vpn_last, + (writable) ? IOMMU_WRITE : 0); + if (err) { + usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_unmap_intervals; + } + + usnic_uiom_put_interval_set(&sorted_diff_intervals); + spin_unlock(&pd->lock); + + return uiomr; + +out_unmap_intervals: + usnic_uiom_unmap_sorted_intervals(&sorted_diff_intervals, pd); +out_put_intervals: + usnic_uiom_put_interval_set(&sorted_diff_intervals); +out_put_pages: + usnic_uiom_put_pages(&uiomr->chunk_list, 0); + spin_unlock(&pd->lock); +out_free_uiomr: + kfree(uiomr); + return ERR_PTR(err); +} + +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing) +{ + struct mm_struct *mm; + unsigned long diff; + + __usnic_uiom_reg_release(uiomr->pd, uiomr, 1); + + mm = get_task_mm(current); + if (!mm) { + kfree(uiomr); + return; + } + + diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + + /* + * We may be called with the mm's mmap_sem already held. This + * can happen when a userspace munmap() is the call that drops + * the last reference to our file and calls our release + * method. If there are memory regions to destroy, we'll end + * up here and not be able to take the mmap_sem. In that case + * we defer the vm_locked accounting to the system workqueue. + */ + if (closing) { + if (!down_write_trylock(&mm->mmap_sem)) { + INIT_WORK(&uiomr->work, usnic_uiom_reg_account); + uiomr->mm = mm; + uiomr->diff = diff; + + queue_work(usnic_uiom_wq, &uiomr->work); + return; + } + } else + down_write(&mm->mmap_sem); + + current->mm->locked_vm -= diff; + up_write(&mm->mmap_sem); + mmput(mm); + kfree(uiomr); +} + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void) +{ + struct usnic_uiom_pd *pd; + void *domain; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + pd->domain = domain = iommu_domain_alloc(&pci_bus_type); + if (IS_ERR_OR_NULL(domain)) { + usnic_err("Failed to allocate IOMMU domain with err %ld\n", + PTR_ERR(pd->domain)); + kfree(pd); + return ERR_PTR(domain ? PTR_ERR(domain) : -ENOMEM); + } + + iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL); + + spin_lock_init(&pd->lock); + INIT_LIST_HEAD(&pd->devs); + + return pd; +} + +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd) +{ + iommu_domain_free(pd->domain); + kfree(pd); +} + +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev) +{ + struct usnic_uiom_dev *uiom_dev; + int err; + + uiom_dev = kzalloc(sizeof(*uiom_dev), GFP_ATOMIC); + if (!uiom_dev) + return -ENOMEM; + uiom_dev->dev = dev; + + err = iommu_attach_device(pd->domain, dev); + if (err) + goto out_free_dev; + + if (!iommu_domain_has_cap(pd->domain, IOMMU_CAP_CACHE_COHERENCY)) { + usnic_err("IOMMU of %s does not support cache coherency\n", + dev_name(dev)); + err = -EINVAL; + goto out_detach_device; + } + + spin_lock(&pd->lock); + list_add_tail(&uiom_dev->link, &pd->devs); + pd->dev_cnt++; + spin_unlock(&pd->lock); + + return 0; + +out_detach_device: + iommu_detach_device(pd->domain, dev); +out_free_dev: + kfree(uiom_dev); + return err; +} + +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, struct device *dev) +{ + struct usnic_uiom_dev *uiom_dev; + int found = 0; + + spin_lock(&pd->lock); + list_for_each_entry(uiom_dev, &pd->devs, link) { + if (uiom_dev->dev == dev) { + found = 1; + break; + } + } + + if (!found) { + usnic_err("Unable to free dev %s - not found\n", + dev_name(dev)); + spin_unlock(&pd->lock); + return; + } + + list_del(&uiom_dev->link); + pd->dev_cnt--; + spin_unlock(&pd->lock); + + return iommu_detach_device(pd->domain, dev); +} + +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd) +{ + struct usnic_uiom_dev *uiom_dev; + struct device **devs; + int i = 0; + + spin_lock(&pd->lock); + devs = kcalloc(pd->dev_cnt + 1, sizeof(*devs), GFP_ATOMIC); + if (!devs) { + devs = ERR_PTR(-ENOMEM); + goto out; + } + + list_for_each_entry(uiom_dev, &pd->devs, link) { + devs[i++] = uiom_dev->dev; + } +out: + spin_unlock(&pd->lock); + return devs; +} + +void usnic_uiom_free_dev_list(struct device **devs) +{ + kfree(devs); +} + +int usnic_uiom_init(char *drv_name) +{ + if (!iommu_present(&pci_bus_type)) { + usnic_err("IOMMU required but not present or enabled. USNIC QPs will not function w/o enabling IOMMU\n"); + return -EPERM; + } + + usnic_uiom_wq = create_workqueue(drv_name); + if (!usnic_uiom_wq) { + usnic_err("Unable to alloc wq for drv %s\n", drv_name); + return -ENOMEM; + } + + return 0; +} + +void usnic_uiom_fini(void) +{ + flush_workqueue(usnic_uiom_wq); + destroy_workqueue(usnic_uiom_wq); +} diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h new file mode 100644 index 00000000000..70440996e8f --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_UIOM_H_ +#define USNIC_UIOM_H_ + +#include <linux/list.h> +#include <linux/scatterlist.h> + +#include "usnic_uiom_interval_tree.h" + +#define USNIC_UIOM_READ (1) +#define USNIC_UIOM_WRITE (2) + +#define USNIC_UIOM_MAX_PD_CNT (1000) +#define USNIC_UIOM_MAX_MR_CNT (1000000) +#define USNIC_UIOM_MAX_MR_SIZE (~0UL) +#define USNIC_UIOM_PAGE_SIZE (PAGE_SIZE) + +struct usnic_uiom_dev { + struct device *dev; + struct list_head link; +}; + +struct usnic_uiom_pd { + struct iommu_domain *domain; + spinlock_t lock; + struct rb_root rb_root; + struct list_head devs; + int dev_cnt; +}; + +struct usnic_uiom_reg { + struct usnic_uiom_pd *pd; + unsigned long va; + size_t length; + int offset; + int page_size; + int writable; + struct list_head chunk_list; + struct work_struct work; + struct mm_struct *mm; + unsigned long diff; +}; + +struct usnic_uiom_chunk { + struct list_head list; + int nents; + struct scatterlist page_list[0]; +}; + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void); +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd); +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev); +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, + struct device *dev); +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd); +void usnic_uiom_free_dev_list(struct device **devs); +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, + unsigned long addr, size_t size, + int access, int dmasync); +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing); +int usnic_uiom_init(char *drv_name); +void usnic_uiom_fini(void); +#endif /* USNIC_UIOM_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c new file mode 100644 index 00000000000..3a4288e0fba --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/init.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/list_sort.h> + +#include <linux/interval_tree_generic.h> +#include "usnic_uiom_interval_tree.h" + +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) + +#define MAKE_NODE(node, start, end, ref_cnt, flags, err, err_out) \ + do { \ + node = usnic_uiom_interval_node_alloc(start, \ + end, ref_cnt, flags); \ + if (!node) { \ + err = -ENOMEM; \ + goto err_out; \ + } \ + } while (0) + +#define MARK_FOR_ADD(node, list) (list_add_tail(&node->link, list)) + +#define MAKE_NODE_AND_APPEND(node, start, end, ref_cnt, flags, err, \ + err_out, list) \ + do { \ + MAKE_NODE(node, start, end, \ + ref_cnt, flags, err, \ + err_out); \ + MARK_FOR_ADD(node, list); \ + } while (0) + +#define FLAGS_EQUAL(flags1, flags2, mask) \ + (((flags1) & (mask)) == ((flags2) & (mask))) + +static struct usnic_uiom_interval_node* +usnic_uiom_interval_node_alloc(long int start, long int last, int ref_cnt, + int flags) +{ + struct usnic_uiom_interval_node *interval = kzalloc(sizeof(*interval), + GFP_ATOMIC); + if (!interval) + return NULL; + + interval->start = start; + interval->last = last; + interval->flags = flags; + interval->ref_cnt = ref_cnt; + + return interval; +} + +static int interval_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct usnic_uiom_interval_node *node_a, *node_b; + + node_a = list_entry(a, struct usnic_uiom_interval_node, link); + node_b = list_entry(b, struct usnic_uiom_interval_node, link); + + /* long to int */ + if (node_a->start < node_b->start) + return -1; + else if (node_a->start > node_b->start) + return 1; + + return 0; +} + +static void +find_intervals_intersection_sorted(struct rb_root *root, unsigned long start, + unsigned long last, + struct list_head *list) +{ + struct usnic_uiom_interval_node *node; + + INIT_LIST_HEAD(list); + + for (node = usnic_uiom_interval_tree_iter_first(root, start, last); + node; + node = usnic_uiom_interval_tree_iter_next(node, start, last)) + list_add_tail(&node->link, list); + + list_sort(NULL, list, interval_cmp); +} + +int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last, + int flags, int flag_mask, + struct rb_root *root, + struct list_head *diff_set) +{ + struct usnic_uiom_interval_node *interval, *tmp; + int err = 0; + long int pivot = start; + LIST_HEAD(intersection_set); + + INIT_LIST_HEAD(diff_set); + + find_intervals_intersection_sorted(root, start, last, + &intersection_set); + + list_for_each_entry(interval, &intersection_set, link) { + if (pivot < interval->start) { + MAKE_NODE_AND_APPEND(tmp, pivot, interval->start - 1, + 1, flags, err, err_out, + diff_set); + pivot = interval->start; + } + + /* + * Invariant: Set [start, pivot] is either in diff_set or root, + * but not in both. + */ + + if (pivot > interval->last) { + continue; + } else if (pivot <= interval->last && + FLAGS_EQUAL(interval->flags, flags, + flag_mask)) { + pivot = interval->last + 1; + } + } + + if (pivot <= last) + MAKE_NODE_AND_APPEND(tmp, pivot, last, 1, flags, err, err_out, + diff_set); + + return 0; + +err_out: + list_for_each_entry_safe(interval, tmp, diff_set, link) { + list_del(&interval->link); + kfree(interval); + } + + return err; +} + +void usnic_uiom_put_interval_set(struct list_head *intervals) +{ + struct usnic_uiom_interval_node *interval, *tmp; + list_for_each_entry_safe(interval, tmp, intervals, link) + kfree(interval); +} + +int usnic_uiom_insert_interval(struct rb_root *root, unsigned long start, + unsigned long last, int flags) +{ + struct usnic_uiom_interval_node *interval, *tmp; + unsigned long istart, ilast; + int iref_cnt, iflags; + unsigned long lpivot = start; + int err = 0; + LIST_HEAD(to_add); + LIST_HEAD(intersection_set); + + find_intervals_intersection_sorted(root, start, last, + &intersection_set); + + list_for_each_entry(interval, &intersection_set, link) { + /* + * Invariant - lpivot is the left edge of next interval to be + * inserted + */ + istart = interval->start; + ilast = interval->last; + iref_cnt = interval->ref_cnt; + iflags = interval->flags; + + if (istart < lpivot) { + MAKE_NODE_AND_APPEND(tmp, istart, lpivot - 1, iref_cnt, + iflags, err, err_out, &to_add); + } else if (istart > lpivot) { + MAKE_NODE_AND_APPEND(tmp, lpivot, istart - 1, 1, flags, + err, err_out, &to_add); + lpivot = istart; + } else { + lpivot = istart; + } + + if (ilast > last) { + MAKE_NODE_AND_APPEND(tmp, lpivot, last, iref_cnt + 1, + iflags | flags, err, err_out, + &to_add); + MAKE_NODE_AND_APPEND(tmp, last + 1, ilast, iref_cnt, + iflags, err, err_out, &to_add); + } else { + MAKE_NODE_AND_APPEND(tmp, lpivot, ilast, iref_cnt + 1, + iflags | flags, err, err_out, + &to_add); + } + + lpivot = ilast + 1; + } + + if (lpivot <= last) + MAKE_NODE_AND_APPEND(tmp, lpivot, last, 1, flags, err, err_out, + &to_add); + + list_for_each_entry_safe(interval, tmp, &intersection_set, link) { + usnic_uiom_interval_tree_remove(interval, root); + kfree(interval); + } + + list_for_each_entry(interval, &to_add, link) + usnic_uiom_interval_tree_insert(interval, root); + + return 0; + +err_out: + list_for_each_entry_safe(interval, tmp, &to_add, link) + kfree(interval); + + return err; +} + +void usnic_uiom_remove_interval(struct rb_root *root, unsigned long start, + unsigned long last, struct list_head *removed) +{ + struct usnic_uiom_interval_node *interval; + + for (interval = usnic_uiom_interval_tree_iter_first(root, start, last); + interval; + interval = usnic_uiom_interval_tree_iter_next(interval, + start, + last)) { + if (--interval->ref_cnt == 0) + list_add_tail(&interval->link, removed); + } + + list_for_each_entry(interval, removed, link) + usnic_uiom_interval_tree_remove(interval, root); +} + +INTERVAL_TREE_DEFINE(struct usnic_uiom_interval_node, rb, + unsigned long, __subtree_last, + START, LAST, , usnic_uiom_interval_tree) diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h new file mode 100644 index 00000000000..d4f752e258f --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_UIOM_INTERVAL_TREE_H_ +#define USNIC_UIOM_INTERVAL_TREE_H_ + +#include <linux/rbtree.h> + +struct usnic_uiom_interval_node { + struct rb_node rb; + struct list_head link; + unsigned long start; + unsigned long last; + unsigned long __subtree_last; + unsigned int ref_cnt; + int flags; +}; + +extern void +usnic_uiom_interval_tree_insert(struct usnic_uiom_interval_node *node, + struct rb_root *root); +extern void +usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node, + struct rb_root *root); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_first(struct rb_root *root, + unsigned long start, + unsigned long last); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_next(struct usnic_uiom_interval_node *node, + unsigned long start, unsigned long last); +/* + * Inserts {start...last} into {root}. If there are overlaps, + * nodes will be broken up and merged + */ +int usnic_uiom_insert_interval(struct rb_root *root, + unsigned long start, unsigned long last, + int flags); +/* + * Removed {start...last} from {root}. The nodes removed are returned in + * 'removed.' The caller is responsibile for freeing memory of nodes in + * 'removed.' + */ +void usnic_uiom_remove_interval(struct rb_root *root, + unsigned long start, unsigned long last, + struct list_head *removed); +/* + * Returns {start...last} - {root} (relative complement of {start...last} in + * {root}) in diff_set sorted ascendingly + */ +int usnic_uiom_get_intervals_diff(unsigned long start, + unsigned long last, int flags, + int flag_mask, + struct rb_root *root, + struct list_head *diff_set); +/* Call this to free diff_set returned by usnic_uiom_get_intervals_diff */ +void usnic_uiom_put_interval_set(struct list_head *intervals); +#endif /* USNIC_UIOM_INTERVAL_TREE_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.c b/drivers/infiniband/hw/usnic/usnic_vnic.c new file mode 100644 index 00000000000..656b88c39ed --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_vnic.c @@ -0,0 +1,467 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include "usnic_ib.h" +#include "vnic_resource.h" +#include "usnic_log.h" +#include "usnic_vnic.h" + +struct usnic_vnic { + struct vnic_dev *vdev; + struct vnic_dev_bar bar[PCI_NUM_RESOURCES]; + struct usnic_vnic_res_chunk chunks[USNIC_VNIC_RES_TYPE_MAX]; + spinlock_t res_lock; +}; + +static enum vnic_res_type _to_vnic_res_type(enum usnic_vnic_res_type res_type) +{ +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + vnic_res_type, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + vnic_res_type, + static enum vnic_res_type usnic_vnic_type_2_vnic_type[] = { + USNIC_VNIC_RES_TYPES}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + + if (res_type >= USNIC_VNIC_RES_TYPE_MAX) + return RES_TYPE_MAX; + + return usnic_vnic_type_2_vnic_type[res_type]; +} + +const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type) +{ +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + desc, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + desc, + static const char * const usnic_vnic_res_type_desc[] = { + USNIC_VNIC_RES_TYPES}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + + if (res_type >= USNIC_VNIC_RES_TYPE_MAX) + return "unknown"; + + return usnic_vnic_res_type_desc[res_type]; + +} + +const char *usnic_vnic_pci_name(struct usnic_vnic *vnic) +{ + return pci_name(usnic_vnic_get_pdev(vnic)); +} + +int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, + int buf_sz, + void *hdr_obj, + int (*printtitle)(void *, char*, int), + int (*printcols)(char *, int), + int (*printrow)(void *, char *, int)) +{ + struct usnic_vnic_res_chunk *chunk; + struct usnic_vnic_res *res; + struct vnic_dev_bar *bar0; + int i, j, offset; + + offset = 0; + bar0 = usnic_vnic_get_bar(vnic, 0); + offset += scnprintf(buf + offset, buf_sz - offset, + "VF:%hu BAR0 bus_addr=%pa vaddr=0x%p size=%ld ", + usnic_vnic_get_index(vnic), + &bar0->bus_addr, + bar0->vaddr, bar0->len); + if (printtitle) + offset += printtitle(hdr_obj, buf + offset, buf_sz - offset); + offset += scnprintf(buf + offset, buf_sz - offset, "\n"); + offset += scnprintf(buf + offset, buf_sz - offset, + "|RES\t|CTRL_PIN\t\t|IN_USE\t"); + if (printcols) + offset += printcols(buf + offset, buf_sz - offset); + offset += scnprintf(buf + offset, buf_sz - offset, "\n"); + + spin_lock(&vnic->res_lock); + for (i = 0; i < ARRAY_SIZE(vnic->chunks); i++) { + chunk = &vnic->chunks[i]; + for (j = 0; j < chunk->cnt; j++) { + res = chunk->res[j]; + offset += scnprintf(buf + offset, buf_sz - offset, + "|%s[%u]\t|0x%p\t|%u\t", + usnic_vnic_res_type_to_str(res->type), + res->vnic_idx, res->ctrl, !!res->owner); + if (printrow) { + offset += printrow(res->owner, buf + offset, + buf_sz - offset); + } + offset += scnprintf(buf + offset, buf_sz - offset, + "\n"); + } + } + spin_unlock(&vnic->res_lock); + return offset; +} + +void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec, + enum usnic_vnic_res_type trgt_type, + u16 cnt) +{ + int i; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + if (spec->resources[i].type == trgt_type) { + spec->resources[i].cnt = cnt; + return; + } + } + + WARN_ON(1); +} + +int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec, + struct usnic_vnic_res_spec *res_spec) +{ + int found, i, j; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + found = 0; + + for (j = 0; j < USNIC_VNIC_RES_TYPE_MAX; j++) { + if (res_spec->resources[i].type != + min_spec->resources[i].type) + continue; + found = 1; + if (min_spec->resources[i].cnt > + res_spec->resources[i].cnt) + return -EINVAL; + break; + } + + if (!found) + return -EINVAL; + } + return 0; +} + +int usnic_vnic_spec_dump(char *buf, int buf_sz, + struct usnic_vnic_res_spec *res_spec) +{ + enum usnic_vnic_res_type res_type; + int res_cnt; + int i; + int offset = 0; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + offset += scnprintf(buf + offset, buf_sz - offset, + "Res: %s Cnt: %d ", + usnic_vnic_res_type_to_str(res_type), + res_cnt); + } + + return offset; +} + +int usnic_vnic_check_room(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec) +{ + int i; + enum usnic_vnic_res_type res_type; + int res_cnt; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + + if (res_type == USNIC_VNIC_RES_TYPE_EOL) + break; + + if (res_cnt > usnic_vnic_res_free_cnt(vnic, res_type)) + return -EBUSY; + } + + return 0; +} + +int usnic_vnic_res_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type) +{ + return vnic->chunks[type].cnt; +} + +int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type) +{ + return vnic->chunks[type].free_cnt; +} + +struct usnic_vnic_res_chunk * +usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type, + int cnt, void *owner) +{ + struct usnic_vnic_res_chunk *src, *ret; + struct usnic_vnic_res *res; + int i; + + if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 1 || !owner) + return ERR_PTR(-EINVAL); + + ret = kzalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) { + usnic_err("Failed to allocate chunk for %s - Out of memory\n", + usnic_vnic_pci_name(vnic)); + return ERR_PTR(-ENOMEM); + } + + ret->res = kzalloc(sizeof(*(ret->res))*cnt, GFP_ATOMIC); + if (!ret->res) { + usnic_err("Failed to allocate resources for %s. Out of memory\n", + usnic_vnic_pci_name(vnic)); + kfree(ret); + return ERR_PTR(-ENOMEM); + } + + spin_lock(&vnic->res_lock); + src = &vnic->chunks[type]; + for (i = 0; i < src->cnt && ret->cnt < cnt; i++) { + res = src->res[i]; + if (!res->owner) { + src->free_cnt--; + res->owner = owner; + ret->res[ret->cnt++] = res; + } + } + + spin_unlock(&vnic->res_lock); + ret->type = type; + ret->vnic = vnic; + WARN_ON(ret->cnt != cnt); + + return ret; +} + +void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk) +{ + + struct usnic_vnic_res *res; + int i; + struct usnic_vnic *vnic = chunk->vnic; + + spin_lock(&vnic->res_lock); + while ((i = --chunk->cnt) >= 0) { + res = chunk->res[i]; + chunk->res[i] = NULL; + res->owner = NULL; + vnic->chunks[res->type].free_cnt++; + } + spin_unlock(&vnic->res_lock); + + kfree(chunk->res); + kfree(chunk); +} + +u16 usnic_vnic_get_index(struct usnic_vnic *vnic) +{ + return usnic_vnic_get_pdev(vnic)->devfn - 1; +} + +static int usnic_vnic_alloc_res_chunk(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type, + struct usnic_vnic_res_chunk *chunk) +{ + int cnt, err, i; + struct usnic_vnic_res *res; + + cnt = vnic_dev_get_res_count(vnic->vdev, _to_vnic_res_type(type)); + if (cnt < 1) + return -EINVAL; + + chunk->cnt = chunk->free_cnt = cnt; + chunk->res = kzalloc(sizeof(*(chunk->res))*cnt, GFP_KERNEL); + if (!chunk->res) + return -ENOMEM; + + for (i = 0; i < cnt; i++) { + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) { + err = -ENOMEM; + goto fail; + } + res->type = type; + res->vnic_idx = i; + res->vnic = vnic; + res->ctrl = vnic_dev_get_res(vnic->vdev, + _to_vnic_res_type(type), i); + chunk->res[i] = res; + } + + chunk->vnic = vnic; + return 0; +fail: + for (i--; i >= 0; i--) + kfree(chunk->res[i]); + kfree(chunk->res); + return err; +} + +static void usnic_vnic_free_res_chunk(struct usnic_vnic_res_chunk *chunk) +{ + int i; + for (i = 0; i < chunk->cnt; i++) + kfree(chunk->res[i]); + kfree(chunk->res); +} + +static int usnic_vnic_discover_resources(struct pci_dev *pdev, + struct usnic_vnic *vnic) +{ + enum usnic_vnic_res_type res_type; + int i; + int err = 0; + + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + vnic->bar[i].len = pci_resource_len(pdev, i); + vnic->bar[i].vaddr = pci_iomap(pdev, i, vnic->bar[i].len); + if (!vnic->bar[i].vaddr) { + usnic_err("Cannot memory-map BAR %d, aborting\n", + i); + err = -ENODEV; + goto out_clean_bar; + } + vnic->bar[i].bus_addr = pci_resource_start(pdev, i); + } + + vnic->vdev = vnic_dev_register(NULL, pdev, pdev, vnic->bar, + ARRAY_SIZE(vnic->bar)); + if (!vnic->vdev) { + usnic_err("Failed to register device %s\n", + pci_name(pdev)); + err = -EINVAL; + goto out_clean_bar; + } + + for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1; + res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) { + err = usnic_vnic_alloc_res_chunk(vnic, res_type, + &vnic->chunks[res_type]); + if (err) { + usnic_err("Failed to alloc res %s with err %d\n", + usnic_vnic_res_type_to_str(res_type), + err); + goto out_clean_chunks; + } + } + + return 0; + +out_clean_chunks: + for (res_type--; res_type > USNIC_VNIC_RES_TYPE_EOL; res_type--) + usnic_vnic_free_res_chunk(&vnic->chunks[res_type]); + vnic_dev_unregister(vnic->vdev); +out_clean_bar: + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + if (!vnic->bar[i].vaddr) + break; + + iounmap(vnic->bar[i].vaddr); + } + + return err; +} + +struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic) +{ + return vnic_dev_get_pdev(vnic->vdev); +} + +struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic, + int bar_num) +{ + return (bar_num < ARRAY_SIZE(vnic->bar)) ? &vnic->bar[bar_num] : NULL; +} + +static void usnic_vnic_release_resources(struct usnic_vnic *vnic) +{ + int i; + struct pci_dev *pdev; + enum usnic_vnic_res_type res_type; + + pdev = usnic_vnic_get_pdev(vnic); + + for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1; + res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) + usnic_vnic_free_res_chunk(&vnic->chunks[res_type]); + + vnic_dev_unregister(vnic->vdev); + + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + iounmap(vnic->bar[i].vaddr); + } +} + +struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev) +{ + struct usnic_vnic *vnic; + int err = 0; + + if (!pci_is_enabled(pdev)) { + usnic_err("PCI dev %s is disabled\n", pci_name(pdev)); + return ERR_PTR(-EINVAL); + } + + vnic = kzalloc(sizeof(*vnic), GFP_KERNEL); + if (!vnic) { + usnic_err("Failed to alloc vnic for %s - out of memory\n", + pci_name(pdev)); + return ERR_PTR(-ENOMEM); + } + + spin_lock_init(&vnic->res_lock); + + err = usnic_vnic_discover_resources(pdev, vnic); + if (err) { + usnic_err("Failed to discover %s resources with err %d\n", + pci_name(pdev), err); + goto out_free_vnic; + } + + usnic_dbg("Allocated vnic for %s\n", usnic_vnic_pci_name(vnic)); + + return vnic; + +out_free_vnic: + kfree(vnic); + + return ERR_PTR(err); +} + +void usnic_vnic_free(struct usnic_vnic *vnic) +{ + usnic_vnic_release_resources(vnic); + kfree(vnic); +} diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.h b/drivers/infiniband/hw/usnic/usnic_vnic.h new file mode 100644 index 00000000000..14d931a8829 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_vnic.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_VNIC_H_ +#define USNIC_VNIC_H_ + +#include <linux/pci.h> + +#include "vnic_dev.h" + +/* =USNIC_VNIC_RES_TYPE= =VNIC_RES= =DESC= */ +#define USNIC_VNIC_RES_TYPES \ + DEFINE_USNIC_VNIC_RES_AT(EOL, RES_TYPE_EOL, "EOL", 0) \ + DEFINE_USNIC_VNIC_RES(WQ, RES_TYPE_WQ, "WQ") \ + DEFINE_USNIC_VNIC_RES(RQ, RES_TYPE_RQ, "RQ") \ + DEFINE_USNIC_VNIC_RES(CQ, RES_TYPE_CQ, "CQ") \ + DEFINE_USNIC_VNIC_RES(INTR, RES_TYPE_INTR_CTRL, "INT") \ + DEFINE_USNIC_VNIC_RES(MAX, RES_TYPE_MAX, "MAX")\ + +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t = val, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t, +enum usnic_vnic_res_type { + USNIC_VNIC_RES_TYPES +}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + +struct usnic_vnic_res { + enum usnic_vnic_res_type type; + unsigned int vnic_idx; + struct usnic_vnic *vnic; + void __iomem *ctrl; + void *owner; +}; + +struct usnic_vnic_res_chunk { + enum usnic_vnic_res_type type; + int cnt; + int free_cnt; + struct usnic_vnic_res **res; + struct usnic_vnic *vnic; +}; + +struct usnic_vnic_res_desc { + enum usnic_vnic_res_type type; + uint16_t cnt; +}; + +struct usnic_vnic_res_spec { + struct usnic_vnic_res_desc resources[USNIC_VNIC_RES_TYPE_MAX]; +}; + +const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type); +const char *usnic_vnic_pci_name(struct usnic_vnic *vnic); +int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, int buf_sz, + void *hdr_obj, + int (*printtitle)(void *, char*, int), + int (*printcols)(char *, int), + int (*printrow)(void *, char *, int)); +void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec, + enum usnic_vnic_res_type trgt_type, + u16 cnt); +int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_spec_dump(char *buf, int buf_sz, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_check_room(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_res_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type); +int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type); +struct usnic_vnic_res_chunk * +usnic_vnic_get_resources(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type, + int cnt, + void *owner); +void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk); +struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic); +struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic, + int bar_num); +struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev); +void usnic_vnic_free(struct usnic_vnic *vnic); +u16 usnic_vnic_get_index(struct usnic_vnic *vnic); + +#endif /*!USNIC_VNIC_H_*/ diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile new file mode 100644 index 00000000000..f3c7dcf0309 --- /dev/null +++ b/drivers/infiniband/ulp/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_INFINIBAND_IPOIB) += ipoib/ +obj-$(CONFIG_INFINIBAND_SRP) += srp/ +obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ +obj-$(CONFIG_INFINIBAND_ISER) += iser/ +obj-$(CONFIG_INFINIBAND_ISERT) += isert/ diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index eb71aaa26a9..c639f90cfda 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -101,6 +101,7 @@ enum { IPOIB_MCAST_FLAG_SENDONLY = 1, IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, + IPOIB_MCAST_JOIN_STARTED = 4, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, @@ -151,6 +152,7 @@ struct ipoib_mcast { struct sk_buff_head pkt_queue; struct net_device *dev; + struct completion done; }; struct ipoib_rx_buf { @@ -299,7 +301,7 @@ struct ipoib_dev_priv { unsigned long flags; - struct mutex vlan_mutex; + struct rw_semaphore vlan_rwsem; struct rb_root path_tree; struct list_head path_list; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 7a3175400b2..933efcea0d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -140,7 +140,8 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, struct ipoib_cm_rx_buf *rx_ring, int id, int frags, - u64 mapping[IPOIB_CM_RX_SG]) + u64 mapping[IPOIB_CM_RX_SG], + gfp_t gfp) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; @@ -164,7 +165,7 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, } for (i = 0; i < frags; i++) { - struct page *page = alloc_page(GFP_ATOMIC); + struct page *page = alloc_page(gfp); if (!page) goto partial_error; @@ -382,7 +383,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, - rx->rx_ring[i].mapping)) { + rx->rx_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); ret = -ENOMEM; goto err_count; @@ -639,7 +641,8 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; - newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping); + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, + mapping, GFP_ATOMIC); if (unlikely(!newskb)) { /* * If we can't allocate a new RX buffer, dump @@ -1027,10 +1030,20 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ .cap.max_send_sge = 1, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, - .qp_context = tx + .qp_context = tx, + .create_flags = IB_QP_CREATE_USE_GFP_NOIO }; - return ib_create_qp(priv->pd, &attr); + struct ib_qp *tx_qp; + + tx_qp = ib_create_qp(priv->pd, &attr); + if (PTR_ERR(tx_qp) == -EINVAL) { + ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n", + priv->ca->name); + attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; + tx_qp = ib_create_qp(priv->pd, &attr); + } + return tx_qp; } static int ipoib_cm_send_req(struct net_device *dev, @@ -1101,12 +1114,14 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, struct ipoib_dev_priv *priv = netdev_priv(p->dev); int ret; - p->tx_ring = vzalloc(ipoib_sendq_size * sizeof *p->tx_ring); + p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring, + GFP_NOIO, PAGE_KERNEL); if (!p->tx_ring) { ipoib_warn(priv, "failed to allocate tx ring\n"); ret = -ENOMEM; goto err_tx; } + memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); p->qp = ipoib_cm_create_tx_qp(p->dev, p); if (IS_ERR(p->qp)) { @@ -1556,7 +1571,8 @@ int ipoib_cm_dev_init(struct net_device *dev) for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, priv->cm.num_frags - 1, - priv->cm.srq_ring[i].mapping)) { + priv->cm.srq_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate " "receive buffer %d\n", i); ipoib_cm_dev_cleanup(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index c4b3940845e..078cadd6c79 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -105,5 +105,5 @@ static const struct ethtool_ops ipoib_ethtool_ops = { void ipoib_set_ethtool_ops(struct net_device *dev) { - SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops); + dev->ethtool_ops = &ipoib_ethtool_ops; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 196b1d13cbc..6a7003ddb0b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -685,15 +685,13 @@ int ipoib_ib_dev_open(struct net_device *dev) ret = ipoib_ib_post_receives(dev); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } ret = ipoib_cm_dev_open(dev); if (ret) { ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); @@ -704,6 +702,11 @@ int ipoib_ib_dev_open(struct net_device *dev) napi_enable(&priv->napi); return 0; +dev_stop: + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); + ipoib_ib_dev_stop(dev, 1); + return -1; } static void ipoib_pkey_dev_check_presence(struct net_device *dev) @@ -746,10 +749,8 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); set_bit(IPOIB_PKEY_STOP, &priv->flags); - cancel_delayed_work(&priv->pkey_poll_task); + cancel_delayed_work_sync(&priv->pkey_poll_task); mutex_unlock(&pkey_mutex); - if (flush) - flush_workqueue(ipoib_workqueue); } ipoib_mcast_stop_thread(dev, flush); @@ -974,7 +975,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, u16 new_index; int result; - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); /* * Flush any child interfaces too -- they might be up even if @@ -983,7 +984,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, list_for_each_entry(cpriv, &priv->child_intfs, list) __ipoib_ib_dev_flush(cpriv, level); - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { /* for non-child devices must check/update the pkey value here */ @@ -1081,6 +1082,11 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_dbg(priv, "cleaning up ib_dev\n"); + /* + * We must make sure there are no more (path) completions + * that may wish to touch priv fields that are no longer valid + */ + ipoib_flush_paths(dev); ipoib_mcast_stop_thread(dev, 1); ipoib_mcast_dev_flush(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 82cec1af902..5786a78ff8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -104,6 +104,8 @@ int ipoib_open(struct net_device *dev) ipoib_dbg(priv, "bringing up interface\n"); + netif_carrier_off(dev); + set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(dev)) @@ -119,7 +121,7 @@ int ipoib_open(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -129,7 +131,7 @@ int ipoib_open(struct net_device *dev) dev_change_flags(cpriv->dev, flags | IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } netif_start_queue(dev); @@ -162,7 +164,7 @@ static int ipoib_stop(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -172,7 +174,7 @@ static int ipoib_stop(struct net_device *dev) dev_change_flags(cpriv->dev, flags & ~IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } return 0; @@ -1350,7 +1352,7 @@ void ipoib_setup(struct net_device *dev) ipoib_set_ethtool_ops(dev); - netif_napi_add(dev, &priv->napi, ipoib_poll, 100); + netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT); dev->watchdog_timeo = HZ; @@ -1366,13 +1368,11 @@ void ipoib_setup(struct net_device *dev) memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); - netif_carrier_off(dev); - priv->dev = dev; spin_lock_init(&priv->lock); - mutex_init(&priv->vlan_mutex); + init_rwsem(&priv->vlan_rwsem); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index cecb98a4c66..d4e005720d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -386,8 +386,10 @@ static int ipoib_mcast_join_complete(int status, mcast->mcmember.mgid.raw, status); /* We trap for port events ourselves. */ - if (status == -ENETRESET) - return 0; + if (status == -ENETRESET) { + status = 0; + goto out; + } if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); @@ -407,7 +409,8 @@ static int ipoib_mcast_join_complete(int status, if (mcast == priv->broadcast) queue_work(ipoib_workqueue, &priv->carrier_on_task); - return 0; + status = 0; + goto out; } if (mcast->logcount++ < 20) { @@ -434,7 +437,8 @@ static int ipoib_mcast_join_complete(int status, mcast->backoff * HZ); spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); - +out: + complete(&mcast->done); return status; } @@ -484,11 +488,15 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, } set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); if (IS_ERR(mcast->mc)) { clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + complete(&mcast->done); ret = PTR_ERR(mcast->mc); ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); @@ -510,10 +518,18 @@ void ipoib_mcast_join_task(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, mcast_task.work); struct net_device *dev = priv->dev; + struct ib_port_attr port_attr; if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; + if (ib_query_port(priv->ca, priv->port, &port_attr) || + port_attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n", + port_attr.state); + return; + } + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) ipoib_warn(priv, "ib_query_gid() failed\n"); else @@ -751,6 +767,11 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); + /* seperate between the wait to the leave*/ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(dev, mcast); ipoib_mcast_free(mcast); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index f81abe16cf0..cdc7df4fdb8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -31,6 +31,7 @@ */ #include <linux/netdevice.h> +#include <linux/if_arp.h> /* For ARPHRD_xxx */ #include <linux/module.h> #include <net/rtnetlink.h> #include "ipoib.h" @@ -103,7 +104,7 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, return -EINVAL; pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); - if (!pdev) + if (!pdev || pdev->type != ARPHRD_INFINIBAND) return -ENODEV; ppriv = netdev_priv(pdev); @@ -142,10 +143,10 @@ static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head priv = netdev_priv(dev); ppriv = netdev_priv(priv->parent); - mutex_lock(&ppriv->vlan_mutex); + down_write(&ppriv->vlan_rwsem); unregister_netdevice_queue(dev, head); list_del(&priv->list); - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); } static size_t ipoib_get_size(const struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 049a997caff..c56d5d44c53 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -192,6 +192,9 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) + init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; + if (dev->features & NETIF_F_SG) init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 8292554bccb..9fad7b5ac8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -140,7 +140,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) if (!rtnl_trylock()) return restart_syscall(); - mutex_lock(&ppriv->vlan_mutex); + down_write(&ppriv->vlan_rwsem); /* * First ensure this isn't a duplicate. We check the parent device and @@ -163,7 +163,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); out: - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); if (result) free_netdev(priv->dev); @@ -185,7 +185,8 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) if (!rtnl_trylock()) return restart_syscall(); - mutex_lock(&ppriv->vlan_mutex); + + down_write(&ppriv->vlan_rwsem); list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { @@ -195,7 +196,8 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) break; } } - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); + rtnl_unlock(); if (dev) { diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index dd03cfe596d..eb7973957a6 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -5,7 +5,7 @@ * Copyright (C) 2004 Alex Aizman * Copyright (C) 2005 Mike Christie * Copyright (c) 2005, 2006 Voltaire, Inc. All rights reserved. - * Copyright (c) 2013 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. * maintained by openib-general@openib.org * * This software is available to you under a choice of one of two @@ -82,6 +82,8 @@ static unsigned int iscsi_max_lun = 512; module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO); int iser_debug_level = 0; +bool iser_pi_enable = false; +int iser_pi_guard = 0; MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); MODULE_LICENSE("Dual BSD/GPL"); @@ -91,6 +93,13 @@ MODULE_VERSION(DRV_VER); module_param_named(debug_level, iser_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)"); +module_param_named(pi_enable, iser_pi_enable, bool, 0644); +MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); + +module_param_named(pi_guard, iser_pi_guard, int, 0644); +MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:CRC)"); + +static struct workqueue_struct *release_wq; struct iser_global ig; void @@ -138,8 +147,8 @@ static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode) int iser_initialize_task_headers(struct iscsi_task *task, struct iser_tx_desc *tx_desc) { - struct iscsi_iser_conn *iser_conn = task->conn->dd_data; - struct iser_device *device = iser_conn->ib_conn->device; + struct iser_conn *ib_conn = task->conn->dd_data; + struct iser_device *device = ib_conn->device; struct iscsi_iser_task *iser_task = task->dd_data; u64 dma_addr; @@ -153,7 +162,7 @@ int iser_initialize_task_headers(struct iscsi_task *task, tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; tx_desc->tx_sg[0].lkey = device->mr->lkey; - iser_task->iser_conn = iser_conn; + iser_task->ib_conn = ib_conn; return 0; } /** @@ -176,6 +185,8 @@ iscsi_iser_task_init(struct iscsi_task *task) iser_task->command_sent = 0; iser_task_rdma_init(iser_task); + iser_task->sc = task->sc; + return 0; } @@ -278,10 +289,9 @@ iscsi_iser_task_xmit(struct iscsi_task *task) static void iscsi_iser_cleanup_task(struct iscsi_task *task) { struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_tx_desc *tx_desc = &iser_task->desc; - - struct iscsi_iser_conn *iser_conn = task->conn->dd_data; - struct iser_device *device = iser_conn->ib_conn->device; + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct iser_conn *ib_conn = task->conn->dd_data; + struct iser_device *device = ib_conn->device; ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); @@ -296,14 +306,25 @@ static void iscsi_iser_cleanup_task(struct iscsi_task *task) } } +static u8 iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + + if (iser_task->dir[ISER_DIR_IN]) + return iser_check_task_pi_status(iser_task, ISER_DIR_IN, + sector); + else + return iser_check_task_pi_status(iser_task, ISER_DIR_OUT, + sector); +} + static struct iscsi_cls_conn * iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx) { struct iscsi_conn *conn; struct iscsi_cls_conn *cls_conn; - struct iscsi_iser_conn *iser_conn; - cls_conn = iscsi_conn_setup(cls_session, sizeof(*iser_conn), conn_idx); + cls_conn = iscsi_conn_setup(cls_session, 0, conn_idx); if (!cls_conn) return NULL; conn = cls_conn->dd_data; @@ -314,39 +335,15 @@ iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx) */ conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN; - iser_conn = conn->dd_data; - conn->dd_data = iser_conn; - iser_conn->iscsi_conn = conn; - return cls_conn; } -static void -iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn) -{ - struct iscsi_conn *conn = cls_conn->dd_data; - struct iscsi_iser_conn *iser_conn = conn->dd_data; - struct iser_conn *ib_conn = iser_conn->ib_conn; - - iscsi_conn_teardown(cls_conn); - /* - * Userspace will normally call the stop callback and - * already have freed the ib_conn, but if it goofed up then - * we free it here. - */ - if (ib_conn) { - ib_conn->iser_conn = NULL; - iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ - } -} - static int iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, struct iscsi_cls_conn *cls_conn, uint64_t transport_eph, int is_leading) { struct iscsi_conn *conn = cls_conn->dd_data; - struct iscsi_iser_conn *iser_conn; struct iscsi_session *session; struct iser_conn *ib_conn; struct iscsi_endpoint *ep; @@ -373,35 +370,44 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, /* binds the iSER connection retrieved from the previously * connected ep_handle to the iSCSI layer connection. exchanges * connection pointers */ - iser_info("binding iscsi/iser conn %p %p to ib_conn %p\n", - conn, conn->dd_data, ib_conn); - iser_conn = conn->dd_data; - ib_conn->iser_conn = iser_conn; - iser_conn->ib_conn = ib_conn; - iser_conn_get(ib_conn); /* ref iscsi/ib conn binding */ + iser_info("binding iscsi conn %p to ib_conn %p\n", conn, ib_conn); + + conn->dd_data = ib_conn; + ib_conn->iscsi_conn = conn; + return 0; } +static int +iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *iscsi_conn; + struct iser_conn *ib_conn; + + iscsi_conn = cls_conn->dd_data; + ib_conn = iscsi_conn->dd_data; + reinit_completion(&ib_conn->stop_completion); + + return iscsi_conn_start(cls_conn); +} + static void iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) { struct iscsi_conn *conn = cls_conn->dd_data; - struct iscsi_iser_conn *iser_conn = conn->dd_data; - struct iser_conn *ib_conn = iser_conn->ib_conn; + struct iser_conn *ib_conn = conn->dd_data; + + iser_dbg("stopping iscsi_conn: %p, ib_conn: %p\n", conn, ib_conn); + iscsi_conn_stop(cls_conn, flag); /* * Userspace may have goofed up and not bound the connection or * might have only partially setup the connection. */ if (ib_conn) { - iscsi_conn_stop(cls_conn, flag); - /* - * There is no unbind event so the stop callback - * must release the ref from the bind. - */ - iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ + conn->dd_data = NULL; + complete(&ib_conn->stop_completion); } - iser_conn->ib_conn = NULL; } static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) @@ -413,6 +419,17 @@ static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) iscsi_host_free(shost); } +static inline unsigned int +iser_dif_prot_caps(int prot_caps) +{ + return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? SHOST_DIF_TYPE1_PROTECTION | + SHOST_DIX_TYPE1_PROTECTION : 0) | + ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? SHOST_DIF_TYPE2_PROTECTION | + SHOST_DIX_TYPE2_PROTECTION : 0) | + ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? SHOST_DIF_TYPE3_PROTECTION | + SHOST_DIX_TYPE3_PROTECTION : 0); +} + static struct iscsi_cls_session * iscsi_iser_session_create(struct iscsi_endpoint *ep, uint16_t cmds_max, uint16_t qdepth, @@ -437,8 +454,18 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, * older userspace tools (before 2.0-870) did not pass us * the leading conn's ep so this will be NULL; */ - if (ep) + if (ep) { ib_conn = ep->dd_data; + if (ib_conn->pi_support) { + u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; + + scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); + if (iser_pi_guard) + scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); + else + scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); + } + } if (iscsi_host_add(shost, ep ? ib_conn->device->ib_device->dma_device : NULL)) @@ -481,28 +508,28 @@ iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn, case ISCSI_PARAM_HDRDGST_EN: sscanf(buf, "%d", &value); if (value) { - iser_err("DataDigest wasn't negotiated to None"); + iser_err("DataDigest wasn't negotiated to None\n"); return -EPROTO; } break; case ISCSI_PARAM_DATADGST_EN: sscanf(buf, "%d", &value); if (value) { - iser_err("DataDigest wasn't negotiated to None"); + iser_err("DataDigest wasn't negotiated to None\n"); return -EPROTO; } break; case ISCSI_PARAM_IFMARKER_EN: sscanf(buf, "%d", &value); if (value) { - iser_err("IFMarker wasn't negotiated to No"); + iser_err("IFMarker wasn't negotiated to No\n"); return -EPROTO; } break; case ISCSI_PARAM_OFMARKER_EN: sscanf(buf, "%d", &value); if (value) { - iser_err("OFMarker wasn't negotiated to No"); + iser_err("OFMarker wasn't negotiated to No\n"); return -EPROTO; } break; @@ -618,19 +645,20 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) struct iser_conn *ib_conn; ib_conn = ep->dd_data; - if (ib_conn->iser_conn) - /* - * Must suspend xmit path if the ep is bound to the - * iscsi_conn, so we know we are not accessing the ib_conn - * when we free it. - * - * This may not be bound if the ep poll failed. - */ - iscsi_suspend_tx(ib_conn->iser_conn->iscsi_conn); - - - iser_info("ib conn %p state %d\n", ib_conn, ib_conn->state); + iser_info("ep %p ib conn %p state %d\n", ep, ib_conn, ib_conn->state); iser_conn_terminate(ib_conn); + + /* + * if iser_conn and iscsi_conn are bound, we must wait iscsi_conn_stop + * call and ISER_CONN_DOWN state before freeing the iser resources. + * otherwise we are safe to free resources immediately. + */ + if (ib_conn->iscsi_conn) { + INIT_WORK(&ib_conn->release_work, iser_release_work); + queue_work(release_wq, &ib_conn->release_work); + } else { + iser_conn_release(ib_conn); + } } static umode_t iser_attr_is_visible(int param_type, int param) @@ -714,13 +742,13 @@ static struct iscsi_transport iscsi_iser_transport = { /* connection management */ .create_conn = iscsi_iser_conn_create, .bind_conn = iscsi_iser_conn_bind, - .destroy_conn = iscsi_iser_conn_destroy, + .destroy_conn = iscsi_conn_teardown, .attr_is_visible = iser_attr_is_visible, .set_param = iscsi_iser_set_param, .get_conn_param = iscsi_conn_get_param, .get_ep_param = iscsi_iser_get_ep_param, .get_session_param = iscsi_session_get_param, - .start_conn = iscsi_conn_start, + .start_conn = iscsi_iser_conn_start, .stop_conn = iscsi_iser_conn_stop, /* iscsi host params */ .get_host_param = iscsi_host_get_param, @@ -732,6 +760,7 @@ static struct iscsi_transport iscsi_iser_transport = { .xmit_task = iscsi_iser_task_xmit, .cleanup_task = iscsi_iser_cleanup_task, .alloc_pdu = iscsi_iser_pdu_alloc, + .check_protection = iscsi_iser_check_protection, /* recovery */ .session_recovery_timedout = iscsi_session_recovery_timedout, @@ -766,6 +795,12 @@ static int __init iser_init(void) mutex_init(&ig.connlist_mutex); INIT_LIST_HEAD(&ig.connlist); + release_wq = alloc_workqueue("release workqueue", 0, 0); + if (!release_wq) { + iser_err("failed to allocate release workqueue\n"); + return -ENOMEM; + } + iscsi_iser_scsi_transport = iscsi_register_transport( &iscsi_iser_transport); if (!iscsi_iser_scsi_transport) { @@ -784,7 +819,24 @@ register_transport_failure: static void __exit iser_exit(void) { + struct iser_conn *ib_conn, *n; + int connlist_empty; + iser_dbg("Removing iSER datamover...\n"); + destroy_workqueue(release_wq); + + mutex_lock(&ig.connlist_mutex); + connlist_empty = list_empty(&ig.connlist); + mutex_unlock(&ig.connlist_mutex); + + if (!connlist_empty) { + iser_err("Error cleanup stage completed but we still have iser " + "connections, destroying them anyway.\n"); + list_for_each_entry_safe(ib_conn, n, &ig.connlist, conn_list) { + iser_conn_release(ib_conn); + } + } + iscsi_unregister_transport(&iscsi_iser_transport); kmem_cache_destroy(ig.desc_cache); } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 67914027c61..97cd385bf7f 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -8,7 +8,7 @@ * * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. - * Copyright (c) 2013 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -46,6 +46,8 @@ #include <linux/printk.h> #include <scsi/libiscsi.h> #include <scsi/scsi_transport_iscsi.h> +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_device.h> #include <linux/interrupt.h> #include <linux/wait.h> @@ -67,7 +69,7 @@ #define DRV_NAME "iser" #define PFX DRV_NAME ": " -#define DRV_VER "1.1" +#define DRV_VER "1.4" #define iser_dbg(fmt, arg...) \ do { \ @@ -134,10 +136,21 @@ ISER_MAX_TX_MISC_PDUS + \ ISER_MAX_RX_MISC_PDUS) +/* Max registration work requests per command */ +#define ISER_MAX_REG_WR_PER_CMD 5 + +/* For Signature we don't support DATAOUTs so no need to make room for them */ +#define ISER_QP_SIG_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ + (1 + ISER_MAX_REG_WR_PER_CMD) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS) + #define ISER_VER 0x10 #define ISER_WSV 0x08 #define ISER_RSV 0x04 +#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL + struct iser_hdr { u8 flags; u8 rsvd[3]; @@ -201,7 +214,6 @@ struct iser_data_buf { /* fwd declarations */ struct iser_device; struct iser_cq_desc; -struct iscsi_iser_conn; struct iscsi_iser_task; struct iscsi_endpoint; @@ -258,6 +270,7 @@ struct iscsi_iser_task; struct iser_device { struct ib_device *ib_device; struct ib_pd *pd; + struct ib_device_attr dev_attr; struct ib_cq *rx_cq[ISER_MAX_CQ]; struct ib_cq *tx_cq[ISER_MAX_CQ]; struct ib_mr *mr; @@ -277,17 +290,35 @@ struct iser_device { enum iser_data_dir cmd_dir); }; +#define ISER_CHECK_GUARD 0xc0 +#define ISER_CHECK_REFTAG 0x0f +#define ISER_CHECK_APPTAG 0x30 + +enum iser_reg_indicator { + ISER_DATA_KEY_VALID = 1 << 0, + ISER_PROT_KEY_VALID = 1 << 1, + ISER_SIG_KEY_VALID = 1 << 2, + ISER_FASTREG_PROTECTED = 1 << 3, +}; + +struct iser_pi_context { + struct ib_mr *prot_mr; + struct ib_fast_reg_page_list *prot_frpl; + struct ib_mr *sig_mr; +}; + struct fast_reg_descriptor { struct list_head list; /* For fast registration - FRWR */ struct ib_mr *data_mr; struct ib_fast_reg_page_list *data_frpl; - /* Valid for fast registration flag */ - bool valid; + struct iser_pi_context *pi_ctx; + /* registration indicators container */ + u8 reg_indicators; }; struct iser_conn { - struct iscsi_iser_conn *iser_conn; /* iser conn for upcalls */ + struct iscsi_conn *iscsi_conn; struct iscsi_endpoint *ep; enum iser_ib_conn_state state; /* rdma connection state */ atomic_t refcount; @@ -302,6 +333,8 @@ struct iser_conn { int post_recv_buf_count; /* posted rx count */ atomic_t post_send_buf_count; /* posted tx count */ char name[ISER_OBJECT_NAME_SIZE]; + struct work_struct release_work; + struct completion stop_completion; struct list_head conn_list; /* entry in ig conn list */ char *login_buf; @@ -310,6 +343,9 @@ struct iser_conn { unsigned int rx_desc_head; struct iser_rx_desc *rx_descs; struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; + bool pi_support; + + /* Connection memory registration pool */ union { struct { struct ib_fmr_pool *pool; /* pool of IB FMRs */ @@ -319,24 +355,22 @@ struct iser_conn { struct { struct list_head pool; int pool_size; - } frwr; - } fastreg; -}; - -struct iscsi_iser_conn { - struct iscsi_conn *iscsi_conn;/* ptr to iscsi conn */ - struct iser_conn *ib_conn; /* iSER IB conn */ + } fastreg; + }; }; struct iscsi_iser_task { struct iser_tx_desc desc; - struct iscsi_iser_conn *iser_conn; + struct iser_conn *ib_conn; enum iser_task_status status; + struct scsi_cmnd *sc; int command_sent; /* set if command sent */ int dir[ISER_DIRS_NUM]; /* set if dir use*/ struct iser_regd_buf rdma_regd[ISER_DIRS_NUM];/* regd rdma buf */ struct iser_data_buf data[ISER_DIRS_NUM]; /* orig. data des*/ struct iser_data_buf data_copy[ISER_DIRS_NUM];/* contig. copy */ + struct iser_data_buf prot[ISER_DIRS_NUM]; /* prot desc */ + struct iser_data_buf prot_copy[ISER_DIRS_NUM];/* prot copy */ }; struct iser_page_vec { @@ -362,6 +396,8 @@ struct iser_global { extern struct iser_global ig; extern int iser_debug_level; +extern bool iser_pi_enable; +extern int iser_pi_guard; /* allocate connection resources needed for rdma functionality */ int iser_conn_set_full_featured_mode(struct iscsi_conn *conn); @@ -383,12 +419,12 @@ void iscsi_iser_recv(struct iscsi_conn *conn, void iser_conn_init(struct iser_conn *ib_conn); -void iser_conn_get(struct iser_conn *ib_conn); - -int iser_conn_put(struct iser_conn *ib_conn, int destroy_cma_id_allowed); +void iser_conn_release(struct iser_conn *ib_conn); void iser_conn_terminate(struct iser_conn *ib_conn); +void iser_release_work(struct work_struct *work); + void iser_rcv_completion(struct iser_rx_desc *desc, unsigned long dto_xfer_len, struct iser_conn *ib_conn); @@ -401,13 +437,15 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *task); void iser_free_rx_descriptors(struct iser_conn *ib_conn); -void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *task, - enum iser_data_dir cmd_dir); +void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_data_buf *mem_copy, + enum iser_data_dir cmd_dir); int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, enum iser_data_dir cmd_dir); -int iser_reg_rdma_mem_frwr(struct iscsi_iser_task *task, - enum iser_data_dir cmd_dir); +int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task, + enum iser_data_dir cmd_dir); int iser_connect(struct iser_conn *ib_conn, struct sockaddr_in *src_addr, @@ -420,8 +458,8 @@ int iser_reg_page_vec(struct iser_conn *ib_conn, void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir); -void iser_unreg_mem_frwr(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir); +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); int iser_post_recvl(struct iser_conn *ib_conn); int iser_post_recvm(struct iser_conn *ib_conn, int count); @@ -432,12 +470,15 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, enum iser_data_dir iser_dir, enum dma_data_direction dma_dir); -void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task); +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data); int iser_initialize_task_headers(struct iscsi_task *task, struct iser_tx_desc *tx_desc); int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session); int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max); void iser_free_fmr_pool(struct iser_conn *ib_conn); -int iser_create_frwr_pool(struct iser_conn *ib_conn, unsigned cmds_max); -void iser_free_frwr_pool(struct iser_conn *ib_conn); +int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max); +void iser_free_fastreg_pool(struct iser_conn *ib_conn); +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector); #endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 538822684d5..8d44a406063 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. - * Copyright (c) 2013 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -41,15 +41,15 @@ #include "iscsi_iser.h" /* Register user buffer memory and initialize passive rdma - * dto descriptor. Total data size is stored in - * iser_task->data[ISER_DIR_IN].data_len + * dto descriptor. Data size is stored in + * task->data[ISER_DIR_IN].data_len, Protection size + * os stored in task->prot[ISER_DIR_IN].data_len */ -static int iser_prepare_read_cmd(struct iscsi_task *task, - unsigned int edtl) +static int iser_prepare_read_cmd(struct iscsi_task *task) { struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_device *device = iser_task->iser_conn->ib_conn->device; + struct iser_device *device = iser_task->ib_conn->device; struct iser_regd_buf *regd_buf; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; @@ -62,12 +62,15 @@ static int iser_prepare_read_cmd(struct iscsi_task *task, if (err) return err; - if (edtl > iser_task->data[ISER_DIR_IN].data_len) { - iser_err("Total data length: %ld, less than EDTL: " - "%d, in READ cmd BHS itt: %d, conn: 0x%p\n", - iser_task->data[ISER_DIR_IN].data_len, edtl, - task->itt, iser_task->iser_conn); - return -EINVAL; + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_in = &iser_task->prot[ISER_DIR_IN]; + + err = iser_dma_map_task_data(iser_task, + pbuf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return err; } err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN); @@ -89,8 +92,9 @@ static int iser_prepare_read_cmd(struct iscsi_task *task, } /* Register user buffer memory and initialize passive rdma - * dto descriptor. Total data size is stored in - * task->data[ISER_DIR_OUT].data_len + * dto descriptor. Data size is stored in + * task->data[ISER_DIR_OUT].data_len, Protection size + * is stored at task->prot[ISER_DIR_OUT].data_len */ static int iser_prepare_write_cmd(struct iscsi_task *task, @@ -99,7 +103,7 @@ iser_prepare_write_cmd(struct iscsi_task *task, unsigned int edtl) { struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_device *device = iser_task->iser_conn->ib_conn->device; + struct iser_device *device = iser_task->ib_conn->device; struct iser_regd_buf *regd_buf; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; @@ -113,12 +117,15 @@ iser_prepare_write_cmd(struct iscsi_task *task, if (err) return err; - if (edtl > iser_task->data[ISER_DIR_OUT].data_len) { - iser_err("Total data length: %ld, less than EDTL: %d, " - "in WRITE cmd BHS itt: %d, conn: 0x%p\n", - iser_task->data[ISER_DIR_OUT].data_len, - edtl, task->itt, task->conn); - return -EINVAL; + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_out = &iser_task->prot[ISER_DIR_OUT]; + + err = iser_dma_map_task_data(iser_task, + pbuf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return err; } err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT); @@ -327,7 +334,7 @@ free_login_buf: static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) { - struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_conn *ib_conn = conn->dd_data; struct iscsi_session *session = conn->session; iser_dbg("req op %x flags %x\n", req->opcode, req->flags); @@ -340,19 +347,18 @@ static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) * response) and no posted send buffers left - they must have been * consumed during previous login phases. */ - WARN_ON(iser_conn->ib_conn->post_recv_buf_count != 1); - WARN_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0); + WARN_ON(ib_conn->post_recv_buf_count != 1); + WARN_ON(atomic_read(&ib_conn->post_send_buf_count) != 0); if (session->discovery_sess) { iser_info("Discovery session, re-using login RX buffer\n"); return 0; } else iser_info("Normal session, posting batch of RX %d buffers\n", - iser_conn->ib_conn->min_posted_rx); + ib_conn->min_posted_rx); /* Initial post receive buffers */ - if (iser_post_recvm(iser_conn->ib_conn, - iser_conn->ib_conn->min_posted_rx)) + if (iser_post_recvm(ib_conn, ib_conn->min_posted_rx)) return -ENOMEM; return 0; @@ -364,11 +370,11 @@ static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) int iser_send_command(struct iscsi_conn *conn, struct iscsi_task *task) { - struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_conn *ib_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; unsigned long edtl; int err; - struct iser_data_buf *data_buf; + struct iser_data_buf *data_buf, *prot_buf; struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; struct scsi_cmnd *sc = task->sc; struct iser_tx_desc *tx_desc = &iser_task->desc; @@ -377,22 +383,31 @@ int iser_send_command(struct iscsi_conn *conn, /* build the tx desc regd header and add it to the tx desc dto */ tx_desc->type = ISCSI_TX_SCSI_COMMAND; - iser_create_send_desc(iser_conn->ib_conn, tx_desc); + iser_create_send_desc(ib_conn, tx_desc); - if (hdr->flags & ISCSI_FLAG_CMD_READ) + if (hdr->flags & ISCSI_FLAG_CMD_READ) { data_buf = &iser_task->data[ISER_DIR_IN]; - else + prot_buf = &iser_task->prot[ISER_DIR_IN]; + } else { data_buf = &iser_task->data[ISER_DIR_OUT]; + prot_buf = &iser_task->prot[ISER_DIR_OUT]; + } if (scsi_sg_count(sc)) { /* using a scatter list */ data_buf->buf = scsi_sglist(sc); data_buf->size = scsi_sg_count(sc); } - data_buf->data_len = scsi_bufflen(sc); + if (scsi_prot_sg_count(sc)) { + prot_buf->buf = scsi_prot_sglist(sc); + prot_buf->size = scsi_prot_sg_count(sc); + prot_buf->data_len = data_buf->data_len >> + ilog2(sc->device->sector_size) * 8; + } + if (hdr->flags & ISCSI_FLAG_CMD_READ) { - err = iser_prepare_read_cmd(task, edtl); + err = iser_prepare_read_cmd(task); if (err) goto send_command_error; } @@ -408,7 +423,7 @@ int iser_send_command(struct iscsi_conn *conn, iser_task->status = ISER_TASK_STATUS_STARTED; - err = iser_post_send(iser_conn->ib_conn, tx_desc); + err = iser_post_send(ib_conn, tx_desc); if (!err) return 0; @@ -424,7 +439,7 @@ int iser_send_data_out(struct iscsi_conn *conn, struct iscsi_task *task, struct iscsi_data *hdr) { - struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_conn *ib_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; struct iser_tx_desc *tx_desc = NULL; struct iser_regd_buf *regd_buf; @@ -473,7 +488,7 @@ int iser_send_data_out(struct iscsi_conn *conn, itt, buf_offset, data_seg_len); - err = iser_post_send(iser_conn->ib_conn, tx_desc); + err = iser_post_send(ib_conn, tx_desc); if (!err) return 0; @@ -486,19 +501,18 @@ send_data_out_error: int iser_send_control(struct iscsi_conn *conn, struct iscsi_task *task) { - struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_conn *ib_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; struct iser_tx_desc *mdesc = &iser_task->desc; unsigned long data_seg_len; int err = 0; struct iser_device *device; - struct iser_conn *ib_conn = iser_conn->ib_conn; /* build the tx desc regd header and add it to the tx desc dto */ mdesc->type = ISCSI_TX_CONTROL; - iser_create_send_desc(iser_conn->ib_conn, mdesc); + iser_create_send_desc(ib_conn, mdesc); - device = iser_conn->ib_conn->device; + device = ib_conn->device; data_seg_len = ntoh24(task->hdr->dlength); @@ -513,14 +527,13 @@ int iser_send_control(struct iscsi_conn *conn, ib_conn->login_req_dma, task->data_count, DMA_TO_DEVICE); - memcpy(iser_conn->ib_conn->login_req_buf, task->data, - task->data_count); + memcpy(ib_conn->login_req_buf, task->data, task->data_count); ib_dma_sync_single_for_device(device->ib_device, ib_conn->login_req_dma, task->data_count, DMA_TO_DEVICE); - tx_dsg->addr = iser_conn->ib_conn->login_req_dma; + tx_dsg->addr = ib_conn->login_req_dma; tx_dsg->length = task->data_count; tx_dsg->lkey = device->mr->lkey; mdesc->num_sge = 2; @@ -529,7 +542,7 @@ int iser_send_control(struct iscsi_conn *conn, if (task == conn->login_task) { iser_dbg("op %x dsl %lx, posting login rx buffer\n", task->hdr->opcode, data_seg_len); - err = iser_post_recvl(iser_conn->ib_conn); + err = iser_post_recvl(ib_conn); if (err) goto send_control_error; err = iser_post_rx_bufs(conn, task->hdr); @@ -537,7 +550,7 @@ int iser_send_control(struct iscsi_conn *conn, goto send_control_error; } - err = iser_post_send(iser_conn->ib_conn, mdesc); + err = iser_post_send(ib_conn, mdesc); if (!err) return 0; @@ -553,7 +566,6 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc, unsigned long rx_xfer_len, struct iser_conn *ib_conn) { - struct iscsi_iser_conn *conn = ib_conn->iser_conn; struct iscsi_hdr *hdr; u64 rx_dma; int rx_buflen, outstanding, count, err; @@ -575,17 +587,17 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc, iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN)); - iscsi_iser_recv(conn->iscsi_conn, hdr, - rx_desc->data, rx_xfer_len - ISER_HEADERS_LEN); + iscsi_iser_recv(ib_conn->iscsi_conn, hdr, rx_desc->data, + rx_xfer_len - ISER_HEADERS_LEN); ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma, - rx_buflen, DMA_FROM_DEVICE); + rx_buflen, DMA_FROM_DEVICE); /* decrementing conn->post_recv_buf_count only --after-- freeing the * * task eliminates the need to worry on tasks which are completed in * * parallel to the execution of iser_conn_term. So the code that waits * * for the posted rx bufs refcount to become zero handles everything */ - conn->ib_conn->post_recv_buf_count--; + ib_conn->post_recv_buf_count--; if (rx_dma == ib_conn->login_resp_dma) return; @@ -610,11 +622,12 @@ void iser_snd_completion(struct iser_tx_desc *tx_desc, ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); kmem_cache_free(ig.desc_cache, tx_desc); + tx_desc = NULL; } atomic_dec(&ib_conn->post_send_buf_count); - if (tx_desc->type == ISCSI_TX_CONTROL) { + if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) { /* this arithmetic is legal by libiscsi dd_data allocation */ task = (void *) ((long)(void *)tx_desc - sizeof(struct iscsi_task)); @@ -634,6 +647,9 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task) iser_task->data[ISER_DIR_IN].data_len = 0; iser_task->data[ISER_DIR_OUT].data_len = 0; + iser_task->prot[ISER_DIR_IN].data_len = 0; + iser_task->prot[ISER_DIR_OUT].data_len = 0; + memset(&iser_task->rdma_regd[ISER_DIR_IN], 0, sizeof(struct iser_regd_buf)); memset(&iser_task->rdma_regd[ISER_DIR_OUT], 0, @@ -642,28 +658,63 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task) void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) { - struct iser_device *device = iser_task->iser_conn->ib_conn->device; - int is_rdma_aligned = 1; + struct iser_device *device = iser_task->ib_conn->device; + int is_rdma_data_aligned = 1; + int is_rdma_prot_aligned = 1; + int prot_count = scsi_prot_sg_count(iser_task->sc); /* if we were reading, copy back to unaligned sglist, * anyway dma_unmap and free the copy */ if (iser_task->data_copy[ISER_DIR_IN].copy_buf != NULL) { - is_rdma_aligned = 0; - iser_finalize_rdma_unaligned_sg(iser_task, ISER_DIR_IN); + is_rdma_data_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->data[ISER_DIR_IN], + &iser_task->data_copy[ISER_DIR_IN], + ISER_DIR_IN); } + if (iser_task->data_copy[ISER_DIR_OUT].copy_buf != NULL) { - is_rdma_aligned = 0; - iser_finalize_rdma_unaligned_sg(iser_task, ISER_DIR_OUT); + is_rdma_data_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->data[ISER_DIR_OUT], + &iser_task->data_copy[ISER_DIR_OUT], + ISER_DIR_OUT); + } + + if (iser_task->prot_copy[ISER_DIR_IN].copy_buf != NULL) { + is_rdma_prot_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->prot[ISER_DIR_IN], + &iser_task->prot_copy[ISER_DIR_IN], + ISER_DIR_IN); } - if (iser_task->dir[ISER_DIR_IN]) + if (iser_task->prot_copy[ISER_DIR_OUT].copy_buf != NULL) { + is_rdma_prot_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->prot[ISER_DIR_OUT], + &iser_task->prot_copy[ISER_DIR_OUT], + ISER_DIR_OUT); + } + + if (iser_task->dir[ISER_DIR_IN]) { device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN); + if (is_rdma_data_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_IN]); + if (prot_count && is_rdma_prot_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_IN]); + } - if (iser_task->dir[ISER_DIR_OUT]) + if (iser_task->dir[ISER_DIR_OUT]) { device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT); - - /* if the data was unaligned, it was already unmapped and then copied */ - if (is_rdma_aligned) - iser_dma_unmap_task_data(iser_task); + if (is_rdma_data_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_OUT]); + if (prot_count && is_rdma_prot_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_OUT]); + } } diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 1ce0c97d2cc..47acd3ad3a1 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. - * Copyright (c) 2013 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -45,13 +45,19 @@ * iser_start_rdma_unaligned_sg */ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + struct iser_data_buf *data_copy, enum iser_data_dir cmd_dir) { - int dma_nents; - struct ib_device *dev; + struct ib_device *dev = iser_task->ib_conn->device->ib_device; + struct scatterlist *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg; char *mem = NULL; - struct iser_data_buf *data = &iser_task->data[cmd_dir]; - unsigned long cmd_data_len = data->data_len; + unsigned long cmd_data_len = 0; + int dma_nents, i; + + for_each_sg(sgl, sg, data->size, i) + cmd_data_len += ib_sg_dma_len(dev, sg); if (cmd_data_len > ISER_KMALLOC_THRESHOLD) mem = (void *)__get_free_pages(GFP_ATOMIC, @@ -61,17 +67,16 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, if (mem == NULL) { iser_err("Failed to allocate mem size %d %d for copying sglist\n", - data->size,(int)cmd_data_len); + data->size, (int)cmd_data_len); return -ENOMEM; } if (cmd_dir == ISER_DIR_OUT) { /* copy the unaligned sg the buffer which is used for RDMA */ - struct scatterlist *sgl = (struct scatterlist *)data->buf; - struct scatterlist *sg; int i; char *p, *from; + sgl = (struct scatterlist *)data->buf; p = mem; for_each_sg(sgl, sg, data->size, i) { from = kmap_atomic(sg_page(sg)); @@ -83,39 +88,37 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, } } - sg_init_one(&iser_task->data_copy[cmd_dir].sg_single, mem, cmd_data_len); - iser_task->data_copy[cmd_dir].buf = - &iser_task->data_copy[cmd_dir].sg_single; - iser_task->data_copy[cmd_dir].size = 1; + sg_init_one(&data_copy->sg_single, mem, cmd_data_len); + data_copy->buf = &data_copy->sg_single; + data_copy->size = 1; + data_copy->copy_buf = mem; - iser_task->data_copy[cmd_dir].copy_buf = mem; - - dev = iser_task->iser_conn->ib_conn->device->ib_device; - dma_nents = ib_dma_map_sg(dev, - &iser_task->data_copy[cmd_dir].sg_single, - 1, + dma_nents = ib_dma_map_sg(dev, &data_copy->sg_single, 1, (cmd_dir == ISER_DIR_OUT) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); BUG_ON(dma_nents == 0); - iser_task->data_copy[cmd_dir].dma_nents = dma_nents; + data_copy->dma_nents = dma_nents; + data_copy->data_len = cmd_data_len; + return 0; } /** * iser_finalize_rdma_unaligned_sg */ + void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) + struct iser_data_buf *data, + struct iser_data_buf *data_copy, + enum iser_data_dir cmd_dir) { struct ib_device *dev; - struct iser_data_buf *mem_copy; unsigned long cmd_data_len; - dev = iser_task->iser_conn->ib_conn->device->ib_device; - mem_copy = &iser_task->data_copy[cmd_dir]; + dev = iser_task->ib_conn->device->ib_device; - ib_dma_unmap_sg(dev, &mem_copy->sg_single, 1, + ib_dma_unmap_sg(dev, &data_copy->sg_single, 1, (cmd_dir == ISER_DIR_OUT) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); @@ -127,10 +130,10 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, int i; /* copy back read RDMA to unaligned sg */ - mem = mem_copy->copy_buf; + mem = data_copy->copy_buf; - sgl = (struct scatterlist *)iser_task->data[ISER_DIR_IN].buf; - sg_size = iser_task->data[ISER_DIR_IN].size; + sgl = (struct scatterlist *)data->buf; + sg_size = data->size; p = mem; for_each_sg(sgl, sg, sg_size, i) { @@ -143,15 +146,15 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, } } - cmd_data_len = iser_task->data[cmd_dir].data_len; + cmd_data_len = data->data_len; if (cmd_data_len > ISER_KMALLOC_THRESHOLD) - free_pages((unsigned long)mem_copy->copy_buf, + free_pages((unsigned long)data_copy->copy_buf, ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); else - kfree(mem_copy->copy_buf); + kfree(data_copy->copy_buf); - mem_copy->copy_buf = NULL; + data_copy->copy_buf = NULL; } #define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) @@ -319,7 +322,7 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, struct ib_device *dev; iser_task->dir[iser_dir] = 1; - dev = iser_task->iser_conn->ib_conn->device->ib_device; + dev = iser_task->ib_conn->device->ib_device; data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir); if (data->dma_nents == 0) { @@ -329,31 +332,23 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, return 0; } -void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task) +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data) { struct ib_device *dev; - struct iser_data_buf *data; - dev = iser_task->iser_conn->ib_conn->device->ib_device; - - if (iser_task->dir[ISER_DIR_IN]) { - data = &iser_task->data[ISER_DIR_IN]; - ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE); - } - - if (iser_task->dir[ISER_DIR_OUT]) { - data = &iser_task->data[ISER_DIR_OUT]; - ib_dma_unmap_sg(dev, data->buf, data->size, DMA_TO_DEVICE); - } + dev = iser_task->ib_conn->device->ib_device; + ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE); } static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, struct ib_device *ibdev, + struct iser_data_buf *mem, + struct iser_data_buf *mem_copy, enum iser_data_dir cmd_dir, int aligned_len) { - struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; - struct iser_data_buf *mem = &iser_task->data[cmd_dir]; + struct iscsi_conn *iscsi_conn = iser_task->ib_conn->iscsi_conn; iscsi_conn->fmr_unalign_cnt++; iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n", @@ -363,12 +358,12 @@ static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, iser_data_buf_dump(mem, ibdev); /* unmap the command data before accessing it */ - iser_dma_unmap_task_data(iser_task); + iser_dma_unmap_task_data(iser_task, mem); /* allocate copy buf, if we are writing, copy the */ /* unaligned scatterlist, dma map the copy */ - if (iser_start_rdma_unaligned_sg(iser_task, cmd_dir) != 0) - return -ENOMEM; + if (iser_start_rdma_unaligned_sg(iser_task, mem, mem_copy, cmd_dir) != 0) + return -ENOMEM; return 0; } @@ -382,7 +377,7 @@ static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir) { - struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn; + struct iser_conn *ib_conn = iser_task->ib_conn; struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_task->data[cmd_dir]; @@ -396,7 +391,8 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != mem->dma_nents) { - err = fall_to_bounce_buf(iser_task, ibdev, + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->data_copy[cmd_dir], cmd_dir, aligned_len); if (err) { iser_err("failed to allocate bounce buffer\n"); @@ -422,8 +418,8 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, (unsigned long)regd_buf->reg.va, (unsigned long)regd_buf->reg.len); } else { /* use FMR for multiple dma entries */ - iser_page_vec_build(mem, ib_conn->fastreg.fmr.page_vec, ibdev); - err = iser_reg_page_vec(ib_conn, ib_conn->fastreg.fmr.page_vec, + iser_page_vec_build(mem, ib_conn->fmr.page_vec, ibdev); + err = iser_reg_page_vec(ib_conn, ib_conn->fmr.page_vec, ®d_buf->reg); if (err && err != -EAGAIN) { iser_data_buf_dump(mem, ibdev); @@ -431,12 +427,12 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, mem->dma_nents, ntoh24(iser_task->desc.iscsi_header.dlength)); iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n", - ib_conn->fastreg.fmr.page_vec->data_size, - ib_conn->fastreg.fmr.page_vec->length, - ib_conn->fastreg.fmr.page_vec->offset); - for (i = 0; i < ib_conn->fastreg.fmr.page_vec->length; i++) + ib_conn->fmr.page_vec->data_size, + ib_conn->fmr.page_vec->length, + ib_conn->fmr.page_vec->offset); + for (i = 0; i < ib_conn->fmr.page_vec->length; i++) iser_err("page_vec[%d] = 0x%llx\n", i, - (unsigned long long) ib_conn->fastreg.fmr.page_vec->pages[i]); + (unsigned long long) ib_conn->fmr.page_vec->pages[i]); } if (err) return err; @@ -444,94 +440,280 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, return 0; } -static int iser_fast_reg_mr(struct fast_reg_descriptor *desc, - struct iser_conn *ib_conn, +static inline enum ib_t10_dif_type +scsi2ib_prot_type(unsigned char prot_type) +{ + switch (prot_type) { + case SCSI_PROT_DIF_TYPE0: + return IB_T10DIF_NONE; + case SCSI_PROT_DIF_TYPE1: + return IB_T10DIF_TYPE1; + case SCSI_PROT_DIF_TYPE2: + return IB_T10DIF_TYPE2; + case SCSI_PROT_DIF_TYPE3: + return IB_T10DIF_TYPE3; + default: + return IB_T10DIF_NONE; + } +} + + +static int +iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) +{ + unsigned char scsi_ptype = scsi_get_prot_type(sc); + + sig_attrs->mem.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->mem.sig.dif.pi_interval = sc->device->sector_size; + sig_attrs->wire.sig.dif.pi_interval = sc->device->sector_size; + + switch (scsi_get_prot_op(sc)) { + case SCSI_PROT_WRITE_INSERT: + case SCSI_PROT_READ_STRIP: + sig_attrs->mem.sig.dif.type = IB_T10DIF_NONE; + sig_attrs->wire.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + break; + case SCSI_PROT_READ_INSERT: + case SCSI_PROT_WRITE_STRIP: + sig_attrs->mem.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + sig_attrs->wire.sig.dif.type = IB_T10DIF_NONE; + break; + case SCSI_PROT_READ_PASS: + case SCSI_PROT_WRITE_PASS: + sig_attrs->mem.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + sig_attrs->wire.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + break; + default: + iser_err("Unsupported PI operation %d\n", + scsi_get_prot_op(sc)); + return -EINVAL; + } + return 0; +} + + +static int +iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) +{ + switch (scsi_get_prot_type(sc)) { + case SCSI_PROT_DIF_TYPE0: + *mask = 0x0; + break; + case SCSI_PROT_DIF_TYPE1: + case SCSI_PROT_DIF_TYPE2: + *mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG; + break; + case SCSI_PROT_DIF_TYPE3: + *mask = ISER_CHECK_GUARD; + break; + default: + iser_err("Unsupported protection type %d\n", + scsi_get_prot_type(sc)); + return -EINVAL; + } + + return 0; +} + +static int +iser_reg_sig_mr(struct iscsi_iser_task *iser_task, + struct fast_reg_descriptor *desc, struct ib_sge *data_sge, + struct ib_sge *prot_sge, struct ib_sge *sig_sge) +{ + struct iser_conn *ib_conn = iser_task->ib_conn; + struct iser_pi_context *pi_ctx = desc->pi_ctx; + struct ib_send_wr sig_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + struct ib_sig_attrs sig_attrs; + int ret; + u32 key; + + memset(&sig_attrs, 0, sizeof(sig_attrs)); + ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); + if (ret) + goto err; + + ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); + if (ret) + goto err; + + if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); + } + + memset(&sig_wr, 0, sizeof(sig_wr)); + sig_wr.opcode = IB_WR_REG_SIG_MR; + sig_wr.wr_id = ISER_FASTREG_LI_WRID; + sig_wr.sg_list = data_sge; + sig_wr.num_sge = 1; + sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; + sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; + if (scsi_prot_sg_count(iser_task->sc)) + sig_wr.wr.sig_handover.prot = prot_sge; + sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + + if (!wr) + wr = &sig_wr; + else + wr->next = &sig_wr; + + ret = ib_post_send(ib_conn->qp, wr, &bad_wr); + if (ret) { + iser_err("reg_sig_mr failed, ret:%d\n", ret); + goto err; + } + desc->reg_indicators &= ~ISER_SIG_KEY_VALID; + + sig_sge->lkey = pi_ctx->sig_mr->lkey; + sig_sge->addr = 0; + sig_sge->length = data_sge->length + prot_sge->length; + if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT || + scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) { + sig_sge->length += (data_sge->length / + iser_task->sc->device->sector_size) * 8; + } + + iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n", + sig_sge->addr, sig_sge->length, + sig_sge->lkey); +err: + return ret; +} + +static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, struct iser_regd_buf *regd_buf, - u32 offset, unsigned int data_size, - unsigned int page_list_len) + struct iser_data_buf *mem, + enum iser_reg_indicator ind, + struct ib_sge *sge) { + struct fast_reg_descriptor *desc = regd_buf->reg.mem_h; + struct iser_conn *ib_conn = iser_task->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_device *ibdev = device->ib_device; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; struct ib_send_wr fastreg_wr, inv_wr; struct ib_send_wr *bad_wr, *wr = NULL; u8 key; - int ret; + int ret, offset, size, plen; + + /* if there a single dma entry, dma mr suffices */ + if (mem->dma_nents == 1) { + struct scatterlist *sg = (struct scatterlist *)mem->buf; - if (!desc->valid) { + sge->lkey = device->mr->lkey; + sge->addr = ib_sg_dma_address(ibdev, &sg[0]); + sge->length = ib_sg_dma_len(ibdev, &sg[0]); + + iser_dbg("Single DMA entry: lkey=0x%x, addr=0x%llx, length=0x%x\n", + sge->lkey, sge->addr, sge->length); + return 0; + } + + if (ind == ISER_DATA_KEY_VALID) { + mr = desc->data_mr; + frpl = desc->data_frpl; + } else { + mr = desc->pi_ctx->prot_mr; + frpl = desc->pi_ctx->prot_frpl; + } + + plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list, + &offset, &size); + if (plen * SIZE_4K < size) { + iser_err("fast reg page_list too short to hold this SG\n"); + return -EINVAL; + } + + if (!(desc->reg_indicators & ind)) { memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = ISER_FASTREG_LI_WRID; inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.send_flags = IB_SEND_SIGNALED; - inv_wr.ex.invalidate_rkey = desc->data_mr->rkey; + inv_wr.ex.invalidate_rkey = mr->rkey; wr = &inv_wr; /* Bump the key */ - key = (u8)(desc->data_mr->rkey & 0x000000FF); - ib_update_fast_reg_key(desc->data_mr, ++key); + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); } /* Prepare FASTREG WR */ memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = ISER_FASTREG_LI_WRID; fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.send_flags = IB_SEND_SIGNALED; - fastreg_wr.wr.fast_reg.iova_start = desc->data_frpl->page_list[0] + offset; - fastreg_wr.wr.fast_reg.page_list = desc->data_frpl; - fastreg_wr.wr.fast_reg.page_list_len = page_list_len; + fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset; + fastreg_wr.wr.fast_reg.page_list = frpl; + fastreg_wr.wr.fast_reg.page_list_len = plen; fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K; - fastreg_wr.wr.fast_reg.length = data_size; - fastreg_wr.wr.fast_reg.rkey = desc->data_mr->rkey; + fastreg_wr.wr.fast_reg.length = size; + fastreg_wr.wr.fast_reg.rkey = mr->rkey; fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ); - if (!wr) { + if (!wr) wr = &fastreg_wr; - atomic_inc(&ib_conn->post_send_buf_count); - } else { + else wr->next = &fastreg_wr; - atomic_add(2, &ib_conn->post_send_buf_count); - } ret = ib_post_send(ib_conn->qp, wr, &bad_wr); if (ret) { - if (bad_wr->next) - atomic_sub(2, &ib_conn->post_send_buf_count); - else - atomic_dec(&ib_conn->post_send_buf_count); iser_err("fast registration failed, ret:%d\n", ret); return ret; } - desc->valid = false; + desc->reg_indicators &= ~ind; - regd_buf->reg.mem_h = desc; - regd_buf->reg.lkey = desc->data_mr->lkey; - regd_buf->reg.rkey = desc->data_mr->rkey; - regd_buf->reg.va = desc->data_frpl->page_list[0] + offset; - regd_buf->reg.len = data_size; - regd_buf->reg.is_mr = 1; + sge->lkey = mr->lkey; + sge->addr = frpl->page_list[0] + offset; + sge->length = size; return ret; } /** - * iser_reg_rdma_mem_frwr - Registers memory intended for RDMA, + * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA, * using Fast Registration WR (if possible) obtaining rkey and va * * returns 0 on success, errno code on failure */ -int iser_reg_rdma_mem_frwr(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) +int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) { - struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn; + struct iser_conn *ib_conn = iser_task->ib_conn; struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_task->data[cmd_dir]; struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir]; - struct fast_reg_descriptor *desc; - unsigned int data_size, page_list_len; + struct fast_reg_descriptor *desc = NULL; + struct ib_sge data_sge; int err, aligned_len; unsigned long flags; - u32 offset; aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != mem->dma_nents) { - err = fall_to_bounce_buf(iser_task, ibdev, + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->data_copy[cmd_dir], cmd_dir, aligned_len); if (err) { iser_err("failed to allocate bounce buffer\n"); @@ -540,41 +722,79 @@ int iser_reg_rdma_mem_frwr(struct iscsi_iser_task *iser_task, mem = &iser_task->data_copy[cmd_dir]; } - /* if there a single dma entry, dma mr suffices */ - if (mem->dma_nents == 1) { - struct scatterlist *sg = (struct scatterlist *)mem->buf; - - regd_buf->reg.lkey = device->mr->lkey; - regd_buf->reg.rkey = device->mr->rkey; - regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); - regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); - regd_buf->reg.is_mr = 0; - } else { + if (mem->dma_nents != 1 || + scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { spin_lock_irqsave(&ib_conn->lock, flags); - desc = list_first_entry(&ib_conn->fastreg.frwr.pool, + desc = list_first_entry(&ib_conn->fastreg.pool, struct fast_reg_descriptor, list); list_del(&desc->list); spin_unlock_irqrestore(&ib_conn->lock, flags); - page_list_len = iser_sg_to_page_vec(mem, device->ib_device, - desc->data_frpl->page_list, - &offset, &data_size); - - if (page_list_len * SIZE_4K < data_size) { - iser_err("fast reg page_list too short to hold this SG\n"); - err = -EINVAL; - goto err_reg; + regd_buf->reg.mem_h = desc; + } + + err = iser_fast_reg_mr(iser_task, regd_buf, mem, + ISER_DATA_KEY_VALID, &data_sge); + if (err) + goto err_reg; + + if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { + struct ib_sge prot_sge, sig_sge; + + memset(&prot_sge, 0, sizeof(prot_sge)); + if (scsi_prot_sg_count(iser_task->sc)) { + mem = &iser_task->prot[cmd_dir]; + aligned_len = iser_data_buf_aligned_len(mem, ibdev); + if (aligned_len != mem->dma_nents) { + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->prot_copy[cmd_dir], + cmd_dir, aligned_len); + if (err) { + iser_err("failed to allocate bounce buffer\n"); + return err; + } + mem = &iser_task->prot_copy[cmd_dir]; + } + + err = iser_fast_reg_mr(iser_task, regd_buf, mem, + ISER_PROT_KEY_VALID, &prot_sge); + if (err) + goto err_reg; } - err = iser_fast_reg_mr(desc, ib_conn, regd_buf, - offset, data_size, page_list_len); - if (err) - goto err_reg; + err = iser_reg_sig_mr(iser_task, desc, &data_sge, + &prot_sge, &sig_sge); + if (err) { + iser_err("Failed to register signature mr\n"); + return err; + } + desc->reg_indicators |= ISER_FASTREG_PROTECTED; + + regd_buf->reg.lkey = sig_sge.lkey; + regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; + regd_buf->reg.va = sig_sge.addr; + regd_buf->reg.len = sig_sge.length; + regd_buf->reg.is_mr = 1; + } else { + if (desc) { + regd_buf->reg.rkey = desc->data_mr->rkey; + regd_buf->reg.is_mr = 1; + } else { + regd_buf->reg.rkey = device->mr->rkey; + regd_buf->reg.is_mr = 0; + } + + regd_buf->reg.lkey = data_sge.lkey; + regd_buf->reg.va = data_sge.addr; + regd_buf->reg.len = data_sge.length; } return 0; err_reg: - spin_lock_irqsave(&ib_conn->lock, flags); - list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool); - spin_unlock_irqrestore(&ib_conn->lock, flags); + if (desc) { + spin_lock_irqsave(&ib_conn->lock, flags); + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + spin_unlock_irqrestore(&ib_conn->lock, flags); + } + return err; } diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index afe95674008..ea01075f9f9 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. - * Copyright (c) 2013 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -71,17 +71,14 @@ static void iser_event_handler(struct ib_event_handler *handler, */ static int iser_create_device_ib_res(struct iser_device *device) { - int i, j; struct iser_cq_desc *cq_desc; - struct ib_device_attr *dev_attr; + struct ib_device_attr *dev_attr = &device->dev_attr; + int ret, i, j; - dev_attr = kmalloc(sizeof(*dev_attr), GFP_KERNEL); - if (!dev_attr) - return -ENOMEM; - - if (ib_query_device(device->ib_device, dev_attr)) { + ret = ib_query_device(device->ib_device, dev_attr); + if (ret) { pr_warn("Query device failed for %s\n", device->ib_device->name); - goto dev_attr_err; + return ret; } /* Assign function handles - based on FMR support */ @@ -94,14 +91,14 @@ static int iser_create_device_ib_res(struct iser_device *device) device->iser_unreg_rdma_mem = iser_unreg_mem_fmr; } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { - iser_info("FRWR supported, using FRWR for registration\n"); - device->iser_alloc_rdma_reg_res = iser_create_frwr_pool; - device->iser_free_rdma_reg_res = iser_free_frwr_pool; - device->iser_reg_rdma_mem = iser_reg_rdma_mem_frwr; - device->iser_unreg_rdma_mem = iser_unreg_mem_frwr; + iser_info("FastReg supported, using FastReg for registration\n"); + device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool; + device->iser_free_rdma_reg_res = iser_free_fastreg_pool; + device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg; + device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg; } else { - iser_err("IB device does not support FMRs nor FRWRs, can't register memory\n"); - goto dev_attr_err; + iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); + return -1; } device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors); @@ -158,7 +155,6 @@ static int iser_create_device_ib_res(struct iser_device *device) if (ib_register_event_handler(&device->event_handler)) goto handler_err; - kfree(dev_attr); return 0; handler_err: @@ -178,8 +174,6 @@ pd_err: kfree(device->cq_desc); cq_desc_err: iser_err("failed to allocate an IB resource\n"); -dev_attr_err: - kfree(dev_attr); return -1; } @@ -221,13 +215,13 @@ int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max) struct ib_fmr_pool_param params; int ret = -ENOMEM; - ib_conn->fastreg.fmr.page_vec = kmalloc(sizeof(struct iser_page_vec) + - (sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)), - GFP_KERNEL); - if (!ib_conn->fastreg.fmr.page_vec) + ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) + + (sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)), + GFP_KERNEL); + if (!ib_conn->fmr.page_vec) return ret; - ib_conn->fastreg.fmr.page_vec->pages = (u64 *)(ib_conn->fastreg.fmr.page_vec + 1); + ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1); params.page_shift = SHIFT_4K; /* when the first/last SG element are not start/end * @@ -243,16 +237,16 @@ int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max) IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ); - ib_conn->fastreg.fmr.pool = ib_create_fmr_pool(device->pd, ¶ms); - if (!IS_ERR(ib_conn->fastreg.fmr.pool)) + ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, ¶ms); + if (!IS_ERR(ib_conn->fmr.pool)) return 0; /* no FMR => no need for page_vec */ - kfree(ib_conn->fastreg.fmr.page_vec); - ib_conn->fastreg.fmr.page_vec = NULL; + kfree(ib_conn->fmr.page_vec); + ib_conn->fmr.page_vec = NULL; - ret = PTR_ERR(ib_conn->fastreg.fmr.pool); - ib_conn->fastreg.fmr.pool = NULL; + ret = PTR_ERR(ib_conn->fmr.pool); + ib_conn->fmr.pool = NULL; if (ret != -ENOSYS) { iser_err("FMR allocation failed, err %d\n", ret); return ret; @@ -268,93 +262,173 @@ int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max) void iser_free_fmr_pool(struct iser_conn *ib_conn) { iser_info("freeing conn %p fmr pool %p\n", - ib_conn, ib_conn->fastreg.fmr.pool); + ib_conn, ib_conn->fmr.pool); + + if (ib_conn->fmr.pool != NULL) + ib_destroy_fmr_pool(ib_conn->fmr.pool); + + ib_conn->fmr.pool = NULL; + + kfree(ib_conn->fmr.page_vec); + ib_conn->fmr.page_vec = NULL; +} + +static int +iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, + bool pi_enable, struct fast_reg_descriptor *desc) +{ + int ret; + + desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(desc->data_frpl)) { + ret = PTR_ERR(desc->data_frpl); + iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", + ret); + return PTR_ERR(desc->data_frpl); + } - if (ib_conn->fastreg.fmr.pool != NULL) - ib_destroy_fmr_pool(ib_conn->fastreg.fmr.pool); + desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(desc->data_mr)) { + ret = PTR_ERR(desc->data_mr); + iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); + goto fast_reg_mr_failure; + } + desc->reg_indicators |= ISER_DATA_KEY_VALID; + + if (pi_enable) { + struct ib_mr_init_attr mr_init_attr = {0}; + struct iser_pi_context *pi_ctx = NULL; + + desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); + if (!desc->pi_ctx) { + iser_err("Failed to allocate pi context\n"); + ret = -ENOMEM; + goto pi_ctx_alloc_failure; + } + pi_ctx = desc->pi_ctx; + + pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_frpl)) { + ret = PTR_ERR(pi_ctx->prot_frpl); + iser_err("Failed to allocate prot frpl ret=%d\n", + ret); + goto prot_frpl_failure; + } - ib_conn->fastreg.fmr.pool = NULL; + pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, + ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(pi_ctx->prot_mr)) { + ret = PTR_ERR(pi_ctx->prot_mr); + iser_err("Failed to allocate prot frmr ret=%d\n", + ret); + goto prot_mr_failure; + } + desc->reg_indicators |= ISER_PROT_KEY_VALID; + + mr_init_attr.max_reg_descriptors = 2; + mr_init_attr.flags |= IB_MR_SIGNATURE_EN; + pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + if (IS_ERR(pi_ctx->sig_mr)) { + ret = PTR_ERR(pi_ctx->sig_mr); + iser_err("Failed to allocate signature enabled mr err=%d\n", + ret); + goto sig_mr_failure; + } + desc->reg_indicators |= ISER_SIG_KEY_VALID; + } + desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + + iser_dbg("Create fr_desc %p page_list %p\n", + desc, desc->data_frpl->page_list); + + return 0; +sig_mr_failure: + ib_dereg_mr(desc->pi_ctx->prot_mr); +prot_mr_failure: + ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); +prot_frpl_failure: + kfree(desc->pi_ctx); +pi_ctx_alloc_failure: + ib_dereg_mr(desc->data_mr); +fast_reg_mr_failure: + ib_free_fast_reg_page_list(desc->data_frpl); - kfree(ib_conn->fastreg.fmr.page_vec); - ib_conn->fastreg.fmr.page_vec = NULL; + return ret; } /** - * iser_create_frwr_pool - Creates pool of fast_reg descriptors + * iser_create_fastreg_pool - Creates pool of fast_reg descriptors * for fast registration work requests. * returns 0 on success, or errno code on failure */ -int iser_create_frwr_pool(struct iser_conn *ib_conn, unsigned cmds_max) +int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max) { struct iser_device *device = ib_conn->device; struct fast_reg_descriptor *desc; int i, ret; - INIT_LIST_HEAD(&ib_conn->fastreg.frwr.pool); - ib_conn->fastreg.frwr.pool_size = 0; + INIT_LIST_HEAD(&ib_conn->fastreg.pool); + ib_conn->fastreg.pool_size = 0; for (i = 0; i < cmds_max; i++) { - desc = kmalloc(sizeof(*desc), GFP_KERNEL); + desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) { iser_err("Failed to allocate a new fast_reg descriptor\n"); ret = -ENOMEM; goto err; } - desc->data_frpl = ib_alloc_fast_reg_page_list(device->ib_device, - ISCSI_ISER_SG_TABLESIZE + 1); - if (IS_ERR(desc->data_frpl)) { - ret = PTR_ERR(desc->data_frpl); - iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", ret); - goto fast_reg_page_failure; + ret = iser_create_fastreg_desc(device->ib_device, device->pd, + ib_conn->pi_support, desc); + if (ret) { + iser_err("Failed to create fastreg descriptor err=%d\n", + ret); + kfree(desc); + goto err; } - desc->data_mr = ib_alloc_fast_reg_mr(device->pd, - ISCSI_ISER_SG_TABLESIZE + 1); - if (IS_ERR(desc->data_mr)) { - ret = PTR_ERR(desc->data_mr); - iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); - goto fast_reg_mr_failure; - } - desc->valid = true; - list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool); - ib_conn->fastreg.frwr.pool_size++; + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + ib_conn->fastreg.pool_size++; } return 0; -fast_reg_mr_failure: - ib_free_fast_reg_page_list(desc->data_frpl); -fast_reg_page_failure: - kfree(desc); err: - iser_free_frwr_pool(ib_conn); + iser_free_fastreg_pool(ib_conn); return ret; } /** - * iser_free_frwr_pool - releases the pool of fast_reg descriptors + * iser_free_fastreg_pool - releases the pool of fast_reg descriptors */ -void iser_free_frwr_pool(struct iser_conn *ib_conn) +void iser_free_fastreg_pool(struct iser_conn *ib_conn) { struct fast_reg_descriptor *desc, *tmp; int i = 0; - if (list_empty(&ib_conn->fastreg.frwr.pool)) + if (list_empty(&ib_conn->fastreg.pool)) return; - iser_info("freeing conn %p frwr pool\n", ib_conn); + iser_info("freeing conn %p fr pool\n", ib_conn); - list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.frwr.pool, list) { + list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) { list_del(&desc->list); ib_free_fast_reg_page_list(desc->data_frpl); ib_dereg_mr(desc->data_mr); + if (desc->pi_ctx) { + ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); + ib_dereg_mr(desc->pi_ctx->prot_mr); + ib_destroy_mr(desc->pi_ctx->sig_mr); + kfree(desc->pi_ctx); + } kfree(desc); ++i; } - if (i < ib_conn->fastreg.frwr.pool_size) + if (i < ib_conn->fastreg.pool_size) iser_warn("pool still has %d regions registered\n", - ib_conn->fastreg.frwr.pool_size - i); + ib_conn->fastreg.pool_size - i); } /** @@ -389,12 +463,17 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) init_attr.qp_context = (void *)ib_conn; init_attr.send_cq = device->tx_cq[min_index]; init_attr.recv_cq = device->rx_cq[min_index]; - init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; init_attr.cap.max_send_sge = 2; init_attr.cap.max_recv_sge = 1; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; init_attr.qp_type = IB_QPT_RC; + if (ib_conn->pi_support) { + init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS; + init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + } else { + init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; + } ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); if (ret) @@ -502,14 +581,30 @@ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, return ret; } +void iser_release_work(struct work_struct *work) +{ + struct iser_conn *ib_conn; + + ib_conn = container_of(work, struct iser_conn, release_work); + + /* wait for .conn_stop callback */ + wait_for_completion(&ib_conn->stop_completion); + + /* wait for the qp`s post send and post receive buffers to empty */ + wait_event_interruptible(ib_conn->wait, + ib_conn->state == ISER_CONN_DOWN); + + iser_conn_release(ib_conn); +} + /** * Frees all conn objects and deallocs conn descriptor */ -static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id) +void iser_conn_release(struct iser_conn *ib_conn) { struct iser_device *device = ib_conn->device; - BUG_ON(ib_conn->state != ISER_CONN_DOWN); + BUG_ON(ib_conn->state == ISER_CONN_UP); mutex_lock(&ig.connlist_mutex); list_del(&ib_conn->conn_list); @@ -521,27 +616,13 @@ static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id) if (device != NULL) iser_device_try_release(device); /* if cma handler context, the caller actually destroy the id */ - if (ib_conn->cma_id != NULL && can_destroy_id) { + if (ib_conn->cma_id != NULL) { rdma_destroy_id(ib_conn->cma_id); ib_conn->cma_id = NULL; } iscsi_destroy_endpoint(ib_conn->ep); } -void iser_conn_get(struct iser_conn *ib_conn) -{ - atomic_inc(&ib_conn->refcount); -} - -int iser_conn_put(struct iser_conn *ib_conn, int can_destroy_id) -{ - if (atomic_dec_and_test(&ib_conn->refcount)) { - iser_conn_release(ib_conn, can_destroy_id); - return 1; - } - return 0; -} - /** * triggers start of the disconnect procedures and wait for them to be done */ @@ -559,24 +640,19 @@ void iser_conn_terminate(struct iser_conn *ib_conn) if (err) iser_err("Failed to disconnect, conn: 0x%p err %d\n", ib_conn,err); - - wait_event_interruptible(ib_conn->wait, - ib_conn->state == ISER_CONN_DOWN); - - iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */ } -static int iser_connect_error(struct rdma_cm_id *cma_id) +static void iser_connect_error(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; + ib_conn = (struct iser_conn *)cma_id->context; ib_conn->state = ISER_CONN_DOWN; wake_up_interruptible(&ib_conn->wait); - return iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */ } -static int iser_addr_handler(struct rdma_cm_id *cma_id) +static void iser_addr_handler(struct rdma_cm_id *cma_id) { struct iser_device *device; struct iser_conn *ib_conn; @@ -585,22 +661,35 @@ static int iser_addr_handler(struct rdma_cm_id *cma_id) device = iser_device_find_by_ib_device(cma_id); if (!device) { iser_err("device lookup/creation failed\n"); - return iser_connect_error(cma_id); + iser_connect_error(cma_id); + return; } ib_conn = (struct iser_conn *)cma_id->context; ib_conn->device = device; + /* connection T10-PI support */ + if (iser_pi_enable) { + if (!(device->dev_attr.device_cap_flags & + IB_DEVICE_SIGNATURE_HANDOVER)) { + iser_warn("T10-PI requested but not supported on %s, " + "continue without T10-PI\n", + ib_conn->device->ib_device->name); + ib_conn->pi_support = false; + } else { + ib_conn->pi_support = true; + } + } + ret = rdma_resolve_route(cma_id, 1000); if (ret) { iser_err("resolve route failed: %d\n", ret); - return iser_connect_error(cma_id); + iser_connect_error(cma_id); + return; } - - return 0; } -static int iser_route_handler(struct rdma_cm_id *cma_id) +static void iser_route_handler(struct rdma_cm_id *cma_id) { struct rdma_conn_param conn_param; int ret; @@ -628,33 +717,40 @@ static int iser_route_handler(struct rdma_cm_id *cma_id) goto failure; } - return 0; + return; failure: - return iser_connect_error(cma_id); + iser_connect_error(cma_id); } static void iser_connected_handler(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; + struct ib_qp_attr attr; + struct ib_qp_init_attr init_attr; + + (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); + iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); ib_conn = (struct iser_conn *)cma_id->context; - ib_conn->state = ISER_CONN_UP; - wake_up_interruptible(&ib_conn->wait); + if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_PENDING, ISER_CONN_UP)) + wake_up_interruptible(&ib_conn->wait); } -static int iser_disconnected_handler(struct rdma_cm_id *cma_id) +static void iser_disconnected_handler(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; - int ret; ib_conn = (struct iser_conn *)cma_id->context; /* getting here when the state is UP means that the conn is being * * terminated asynchronously from the iSCSI layer's perspective. */ if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, - ISER_CONN_TERMINATING)) - iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn, - ISCSI_ERR_CONN_FAILED); + ISER_CONN_TERMINATING)){ + if (ib_conn->iscsi_conn) + iscsi_conn_failure(ib_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); + else + iser_err("iscsi_iser connection isn't bound\n"); + } /* Complete the termination process if no posts are pending */ if (ib_conn->post_recv_buf_count == 0 && @@ -662,24 +758,19 @@ static int iser_disconnected_handler(struct rdma_cm_id *cma_id) ib_conn->state = ISER_CONN_DOWN; wake_up_interruptible(&ib_conn->wait); } - - ret = iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */ - return ret; } static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { - int ret = 0; - iser_info("event %d status %d conn %p id %p\n", event->event, event->status, cma_id->context, cma_id); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: - ret = iser_addr_handler(cma_id); + iser_addr_handler(cma_id); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: - ret = iser_route_handler(cma_id); + iser_route_handler(cma_id); break; case RDMA_CM_EVENT_ESTABLISHED: iser_connected_handler(cma_id); @@ -689,18 +780,18 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: - ret = iser_connect_error(cma_id); + iser_connect_error(cma_id); break; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_ADDR_CHANGE: - ret = iser_disconnected_handler(cma_id); + iser_disconnected_handler(cma_id); break; default: iser_err("Unexpected RDMA CM event (%d)\n", event->event); break; } - return ret; + return 0; } void iser_conn_init(struct iser_conn *ib_conn) @@ -709,7 +800,7 @@ void iser_conn_init(struct iser_conn *ib_conn) init_waitqueue_head(&ib_conn->wait); ib_conn->post_recv_buf_count = 0; atomic_set(&ib_conn->post_send_buf_count, 0); - atomic_set(&ib_conn->refcount, 1); /* ref ib conn allocation */ + init_completion(&ib_conn->stop_completion); INIT_LIST_HEAD(&ib_conn->conn_list); spin_lock_init(&ib_conn->lock); } @@ -737,7 +828,6 @@ int iser_connect(struct iser_conn *ib_conn, ib_conn->state = ISER_CONN_PENDING; - iser_conn_get(ib_conn); /* ref ib conn's cma id */ ib_conn->cma_id = rdma_create_id(iser_cma_handler, (void *)ib_conn, RDMA_PS_TCP, IB_QPT_RC); @@ -774,9 +864,8 @@ id_failure: ib_conn->cma_id = NULL; addr_failure: ib_conn->state = ISER_CONN_DOWN; - iser_conn_put(ib_conn, 1); /* deref ib conn's cma id */ connect_failure: - iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */ + iser_conn_release(ib_conn); return err; } @@ -797,7 +886,7 @@ int iser_reg_page_vec(struct iser_conn *ib_conn, page_list = page_vec->pages; io_addr = page_list[0]; - mem = ib_fmr_pool_map_phys(ib_conn->fastreg.fmr.pool, + mem = ib_fmr_pool_map_phys(ib_conn->fmr.pool, page_list, page_vec->length, io_addr); @@ -851,11 +940,11 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, reg->mem_h = NULL; } -void iser_unreg_mem_frwr(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) { struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; - struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn; + struct iser_conn *ib_conn = iser_task->ib_conn; struct fast_reg_descriptor *desc = reg->mem_h; if (!reg->is_mr) @@ -864,7 +953,7 @@ void iser_unreg_mem_frwr(struct iscsi_iser_task *iser_task, reg->mem_h = NULL; reg->is_mr = 0; spin_lock_bh(&ib_conn->lock); - list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool); + list_add_tail(&desc->list, &ib_conn->fastreg.pool); spin_unlock_bh(&ib_conn->lock); } @@ -965,7 +1054,7 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc, * perspective. */ if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, ISER_CONN_TERMINATING)) - iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn, + iscsi_conn_failure(ib_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); /* no more non completed posts to the QP, complete the @@ -989,18 +1078,16 @@ static int iser_drain_tx_cq(struct iser_device *device, int cq_index) if (wc.status == IB_WC_SUCCESS) { if (wc.opcode == IB_WC_SEND) iser_snd_completion(tx_desc, ib_conn); - else if (wc.opcode == IB_WC_LOCAL_INV || - wc.opcode == IB_WC_FAST_REG_MR) { - atomic_dec(&ib_conn->post_send_buf_count); - continue; - } else + else iser_err("expected opcode %d got %d\n", IB_WC_SEND, wc.opcode); } else { iser_err("tx id %llx status %d vend_err %x\n", - wc.wr_id, wc.status, wc.vendor_err); - atomic_dec(&ib_conn->post_send_buf_count); - iser_handle_comp_error(tx_desc, ib_conn); + wc.wr_id, wc.status, wc.vendor_err); + if (wc.wr_id != ISER_FASTREG_LI_WRID) { + atomic_dec(&ib_conn->post_send_buf_count); + iser_handle_comp_error(tx_desc, ib_conn); + } } completed_tx++; } @@ -1018,8 +1105,12 @@ static void iser_cq_tasklet_fn(unsigned long data) struct iser_rx_desc *desc; unsigned long xfer_len; struct iser_conn *ib_conn; - int completed_tx, completed_rx; - completed_tx = completed_rx = 0; + int completed_tx, completed_rx = 0; + + /* First do tx drain, so in a case where we have rx flushes and a successful + * tx completion we will still go through completion error handling. + */ + completed_tx = iser_drain_tx_cq(device, cq_index); while (ib_poll_cq(cq, 1, &wc) == 1) { desc = (struct iser_rx_desc *) (unsigned long) wc.wr_id; @@ -1047,7 +1138,6 @@ static void iser_cq_tasklet_fn(unsigned long data) * " would not cause interrupts to be missed" */ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - completed_tx += iser_drain_tx_cq(device, cq_index); iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx); } @@ -1059,3 +1149,51 @@ static void iser_cq_callback(struct ib_cq *cq, void *cq_context) tasklet_schedule(&device->cq_tasklet[cq_index]); } + +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector) +{ + struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; + struct fast_reg_descriptor *desc = reg->mem_h; + unsigned long sector_size = iser_task->sc->device->sector_size; + struct ib_mr_status mr_status; + int ret; + + if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) { + desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + ret = ib_check_mr_status(desc->pi_ctx->sig_mr, + IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + goto err; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + sector_t sector_off = mr_status.sig_err.sig_err_offset; + + do_div(sector_off, sector_size + 8); + *sector = scsi_get_lba(iser_task->sc) + sector_off; + + pr_err("PI error found type %d at sector %llx " + "expected %x vs actual %x\n", + mr_status.sig_err.err_type, + (unsigned long long)*sector, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + return 0x1; + case IB_SIG_BAD_REFTAG: + return 0x3; + case IB_SIG_BAD_APPTAG: + return 0x2; + } + } + } + + return 0; +err: + /* Not alot we can do here, return ambiguous guard error */ + return 0x1; +} diff --git a/drivers/infiniband/ulp/isert/Kconfig b/drivers/infiniband/ulp/isert/Kconfig index ce3fd32167d..02f9759ebb1 100644 --- a/drivers/infiniband/ulp/isert/Kconfig +++ b/drivers/infiniband/ulp/isert/Kconfig @@ -1,5 +1,5 @@ config INFINIBAND_ISERT - tristate "iSCSI Extentions for RDMA (iSER) target support" + tristate "iSCSI Extensions for RDMA (iSER) target support" depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET ---help--- - Support for iSCSI Extentions for RDMA (iSER) Target on Infiniband fabrics. + Support for iSCSI Extensions for RDMA (iSER) Target on Infiniband fabrics. diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 3591855cc5b..d4c7928a0f3 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -22,11 +22,13 @@ #include <linux/socket.h> #include <linux/in.h> #include <linux/in6.h> +#include <linux/llist.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include <target/target_core_base.h> #include <target/target_core_fabric.h> #include <target/iscsi/iscsi_transport.h> +#include <linux/semaphore.h> #include "isert_proto.h" #include "ib_isert.h" @@ -46,10 +48,12 @@ static int isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, struct isert_rdma_wr *wr); static void -isert_unreg_rdma_frwr(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); +isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); static int -isert_reg_rdma_frwr(struct iscsi_conn *conn, struct iscsi_cmd *cmd, - struct isert_rdma_wr *wr); +isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr); +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd); static void isert_qp_event_callback(struct ib_event *e, void *context) @@ -86,7 +90,8 @@ isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr) } static int -isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id) +isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id, + u8 protection) { struct isert_device *device = isert_conn->conn_device; struct ib_qp_init_attr attr; @@ -118,6 +123,8 @@ isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id) attr.cap.max_recv_sge = 1; attr.sq_sig_type = IB_SIGNAL_REQ_WR; attr.qp_type = IB_QPT_RC; + if (protection) + attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; pr_debug("isert_conn_setup_qp cma_id->device: %p\n", cma_id->device); @@ -206,7 +213,9 @@ isert_free_rx_descriptors(struct isert_conn *isert_conn) isert_conn->conn_rx_descs = NULL; } +static void isert_cq_tx_work(struct work_struct *); static void isert_cq_tx_callback(struct ib_cq *, void *); +static void isert_cq_rx_work(struct work_struct *); static void isert_cq_rx_callback(struct ib_cq *, void *); static int @@ -223,22 +232,29 @@ isert_create_device_ib_res(struct isert_device *device) return ret; /* asign function handlers */ - if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { - device->use_frwr = 1; - device->reg_rdma_mem = isert_reg_rdma_frwr; - device->unreg_rdma_mem = isert_unreg_rdma_frwr; + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && + dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { + device->use_fastreg = 1; + device->reg_rdma_mem = isert_reg_rdma; + device->unreg_rdma_mem = isert_unreg_rdma; } else { - device->use_frwr = 0; + device->use_fastreg = 0; device->reg_rdma_mem = isert_map_rdma; device->unreg_rdma_mem = isert_unmap_cmd; } + /* Check signature cap */ + device->pi_capable = dev_attr->device_cap_flags & + IB_DEVICE_SIGNATURE_HANDOVER ? true : false; + device->cqs_used = min_t(int, num_online_cpus(), device->ib_device->num_comp_vectors); device->cqs_used = min(ISERT_MAX_CQ, device->cqs_used); - pr_debug("Using %d CQs, device %s supports %d vectors support FRWR %d\n", + pr_debug("Using %d CQs, device %s supports %d vectors support " + "Fast registration %d pi_capable %d\n", device->cqs_used, device->ib_device->name, - device->ib_device->num_comp_vectors, device->use_frwr); + device->ib_device->num_comp_vectors, device->use_fastreg, + device->pi_capable); device->cq_desc = kzalloc(sizeof(struct isert_cq_desc) * device->cqs_used, GFP_KERNEL); if (!device->cq_desc) { @@ -247,47 +263,43 @@ isert_create_device_ib_res(struct isert_device *device) } cq_desc = device->cq_desc; - device->dev_pd = ib_alloc_pd(ib_dev); - if (IS_ERR(device->dev_pd)) { - ret = PTR_ERR(device->dev_pd); - pr_err("ib_alloc_pd failed for dev_pd: %d\n", ret); - goto out_cq_desc; - } - for (i = 0; i < device->cqs_used; i++) { cq_desc[i].device = device; cq_desc[i].cq_index = i; + INIT_WORK(&cq_desc[i].cq_rx_work, isert_cq_rx_work); device->dev_rx_cq[i] = ib_create_cq(device->ib_device, isert_cq_rx_callback, isert_cq_event_callback, (void *)&cq_desc[i], ISER_MAX_RX_CQ_LEN, i); - if (IS_ERR(device->dev_rx_cq[i])) + if (IS_ERR(device->dev_rx_cq[i])) { + ret = PTR_ERR(device->dev_rx_cq[i]); + device->dev_rx_cq[i] = NULL; goto out_cq; + } + INIT_WORK(&cq_desc[i].cq_tx_work, isert_cq_tx_work); device->dev_tx_cq[i] = ib_create_cq(device->ib_device, isert_cq_tx_callback, isert_cq_event_callback, (void *)&cq_desc[i], ISER_MAX_TX_CQ_LEN, i); - if (IS_ERR(device->dev_tx_cq[i])) + if (IS_ERR(device->dev_tx_cq[i])) { + ret = PTR_ERR(device->dev_tx_cq[i]); + device->dev_tx_cq[i] = NULL; goto out_cq; + } - if (ib_req_notify_cq(device->dev_rx_cq[i], IB_CQ_NEXT_COMP)) + ret = ib_req_notify_cq(device->dev_rx_cq[i], IB_CQ_NEXT_COMP); + if (ret) goto out_cq; - if (ib_req_notify_cq(device->dev_tx_cq[i], IB_CQ_NEXT_COMP)) + ret = ib_req_notify_cq(device->dev_tx_cq[i], IB_CQ_NEXT_COMP); + if (ret) goto out_cq; } - device->dev_mr = ib_get_dma_mr(device->dev_pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(device->dev_mr)) { - ret = PTR_ERR(device->dev_mr); - pr_err("ib_get_dma_mr failed for dev_mr: %d\n", ret); - goto out_cq; - } - return 0; out_cq: @@ -303,9 +315,6 @@ out_cq: ib_destroy_cq(device->dev_tx_cq[j]); } } - ib_dealloc_pd(device->dev_pd); - -out_cq_desc: kfree(device->cq_desc); return ret; @@ -328,8 +337,6 @@ isert_free_device_ib_res(struct isert_device *device) device->dev_tx_cq[i] = NULL; } - ib_dereg_mr(device->dev_mr); - ib_dealloc_pd(device->dev_pd); kfree(device->cq_desc); } @@ -385,40 +392,136 @@ isert_device_find_by_ib_dev(struct rdma_cm_id *cma_id) } static void -isert_conn_free_frwr_pool(struct isert_conn *isert_conn) +isert_conn_free_fastreg_pool(struct isert_conn *isert_conn) { struct fast_reg_descriptor *fr_desc, *tmp; int i = 0; - if (list_empty(&isert_conn->conn_frwr_pool)) + if (list_empty(&isert_conn->conn_fr_pool)) return; - pr_debug("Freeing conn %p frwr pool", isert_conn); + pr_debug("Freeing conn %p fastreg pool", isert_conn); list_for_each_entry_safe(fr_desc, tmp, - &isert_conn->conn_frwr_pool, list) { + &isert_conn->conn_fr_pool, list) { list_del(&fr_desc->list); ib_free_fast_reg_page_list(fr_desc->data_frpl); ib_dereg_mr(fr_desc->data_mr); + if (fr_desc->pi_ctx) { + ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); + ib_dereg_mr(fr_desc->pi_ctx->prot_mr); + ib_destroy_mr(fr_desc->pi_ctx->sig_mr); + kfree(fr_desc->pi_ctx); + } kfree(fr_desc); ++i; } - if (i < isert_conn->conn_frwr_pool_size) + if (i < isert_conn->conn_fr_pool_size) pr_warn("Pool still has %d regions registered\n", - isert_conn->conn_frwr_pool_size - i); + isert_conn->conn_fr_pool_size - i); +} + +static int +isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd, + struct fast_reg_descriptor *fr_desc, u8 protection) +{ + int ret; + + fr_desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(fr_desc->data_frpl)) { + pr_err("Failed to allocate data frpl err=%ld\n", + PTR_ERR(fr_desc->data_frpl)); + return PTR_ERR(fr_desc->data_frpl); + } + + fr_desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(fr_desc->data_mr)) { + pr_err("Failed to allocate data frmr err=%ld\n", + PTR_ERR(fr_desc->data_mr)); + ret = PTR_ERR(fr_desc->data_mr); + goto err_data_frpl; + } + pr_debug("Create fr_desc %p page_list %p\n", + fr_desc, fr_desc->data_frpl->page_list); + fr_desc->ind |= ISERT_DATA_KEY_VALID; + + if (protection) { + struct ib_mr_init_attr mr_init_attr = {0}; + struct pi_context *pi_ctx; + + fr_desc->pi_ctx = kzalloc(sizeof(*fr_desc->pi_ctx), GFP_KERNEL); + if (!fr_desc->pi_ctx) { + pr_err("Failed to allocate pi context\n"); + ret = -ENOMEM; + goto err_data_mr; + } + pi_ctx = fr_desc->pi_ctx; + + pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_frpl)) { + pr_err("Failed to allocate prot frpl err=%ld\n", + PTR_ERR(pi_ctx->prot_frpl)); + ret = PTR_ERR(pi_ctx->prot_frpl); + goto err_pi_ctx; + } + + pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_mr)) { + pr_err("Failed to allocate prot frmr err=%ld\n", + PTR_ERR(pi_ctx->prot_mr)); + ret = PTR_ERR(pi_ctx->prot_mr); + goto err_prot_frpl; + } + fr_desc->ind |= ISERT_PROT_KEY_VALID; + + mr_init_attr.max_reg_descriptors = 2; + mr_init_attr.flags |= IB_MR_SIGNATURE_EN; + pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + if (IS_ERR(pi_ctx->sig_mr)) { + pr_err("Failed to allocate signature enabled mr err=%ld\n", + PTR_ERR(pi_ctx->sig_mr)); + ret = PTR_ERR(pi_ctx->sig_mr); + goto err_prot_mr; + } + fr_desc->ind |= ISERT_SIG_KEY_VALID; + } + fr_desc->ind &= ~ISERT_PROTECTED; + + return 0; +err_prot_mr: + ib_dereg_mr(fr_desc->pi_ctx->prot_mr); +err_prot_frpl: + ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); +err_pi_ctx: + kfree(fr_desc->pi_ctx); +err_data_mr: + ib_dereg_mr(fr_desc->data_mr); +err_data_frpl: + ib_free_fast_reg_page_list(fr_desc->data_frpl); + + return ret; } static int -isert_conn_create_frwr_pool(struct isert_conn *isert_conn) +isert_conn_create_fastreg_pool(struct isert_conn *isert_conn, u8 pi_support) { struct fast_reg_descriptor *fr_desc; struct isert_device *device = isert_conn->conn_device; - int i, ret; + struct se_session *se_sess = isert_conn->conn->sess->se_sess; + struct se_node_acl *se_nacl = se_sess->se_node_acl; + int i, ret, tag_num; + /* + * Setup the number of FRMRs based upon the number of tags + * available to session in iscsi_target_locate_portal(). + */ + tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth); + tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS; - INIT_LIST_HEAD(&isert_conn->conn_frwr_pool); - isert_conn->conn_frwr_pool_size = 0; - for (i = 0; i < ISCSI_DEF_XMIT_CMDS_MAX; i++) { + isert_conn->conn_fr_pool_size = 0; + for (i = 0; i < tag_num; i++) { fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL); if (!fr_desc) { pr_err("Failed to allocate fast_reg descriptor\n"); @@ -426,40 +529,27 @@ isert_conn_create_frwr_pool(struct isert_conn *isert_conn) goto err; } - fr_desc->data_frpl = - ib_alloc_fast_reg_page_list(device->ib_device, - ISCSI_ISER_SG_TABLESIZE); - if (IS_ERR(fr_desc->data_frpl)) { - pr_err("Failed to allocate fr_pg_list err=%ld\n", - PTR_ERR(fr_desc->data_frpl)); - ret = PTR_ERR(fr_desc->data_frpl); - goto err; - } - - fr_desc->data_mr = ib_alloc_fast_reg_mr(device->dev_pd, - ISCSI_ISER_SG_TABLESIZE); - if (IS_ERR(fr_desc->data_mr)) { - pr_err("Failed to allocate frmr err=%ld\n", - PTR_ERR(fr_desc->data_mr)); - ret = PTR_ERR(fr_desc->data_mr); - ib_free_fast_reg_page_list(fr_desc->data_frpl); + ret = isert_create_fr_desc(device->ib_device, + isert_conn->conn_pd, fr_desc, + pi_support); + if (ret) { + pr_err("Failed to create fastreg descriptor err=%d\n", + ret); + kfree(fr_desc); goto err; } - pr_debug("Create fr_desc %p page_list %p\n", - fr_desc, fr_desc->data_frpl->page_list); - fr_desc->valid = true; - list_add_tail(&fr_desc->list, &isert_conn->conn_frwr_pool); - isert_conn->conn_frwr_pool_size++; + list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); + isert_conn->conn_fr_pool_size++; } - pr_debug("Creating conn %p frwr pool size=%d", - isert_conn, isert_conn->conn_frwr_pool_size); + pr_debug("Creating conn %p fastreg pool size=%d", + isert_conn, isert_conn->conn_fr_pool_size); return 0; err: - isert_conn_free_frwr_pool(isert_conn); + isert_conn_free_fastreg_pool(isert_conn); return ret; } @@ -472,6 +562,15 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) struct isert_device *device; struct ib_device *ib_dev = cma_id->device; int ret = 0; + u8 pi_support; + + spin_lock_bh(&np->np_thread_lock); + if (!np->enabled) { + spin_unlock_bh(&np->np_thread_lock); + pr_debug("iscsi_np is not enabled, reject connect request\n"); + return rdma_reject(cma_id, NULL, 0); + } + spin_unlock_bh(&np->np_thread_lock); pr_debug("Entering isert_connect_request cma_id: %p, context: %p\n", cma_id, cma_id->context); @@ -484,12 +583,13 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) isert_conn->state = ISER_CONN_INIT; INIT_LIST_HEAD(&isert_conn->conn_accept_node); init_completion(&isert_conn->conn_login_comp); - init_waitqueue_head(&isert_conn->conn_wait); - init_waitqueue_head(&isert_conn->conn_wait_comp_err); + init_completion(&isert_conn->conn_wait); + init_completion(&isert_conn->conn_wait_comp_err); kref_init(&isert_conn->conn_kref); kref_get(&isert_conn->conn_kref); mutex_init(&isert_conn->conn_mutex); spin_lock_init(&isert_conn->conn_lock); + INIT_LIST_HEAD(&isert_conn->conn_fr_pool); cma_id->context = isert_conn; isert_conn->conn_cm_id = cma_id; @@ -544,33 +644,48 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) } isert_conn->conn_device = device; - isert_conn->conn_pd = device->dev_pd; - isert_conn->conn_mr = device->dev_mr; + isert_conn->conn_pd = ib_alloc_pd(isert_conn->conn_device->ib_device); + if (IS_ERR(isert_conn->conn_pd)) { + ret = PTR_ERR(isert_conn->conn_pd); + pr_err("ib_alloc_pd failed for conn %p: ret=%d\n", + isert_conn, ret); + goto out_pd; + } - if (device->use_frwr) { - ret = isert_conn_create_frwr_pool(isert_conn); - if (ret) { - pr_err("Conn: %p failed to create frwr_pool\n", isert_conn); - goto out_frwr; - } + isert_conn->conn_mr = ib_get_dma_mr(isert_conn->conn_pd, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(isert_conn->conn_mr)) { + ret = PTR_ERR(isert_conn->conn_mr); + pr_err("ib_get_dma_mr failed for conn %p: ret=%d\n", + isert_conn, ret); + goto out_mr; + } + + pi_support = np->tpg_np->tpg->tpg_attrib.t10_pi; + if (pi_support && !device->pi_capable) { + pr_err("Protection information requested but not supported, " + "rejecting connect request\n"); + ret = rdma_reject(cma_id, NULL, 0); + goto out_mr; } - ret = isert_conn_setup_qp(isert_conn, cma_id); + ret = isert_conn_setup_qp(isert_conn, cma_id, pi_support); if (ret) goto out_conn_dev; mutex_lock(&isert_np->np_accept_mutex); - list_add_tail(&isert_np->np_accept_list, &isert_conn->conn_accept_node); + list_add_tail(&isert_conn->conn_accept_node, &isert_np->np_accept_list); mutex_unlock(&isert_np->np_accept_mutex); - pr_debug("isert_connect_request() waking up np_accept_wq: %p\n", np); - wake_up(&isert_np->np_accept_wq); + pr_debug("isert_connect_request() up np_sem np: %p\n", np); + up(&isert_np->np_sem); return 0; out_conn_dev: - if (device->use_frwr) - isert_conn_free_frwr_pool(isert_conn); -out_frwr: + ib_dereg_mr(isert_conn->conn_mr); +out_mr: + ib_dealloc_pd(isert_conn->conn_pd); +out_pd: isert_device_try_release(device); out_rsp_dma_map: ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, @@ -594,8 +709,8 @@ isert_connect_release(struct isert_conn *isert_conn) pr_debug("Entering isert_connect_release(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); - if (device->use_frwr) - isert_conn_free_frwr_pool(isert_conn); + if (device && device->use_fastreg) + isert_conn_free_fastreg_pool(isert_conn); if (isert_conn->conn_qp) { cq_index = ((struct isert_cq_desc *) @@ -609,6 +724,9 @@ isert_connect_release(struct isert_conn *isert_conn) isert_free_rx_descriptors(isert_conn); rdma_destroy_id(isert_conn->conn_cm_id); + ib_dereg_mr(isert_conn->conn_mr); + ib_dealloc_pd(isert_conn->conn_pd); + if (isert_conn->login_buf) { ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); @@ -657,11 +775,11 @@ isert_disconnect_work(struct work_struct *work) pr_debug("isert_disconnect_work(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); mutex_lock(&isert_conn->conn_mutex); - isert_conn->state = ISER_CONN_DOWN; + if (isert_conn->state == ISER_CONN_UP) + isert_conn->state = ISER_CONN_TERMINATING; if (isert_conn->post_recv_buf_count == 0 && atomic_read(&isert_conn->post_send_buf_count) == 0) { - pr_debug("Calling wake_up(&isert_conn->conn_wait);\n"); mutex_unlock(&isert_conn->conn_mutex); goto wake_up; } @@ -670,26 +788,25 @@ isert_disconnect_work(struct work_struct *work) isert_put_conn(isert_conn); return; } - if (!isert_conn->logout_posted) { - pr_debug("Calling rdma_disconnect for !logout_posted from" - " isert_disconnect_work\n"); + + if (isert_conn->disconnect) { + /* Send DREQ/DREP towards our initiator */ rdma_disconnect(isert_conn->conn_cm_id); - mutex_unlock(&isert_conn->conn_mutex); - iscsit_cause_connection_reinstatement(isert_conn->conn, 0); - goto wake_up; } + mutex_unlock(&isert_conn->conn_mutex); wake_up: - wake_up(&isert_conn->conn_wait); + complete(&isert_conn->conn_wait); isert_put_conn(isert_conn); } static void -isert_disconnected_handler(struct rdma_cm_id *cma_id) +isert_disconnected_handler(struct rdma_cm_id *cma_id, bool disconnect) { struct isert_conn *isert_conn = (struct isert_conn *)cma_id->context; + isert_conn->disconnect = disconnect; INIT_WORK(&isert_conn->conn_logout_work, isert_disconnect_work); schedule_work(&isert_conn->conn_logout_work); } @@ -698,29 +815,28 @@ static int isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { int ret = 0; + bool disconnect = false; pr_debug("isert_cma_handler: event %d status %d conn %p id %p\n", event->event, event->status, cma_id->context, cma_id); switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: - pr_debug("RDMA_CM_EVENT_CONNECT_REQUEST: >>>>>>>>>>>>>>>\n"); ret = isert_connect_request(cma_id, event); break; case RDMA_CM_EVENT_ESTABLISHED: - pr_debug("RDMA_CM_EVENT_ESTABLISHED >>>>>>>>>>>>>>\n"); isert_connected_handler(cma_id); break; - case RDMA_CM_EVENT_DISCONNECTED: - pr_debug("RDMA_CM_EVENT_DISCONNECTED: >>>>>>>>>>>>>>\n"); - isert_disconnected_handler(cma_id); - break; - case RDMA_CM_EVENT_DEVICE_REMOVAL: - case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_ADDR_CHANGE: /* FALLTHRU */ + case RDMA_CM_EVENT_DISCONNECTED: /* FALLTHRU */ + case RDMA_CM_EVENT_DEVICE_REMOVAL: /* FALLTHRU */ + disconnect = true; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: /* FALLTHRU */ + isert_disconnected_handler(cma_id, disconnect); break; case RDMA_CM_EVENT_CONNECT_ERROR: default: - pr_err("Unknown RDMA CMA event: %d\n", event->event); + pr_err("Unhandled RDMA CMA event: %d\n", event->event); break; } @@ -843,14 +959,33 @@ isert_init_tx_hdrs(struct isert_conn *isert_conn, } static void -isert_init_send_wr(struct isert_cmd *isert_cmd, struct ib_send_wr *send_wr) +isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct ib_send_wr *send_wr, bool coalesce) { + struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc; + isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND; send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; send_wr->opcode = IB_WR_SEND; - send_wr->send_flags = IB_SEND_SIGNALED; - send_wr->sg_list = &isert_cmd->tx_desc.tx_sg[0]; + send_wr->sg_list = &tx_desc->tx_sg[0]; send_wr->num_sge = isert_cmd->tx_desc.num_sge; + /* + * Coalesce send completion interrupts by only setting IB_SEND_SIGNALED + * bit for every ISERT_COMP_BATCH_COUNT number of ib_post_send() calls. + */ + mutex_lock(&isert_conn->conn_mutex); + if (coalesce && isert_conn->state == ISER_CONN_UP && + ++isert_conn->conn_comp_batch < ISERT_COMP_BATCH_COUNT) { + tx_desc->llnode_active = true; + llist_add(&tx_desc->comp_llnode, &isert_conn->conn_comp_llist); + mutex_unlock(&isert_conn->conn_mutex); + return; + } + isert_conn->conn_comp_batch = 0; + tx_desc->comp_llnode_batch = llist_del_all(&isert_conn->conn_comp_llist); + mutex_unlock(&isert_conn->conn_mutex); + + send_wr->send_flags = IB_SEND_SIGNALED; } static int @@ -918,6 +1053,20 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, } if (!login->login_failed) { if (login->login_complete) { + if (!conn->sess->sess_ops->SessionType && + isert_conn->conn_device->use_fastreg) { + /* Normal Session and fastreg is used */ + u8 pi_support = login->np->tpg_np->tpg->tpg_attrib.t10_pi; + + ret = isert_conn_create_fastreg_pool(isert_conn, + pi_support); + if (ret) { + pr_err("Conn: %p failed to create" + " fastreg pool\n", isert_conn); + return ret; + } + } + ret = isert_alloc_rx_descriptors(isert_conn); if (ret) return ret; @@ -992,13 +1141,13 @@ isert_rx_login_req(struct iser_rx_desc *rx_desc, int rx_buflen, } static struct iscsi_cmd -*isert_allocate_cmd(struct iscsi_conn *conn, gfp_t gfp) +*isert_allocate_cmd(struct iscsi_conn *conn) { struct isert_conn *isert_conn = (struct isert_conn *)conn->context; struct isert_cmd *isert_cmd; struct iscsi_cmd *cmd; - cmd = iscsit_allocate_cmd(conn, gfp); + cmd = iscsit_allocate_cmd(conn, TASK_INTERRUPTIBLE); if (!cmd) { pr_err("Unable to allocate iscsi_cmd + isert_cmd\n"); return NULL; @@ -1062,6 +1211,8 @@ sequence_cmd: if (!rc && dump_payload == false && unsol_data) iscsit_set_unsoliticed_dataout(cmd); + else if (dump_payload && imm_data) + target_put_sess_cmd(conn->sess->se_sess, &cmd->se_cmd); return 0; } @@ -1187,7 +1338,7 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, switch (opcode) { case ISCSI_OP_SCSI_CMD: - cmd = isert_allocate_cmd(conn, GFP_KERNEL); + cmd = isert_allocate_cmd(conn); if (!cmd) break; @@ -1201,7 +1352,7 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, rx_desc, (unsigned char *)hdr); break; case ISCSI_OP_NOOP_OUT: - cmd = isert_allocate_cmd(conn, GFP_KERNEL); + cmd = isert_allocate_cmd(conn); if (!cmd) break; @@ -1214,7 +1365,7 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, (unsigned char *)hdr); break; case ISCSI_OP_SCSI_TMFUNC: - cmd = isert_allocate_cmd(conn, GFP_KERNEL); + cmd = isert_allocate_cmd(conn); if (!cmd) break; @@ -1222,7 +1373,7 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, (unsigned char *)hdr); break; case ISCSI_OP_LOGOUT: - cmd = isert_allocate_cmd(conn, GFP_KERNEL); + cmd = isert_allocate_cmd(conn); if (!cmd) break; @@ -1233,7 +1384,7 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, HZ); break; case ISCSI_OP_TEXT: - cmd = isert_allocate_cmd(conn, GFP_KERNEL); + cmd = isert_allocate_cmd(conn); if (!cmd) break; @@ -1343,19 +1494,60 @@ isert_rx_completion(struct iser_rx_desc *desc, struct isert_conn *isert_conn, } } +static int +isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct scatterlist *sg, u32 nents, u32 length, u32 offset, + enum iser_ib_op_code op, struct isert_data_buf *data) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + + data->dma_dir = op == ISER_IB_RDMA_WRITE ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + + data->len = length - offset; + data->offset = offset; + data->sg_off = data->offset / PAGE_SIZE; + + data->sg = &sg[data->sg_off]; + data->nents = min_t(unsigned int, nents - data->sg_off, + ISCSI_ISER_SG_TABLESIZE); + data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE * + PAGE_SIZE); + + data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents, + data->dma_dir); + if (unlikely(!data->dma_nents)) { + pr_err("Cmd: unable to dma map SGs %p\n", sg); + return -EINVAL; + } + + pr_debug("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n", + isert_cmd, data->dma_nents, data->sg, data->nents, data->len); + + return 0; +} + +static void +isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + + ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir); + memset(data, 0, sizeof(*data)); +} + + + static void isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) { struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; pr_debug("isert_unmap_cmd: %p\n", isert_cmd); - if (wr->sge) { + + if (wr->data.sg) { pr_debug("isert_unmap_cmd: %p unmap_sg op\n", isert_cmd); - ib_dma_unmap_sg(ib_dev, wr->sge, wr->num_sge, - (wr->iser_ib_op == ISER_IB_RDMA_WRITE) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - wr->sge = NULL; + isert_unmap_data_buf(isert_conn, &wr->data); } if (wr->send_wr) { @@ -1372,29 +1564,29 @@ isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) } static void -isert_unreg_rdma_frwr(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) +isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) { struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; LIST_HEAD(unmap_list); - pr_debug("unreg_frwr_cmd: %p\n", isert_cmd); + pr_debug("unreg_fastreg_cmd: %p\n", isert_cmd); if (wr->fr_desc) { - pr_debug("unreg_frwr_cmd: %p free fr_desc %p\n", + pr_debug("unreg_fastreg_cmd: %p free fr_desc %p\n", isert_cmd, wr->fr_desc); + if (wr->fr_desc->ind & ISERT_PROTECTED) { + isert_unmap_data_buf(isert_conn, &wr->prot); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } spin_lock_bh(&isert_conn->conn_lock); - list_add_tail(&wr->fr_desc->list, &isert_conn->conn_frwr_pool); + list_add_tail(&wr->fr_desc->list, &isert_conn->conn_fr_pool); spin_unlock_bh(&isert_conn->conn_lock); wr->fr_desc = NULL; } - if (wr->sge) { - pr_debug("unreg_frwr_cmd: %p unmap_sg op\n", isert_cmd); - ib_dma_unmap_sg(ib_dev, wr->sge, wr->num_sge, - (wr->iser_ib_op == ISER_IB_RDMA_WRITE) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - wr->sge = NULL; + if (wr->data.sg) { + pr_debug("unreg_fastreg_cmd: %p unmap_sg op\n", isert_cmd); + isert_unmap_data_buf(isert_conn, &wr->data); } wr->ib_sge = NULL; @@ -1402,7 +1594,7 @@ isert_unreg_rdma_frwr(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn } static void -isert_put_cmd(struct isert_cmd *isert_cmd) +isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) { struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; struct isert_conn *isert_conn = isert_cmd->conn; @@ -1415,11 +1607,24 @@ isert_put_cmd(struct isert_cmd *isert_cmd) case ISCSI_OP_SCSI_CMD: spin_lock_bh(&conn->cmd_lock); if (!list_empty(&cmd->i_conn_node)) - list_del(&cmd->i_conn_node); + list_del_init(&cmd->i_conn_node); spin_unlock_bh(&conn->cmd_lock); - if (cmd->data_direction == DMA_TO_DEVICE) + if (cmd->data_direction == DMA_TO_DEVICE) { iscsit_stop_dataout_timer(cmd); + /* + * Check for special case during comp_err where + * WRITE_PENDING has been handed off from core, + * but requires an extra target_put_sess_cmd() + * before transport_generic_free_cmd() below. + */ + if (comp_err && + cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) { + struct se_cmd *se_cmd = &cmd->se_cmd; + + target_put_sess_cmd(se_cmd->se_sess, se_cmd); + } + } device->unreg_rdma_mem(isert_cmd, isert_conn); transport_generic_free_cmd(&cmd->se_cmd, 0); @@ -1427,7 +1632,7 @@ isert_put_cmd(struct isert_cmd *isert_cmd) case ISCSI_OP_SCSI_TMFUNC: spin_lock_bh(&conn->cmd_lock); if (!list_empty(&cmd->i_conn_node)) - list_del(&cmd->i_conn_node); + list_del_init(&cmd->i_conn_node); spin_unlock_bh(&conn->cmd_lock); transport_generic_free_cmd(&cmd->se_cmd, 0); @@ -1437,7 +1642,7 @@ isert_put_cmd(struct isert_cmd *isert_cmd) case ISCSI_OP_TEXT: spin_lock_bh(&conn->cmd_lock); if (!list_empty(&cmd->i_conn_node)) - list_del(&cmd->i_conn_node); + list_del_init(&cmd->i_conn_node); spin_unlock_bh(&conn->cmd_lock); /* @@ -1474,7 +1679,7 @@ isert_unmap_tx_desc(struct iser_tx_desc *tx_desc, struct ib_device *ib_dev) static void isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd, - struct ib_device *ib_dev) + struct ib_device *ib_dev, bool comp_err) { if (isert_cmd->pdu_buf_dma != 0) { pr_debug("Calling ib_dma_unmap_single for isert_cmd->pdu_buf_dma\n"); @@ -1484,7 +1689,77 @@ isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd, } isert_unmap_tx_desc(tx_desc, ib_dev); - isert_put_cmd(isert_cmd); + isert_put_cmd(isert_cmd, comp_err); +} + +static int +isert_check_pi_status(struct se_cmd *se_cmd, struct ib_mr *sig_mr) +{ + struct ib_mr_status mr_status; + int ret; + + ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + goto fail_mr_status; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + u64 sec_offset_err; + u32 block_size = se_cmd->se_dev->dev_attrib.block_size + 8; + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + break; + case IB_SIG_BAD_REFTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + break; + case IB_SIG_BAD_APPTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + break; + } + sec_offset_err = mr_status.sig_err.sig_err_offset; + do_div(sec_offset_err, block_size); + se_cmd->bad_sector = sec_offset_err + se_cmd->t_task_lba; + + pr_err("isert: PI error found type %d at sector 0x%llx " + "expected 0x%x vs actual 0x%x\n", + mr_status.sig_err.err_type, + (unsigned long long)se_cmd->bad_sector, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + ret = 1; + } + +fail_mr_status: + return ret; +} + +static void +isert_completion_rdma_write(struct iser_tx_desc *tx_desc, + struct isert_cmd *isert_cmd) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct isert_device *device = isert_conn->conn_device; + int ret = 0; + + if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) { + ret = isert_check_pi_status(se_cmd, + wr->fr_desc->pi_ctx->sig_mr); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } + + device->unreg_rdma_mem(isert_cmd, isert_conn); + wr->send_wr_num = 0; + if (ret) + transport_send_check_condition_and_sense(se_cmd, + se_cmd->pi_err, 0); + else + isert_put_response(isert_conn->conn, cmd); } static void @@ -1496,10 +1771,18 @@ isert_completion_rdma_read(struct iser_tx_desc *tx_desc, struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_conn *isert_conn = isert_cmd->conn; struct isert_device *device = isert_conn->conn_device; + int ret = 0; + + if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) { + ret = isert_check_pi_status(se_cmd, + wr->fr_desc->pi_ctx->sig_mr); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } iscsit_stop_dataout_timer(cmd); device->unreg_rdma_mem(isert_cmd, isert_conn); - cmd->write_data_done = wr->cur_rdma_length; + cmd->write_data_done = wr->data.len; + wr->send_wr_num = 0; pr_debug("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd); spin_lock_bh(&cmd->istate_lock); @@ -1507,7 +1790,11 @@ isert_completion_rdma_read(struct iser_tx_desc *tx_desc, cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; spin_unlock_bh(&cmd->istate_lock); - target_execute_cmd(se_cmd); + if (ret) + transport_send_check_condition_and_sense(se_cmd, + se_cmd->pi_err, 0); + else + target_execute_cmd(se_cmd); } static void @@ -1527,28 +1814,25 @@ isert_do_control_comp(struct work_struct *work) iscsit_tmr_post_handler(cmd, cmd->conn); cmd->i_state = ISTATE_SENT_STATUS; - isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev); + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); break; case ISTATE_SEND_REJECT: pr_debug("Got isert_do_control_comp ISTATE_SEND_REJECT: >>>\n"); atomic_dec(&isert_conn->post_send_buf_count); cmd->i_state = ISTATE_SENT_STATUS; - isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev); + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); break; case ISTATE_SEND_LOGOUTRSP: pr_debug("Calling iscsit_logout_post_handler >>>>>>>>>>>>>>\n"); - /* - * Call atomic_dec(&isert_conn->post_send_buf_count) - * from isert_free_conn() - */ - isert_conn->logout_posted = true; + + atomic_dec(&isert_conn->post_send_buf_count); iscsit_logout_post_handler(cmd, cmd->conn); break; case ISTATE_SEND_TEXTRSP: atomic_dec(&isert_conn->post_send_buf_count); cmd->i_state = ISTATE_SENT_STATUS; - isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev); + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); break; default: pr_err("Unknown do_control_comp i_state %d\n", cmd->i_state); @@ -1564,6 +1848,7 @@ isert_response_completion(struct iser_tx_desc *tx_desc, struct ib_device *ib_dev) { struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; if (cmd->i_state == ISTATE_SEND_TASKMGTRSP || cmd->i_state == ISTATE_SEND_LOGOUTRSP || @@ -1575,15 +1860,26 @@ isert_response_completion(struct iser_tx_desc *tx_desc, queue_work(isert_comp_wq, &isert_cmd->comp_work); return; } - atomic_dec(&isert_conn->post_send_buf_count); + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + else + atomic_dec(&isert_conn->post_send_buf_count); cmd->i_state = ISTATE_SENT_STATUS; - isert_completion_put(tx_desc, isert_cmd, ib_dev); + isert_completion_put(tx_desc, isert_cmd, ib_dev, false); } static void -isert_send_completion(struct iser_tx_desc *tx_desc, - struct isert_conn *isert_conn) +__isert_send_completion(struct iser_tx_desc *tx_desc, + struct isert_conn *isert_conn) { struct ib_device *ib_dev = isert_conn->conn_cm_id->device; struct isert_cmd *isert_cmd = tx_desc->isert_cmd; @@ -1607,13 +1903,14 @@ isert_send_completion(struct iser_tx_desc *tx_desc, isert_conn, ib_dev); break; case ISER_IB_RDMA_WRITE: - pr_err("isert_send_completion: Got ISER_IB_RDMA_WRITE\n"); - dump_stack(); + pr_debug("isert_send_completion: Got ISER_IB_RDMA_WRITE\n"); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + isert_completion_rdma_write(tx_desc, isert_cmd); break; case ISER_IB_RDMA_READ: pr_debug("isert_send_completion: Got ISER_IB_RDMA_READ:\n"); - atomic_dec(&isert_conn->post_send_buf_count); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); isert_completion_rdma_read(tx_desc, isert_cmd); break; default: @@ -1624,31 +1921,120 @@ isert_send_completion(struct iser_tx_desc *tx_desc, } static void -isert_cq_comp_err(struct iser_tx_desc *tx_desc, struct isert_conn *isert_conn) +isert_send_completion(struct iser_tx_desc *tx_desc, + struct isert_conn *isert_conn) +{ + struct llist_node *llnode = tx_desc->comp_llnode_batch; + struct iser_tx_desc *t; + /* + * Drain coalesced completion llist starting from comp_llnode_batch + * setup in isert_init_send_wr(), and then complete trailing tx_desc. + */ + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + __isert_send_completion(t, isert_conn); + } + __isert_send_completion(tx_desc, isert_conn); +} + +static void +isert_cq_drain_comp_llist(struct isert_conn *isert_conn, struct ib_device *ib_dev) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct llist_node *llnode; + struct isert_rdma_wr *wr; + struct iser_tx_desc *t; + + mutex_lock(&isert_conn->conn_mutex); + llnode = llist_del_all(&isert_conn->conn_comp_llist); + isert_conn->conn_comp_batch = 0; + mutex_unlock(&isert_conn->conn_mutex); + + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + wr = &t->isert_cmd->rdma_wr; + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, + &isert_conn->post_send_buf_count); + else + atomic_dec(&isert_conn->post_send_buf_count); - if (tx_desc) { - struct isert_cmd *isert_cmd = tx_desc->isert_cmd; + isert_completion_put(t, t->isert_cmd, ib_dev, true); + } +} - if (!isert_cmd) - isert_unmap_tx_desc(tx_desc, ib_dev); +static void +isert_cq_tx_comp_err(struct iser_tx_desc *tx_desc, struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_cmd *isert_cmd = tx_desc->isert_cmd; + struct llist_node *llnode = tx_desc->comp_llnode_batch; + struct isert_rdma_wr *wr; + struct iser_tx_desc *t; + + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + wr = &t->isert_cmd->rdma_wr; + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, + &isert_conn->post_send_buf_count); else - isert_completion_put(tx_desc, isert_cmd, ib_dev); + atomic_dec(&isert_conn->post_send_buf_count); + + isert_completion_put(t, t->isert_cmd, ib_dev, true); } + tx_desc->comp_llnode_batch = NULL; - if (isert_conn->post_recv_buf_count == 0 && - atomic_read(&isert_conn->post_send_buf_count) == 0) { - pr_debug("isert_cq_comp_err >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); - pr_debug("Calling wake_up from isert_cq_comp_err\n"); + if (!isert_cmd) + isert_unmap_tx_desc(tx_desc, ib_dev); + else + isert_completion_put(tx_desc, isert_cmd, ib_dev, true); +} - mutex_lock(&isert_conn->conn_mutex); - if (isert_conn->state != ISER_CONN_DOWN) - isert_conn->state = ISER_CONN_TERMINATING; - mutex_unlock(&isert_conn->conn_mutex); +static void +isert_cq_rx_comp_err(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iscsi_conn *conn = isert_conn->conn; + + if (isert_conn->post_recv_buf_count) + return; + + isert_cq_drain_comp_llist(isert_conn, ib_dev); - wake_up(&isert_conn->conn_wait_comp_err); + if (conn->sess) { + target_sess_cmd_list_set_waiting(conn->sess->se_sess); + target_wait_for_sess_cmds(conn->sess->se_sess); } + + while (atomic_read(&isert_conn->post_send_buf_count)) + msleep(3000); + + mutex_lock(&isert_conn->conn_mutex); + isert_conn->state = ISER_CONN_DOWN; + mutex_unlock(&isert_conn->conn_mutex); + + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + + complete(&isert_conn->conn_wait_comp_err); } static void @@ -1673,8 +2059,14 @@ isert_cq_tx_work(struct work_struct *work) pr_debug("TX wc.status != IB_WC_SUCCESS >>>>>>>>>>>>>>\n"); pr_debug("TX wc.status: 0x%08x\n", wc.status); pr_debug("TX wc.vendor_err: 0x%08x\n", wc.vendor_err); - atomic_dec(&isert_conn->post_send_buf_count); - isert_cq_comp_err(tx_desc, isert_conn); + + if (wc.wr_id != ISER_FASTREG_LI_WRID) { + if (tx_desc->llnode_active) + continue; + + atomic_dec(&isert_conn->post_send_buf_count); + isert_cq_tx_comp_err(tx_desc, isert_conn); + } } } @@ -1686,7 +2078,6 @@ isert_cq_tx_callback(struct ib_cq *cq, void *context) { struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; - INIT_WORK(&cq_desc->cq_tx_work, isert_cq_tx_work); queue_work(isert_comp_wq, &cq_desc->cq_tx_work); } @@ -1718,7 +2109,7 @@ isert_cq_rx_work(struct work_struct *work) wc.vendor_err); } isert_conn->post_recv_buf_count--; - isert_cq_comp_err(NULL, isert_conn); + isert_cq_rx_comp_err(isert_conn); } } @@ -1730,7 +2121,6 @@ isert_cq_rx_callback(struct ib_cq *cq, void *context) { struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; - INIT_WORK(&cq_desc->cq_rx_work, isert_cq_rx_work); queue_work(isert_rx_wq, &cq_desc->cq_rx_work); } @@ -1793,13 +2183,43 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) isert_cmd->tx_desc.num_sge = 2; } - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, true); pr_debug("Posting SCSI Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); return isert_post_response(isert_conn, isert_cmd); } +static void +isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + if (cmd->data_direction == DMA_TO_DEVICE) + iscsit_stop_dataout_timer(cmd); + + device->unreg_rdma_mem(isert_cmd, isert_conn); +} + +static enum target_prot_op +isert_get_sup_prot_ops(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + + if (device->pi_capable) + return TARGET_PROT_ALL; + + return TARGET_PROT_NORMAL; +} + static int isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn, bool nopout_response) @@ -1813,7 +2233,7 @@ isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn, &isert_cmd->tx_desc.iscsi_header, nopout_response); isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting NOPIN Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -1831,7 +2251,7 @@ isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) iscsit_build_logout_rsp(cmd, conn, (struct iscsi_logout_rsp *) &isert_cmd->tx_desc.iscsi_header); isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting Logout Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -1849,7 +2269,7 @@ isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) iscsit_build_task_mgt_rsp(cmd, conn, (struct iscsi_tm_rsp *) &isert_cmd->tx_desc.iscsi_header); isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting Task Management Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -1881,7 +2301,7 @@ isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn) tx_dsg->lkey = isert_conn->conn_mr->lkey; isert_cmd->tx_desc.num_sge = 2; - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting Reject IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -1900,7 +2320,7 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) int rc; isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); - rc = iscsit_build_text_rsp(cmd, conn, hdr); + rc = iscsit_build_text_rsp(cmd, conn, hdr, ISCSI_INFINIBAND); if (rc < 0) return rc; @@ -1921,7 +2341,7 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) tx_dsg->lkey = isert_conn->conn_mr->lkey; isert_cmd->tx_desc.num_sge = 2; } - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting Text Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -1981,56 +2401,39 @@ isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); struct isert_conn *isert_conn = (struct isert_conn *)conn->context; - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_data_buf *data = &wr->data; struct ib_send_wr *send_wr; struct ib_sge *ib_sge; - struct scatterlist *sg_start; - u32 sg_off = 0, sg_nents; - u32 offset = 0, data_len, data_left, rdma_write_max, va_offset = 0; - int ret = 0, count, i, ib_sge_cnt; + u32 offset, data_len, data_left, rdma_write_max, va_offset = 0; + int ret = 0, i, ib_sge_cnt; - if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { - data_left = se_cmd->data_length; - iscsit_increment_maxcmdsn(cmd, conn->sess); - cmd->stat_sn = conn->stat_sn++; - } else { - sg_off = cmd->write_data_done / PAGE_SIZE; - data_left = se_cmd->data_length - cmd->write_data_done; - offset = cmd->write_data_done; - isert_cmd->tx_desc.isert_cmd = isert_cmd; - } + isert_cmd->tx_desc.isert_cmd = isert_cmd; - sg_start = &cmd->se_cmd.t_data_sg[sg_off]; - sg_nents = se_cmd->t_data_nents - sg_off; + offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0; + ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->data_length, + offset, wr->iser_ib_op, &wr->data); + if (ret) + return ret; - count = ib_dma_map_sg(ib_dev, sg_start, sg_nents, - (wr->iser_ib_op == ISER_IB_RDMA_WRITE) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (unlikely(!count)) { - pr_err("Cmd: %p unrable to map SGs\n", isert_cmd); - return -EINVAL; - } - wr->sge = sg_start; - wr->num_sge = sg_nents; - wr->cur_rdma_length = data_left; - pr_debug("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n", - isert_cmd, count, sg_start, sg_nents, data_left); + data_left = data->len; + offset = data->offset; - ib_sge = kzalloc(sizeof(struct ib_sge) * sg_nents, GFP_KERNEL); + ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL); if (!ib_sge) { pr_warn("Unable to allocate ib_sge\n"); ret = -ENOMEM; - goto unmap_sg; + goto unmap_cmd; } wr->ib_sge = ib_sge; - wr->send_wr_num = DIV_ROUND_UP(sg_nents, isert_conn->max_sge); + wr->send_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge); wr->send_wr = kzalloc(sizeof(struct ib_send_wr) * wr->send_wr_num, GFP_KERNEL); if (!wr->send_wr) { pr_debug("Unable to allocate wr->send_wr\n"); ret = -ENOMEM; - goto unmap_sg; + goto unmap_cmd; } wr->isert_cmd = isert_cmd; @@ -2069,10 +2472,9 @@ isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, } return 0; -unmap_sg: - ib_dma_unmap_sg(ib_dev, sg_start, sg_nents, - (wr->iser_ib_op == ISER_IB_RDMA_WRITE) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); +unmap_cmd: + isert_unmap_data_buf(isert_conn, data); + return ret; } @@ -2116,51 +2518,70 @@ isert_map_fr_pagelist(struct ib_device *ib_dev, } static int -isert_fast_reg_mr(struct fast_reg_descriptor *fr_desc, - struct isert_cmd *isert_cmd, struct isert_conn *isert_conn, - struct ib_sge *ib_sge, u32 offset, unsigned int data_len) +isert_fast_reg_mr(struct isert_conn *isert_conn, + struct fast_reg_descriptor *fr_desc, + struct isert_data_buf *mem, + enum isert_indicator ind, + struct ib_sge *sge) { - struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; struct ib_device *ib_dev = isert_conn->conn_cm_id->device; - struct scatterlist *sg_start; - u32 sg_off, page_off; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; struct ib_send_wr fr_wr, inv_wr; struct ib_send_wr *bad_wr, *wr = NULL; + int ret, pagelist_len; + u32 page_off; u8 key; - int ret, sg_nents, pagelist_len; - sg_off = offset / PAGE_SIZE; - sg_start = &cmd->se_cmd.t_data_sg[sg_off]; - sg_nents = min_t(unsigned int, cmd->se_cmd.t_data_nents - sg_off, - ISCSI_ISER_SG_TABLESIZE); - page_off = offset % PAGE_SIZE; + if (mem->dma_nents == 1) { + sge->lkey = isert_conn->conn_mr->lkey; + sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]); + sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]); + pr_debug("%s:%d sge: addr: 0x%llx length: %u lkey: %x\n", + __func__, __LINE__, sge->addr, sge->length, + sge->lkey); + return 0; + } + + if (ind == ISERT_DATA_KEY_VALID) { + /* Registering data buffer */ + mr = fr_desc->data_mr; + frpl = fr_desc->data_frpl; + } else { + /* Registering protection buffer */ + mr = fr_desc->pi_ctx->prot_mr; + frpl = fr_desc->pi_ctx->prot_frpl; + } - pr_debug("Cmd: %p use fr_desc %p sg_nents %d sg_off %d offset %u\n", - isert_cmd, fr_desc, sg_nents, sg_off, offset); + page_off = mem->offset % PAGE_SIZE; - pagelist_len = isert_map_fr_pagelist(ib_dev, sg_start, sg_nents, - &fr_desc->data_frpl->page_list[0]); + pr_debug("Use fr_desc %p sg_nents %d offset %u\n", + fr_desc, mem->nents, mem->offset); - if (!fr_desc->valid) { + pagelist_len = isert_map_fr_pagelist(ib_dev, mem->sg, mem->nents, + &frpl->page_list[0]); + + if (!(fr_desc->ind & ISERT_DATA_KEY_VALID)) { memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = ISER_FASTREG_LI_WRID; inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.ex.invalidate_rkey = fr_desc->data_mr->rkey; + inv_wr.ex.invalidate_rkey = mr->rkey; wr = &inv_wr; /* Bump the key */ - key = (u8)(fr_desc->data_mr->rkey & 0x000000FF); - ib_update_fast_reg_key(fr_desc->data_mr, ++key); + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); } /* Prepare FASTREG WR */ memset(&fr_wr, 0, sizeof(fr_wr)); + fr_wr.wr_id = ISER_FASTREG_LI_WRID; fr_wr.opcode = IB_WR_FAST_REG_MR; - fr_wr.wr.fast_reg.iova_start = - fr_desc->data_frpl->page_list[0] + page_off; - fr_wr.wr.fast_reg.page_list = fr_desc->data_frpl; + fr_wr.wr.fast_reg.iova_start = frpl->page_list[0] + page_off; + fr_wr.wr.fast_reg.page_list = frpl; fr_wr.wr.fast_reg.page_list_len = pagelist_len; fr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fr_wr.wr.fast_reg.length = data_len; - fr_wr.wr.fast_reg.rkey = fr_desc->data_mr->rkey; + fr_wr.wr.fast_reg.length = mem->len; + fr_wr.wr.fast_reg.rkey = mr->rkey; fr_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE; if (!wr) @@ -2173,82 +2594,242 @@ isert_fast_reg_mr(struct fast_reg_descriptor *fr_desc, pr_err("fast registration failed, ret:%d\n", ret); return ret; } - fr_desc->valid = false; + fr_desc->ind &= ~ind; + + sge->lkey = mr->lkey; + sge->addr = frpl->page_list[0] + page_off; + sge->length = mem->len; + + pr_debug("%s:%d sge: addr: 0x%llx length: %u lkey: %x\n", + __func__, __LINE__, sge->addr, sge->length, + sge->lkey); + + return ret; +} + +static inline enum ib_t10_dif_type +se2ib_prot_type(enum target_prot_type prot_type) +{ + switch (prot_type) { + case TARGET_DIF_TYPE0_PROT: + return IB_T10DIF_NONE; + case TARGET_DIF_TYPE1_PROT: + return IB_T10DIF_TYPE1; + case TARGET_DIF_TYPE2_PROT: + return IB_T10DIF_TYPE2; + case TARGET_DIF_TYPE3_PROT: + return IB_T10DIF_TYPE3; + default: + return IB_T10DIF_NONE; + } +} + +static int +isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) +{ + enum ib_t10_dif_type ib_prot_type = se2ib_prot_type(se_cmd->prot_type); + + sig_attrs->mem.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->mem.sig.dif.pi_interval = + se_cmd->se_dev->dev_attrib.block_size; + sig_attrs->wire.sig.dif.pi_interval = + se_cmd->se_dev->dev_attrib.block_size; + + switch (se_cmd->prot_op) { + case TARGET_PROT_DIN_INSERT: + case TARGET_PROT_DOUT_STRIP: + sig_attrs->mem.sig.dif.type = IB_T10DIF_NONE; + sig_attrs->wire.sig.dif.type = ib_prot_type; + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = se_cmd->reftag_seed; + break; + case TARGET_PROT_DOUT_INSERT: + case TARGET_PROT_DIN_STRIP: + sig_attrs->mem.sig.dif.type = ib_prot_type; + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = se_cmd->reftag_seed; + sig_attrs->wire.sig.dif.type = IB_T10DIF_NONE; + break; + case TARGET_PROT_DIN_PASS: + case TARGET_PROT_DOUT_PASS: + sig_attrs->mem.sig.dif.type = ib_prot_type; + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = se_cmd->reftag_seed; + sig_attrs->wire.sig.dif.type = ib_prot_type; + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = se_cmd->reftag_seed; + break; + default: + pr_err("Unsupported PI operation %d\n", se_cmd->prot_op); + return -EINVAL; + } + + return 0; +} + +static inline u8 +isert_set_prot_checks(u8 prot_checks) +{ + return (prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | + (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | + (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0); +} + +static int +isert_reg_sig_mr(struct isert_conn *isert_conn, struct se_cmd *se_cmd, + struct fast_reg_descriptor *fr_desc, + struct ib_sge *data_sge, struct ib_sge *prot_sge, + struct ib_sge *sig_sge) +{ + struct ib_send_wr sig_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + struct pi_context *pi_ctx = fr_desc->pi_ctx; + struct ib_sig_attrs sig_attrs; + int ret; + u32 key; + + memset(&sig_attrs, 0, sizeof(sig_attrs)); + ret = isert_set_sig_attrs(se_cmd, &sig_attrs); + if (ret) + goto err; + + sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks); + + if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); + } + + memset(&sig_wr, 0, sizeof(sig_wr)); + sig_wr.opcode = IB_WR_REG_SIG_MR; + sig_wr.wr_id = ISER_FASTREG_LI_WRID; + sig_wr.sg_list = data_sge; + sig_wr.num_sge = 1; + sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE; + sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; + sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; + if (se_cmd->t_prot_sg) + sig_wr.wr.sig_handover.prot = prot_sge; + + if (!wr) + wr = &sig_wr; + else + wr->next = &sig_wr; - ib_sge->lkey = fr_desc->data_mr->lkey; - ib_sge->addr = fr_desc->data_frpl->page_list[0] + page_off; - ib_sge->length = data_len; + ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr); + if (ret) { + pr_err("fast registration failed, ret:%d\n", ret); + goto err; + } + fr_desc->ind &= ~ISERT_SIG_KEY_VALID; - pr_debug("RDMA ib_sge: addr: 0x%16llx length: %u lkey: %08x\n", - ib_sge->addr, ib_sge->length, ib_sge->lkey); + sig_sge->lkey = pi_ctx->sig_mr->lkey; + sig_sge->addr = 0; + sig_sge->length = se_cmd->data_length; + if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP && + se_cmd->prot_op != TARGET_PROT_DOUT_INSERT) + /* + * We have protection guards on the wire + * so we need to set a larget transfer + */ + sig_sge->length += se_cmd->prot_length; + pr_debug("sig_sge: addr: 0x%llx length: %u lkey: %x\n", + sig_sge->addr, sig_sge->length, + sig_sge->lkey); +err: return ret; } static int -isert_reg_rdma_frwr(struct iscsi_conn *conn, struct iscsi_cmd *cmd, - struct isert_rdma_wr *wr) +isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr) { struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_conn *isert_conn = conn->context; + struct ib_sge data_sge; struct ib_send_wr *send_wr; - struct ib_sge *ib_sge; - struct scatterlist *sg_start; - struct fast_reg_descriptor *fr_desc; - u32 sg_off = 0, sg_nents; - u32 offset = 0, data_len, data_left, rdma_write_max; - int ret = 0, count; + struct fast_reg_descriptor *fr_desc = NULL; + u32 offset; + int ret = 0; unsigned long flags; - if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { - data_left = se_cmd->data_length; - iscsit_increment_maxcmdsn(cmd, conn->sess); - cmd->stat_sn = conn->stat_sn++; - } else { - sg_off = cmd->write_data_done / PAGE_SIZE; - data_left = se_cmd->data_length - cmd->write_data_done; - offset = cmd->write_data_done; - isert_cmd->tx_desc.isert_cmd = isert_cmd; - } + isert_cmd->tx_desc.isert_cmd = isert_cmd; - sg_start = &cmd->se_cmd.t_data_sg[sg_off]; - sg_nents = se_cmd->t_data_nents - sg_off; + offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0; + ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->data_length, + offset, wr->iser_ib_op, &wr->data); + if (ret) + return ret; - count = ib_dma_map_sg(ib_dev, sg_start, sg_nents, - (wr->iser_ib_op == ISER_IB_RDMA_WRITE) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (unlikely(!count)) { - pr_err("Cmd: %p unrable to map SGs\n", isert_cmd); - return -EINVAL; + if (wr->data.dma_nents != 1 || + se_cmd->prot_op != TARGET_PROT_NORMAL) { + spin_lock_irqsave(&isert_conn->conn_lock, flags); + fr_desc = list_first_entry(&isert_conn->conn_fr_pool, + struct fast_reg_descriptor, list); + list_del(&fr_desc->list); + spin_unlock_irqrestore(&isert_conn->conn_lock, flags); + wr->fr_desc = fr_desc; } - wr->sge = sg_start; - wr->num_sge = sg_nents; - pr_debug("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n", - isert_cmd, count, sg_start, sg_nents, data_left); - memset(&wr->s_ib_sge, 0, sizeof(*ib_sge)); - ib_sge = &wr->s_ib_sge; - wr->ib_sge = ib_sge; + ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->data, + ISERT_DATA_KEY_VALID, &data_sge); + if (ret) + goto unmap_cmd; + + if (se_cmd->prot_op != TARGET_PROT_NORMAL) { + struct ib_sge prot_sge, sig_sge; + + if (se_cmd->t_prot_sg) { + ret = isert_map_data_buf(isert_conn, isert_cmd, + se_cmd->t_prot_sg, + se_cmd->t_prot_nents, + se_cmd->prot_length, + 0, wr->iser_ib_op, &wr->prot); + if (ret) + goto unmap_cmd; + + ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->prot, + ISERT_PROT_KEY_VALID, &prot_sge); + if (ret) + goto unmap_prot_cmd; + } + + ret = isert_reg_sig_mr(isert_conn, se_cmd, fr_desc, + &data_sge, &prot_sge, &sig_sge); + if (ret) + goto unmap_prot_cmd; + + fr_desc->ind |= ISERT_PROTECTED; + memcpy(&wr->s_ib_sge, &sig_sge, sizeof(sig_sge)); + } else + memcpy(&wr->s_ib_sge, &data_sge, sizeof(data_sge)); + wr->ib_sge = &wr->s_ib_sge; wr->send_wr_num = 1; memset(&wr->s_send_wr, 0, sizeof(*send_wr)); wr->send_wr = &wr->s_send_wr; - wr->isert_cmd = isert_cmd; - rdma_write_max = ISCSI_ISER_SG_TABLESIZE * PAGE_SIZE; send_wr = &isert_cmd->rdma_wr.s_send_wr; - send_wr->sg_list = ib_sge; + send_wr->sg_list = &wr->s_ib_sge; send_wr->num_sge = 1; send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { send_wr->opcode = IB_WR_RDMA_WRITE; send_wr->wr.rdma.remote_addr = isert_cmd->read_va; send_wr->wr.rdma.rkey = isert_cmd->read_stag; - send_wr->send_flags = 0; - send_wr->next = &isert_cmd->tx_desc.send_wr; + send_wr->send_flags = se_cmd->prot_op == TARGET_PROT_NORMAL ? + 0 : IB_SEND_SIGNALED; } else { send_wr->opcode = IB_WR_RDMA_READ; send_wr->wr.rdma.remote_addr = isert_cmd->write_va; @@ -2256,29 +2837,18 @@ isert_reg_rdma_frwr(struct iscsi_conn *conn, struct iscsi_cmd *cmd, send_wr->send_flags = IB_SEND_SIGNALED; } - data_len = min(data_left, rdma_write_max); - wr->cur_rdma_length = data_len; - - spin_lock_irqsave(&isert_conn->conn_lock, flags); - fr_desc = list_first_entry(&isert_conn->conn_frwr_pool, - struct fast_reg_descriptor, list); - list_del(&fr_desc->list); - spin_unlock_irqrestore(&isert_conn->conn_lock, flags); - wr->fr_desc = fr_desc; - - ret = isert_fast_reg_mr(fr_desc, isert_cmd, isert_conn, - ib_sge, offset, data_len); - if (ret) { - list_add_tail(&fr_desc->list, &isert_conn->conn_frwr_pool); - goto unmap_sg; - } - return 0; +unmap_prot_cmd: + if (se_cmd->t_prot_sg) + isert_unmap_data_buf(isert_conn, &wr->prot); +unmap_cmd: + if (fr_desc) { + spin_lock_irqsave(&isert_conn->conn_lock, flags); + list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); + spin_unlock_irqrestore(&isert_conn->conn_lock, flags); + } + isert_unmap_data_buf(isert_conn, &wr->data); -unmap_sg: - ib_dma_unmap_sg(ib_dev, sg_start, sg_nents, - (wr->iser_ib_op == ISER_IB_RDMA_WRITE) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); return ret; } @@ -2302,24 +2872,35 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) return rc; } - /* - * Build isert_conn->tx_desc for iSCSI response PDU and attach - */ - isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); - iscsit_build_rsp_pdu(cmd, conn, false, (struct iscsi_scsi_rsp *) - &isert_cmd->tx_desc.iscsi_header); - isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); - isert_init_send_wr(isert_cmd, &isert_cmd->tx_desc.send_wr); + if (se_cmd->prot_op == TARGET_PROT_NORMAL) { + /* + * Build isert_conn->tx_desc for iSCSI response PDU and attach + */ + isert_create_send_desc(isert_conn, isert_cmd, + &isert_cmd->tx_desc); + iscsit_build_rsp_pdu(cmd, conn, true, (struct iscsi_scsi_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, + &isert_cmd->tx_desc.send_wr, true); + isert_cmd->rdma_wr.s_send_wr.next = &isert_cmd->tx_desc.send_wr; + wr->send_wr_num += 1; + } - atomic_inc(&isert_conn->post_send_buf_count); + atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count); rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); if (rc) { pr_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n"); - atomic_dec(&isert_conn->post_send_buf_count); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); } - pr_debug("Cmd: %p posted RDMA_WRITE + Response for iSER Data READ\n", - isert_cmd); + + if (se_cmd->prot_op == TARGET_PROT_NORMAL) + pr_debug("Cmd: %p posted RDMA_WRITE + Response for iSER Data " + "READ\n", isert_cmd); + else + pr_debug("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", + isert_cmd); return 1; } @@ -2344,12 +2925,12 @@ isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) return rc; } - atomic_inc(&isert_conn->post_send_buf_count); + atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count); rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); if (rc) { pr_warn("ib_post_send() failed for IB_WR_RDMA_READ\n"); - atomic_dec(&isert_conn->post_send_buf_count); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); } pr_debug("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n", isert_cmd); @@ -2430,7 +3011,7 @@ isert_setup_np(struct iscsi_np *np, pr_err("Unable to allocate struct isert_np\n"); return -ENOMEM; } - init_waitqueue_head(&isert_np->np_accept_wq); + sema_init(&isert_np->np_sem, 0); mutex_init(&isert_np->np_accept_mutex); INIT_LIST_HEAD(&isert_np->np_accept_list); init_completion(&isert_np->np_login_comp); @@ -2479,18 +3060,6 @@ out: } static int -isert_check_accept_queue(struct isert_np *isert_np) -{ - int empty; - - mutex_lock(&isert_np->np_accept_mutex); - empty = list_empty(&isert_np->np_accept_list); - mutex_unlock(&isert_np->np_accept_mutex); - - return empty; -} - -static int isert_rdma_accept(struct isert_conn *isert_conn) { struct rdma_cm_id *cm_id = isert_conn->conn_cm_id; @@ -2582,16 +3151,19 @@ isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) int max_accept = 0, ret; accept_wait: - ret = wait_event_interruptible(isert_np->np_accept_wq, - !isert_check_accept_queue(isert_np) || - np->np_thread_state == ISCSI_NP_THREAD_RESET); + ret = down_interruptible(&isert_np->np_sem); if (max_accept > 5) return -ENODEV; spin_lock_bh(&np->np_thread_lock); - if (np->np_thread_state == ISCSI_NP_THREAD_RESET) { + if (np->np_thread_state >= ISCSI_NP_THREAD_RESET) { spin_unlock_bh(&np->np_thread_lock); - pr_err("ISCSI_NP_THREAD_RESET for isert_accept_np\n"); + pr_debug("np_thread_state %d for isert_accept_np\n", + np->np_thread_state); + /** + * No point in stalling here when np_thread + * is in state RESET/SHUTDOWN/EXIT - bail + **/ return -ENODEV; } spin_unlock_bh(&np->np_thread_lock); @@ -2636,63 +3208,37 @@ isert_free_np(struct iscsi_np *np) kfree(isert_np); } -static int isert_check_state(struct isert_conn *isert_conn, int state) -{ - int ret; - - mutex_lock(&isert_conn->conn_mutex); - ret = (isert_conn->state == state); - mutex_unlock(&isert_conn->conn_mutex); - - return ret; -} - -static void isert_free_conn(struct iscsi_conn *conn) +static void isert_wait_conn(struct iscsi_conn *conn) { struct isert_conn *isert_conn = conn->context; - pr_debug("isert_free_conn: Starting \n"); - /* - * Decrement post_send_buf_count for special case when called - * from isert_do_control_comp() -> iscsit_logout_post_handler() - */ - mutex_lock(&isert_conn->conn_mutex); - if (isert_conn->logout_posted) - atomic_dec(&isert_conn->post_send_buf_count); + pr_debug("isert_wait_conn: Starting \n"); - if (isert_conn->conn_cm_id && isert_conn->state != ISER_CONN_DOWN) { - pr_debug("Calling rdma_disconnect from isert_free_conn\n"); + mutex_lock(&isert_conn->conn_mutex); + if (isert_conn->conn_cm_id) { + pr_debug("Calling rdma_disconnect from isert_wait_conn\n"); rdma_disconnect(isert_conn->conn_cm_id); } /* * Only wait for conn_wait_comp_err if the isert_conn made it * into full feature phase.. */ - if (isert_conn->state == ISER_CONN_UP) { - pr_debug("isert_free_conn: Before wait_event comp_err %d\n", - isert_conn->state); - mutex_unlock(&isert_conn->conn_mutex); - - wait_event(isert_conn->conn_wait_comp_err, - (isert_check_state(isert_conn, ISER_CONN_TERMINATING))); - - wait_event(isert_conn->conn_wait, - (isert_check_state(isert_conn, ISER_CONN_DOWN))); - - isert_put_conn(isert_conn); - return; - } if (isert_conn->state == ISER_CONN_INIT) { mutex_unlock(&isert_conn->conn_mutex); - isert_put_conn(isert_conn); return; } - pr_debug("isert_free_conn: wait_event conn_wait %d\n", - isert_conn->state); + if (isert_conn->state == ISER_CONN_UP) + isert_conn->state = ISER_CONN_TERMINATING; mutex_unlock(&isert_conn->conn_mutex); - wait_event(isert_conn->conn_wait, - (isert_check_state(isert_conn, ISER_CONN_DOWN))); + wait_for_completion(&isert_conn->conn_wait_comp_err); + + wait_for_completion(&isert_conn->conn_wait); +} + +static void isert_free_conn(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; isert_put_conn(isert_conn); } @@ -2705,6 +3251,7 @@ static struct iscsit_transport iser_target_transport = { .iscsit_setup_np = isert_setup_np, .iscsit_accept_np = isert_accept_np, .iscsit_free_np = isert_free_np, + .iscsit_wait_conn = isert_wait_conn, .iscsit_free_conn = isert_free_conn, .iscsit_get_login_rx = isert_get_login_rx, .iscsit_put_login_tx = isert_put_login_tx, @@ -2713,6 +3260,8 @@ static struct iscsit_transport iser_target_transport = { .iscsit_get_dataout = isert_get_dataout, .iscsit_queue_data_in = isert_put_datain, .iscsit_queue_status = isert_put_response, + .iscsit_aborted_task = isert_aborted_task, + .iscsit_get_sup_prot_ops = isert_get_sup_prot_ops, }; static int __init isert_init(void) @@ -2743,6 +3292,7 @@ destroy_rx_wq: static void __exit isert_exit(void) { + flush_scheduled_work(); destroy_workqueue(isert_comp_wq); destroy_workqueue(isert_rx_wq); iscsit_unregister_transport(&iser_target_transport); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index 631f2090f0b..04f51f7bf61 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -6,6 +6,7 @@ #define ISERT_RDMA_LISTEN_BACKLOG 10 #define ISCSI_ISER_SG_TABLESIZE 256 +#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL enum isert_desc_type { ISCSI_TX_CONTROL, @@ -43,14 +44,41 @@ struct iser_tx_desc { struct ib_sge tx_sg[2]; int num_sge; struct isert_cmd *isert_cmd; + struct llist_node *comp_llnode_batch; + struct llist_node comp_llnode; + bool llnode_active; struct ib_send_wr send_wr; } __packed; +enum isert_indicator { + ISERT_PROTECTED = 1 << 0, + ISERT_DATA_KEY_VALID = 1 << 1, + ISERT_PROT_KEY_VALID = 1 << 2, + ISERT_SIG_KEY_VALID = 1 << 3, +}; + +struct pi_context { + struct ib_mr *prot_mr; + struct ib_fast_reg_page_list *prot_frpl; + struct ib_mr *sig_mr; +}; + struct fast_reg_descriptor { - struct list_head list; - struct ib_mr *data_mr; - struct ib_fast_reg_page_list *data_frpl; - bool valid; + struct list_head list; + struct ib_mr *data_mr; + struct ib_fast_reg_page_list *data_frpl; + u8 ind; + struct pi_context *pi_ctx; +}; + +struct isert_data_buf { + struct scatterlist *sg; + int nents; + u32 sg_off; + u32 len; /* cur_rdma_length */ + u32 offset; + unsigned int dma_nents; + enum dma_data_direction dma_dir; }; struct isert_rdma_wr { @@ -59,12 +87,11 @@ struct isert_rdma_wr { enum iser_ib_op_code iser_ib_op; struct ib_sge *ib_sge; struct ib_sge s_ib_sge; - int num_sge; - struct scatterlist *sge; int send_wr_num; struct ib_send_wr *send_wr; struct ib_send_wr s_send_wr; - u32 cur_rdma_length; + struct isert_data_buf data; + struct isert_data_buf prot; struct fast_reg_descriptor *fr_desc; }; @@ -89,7 +116,6 @@ struct isert_device; struct isert_conn { enum iser_conn_state state; - bool logout_posted; int post_recv_buf_count; atomic_t post_send_buf_count; u32 responder_resources; @@ -114,13 +140,17 @@ struct isert_conn { struct isert_device *conn_device; struct work_struct conn_logout_work; struct mutex conn_mutex; - wait_queue_head_t conn_wait; - wait_queue_head_t conn_wait_comp_err; + struct completion conn_wait; + struct completion conn_wait_comp_err; struct kref conn_kref; - struct list_head conn_frwr_pool; - int conn_frwr_pool_size; - /* lock to protect frwr_pool */ + struct list_head conn_fr_pool; + int conn_fr_pool_size; + /* lock to protect fastreg pool */ spinlock_t conn_lock; +#define ISERT_COMP_BATCH_COUNT 8 + int conn_comp_batch; + struct llist_head conn_comp_llist; + bool disconnect; }; #define ISERT_MAX_CQ 64 @@ -133,13 +163,12 @@ struct isert_cq_desc { }; struct isert_device { - int use_frwr; + int use_fastreg; + bool pi_capable; int cqs_used; int refcount; int cq_active_qps[ISERT_MAX_CQ]; struct ib_device *ib_device; - struct ib_pd *dev_pd; - struct ib_mr *dev_mr; struct ib_cq *dev_rx_cq[ISERT_MAX_CQ]; struct ib_cq *dev_tx_cq[ISERT_MAX_CQ]; struct isert_cq_desc *cq_desc; @@ -153,7 +182,7 @@ struct isert_device { }; struct isert_np { - wait_queue_head_t np_accept_wq; + struct semaphore np_sem; struct rdma_cm_id *np_cm_id; struct mutex np_accept_mutex; struct list_head np_accept_list; diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index f93baf8254c..e3c2c5b4297 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -30,7 +30,7 @@ * SOFTWARE. */ -#define pr_fmt(fmt) PFX fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/init.h> @@ -46,6 +46,7 @@ #include <scsi/scsi.h> #include <scsi/scsi_device.h> #include <scsi/scsi_dbg.h> +#include <scsi/scsi_tcq.h> #include <scsi/srp.h> #include <scsi/scsi_transport_srp.h> @@ -65,6 +66,8 @@ static unsigned int srp_sg_tablesize; static unsigned int cmd_sg_entries; static unsigned int indirect_sg_entries; static bool allow_ext_sg; +static bool prefer_fr; +static bool register_always; static int topspin_workarounds = 1; module_param(srp_sg_tablesize, uint, 0444); @@ -86,6 +89,40 @@ module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); +module_param(prefer_fr, bool, 0444); +MODULE_PARM_DESC(prefer_fr, +"Whether to use fast registration if both FMR and fast registration are supported"); + +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + +static struct kernel_param_ops srp_tmo_ops; + +static int srp_reconnect_delay = 10; +module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts"); + +static int srp_fast_io_fail_tmo = 15; +module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(fast_io_fail_tmo, + "Number of seconds between the observation of a transport" + " layer error and failing all I/O. \"off\" means that this" + " functionality is disabled."); + +static int srp_dev_loss_tmo = 600; +module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dev_loss_tmo, + "Maximum number of seconds that the SRP transport should" + " insulate transport layer errors. After this time has been" + " exceeded the SCSI host is removed. Should be" + " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + " if fast_io_fail_tmo has not been set. \"off\" means that" + " this functionality is disabled."); + static void srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device); static void srp_recv_completion(struct ib_cq *cq, void *target_ptr); @@ -102,6 +139,48 @@ static struct ib_client srp_client = { static struct ib_sa_client srp_sa_client; +static int srp_tmo_get(char *buffer, const struct kernel_param *kp) +{ + int tmo = *(int *)kp->arg; + + if (tmo >= 0) + return sprintf(buffer, "%d", tmo); + else + return sprintf(buffer, "off"); +} + +static int srp_tmo_set(const char *val, const struct kernel_param *kp) +{ + int tmo, res; + + if (strncmp(val, "off", 3) != 0) { + res = kstrtoint(val, 0, &tmo); + if (res) + goto out; + } else { + tmo = -1; + } + if (kp->arg == &srp_reconnect_delay) + res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo, + srp_dev_loss_tmo); + else if (kp->arg == &srp_fast_io_fail_tmo) + res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo); + else + res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo, + tmo); + if (res) + goto out; + *(int *)kp->arg = tmo; + +out: + return res; +} + +static struct kernel_param_ops srp_tmo_ops = { + .get = srp_tmo_get, + .set = srp_tmo_set, +}; + static inline struct srp_target_port *host_to_target(struct Scsi_Host *host) { return (struct srp_target_port *) host->hostdata; @@ -219,28 +298,174 @@ static int srp_new_cm_id(struct srp_target_port *target) return 0; } +static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_fmr_pool_param fmr_param; + + memset(&fmr_param, 0, sizeof(fmr_param)); + fmr_param.pool_size = target->scsi_host->can_queue; + fmr_param.dirty_watermark = fmr_param.pool_size / 4; + fmr_param.cache = 1; + fmr_param.max_pages_per_fmr = dev->max_pages_per_mr; + fmr_param.page_shift = ilog2(dev->mr_page_size); + fmr_param.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + return ib_create_fmr_pool(dev->pd, &fmr_param); +} + +/** + * srp_destroy_fr_pool() - free the resources owned by a pool + * @pool: Fast registration pool to be destroyed. + */ +static void srp_destroy_fr_pool(struct srp_fr_pool *pool) +{ + int i; + struct srp_fr_desc *d; + + if (!pool) + return; + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + if (d->frpl) + ib_free_fast_reg_page_list(d->frpl); + if (d->mr) + ib_dereg_mr(d->mr); + } + kfree(pool); +} + +/** + * srp_create_fr_pool() - allocate and initialize a pool for fast registration + * @device: IB device to allocate fast registration descriptors for. + * @pd: Protection domain associated with the FR descriptors. + * @pool_size: Number of descriptors to allocate. + * @max_page_list_len: Maximum fast registration work request page list length. + */ +static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, + struct ib_pd *pd, int pool_size, + int max_page_list_len) +{ + struct srp_fr_pool *pool; + struct srp_fr_desc *d; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + int i, ret = -EINVAL; + + if (pool_size <= 0) + goto err; + ret = -ENOMEM; + pool = kzalloc(sizeof(struct srp_fr_pool) + + pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL); + if (!pool) + goto err; + pool->size = pool_size; + pool->max_page_list_len = max_page_list_len; + spin_lock_init(&pool->lock); + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + mr = ib_alloc_fast_reg_mr(pd, max_page_list_len); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto destroy_pool; + } + d->mr = mr; + frpl = ib_alloc_fast_reg_page_list(device, max_page_list_len); + if (IS_ERR(frpl)) { + ret = PTR_ERR(frpl); + goto destroy_pool; + } + d->frpl = frpl; + list_add_tail(&d->entry, &pool->free_list); + } + +out: + return pool; + +destroy_pool: + srp_destroy_fr_pool(pool); + +err: + pool = ERR_PTR(ret); + goto out; +} + +/** + * srp_fr_pool_get() - obtain a descriptor suitable for fast registration + * @pool: Pool to obtain descriptor from. + */ +static struct srp_fr_desc *srp_fr_pool_get(struct srp_fr_pool *pool) +{ + struct srp_fr_desc *d = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + if (!list_empty(&pool->free_list)) { + d = list_first_entry(&pool->free_list, typeof(*d), entry); + list_del(&d->entry); + } + spin_unlock_irqrestore(&pool->lock, flags); + + return d; +} + +/** + * srp_fr_pool_put() - put an FR descriptor back in the free list + * @pool: Pool the descriptor was allocated from. + * @desc: Pointer to an array of fast registration descriptor pointers. + * @n: Number of descriptors to put back. + * + * Note: The caller must already have queued an invalidation request for + * desc->mr->rkey before calling this function. + */ +static void srp_fr_pool_put(struct srp_fr_pool *pool, struct srp_fr_desc **desc, + int n) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&pool->lock, flags); + for (i = 0; i < n; i++) + list_add(&desc[i]->entry, &pool->free_list); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + + return srp_create_fr_pool(dev->dev, dev->pd, + target->scsi_host->can_queue, + dev->max_pages_per_mr); +} + static int srp_create_target_ib(struct srp_target_port *target) { + struct srp_device *dev = target->srp_host->srp_dev; struct ib_qp_init_attr *init_attr; struct ib_cq *recv_cq, *send_cq; struct ib_qp *qp; + struct ib_fmr_pool *fmr_pool = NULL; + struct srp_fr_pool *fr_pool = NULL; + const int m = 1 + dev->use_fast_reg; int ret; init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); if (!init_attr) return -ENOMEM; - recv_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_recv_completion, NULL, target, SRP_RQ_SIZE, - target->comp_vector); + recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, target, + target->queue_size, target->comp_vector); if (IS_ERR(recv_cq)) { ret = PTR_ERR(recv_cq); goto err; } - send_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_send_completion, NULL, target, SRP_SQ_SIZE, - target->comp_vector); + send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, target, + m * target->queue_size, target->comp_vector); if (IS_ERR(send_cq)) { ret = PTR_ERR(send_cq); goto err_recv_cq; @@ -249,16 +474,16 @@ static int srp_create_target_ib(struct srp_target_port *target) ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP); init_attr->event_handler = srp_qp_event; - init_attr->cap.max_send_wr = SRP_SQ_SIZE; - init_attr->cap.max_recv_wr = SRP_RQ_SIZE; + init_attr->cap.max_send_wr = m * target->queue_size; + init_attr->cap.max_recv_wr = target->queue_size; init_attr->cap.max_recv_sge = 1; init_attr->cap.max_send_sge = 1; - init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; init_attr->qp_type = IB_QPT_RC; init_attr->send_cq = send_cq; init_attr->recv_cq = recv_cq; - qp = ib_create_qp(target->srp_host->srp_dev->pd, init_attr); + qp = ib_create_qp(dev->pd, init_attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_send_cq; @@ -268,6 +493,30 @@ static int srp_create_target_ib(struct srp_target_port *target) if (ret) goto err_qp; + if (dev->use_fast_reg && dev->has_fr) { + fr_pool = srp_alloc_fr_pool(target); + if (IS_ERR(fr_pool)) { + ret = PTR_ERR(fr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FR pool allocation failed (%d)\n", ret); + goto err_qp; + } + if (target->fr_pool) + srp_destroy_fr_pool(target->fr_pool); + target->fr_pool = fr_pool; + } else if (!dev->use_fast_reg && dev->has_fmr) { + fmr_pool = srp_alloc_fmr_pool(target); + if (IS_ERR(fmr_pool)) { + ret = PTR_ERR(fmr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FMR pool allocation failed (%d)\n", ret); + goto err_qp; + } + if (target->fmr_pool) + ib_destroy_fmr_pool(target->fmr_pool); + target->fmr_pool = fmr_pool; + } + if (target->qp) ib_destroy_qp(target->qp); if (target->recv_cq) @@ -296,10 +545,22 @@ err: return ret; } +/* + * Note: this function may be called without srp_alloc_iu_bufs() having been + * invoked. Hence the target->[rt]x_ring checks. + */ static void srp_free_target_ib(struct srp_target_port *target) { + struct srp_device *dev = target->srp_host->srp_dev; int i; + if (dev->use_fast_reg) { + if (target->fr_pool) + srp_destroy_fr_pool(target->fr_pool); + } else { + if (target->fmr_pool) + ib_destroy_fmr_pool(target->fmr_pool); + } ib_destroy_qp(target->qp); ib_destroy_cq(target->send_cq); ib_destroy_cq(target->recv_cq); @@ -307,10 +568,18 @@ static void srp_free_target_ib(struct srp_target_port *target) target->qp = NULL; target->send_cq = target->recv_cq = NULL; - for (i = 0; i < SRP_RQ_SIZE; ++i) - srp_free_iu(target->srp_host, target->rx_ring[i]); - for (i = 0; i < SRP_SQ_SIZE; ++i) - srp_free_iu(target->srp_host, target->tx_ring[i]); + if (target->rx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->rx_ring[i]); + kfree(target->rx_ring); + target->rx_ring = NULL; + } + if (target->tx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->tx_ring[i]); + kfree(target->tx_ring); + target->tx_ring = NULL; + } } static void srp_path_rec_completion(int status, @@ -330,6 +599,8 @@ static void srp_path_rec_completion(int status, static int srp_lookup_path(struct srp_target_port *target) { + int ret; + target->path.numb_path = 1; init_completion(&target->done); @@ -350,7 +621,9 @@ static int srp_lookup_path(struct srp_target_port *target) if (target->path_query_id < 0) return target->path_query_id; - wait_for_completion(&target->done); + ret = wait_for_completion_interruptible(&target->done); + if (ret < 0) + return ret; if (target->status < 0) shost_printk(KERN_WARNING, target->scsi_host, @@ -390,7 +663,7 @@ static int srp_send_req(struct srp_target_port *target) req->param.responder_resources = 4; req->param.remote_cm_response_timeout = 20; req->param.local_cm_response_timeout = 20; - req->param.retry_count = 7; + req->param.retry_count = target->tl_retry_count; req->param.rnr_retry_count = 7; req->param.max_cm_retries = 15; @@ -492,12 +765,20 @@ static void srp_disconnect_target(struct srp_target_port *target) static void srp_free_req_data(struct srp_target_port *target) { - struct ib_device *ibdev = target->srp_host->srp_dev->dev; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; struct srp_request *req; int i; - for (i = 0, req = target->req_ring; i < SRP_CMD_SQ_SIZE; ++i, ++req) { - kfree(req->fmr_list); + if (!target->req_ring) + return; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &target->req_ring[i]; + if (dev->use_fast_reg) + kfree(req->fr_list); + else + kfree(req->fmr_list); kfree(req->map_page); if (req->indirect_dma_addr) { ib_dma_unmap_single(ibdev, req->indirect_dma_addr, @@ -506,6 +787,59 @@ static void srp_free_req_data(struct srp_target_port *target) } kfree(req->indirect_desc); } + + kfree(target->req_ring); + target->req_ring = NULL; +} + +static int srp_alloc_req_data(struct srp_target_port *target) +{ + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + struct srp_request *req; + void *mr_list; + dma_addr_t dma_addr; + int i, ret = -ENOMEM; + + INIT_LIST_HEAD(&target->free_reqs); + + target->req_ring = kzalloc(target->req_ring_size * + sizeof(*target->req_ring), GFP_KERNEL); + if (!target->req_ring) + goto out; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &target->req_ring[i]; + mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), + GFP_KERNEL); + if (!mr_list) + goto out; + if (srp_dev->use_fast_reg) + req->fr_list = mr_list; + else + req->fmr_list = mr_list; + req->map_page = kmalloc(srp_dev->max_pages_per_mr * + sizeof(void *), GFP_KERNEL); + if (!req->map_page) + goto out; + req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); + if (!req->indirect_desc) + goto out; + + dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, + target->indirect_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, dma_addr)) + goto out; + + req->indirect_dma_addr = dma_addr; + req->index = i; + list_add_tail(&req->list, &target->free_reqs); + } + ret = 0; + +out: + return ret; } /** @@ -528,12 +862,21 @@ static void srp_remove_target(struct srp_target_port *target) WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); srp_del_scsi_host_attr(target->scsi_host); + srp_rport_get(target->rport); srp_remove_host(target->scsi_host); scsi_remove_host(target->scsi_host); + srp_stop_rport_timers(target->rport); srp_disconnect_target(target); ib_destroy_cm_id(target->cm_id); srp_free_target_ib(target); + cancel_work_sync(&target->tl_err_work); + srp_rport_put(target->rport); srp_free_req_data(target); + + spin_lock(&target->srp_host->target_lock); + list_del(&target->list); + spin_unlock(&target->srp_host->target_lock); + scsi_host_put(target->scsi_host); } @@ -545,10 +888,6 @@ static void srp_remove_work(struct work_struct *work) WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); srp_remove_target(target); - - spin_lock(&target->srp_host->target_lock); - list_del(&target->list); - spin_unlock(&target->srp_host->target_lock); } static void srp_rport_delete(struct srp_rport *rport) @@ -576,7 +915,9 @@ static int srp_connect_target(struct srp_target_port *target) ret = srp_send_req(target); if (ret) return ret; - wait_for_completion(&target->done); + ret = wait_for_completion_interruptible(&target->done); + if (ret < 0) + return ret; /* * The CM event handling code will set status to @@ -619,21 +960,56 @@ static int srp_connect_target(struct srp_target_port *target) } } +static int srp_inv_rkey(struct srp_target_port *target, u32 rkey) +{ + struct ib_send_wr *bad_wr; + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .wr_id = LOCAL_INV_WR_ID_MASK, + .next = NULL, + .num_sge = 0, + .send_flags = 0, + .ex.invalidate_rkey = rkey, + }; + + return ib_post_send(target->qp, &wr, &bad_wr); +} + static void srp_unmap_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { - struct ib_device *ibdev = target->srp_host->srp_dev->dev; - struct ib_pool_fmr **pfmr; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + int i, res; if (!scsi_sglist(scmnd) || (scmnd->sc_data_direction != DMA_TO_DEVICE && scmnd->sc_data_direction != DMA_FROM_DEVICE)) return; - pfmr = req->fmr_list; - while (req->nfmr--) - ib_fmr_pool_unmap(*pfmr++); + if (dev->use_fast_reg) { + struct srp_fr_desc **pfr; + + for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) { + res = srp_inv_rkey(target, (*pfr)->mr->rkey); + if (res < 0) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "Queueing INV WR for rkey %#x failed (%d)\n", + (*pfr)->mr->rkey, res); + queue_work(system_long_wq, + &target->tl_err_work); + } + } + if (req->nmdesc) + srp_fr_pool_put(target->fr_pool, req->fr_list, + req->nmdesc); + } else { + struct ib_pool_fmr **pfmr; + + for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++) + ib_fmr_pool_unmap(*pfmr); + } ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), scmnd->sc_data_direction); @@ -643,6 +1019,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, * srp_claim_req - Take ownership of the scmnd associated with a request. * @target: SRP target port. * @req: SRP request. + * @sdev: If not NULL, only take ownership for this SCSI device. * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take * ownership of @req->scmnd if it equals @scmnd. * @@ -651,16 +1028,17 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, */ static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target, struct srp_request *req, + struct scsi_device *sdev, struct scsi_cmnd *scmnd) { unsigned long flags; spin_lock_irqsave(&target->lock, flags); - if (!scmnd) { + if (req->scmnd && + (!sdev || req->scmnd->device == sdev) && + (!scmnd || req->scmnd == scmnd)) { scmnd = req->scmnd; req->scmnd = NULL; - } else if (req->scmnd == scmnd) { - req->scmnd = NULL; } else { scmnd = NULL; } @@ -671,6 +1049,10 @@ static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target, /** * srp_free_req() - Unmap data and add request to the free request list. + * @target: SRP target port. + * @req: Request to be freed. + * @scmnd: SCSI command associated with @req. + * @req_lim_delta: Amount to be added to @target->req_lim. */ static void srp_free_req(struct srp_target_port *target, struct srp_request *req, struct scsi_cmnd *scmnd, @@ -686,23 +1068,52 @@ static void srp_free_req(struct srp_target_port *target, spin_unlock_irqrestore(&target->lock, flags); } -static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) +static void srp_finish_req(struct srp_target_port *target, + struct srp_request *req, struct scsi_device *sdev, + int result) { - struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL); + struct scsi_cmnd *scmnd = srp_claim_req(target, req, sdev, NULL); if (scmnd) { srp_free_req(target, req, scmnd, 0); - scmnd->result = DID_RESET << 16; + scmnd->result = result; scmnd->scsi_done(scmnd); } } -static int srp_reconnect_target(struct srp_target_port *target) +static void srp_terminate_io(struct srp_rport *rport) { + struct srp_target_port *target = rport->lld_data; struct Scsi_Host *shost = target->scsi_host; - int i, ret; + struct scsi_device *sdev; + int i; - scsi_target_block(&shost->shost_gendev); + /* + * Invoking srp_terminate_io() while srp_queuecommand() is running + * is not safe. Hence the warning statement below. + */ + shost_for_each_device(sdev, shost) + WARN_ON_ONCE(sdev->request_queue->request_fn_active); + + for (i = 0; i < target->req_ring_size; ++i) { + struct srp_request *req = &target->req_ring[i]; + srp_finish_req(target, req, NULL, DID_TRANSPORT_FAILFAST << 16); + } +} + +/* + * It is up to the caller to ensure that srp_rport_reconnect() calls are + * serialized and that no concurrent srp_queuecommand(), srp_abort(), + * srp_reset_device() or srp_reset_host() calls will occur while this function + * is in progress. One way to realize that is not to call this function + * directly but to call srp_reconnect_rport() instead since that last function + * serializes calls of this function via rport->mutex and also blocks + * srp_queuecommand() calls before invoking this function. + */ +static int srp_rport_reconnect(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + int i, ret; srp_disconnect_target(target); /* @@ -711,51 +1122,29 @@ static int srp_reconnect_target(struct srp_target_port *target) * callbacks will have finished before a new QP is allocated. */ ret = srp_new_cm_id(target); - /* - * Whether or not creating a new CM ID succeeded, create a new - * QP. This guarantees that all completion callback function - * invocations have finished before request resetting starts. - */ - if (ret == 0) - ret = srp_create_target_ib(target); - else - srp_create_target_ib(target); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + for (i = 0; i < target->req_ring_size; ++i) { struct srp_request *req = &target->req_ring[i]; - if (req->scmnd) - srp_reset_req(target, req); + srp_finish_req(target, req, NULL, DID_RESET << 16); } + /* + * Whether or not creating a new CM ID succeeded, create a new + * QP. This guarantees that all callback functions for the old QP have + * finished before any send requests are posted on the new QP. + */ + ret += srp_create_target_ib(target); + INIT_LIST_HEAD(&target->free_tx); - for (i = 0; i < SRP_SQ_SIZE; ++i) + for (i = 0; i < target->queue_size; ++i) list_add(&target->tx_ring[i]->list, &target->free_tx); if (ret == 0) ret = srp_connect_target(target); - scsi_target_unblock(&shost->shost_gendev, ret == 0 ? SDEV_RUNNING : - SDEV_TRANSPORT_OFFLINE); - target->transport_offline = !!ret; - - if (ret) - goto err; - - shost_printk(KERN_INFO, target->scsi_host, PFX "reconnect succeeded\n"); - - return ret; - -err: - shost_printk(KERN_ERR, target->scsi_host, - PFX "reconnect failed (%d), removing target port.\n", ret); - - /* - * We couldn't reconnect, so kill our target port off. - * However, we have to defer the real removal because we - * are in the context of the SCSI error handler now, which - * will deadlock if we call scsi_remove_host(). - */ - srp_queue_remove_work(target); + if (ret == 0) + shost_printk(KERN_INFO, target->scsi_host, + PFX "reconnect succeeded\n"); return ret; } @@ -777,33 +1166,87 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, static int srp_map_finish_fmr(struct srp_map_state *state, struct srp_target_port *target) { - struct srp_device *dev = target->srp_host->srp_dev; struct ib_pool_fmr *fmr; u64 io_addr = 0; - if (!state->npages) - return 0; - - if (state->npages == 1) { - srp_map_desc(state, state->base_dma_addr, state->fmr_len, - target->rkey); - state->npages = state->fmr_len = 0; - return 0; - } - - fmr = ib_fmr_pool_map_phys(dev->fmr_pool, state->pages, + fmr = ib_fmr_pool_map_phys(target->fmr_pool, state->pages, state->npages, io_addr); if (IS_ERR(fmr)) return PTR_ERR(fmr); *state->next_fmr++ = fmr; - state->nfmr++; + state->nmdesc++; + + srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey); - srp_map_desc(state, 0, state->fmr_len, fmr->fmr->rkey); - state->npages = state->fmr_len = 0; return 0; } +static int srp_map_finish_fr(struct srp_map_state *state, + struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_send_wr *bad_wr; + struct ib_send_wr wr; + struct srp_fr_desc *desc; + u32 rkey; + + desc = srp_fr_pool_get(target->fr_pool); + if (!desc) + return -ENOMEM; + + rkey = ib_inc_rkey(desc->mr->rkey); + ib_update_fast_reg_key(desc->mr, rkey); + + memcpy(desc->frpl->page_list, state->pages, + sizeof(state->pages[0]) * state->npages); + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_FAST_REG_MR; + wr.wr_id = FAST_REG_WR_ID_MASK; + wr.wr.fast_reg.iova_start = state->base_dma_addr; + wr.wr.fast_reg.page_list = desc->frpl; + wr.wr.fast_reg.page_list_len = state->npages; + wr.wr.fast_reg.page_shift = ilog2(dev->mr_page_size); + wr.wr.fast_reg.length = state->dma_len; + wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + wr.wr.fast_reg.rkey = desc->mr->lkey; + + *state->next_fr++ = desc; + state->nmdesc++; + + srp_map_desc(state, state->base_dma_addr, state->dma_len, + desc->mr->rkey); + + return ib_post_send(target->qp, &wr, &bad_wr); +} + +static int srp_finish_mapping(struct srp_map_state *state, + struct srp_target_port *target) +{ + int ret = 0; + + if (state->npages == 0) + return 0; + + if (state->npages == 1 && !register_always) + srp_map_desc(state, state->base_dma_addr, state->dma_len, + target->rkey); + else + ret = target->srp_host->srp_dev->use_fast_reg ? + srp_map_finish_fr(state, target) : + srp_map_finish_fmr(state, target); + + if (ret == 0) { + state->npages = 0; + state->dma_len = 0; + } + + return ret; +} + static void srp_map_update_start(struct srp_map_state *state, struct scatterlist *sg, int sg_index, dma_addr_t dma_addr) @@ -816,7 +1259,7 @@ static void srp_map_update_start(struct srp_map_state *state, static int srp_map_sg_entry(struct srp_map_state *state, struct srp_target_port *target, struct scatterlist *sg, int sg_index, - int use_fmr) + bool use_mr) { struct srp_device *dev = target->srp_host->srp_dev; struct ib_device *ibdev = dev->dev; @@ -828,23 +1271,25 @@ static int srp_map_sg_entry(struct srp_map_state *state, if (!dma_len) return 0; - if (use_fmr == SRP_MAP_NO_FMR) { - /* Once we're in direct map mode for a request, we don't - * go back to FMR mode, so no need to update anything + if (!use_mr) { + /* + * Once we're in direct map mode for a request, we don't + * go back to FMR or FR mode, so no need to update anything * other than the descriptor. */ srp_map_desc(state, dma_addr, dma_len, target->rkey); return 0; } - /* If we start at an offset into the FMR page, don't merge into - * the current FMR. Finish it out, and use the kernel's MR for this - * sg entry. This is to avoid potential bugs on some SRP targets - * that were never quite defined, but went away when the initiator - * avoided using FMR on such page fragments. + /* + * Since not all RDMA HW drivers support non-zero page offsets for + * FMR, if we start at an offset into a page, don't merge into the + * current FMR mapping. Finish it out, and use the kernel's MR for + * this sg entry. */ - if (dma_addr & ~dev->fmr_page_mask || dma_len > dev->fmr_max_size) { - ret = srp_map_finish_fmr(state, target); + if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) || + dma_len > dev->mr_max_size) { + ret = srp_finish_mapping(state, target); if (ret) return ret; @@ -853,52 +1298,106 @@ static int srp_map_sg_entry(struct srp_map_state *state, return 0; } - /* If this is the first sg to go into the FMR, save our position. - * We need to know the first unmapped entry, its index, and the - * first unmapped address within that entry to be able to restart - * mapping after an error. + /* + * If this is the first sg that will be mapped via FMR or via FR, save + * our position. We need to know the first unmapped entry, its index, + * and the first unmapped address within that entry to be able to + * restart mapping after an error. */ if (!state->unmapped_sg) srp_map_update_start(state, sg, sg_index, dma_addr); while (dma_len) { - if (state->npages == SRP_FMR_SIZE) { - ret = srp_map_finish_fmr(state, target); + unsigned offset = dma_addr & ~dev->mr_page_mask; + if (state->npages == dev->max_pages_per_mr || offset != 0) { + ret = srp_finish_mapping(state, target); if (ret) return ret; srp_map_update_start(state, sg, sg_index, dma_addr); } - len = min_t(unsigned int, dma_len, dev->fmr_page_size); + len = min_t(unsigned int, dma_len, dev->mr_page_size - offset); if (!state->npages) state->base_dma_addr = dma_addr; - state->pages[state->npages++] = dma_addr; - state->fmr_len += len; + state->pages[state->npages++] = dma_addr & dev->mr_page_mask; + state->dma_len += len; dma_addr += len; dma_len -= len; } - /* If the last entry of the FMR wasn't a full page, then we need to + /* + * If the last entry of the MR wasn't a full page, then we need to * close it out and start a new one -- we can only merge at page * boundries. */ ret = 0; - if (len != dev->fmr_page_size) { - ret = srp_map_finish_fmr(state, target); + if (len != dev->mr_page_size) { + ret = srp_finish_mapping(state, target); if (!ret) srp_map_update_start(state, NULL, 0, 0); } return ret; } +static int srp_map_sg(struct srp_map_state *state, + struct srp_target_port *target, struct srp_request *req, + struct scatterlist *scat, int count) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + struct scatterlist *sg; + int i; + bool use_mr; + + state->desc = req->indirect_desc; + state->pages = req->map_page; + if (dev->use_fast_reg) { + state->next_fr = req->fr_list; + use_mr = !!target->fr_pool; + } else { + state->next_fmr = req->fmr_list; + use_mr = !!target->fmr_pool; + } + + for_each_sg(scat, sg, count, i) { + if (srp_map_sg_entry(state, target, sg, i, use_mr)) { + /* + * Memory registration failed, so backtrack to the + * first unmapped entry and continue on without using + * memory registration. + */ + dma_addr_t dma_addr; + unsigned int dma_len; + +backtrack: + sg = state->unmapped_sg; + i = state->unmapped_index; + + dma_addr = ib_sg_dma_address(ibdev, sg); + dma_len = ib_sg_dma_len(ibdev, sg); + dma_len -= (state->unmapped_addr - dma_addr); + dma_addr = state->unmapped_addr; + use_mr = false; + srp_map_desc(state, dma_addr, dma_len, target->rkey); + } + } + + if (use_mr && srp_finish_mapping(state, target)) + goto backtrack; + + req->nmdesc = state->nmdesc; + + return 0; +} + static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { - struct scatterlist *scat, *sg; + struct scatterlist *scat; struct srp_cmd *cmd = req->cmd->buf; - int i, len, nents, count, use_fmr; + int len, nents, count; struct srp_device *dev; struct ib_device *ibdev; struct srp_map_state state; @@ -930,7 +1429,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, fmt = SRP_DATA_DESC_DIRECT; len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); - if (count == 1) { + if (count == 1 && !register_always) { /* * The midlayer only generated a single gather/scatter * entry, or DMA mapping coalesced everything to a @@ -943,13 +1442,13 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, buf->key = cpu_to_be32(target->rkey); buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); - req->nfmr = 0; + req->nmdesc = 0; goto map_complete; } - /* We have more than one scatter/gather entry, so build our indirect - * descriptor table, trying to merge as many entries with FMR as we - * can. + /* + * We have more than one scatter/gather entry, so build our indirect + * descriptor table, trying to merge as many entries as we can. */ indirect_hdr = (void *) cmd->add_data; @@ -957,35 +1456,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, target->indirect_size, DMA_TO_DEVICE); memset(&state, 0, sizeof(state)); - state.desc = req->indirect_desc; - state.pages = req->map_page; - state.next_fmr = req->fmr_list; - - use_fmr = dev->fmr_pool ? SRP_MAP_ALLOW_FMR : SRP_MAP_NO_FMR; - - for_each_sg(scat, sg, count, i) { - if (srp_map_sg_entry(&state, target, sg, i, use_fmr)) { - /* FMR mapping failed, so backtrack to the first - * unmapped entry and continue on without using FMR. - */ - dma_addr_t dma_addr; - unsigned int dma_len; - -backtrack: - sg = state.unmapped_sg; - i = state.unmapped_index; - - dma_addr = ib_sg_dma_address(ibdev, sg); - dma_len = ib_sg_dma_len(ibdev, sg); - dma_len -= (state.unmapped_addr - dma_addr); - dma_addr = state.unmapped_addr; - use_fmr = SRP_MAP_NO_FMR; - srp_map_desc(&state, dma_addr, dma_len, target->rkey); - } - } - - if (use_fmr == SRP_MAP_ALLOW_FMR && srp_map_finish_fmr(&state, target)) - goto backtrack; + srp_map_sg(&state, target, req, scat, count); /* We've mapped the request, now pull as much of the indirect * descriptor table as we can into the command buffer. If this @@ -993,9 +1464,9 @@ backtrack: * guaranteed to fit into the command, as the SCSI layer won't * give us more S/G entries than we allow. */ - req->nfmr = state.nfmr; if (state.ndesc == 1) { - /* FMR mapping was able to collapse this to one entry, + /* + * Memory registration collapsed the sg-list into one entry, * so use a direct descriptor. */ struct srp_direct_buf *buf = (void *) cmd->add_data; @@ -1151,7 +1622,7 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) complete(&target->tsk_mgmt_done); } else { req = &target->req_ring[rsp->tag]; - scmnd = srp_claim_req(target, req, NULL); + scmnd = srp_claim_req(target, req, NULL, NULL); if (!scmnd) { shost_printk(KERN_ERR, target->scsi_host, "Null scmnd for RSP w/tag %016llx\n", @@ -1302,15 +1773,41 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) PFX "Recv failed with error code %d\n", res); } -static void srp_handle_qp_err(enum ib_wc_status wc_status, - enum ib_wc_opcode wc_opcode, - struct srp_target_port *target) +/** + * srp_tl_err_work() - handle a transport layer error + * @work: Work structure embedded in an SRP target port. + * + * Note: This function may get invoked before the rport has been created, + * hence the target->rport test. + */ +static void srp_tl_err_work(struct work_struct *work) +{ + struct srp_target_port *target; + + target = container_of(work, struct srp_target_port, tl_err_work); + if (target->rport) + srp_start_tl_fail_timers(target->rport); +} + +static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status, + bool send_err, struct srp_target_port *target) { if (target->connected && !target->qp_in_error) { - shost_printk(KERN_ERR, target->scsi_host, - PFX "failed %s status %d\n", - wc_opcode & IB_WC_RECV ? "receive" : "send", - wc_status); + if (wr_id & LOCAL_INV_WR_ID_MASK) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "LOCAL_INV failed with status %d\n", + wc_status); + } else if (wr_id & FAST_REG_WR_ID_MASK) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "FAST_REG_MR failed status %d\n", + wc_status); + } else { + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %d for iu %p\n", + send_err ? "send" : "receive", + wc_status, (void *)(uintptr_t)wr_id); + } + queue_work(system_long_wq, &target->tl_err_work); } target->qp_in_error = true; } @@ -1325,7 +1822,7 @@ static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) if (likely(wc.status == IB_WC_SUCCESS)) { srp_handle_recv(target, &wc); } else { - srp_handle_qp_err(wc.status, wc.opcode, target); + srp_handle_qp_err(wc.wr_id, wc.status, false, target); } } } @@ -1341,7 +1838,7 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) iu = (struct srp_iu *) (uintptr_t) wc.wr_id; list_add(&iu->list, &target->free_tx); } else { - srp_handle_qp_err(wc.status, wc.opcode, target); + srp_handle_qp_err(wc.wr_id, wc.status, true, target); } } } @@ -1349,18 +1846,27 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(shost); + struct srp_rport *rport = target->rport; struct srp_request *req; struct srp_iu *iu; struct srp_cmd *cmd; struct ib_device *dev; unsigned long flags; - int len; + int len, ret; + const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler; - if (unlikely(target->transport_offline)) { - scmnd->result = DID_NO_CONNECT << 16; - scmnd->scsi_done(scmnd); - return 0; - } + /* + * The SCSI EH thread is the only context from which srp_queuecommand() + * can get invoked for blocked devices (SDEV_BLOCK / + * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by + * locking the rport mutex if invoked from inside the SCSI EH. + */ + if (in_scsi_eh) + mutex_lock(&rport->mutex); + + scmnd->result = srp_chkready(target->rport); + if (unlikely(scmnd->result)) + goto err; spin_lock_irqsave(&target->lock, flags); iu = __srp_get_tx_iu(target, SRP_IU_CMD); @@ -1375,7 +1881,6 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len, DMA_TO_DEVICE); - scmnd->result = 0; scmnd->host_scribble = (void *) req; cmd = iu->buf; @@ -1392,7 +1897,15 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) len = srp_map_data(scmnd, target, req); if (len < 0) { shost_printk(KERN_ERR, target->scsi_host, - PFX "Failed to map data\n"); + PFX "Failed to map data (%d)\n", len); + /* + * If we ran out of memory descriptors (-ENOMEM) because an + * application is queuing many requests with more than + * max_pages_per_mr sg-list elements, tell the SCSI mid-layer + * to reduce queue depth temporarily. + */ + scmnd->result = len == -ENOMEM ? + DID_OK << 16 | QUEUE_FULL << 1 : DID_ERROR << 16; goto err_iu; } @@ -1404,7 +1917,13 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) goto err_unmap; } - return 0; + ret = 0; + +unlock_rport: + if (in_scsi_eh) + mutex_unlock(&rport->mutex); + + return ret; err_unmap: srp_unmap_data(scmnd, target, req); @@ -1412,20 +1931,47 @@ err_unmap: err_iu: srp_put_tx_iu(target, iu, SRP_IU_CMD); + /* + * Avoid that the loops that iterate over the request ring can + * encounter a dangling SCSI command pointer. + */ + req->scmnd = NULL; + spin_lock_irqsave(&target->lock, flags); list_add(&req->list, &target->free_reqs); err_unlock: spin_unlock_irqrestore(&target->lock, flags); - return SCSI_MLQUEUE_HOST_BUSY; +err: + if (scmnd->result) { + scmnd->scsi_done(scmnd); + ret = 0; + } else { + ret = SCSI_MLQUEUE_HOST_BUSY; + } + + goto unlock_rport; } +/* + * Note: the resources allocated in this function are freed in + * srp_free_target_ib(). + */ static int srp_alloc_iu_bufs(struct srp_target_port *target) { int i; - for (i = 0; i < SRP_RQ_SIZE; ++i) { + target->rx_ring = kzalloc(target->queue_size * sizeof(*target->rx_ring), + GFP_KERNEL); + if (!target->rx_ring) + goto err_no_ring; + target->tx_ring = kzalloc(target->queue_size * sizeof(*target->tx_ring), + GFP_KERNEL); + if (!target->tx_ring) + goto err_no_ring; + + for (i = 0; i < target->queue_size; ++i) { target->rx_ring[i] = srp_alloc_iu(target->srp_host, target->max_ti_iu_len, GFP_KERNEL, DMA_FROM_DEVICE); @@ -1433,7 +1979,7 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) goto err; } - for (i = 0; i < SRP_SQ_SIZE; ++i) { + for (i = 0; i < target->queue_size; ++i) { target->tx_ring[i] = srp_alloc_iu(target->srp_host, target->max_iu_len, GFP_KERNEL, DMA_TO_DEVICE); @@ -1446,16 +1992,18 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) return 0; err: - for (i = 0; i < SRP_RQ_SIZE; ++i) { + for (i = 0; i < target->queue_size; ++i) { srp_free_iu(target->srp_host, target->rx_ring[i]); - target->rx_ring[i] = NULL; - } - - for (i = 0; i < SRP_SQ_SIZE; ++i) { srp_free_iu(target->srp_host, target->tx_ring[i]); - target->tx_ring[i] = NULL; } + +err_no_ring: + kfree(target->tx_ring); + target->tx_ring = NULL; + kfree(target->rx_ring); + target->rx_ring = NULL; + return -ENOMEM; } @@ -1506,6 +2054,9 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, target->scsi_host->can_queue = min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, target->scsi_host->can_queue); + target->scsi_host->cmd_per_lun + = min_t(int, target->scsi_host->can_queue, + target->scsi_host->cmd_per_lun); } else { shost_printk(KERN_WARNING, target->scsi_host, PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); @@ -1513,7 +2064,7 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, goto error; } - if (!target->rx_ring[0]) { + if (!target->rx_ring) { ret = srp_alloc_iu_bufs(target); if (ret) goto error; @@ -1533,7 +2084,7 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, if (ret) goto error_free; - for (i = 0; i < SRP_RQ_SIZE; i++) { + for (i = 0; i < target->queue_size; i++) { struct srp_iu *iu = target->rx_ring[i]; ret = srp_post_recv(target, iu); if (ret) @@ -1619,8 +2170,10 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, shost_printk(KERN_WARNING, shost, PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); else - shost_printk(KERN_WARNING, shost, - PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason); + shost_printk(KERN_WARNING, shost, PFX + "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n", + target->path.sgid.raw, + target->orig_dgid, reason); } else shost_printk(KERN_WARNING, shost, " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," @@ -1672,11 +2225,13 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) if (ib_send_cm_drep(cm_id, NULL, 0)) shost_printk(KERN_ERR, target->scsi_host, PFX "Sending CM DREP failed\n"); + queue_work(system_long_wq, &target->tl_err_work); break; case IB_CM_TIMEWAIT_EXIT: shost_printk(KERN_ERR, target->scsi_host, PFX "connection closed\n"); + comp = 1; target->status = 0; break; @@ -1698,9 +2253,61 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) return 0; } +/** + * srp_change_queue_type - changing device queue tag type + * @sdev: scsi device struct + * @tag_type: requested tag type + * + * Returns queue tag type. + */ +static int +srp_change_queue_type(struct scsi_device *sdev, int tag_type) +{ + if (sdev->tagged_supported) { + scsi_set_tag_type(sdev, tag_type); + if (tag_type) + scsi_activate_tcq(sdev, sdev->queue_depth); + else + scsi_deactivate_tcq(sdev, sdev->queue_depth); + } else + tag_type = 0; + + return tag_type; +} + +/** + * srp_change_queue_depth - setting device queue depth + * @sdev: scsi device struct + * @qdepth: requested queue depth + * @reason: SCSI_QDEPTH_DEFAULT/SCSI_QDEPTH_QFULL/SCSI_QDEPTH_RAMP_UP + * (see include/scsi/scsi_host.h for definition) + * + * Returns queue depth. + */ +static int +srp_change_queue_depth(struct scsi_device *sdev, int qdepth, int reason) +{ + struct Scsi_Host *shost = sdev->host; + int max_depth; + if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) { + max_depth = shost->can_queue; + if (!sdev->tagged_supported) + max_depth = 1; + if (qdepth > max_depth) + qdepth = max_depth; + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth); + } else if (reason == SCSI_QDEPTH_QFULL) + scsi_track_queue_full(sdev, qdepth); + else + return -EOPNOTSUPP; + + return sdev->queue_depth; +} + static int srp_send_tsk_mgmt(struct srp_target_port *target, u64 req_tag, unsigned int lun, u8 func) { + struct srp_rport *rport = target->rport; struct ib_device *dev = target->srp_host->srp_dev->dev; struct srp_iu *iu; struct srp_tsk_mgmt *tsk_mgmt; @@ -1710,12 +2317,20 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, init_completion(&target->tsk_mgmt_done); + /* + * Lock the rport mutex to avoid that srp_create_target_ib() is + * invoked while a task management function is being sent. + */ + mutex_lock(&rport->mutex); spin_lock_irq(&target->lock); iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); spin_unlock_irq(&target->lock); - if (!iu) + if (!iu) { + mutex_unlock(&rport->mutex); + return -1; + } ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, DMA_TO_DEVICE); @@ -1732,8 +2347,11 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, DMA_TO_DEVICE); if (srp_post_send(target, iu, sizeof *tsk_mgmt)) { srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT); + mutex_unlock(&rport->mutex); + return -1; } + mutex_unlock(&rport->mutex); if (!wait_for_completion_timeout(&target->tsk_mgmt_done, msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS))) @@ -1750,12 +2368,12 @@ static int srp_abort(struct scsi_cmnd *scmnd) shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); - if (!req || !srp_claim_req(target, req, scmnd)) - return FAILED; + if (!req || !srp_claim_req(target, req, NULL, scmnd)) + return SUCCESS; if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, SRP_TSK_ABORT_TASK) == 0) ret = SUCCESS; - else if (target->transport_offline) + else if (target->rport->state == SRP_RPORT_LOST) ret = FAST_IO_FAIL; else ret = FAILED; @@ -1779,10 +2397,9 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) if (target->tsk_mgmt_status) return FAILED; - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + for (i = 0; i < target->req_ring_size; ++i) { struct srp_request *req = &target->req_ring[i]; - if (req->scmnd && req->scmnd->device == scmnd->device) - srp_reset_req(target, req); + srp_finish_req(target, req, scmnd->device, DID_RESET << 16); } return SUCCESS; @@ -1791,14 +2408,10 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) static int srp_reset_host(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); - int ret = FAILED; shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); - if (!srp_reconnect_target(target)) - ret = SUCCESS; - - return ret; + return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED; } static int srp_slave_configure(struct scsi_device *sdev) @@ -1851,6 +2464,14 @@ static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, return sprintf(buf, "0x%04x\n", be16_to_cpu(target->path.pkey)); } +static ssize_t show_sgid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%pI6\n", target->path.sgid.raw); +} + static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1907,6 +2528,14 @@ static ssize_t show_comp_vector(struct device *dev, return sprintf(buf, "%d\n", target->comp_vector); } +static ssize_t show_tl_retry_count(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->tl_retry_count); +} + static ssize_t show_cmd_sg_entries(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1927,6 +2556,7 @@ static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL); static DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL); static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL); static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); +static DEVICE_ATTR(sgid, S_IRUGO, show_sgid, NULL); static DEVICE_ATTR(dgid, S_IRUGO, show_dgid, NULL); static DEVICE_ATTR(orig_dgid, S_IRUGO, show_orig_dgid, NULL); static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); @@ -1934,6 +2564,7 @@ static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL); +static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL); static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); static DEVICE_ATTR(allow_ext_sg, S_IRUGO, show_allow_ext_sg, NULL); @@ -1942,6 +2573,7 @@ static struct device_attribute *srp_host_attrs[] = { &dev_attr_ioc_guid, &dev_attr_service_id, &dev_attr_pkey, + &dev_attr_sgid, &dev_attr_dgid, &dev_attr_orig_dgid, &dev_attr_req_lim, @@ -1949,6 +2581,7 @@ static struct device_attribute *srp_host_attrs[] = { &dev_attr_local_ib_port, &dev_attr_local_ib_device, &dev_attr_comp_vector, + &dev_attr_tl_retry_count, &dev_attr_cmd_sg_entries, &dev_attr_allow_ext_sg, NULL @@ -1961,14 +2594,16 @@ static struct scsi_host_template srp_template = { .slave_configure = srp_slave_configure, .info = srp_target_info, .queuecommand = srp_queuecommand, + .change_queue_depth = srp_change_queue_depth, + .change_queue_type = srp_change_queue_type, .eh_abort_handler = srp_abort, .eh_device_reset_handler = srp_reset_device, .eh_host_reset_handler = srp_reset_host, .skip_settle_delay = true, .sg_tablesize = SRP_DEF_SG_TABLESIZE, - .can_queue = SRP_CMD_SQ_SIZE, + .can_queue = SRP_DEFAULT_CMD_SQ_SIZE, .this_id = -1, - .cmd_per_lun = SRP_CMD_SQ_SIZE, + .cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE, .use_clustering = ENABLE_CLUSTERING, .shost_attrs = srp_host_attrs }; @@ -1994,6 +2629,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) } rport->lld_data = target; + target->rport = rport; spin_lock(&host->target_lock); list_add_tail(&target->list, &host->target_list); @@ -2022,6 +2658,8 @@ static struct class srp_class = { /** * srp_conn_unique() - check whether the connection to a target is unique + * @host: SRP host. + * @target: SRP target port. */ static bool srp_conn_unique(struct srp_host *host, struct srp_target_port *target) @@ -2073,6 +2711,8 @@ enum { SRP_OPT_ALLOW_EXT_SG = 1 << 10, SRP_OPT_SG_TABLESIZE = 1 << 11, SRP_OPT_COMP_VECTOR = 1 << 12, + SRP_OPT_TL_RETRY_COUNT = 1 << 13, + SRP_OPT_QUEUE_SIZE = 1 << 14, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -2094,6 +2734,8 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_ALLOW_EXT_SG, "allow_ext_sg=%u" }, { SRP_OPT_SG_TABLESIZE, "sg_tablesize=%u" }, { SRP_OPT_COMP_VECTOR, "comp_vector=%u" }, + { SRP_OPT_TL_RETRY_COUNT, "tl_retry_count=%u" }, + { SRP_OPT_QUEUE_SIZE, "queue_size=%d" }, { SRP_OPT_ERR, NULL } }; @@ -2188,13 +2830,25 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->scsi_host->max_sectors = token; break; + case SRP_OPT_QUEUE_SIZE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad queue_size parameter '%s'\n", p); + goto out; + } + target->scsi_host->can_queue = token; + target->queue_size = token + SRP_RSP_SQ_SIZE + + SRP_TSK_MGMT_SQ_SIZE; + if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + target->scsi_host->cmd_per_lun = token; + break; + case SRP_OPT_MAX_CMD_PER_LUN: - if (match_int(args, &token)) { + if (match_int(args, &token) || token < 1) { pr_warn("bad max cmd_per_lun parameter '%s'\n", p); goto out; } - target->scsi_host->cmd_per_lun = min(token, SRP_CMD_SQ_SIZE); + target->scsi_host->cmd_per_lun = token; break; case SRP_OPT_IO_CLASS: @@ -2257,6 +2911,15 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->comp_vector = token; break; + case SRP_OPT_TL_RETRY_COUNT: + if (match_int(args, &token) || token < 2 || token > 7) { + pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n", + p); + goto out; + } + target->tl_retry_count = token; + break; + default: pr_warn("unknown parameter or missing value '%s' in target creation request\n", p); @@ -2273,6 +2936,12 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) pr_warn("target creation request is missing parameter '%s'\n", srp_opt_tokens[i].pattern); + if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue + && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + pr_warn("cmd_per_lun = %d > queue_size = %d\n", + target->scsi_host->cmd_per_lun, + target->scsi_host->can_queue); + out: kfree(options); return ret; @@ -2286,9 +2955,9 @@ static ssize_t srp_create_target(struct device *dev, container_of(dev, struct srp_host, dev); struct Scsi_Host *target_host; struct srp_target_port *target; - struct ib_device *ibdev = host->srp_dev->dev; - dma_addr_t dma_addr; - int i, ret; + struct srp_device *srp_dev = host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + int ret; target_host = scsi_host_alloc(&srp_template, sizeof (struct srp_target_port)); @@ -2311,11 +2980,17 @@ static ssize_t srp_create_target(struct device *dev, target->cmd_sg_cnt = cmd_sg_entries; target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; target->allow_ext_sg = allow_ext_sg; + target->tl_retry_count = 7; + target->queue_size = SRP_DEFAULT_QUEUE_SIZE; + + mutex_lock(&host->add_target_mutex); ret = srp_parse_options(buf, target); if (ret) goto err; + target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE; + if (!srp_conn_unique(target->srp_host, target)) { shost_printk(KERN_INFO, target->scsi_host, PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", @@ -2326,9 +3001,9 @@ static ssize_t srp_create_target(struct device *dev, goto err; } - if (!host->srp_dev->fmr_pool && !target->allow_ext_sg && - target->cmd_sg_cnt < target->sg_tablesize) { - pr_warn("No FMR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); + if (!srp_dev->has_fmr && !srp_dev->has_fr && !target->allow_ext_sg && + target->cmd_sg_cnt < target->sg_tablesize) { + pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); target->sg_tablesize = target->cmd_sg_cnt; } @@ -2339,42 +3014,17 @@ static ssize_t srp_create_target(struct device *dev, sizeof (struct srp_indirect_buf) + target->cmd_sg_cnt * sizeof (struct srp_direct_buf); + INIT_WORK(&target->tl_err_work, srp_tl_err_work); INIT_WORK(&target->remove_work, srp_remove_work); spin_lock_init(&target->lock); INIT_LIST_HEAD(&target->free_tx); - INIT_LIST_HEAD(&target->free_reqs); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { - struct srp_request *req = &target->req_ring[i]; - - req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof (void *), - GFP_KERNEL); - req->map_page = kmalloc(SRP_FMR_SIZE * sizeof (void *), - GFP_KERNEL); - req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); - if (!req->fmr_list || !req->map_page || !req->indirect_desc) - goto err_free_mem; - - dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, - target->indirect_size, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(ibdev, dma_addr)) - goto err_free_mem; - - req->indirect_dma_addr = dma_addr; - req->index = i; - list_add_tail(&req->list, &target->free_reqs); - } - - ib_query_gid(ibdev, host->port, 0, &target->path.sgid); + ret = srp_alloc_req_data(target); + if (ret) + goto err_free_mem; - shost_printk(KERN_DEBUG, target->scsi_host, PFX - "new target: id_ext %016llx ioc_guid %016llx pkey %04x " - "service_id %016llx dgid %pI6\n", - (unsigned long long) be64_to_cpu(target->id_ext), - (unsigned long long) be64_to_cpu(target->ioc_guid), - be16_to_cpu(target->path.pkey), - (unsigned long long) be64_to_cpu(target->service_id), - target->path.dgid.raw); + ret = ib_query_gid(ibdev, host->port, 0, &target->path.sgid); + if (ret) + goto err_free_mem; ret = srp_create_target_ib(target); if (ret) @@ -2395,7 +3045,19 @@ static ssize_t srp_create_target(struct device *dev, if (ret) goto err_disconnect; - return count; + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be16_to_cpu(target->path.pkey), + be64_to_cpu(target->service_id), + target->path.sgid.raw, target->path.dgid.raw); + + ret = count; + +out: + mutex_unlock(&host->add_target_mutex); + return ret; err_disconnect: srp_disconnect_target(target); @@ -2411,8 +3073,7 @@ err_free_mem: err: scsi_host_put(target_host); - - return ret; + goto out; } static DEVICE_ATTR(add_target, S_IWUSR, NULL, srp_create_target); @@ -2448,6 +3109,7 @@ static struct srp_host *srp_add_port(struct srp_device *device, u8 port) INIT_LIST_HEAD(&host->target_list); spin_lock_init(&host->target_lock); init_completion(&host->released); + mutex_init(&host->add_target_mutex); host->srp_dev = device; host->port = port; @@ -2479,9 +3141,9 @@ static void srp_add_one(struct ib_device *device) { struct srp_device *srp_dev; struct ib_device_attr *dev_attr; - struct ib_fmr_pool_param fmr_param; struct srp_host *host; - int max_pages_per_fmr, fmr_page_shift, s, e, p; + int mr_page_shift, s, e, p; + u64 max_pages_per_mr; dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); if (!dev_attr) @@ -2496,15 +3158,39 @@ static void srp_add_one(struct ib_device *device) if (!srp_dev) goto free_attr; + srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + srp_dev->has_fr = (dev_attr->device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (!srp_dev->has_fmr && !srp_dev->has_fr) + dev_warn(&device->dev, "neither FMR nor FR is supported\n"); + + srp_dev->use_fast_reg = (srp_dev->has_fr && + (!srp_dev->has_fmr || prefer_fr)); + /* * Use the smallest page size supported by the HCA, down to a * minimum of 4096 bytes. We're unlikely to build large sglists * out of smaller entries. */ - fmr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); - srp_dev->fmr_page_size = 1 << fmr_page_shift; - srp_dev->fmr_page_mask = ~((u64) srp_dev->fmr_page_size - 1); - srp_dev->fmr_max_size = srp_dev->fmr_page_size * SRP_FMR_SIZE; + mr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); + srp_dev->mr_page_size = 1 << mr_page_shift; + srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); + max_pages_per_mr = dev_attr->max_mr_size; + do_div(max_pages_per_mr, srp_dev->mr_page_size); + srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, + max_pages_per_mr); + if (srp_dev->use_fast_reg) { + srp_dev->max_pages_per_mr = + min_t(u32, srp_dev->max_pages_per_mr, + dev_attr->max_fast_reg_page_list_len); + } + srp_dev->mr_max_size = srp_dev->mr_page_size * + srp_dev->max_pages_per_mr; + pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", + device->name, mr_page_shift, dev_attr->max_mr_size, + dev_attr->max_fast_reg_page_list_len, + srp_dev->max_pages_per_mr, srp_dev->mr_max_size); INIT_LIST_HEAD(&srp_dev->dev_list); @@ -2520,27 +3206,6 @@ static void srp_add_one(struct ib_device *device) if (IS_ERR(srp_dev->mr)) goto err_pd; - for (max_pages_per_fmr = SRP_FMR_SIZE; - max_pages_per_fmr >= SRP_FMR_MIN_SIZE; - max_pages_per_fmr /= 2, srp_dev->fmr_max_size /= 2) { - memset(&fmr_param, 0, sizeof fmr_param); - fmr_param.pool_size = SRP_FMR_POOL_SIZE; - fmr_param.dirty_watermark = SRP_FMR_DIRTY_SIZE; - fmr_param.cache = 1; - fmr_param.max_pages_per_fmr = max_pages_per_fmr; - fmr_param.page_shift = fmr_page_shift; - fmr_param.access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); - - srp_dev->fmr_pool = ib_create_fmr_pool(srp_dev->pd, &fmr_param); - if (!IS_ERR(srp_dev->fmr_pool)) - break; - } - - if (IS_ERR(srp_dev->fmr_pool)) - srp_dev->fmr_pool = NULL; - if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; @@ -2603,8 +3268,6 @@ static void srp_remove_one(struct ib_device *device) kfree(host); } - if (srp_dev->fmr_pool) - ib_destroy_fmr_pool(srp_dev->fmr_pool); ib_dereg_mr(srp_dev->mr); ib_dealloc_pd(srp_dev->pd); @@ -2612,7 +3275,14 @@ static void srp_remove_one(struct ib_device *device) } static struct srp_function_template ib_srp_transport_functions = { + .has_rport_state = true, + .reset_timer_if_blocked = true, + .reconnect_delay = &srp_reconnect_delay, + .fast_io_fail_tmo = &srp_fast_io_fail_tmo, + .dev_loss_tmo = &srp_dev_loss_tmo, + .reconnect = srp_rport_reconnect, .rport_delete = srp_rport_delete, + .terminate_rport_io = srp_terminate_io, }; static int __init srp_init_module(void) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index e641088c14d..e46ecb15aa0 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -57,25 +57,19 @@ enum { SRP_MAX_LUN = 512, SRP_DEF_SG_TABLESIZE = 12, - SRP_RQ_SHIFT = 6, - SRP_RQ_SIZE = 1 << SRP_RQ_SHIFT, - - SRP_SQ_SIZE = SRP_RQ_SIZE, + SRP_DEFAULT_QUEUE_SIZE = 1 << 6, SRP_RSP_SQ_SIZE = 1, - SRP_REQ_SQ_SIZE = SRP_SQ_SIZE - SRP_RSP_SQ_SIZE, SRP_TSK_MGMT_SQ_SIZE = 1, - SRP_CMD_SQ_SIZE = SRP_REQ_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE, + SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE - + SRP_TSK_MGMT_SQ_SIZE, SRP_TAG_NO_REQ = ~0U, SRP_TAG_TSK_MGMT = 1U << 31, - SRP_FMR_SIZE = 512, - SRP_FMR_MIN_SIZE = 128, - SRP_FMR_POOL_SIZE = 1024, - SRP_FMR_DIRTY_SIZE = SRP_FMR_POOL_SIZE / 4, + SRP_MAX_PAGES_PER_MR = 512, - SRP_MAP_ALLOW_FMR = 0, - SRP_MAP_NO_FMR = 1, + LOCAL_INV_WR_ID_MASK = 1, + FAST_REG_WR_ID_MASK = 2, }; enum srp_target_state { @@ -89,15 +83,24 @@ enum srp_iu_type { SRP_IU_RSP, }; +/* + * @mr_page_mask: HCA memory registration page mask. + * @mr_page_size: HCA memory registration page size. + * @mr_max_size: Maximum size in bytes of a single FMR / FR registration + * request. + */ struct srp_device { struct list_head dev_list; struct ib_device *dev; struct ib_pd *pd; struct ib_mr *mr; - struct ib_fmr_pool *fmr_pool; - u64 fmr_page_mask; - int fmr_page_size; - int fmr_max_size; + u64 mr_page_mask; + int mr_page_size; + int mr_max_size; + int max_pages_per_mr; + bool has_fmr; + bool has_fr; + bool use_fast_reg; }; struct srp_host { @@ -108,17 +111,21 @@ struct srp_host { spinlock_t target_lock; struct completion released; struct list_head list; + struct mutex add_target_mutex; }; struct srp_request { struct list_head list; struct scsi_cmnd *scmnd; struct srp_iu *cmd; - struct ib_pool_fmr **fmr_list; + union { + struct ib_pool_fmr **fmr_list; + struct srp_fr_desc **fr_list; + }; u64 *map_page; struct srp_direct_buf *indirect_desc; dma_addr_t indirect_dma_addr; - short nfmr; + short nmdesc; short index; }; @@ -133,6 +140,10 @@ struct srp_target_port { struct ib_cq *send_cq ____cacheline_aligned_in_smp; struct ib_cq *recv_cq; struct ib_qp *qp; + union { + struct ib_fmr_pool *fmr_pool; + struct srp_fr_pool *fr_pool; + }; u32 lkey; u32 rkey; enum srp_target_state state; @@ -140,7 +151,6 @@ struct srp_target_port { unsigned int cmd_sg_cnt; unsigned int indirect_size; bool allow_ext_sg; - bool transport_offline; /* Everything above this point is used in the hot path of * command processing. Try to keep them packed into cachelines. @@ -153,10 +163,14 @@ struct srp_target_port { u16 io_class; struct srp_host *srp_host; struct Scsi_Host *scsi_host; + struct srp_rport *rport; char target_name[32]; unsigned int scsi_id; unsigned int sg_tablesize; + int queue_size; + int req_ring_size; int comp_vector; + int tl_retry_count; struct ib_sa_path_rec path; __be16 orig_dgid[8]; @@ -172,10 +186,11 @@ struct srp_target_port { int zero_req_lim; - struct srp_iu *tx_ring[SRP_SQ_SIZE]; - struct srp_iu *rx_ring[SRP_RQ_SIZE]; - struct srp_request req_ring[SRP_CMD_SQ_SIZE]; + struct srp_iu **tx_ring; + struct srp_iu **rx_ring; + struct srp_request *req_ring; + struct work_struct tl_err_work; struct work_struct remove_work; struct list_head list; @@ -195,15 +210,66 @@ struct srp_iu { enum dma_data_direction direction; }; +/** + * struct srp_fr_desc - fast registration work request arguments + * @entry: Entry in srp_fr_pool.free_list. + * @mr: Memory region. + * @frpl: Fast registration page list. + */ +struct srp_fr_desc { + struct list_head entry; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; +}; + +/** + * struct srp_fr_pool - pool of fast registration descriptors + * + * An entry is available for allocation if and only if it occurs in @free_list. + * + * @size: Number of descriptors in this pool. + * @max_page_list_len: Maximum fast registration work request page list length. + * @lock: Protects free_list. + * @free_list: List of free descriptors. + * @desc: Fast registration descriptor pool. + */ +struct srp_fr_pool { + int size; + int max_page_list_len; + spinlock_t lock; + struct list_head free_list; + struct srp_fr_desc desc[0]; +}; + +/** + * struct srp_map_state - per-request DMA memory mapping state + * @desc: Pointer to the element of the SRP buffer descriptor array + * that is being filled in. + * @pages: Array with DMA addresses of pages being considered for + * memory registration. + * @base_dma_addr: DMA address of the first page that has not yet been mapped. + * @dma_len: Number of bytes that will be registered with the next + * FMR or FR memory registration call. + * @total_len: Total number of bytes in the sg-list being mapped. + * @npages: Number of page addresses in the pages[] array. + * @nmdesc: Number of FMR or FR memory descriptors used for mapping. + * @ndesc: Number of SRP buffer descriptors that have been filled in. + * @unmapped_sg: First element of the sg-list that is mapped via FMR or FR. + * @unmapped_index: Index of the first element mapped via FMR or FR. + * @unmapped_addr: DMA address of the first element mapped via FMR or FR. + */ struct srp_map_state { - struct ib_pool_fmr **next_fmr; + union { + struct ib_pool_fmr **next_fmr; + struct srp_fr_desc **next_fr; + }; struct srp_direct_buf *desc; u64 *pages; dma_addr_t base_dma_addr; - u32 fmr_len; + u32 dma_len; u32 total_len; unsigned int npages; - unsigned int nfmr; + unsigned int nmdesc; unsigned int ndesc; struct scatterlist *unmapped_sg; int unmapped_index; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 653ac6bfc57..fe09f2788b1 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1078,6 +1078,7 @@ static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch, static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, struct srpt_send_ioctx *ioctx) { + struct ib_device *dev = ch->sport->sdev->device; struct se_cmd *cmd; struct scatterlist *sg, *sg_orig; int sg_cnt; @@ -1124,7 +1125,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, db = ioctx->rbufs; tsize = cmd->data_length; - dma_len = sg_dma_len(&sg[0]); + dma_len = ib_sg_dma_len(dev, &sg[0]); riu = ioctx->rdma_ius; /* @@ -1155,7 +1156,8 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, ++j; if (j < count) { sg = sg_next(sg); - dma_len = sg_dma_len(sg); + dma_len = ib_sg_dma_len( + dev, sg); } } } else { @@ -1192,8 +1194,8 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, tsize = cmd->data_length; riu = ioctx->rdma_ius; sg = sg_orig; - dma_len = sg_dma_len(&sg[0]); - dma_addr = sg_dma_address(&sg[0]); + dma_len = ib_sg_dma_len(dev, &sg[0]); + dma_addr = ib_sg_dma_address(dev, &sg[0]); /* this second loop is really mapped sg_addres to rdma_iu->ib_sge */ for (i = 0, j = 0; @@ -1216,8 +1218,10 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, ++j; if (j < count) { sg = sg_next(sg); - dma_len = sg_dma_len(sg); - dma_addr = sg_dma_address(sg); + dma_len = ib_sg_dma_len( + dev, sg); + dma_addr = ib_sg_dma_address( + dev, sg); } } } else { @@ -1352,11 +1356,8 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) /* XXX(hch): this is a horrible layering violation.. */ spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); - ioctx->cmd.transport_state |= CMD_T_LUN_STOP; ioctx->cmd.transport_state &= ~CMD_T_ACTIVE; spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); - - complete(&ioctx->cmd.transport_lun_stop_comp); break; case SRPT_STATE_CMD_RSP_SENT: /* @@ -1364,9 +1365,6 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) * not been received in time. */ srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); - spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); - ioctx->cmd.transport_state |= CMD_T_LUN_STOP; - spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); break; case SRPT_STATE_MGMT_RSP_SENT: @@ -1476,7 +1474,6 @@ static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, { struct se_cmd *cmd; enum srpt_command_state state; - unsigned long flags; cmd = &ioctx->cmd; state = srpt_get_cmd_state(ioctx); @@ -1496,9 +1493,6 @@ static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, __func__, __LINE__, state); break; case SRPT_RDMA_WRITE_LAST: - spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); - ioctx->cmd.transport_state |= CMD_T_LUN_STOP; - spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); break; default: printk(KERN_ERR "%s[%d]: opcode = %u\n", __func__, @@ -1588,7 +1582,7 @@ static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, int resp_data_len; int resp_len; - resp_data_len = (rsp_code == SRP_TSK_MGMT_SUCCESS) ? 0 : 4; + resp_data_len = 4; resp_len = sizeof(*srp_rsp) + resp_data_len; srp_rsp = ioctx->ioctx.buf; @@ -1600,11 +1594,9 @@ static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, + atomic_xchg(&ch->req_lim_delta, 0)); srp_rsp->tag = tag; - if (rsp_code != SRP_TSK_MGMT_SUCCESS) { - srp_rsp->flags |= SRP_RSP_FLAG_RSPVALID; - srp_rsp->resp_data_len = cpu_to_be32(resp_data_len); - srp_rsp->data[3] = rsp_code; - } + srp_rsp->flags |= SRP_RSP_FLAG_RSPVALID; + srp_rsp->resp_data_len = cpu_to_be32(resp_data_len); + srp_rsp->data[3] = rsp_code; return resp_len; } @@ -2358,6 +2350,8 @@ static void srpt_release_channel_work(struct work_struct *w) transport_deregister_session(se_sess); ch->sess = NULL; + ib_destroy_cm_id(ch->cm_id); + srpt_destroy_ch_ib(ch); srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, @@ -2368,8 +2362,6 @@ static void srpt_release_channel_work(struct work_struct *w) list_del(&ch->list); spin_unlock_irq(&sdev->spinlock); - ib_destroy_cm_id(ch->cm_id); - if (ch->release_done) complete(ch->release_done); @@ -2592,7 +2584,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto destroy_ib; } - ch->sess = transport_init_session(); + ch->sess = transport_init_session(TARGET_PROT_NORMAL); if (IS_ERR(ch->sess)) { rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); @@ -3093,6 +3085,14 @@ static void srpt_queue_tm_rsp(struct se_cmd *cmd) srpt_queue_response(cmd); } +static void srpt_aborted_task(struct se_cmd *cmd) +{ + struct srpt_send_ioctx *ioctx = container_of(cmd, + struct srpt_send_ioctx, cmd); + + srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); +} + static int srpt_queue_status(struct se_cmd *cmd) { struct srpt_send_ioctx *ioctx; @@ -3678,9 +3678,9 @@ static ssize_t srpt_tpg_attrib_store_srp_max_rdma_size( unsigned long val; int ret; - ret = strict_strtoul(page, 0, &val); + ret = kstrtoul(page, 0, &val); if (ret < 0) { - pr_err("strict_strtoul() failed with ret: %d\n", ret); + pr_err("kstrtoul() failed with ret: %d\n", ret); return -EINVAL; } if (val > MAX_SRPT_RDMA_SIZE) { @@ -3718,9 +3718,9 @@ static ssize_t srpt_tpg_attrib_store_srp_max_rsp_size( unsigned long val; int ret; - ret = strict_strtoul(page, 0, &val); + ret = kstrtoul(page, 0, &val); if (ret < 0) { - pr_err("strict_strtoul() failed with ret: %d\n", ret); + pr_err("kstrtoul() failed with ret: %d\n", ret); return -EINVAL; } if (val > MAX_SRPT_RSP_SIZE) { @@ -3758,9 +3758,9 @@ static ssize_t srpt_tpg_attrib_store_srp_sq_size( unsigned long val; int ret; - ret = strict_strtoul(page, 0, &val); + ret = kstrtoul(page, 0, &val); if (ret < 0) { - pr_err("strict_strtoul() failed with ret: %d\n", ret); + pr_err("kstrtoul() failed with ret: %d\n", ret); return -EINVAL; } if (val > MAX_SRPT_SRQ_SIZE) { @@ -3805,7 +3805,7 @@ static ssize_t srpt_tpg_store_enable( unsigned long tmp; int ret; - ret = strict_strtoul(page, 0, &tmp); + ret = kstrtoul(page, 0, &tmp); if (ret < 0) { printk(KERN_ERR "Unable to extract srpt_tpg_store_enable\n"); return -EINVAL; @@ -3940,6 +3940,7 @@ static struct target_core_fabric_ops srpt_template = { .queue_data_in = srpt_queue_data_in, .queue_status = srpt_queue_status, .queue_tm_rsp = srpt_queue_tm_rsp, + .aborted_task = srpt_aborted_task, /* * Setup function pointers for generic logic in * target_core_fabric_configfs.c |
