Diffstat (limited to 'drivers/infiniband/hw')
126 files changed, 12688 insertions, 2442 deletions
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile new file mode 100644 index 00000000000..e900b03531a --- /dev/null +++ b/drivers/infiniband/hw/Makefile @@ -0,0 +1,12 @@ +obj-$(CONFIG_INFINIBAND_MTHCA)		+= mthca/ +obj-$(CONFIG_INFINIBAND_IPATH)		+= ipath/ +obj-$(CONFIG_INFINIBAND_QIB)		+= qib/ +obj-$(CONFIG_INFINIBAND_EHCA)		+= ehca/ +obj-$(CONFIG_INFINIBAND_AMSO1100)	+= amso1100/ +obj-$(CONFIG_INFINIBAND_CXGB3)		+= cxgb3/ +obj-$(CONFIG_INFINIBAND_CXGB4)		+= cxgb4/ +obj-$(CONFIG_MLX4_INFINIBAND)		+= mlx4/ +obj-$(CONFIG_MLX5_INFINIBAND)		+= mlx5/ +obj-$(CONFIG_INFINIBAND_NES)		+= nes/ +obj-$(CONFIG_INFINIBAND_OCRDMA)		+= ocrdma/ +obj-$(CONFIG_INFINIBAND_USNIC)		+= usnic/ diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c index d53cf519f42..00400c352c1 100644 --- a/drivers/infiniband/hw/amso1100/c2.c +++ b/drivers/infiniband/hw/amso1100/c2.c @@ -1082,6 +1082,7 @@ static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)  	/* Initialize network device */  	if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) { +		ret = -ENOMEM;  		iounmap(mmio_regs);  		goto bail4;  	} @@ -1151,7 +1152,8 @@ static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)  		goto bail10;  	} -	if (c2_register_device(c2dev)) +	ret = c2_register_device(c2dev); +	if (ret)  		goto bail10;  	return 0; diff --git a/drivers/infiniband/hw/amso1100/c2_ae.c b/drivers/infiniband/hw/amso1100/c2_ae.c index d5d1929753e..cedda25232b 100644 --- a/drivers/infiniband/hw/amso1100/c2_ae.c +++ b/drivers/infiniband/hw/amso1100/c2_ae.c @@ -141,7 +141,7 @@ static const char *to_qp_state_str(int state)  		return "C2_QP_STATE_ERROR";  	default:  		return "<invalid QP state>"; -	}; +	}  }  void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) diff --git a/drivers/infiniband/hw/amso1100/c2_intr.c b/drivers/infiniband/hw/amso1100/c2_intr.c index 8951db4ae29..3a17d9b36db 100644 --- a/drivers/infiniband/hw/amso1100/c2_intr.c +++ b/drivers/infiniband/hw/amso1100/c2_intr.c @@ -169,7 +169,8 @@ static void handle_vq(struct c2_dev *c2dev, u32 mq_index)  		 * We should never get here, as the adapter should  		 * never send us a reply that we're not expecting.  		 
*/ -		vq_repbuf_free(c2dev, host_msg); +		if (reply_msg != NULL) +			vq_repbuf_free(c2dev, host_msg);  		pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n");  		return;  	} diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index 07eb3a8067d..8af33cf1fc4 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -431,9 +431,9 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	u64 *pages;  	u64 kva = 0;  	int shift, n, len; -	int i, j, k; +	int i, k, entry;  	int err = 0; -	struct ib_umem_chunk *chunk; +	struct scatterlist *sg;  	struct c2_pd *c2pd = to_c2pd(pd);  	struct c2_mr *c2mr; @@ -452,10 +452,7 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	}  	shift = ffs(c2mr->umem->page_size) - 1; - -	n = 0; -	list_for_each_entry(chunk, &c2mr->umem->chunk_list, list) -		n += chunk->nents; +	n = c2mr->umem->nmap;  	pages = kmalloc(n * sizeof(u64), GFP_KERNEL);  	if (!pages) { @@ -464,14 +461,12 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	}  	i = 0; -	list_for_each_entry(chunk, &c2mr->umem->chunk_list, list) { -		for (j = 0; j < chunk->nmap; ++j) { -			len = sg_dma_len(&chunk->page_list[j]) >> shift; -			for (k = 0; k < len; ++k) { -				pages[i++] = -					sg_dma_address(&chunk->page_list[j]) + -					(c2mr->umem->page_size * k); -			} +	for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) { +		len = sg_dma_len(sg) >> shift; +		for (k = 0; k < len; ++k) { +			pages[i++] = +				sg_dma_address(sg) + +				(c2mr->umem->page_size * k);  		}  	} diff --git a/drivers/infiniband/hw/amso1100/c2_rnic.c b/drivers/infiniband/hw/amso1100/c2_rnic.c index b7c98699005..d2a6d961344 100644 --- a/drivers/infiniband/hw/amso1100/c2_rnic.c +++ b/drivers/infiniband/hw/amso1100/c2_rnic.c @@ -576,7 +576,8 @@ int c2_rnic_init(struct c2_dev *c2dev)  		goto bail4;  	/* Initialize cached the adapter limits */ -	if (c2_rnic_query(c2dev, &c2dev->props)) +	err = c2_rnic_query(c2dev, &c2dev->props); +	if (err)  		goto bail5;  	/* Initialize the PD pool */ diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index c3f5aca4ef0..de1c61b417d 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -735,14 +735,12 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry,  			((perm & TPT_MW_BIND) ? F_TPT_MW_BIND_ENABLE : 0) |  			V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) |  			V_TPT_PAGE_SIZE(page_size)); -		tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : -				    cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); +		tpt.rsvd_pbl_addr = cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3));  		tpt.len = cpu_to_be32(len);  		tpt.va_hi = cpu_to_be32((u32) (to >> 32));  		tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL));  		tpt.rsvd_bind_cnt_or_pstag = 0; -		tpt.rsvd_pbl_size = reset_tpt_entry ? 
0 : -				  cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); +		tpt.rsvd_pbl_size = cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2));  	}  	err = cxio_hal_ctrl_qp_write_mem(rdev_p,  				       stag_idx + diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 095bb046e2c..cb78b1e9bcd 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -418,6 +418,7 @@ static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp)  	skb->priority = CPL_PRIORITY_DATA;  	set_arp_failure_handler(skb, abort_arp_failure);  	req = (struct cpl_abort_req *) skb_put(skb, sizeof(*req)); +	memset(req, 0, sizeof(*req));  	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));  	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));  	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index d2283837d45..811b24a539c 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -618,14 +618,13 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  {  	__be64 *pages;  	int shift, n, len; -	int i, j, k; +	int i, k, entry;  	int err = 0; -	struct ib_umem_chunk *chunk;  	struct iwch_dev *rhp;  	struct iwch_pd *php;  	struct iwch_mr *mhp;  	struct iwch_reg_user_mr_resp uresp; - +	struct scatterlist *sg;  	PDBG("%s ib_pd %p\n", __func__, pd);  	php = to_iwch_pd(pd); @@ -645,9 +644,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	shift = ffs(mhp->umem->page_size) - 1; -	n = 0; -	list_for_each_entry(chunk, &mhp->umem->chunk_list, list) -		n += chunk->nents; +	n = mhp->umem->nmap;  	err = iwch_alloc_pbl(mhp, n);  	if (err) @@ -661,12 +658,10 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	i = n = 0; -	list_for_each_entry(chunk, &mhp->umem->chunk_list, list) -		for (j = 0; j < chunk->nmap; ++j) { -			len = sg_dma_len(&chunk->page_list[j]) >> shift; +	for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { +			len = sg_dma_len(sg) >> shift;  			for (k = 0; k < len; ++k) { -				pages[i++] = cpu_to_be64(sg_dma_address( -					&chunk->page_list[j]) + +				pages[i++] = cpu_to_be64(sg_dma_address(sg) +  					mhp->umem->page_size * k);  				if (i == PAGE_SIZE / sizeof *pages) {  					err = iwch_write_pbl(mhp, pages, i, n); @@ -676,7 +671,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  					i = 0;  				}  			} -		} +	}  	if (i)  		err = iwch_write_pbl(mhp, pages, i, n); diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig index d4e8983fba5..23f38cf2c5c 100644 --- a/drivers/infiniband/hw/cxgb4/Kconfig +++ b/drivers/infiniband/hw/cxgb4/Kconfig @@ -1,10 +1,10 @@  config INFINIBAND_CXGB4 -	tristate "Chelsio T4 RDMA Driver" +	tristate "Chelsio T4/T5 RDMA Driver"  	depends on CHELSIO_T4 && INET && (IPV6 || IPV6=n)  	select GENERIC_ALLOCATOR  	---help--- -	  This is an iWARP/RDMA driver for the Chelsio T4 1GbE and -	  10GbE adapters. +	  This is an iWARP/RDMA driver for the Chelsio T4 and T5 +	  1GbE, 10GbE adapters and T5 40GbE adapter.  	  For general information about Chelsio and our products, visit  	  our website at <http://www.chelsio.com>. 
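The amso1100 (c2_reg_user_mr) and cxgb3 (iwch_reg_user_mr) hunks above both replace the old ib_umem chunk_list walk with a single for_each_sg() pass over umem->sg_head.sgl, bounded by umem->nmap. A minimal sketch of that access pattern follows, assuming the post-refactor struct ib_umem layout (sg_head/nmap/page_size); the helper name build_page_list() is illustrative only and does not exist in either driver, and unlike the hunks above it pre-counts the pages instead of sizing the array by nmap.

/*
 * Sketch only: build a flat array of per-page DMA addresses from a
 * registered ib_umem by walking its scatterlist.  Assumes
 * <rdma/ib_umem.h>, <linux/scatterlist.h> and <linux/slab.h>.
 */
static u64 *build_page_list(struct ib_umem *umem, int *npages)
{
	struct scatterlist *sg;
	u64 *pages;
	int shift = ffs(umem->page_size) - 1;	/* log2 of umem page size */
	int total = 0, i = 0, len, k, entry;

	/* First pass: count the pages covered by the mapped entries. */
	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry)
		total += sg_dma_len(sg) >> shift;

	pages = kmalloc(total * sizeof(u64), GFP_KERNEL);
	if (!pages)
		return NULL;

	/* Second pass: emit one DMA address per page, as the hunks above do. */
	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
		len = sg_dma_len(sg) >> shift;
		for (k = 0; k < len; ++k)
			pages[i++] = sg_dma_address(sg) +
				     umem->page_size * k;
	}

	*npages = i;
	return pages;
}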
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 12fef76c791..768a0fb67dd 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1,5 +1,5 @@  /* - * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * Copyright (c) 2009-2014 Chelsio, Inc. All rights reserved.   *   * This software is available to you under a choice of one of two   * licenses.  You may choose to be licensed under the terms of the GNU @@ -47,6 +47,8 @@  #include <net/ip6_route.h>  #include <net/addrconf.h> +#include <rdma/ib_addr.h> +  #include "iw_cxgb4.h"  static char *states[] = { @@ -98,9 +100,9 @@ int c4iw_debug;  module_param(c4iw_debug, int, 0644);  MODULE_PARM_DESC(c4iw_debug, "Enable debug logging (default=0)"); -static int peer2peer; +static int peer2peer = 1;  module_param(peer2peer, int, 0644); -MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)"); +MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=1)");  static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ;  module_param(p2p_type, int, 0644); @@ -173,12 +175,15 @@ static void start_ep_timer(struct c4iw_ep *ep)  	add_timer(&ep->timer);  } -static void stop_ep_timer(struct c4iw_ep *ep) +static int stop_ep_timer(struct c4iw_ep *ep)  {  	PDBG("%s ep %p stopping\n", __func__, ep);  	del_timer_sync(&ep->timer); -	if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) +	if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {  		c4iw_put_ep(&ep->com); +		return 0; +	} +	return 1;  }  static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb, @@ -229,12 +234,16 @@ static void release_tid(struct c4iw_rdev *rdev, u32 hwtid, struct sk_buff *skb)  static void set_emss(struct c4iw_ep *ep, u16 opt)  { -	ep->emss = ep->com.dev->rdev.lldi.mtus[GET_TCPOPT_MSS(opt)] - 40; +	ep->emss = ep->com.dev->rdev.lldi.mtus[GET_TCPOPT_MSS(opt)] - +		   sizeof(struct iphdr) - sizeof(struct tcphdr);  	ep->mss = ep->emss;  	if (GET_TCPOPT_TSTAMP(opt))  		ep->emss -= 12;  	if (ep->emss < 128)  		ep->emss = 128; +	if (ep->emss & 7) +		PDBG("Warning: misaligned mtu idx %u mss %u emss=%u\n", +		     GET_TCPOPT_MSS(opt), ep->mss, ep->emss);  	PDBG("%s mss_idx %u mss %u emss=%u\n", __func__, GET_TCPOPT_MSS(opt),  	     ep->mss, ep->emss);  } @@ -291,6 +300,12 @@ void _c4iw_free_ep(struct kref *kref)  		dst_release(ep->dst);  		cxgb4_l2t_release(ep->l2t);  	} +	if (test_bit(RELEASE_MAPINFO, &ep->com.flags)) { +		print_addr(&ep->com, __func__, "remove_mapinfo/mapping"); +		iwpm_remove_mapinfo(&ep->com.local_addr, +				    &ep->com.mapped_local_addr); +		iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); +	}  	kfree(ep);  } @@ -338,10 +353,7 @@ static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp)  static struct net_device *get_real_dev(struct net_device *egress_dev)  { -	struct net_device *phys_dev = egress_dev; -	if (egress_dev->priv_flags & IFF_802_1Q_VLAN) -		phys_dev = vlan_dev_real_dev(egress_dev); -	return phys_dev; +	return rdma_vlan_dev_real_dev(egress_dev) ? 
: egress_dev;  }  static int our_interface(struct c4iw_dev *dev, struct net_device *egress_dev) @@ -400,7 +412,8 @@ static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip,  	n = dst_neigh_lookup(&rt->dst, &peer_ip);  	if (!n)  		return NULL; -	if (!our_interface(dev, n->dev)) { +	if (!our_interface(dev, n->dev) && +	    !(n->dev->flags & IFF_LOOPBACK)) {  		dst_release(&rt->dst);  		return NULL;  	} @@ -419,8 +432,17 @@ static void arp_failure_discard(void *handle, struct sk_buff *skb)   */  static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)  { +	struct c4iw_ep *ep = handle; +  	printk(KERN_ERR MOD "ARP failure duing connect\n");  	kfree_skb(skb); +	connect_reply_upcall(ep, -EHOSTUNREACH); +	state_set(&ep->com, DEAD); +	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); +	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); +	dst_release(ep->dst); +	cxgb4_l2t_release(ep->l2t); +	c4iw_put_ep(&ep->com);  }  /* @@ -464,7 +486,7 @@ static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)  	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;  	flowc->mnemval[5].val = cpu_to_be32(ep->rcv_seq);  	flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF; -	flowc->mnemval[6].val = cpu_to_be32(snd_win); +	flowc->mnemval[6].val = cpu_to_be32(ep->snd_win);  	flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;  	flowc->mnemval[7].val = cpu_to_be32(ep->emss);  	/* Pad WR to 16 byte boundary */ @@ -524,48 +546,47 @@ static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)  	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);  } -#define VLAN_NONE 0xfff -#define FILTER_SEL_VLAN_NONE 0xffff -#define FILTER_SEL_WIDTH_P_FC (3+1) /* port uses 3 bits, FCoE one bit */ -#define FILTER_SEL_WIDTH_VIN_P_FC \ -	(6 + 7 + FILTER_SEL_WIDTH_P_FC) /* 6 bits are unused, VF uses 7 bits*/ -#define FILTER_SEL_WIDTH_TAG_P_FC \ -	(3 + FILTER_SEL_WIDTH_VIN_P_FC) /* PF uses 3 bits */ -#define FILTER_SEL_WIDTH_VLD_TAG_P_FC (1 + FILTER_SEL_WIDTH_TAG_P_FC) +/* + * c4iw_form_pm_msg - Form a port mapper message with mapping info + */ +static void c4iw_form_pm_msg(struct c4iw_ep *ep, +				struct iwpm_sa_data *pm_msg) +{ +	memcpy(&pm_msg->loc_addr, &ep->com.local_addr, +		sizeof(ep->com.local_addr)); +	memcpy(&pm_msg->rem_addr, &ep->com.remote_addr, +		sizeof(ep->com.remote_addr)); +} -static unsigned int select_ntuple(struct c4iw_dev *dev, struct dst_entry *dst, -				  struct l2t_entry *l2t) +/* + * c4iw_form_reg_msg - Form a port mapper message with dev info + */ +static void c4iw_form_reg_msg(struct c4iw_dev *dev, +				struct iwpm_dev_data *pm_msg)  { -	unsigned int ntuple = 0; -	u32 viid; +	memcpy(pm_msg->dev_name, dev->ibdev.name, IWPM_DEVNAME_SIZE); +	memcpy(pm_msg->if_name, dev->rdev.lldi.ports[0]->name, +				IWPM_IFNAME_SIZE); +} -	switch (dev->rdev.lldi.filt_mode) { +static void c4iw_record_pm_msg(struct c4iw_ep *ep, +			struct iwpm_sa_data *pm_msg) +{ +	memcpy(&ep->com.mapped_local_addr, &pm_msg->mapped_loc_addr, +		sizeof(ep->com.mapped_local_addr)); +	memcpy(&ep->com.mapped_remote_addr, &pm_msg->mapped_rem_addr, +		sizeof(ep->com.mapped_remote_addr)); +} -	/* default filter mode */ -	case HW_TPL_FR_MT_PR_IV_P_FC: -		if (l2t->vlan == VLAN_NONE) -			ntuple |= FILTER_SEL_VLAN_NONE << FILTER_SEL_WIDTH_P_FC; -		else { -			ntuple |= l2t->vlan << FILTER_SEL_WIDTH_P_FC; -			ntuple |= 1 << FILTER_SEL_WIDTH_TAG_P_FC; -		} -		ntuple |= l2t->lport << S_PORT | IPPROTO_TCP << -			  FILTER_SEL_WIDTH_VLD_TAG_P_FC; -		break; -	case HW_TPL_FR_MT_PR_OV_P_FC: { -		viid = 
cxgb4_port_viid(l2t->neigh->dev); - -		ntuple |= FW_VIID_VIN_GET(viid) << FILTER_SEL_WIDTH_P_FC; -		ntuple |= FW_VIID_PFN_GET(viid) << FILTER_SEL_WIDTH_VIN_P_FC; -		ntuple |= FW_VIID_VIVLD_GET(viid) << FILTER_SEL_WIDTH_TAG_P_FC; -		ntuple |= l2t->lport << S_PORT | IPPROTO_TCP << -			  FILTER_SEL_WIDTH_VLD_TAG_P_FC; -		break; -	} -	default: -		break; -	} -	return ntuple; +static void best_mtu(const unsigned short *mtus, unsigned short mtu, +		     unsigned int *idx, int use_ts) +{ +	unsigned short hdr_size = sizeof(struct iphdr) + +				  sizeof(struct tcphdr) + +				  (use_ts ? 12 : 0); +	unsigned short data_size = mtu - hdr_size; + +	cxgb4_best_aligned_mtu(mtus, hdr_size, data_size, 8, idx);  }  static int send_connect(struct c4iw_ep *ep) @@ -586,10 +607,15 @@ static int send_connect(struct c4iw_ep *ep)  	int sizev6 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ?  				sizeof(struct cpl_act_open_req6) :  				sizeof(struct cpl_t5_act_open_req6); -	struct sockaddr_in *la = (struct sockaddr_in *)&ep->com.local_addr; -	struct sockaddr_in *ra = (struct sockaddr_in *)&ep->com.remote_addr; -	struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&ep->com.local_addr; -	struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&ep->com.remote_addr; +	struct sockaddr_in *la = (struct sockaddr_in *) +				 &ep->com.mapped_local_addr; +	struct sockaddr_in *ra = (struct sockaddr_in *) +				 &ep->com.mapped_remote_addr; +	struct sockaddr_in6 *la6 = (struct sockaddr_in6 *) +				   &ep->com.mapped_local_addr; +	struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *) +				   &ep->com.mapped_remote_addr; +	int win;  	wrlen = (ep->com.remote_addr.ss_family == AF_INET) ?  			roundup(sizev4, 16) : @@ -605,8 +631,18 @@ static int send_connect(struct c4iw_ep *ep)  	}  	set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); -	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx); +	best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, +		 enable_tcp_timestamps);  	wscale = compute_wscale(rcv_win); + +	/* +	 * Specify the largest window that will fit in opt0. The +	 * remainder will be specified in the rx_data_ack. +	 */ +	win = ep->rcv_win >> 10; +	if (win > RCV_BUFSIZ_MASK) +		win = RCV_BUFSIZ_MASK; +  	opt0 = (nocong ? 
NO_CONG(1) : 0) |  	       KEEP_ALIVE(1) |  	       DELACK(1) | @@ -617,7 +653,7 @@ static int send_connect(struct c4iw_ep *ep)  	       SMAC_SEL(ep->smac_idx) |  	       DSCP(ep->tos) |  	       ULP_MODE(ULP_MODE_TCPDDP) | -	       RCV_BUFSIZ(rcv_win>>10); +	       RCV_BUFSIZ(win);  	opt2 = RX_CHANNEL(0) |  	       CCTRL_ECN(enable_ecn) |  	       RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid); @@ -627,7 +663,11 @@ static int send_connect(struct c4iw_ep *ep)  		opt2 |= SACK_EN(1);  	if (wscale && enable_tcp_window_scaling)  		opt2 |= WND_SCALE_EN(1); -	t4_set_arp_err_handler(skb, NULL, act_open_req_arp_failure); +	if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { +		opt2 |= T5_OPT_2_VALID; +		opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); +	} +	t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure);  	if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) {  		if (ep->com.remote_addr.ss_family == AF_INET) { @@ -641,8 +681,9 @@ static int send_connect(struct c4iw_ep *ep)  			req->local_ip = la->sin_addr.s_addr;  			req->peer_ip = ra->sin_addr.s_addr;  			req->opt0 = cpu_to_be64(opt0); -			req->params = cpu_to_be32(select_ntuple(ep->com.dev, -						ep->dst, ep->l2t)); +			req->params = cpu_to_be32(cxgb4_select_ntuple( +						ep->com.dev->rdev.lldi.ports[0], +						ep->l2t));  			req->opt2 = cpu_to_be32(opt2);  		} else {  			req6 = (struct cpl_act_open_req6 *)skb_put(skb, wrlen); @@ -662,12 +703,19 @@ static int send_connect(struct c4iw_ep *ep)  			req6->peer_ip_lo = *((__be64 *)  						(ra6->sin6_addr.s6_addr + 8));  			req6->opt0 = cpu_to_be64(opt0); -			req6->params = cpu_to_be32( -					select_ntuple(ep->com.dev, ep->dst, -						      ep->l2t)); +			req6->params = cpu_to_be32(cxgb4_select_ntuple( +						ep->com.dev->rdev.lldi.ports[0], +						ep->l2t));  			req6->opt2 = cpu_to_be32(opt2);  		}  	} else { +		u32 isn = (prandom_u32() & ~7UL) - 1; + +		opt2 |= T5_OPT_2_VALID; +		opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */ +		if (peer2peer) +			isn += 4; +  		if (ep->com.remote_addr.ss_family == AF_INET) {  			t5_req = (struct cpl_t5_act_open_req *)  				 skb_put(skb, wrlen); @@ -681,8 +729,12 @@ static int send_connect(struct c4iw_ep *ep)  			t5_req->peer_ip = ra->sin_addr.s_addr;  			t5_req->opt0 = cpu_to_be64(opt0);  			t5_req->params = cpu_to_be64(V_FILTER_TUPLE( -						select_ntuple(ep->com.dev, -						ep->dst, ep->l2t))); +						     cxgb4_select_ntuple( +					     ep->com.dev->rdev.lldi.ports[0], +					     ep->l2t))); +			t5_req->rsvd = cpu_to_be32(isn); +			PDBG("%s snd_isn %u\n", __func__, +			     be32_to_cpu(t5_req->rsvd));  			t5_req->opt2 = cpu_to_be32(opt2);  		} else {  			t5_req6 = (struct cpl_t5_act_open_req6 *) @@ -703,7 +755,12 @@ static int send_connect(struct c4iw_ep *ep)  						(ra6->sin6_addr.s6_addr + 8));  			t5_req6->opt0 = cpu_to_be64(opt0);  			t5_req6->params = (__force __be64)cpu_to_be32( -				select_ntuple(ep->com.dev, ep->dst, ep->l2t)); +							cxgb4_select_ntuple( +						ep->com.dev->rdev.lldi.ports[0], +						ep->l2t)); +			t5_req6->rsvd = cpu_to_be32(isn); +			PDBG("%s snd_isn %u\n", __func__, +			     be32_to_cpu(t5_req6->rsvd));  			t5_req6->opt2 = cpu_to_be32(opt2);  		}  	} @@ -799,8 +856,9 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,  	ep->mpa_skb = skb;  	c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);  	start_ep_timer(ep); -	state_set(&ep->com, MPA_REQ_SENT); +	__state_set(&ep->com, MPA_REQ_SENT);  	ep->mpa_attr.initiator = 1; +	ep->snd_seq += mpalen;  	return;  } @@ -880,6 +938,7 @@ static int send_mpa_reject(struct c4iw_ep *ep, const 
void *pdata, u8 plen)  	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);  	BUG_ON(ep->mpa_skb);  	ep->mpa_skb = skb; +	ep->snd_seq += mpalen;  	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);  } @@ -963,7 +1022,8 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)  	skb_get(skb);  	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);  	ep->mpa_skb = skb; -	state_set(&ep->com, MPA_REP_SENT); +	__state_set(&ep->com, MPA_REP_SENT); +	ep->snd_seq += mpalen;  	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);  } @@ -980,6 +1040,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)  	PDBG("%s ep %p tid %u snd_isn %u rcv_isn %u\n", __func__, ep, tid,  	     be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn)); +	mutex_lock(&ep->com.mutex);  	dst_confirm(ep->dst);  	/* setup the hwtid for this connection */ @@ -1003,17 +1064,18 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)  		send_mpa_req(ep, skb, 1);  	else  		send_mpa_req(ep, skb, mpa_rev); - +	mutex_unlock(&ep->com.mutex);  	return 0;  } -static void close_complete_upcall(struct c4iw_ep *ep) +static void close_complete_upcall(struct c4iw_ep *ep, int status)  {  	struct iw_cm_event event;  	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);  	memset(&event, 0, sizeof(event));  	event.event = IW_CM_EVENT_CLOSE; +	event.status = status;  	if (ep->com.cm_id) {  		PDBG("close complete delivered ep %p cm_id %p tid %u\n",  		     ep, ep->com.cm_id, ep->hwtid); @@ -1027,8 +1089,7 @@ static void close_complete_upcall(struct c4iw_ep *ep)  static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)  {  	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); -	close_complete_upcall(ep); -	state_set(&ep->com, ABORTING); +	__state_set(&ep->com, ABORTING);  	set_bit(ABORT_CONN, &ep->com.history);  	return send_abort(ep, skb, gfp);  } @@ -1106,9 +1167,10 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status)  	}  } -static void connect_request_upcall(struct c4iw_ep *ep) +static int connect_request_upcall(struct c4iw_ep *ep)  {  	struct iw_cm_event event; +	int ret;  	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);  	memset(&event, 0, sizeof(event)); @@ -1133,15 +1195,14 @@ static void connect_request_upcall(struct c4iw_ep *ep)  		event.private_data_len = ep->plen;  		event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);  	} -	if (state_read(&ep->parent_ep->com) != DEAD) { -		c4iw_get_ep(&ep->com); -		ep->parent_ep->com.cm_id->event_handler( -						ep->parent_ep->com.cm_id, -						&event); -	} +	c4iw_get_ep(&ep->com); +	ret = ep->parent_ep->com.cm_id->event_handler(ep->parent_ep->com.cm_id, +						      &event); +	if (ret) +		c4iw_put_ep(&ep->com);  	set_bit(CONNREQ_UPCALL, &ep->com.history);  	c4iw_put_ep(&ep->parent_ep->com); -	ep->parent_ep = NULL; +	return ret;  }  static void established_upcall(struct c4iw_ep *ep) @@ -1173,6 +1234,14 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits)  		return 0;  	} +	/* +	 * If we couldn't specify the entire rcv window at connection setup +	 * due to the limit in the number of bits in the RCV_BUFSIZ field, +	 * then add the overage in to the credits returned. 
+	 */ +	if (ep->rcv_win > RCV_BUFSIZ_MASK * 1024) +		credits += ep->rcv_win - RCV_BUFSIZ_MASK * 1024; +  	req = (struct cpl_rx_data_ack *) skb_put(skb, wrlen);  	memset(req, 0, wrlen);  	INIT_TP_WR(req, ep->hwtid); @@ -1186,7 +1255,7 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits)  	return credits;  } -static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) +static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)  {  	struct mpa_message *mpa;  	struct mpa_v2_conn_params *mpa_v2_params; @@ -1196,17 +1265,17 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)  	struct c4iw_qp_attributes attrs;  	enum c4iw_qp_attr_mask mask;  	int err; +	int disconnect = 0;  	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);  	/* -	 * Stop mpa timer.  If it expired, then the state has -	 * changed and we bail since ep_timeout already aborted -	 * the connection. +	 * Stop mpa timer.  If it expired, then +	 * we ignore the MPA reply.  process_timeout() +	 * will abort the connection.  	 */ -	stop_ep_timer(ep); -	if (state_read(&ep->com) != MPA_REQ_SENT) -		return; +	if (stop_ep_timer(ep)) +		return 0;  	/*  	 * If we get more than the supported amount of private data @@ -1228,7 +1297,7 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)  	 * if we don't even have the mpa message, then bail.  	 */  	if (ep->mpa_pkt_len < sizeof(*mpa)) -		return; +		return 0;  	mpa = (struct mpa_message *) ep->mpa_pkt;  	/* Validate MPA header. */ @@ -1268,7 +1337,7 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)  	 * We'll continue process when more data arrives.  	 */  	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) -		return; +		return 0;  	if (mpa->flags & MPA_REJECT) {  		err = -ECONNREFUSED; @@ -1280,7 +1349,7 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)  	 * start reply message including private data. And  	 * the MPA header is valid.  	 */ -	state_set(&ep->com, FPDU_MODE); +	__state_set(&ep->com, FPDU_MODE);  	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;  	ep->mpa_attr.recv_marker_enabled = markers_enabled;  	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 
1 : 0; @@ -1370,9 +1439,11 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)  		attrs.layer_etype = LAYER_MPA | DDP_LLP;  		attrs.ecode = MPA_NOMATCH_RTR;  		attrs.next_state = C4IW_QP_STATE_TERMINATE; +		attrs.send_term = 1;  		err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, -				C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); +				C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);  		err = -ENOMEM; +		disconnect = 1;  		goto out;  	} @@ -1388,18 +1459,20 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)  		attrs.layer_etype = LAYER_MPA | DDP_LLP;  		attrs.ecode = MPA_INSUFF_IRD;  		attrs.next_state = C4IW_QP_STATE_TERMINATE; +		attrs.send_term = 1;  		err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, -				C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); +				C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);  		err = -ENOMEM; +		disconnect = 1;  		goto out;  	}  	goto out;  err: -	state_set(&ep->com, ABORTING); +	__state_set(&ep->com, ABORTING);  	send_abort(ep, skb, GFP_KERNEL);  out:  	connect_reply_upcall(ep, err); -	return; +	return disconnect;  }  static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) @@ -1410,15 +1483,12 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)  	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); -	if (state_read(&ep->com) != MPA_REQ_WAIT) -		return; -  	/*  	 * If we get more than the supported amount of private data  	 * then we must fail this connection.  	 */  	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { -		stop_ep_timer(ep); +		(void)stop_ep_timer(ep);  		abort_connection(ep, skb, GFP_KERNEL);  		return;  	} @@ -1440,7 +1510,6 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)  		return;  	PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); -	stop_ep_timer(ep);  	mpa = (struct mpa_message *) ep->mpa_pkt;  	/* @@ -1449,13 +1518,13 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)  	if (mpa->revision > mpa_rev) {  		printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"  		       " Received = %d\n", __func__, mpa_rev, mpa->revision); -		stop_ep_timer(ep); +		(void)stop_ep_timer(ep);  		abort_connection(ep, skb, GFP_KERNEL);  		return;  	}  	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { -		stop_ep_timer(ep); +		(void)stop_ep_timer(ep);  		abort_connection(ep, skb, GFP_KERNEL);  		return;  	} @@ -1466,7 +1535,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)  	 * Fail if there's too much private data.  	 */  	if (plen > MPA_MAX_PRIVATE_DATA) { -		stop_ep_timer(ep); +		(void)stop_ep_timer(ep);  		abort_connection(ep, skb, GFP_KERNEL);  		return;  	} @@ -1475,7 +1544,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)  	 * If plen does not account for pkt size  	 */  	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { -		stop_ep_timer(ep); +		(void)stop_ep_timer(ep);  		abort_connection(ep, skb, GFP_KERNEL);  		return;  	} @@ -1532,10 +1601,24 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)  	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,  	     ep->mpa_attr.p2p_type); -	state_set(&ep->com, MPA_REQ_RCVD); - -	/* drive upcall */ -	connect_request_upcall(ep); +	/* +	 * If the endpoint timer already expired, then we ignore +	 * the start request.  process_timeout() will abort +	 * the connection. 
+	 */ +	if (!stop_ep_timer(ep)) { +		__state_set(&ep->com, MPA_REQ_RCVD); + +		/* drive upcall */ +		mutex_lock(&ep->parent_ep->com.mutex); +		if (ep->parent_ep->com.state != DEAD) { +			if (connect_request_upcall(ep)) +				abort_connection(ep, skb, GFP_KERNEL); +		} else { +			abort_connection(ep, skb, GFP_KERNEL); +		} +		mutex_unlock(&ep->parent_ep->com.mutex); +	}  	return;  } @@ -1547,19 +1630,23 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)  	unsigned int tid = GET_TID(hdr);  	struct tid_info *t = dev->rdev.lldi.tids;  	__u8 status = hdr->status; +	int disconnect = 0;  	ep = lookup_tid(t, tid); +	if (!ep) +		return 0;  	PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen);  	skb_pull(skb, sizeof(*hdr));  	skb_trim(skb, dlen); +	mutex_lock(&ep->com.mutex);  	/* update RX credits */  	update_rx_credits(ep, dlen); -	switch (state_read(&ep->com)) { +	switch (ep->com.state) {  	case MPA_REQ_SENT:  		ep->rcv_seq += dlen; -		process_mpa_reply(ep, skb); +		disconnect = process_mpa_reply(ep, skb);  		break;  	case MPA_REQ_WAIT:  		ep->rcv_seq += dlen; @@ -1572,15 +1659,19 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)  			pr_err("%s Unexpected streaming data." \  			       " qpid %u ep %p state %d tid %u status %d\n",  			       __func__, ep->com.qp->wq.sq.qid, ep, -			       state_read(&ep->com), ep->hwtid, status); +			       ep->com.state, ep->hwtid, status);  		attrs.next_state = C4IW_QP_STATE_TERMINATE;  		c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, -			       C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); +			       C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); +		disconnect = 1;  		break;  	}  	default:  		break;  	} +	mutex_unlock(&ep->com.mutex); +	if (disconnect) +		c4iw_ep_disconnect(ep, 0, GFP_KERNEL);  	return 0;  } @@ -1624,18 +1715,20 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)  	unsigned int mtu_idx;  	int wscale;  	struct sockaddr_in *sin; +	int win;  	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);  	req = (struct fw_ofld_connection_wr *)__skb_put(skb, sizeof(*req));  	memset(req, 0, sizeof(*req));  	req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR));  	req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16))); -	req->le.filter = cpu_to_be32(select_ntuple(ep->com.dev, ep->dst, +	req->le.filter = cpu_to_be32(cxgb4_select_ntuple( +				     ep->com.dev->rdev.lldi.ports[0],  				     ep->l2t)); -	sin = (struct sockaddr_in *)&ep->com.local_addr; +	sin = (struct sockaddr_in *)&ep->com.mapped_local_addr;  	req->le.lport = sin->sin_port;  	req->le.u.ipv4.lip = sin->sin_addr.s_addr; -	sin = (struct sockaddr_in *)&ep->com.remote_addr; +	sin = (struct sockaddr_in *)&ep->com.mapped_remote_addr;  	req->le.pport = sin->sin_port;  	req->le.u.ipv4.pip = sin->sin_addr.s_addr;  	req->tcb.t_state_to_astid = @@ -1645,8 +1738,18 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)  			htons(F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK);  	req->tcb.tx_max = (__force __be32) jiffies;  	req->tcb.rcv_adv = htons(1); -	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx); +	best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, +		 enable_tcp_timestamps);  	wscale = compute_wscale(rcv_win); + +	/* +	 * Specify the largest window that will fit in opt0. The +	 * remainder will be specified in the rx_data_ack. +	 */ +	win = ep->rcv_win >> 10; +	if (win > RCV_BUFSIZ_MASK) +		win = RCV_BUFSIZ_MASK; +  	req->tcb.opt0 = (__force __be64) (TCAM_BYPASS(1) |  		(nocong ? 
NO_CONG(1) : 0) |  		KEEP_ALIVE(1) | @@ -1658,7 +1761,7 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)  		SMAC_SEL(ep->smac_idx) |  		DSCP(ep->tos) |  		ULP_MODE(ULP_MODE_TCPDDP) | -		RCV_BUFSIZ(rcv_win >> 10)); +		RCV_BUFSIZ(win));  	req->tcb.opt2 = (__force __be32) (PACE(1) |  		TX_QUEUE(ep->com.dev->rdev.lldi.tx_modq[ep->tx_chan]) |  		RX_CHANNEL(0) | @@ -1686,6 +1789,22 @@ static inline int act_open_has_tid(int status)  	       status != CPL_ERR_ARP_MISS;  } +/* Returns whether a CPL status conveys negative advice. + */ +static int is_neg_adv(unsigned int status) +{ +	return status == CPL_ERR_RTX_NEG_ADVICE || +	       status == CPL_ERR_PERSIST_NEG_ADVICE || +	       status == CPL_ERR_KEEPALV_NEG_ADVICE; +} + +static void set_tcp_window(struct c4iw_ep *ep, struct port_info *pi) +{ +	ep->snd_win = snd_win; +	ep->rcv_win = rcv_win; +	PDBG("%s snd_win %d rcv_win %d\n", __func__, ep->snd_win, ep->rcv_win); +} +  #define ACT_OPEN_RETRY_COUNT 2  static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, @@ -1734,6 +1853,7 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip,  		ep->ctrlq_idx = cxgb4_port_idx(pdev);  		ep->rss_qid = cdev->rdev.lldi.rxq_ids[  			cxgb4_port_idx(pdev) * step]; +		set_tcp_window(ep, (struct port_info *)netdev_priv(pdev));  		dev_put(pdev);  	} else {  		pdev = get_real_dev(n->dev); @@ -1742,16 +1862,17 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip,  		if (!ep->l2t)  			goto out;  		ep->mtu = dst_mtu(dst); -		ep->tx_chan = cxgb4_port_chan(n->dev); -		ep->smac_idx = (cxgb4_port_viid(n->dev) & 0x7F) << 1; +		ep->tx_chan = cxgb4_port_chan(pdev); +		ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1;  		step = cdev->rdev.lldi.ntxq /  			cdev->rdev.lldi.nchan; -		ep->txq_idx = cxgb4_port_idx(n->dev) * step; -		ep->ctrlq_idx = cxgb4_port_idx(n->dev); +		ep->txq_idx = cxgb4_port_idx(pdev) * step; +		ep->ctrlq_idx = cxgb4_port_idx(pdev);  		step = cdev->rdev.lldi.nrxq /  			cdev->rdev.lldi.nchan;  		ep->rss_qid = cdev->rdev.lldi.rxq_ids[ -			cxgb4_port_idx(n->dev) * step]; +			cxgb4_port_idx(pdev) * step]; +		set_tcp_window(ep, (struct port_info *)netdev_priv(pdev));  		if (clear_mpa_v1) {  			ep->retry_with_mpa_v1 = 0; @@ -1866,15 +1987,15 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)  	struct sockaddr_in6 *ra6;  	ep = lookup_atid(t, atid); -	la = (struct sockaddr_in *)&ep->com.local_addr; -	ra = (struct sockaddr_in *)&ep->com.remote_addr; -	la6 = (struct sockaddr_in6 *)&ep->com.local_addr; -	ra6 = (struct sockaddr_in6 *)&ep->com.remote_addr; +	la = (struct sockaddr_in *)&ep->com.mapped_local_addr; +	ra = (struct sockaddr_in *)&ep->com.mapped_remote_addr; +	la6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; +	ra6 = (struct sockaddr_in6 *)&ep->com.mapped_remote_addr;  	PDBG("%s ep %p atid %u status %u errno %d\n", __func__, ep, atid,  	     status, status2errno(status)); -	if (status == CPL_ERR_RTX_NEG_ADVICE) { +	if (is_neg_adv(status)) {  		printk(KERN_WARNING MOD "Connection problems for atid %u\n",  			atid);  		return 0; @@ -1982,13 +2103,36 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,  	u64 opt0;  	u32 opt2;  	int wscale; +	struct cpl_t5_pass_accept_rpl *rpl5 = NULL; +	int win;  	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);  	BUG_ON(skb_cloned(skb)); -	skb_trim(skb, sizeof(*rpl)); +  	skb_get(skb); -	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx); +	rpl = cplhdr(skb); +	if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) 
{ +		skb_trim(skb, roundup(sizeof(*rpl5), 16)); +		rpl5 = (void *)rpl; +		INIT_TP_WR(rpl5, ep->hwtid); +	} else { +		skb_trim(skb, sizeof(*rpl)); +		INIT_TP_WR(rpl, ep->hwtid); +	} +	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, +						    ep->hwtid)); + +	best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, +		 enable_tcp_timestamps && req->tcpopt.tstamp);  	wscale = compute_wscale(rcv_win); + +	/* +	 * Specify the largest window that will fit in opt0. The +	 * remainder will be specified in the rx_data_ack. +	 */ +	win = ep->rcv_win >> 10; +	if (win > RCV_BUFSIZ_MASK) +		win = RCV_BUFSIZ_MASK;  	opt0 = (nocong ? NO_CONG(1) : 0) |  	       KEEP_ALIVE(1) |  	       DELACK(1) | @@ -1999,7 +2143,7 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,  	       SMAC_SEL(ep->smac_idx) |  	       DSCP(ep->tos >> 2) |  	       ULP_MODE(ULP_MODE_TCPDDP) | -	       RCV_BUFSIZ(rcv_win>>10); +	       RCV_BUFSIZ(win);  	opt2 = RX_CHANNEL(0) |  	       RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid); @@ -2018,11 +2162,19 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,  		if (tcph->ece && tcph->cwr)  			opt2 |= CCTRL_ECN(1);  	} +	if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { +		u32 isn = (prandom_u32() & ~7UL) - 1; +		opt2 |= T5_OPT_2_VALID; +		opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); +		opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */ +		rpl5 = (void *)rpl; +		memset(&rpl5->iss, 0, roundup(sizeof(*rpl5)-sizeof(*rpl), 16)); +		if (peer2peer) +			isn += 4; +		rpl5->iss = cpu_to_be32(isn); +		PDBG("%s iss %u\n", __func__, be32_to_cpu(rpl5->iss)); +	} -	rpl = cplhdr(skb); -	INIT_TP_WR(rpl, ep->hwtid); -	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, -				      ep->hwtid));  	rpl->opt0 = cpu_to_be64(opt0);  	rpl->opt2 = cpu_to_be32(opt2);  	set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); @@ -2037,7 +2189,6 @@ static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb)  	PDBG("%s c4iw_dev %p tid %u\n", __func__, dev, hwtid);  	BUG_ON(skb_cloned(skb));  	skb_trim(skb, sizeof(struct cpl_tid_release)); -	skb_get(skb);  	release_tid(&dev->rdev, hwtid, skb);  	return;  } @@ -2087,6 +2238,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)  	int err;  	u16 peer_mss = ntohs(req->tcpopt.mss);  	int iptype; +	unsigned short hdrs;  	parent_ep = lookup_stid(t, stid);  	if (!parent_ep) { @@ -2144,8 +2296,10 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)  		goto reject;  	} -	if (peer_mss && child_ep->mtu > (peer_mss + 40)) -		child_ep->mtu = peer_mss + 40; +	hdrs = sizeof(struct iphdr) + sizeof(struct tcphdr) + +	       ((enable_tcp_timestamps && req->tcpopt.tstamp) ? 
12 : 0); +	if (peer_mss && child_ep->mtu > (peer_mss + hdrs)) +		child_ep->mtu = peer_mss + hdrs;  	state_set(&child_ep->com, CONNECTING);  	child_ep->com.dev = dev; @@ -2279,13 +2433,13 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)  		disconnect = 0;  		break;  	case MORIBUND: -		stop_ep_timer(ep); +		(void)stop_ep_timer(ep);  		if (ep->com.cm_id && ep->com.qp) {  			attrs.next_state = C4IW_QP_STATE_IDLE;  			c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,  				       C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);  		} -		close_complete_upcall(ep); +		close_complete_upcall(ep, 0);  		__state_set(&ep->com, DEAD);  		release = 1;  		disconnect = 0; @@ -2304,15 +2458,6 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)  	return 0;  } -/* - * Returns whether an ABORT_REQ_RSS message is a negative advice. - */ -static int is_neg_adv_abort(unsigned int status) -{ -	return status == CPL_ERR_RTX_NEG_ADVICE || -	       status == CPL_ERR_PERSIST_NEG_ADVICE; -} -  static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)  {  	struct cpl_abort_req_rss *req = cplhdr(skb); @@ -2326,7 +2471,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)  	unsigned int tid = GET_TID(req);  	ep = lookup_tid(t, tid); -	if (is_neg_adv_abort(req->status)) { +	if (is_neg_adv(req->status)) {  		PDBG("%s neg_adv_abort ep %p tid %u\n", __func__, ep,  		     ep->hwtid);  		return 0; @@ -2348,10 +2493,10 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)  	case CONNECTING:  		break;  	case MPA_REQ_WAIT: -		stop_ep_timer(ep); +		(void)stop_ep_timer(ep);  		break;  	case MPA_REQ_SENT: -		stop_ep_timer(ep); +		(void)stop_ep_timer(ep);  		if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1))  			connect_reply_upcall(ep, -ECONNRESET);  		else { @@ -2456,7 +2601,7 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)  		__state_set(&ep->com, MORIBUND);  		break;  	case MORIBUND: -		stop_ep_timer(ep); +		(void)stop_ep_timer(ep);  		if ((ep->com.cm_id) && (ep->com.qp)) {  			attrs.next_state = C4IW_QP_STATE_IDLE;  			c4iw_modify_qp(ep->com.qp->rhp, @@ -2464,7 +2609,7 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)  					     C4IW_QP_ATTR_NEXT_STATE,  					     &attrs, 1);  		} -		close_complete_upcall(ep); +		close_complete_upcall(ep, 0);  		__state_set(&ep->com, DEAD);  		release = 1;  		break; @@ -2539,22 +2684,28 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb)  int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)  { -	int err; +	int err = 0; +	int disconnect = 0;  	struct c4iw_ep *ep = to_ep(cm_id);  	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); -	if (state_read(&ep->com) == DEAD) { +	mutex_lock(&ep->com.mutex); +	if (ep->com.state == DEAD) { +		mutex_unlock(&ep->com.mutex);  		c4iw_put_ep(&ep->com);  		return -ECONNRESET;  	}  	set_bit(ULP_REJECT, &ep->com.history); -	BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); +	BUG_ON(ep->com.state != MPA_REQ_RCVD);  	if (mpa_rev == 0)  		abort_connection(ep, NULL, GFP_KERNEL);  	else {  		err = send_mpa_reject(ep, pdata, pdata_len); -		err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL); +		disconnect = 1;  	} +	mutex_unlock(&ep->com.mutex); +	if (disconnect) +		err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL);  	c4iw_put_ep(&ep->com);  	return 0;  } @@ -2569,12 +2720,14 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	struct c4iw_qp *qp = get_qhp(h, conn_param->qpn);  	PDBG("%s ep %p tid %u\n", __func__, ep, 
ep->hwtid); -	if (state_read(&ep->com) == DEAD) { + +	mutex_lock(&ep->com.mutex); +	if (ep->com.state == DEAD) {  		err = -ECONNRESET;  		goto err;  	} -	BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); +	BUG_ON(ep->com.state != MPA_REQ_RCVD);  	BUG_ON(!qp);  	set_bit(ULP_ACCEPT, &ep->com.history); @@ -2643,14 +2796,16 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	if (err)  		goto err1; -	state_set(&ep->com, FPDU_MODE); +	__state_set(&ep->com, FPDU_MODE);  	established_upcall(ep); +	mutex_unlock(&ep->com.mutex);  	c4iw_put_ep(&ep->com);  	return 0;  err1:  	ep->com.cm_id = NULL;  	cm_id->rem_ref(cm_id);  err: +	mutex_unlock(&ep->com.mutex);  	c4iw_put_ep(&ep->com);  	return err;  } @@ -2721,13 +2876,15 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);  	struct c4iw_ep *ep;  	int err = 0; -	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; -	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; -	struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *)&cm_id->local_addr; -	struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *) -				      &cm_id->remote_addr; +	struct sockaddr_in *laddr; +	struct sockaddr_in *raddr; +	struct sockaddr_in6 *laddr6; +	struct sockaddr_in6 *raddr6; +	struct iwpm_dev_data pm_reg_msg; +	struct iwpm_sa_data pm_msg;  	__u8 *ra;  	int iptype; +	int iwpm_err = 0;  	if ((conn_param->ord > c4iw_max_read_depth) ||  	    (conn_param->ird > c4iw_max_read_depth)) { @@ -2758,7 +2915,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	if (!ep->com.qp) {  		PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn);  		err = -EINVAL; -		goto fail2; +		goto fail1;  	}  	ref_qp(ep);  	PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn, @@ -2771,10 +2928,50 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	if (ep->atid == -1) {  		printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);  		err = -ENOMEM; -		goto fail2; +		goto fail1;  	}  	insert_handle(dev, &dev->atid_idr, ep, ep->atid); +	memcpy(&ep->com.local_addr, &cm_id->local_addr, +	       sizeof(ep->com.local_addr)); +	memcpy(&ep->com.remote_addr, &cm_id->remote_addr, +	       sizeof(ep->com.remote_addr)); + +	/* No port mapper available, go with the specified peer information */ +	memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, +	       sizeof(ep->com.mapped_local_addr)); +	memcpy(&ep->com.mapped_remote_addr, &cm_id->remote_addr, +	       sizeof(ep->com.mapped_remote_addr)); + +	c4iw_form_reg_msg(dev, &pm_reg_msg); +	iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); +	if (iwpm_err) { +		PDBG("%s: Port Mapper reg pid fail (err = %d).\n", +			__func__, iwpm_err); +	} +	if (iwpm_valid_pid() && !iwpm_err) { +		c4iw_form_pm_msg(ep, &pm_msg); +		iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_C4IW); +		if (iwpm_err) +			PDBG("%s: Port Mapper query fail (err = %d).\n", +				__func__, iwpm_err); +		else +			c4iw_record_pm_msg(ep, &pm_msg); +	} +	if (iwpm_create_mapinfo(&ep->com.local_addr, +				&ep->com.mapped_local_addr, RDMA_NL_C4IW)) { +		iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); +		err = -ENOMEM; +		goto fail1; +	} +	print_addr(&ep->com, __func__, "add_query/create_mapinfo"); +	set_bit(RELEASE_MAPINFO, &ep->com.flags); + +	laddr = (struct sockaddr_in *)&ep->com.mapped_local_addr; +	raddr = (struct sockaddr_in *)&ep->com.mapped_remote_addr; +	laddr6 = (struct 
sockaddr_in6 *)&ep->com.mapped_local_addr; +	raddr6 = (struct sockaddr_in6 *) &ep->com.mapped_remote_addr; +  	if (cm_id->remote_addr.ss_family == AF_INET) {  		iptype = 4;  		ra = (__u8 *)&raddr->sin_addr; @@ -2785,7 +2982,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  		if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) {  			err = pick_local_ipaddrs(dev, cm_id);  			if (err) -				goto fail2; +				goto fail1;  		}  		/* find a route */ @@ -2805,7 +3002,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  		if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) {  			err = pick_local_ip6addrs(dev, cm_id);  			if (err) -				goto fail2; +				goto fail1;  		}  		/* find a route */ @@ -2821,13 +3018,13 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	if (!ep->dst) {  		printk(KERN_ERR MOD "%s - cannot find route.\n", __func__);  		err = -EHOSTUNREACH; -		goto fail3; +		goto fail2;  	}  	err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true);  	if (err) {  		printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); -		goto fail4; +		goto fail3;  	}  	PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", @@ -2836,10 +3033,6 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	state_set(&ep->com, CONNECTING);  	ep->tos = 0; -	memcpy(&ep->com.local_addr, &cm_id->local_addr, -	       sizeof(ep->com.local_addr)); -	memcpy(&ep->com.remote_addr, &cm_id->remote_addr, -	       sizeof(ep->com.remote_addr));  	/* send connect request to rnic */  	err = send_connect(ep); @@ -2847,12 +3040,12 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  		goto out;  	cxgb4_l2t_release(ep->l2t); -fail4: -	dst_release(ep->dst);  fail3: +	dst_release(ep->dst); +fail2:  	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);  	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); -fail2: +fail1:  	cm_id->rem_ref(cm_id);  	c4iw_put_ep(&ep->com);  out: @@ -2862,7 +3055,8 @@ out:  static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)  {  	int err; -	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ep->com.local_addr; +	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) +				    &ep->com.mapped_local_addr;  	c4iw_init_wr_wait(&ep->com.wr_wait);  	err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0], @@ -2883,7 +3077,8 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)  static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)  {  	int err; -	struct sockaddr_in *sin = (struct sockaddr_in *)&ep->com.local_addr; +	struct sockaddr_in *sin = (struct sockaddr_in *) +				  &ep->com.mapped_local_addr;  	if (dev->rdev.lldi.enable_fw_ofld_conn) {  		do { @@ -2918,6 +3113,9 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)  	int err = 0;  	struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);  	struct c4iw_listen_ep *ep; +	struct iwpm_dev_data pm_reg_msg; +	struct iwpm_sa_data pm_msg; +	int iwpm_err = 0;  	might_sleep(); @@ -2938,7 +3136,8 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)  	/*  	 * Allocate a server TID.  	 
*/ -	if (dev->rdev.lldi.enable_fw_ofld_conn) +	if (dev->rdev.lldi.enable_fw_ofld_conn && +	    ep->com.local_addr.ss_family == AF_INET)  		ep->stid = cxgb4_alloc_sftid(dev->rdev.lldi.tids,  					     cm_id->local_addr.ss_family, ep);  	else @@ -2951,6 +3150,37 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)  		goto fail2;  	}  	insert_handle(dev, &dev->stid_idr, ep, ep->stid); + +	/* No port mapper available, go with the specified info */ +	memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, +	       sizeof(ep->com.mapped_local_addr)); + +	c4iw_form_reg_msg(dev, &pm_reg_msg); +	iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); +	if (iwpm_err) { +		PDBG("%s: Port Mapper reg pid fail (err = %d).\n", +			__func__, iwpm_err); +	} +	if (iwpm_valid_pid() && !iwpm_err) { +		memcpy(&pm_msg.loc_addr, &ep->com.local_addr, +				sizeof(ep->com.local_addr)); +		iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_C4IW); +		if (iwpm_err) +			PDBG("%s: Port Mapper query fail (err = %d).\n", +				__func__, iwpm_err); +		else +			memcpy(&ep->com.mapped_local_addr, +				&pm_msg.mapped_loc_addr, +				sizeof(ep->com.mapped_local_addr)); +	} +	if (iwpm_create_mapinfo(&ep->com.local_addr, +				&ep->com.mapped_local_addr, RDMA_NL_C4IW)) { +		err = -ENOMEM; +		goto fail3; +	} +	print_addr(&ep->com, __func__, "add_mapping/create_mapinfo"); + +	set_bit(RELEASE_MAPINFO, &ep->com.flags);  	state_set(&ep->com, LISTEN);  	if (ep->com.local_addr.ss_family == AF_INET)  		err = create_server4(dev, ep); @@ -2960,6 +3190,8 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)  		cm_id->provider_data = ep;  		goto out;  	} + +fail3:  	cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,  			ep->com.local_addr.ss_family);  fail2: @@ -3018,7 +3250,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)  	rdev = &ep->com.dev->rdev;  	if (c4iw_fatal_error(rdev)) {  		fatal = 1; -		close_complete_upcall(ep); +		close_complete_upcall(ep, -EIO);  		ep->com.state = DEAD;  	}  	switch (ep->com.state) { @@ -3040,7 +3272,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)  		if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) {  			close = 1;  			if (abrupt) { -				stop_ep_timer(ep); +				(void)stop_ep_timer(ep);  				ep->com.state = ABORTING;  			} else  				ep->com.state = MORIBUND; @@ -3060,7 +3292,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)  	if (close) {  		if (abrupt) {  			set_bit(EP_DISC_ABORT, &ep->com.history); -			close_complete_upcall(ep); +			close_complete_upcall(ep, -ECONNRESET);  			ret = send_abort(ep, NULL, gfp);  		} else {  			set_bit(EP_DISC_CLOSE, &ep->com.history); @@ -3241,6 +3473,7 @@ static void send_fw_pass_open_req(struct c4iw_dev *dev, struct sk_buff *skb,  	struct sk_buff *req_skb;  	struct fw_ofld_connection_wr *req;  	struct cpl_pass_accept_req *cpl = cplhdr(skb); +	int ret;  	req_skb = alloc_skb(sizeof(struct fw_ofld_connection_wr), GFP_KERNEL);  	req = (struct fw_ofld_connection_wr *)__skb_put(req_skb, sizeof(*req)); @@ -3277,7 +3510,13 @@ static void send_fw_pass_open_req(struct c4iw_dev *dev, struct sk_buff *skb,  	req->cookie = (unsigned long)skb;  	set_wr_txq(req_skb, CPL_PRIORITY_CONTROL, port_id); -	cxgb4_ofld_send(dev->rdev.lldi.ports[0], req_skb); +	ret = cxgb4_ofld_send(dev->rdev.lldi.ports[0], req_skb); +	if (ret < 0) { +		pr_err("%s - cxgb4_ofld_send error %d - dropping\n", __func__, +		       ret); +		kfree_skb(skb); +		kfree_skb(req_skb); +	}  }  /* @@ -3323,9 +3562,7 @@ static int rx_pkt(struct 
c4iw_dev *dev, struct sk_buff *skb)  	/*  	 * Calculate the server tid from filter hit index from cpl_rx_pkt.  	 */ -	stid = (__force int) cpu_to_be32((__force u32) rss->hash_val) -					  - dev->rdev.lldi.tids->sftid_base -					  + dev->rdev.lldi.tids->nstids; +	stid = (__force int) cpu_to_be32((__force u32) rss->hash_val);  	lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid);  	if (!lep) { @@ -3386,6 +3623,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)  		pi = (struct port_info *)netdev_priv(pdev);  		tx_chan = cxgb4_port_chan(pdev);  	} +	neigh_release(neigh);  	if (!e) {  		pr_err("%s - failed to allocate l2t entry!\n",  		       __func__); @@ -3397,7 +3635,9 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)  	window = (__force u16) htons((__force u16)tcph->window);  	/* Calcuate filter portion for LE region. */ -	filter = (__force unsigned int) cpu_to_be32(select_ntuple(dev, dst, e)); +	filter = (__force unsigned int) cpu_to_be32(cxgb4_select_ntuple( +						    dev->rdev.lldi.ports[0], +						    e));  	/*  	 * Synthesize the cpl_pass_accept_req. We have everything except the @@ -3464,15 +3704,26 @@ static void process_timeout(struct c4iw_ep *ep)  				     &attrs, 1);  		}  		__state_set(&ep->com, ABORTING); +		close_complete_upcall(ep, -ETIMEDOUT); +		break; +	case ABORTING: +	case DEAD: + +		/* +		 * These states are expected if the ep timed out at the same +		 * time as another thread was calling stop_ep_timer(). +		 * So we silently do nothing for these states. +		 */ +		abort = 0;  		break;  	default:  		WARN(1, "%s unexpected state ep %p tid %u state %u\n",  			__func__, ep, ep->hwtid, ep->com.state);  		abort = 0;  	} -	mutex_unlock(&ep->com.mutex);  	if (abort)  		abort_connection(ep, NULL, GFP_KERNEL); +	mutex_unlock(&ep->com.mutex);  	c4iw_put_ep(&ep->com);  } @@ -3486,6 +3737,8 @@ static void process_timedout_eps(void)  		tmp = timeout_list.next;  		list_del(tmp); +		tmp->next = NULL; +		tmp->prev = NULL;  		spin_unlock_irq(&timeout_lock);  		ep = list_entry(tmp, struct c4iw_ep, entry);  		process_timeout(ep); @@ -3502,6 +3755,7 @@ static void process_work(struct work_struct *work)  	unsigned int opcode;  	int ret; +	process_timedout_eps();  	while ((skb = skb_dequeue(&rxq))) {  		rpl = cplhdr(skb);  		dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); @@ -3511,8 +3765,8 @@ static void process_work(struct work_struct *work)  		ret = work_handlers[opcode](dev, skb);  		if (!ret)  			kfree_skb(skb); +		process_timedout_eps();  	} -	process_timedout_eps();  }  static DECLARE_WORK(skb_work, process_work); @@ -3524,8 +3778,13 @@ static void ep_timeout(unsigned long arg)  	spin_lock(&timeout_lock);  	if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { -		list_add_tail(&ep->entry, &timeout_list); -		kickit = 1; +		/* +		 * Only insert if it is not already on the list. 
+		 */ +		if (!ep->entry.next) { +			list_add_tail(&ep->entry, &timeout_list); +			kickit = 1; +		}  	}  	spin_unlock(&timeout_lock);  	if (kickit) @@ -3607,7 +3866,7 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb)  		kfree_skb(skb);  		return 0;  	} -	if (is_neg_adv_abort(req->status)) { +	if (is_neg_adv(req->status)) {  		PDBG("%s neg_adv_abort ep %p tid %u\n", __func__, ep,  		     ep->hwtid);  		kfree_skb(skb); @@ -3666,7 +3925,7 @@ int __init c4iw_cm_init(void)  	return 0;  } -void __exit c4iw_cm_term(void) +void c4iw_cm_term(void)  {  	WARN_ON(!list_empty(&timeout_list));  	flush_workqueue(workq); diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 88de3aa9c5b..c04292c950f 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -134,7 +134,8 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,  			V_FW_RI_RES_WR_IQANUS(0) |  			V_FW_RI_RES_WR_IQANUD(1) |  			F_FW_RI_RES_WR_IQANDST | -			V_FW_RI_RES_WR_IQANDSTINDEX(*rdev->lldi.rxq_ids)); +			V_FW_RI_RES_WR_IQANDSTINDEX( +				rdev->lldi.ciq_ids[cq->vector]));  	res->u.cq.iqdroprss_to_iqesize = cpu_to_be16(  			F_FW_RI_RES_WR_IQDROPRSS |  			V_FW_RI_RES_WR_IQPCIECH(2) | @@ -235,27 +236,21 @@ int c4iw_flush_sq(struct c4iw_qp *qhp)  	struct t4_cq *cq = &chp->cq;  	int idx;  	struct t4_swsqe *swsqe; -	int error = (qhp->attr.state != C4IW_QP_STATE_CLOSING && -			qhp->attr.state != C4IW_QP_STATE_IDLE);  	if (wq->sq.flush_cidx == -1)  		wq->sq.flush_cidx = wq->sq.cidx;  	idx = wq->sq.flush_cidx;  	BUG_ON(idx >= wq->sq.size);  	while (idx != wq->sq.pidx) { -		if (error) { -			swsqe = &wq->sq.sw_sq[idx]; -			BUG_ON(swsqe->flushed); -			swsqe->flushed = 1; -			insert_sq_cqe(wq, cq, swsqe); -			if (wq->sq.oldest_read == swsqe) { -				BUG_ON(swsqe->opcode != FW_RI_READ_REQ); -				advance_oldest_read(wq); -			} -			flushed++; -		} else { -			t4_sq_consume(wq); +		swsqe = &wq->sq.sw_sq[idx]; +		BUG_ON(swsqe->flushed); +		swsqe->flushed = 1; +		insert_sq_cqe(wq, cq, swsqe); +		if (wq->sq.oldest_read == swsqe) { +			BUG_ON(swsqe->opcode != FW_RI_READ_REQ); +			advance_oldest_read(wq);  		} +		flushed++;  		if (++idx == wq->sq.size)  			idx = 0;  	} @@ -365,8 +360,14 @@ void c4iw_flush_hw_cq(struct c4iw_cq *chp)  		if (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP) { -			/* -			 * drop peer2peer RTR reads. +			/* If we have reached here because of async +			 * event or other error, and have egress error +			 * then drop +			 */ +			if (CQE_TYPE(hw_cqe) == 1) +				goto next_cqe; + +			/* drop peer2peer RTR reads.  			 */  			if (CQE_WRID_STAG(hw_cqe) == 1)  				goto next_cqe; @@ -511,8 +512,18 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,  	 */  	if (RQ_TYPE(hw_cqe) && (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP)) { -		/* -		 * If this is an unsolicited read response, then the read +		/* If we have reached here because of async +		 * event or other error, and have egress error +		 * then drop +		 */ +		if (CQE_TYPE(hw_cqe) == 1) { +			if (CQE_STATUS(hw_cqe)) +				t4_set_wq_in_error(wq); +			ret = -EAGAIN; +			goto skip_cqe; +		} + +		/* If this is an unsolicited read response, then the read  		 * was generated by the kernel driver as part of peer-2-peer  		 * connection setup.  So ignore the completion.  		 
*/ @@ -603,7 +614,7 @@ proc_cqe:  	 */  	if (SQ_TYPE(hw_cqe)) {  		int idx = CQE_WRID_SQ_IDX(hw_cqe); -		BUG_ON(idx > wq->sq.size); +		BUG_ON(idx >= wq->sq.size);  		/*  		* Account for any unsignaled completions completed by @@ -617,7 +628,7 @@ proc_cqe:  			wq->sq.in_use -= wq->sq.size + idx - wq->sq.cidx;  		else  			wq->sq.in_use -= idx - wq->sq.cidx; -		BUG_ON(wq->sq.in_use < 0 && wq->sq.in_use < wq->sq.size); +		BUG_ON(wq->sq.in_use <= 0 && wq->sq.in_use >= wq->sq.size);  		wq->sq.cidx = (uint16_t)idx;  		PDBG("%s completing sq idx %u\n", __func__, wq->sq.cidx); @@ -662,7 +673,7 @@ skip_cqe:  static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc)  {  	struct c4iw_qp *qhp = NULL; -	struct t4_cqe cqe = {0, 0}, *rd_cqe; +	struct t4_cqe uninitialized_var(cqe), *rd_cqe;  	struct t4_wq *wq;  	u32 credit = 0;  	u8 cqe_flushed; @@ -860,6 +871,9 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries,  	rhp = to_c4iw_dev(ibdev); +	if (vector >= rhp->rdev.lldi.nciq) +		return ERR_PTR(-EINVAL); +  	chp = kzalloc(sizeof(*chp), GFP_KERNEL);  	if (!chp)  		return ERR_PTR(-ENOMEM); @@ -881,7 +895,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries,  	/*  	 * Make actual HW queue 2x to avoid cdix_inc overflows.  	 */ -	hwentries = entries * 2; +	hwentries = min(entries * 2, T4_MAX_IQ_SIZE);  	/*  	 * Make HW queue at least 64 entries so GTS updates aren't too @@ -905,6 +919,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries,  	}  	chp->cq.size = hwentries;  	chp->cq.memsize = memsize; +	chp->cq.vector = vector;  	ret = create_cq(&rhp->rdev, &chp->cq,  			ucontext ? &ucontext->uctx : &rhp->rdev.uctx); @@ -940,7 +955,8 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries,  		uresp.gts_key = ucontext->key;  		ucontext->key += PAGE_SIZE;  		spin_unlock(&ucontext->mmap_lock); -		ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); +		ret = ib_copy_to_udata(udata, &uresp, +				       sizeof(uresp) - sizeof(uresp.reserved));  		if (ret)  			goto err5; diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 33d2cc6ab56..7db82b24302 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -64,6 +64,10 @@ struct uld_ctx {  static LIST_HEAD(uld_ctx_list);  static DEFINE_MUTEX(dev_mutex); +#define DB_FC_RESUME_SIZE 64 +#define DB_FC_RESUME_DELAY 1 +#define DB_FC_DRAIN_THRESH 0 +  static struct dentry *c4iw_debugfs_root;  struct c4iw_debugfs_data { @@ -73,6 +77,16 @@ struct c4iw_debugfs_data {  	int pos;  }; +/* registered cxgb4 netlink callbacks */ +static struct ibnl_client_cbs c4iw_nl_cb_table[] = { +	[RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, +	[RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, +	[RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, +	[RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, +	[RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, +	[RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; +  static int count_idrs(int id, void *p, void *data)  {  	int *countp = data; @@ -109,35 +123,49 @@ static int dump_qp(int id, void *p, void *data)  				&qp->ep->com.local_addr;  			struct sockaddr_in *rsin = (struct sockaddr_in *)  				&qp->ep->com.remote_addr; +			struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) +				&qp->ep->com.mapped_local_addr; +			struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) +				&qp->ep->com.mapped_remote_addr;  			cc = 
snprintf(qpd->buf + qpd->pos, space,  				      "rc qp sq id %u rq id %u state %u "  				      "onchip %u ep tid %u state %u " -				      "%pI4:%u->%pI4:%u\n", +				      "%pI4:%u/%u->%pI4:%u/%u\n",  				      qp->wq.sq.qid, qp->wq.rq.qid,  				      (int)qp->attr.state,  				      qp->wq.sq.flags & T4_SQ_ONCHIP,  				      qp->ep->hwtid, (int)qp->ep->com.state,  				      &lsin->sin_addr, ntohs(lsin->sin_port), -				      &rsin->sin_addr, ntohs(rsin->sin_port)); +				      ntohs(mapped_lsin->sin_port), +				      &rsin->sin_addr, ntohs(rsin->sin_port), +				      ntohs(mapped_rsin->sin_port));  		} else {  			struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *)  				&qp->ep->com.local_addr;  			struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *)  				&qp->ep->com.remote_addr; +			struct sockaddr_in6 *mapped_lsin6 = +				(struct sockaddr_in6 *) +				&qp->ep->com.mapped_local_addr; +			struct sockaddr_in6 *mapped_rsin6 = +				(struct sockaddr_in6 *) +				&qp->ep->com.mapped_remote_addr;  			cc = snprintf(qpd->buf + qpd->pos, space,  				      "rc qp sq id %u rq id %u state %u "  				      "onchip %u ep tid %u state %u " -				      "%pI6:%u->%pI6:%u\n", +				      "%pI6:%u/%u->%pI6:%u/%u\n",  				      qp->wq.sq.qid, qp->wq.rq.qid,  				      (int)qp->attr.state,  				      qp->wq.sq.flags & T4_SQ_ONCHIP,  				      qp->ep->hwtid, (int)qp->ep->com.state,  				      &lsin6->sin6_addr,  				      ntohs(lsin6->sin6_port), +				      ntohs(mapped_lsin6->sin6_port),  				      &rsin6->sin6_addr, -				      ntohs(rsin6->sin6_port)); +				      ntohs(rsin6->sin6_port), +				      ntohs(mapped_rsin6->sin6_port));  		}  	} else  		cc = snprintf(qpd->buf + qpd->pos, space, @@ -282,7 +310,7 @@ static const struct file_operations stag_debugfs_fops = {  	.llseek  = default_llseek,  }; -static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY"}; +static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY", "STOPPED"};  static int stats_show(struct seq_file *seq, void *v)  { @@ -311,9 +339,10 @@ static int stats_show(struct seq_file *seq, void *v)  	seq_printf(seq, "  DB FULL: %10llu\n", dev->rdev.stats.db_full);  	seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty);  	seq_printf(seq, "  DB DROP: %10llu\n", dev->rdev.stats.db_drop); -	seq_printf(seq, " DB State: %s Transitions %llu\n", +	seq_printf(seq, " DB State: %s Transitions %llu FC Interruptions %llu\n",  		   db_state_str[dev->db_state], -		   dev->rdev.stats.db_state_transitions); +		   dev->rdev.stats.db_state_transitions, +		   dev->rdev.stats.db_fc_interruptions);  	seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full);  	seq_printf(seq, "ACT_OFLD_CONN_FAILS: %10llu\n",  		   dev->rdev.stats.act_ofld_conn_fails); @@ -381,31 +410,43 @@ static int dump_ep(int id, void *p, void *data)  			&ep->com.local_addr;  		struct sockaddr_in *rsin = (struct sockaddr_in *)  			&ep->com.remote_addr; +		struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) +			&ep->com.mapped_local_addr; +		struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) +			&ep->com.mapped_remote_addr;  		cc = snprintf(epd->buf + epd->pos, space,  			      "ep %p cm_id %p qp %p state %d flags 0x%lx "  			      "history 0x%lx hwtid %d atid %d " -			      "%pI4:%d <-> %pI4:%d\n", +			      "%pI4:%d/%d <-> %pI4:%d/%d\n",  			      ep, ep->com.cm_id, ep->com.qp,  			      (int)ep->com.state, ep->com.flags,  			      ep->com.history, ep->hwtid, ep->atid,  			      &lsin->sin_addr, ntohs(lsin->sin_port), -			      &rsin->sin_addr, 
ntohs(rsin->sin_port)); +			      ntohs(mapped_lsin->sin_port), +			      &rsin->sin_addr, ntohs(rsin->sin_port), +			      ntohs(mapped_rsin->sin_port));  	} else {  		struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *)  			&ep->com.local_addr;  		struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *)  			&ep->com.remote_addr; +		struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) +			&ep->com.mapped_local_addr; +		struct sockaddr_in6 *mapped_rsin6 = (struct sockaddr_in6 *) +			&ep->com.mapped_remote_addr;  		cc = snprintf(epd->buf + epd->pos, space,  			      "ep %p cm_id %p qp %p state %d flags 0x%lx "  			      "history 0x%lx hwtid %d atid %d " -			      "%pI6:%d <-> %pI6:%d\n", +			      "%pI6:%d/%d <-> %pI6:%d/%d\n",  			      ep, ep->com.cm_id, ep->com.qp,  			      (int)ep->com.state, ep->com.flags,  			      ep->com.history, ep->hwtid, ep->atid,  			      &lsin6->sin6_addr, ntohs(lsin6->sin6_port), -			      &rsin6->sin6_addr, ntohs(rsin6->sin6_port)); +			      ntohs(mapped_lsin6->sin6_port), +			      &rsin6->sin6_addr, ntohs(rsin6->sin6_port), +			      ntohs(mapped_rsin6->sin6_port));  	}  	if (cc < space)  		epd->pos += cc; @@ -426,23 +467,29 @@ static int dump_listen_ep(int id, void *p, void *data)  	if (ep->com.local_addr.ss_family == AF_INET) {  		struct sockaddr_in *lsin = (struct sockaddr_in *)  			&ep->com.local_addr; +		struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) +			&ep->com.mapped_local_addr;  		cc = snprintf(epd->buf + epd->pos, space,  			      "ep %p cm_id %p state %d flags 0x%lx stid %d " -			      "backlog %d %pI4:%d\n", +			      "backlog %d %pI4:%d/%d\n",  			      ep, ep->com.cm_id, (int)ep->com.state,  			      ep->com.flags, ep->stid, ep->backlog, -			      &lsin->sin_addr, ntohs(lsin->sin_port)); +			      &lsin->sin_addr, ntohs(lsin->sin_port), +			      ntohs(mapped_lsin->sin_port));  	} else {  		struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *)  			&ep->com.local_addr; +		struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) +			&ep->com.mapped_local_addr;  		cc = snprintf(epd->buf + epd->pos, space,  			      "ep %p cm_id %p state %d flags 0x%lx stid %d " -			      "backlog %d %pI6:%d\n", +			      "backlog %d %pI6:%d/%d\n",  			      ep, ep->com.cm_id, (int)ep->com.state,  			      ep->com.flags, ep->stid, ep->backlog, -			      &lsin6->sin6_addr, ntohs(lsin6->sin6_port)); +			      &lsin6->sin6_addr, ntohs(lsin6->sin6_port), +			      ntohs(mapped_lsin6->sin6_port));  	}  	if (cc < space)  		epd->pos += cc; @@ -602,10 +649,10 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)  	     rdev->lldi.vr->qp.size,  	     rdev->lldi.vr->cq.start,  	     rdev->lldi.vr->cq.size); -	PDBG("udb len 0x%x udb base %p db_reg %p gts_reg %p qpshift %lu " +	PDBG("udb len 0x%x udb base %llx db_reg %p gts_reg %p qpshift %lu "  	     "qpmask 0x%x cqshift %lu cqmask 0x%x\n",  	     (unsigned)pci_resource_len(rdev->lldi.pdev, 2), -	     (void *)(unsigned long)pci_resource_start(rdev->lldi.pdev, 2), +	     (u64)pci_resource_start(rdev->lldi.pdev, 2),  	     rdev->lldi.db_reg,  	     rdev->lldi.gts_reg,  	     rdev->qpshift, rdev->qpmask, @@ -643,6 +690,13 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)  		printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err);  		goto err4;  	} +	rdev->status_page = (struct t4_dev_status_page *) +			    __get_free_page(GFP_KERNEL); +	if (!rdev->status_page) { +		pr_err(MOD "error allocating status page\n"); +		goto err4; +	} +	rdev->status_page->db_off = 0;  	return 0;  err4:  	
c4iw_rqtpool_destroy(rdev); @@ -656,6 +710,7 @@ err1:  static void c4iw_rdev_close(struct c4iw_rdev *rdev)  { +	free_page((unsigned long)rdev->status_page);  	c4iw_pblpool_destroy(rdev);  	c4iw_rqtpool_destroy(rdev);  	c4iw_destroy_resource(&rdev->resource); @@ -670,7 +725,10 @@ static void c4iw_dealloc(struct uld_ctx *ctx)  	idr_destroy(&ctx->dev->hwtid_idr);  	idr_destroy(&ctx->dev->stid_idr);  	idr_destroy(&ctx->dev->atid_idr); -	iounmap(ctx->dev->rdev.oc_mw_kva); +	if (ctx->dev->rdev.bar2_kva) +		iounmap(ctx->dev->rdev.bar2_kva); +	if (ctx->dev->rdev.oc_mw_kva) +		iounmap(ctx->dev->rdev.oc_mw_kva);  	ib_dealloc_device(&ctx->dev->ibdev);  	ctx->dev = NULL;  } @@ -703,18 +761,6 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)  		pr_info("%s: On-Chip Queues not supported on this device.\n",  			pci_name(infop->pdev)); -	if (!is_t4(infop->adapter_type)) { -		if (!allow_db_fc_on_t5) { -			db_fc_threshold = 100000; -			pr_info("DB Flow Control Disabled.\n"); -		} - -		if (!allow_db_coalescing_on_t5) { -			db_coalescing_threshold = -1; -			pr_info("DB Coalescing Disabled.\n"); -		} -	} -  	devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp));  	if (!devp) {  		printk(KERN_ERR MOD "Cannot allocate ib device\n"); @@ -722,11 +768,33 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)  	}  	devp->rdev.lldi = *infop; -	devp->rdev.oc_mw_pa = pci_resource_start(devp->rdev.lldi.pdev, 2) + -		(pci_resource_len(devp->rdev.lldi.pdev, 2) - -		 roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size)); -	devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa, -					       devp->rdev.lldi.vr->ocq.size); +	/* +	 * For T5 devices, we map all of BAR2 with WC. +	 * For T4 devices with onchip qp mem, we map only that part +	 * of BAR2 with WC. 
+	 */ +	devp->rdev.bar2_pa = pci_resource_start(devp->rdev.lldi.pdev, 2); +	if (is_t5(devp->rdev.lldi.adapter_type)) { +		devp->rdev.bar2_kva = ioremap_wc(devp->rdev.bar2_pa, +			pci_resource_len(devp->rdev.lldi.pdev, 2)); +		if (!devp->rdev.bar2_kva) { +			pr_err(MOD "Unable to ioremap BAR2\n"); +			ib_dealloc_device(&devp->ibdev); +			return ERR_PTR(-EINVAL); +		} +	} else if (ocqp_supported(infop)) { +		devp->rdev.oc_mw_pa = +			pci_resource_start(devp->rdev.lldi.pdev, 2) + +			pci_resource_len(devp->rdev.lldi.pdev, 2) - +			roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size); +		devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa, +			devp->rdev.lldi.vr->ocq.size); +		if (!devp->rdev.oc_mw_kva) { +			pr_err(MOD "Unable to ioremap onchip mem\n"); +			ib_dealloc_device(&devp->ibdev); +			return ERR_PTR(-EINVAL); +		} +	}  	PDBG(KERN_INFO MOD "ocq memory: "  	       "hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n", @@ -749,6 +817,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)  	spin_lock_init(&devp->lock);  	mutex_init(&devp->rdev.stats.lock);  	mutex_init(&devp->db_mutex); +	INIT_LIST_HEAD(&devp->db_fc_list);  	if (c4iw_debugfs_root) {  		devp->debugfs_root = debugfs_create_dir( @@ -756,6 +825,8 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)  					c4iw_debugfs_root);  		setup_debugfs(devp);  	} + +  	return devp;  } @@ -897,11 +968,13 @@ static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp,  	}  	opcode = *(u8 *)rsp; -	if (c4iw_handlers[opcode]) +	if (c4iw_handlers[opcode]) {  		c4iw_handlers[opcode](dev, skb); -	else +	} else {  		pr_info("%s no handler opcode 0x%x...\n", __func__,  		       opcode); +		kfree_skb(skb); +	}  	return 0;  nomem: @@ -977,13 +1050,16 @@ static int disable_qp_db(int id, void *p, void *data)  static void stop_queues(struct uld_ctx *ctx)  { -	spin_lock_irq(&ctx->dev->lock); -	if (ctx->dev->db_state == NORMAL) { -		ctx->dev->rdev.stats.db_state_transitions++; -		ctx->dev->db_state = FLOW_CONTROL; +	unsigned long flags; + +	spin_lock_irqsave(&ctx->dev->lock, flags); +	ctx->dev->rdev.stats.db_state_transitions++; +	ctx->dev->db_state = STOPPED; +	if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED)  		idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL); -	} -	spin_unlock_irq(&ctx->dev->lock); +	else +		ctx->dev->rdev.status_page->db_off = 1; +	spin_unlock_irqrestore(&ctx->dev->lock, flags);  }  static int enable_qp_db(int id, void *p, void *data) @@ -994,15 +1070,72 @@ static int enable_qp_db(int id, void *p, void *data)  	return 0;  } +static void resume_rc_qp(struct c4iw_qp *qp) +{ +	spin_lock(&qp->lock); +	t4_ring_sq_db(&qp->wq, qp->wq.sq.wq_pidx_inc, +		      is_t5(qp->rhp->rdev.lldi.adapter_type), NULL); +	qp->wq.sq.wq_pidx_inc = 0; +	t4_ring_rq_db(&qp->wq, qp->wq.rq.wq_pidx_inc, +		      is_t5(qp->rhp->rdev.lldi.adapter_type), NULL); +	qp->wq.rq.wq_pidx_inc = 0; +	spin_unlock(&qp->lock); +} + +static void resume_a_chunk(struct uld_ctx *ctx) +{ +	int i; +	struct c4iw_qp *qp; + +	for (i = 0; i < DB_FC_RESUME_SIZE; i++) { +		qp = list_first_entry(&ctx->dev->db_fc_list, struct c4iw_qp, +				      db_fc_entry); +		list_del_init(&qp->db_fc_entry); +		resume_rc_qp(qp); +		if (list_empty(&ctx->dev->db_fc_list)) +			break; +	} +} +  static void resume_queues(struct uld_ctx *ctx)  {  	spin_lock_irq(&ctx->dev->lock); -	if (ctx->dev->qpcnt <= db_fc_threshold && -	    ctx->dev->db_state == FLOW_CONTROL) { -		ctx->dev->db_state = NORMAL; -		ctx->dev->rdev.stats.db_state_transitions++; -		
idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL); +	if (ctx->dev->db_state != STOPPED) +		goto out; +	ctx->dev->db_state = FLOW_CONTROL; +	while (1) { +		if (list_empty(&ctx->dev->db_fc_list)) { +			WARN_ON(ctx->dev->db_state != FLOW_CONTROL); +			ctx->dev->db_state = NORMAL; +			ctx->dev->rdev.stats.db_state_transitions++; +			if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) { +				idr_for_each(&ctx->dev->qpidr, enable_qp_db, +					     NULL); +			} else { +				ctx->dev->rdev.status_page->db_off = 0; +			} +			break; +		} else { +			if (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) +			    < (ctx->dev->rdev.lldi.dbfifo_int_thresh << +			       DB_FC_DRAIN_THRESH)) { +				resume_a_chunk(ctx); +			} +			if (!list_empty(&ctx->dev->db_fc_list)) { +				spin_unlock_irq(&ctx->dev->lock); +				if (DB_FC_RESUME_DELAY) { +					set_current_state(TASK_UNINTERRUPTIBLE); +					schedule_timeout(DB_FC_RESUME_DELAY); +				} +				spin_lock_irq(&ctx->dev->lock); +				if (ctx->dev->db_state != FLOW_CONTROL) +					break; +			} +		}  	} +out: +	if (ctx->dev->db_state != NORMAL) +		ctx->dev->rdev.stats.db_fc_interruptions++;  	spin_unlock_irq(&ctx->dev->lock);  } @@ -1028,12 +1161,12 @@ static int count_qps(int id, void *p, void *data)  	return 0;  } -static void deref_qps(struct qp_list qp_list) +static void deref_qps(struct qp_list *qp_list)  {  	int idx; -	for (idx = 0; idx < qp_list.idx; idx++) -		c4iw_qp_rem_ref(&qp_list.qps[idx]->ibqp); +	for (idx = 0; idx < qp_list->idx; idx++) +		c4iw_qp_rem_ref(&qp_list->qps[idx]->ibqp);  }  static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list) @@ -1044,17 +1177,22 @@ static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list)  	for (idx = 0; idx < qp_list->idx; idx++) {  		struct c4iw_qp *qp = qp_list->qps[idx]; +		spin_lock_irq(&qp->rhp->lock); +		spin_lock(&qp->lock);  		ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],  					  qp->wq.sq.qid,  					  t4_sq_host_wq_pidx(&qp->wq),  					  t4_sq_wq_size(&qp->wq));  		if (ret) { -			printk(KERN_ERR MOD "%s: Fatal error - " +			pr_err(KERN_ERR MOD "%s: Fatal error - "  			       "DB overflow recovery failed - "  			       "error syncing SQ qid %u\n",  			       pci_name(ctx->lldi.pdev), qp->wq.sq.qid); +			spin_unlock(&qp->lock); +			spin_unlock_irq(&qp->rhp->lock);  			return;  		} +		qp->wq.sq.wq_pidx_inc = 0;  		ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],  					  qp->wq.rq.qid, @@ -1062,12 +1200,17 @@ static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list)  					  t4_rq_wq_size(&qp->wq));  		if (ret) { -			printk(KERN_ERR MOD "%s: Fatal error - " +			pr_err(KERN_ERR MOD "%s: Fatal error - "  			       "DB overflow recovery failed - "  			       "error syncing RQ qid %u\n",  			       pci_name(ctx->lldi.pdev), qp->wq.rq.qid); +			spin_unlock(&qp->lock); +			spin_unlock_irq(&qp->rhp->lock);  			return;  		} +		qp->wq.rq.wq_pidx_inc = 0; +		spin_unlock(&qp->lock); +		spin_unlock_irq(&qp->rhp->lock);  		/* Wait for the dbfifo to drain */  		while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) { @@ -1083,36 +1226,22 @@ static void recover_queues(struct uld_ctx *ctx)  	struct qp_list qp_list;  	int ret; -	/* lock out kernel db ringers */ -	mutex_lock(&ctx->dev->db_mutex); - -	/* put all queues in to recovery mode */ -	spin_lock_irq(&ctx->dev->lock); -	ctx->dev->db_state = RECOVERY; -	ctx->dev->rdev.stats.db_state_transitions++; -	idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL); -	spin_unlock_irq(&ctx->dev->lock); -  	/* slow 
everybody down */  	set_current_state(TASK_UNINTERRUPTIBLE);  	schedule_timeout(usecs_to_jiffies(1000)); -	/* Wait for the dbfifo to completely drain. */ -	while (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) > 0) { -		set_current_state(TASK_UNINTERRUPTIBLE); -		schedule_timeout(usecs_to_jiffies(10)); -	} -  	/* flush the SGE contexts */  	ret = cxgb4_flush_eq_cache(ctx->dev->rdev.lldi.ports[0]);  	if (ret) {  		printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n",  		       pci_name(ctx->lldi.pdev)); -		goto out; +		return;  	}  	/* Count active queues so we can build a list of queues to recover */  	spin_lock_irq(&ctx->dev->lock); +	WARN_ON(ctx->dev->db_state != STOPPED); +	ctx->dev->db_state = RECOVERY;  	idr_for_each(&ctx->dev->qpidr, count_qps, &count);  	qp_list.qps = kzalloc(count * sizeof *qp_list.qps, GFP_ATOMIC); @@ -1120,7 +1249,7 @@ static void recover_queues(struct uld_ctx *ctx)  		printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n",  		       pci_name(ctx->lldi.pdev));  		spin_unlock_irq(&ctx->dev->lock); -		goto out; +		return;  	}  	qp_list.idx = 0; @@ -1133,29 +1262,13 @@ static void recover_queues(struct uld_ctx *ctx)  	recover_lost_dbs(ctx, &qp_list);  	/* we're almost done!  deref the qps and clean up */ -	deref_qps(qp_list); +	deref_qps(&qp_list);  	kfree(qp_list.qps); -	/* Wait for the dbfifo to completely drain again */ -	while (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) > 0) { -		set_current_state(TASK_UNINTERRUPTIBLE); -		schedule_timeout(usecs_to_jiffies(10)); -	} - -	/* resume the queues */  	spin_lock_irq(&ctx->dev->lock); -	if (ctx->dev->qpcnt > db_fc_threshold) -		ctx->dev->db_state = FLOW_CONTROL; -	else { -		ctx->dev->db_state = NORMAL; -		idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL); -	} -	ctx->dev->rdev.stats.db_state_transitions++; +	WARN_ON(ctx->dev->db_state != RECOVERY); +	ctx->dev->db_state = STOPPED;  	spin_unlock_irq(&ctx->dev->lock); - -out: -	/* start up kernel db ringers again */ -	mutex_unlock(&ctx->dev->db_mutex);  }  static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...) @@ -1165,9 +1278,7 @@ static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...)  	
switch (control) {  	case CXGB4_CONTROL_DB_FULL:  		stop_queues(ctx); -		mutex_lock(&ctx->dev->rdev.stats.lock);  		ctx->dev->rdev.stats.db_full++; -		mutex_unlock(&ctx->dev->rdev.stats.lock);  		break;  	case CXGB4_CONTROL_DB_EMPTY:  		resume_queues(ctx); @@ -1210,6 +1321,20 @@ static int __init c4iw_init_module(void)  		printk(KERN_WARNING MOD  		       "could not create debugfs entry, continuing\n"); +	if (ibnl_add_client(RDMA_NL_C4IW, RDMA_NL_IWPM_NUM_OPS, +			    c4iw_nl_cb_table)) +		pr_err("%s[%u]: Failed to add netlink callback\n" +		       , __func__, __LINE__); + +	err = iwpm_init(RDMA_NL_C4IW); +	if (err) { +		pr_err("port mapper initialization failed with %d\n", err); +		ibnl_remove_client(RDMA_NL_C4IW); +		c4iw_cm_term(); +		debugfs_remove_recursive(c4iw_debugfs_root); +		return err; +	} +  	cxgb4_register_uld(CXGB4_ULD_RDMA, &c4iw_uld_info);  	return 0; @@ -1227,6 +1352,8 @@ static void __exit c4iw_exit_module(void)  	}  	mutex_unlock(&dev_mutex);  	cxgb4_unregister_uld(CXGB4_ULD_RDMA); +	iwpm_exit(RDMA_NL_C4IW); +	ibnl_remove_client(RDMA_NL_C4IW);  	c4iw_cm_term();  	debugfs_remove_recursive(c4iw_debugfs_root);  } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 23eaeabab93..361fff7a074 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -52,6 +52,8 @@  #include <rdma/ib_verbs.h>  #include <rdma/iw_cm.h> +#include <rdma/rdma_netlink.h> +#include <rdma/iw_portmap.h>  #include "cxgb4.h"  #include "cxgb4_uld.h" @@ -109,6 +111,7 @@ struct c4iw_dev_ucontext {  enum c4iw_rdev_flags {  	T4_FATAL_ERROR = (1<<0), +	T4_STATUS_PAGE_DISABLED = (1<<1),  };  struct c4iw_stat { @@ -130,6 +133,7 @@ struct c4iw_stats {  	u64  db_empty;  	u64  db_drop;  	u64  db_state_transitions; +	u64  db_fc_interruptions;  	u64  tcam_full;  	u64  act_ofld_conn_fails;  	u64  pas_ofld_conn_fails; @@ -147,9 +151,12 @@ struct c4iw_rdev {  	struct gen_pool *ocqp_pool;  	u32 flags;  	struct cxgb4_lld_info lldi; +	unsigned long bar2_pa; +	void __iomem *bar2_kva;  	unsigned long oc_mw_pa;  	void __iomem *oc_mw_kva;  	struct c4iw_stats stats; +	struct t4_dev_status_page *status_page;  };  static inline int c4iw_fatal_error(struct c4iw_rdev *rdev) @@ -211,7 +218,8 @@ static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev,  enum db_state {  	NORMAL = 0,  	FLOW_CONTROL = 1, -	RECOVERY = 2 +	RECOVERY = 2, +	STOPPED = 3  };  struct c4iw_dev { @@ -225,10 +233,10 @@ struct c4iw_dev {  	struct mutex db_mutex;  	struct dentry *debugfs_root;  	enum db_state db_state; -	int qpcnt;  	struct idr hwtid_idr;  	struct idr atid_idr;  	struct idr stid_idr; +	struct list_head db_fc_list;  };  static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev) @@ -369,6 +377,7 @@ struct c4iw_fr_page_list {  	DEFINE_DMA_UNMAP_ADDR(mapping);  	dma_addr_t dma_addr;  	struct c4iw_dev *dev; +	int pll_len;  };  static inline struct c4iw_fr_page_list *to_c4iw_fr_page_list( @@ -428,10 +437,12 @@ struct c4iw_qp_attributes {  	u8 ecode;  	u16 sq_db_inc;  	u16 rq_db_inc; +	u8 send_term;  };  struct c4iw_qp {  	struct ib_qp ibqp; +	struct list_head db_fc_entry;  	struct c4iw_dev *rhp;  	struct c4iw_ep *ep;  	struct c4iw_qp_attributes attr; @@ -441,6 +452,7 @@ struct c4iw_qp {  	atomic_t refcnt;  	wait_queue_head_t wait;  	struct timer_list timer; +	int sq_sig_all;  };  static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) @@ -718,6 +730,7 @@ enum c4iw_ep_flags {  	CLOSE_SENT		= 3,  	TIMEOUT                 = 4,  	QP_REFERENCED           
= 5, +	RELEASE_MAPINFO		= 6,  };  enum c4iw_ep_history { @@ -754,6 +767,8 @@ struct c4iw_ep_common {  	struct mutex mutex;  	struct sockaddr_storage local_addr;  	struct sockaddr_storage remote_addr; +	struct sockaddr_storage mapped_local_addr; +	struct sockaddr_storage mapped_remote_addr;  	struct c4iw_wr_wait wr_wait;  	unsigned long flags;  	unsigned long history; @@ -795,7 +810,48 @@ struct c4iw_ep {  	u8 retry_with_mpa_v1;  	u8 tried_with_mpa_v1;  	unsigned int retry_count; -}; +	int snd_win; +	int rcv_win; +}; + +static inline void print_addr(struct c4iw_ep_common *epc, const char *func, +			      const char *msg) +{ + +#define SINA(a) (&(((struct sockaddr_in *)(a))->sin_addr.s_addr)) +#define SINP(a) ntohs(((struct sockaddr_in *)(a))->sin_port) +#define SIN6A(a) (&(((struct sockaddr_in6 *)(a))->sin6_addr)) +#define SIN6P(a) ntohs(((struct sockaddr_in6 *)(a))->sin6_port) + +	if (c4iw_debug) { +		switch (epc->local_addr.ss_family) { +		case AF_INET: +			PDBG("%s %s %pI4:%u/%u <-> %pI4:%u/%u\n", +			     func, msg, SINA(&epc->local_addr), +			     SINP(&epc->local_addr), +			     SINP(&epc->mapped_local_addr), +			     SINA(&epc->remote_addr), +			     SINP(&epc->remote_addr), +			     SINP(&epc->mapped_remote_addr)); +			break; +		case AF_INET6: +			PDBG("%s %s %pI6:%u/%u <-> %pI6:%u/%u\n", +			     func, msg, SIN6A(&epc->local_addr), +			     SIN6P(&epc->local_addr), +			     SIN6P(&epc->mapped_local_addr), +			     SIN6A(&epc->remote_addr), +			     SIN6P(&epc->remote_addr), +			     SIN6P(&epc->mapped_remote_addr)); +			break; +		default: +			break; +		} +	} +#undef SINA +#undef SINP +#undef SIN6A +#undef SIN6P +}  static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id)  { @@ -852,7 +908,7 @@ int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev);  int c4iw_register_device(struct c4iw_dev *dev);  void c4iw_unregister_device(struct c4iw_dev *dev);  int __init c4iw_cm_init(void); -void __exit c4iw_cm_term(void); +void c4iw_cm_term(void);  void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,  			       struct c4iw_dev_ucontext *uctx);  void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 4cb8eb24497..ec7a2988a70 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -37,9 +37,9 @@  #include "iw_cxgb4.h" -int use_dsgl = 1; +int use_dsgl = 0;  module_param(use_dsgl, int, 0644); -MODULE_PARM_DESC(use_dsgl, "Use DSGL for PBL/FastReg (default=1)"); +MODULE_PARM_DESC(use_dsgl, "Use DSGL for PBL/FastReg (default=0)");  #define T4_ULPTX_MIN_IO 32  #define C4IW_MAX_INLINE_SIZE 96 @@ -76,7 +76,7 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,  	INIT_ULPTX_WR(req, wr_len, 0, 0);  	req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR) |  			(wait ? FW_WR_COMPL(1) : 0)); -	req->wr.wr_lo = wait ? (__force __be64)&wr_wait : 0; +	req->wr.wr_lo = wait ? 
(__force __be64)(unsigned long) &wr_wait : 0L;  	req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16(DIV_ROUND_UP(wr_len, 16)));  	req->cmd = cpu_to_be32(ULPTX_CMD(ULP_TX_MEM_WRITE));  	req->cmd |= cpu_to_be32(V_T5_ULP_MEMIO_ORDER(1)); @@ -173,7 +173,7 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,  	return ret;  } -int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data) +static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data)  {  	u32 remain = len;  	u32 dmalen; @@ -259,8 +259,12 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,  	if ((!reset_tpt_entry) && (*stag == T4_STAG_UNSET)) {  		stag_idx = c4iw_get_resource(&rdev->resource.tpt_table); -		if (!stag_idx) +		if (!stag_idx) { +			mutex_lock(&rdev->stats.lock); +			rdev->stats.stag.fail++; +			mutex_unlock(&rdev->stats.lock);  			return -ENOMEM; +		}  		mutex_lock(&rdev->stats.lock);  		rdev->stats.stag.cur += 32;  		if (rdev->stats.stag.cur > rdev->stats.stag.max) @@ -678,9 +682,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  {  	__be64 *pages;  	int shift, n, len; -	int i, j, k; +	int i, k, entry;  	int err = 0; -	struct ib_umem_chunk *chunk; +	struct scatterlist *sg;  	struct c4iw_dev *rhp;  	struct c4iw_pd *php;  	struct c4iw_mr *mhp; @@ -710,10 +714,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	shift = ffs(mhp->umem->page_size) - 1; -	n = 0; -	list_for_each_entry(chunk, &mhp->umem->chunk_list, list) -		n += chunk->nents; - +	n = mhp->umem->nmap;  	err = alloc_pbl(mhp, n);  	if (err)  		goto err; @@ -726,24 +727,22 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	i = n = 0; -	list_for_each_entry(chunk, &mhp->umem->chunk_list, list) -		for (j = 0; j < chunk->nmap; ++j) { -			len = sg_dma_len(&chunk->page_list[j]) >> shift; -			for (k = 0; k < len; ++k) { -				pages[i++] = cpu_to_be64(sg_dma_address( -					&chunk->page_list[j]) + -					mhp->umem->page_size * k); -				if (i == PAGE_SIZE / sizeof *pages) { -					err = write_pbl(&mhp->rhp->rdev, -					      pages, -					      mhp->attr.pbl_addr + (n << 3), i); -					if (err) -						goto pbl_done; -					n += i; -					i = 0; -				} +	for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { +		len = sg_dma_len(sg) >> shift; +		for (k = 0; k < len; ++k) { +			pages[i++] = cpu_to_be64(sg_dma_address(sg) + +				mhp->umem->page_size * k); +			if (i == PAGE_SIZE / sizeof *pages) { +				err = write_pbl(&mhp->rhp->rdev, +				      pages, +				      mhp->attr.pbl_addr + (n << 3), i); +				if (err) +					goto pbl_done; +				n += i; +				i = 0;  			}  		} +	}  	if (i)  		err = write_pbl(&mhp->rhp->rdev, pages, @@ -903,7 +902,11 @@ struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(struct ib_device *device,  	dma_unmap_addr_set(c4pl, mapping, dma_addr);  	c4pl->dma_addr = dma_addr;  	c4pl->dev = dev; -	c4pl->ibpl.max_page_list_len = pll_len; +	c4pl->pll_len = pll_len; + +	PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n", +	     __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list, +	     &c4pl->dma_addr);  	return &c4pl->ibpl;  } @@ -912,8 +915,12 @@ void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *ibpl)  {  	struct c4iw_fr_page_list *c4pl = to_c4iw_fr_page_list(ibpl); +	PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n", +	     __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list, +	     &c4pl->dma_addr); +  	dma_free_coherent(&c4pl->dev->rdev.lldi.pdev->dev, -			  
c4pl->ibpl.max_page_list_len, +			  c4pl->pll_len,  			  c4pl->ibpl.page_list, dma_unmap_addr(c4pl, mapping));  	kfree(c4pl);  } diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 7e94c9a656a..b1d305338de 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -106,15 +106,57 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,  {  	struct c4iw_ucontext *context;  	struct c4iw_dev *rhp = to_c4iw_dev(ibdev); +	static int warned; +	struct c4iw_alloc_ucontext_resp uresp; +	int ret = 0; +	struct c4iw_mm_entry *mm = NULL;  	PDBG("%s ibdev %p\n", __func__, ibdev);  	context = kzalloc(sizeof(*context), GFP_KERNEL); -	if (!context) -		return ERR_PTR(-ENOMEM); +	if (!context) { +		ret = -ENOMEM; +		goto err; +	} +  	c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx);  	INIT_LIST_HEAD(&context->mmaps);  	spin_lock_init(&context->mmap_lock); + +	if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) { +		if (!warned++) +			pr_err(MOD "Warning - downlevel libcxgb4 (non-fatal), device status page disabled."); +		rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED; +	} else { +		mm = kmalloc(sizeof(*mm), GFP_KERNEL); +		if (!mm) { +			ret = -ENOMEM; +			goto err_free; +		} + +		uresp.status_page_size = PAGE_SIZE; + +		spin_lock(&context->mmap_lock); +		uresp.status_page_key = context->key; +		context->key += PAGE_SIZE; +		spin_unlock(&context->mmap_lock); + +		ret = ib_copy_to_udata(udata, &uresp, +				       sizeof(uresp) - sizeof(uresp.reserved)); +		if (ret) +			goto err_mm; + +		mm->key = uresp.status_page_key; +		mm->addr = virt_to_phys(rhp->rdev.status_page); +		mm->len = PAGE_SIZE; +		insert_mmap(context, mm); +	}  	return &context->ibucontext; +err_mm: +	kfree(mm); +err_free: +	kfree(context); +err: +	return ERR_PTR(ret);  }  static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) @@ -287,7 +329,7 @@ static int c4iw_query_device(struct ib_device *ibdev,  	props->max_mr = c4iw_num_stags(&dev->rdev);  	props->max_pd = T4_MAX_NUM_PD;  	props->local_ca_ack_delay = 0; -	props->max_fast_reg_page_list_len = T4_MAX_FR_DEPTH; +	props->max_fast_reg_page_list_len = t4_max_fr_depth(use_dsgl);  	return 0;  } @@ -458,7 +500,7 @@ int c4iw_register_device(struct c4iw_dev *dev)  	dev->ibdev.node_type = RDMA_NODE_RNIC;  	memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC));  	dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports; -	dev->ibdev.num_comp_vectors = 1; +	dev->ibdev.num_comp_vectors =  dev->rdev.lldi.nciq;  	dev->ibdev.dma_device = &(dev->rdev.lldi.pdev->dev);  	dev->ibdev.query_device = c4iw_query_device;  	dev->ibdev.query_port = c4iw_query_port; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 582936708e6..086f62f5dc9 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -212,13 +212,23 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,  	wq->db = rdev->lldi.db_reg;  	wq->gts = rdev->lldi.gts_reg; -	if (user) { -		wq->sq.udb = (u64)pci_resource_start(rdev->lldi.pdev, 2) + -					(wq->sq.qid << rdev->qpshift); -		wq->sq.udb &= PAGE_MASK; -		wq->rq.udb = (u64)pci_resource_start(rdev->lldi.pdev, 2) + -					(wq->rq.qid << rdev->qpshift); -		wq->rq.udb &= PAGE_MASK; +	if (user || is_t5(rdev->lldi.adapter_type)) { +		u32 off; + +		off = (wq->sq.qid << rdev->qpshift) & PAGE_MASK; +		if (user) { +			wq->sq.udb = (u64 __iomem *)(rdev->bar2_pa + off); +		} else { +			off += 128 * 
(wq->sq.qid & rdev->qpmask) + 8; +			wq->sq.udb = (u64 __iomem *)(rdev->bar2_kva + off); +		} +		off = (wq->rq.qid << rdev->qpshift) & PAGE_MASK; +		if (user) { +			wq->rq.udb = (u64 __iomem *)(rdev->bar2_pa + off); +		} else { +			off += 128 * (wq->rq.qid & rdev->qpmask) + 8; +			wq->rq.udb = (u64 __iomem *)(rdev->bar2_kva + off); +		}  	}  	wq->rdev = rdev;  	wq->rq.msn = 1; @@ -299,9 +309,10 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,  	if (ret)  		goto free_dma; -	PDBG("%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%llx rqudb 0x%llx\n", +	PDBG("%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%lx rqudb 0x%lx\n",  	     __func__, wq->sq.qid, wq->rq.qid, wq->db, -	     (unsigned long long)wq->sq.udb, (unsigned long long)wq->rq.udb); +	     (__force unsigned long) wq->sq.udb, +	     (__force unsigned long) wq->rq.udb);  	return 0;  free_dma: @@ -425,6 +436,8 @@ static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe,  	default:  		return -EINVAL;  	} +	wqe->send.r3 = 0; +	wqe->send.r4 = 0;  	plen = 0;  	if (wr->num_sge) { @@ -555,7 +568,8 @@ static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe,  	int pbllen = roundup(wr->wr.fast_reg.page_list_len * sizeof(u64), 32);  	int rem; -	if (wr->wr.fast_reg.page_list_len > T4_MAX_FR_DEPTH) +	if (wr->wr.fast_reg.page_list_len > +	    t4_max_fr_depth(use_dsgl))  		return -EINVAL;  	wqe->fr.qpbinde_to_dcacpu = 0; @@ -638,6 +652,48 @@ void c4iw_qp_rem_ref(struct ib_qp *qp)  		wake_up(&(to_c4iw_qp(qp)->wait));  } +static void add_to_fc_list(struct list_head *head, struct list_head *entry) +{ +	if (list_empty(entry)) +		list_add_tail(entry, head); +} + +static int ring_kernel_sq_db(struct c4iw_qp *qhp, u16 inc) +{ +	unsigned long flags; + +	spin_lock_irqsave(&qhp->rhp->lock, flags); +	spin_lock(&qhp->lock); +	if (qhp->rhp->db_state == NORMAL) +		t4_ring_sq_db(&qhp->wq, inc, +			      is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL); +	else { +		add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); +		qhp->wq.sq.wq_pidx_inc += inc; +	} +	spin_unlock(&qhp->lock); +	spin_unlock_irqrestore(&qhp->rhp->lock, flags); +	return 0; +} + +static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc) +{ +	unsigned long flags; + +	spin_lock_irqsave(&qhp->rhp->lock, flags); +	spin_lock(&qhp->lock); +	if (qhp->rhp->db_state == NORMAL) +		t4_ring_rq_db(&qhp->wq, inc, +			      is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL); +	else { +		add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); +		qhp->wq.rq.wq_pidx_inc += inc; +	} +	spin_unlock(&qhp->lock); +	spin_unlock_irqrestore(&qhp->rhp->lock, flags); +	return 0; +} +  int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  		   struct ib_send_wr **bad_wr)  { @@ -646,7 +702,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  	enum fw_wr_opcodes fw_opcode = 0;  	enum fw_ri_wr_flags fw_flags;  	struct c4iw_qp *qhp; -	union t4_wr *wqe; +	union t4_wr *wqe = NULL;  	u32 num_wrs;  	struct t4_swsqe *swsqe;  	unsigned long flag; @@ -675,7 +731,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  		fw_flags = 0;  		if (wr->send_flags & IB_SEND_SOLICITED)  			fw_flags |= FW_RI_SOLICITED_EVENT_FLAG; -		if (wr->send_flags & IB_SEND_SIGNALED) +		if (wr->send_flags & IB_SEND_SIGNALED || qhp->sq_sig_all)  			fw_flags |= FW_RI_COMPLETION_FLAG;  		swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];  		switch (wr->opcode) { @@ -736,7 +792,8 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  		}  		swsqe->idx = qhp->wq.sq.pidx;  		swsqe->complete = 0; -		swsqe->signaled 
= (wr->send_flags & IB_SEND_SIGNALED); +		swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED) || +				  qhp->sq_sig_all;  		swsqe->flushed = 0;  		swsqe->wr_id = wr->wr_id; @@ -750,9 +807,14 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  		t4_sq_produce(&qhp->wq, len16);  		idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);  	} -	if (t4_wq_db_enabled(&qhp->wq)) -		t4_ring_sq_db(&qhp->wq, idx); -	spin_unlock_irqrestore(&qhp->lock, flag); +	if (!qhp->rhp->rdev.status_page->db_off) { +		t4_ring_sq_db(&qhp->wq, idx, +			      is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe); +		spin_unlock_irqrestore(&qhp->lock, flag); +	} else { +		spin_unlock_irqrestore(&qhp->lock, flag); +		ring_kernel_sq_db(qhp, idx); +	}  	return err;  } @@ -761,7 +823,7 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,  {  	int err = 0;  	struct c4iw_qp *qhp; -	union t4_recv_wr *wqe; +	union t4_recv_wr *wqe = NULL;  	u32 num_wrs;  	u8 len16 = 0;  	unsigned long flag; @@ -812,9 +874,14 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,  		wr = wr->next;  		num_wrs--;  	} -	if (t4_wq_db_enabled(&qhp->wq)) -		t4_ring_rq_db(&qhp->wq, idx); -	spin_unlock_irqrestore(&qhp->lock, flag); +	if (!qhp->rhp->rdev.status_page->db_off) { +		t4_ring_rq_db(&qhp->wq, idx, +			      is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe); +		spin_unlock_irqrestore(&qhp->lock, flag); +	} else { +		spin_unlock_irqrestore(&qhp->lock, flag); +		ring_kernel_rq_db(qhp, idx); +	}  	return err;  } @@ -1200,35 +1267,6 @@ out:  	return ret;  } -/* - * Called by the library when the qp has user dbs disabled due to - * a DB_FULL condition.  This function will single-thread all user - * DB rings to avoid overflowing the hw db-fifo. - */ -static int ring_kernel_db(struct c4iw_qp *qhp, u32 qid, u16 inc) -{ -	int delay = db_delay_usecs; - -	mutex_lock(&qhp->rhp->db_mutex); -	do { - -		/* -		 * The interrupt threshold is dbfifo_int_thresh << 6. So -		 * make sure we don't cross that and generate an interrupt. 
-		 */ -		if (cxgb4_dbfifo_count(qhp->rhp->rdev.lldi.ports[0], 1) < -		    (qhp->rhp->rdev.lldi.dbfifo_int_thresh << 5)) { -			writel(QID(qid) | PIDX(inc), qhp->wq.db); -			break; -		} -		set_current_state(TASK_UNINTERRUPTIBLE); -		schedule_timeout(usecs_to_jiffies(delay)); -		delay = min(delay << 1, 2000); -	} while (1); -	mutex_unlock(&qhp->rhp->db_mutex); -	return 0; -} -  int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,  		   enum c4iw_qp_attr_mask mask,  		   struct c4iw_qp_attributes *attrs, @@ -1278,11 +1316,11 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,  	}  	if (mask & C4IW_QP_ATTR_SQ_DB) { -		ret = ring_kernel_db(qhp, qhp->wq.sq.qid, attrs->sq_db_inc); +		ret = ring_kernel_sq_db(qhp, attrs->sq_db_inc);  		goto out;  	}  	if (mask & C4IW_QP_ATTR_RQ_DB) { -		ret = ring_kernel_db(qhp, qhp->wq.rq.qid, attrs->rq_db_inc); +		ret = ring_kernel_rq_db(qhp, attrs->rq_db_inc);  		goto out;  	} @@ -1332,6 +1370,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,  		switch (attrs->next_state) {  		case C4IW_QP_STATE_CLOSING:  			BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); +			t4_set_wq_in_error(&qhp->wq);  			set_state(qhp, C4IW_QP_STATE_CLOSING);  			ep = qhp->ep;  			if (!internal) { @@ -1339,30 +1378,30 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,  				disconnect = 1;  				c4iw_get_ep(&qhp->ep->com);  			} -			t4_set_wq_in_error(&qhp->wq);  			ret = rdma_fini(rhp, qhp, ep);  			if (ret)  				goto err;  			break;  		case C4IW_QP_STATE_TERMINATE: +			t4_set_wq_in_error(&qhp->wq);  			set_state(qhp, C4IW_QP_STATE_TERMINATE);  			qhp->attr.layer_etype = attrs->layer_etype;  			qhp->attr.ecode = attrs->ecode; -			t4_set_wq_in_error(&qhp->wq);  			ep = qhp->ep; -			disconnect = 1; -			if (!internal) +			if (!internal) { +				c4iw_get_ep(&qhp->ep->com);  				terminate = 1; -			else { +				disconnect = 1; +			} else { +				terminate = qhp->attr.send_term;  				ret = rdma_fini(rhp, qhp, ep);  				if (ret)  					goto err;  			} -			c4iw_get_ep(&qhp->ep->com);  			break;  		case C4IW_QP_STATE_ERROR: -			set_state(qhp, C4IW_QP_STATE_ERROR);  			t4_set_wq_in_error(&qhp->wq); +			set_state(qhp, C4IW_QP_STATE_ERROR);  			if (!internal) {  				abort = 1;  				disconnect = 1; @@ -1465,14 +1504,6 @@ out:  	return ret;  } -static int enable_qp_db(int id, void *p, void *data) -{ -	struct c4iw_qp *qp = p; - -	t4_enable_wq_db(&qp->wq); -	return 0; -} -  int c4iw_destroy_qp(struct ib_qp *ib_qp)  {  	struct c4iw_dev *rhp; @@ -1490,22 +1521,15 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)  		c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0);  	wait_event(qhp->wait, !qhp->ep); -	spin_lock_irq(&rhp->lock); -	remove_handle_nolock(rhp, &rhp->qpidr, qhp->wq.sq.qid); -	rhp->qpcnt--; -	BUG_ON(rhp->qpcnt < 0); -	if (rhp->qpcnt <= db_fc_threshold && rhp->db_state == FLOW_CONTROL) { -		rhp->rdev.stats.db_state_transitions++; -		rhp->db_state = NORMAL; -		idr_for_each(&rhp->qpidr, enable_qp_db, NULL); -	} -	if (db_coalescing_threshold >= 0) -		if (rhp->qpcnt <= db_coalescing_threshold) -			cxgb4_enable_db_coalescing(rhp->rdev.lldi.ports[0]); -	spin_unlock_irq(&rhp->lock); +	remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);  	atomic_dec(&qhp->refcnt);  	wait_event(qhp->wait, !atomic_read(&qhp->refcnt)); +	spin_lock_irq(&rhp->lock); +	if (!list_empty(&qhp->db_fc_entry)) +		list_del_init(&qhp->db_fc_entry); +	spin_unlock_irq(&rhp->lock); +  	ucontext = ib_qp->uobject ?  		   
to_c4iw_ucontext(ib_qp->uobject->context) : NULL;  	destroy_qp(&rhp->rdev, &qhp->wq, @@ -1516,14 +1540,6 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)  	return 0;  } -static int disable_qp_db(int id, void *p, void *data) -{ -	struct c4iw_qp *qp = p; - -	t4_disable_wq_db(&qp->wq); -	return 0; -} -  struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,  			     struct ib_udata *udata)  { @@ -1533,7 +1549,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,  	struct c4iw_cq *schp;  	struct c4iw_cq *rchp;  	struct c4iw_create_qp_resp uresp; -	int sqsize, rqsize; +	unsigned int sqsize, rqsize;  	struct c4iw_ucontext *ucontext;  	int ret;  	struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4, *mm5 = NULL; @@ -1605,25 +1621,13 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,  	qhp->attr.enable_bind = 1;  	qhp->attr.max_ord = 1;  	qhp->attr.max_ird = 1; +	qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR;  	spin_lock_init(&qhp->lock);  	mutex_init(&qhp->mutex);  	init_waitqueue_head(&qhp->wait);  	atomic_set(&qhp->refcnt, 1); -	spin_lock_irq(&rhp->lock); -	if (rhp->db_state != NORMAL) -		t4_disable_wq_db(&qhp->wq); -	rhp->qpcnt++; -	if (rhp->qpcnt > db_fc_threshold && rhp->db_state == NORMAL) { -		rhp->rdev.stats.db_state_transitions++; -		rhp->db_state = FLOW_CONTROL; -		idr_for_each(&rhp->qpidr, disable_qp_db, NULL); -	} -	if (db_coalescing_threshold >= 0) -		if (rhp->qpcnt > db_coalescing_threshold) -			cxgb4_disable_db_coalescing(rhp->rdev.lldi.ports[0]); -	ret = insert_handle_nolock(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid); -	spin_unlock_irq(&rhp->lock); +	ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);  	if (ret)  		goto err2; @@ -1692,11 +1696,11 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,  		mm2->len = PAGE_ALIGN(qhp->wq.rq.memsize);  		insert_mmap(ucontext, mm2);  		mm3->key = uresp.sq_db_gts_key; -		mm3->addr = qhp->wq.sq.udb; +		mm3->addr = (__force unsigned long) qhp->wq.sq.udb;  		mm3->len = PAGE_SIZE;  		insert_mmap(ucontext, mm3);  		mm4->key = uresp.rq_db_gts_key; -		mm4->addr = qhp->wq.rq.udb; +		mm4->addr = (__force unsigned long) qhp->wq.rq.udb;  		mm4->len = PAGE_SIZE;  		insert_mmap(ucontext, mm4);  		if (mm5) { @@ -1709,6 +1713,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,  	}  	qhp->ibqp.qp_num = qhp->wq.sq.qid;  	init_timer(&(qhp->timer)); +	INIT_LIST_HEAD(&qhp->db_fc_entry);  	PDBG("%s qhp %p sq_num_entries %d, rq_num_entries %d qpid 0x%0x\n",  	     __func__, qhp, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,  	     qhp->wq.sq.qid); @@ -1772,11 +1777,15 @@ int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,  	/*  	 * Use SQ_PSN and RQ_PSN to pass in IDX_INC values for  	 * ringing the queue db when we're in DB_FULL mode. +	 * Only allow this on T4 devices.  	 */  	attrs.sq_db_inc = attr->sq_psn;  	attrs.rq_db_inc = attr->rq_psn;  	mask |= (attr_mask & IB_QP_SQ_PSN) ? C4IW_QP_ATTR_SQ_DB : 0;  	mask |= (attr_mask & IB_QP_RQ_PSN) ? 
C4IW_QP_ATTR_RQ_DB : 0; +	if (is_t5(to_c4iw_qp(ibqp)->rhp->rdev.lldi.adapter_type) && +	    (mask & (C4IW_QP_ATTR_SQ_DB|C4IW_QP_ATTR_RQ_DB))) +		return -EINVAL;  	return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0);  } diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c index cdef4d7fb6d..67df71a7012 100644 --- a/drivers/infiniband/hw/cxgb4/resource.c +++ b/drivers/infiniband/hw/cxgb4/resource.c @@ -179,8 +179,12 @@ u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)  		kfree(entry);  	} else {  		qid = c4iw_get_resource(&rdev->resource.qid_table); -		if (!qid) +		if (!qid) { +			mutex_lock(&rdev->stats.lock); +			rdev->stats.qid.fail++; +			mutex_unlock(&rdev->stats.lock);  			goto out; +		}  		mutex_lock(&rdev->stats.lock);  		rdev->stats.qid.cur += rdev->qpmask + 1;  		mutex_unlock(&rdev->stats.lock); @@ -322,8 +326,8 @@ u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size)  	unsigned long addr = gen_pool_alloc(rdev->rqt_pool, size << 6);  	PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6);  	if (!addr) -		printk_ratelimited(KERN_WARNING MOD "%s: Out of RQT memory\n", -		       pci_name(rdev->lldi.pdev)); +		pr_warn_ratelimited(MOD "%s: Out of RQT memory\n", +				    pci_name(rdev->lldi.pdev));  	mutex_lock(&rdev->stats.lock);  	if (addr) {  		rdev->stats.rqt.cur += roundup(size << 6, 1 << MIN_RQT_SHIFT); diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index e73ace73918..68b0a6bf4eb 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -84,7 +84,14 @@ struct t4_status_page {  			sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge))  #define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - \  			sizeof(struct fw_ri_immd)) & ~31UL) -#define T4_MAX_FR_DEPTH (1024 / sizeof(u64)) +#define T4_MAX_FR_IMMD_DEPTH (T4_MAX_FR_IMMD / sizeof(u64)) +#define T4_MAX_FR_DSGL 1024 +#define T4_MAX_FR_DSGL_DEPTH (T4_MAX_FR_DSGL / sizeof(u64)) + +static inline int t4_max_fr_depth(int use_dsgl) +{ +	return use_dsgl ? T4_MAX_FR_DSGL_DEPTH : T4_MAX_FR_IMMD_DEPTH; +}  #define T4_RQ_NUM_SLOTS 2  #define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS) @@ -292,7 +299,7 @@ struct t4_sq {  	unsigned long phys_addr;  	struct t4_swsqe *sw_sq;  	struct t4_swsqe *oldest_read; -	u64 udb; +	u64 __iomem *udb;  	size_t memsize;  	u32 qid;  	u16 in_use; @@ -300,6 +307,7 @@ struct t4_sq {  	u16 cidx;  	u16 pidx;  	u16 wq_pidx; +	u16 wq_pidx_inc;  	u16 flags;  	short flush_cidx;  }; @@ -313,7 +321,7 @@ struct t4_rq {  	dma_addr_t dma_addr;  	DEFINE_DMA_UNMAP_ADDR(mapping);  	struct t4_swrqe *sw_rq; -	u64 udb; +	u64 __iomem *udb;  	size_t memsize;  	u32 qid;  	u32 msn; @@ -324,6 +332,7 @@ struct t4_rq {  	u16 cidx;  	u16 pidx;  	u16 wq_pidx; +	u16 wq_pidx_inc;  };  struct t4_wq { @@ -433,15 +442,67 @@ static inline u16 t4_sq_wq_size(struct t4_wq *wq)  		return wq->sq.size * T4_SQ_NUM_SLOTS;  } -static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc) +/* This function copies 64 byte coalesced work request to memory + * mapped BAR2 space. For coalesced WRs, the SGE fetches data + * from the FIFO instead of from Host. + */ +static inline void pio_copy(u64 __iomem *dst, u64 *src) +{ +	int count = 8; + +	while (count) { +		writeq(*src, dst); +		src++; +		dst++; +		count--; +	} +} + +static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, u8 t5, +				 union t4_wr *wqe)  { + +	/* Flush host queue memory writes. 
*/  	wmb(); +	if (t5) { +		if (inc == 1 && wqe) { +			PDBG("%s: WC wq->sq.pidx = %d\n", +			     __func__, wq->sq.pidx); +			pio_copy(wq->sq.udb + 7, (void *)wqe); +		} else { +			PDBG("%s: DB wq->sq.pidx = %d\n", +			     __func__, wq->sq.pidx); +			writel(PIDX_T5(inc), wq->sq.udb); +		} + +		/* Flush user doorbell area writes. */ +		wmb(); +		return; +	}  	writel(QID(wq->sq.qid) | PIDX(inc), wq->db);  } -static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc) +static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, u8 t5, +				 union t4_recv_wr *wqe)  { + +	/* Flush host queue memory writes. */  	wmb(); +	if (t5) { +		if (inc == 1 && wqe) { +			PDBG("%s: WC wq->rq.pidx = %d\n", +			     __func__, wq->rq.pidx); +			pio_copy(wq->rq.udb + 7, (void *)wqe); +		} else { +			PDBG("%s: DB wq->rq.pidx = %d\n", +			     __func__, wq->rq.pidx); +			writel(PIDX_T5(inc), wq->rq.udb); +		} + +		/* Flush user doorbell area writes. */ +		wmb(); +		return; +	}  	writel(QID(wq->rq.qid) | PIDX(inc), wq->db);  } @@ -481,6 +542,7 @@ struct t4_cq {  	size_t memsize;  	__be64 bits_type_ts;  	u32 cqid; +	int vector;  	u16 size; /* including status page */  	u16 cidx;  	u16 sw_pidx; @@ -566,6 +628,9 @@ static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe)  		printk(KERN_ERR MOD "cq overflow cqid %u\n", cq->cqid);  		BUG_ON(1);  	} else if (t4_valid_cqe(cq, &cq->queue[cq->cidx])) { + +		/* Ensure CQE is flushed to memory */ +		rmb();  		*cqe = &cq->queue[cq->cidx];  		ret = 0;  	} else @@ -609,3 +674,7 @@ static inline void t4_set_cq_in_error(struct t4_cq *cq)  	((struct t4_status_page *)&cq->queue[cq->size])->qp_err = 1;  }  #endif + +struct t4_dev_status_page { +	u8 db_off; +}; diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h index dc193c29267..91289a051af 100644 --- a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h +++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h @@ -836,4 +836,19 @@ struct ulptx_idata {  #define V_RX_DACK_CHANGE(x) ((x) << S_RX_DACK_CHANGE)  #define F_RX_DACK_CHANGE    V_RX_DACK_CHANGE(1U) +enum {                     /* TCP congestion control algorithms */ +	CONG_ALG_RENO, +	CONG_ALG_TAHOE, +	CONG_ALG_NEWRENO, +	CONG_ALG_HIGHSPEED +}; + +#define S_CONG_CNTRL    14 +#define M_CONG_CNTRL    0x3 +#define V_CONG_CNTRL(x) ((x) << S_CONG_CNTRL) +#define G_CONG_CNTRL(x) (((x) >> S_CONG_CNTRL) & M_CONG_CNTRL) + +#define CONG_CNTRL_VALID   (1 << 18) +#define T5_OPT_2_VALID       (1 << 31) +  #endif /* _T4FW_RI_API_H_ */ diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h index 32b754c35ab..cbd0ce17072 100644 --- a/drivers/infiniband/hw/cxgb4/user.h +++ b/drivers/infiniband/hw/cxgb4/user.h @@ -48,6 +48,7 @@ struct c4iw_create_cq_resp {  	__u32 cqid;  	__u32 size;  	__u32 qid_mask; +	__u32 reserved; /* explicit padding (optional for i386) */  }; @@ -70,4 +71,10 @@ struct c4iw_create_qp_resp {  	__u32 qid_mask;  	__u32 flags;  }; + +struct c4iw_alloc_ucontext_resp { +	__u64 status_page_key; +	__u32 status_page_size; +	__u32 reserved; /* explicit padding (optional for i386) */ +};  #endif diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index f08f6eaf3fa..bd45e0f3923 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -322,7 +322,7 @@ struct ehca_mr_pginfo {  		} phy;  		struct { /* type EHCA_MR_PGI_USER section */  			struct ib_umem *region; -			struct ib_umem_chunk *next_chunk; +			struct scatterlist 
*next_sg;  			u64 next_nmap;  		} usr;  		struct { /* type EHCA_MR_PGI_FMR section */ diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c index 212150c25ea..8cc83753776 100644 --- a/drivers/infiniband/hw/ehca/ehca_cq.c +++ b/drivers/infiniband/hw/ehca/ehca_cq.c @@ -283,6 +283,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,  			(my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1));  		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {  			ehca_err(device, "Copy to udata failed."); +			cq = ERR_PTR(-EFAULT);  			goto create_cq_exit4;  		}  	} diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index bcfb0c18362..3488e8c9fcb 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -400,10 +400,7 @@ reg_user_mr_fallback:  	pginfo.num_hwpages = num_hwpages;  	pginfo.u.usr.region = e_mr->umem;  	pginfo.next_hwpage = e_mr->umem->offset / hwpage_size; -	pginfo.u.usr.next_chunk = list_prepare_entry(pginfo.u.usr.next_chunk, -						     (&e_mr->umem->chunk_list), -						     list); - +	pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;  	ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,  			  e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,  			  &e_mr->ib.ib_mr.rkey, EHCA_REG_MR); @@ -1858,61 +1855,39 @@ static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo,  				  u64 *kpage)  {  	int ret = 0; -	struct ib_umem_chunk *prev_chunk; -	struct ib_umem_chunk *chunk;  	u64 pgaddr; -	u32 i = 0;  	u32 j = 0;  	int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size; - -	/* loop over desired chunk entries */ -	chunk      = pginfo->u.usr.next_chunk; -	prev_chunk = pginfo->u.usr.next_chunk; -	list_for_each_entry_continue( -		chunk, (&(pginfo->u.usr.region->chunk_list)), list) { -		for (i = pginfo->u.usr.next_nmap; i < chunk->nmap; ) { -			pgaddr = page_to_pfn(sg_page(&chunk->page_list[i])) -				<< PAGE_SHIFT ; -			*kpage = pgaddr + (pginfo->next_hwpage * -					   pginfo->hwpage_size); -			if ( !(*kpage) ) { -				ehca_gen_err("pgaddr=%llx " -					     "chunk->page_list[i]=%llx " -					     "i=%x next_hwpage=%llx", -					     pgaddr, (u64)sg_dma_address( -						     &chunk->page_list[i]), -					     i, pginfo->next_hwpage); -				return -EFAULT; -			} -			(pginfo->hwpage_cnt)++; -			(pginfo->next_hwpage)++; -			kpage++; -			if (pginfo->next_hwpage % hwpages_per_kpage == 0) { -				(pginfo->kpage_cnt)++; -				(pginfo->u.usr.next_nmap)++; -				pginfo->next_hwpage = 0; -				i++; -			} -			j++; -			if (j >= number) break; +	struct scatterlist **sg = &pginfo->u.usr.next_sg; + +	while (*sg != NULL) { +		pgaddr = page_to_pfn(sg_page(*sg)) +			<< PAGE_SHIFT; +		*kpage = pgaddr + (pginfo->next_hwpage * +				   pginfo->hwpage_size); +		if (!(*kpage)) { +			ehca_gen_err("pgaddr=%llx " +				     "sg_dma_address=%llx " +				     "entry=%llx next_hwpage=%llx", +				     pgaddr, (u64)sg_dma_address(*sg), +				     pginfo->u.usr.next_nmap, +				     pginfo->next_hwpage); +			return -EFAULT;  		} -		if ((pginfo->u.usr.next_nmap >= chunk->nmap) && -		    (j >= number)) { -			pginfo->u.usr.next_nmap = 0; -			prev_chunk = chunk; -			break; -		} else if (pginfo->u.usr.next_nmap >= chunk->nmap) { -			pginfo->u.usr.next_nmap = 0; -			prev_chunk = chunk; -		} else if (j >= number) +		(pginfo->hwpage_cnt)++; +		(pginfo->next_hwpage)++; +		kpage++; +		if (pginfo->next_hwpage % hwpages_per_kpage == 0) { +			(pginfo->kpage_cnt)++; +			(pginfo->u.usr.next_nmap)++; +			
pginfo->next_hwpage = 0; +			*sg = sg_next(*sg); +		} +		j++; +		if (j >= number)  			break; -		else -			prev_chunk = chunk;  	} -	pginfo->u.usr.next_chunk = -		list_prepare_entry(prev_chunk, -				   (&(pginfo->u.usr.region->chunk_list)), -				   list); +  	return ret;  } @@ -1920,20 +1895,19 @@ static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo,   * check given pages for contiguous layout   * last page addr is returned in prev_pgaddr for further check   */ -static int ehca_check_kpages_per_ate(struct scatterlist *page_list, -				     int start_idx, int end_idx, +static int ehca_check_kpages_per_ate(struct scatterlist **sg, +				     int num_pages,  				     u64 *prev_pgaddr)  { -	int t; -	for (t = start_idx; t <= end_idx; t++) { -		u64 pgaddr = page_to_pfn(sg_page(&page_list[t])) << PAGE_SHIFT; +	for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) { +		u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT;  		if (ehca_debug_level >= 3)  			ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr,  				     *(u64 *)__va(pgaddr));  		if (pgaddr - PAGE_SIZE != *prev_pgaddr) {  			ehca_gen_err("uncontiguous page found pgaddr=%llx " -				     "prev_pgaddr=%llx page_list_i=%x", -				     pgaddr, *prev_pgaddr, t); +				     "prev_pgaddr=%llx entries_left_in_hwpage=%x", +				     pgaddr, *prev_pgaddr, num_pages);  			return -EINVAL;  		}  		*prev_pgaddr = pgaddr; @@ -1947,111 +1921,80 @@ static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo,  				  u64 *kpage)  {  	int ret = 0; -	struct ib_umem_chunk *prev_chunk; -	struct ib_umem_chunk *chunk;  	u64 pgaddr, prev_pgaddr; -	u32 i = 0;  	u32 j = 0;  	int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE;  	int nr_kpages = kpages_per_hwpage; +	struct scatterlist **sg = &pginfo->u.usr.next_sg; + +	while (*sg != NULL) { -	/* loop over desired chunk entries */ -	chunk      = pginfo->u.usr.next_chunk; -	prev_chunk = pginfo->u.usr.next_chunk; -	list_for_each_entry_continue( -		chunk, (&(pginfo->u.usr.region->chunk_list)), list) { -		for (i = pginfo->u.usr.next_nmap; i < chunk->nmap; ) { -			if (nr_kpages == kpages_per_hwpage) { -				pgaddr = ( page_to_pfn(sg_page(&chunk->page_list[i])) -					   << PAGE_SHIFT ); -				*kpage = pgaddr; -				if ( !(*kpage) ) { -					ehca_gen_err("pgaddr=%llx i=%x", -						     pgaddr, i); +		if (nr_kpages == kpages_per_hwpage) { +			pgaddr = (page_to_pfn(sg_page(*sg)) +				   << PAGE_SHIFT); +			*kpage = pgaddr; +			if (!(*kpage)) { +				ehca_gen_err("pgaddr=%llx entry=%llx", +					     pgaddr, pginfo->u.usr.next_nmap); +				ret = -EFAULT; +				return ret; +			} +			/* +			 * The first page in a hwpage must be aligned; +			 * the first MR page is exempt from this rule. +			 */ +			if (pgaddr & (pginfo->hwpage_size - 1)) { +				if (pginfo->hwpage_cnt) { +					ehca_gen_err( +						"invalid alignment " +						"pgaddr=%llx entry=%llx " +						"mr_pgsize=%llx", +						pgaddr, pginfo->u.usr.next_nmap, +						pginfo->hwpage_size);  					ret = -EFAULT;  					return ret;  				} -				/* -				 * The first page in a hwpage must be aligned; -				 * the first MR page is exempt from this rule. 
-				 */ -				if (pgaddr & (pginfo->hwpage_size - 1)) { -					if (pginfo->hwpage_cnt) { -						ehca_gen_err( -							"invalid alignment " -							"pgaddr=%llx i=%x " -							"mr_pgsize=%llx", -							pgaddr, i, -							pginfo->hwpage_size); -						ret = -EFAULT; -						return ret; -					} -					/* first MR page */ -					pginfo->kpage_cnt = -						(pgaddr & -						 (pginfo->hwpage_size - 1)) >> -						PAGE_SHIFT; -					nr_kpages -= pginfo->kpage_cnt; -					*kpage = pgaddr & -						 ~(pginfo->hwpage_size - 1); -				} -				if (ehca_debug_level >= 3) { -					u64 val = *(u64 *)__va(pgaddr); -					ehca_gen_dbg("kpage=%llx chunk_page=%llx " -						     "value=%016llx", -						     *kpage, pgaddr, val); -				} -				prev_pgaddr = pgaddr; -				i++; -				pginfo->kpage_cnt++; -				pginfo->u.usr.next_nmap++; -				nr_kpages--; -				if (!nr_kpages) -					goto next_kpage; -				continue; +				/* first MR page */ +				pginfo->kpage_cnt = +					(pgaddr & +					 (pginfo->hwpage_size - 1)) >> +					PAGE_SHIFT; +				nr_kpages -= pginfo->kpage_cnt; +				*kpage = pgaddr & +					 ~(pginfo->hwpage_size - 1);  			} -			if (i + nr_kpages > chunk->nmap) { -				ret = ehca_check_kpages_per_ate( -					chunk->page_list, i, -					chunk->nmap - 1, &prev_pgaddr); -				if (ret) return ret; -				pginfo->kpage_cnt += chunk->nmap - i; -				pginfo->u.usr.next_nmap += chunk->nmap - i; -				nr_kpages -= chunk->nmap - i; -				break; +			if (ehca_debug_level >= 3) { +				u64 val = *(u64 *)__va(pgaddr); +				ehca_gen_dbg("kpage=%llx page=%llx " +					     "value=%016llx", +					     *kpage, pgaddr, val);  			} +			prev_pgaddr = pgaddr; +			*sg = sg_next(*sg); +			pginfo->kpage_cnt++; +			pginfo->u.usr.next_nmap++; +			nr_kpages--; +			if (!nr_kpages) +				goto next_kpage; +			continue; +		} + +		ret = ehca_check_kpages_per_ate(sg, nr_kpages, +						&prev_pgaddr); +		if (ret) +			return ret; +		pginfo->kpage_cnt += nr_kpages; +		pginfo->u.usr.next_nmap += nr_kpages; -			ret = ehca_check_kpages_per_ate(chunk->page_list, i, -							i + nr_kpages - 1, -							&prev_pgaddr); -			if (ret) return ret; -			i += nr_kpages; -			pginfo->kpage_cnt += nr_kpages; -			pginfo->u.usr.next_nmap += nr_kpages;  next_kpage: -			nr_kpages = kpages_per_hwpage; -			(pginfo->hwpage_cnt)++; -			kpage++; -			j++; -			if (j >= number) break; -		} -		if ((pginfo->u.usr.next_nmap >= chunk->nmap) && -		    (j >= number)) { -			pginfo->u.usr.next_nmap = 0; -			prev_chunk = chunk; -			break; -		} else if (pginfo->u.usr.next_nmap >= chunk->nmap) { -			pginfo->u.usr.next_nmap = 0; -			prev_chunk = chunk; -		} else if (j >= number) +		nr_kpages = kpages_per_hwpage; +		(pginfo->hwpage_cnt)++; +		kpage++; +		j++; +		if (j >= number)  			break; -		else -			prev_chunk = chunk;  	} -	pginfo->u.usr.next_chunk = -		list_prepare_entry(prev_chunk, -				   (&(pginfo->u.usr.region->chunk_list)), -				   list); +  	return ret;  } @@ -2591,16 +2534,6 @@ static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,  	/* This is only a stub; nothing to be done here */  } -static u64 ehca_dma_address(struct ib_device *dev, struct scatterlist *sg) -{ -	return sg->dma_address; -} - -static unsigned int ehca_dma_len(struct ib_device *dev, struct scatterlist *sg) -{ -	return sg->length; -} -  static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,  					 size_t size,  					 enum dma_data_direction dir) @@ -2653,8 +2586,6 @@ struct ib_dma_mapping_ops ehca_dma_mapping_ops = {  	.unmap_page             = ehca_dma_unmap_page,  	.map_sg                 = 
ehca_dma_map_sg,  	.unmap_sg               = ehca_dma_unmap_sg, -	.dma_address            = ehca_dma_address, -	.dma_len                = ehca_dma_len,  	.sync_single_for_cpu    = ehca_dma_sync_single_for_cpu,  	.sync_single_for_device = ehca_dma_sync_single_for_device,  	.alloc_coherent         = ehca_dma_alloc_coherent, diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 00d6861a6a1..2e89356c46f 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -1329,7 +1329,7 @@ static int internal_modify_qp(struct ib_qp *ibqp,  	qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state;  	if (!smi_reset2init &&  	    !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type, -				attr_mask)) { +				attr_mask, IB_LINK_LAYER_UNSPECIFIED)) {  		ret = -EINVAL;  		ehca_err(ibqp->device,  			 "Invalid qp transition new_state=%x cur_state=%x " diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c index 714293b7851..45802e97332 100644 --- a/drivers/infiniband/hw/ipath/ipath_diag.c +++ b/drivers/infiniband/hw/ipath/ipath_diag.c @@ -326,7 +326,7 @@ static ssize_t ipath_diagpkt_write(struct file *fp,  				   size_t count, loff_t *off)  {  	u32 __iomem *piobuf; -	u32 plen, clen, pbufn; +	u32 plen, pbufn, maxlen_reserve;  	struct ipath_diag_pkt odp;  	struct ipath_diag_xpkt dp;  	u32 *tmpbuf = NULL; @@ -335,42 +335,24 @@ static ssize_t ipath_diagpkt_write(struct file *fp,  	u64 val;  	u32 l_state, lt_state; /* LinkState, LinkTrainingState */ -	if (count < sizeof(odp)) { -		ret = -EINVAL; -		goto bail; -	}  	if (count == sizeof(dp)) {  		if (copy_from_user(&dp, data, sizeof(dp))) {  			ret = -EFAULT;  			goto bail;  		} -	} else if (copy_from_user(&odp, data, sizeof(odp))) { -		ret = -EFAULT; -		goto bail; -	} - -	/* -	 * Due to padding/alignment issues (lessened with new struct) -	 * the old and new structs are the same length. We need to -	 * disambiguate them, which we can do because odp.len has never -	 * been less than the total of LRH+BTH+DETH so far, while -	 * dp.unit (same offset) unit is unlikely to get that high. -	 * Similarly, dp.data, the pointer to user at the same offset -	 * as odp.unit, is almost certainly at least one (512byte)page -	 * "above" NULL. The if-block below can be omitted if compatibility -	 * between a new driver and older diagnostic code is unimportant. -	 * compatibility the other direction (new diags, old driver) is -	 * handled in the diagnostic code, with a warning. -	 */ -	if (dp.unit >= 20 && dp.data < 512) { -		/* very probable version mismatch. 
Fix it up */ -		memcpy(&odp, &dp, sizeof(odp)); -		/* We got a legacy dp, copy elements to dp */ +	} else if (count == sizeof(odp)) { +		if (copy_from_user(&odp, data, sizeof(odp))) { +			ret = -EFAULT; +			goto bail; +		} +		dp.len = odp.len;  		dp.unit = odp.unit;  		dp.data = odp.data; -		dp.len = odp.len; -		dp.pbc_wd = 0; /* Indicate we need to compute PBC wd */ +		dp.pbc_wd = 0; +	} else { +		ret = -EINVAL; +		goto bail;  	}  	/* send count must be an exact number of dwords */ @@ -379,7 +361,7 @@ static ssize_t ipath_diagpkt_write(struct file *fp,  		goto bail;  	} -	clen = dp.len >> 2; +	plen = dp.len >> 2;  	dd = ipath_lookup(dp.unit);  	if (!dd || !(dd->ipath_flags & IPATH_PRESENT) || @@ -422,16 +404,22 @@ static ssize_t ipath_diagpkt_write(struct file *fp,  		goto bail;  	} -	/* need total length before first word written */ -	/* +1 word is for the qword padding */ -	plen = sizeof(u32) + dp.len; - -	if ((plen + 4) > dd->ipath_ibmaxlen) { +	/* +	 * need total length before first word written, plus 2 Dwords. One Dword +	 * is for padding so we get the full user data when not aligned on +	 * a word boundary. The other Dword is to make sure we have room for the +	 * ICRC which gets tacked on later. +	 */ +	maxlen_reserve = 2 * sizeof(u32); +	if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) {  		ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n", -			  plen - 4, dd->ipath_ibmaxlen); +			  dp.len, dd->ipath_ibmaxlen);  		ret = -EINVAL; -		goto bail;	/* before writing pbc */ +		goto bail;  	} + +	plen = sizeof(u32) + dp.len; +  	tmpbuf = vmalloc(plen);  	if (!tmpbuf) {  		dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, " @@ -473,11 +461,11 @@ static ssize_t ipath_diagpkt_write(struct file *fp,  	 */  	if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {  		ipath_flush_wc(); -		__iowrite32_copy(piobuf + 2, tmpbuf, clen - 1); +		__iowrite32_copy(piobuf + 2, tmpbuf, plen - 1);  		ipath_flush_wc(); -		__raw_writel(tmpbuf[clen - 1], piobuf + clen + 1); +		__raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);  	} else -		__iowrite32_copy(piobuf + 2, tmpbuf, clen); +		__iowrite32_copy(piobuf + 2, tmpbuf, plen);  	ipath_flush_wc(); diff --git a/drivers/infiniband/hw/ipath/ipath_dma.c b/drivers/infiniband/hw/ipath/ipath_dma.c index 644c2c74e05..123a8c05353 100644 --- a/drivers/infiniband/hw/ipath/ipath_dma.c +++ b/drivers/infiniband/hw/ipath/ipath_dma.c @@ -115,6 +115,10 @@ static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl,  			ret = 0;  			break;  		} +		sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH +		sg->dma_length = sg->length; +#endif  	}  	return ret;  } @@ -126,21 +130,6 @@ static void ipath_unmap_sg(struct ib_device *dev,  	BUG_ON(!valid_dma_direction(direction));  } -static u64 ipath_sg_dma_address(struct ib_device *dev, struct scatterlist *sg) -{ -	u64 addr = (u64) page_address(sg_page(sg)); - -	if (addr) -		addr += sg->offset; -	return addr; -} - -static unsigned int ipath_sg_dma_len(struct ib_device *dev, -				     struct scatterlist *sg) -{ -	return sg->length; -} -  static void ipath_sync_single_for_cpu(struct ib_device *dev,  				      u64 addr,  				      size_t size, @@ -176,17 +165,15 @@ static void ipath_dma_free_coherent(struct ib_device *dev, size_t size,  }  struct ib_dma_mapping_ops ipath_dma_mapping_ops = { -	ipath_mapping_error, -	ipath_dma_map_single, -	ipath_dma_unmap_single, -	ipath_dma_map_page, -	ipath_dma_unmap_page, -	ipath_map_sg, -	ipath_unmap_sg, -	ipath_sg_dma_address, -	ipath_sg_dma_len, -	ipath_sync_single_for_cpu, -	
ipath_sync_single_for_device, -	ipath_dma_alloc_coherent, -	ipath_dma_free_coherent +	.mapping_error = ipath_mapping_error, +	.map_single = ipath_dma_map_single, +	.unmap_single = ipath_dma_unmap_single, +	.map_page = ipath_dma_map_page, +	.unmap_page = ipath_dma_unmap_page, +	.map_sg = ipath_map_sg, +	.unmap_sg = ipath_unmap_sg, +	.sync_single_for_cpu = ipath_sync_single_for_cpu, +	.sync_single_for_device = ipath_sync_single_for_device, +	.alloc_coherent = ipath_dma_alloc_coherent, +	.free_coherent = ipath_dma_free_coherent  }; diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 26dfbc8ee0f..01ba792791a 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -70,7 +70,7 @@ void ipath_disarm_senderrbufs(struct ipath_devdata *dd)  	if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {  		int i;  		if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) && -			dd->ipath_lastcancel > jiffies) { +			time_after(dd->ipath_lastcancel, jiffies)) {  			__IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,  					  "SendbufErrs %lx %lx", sbuf[0],  					  sbuf[1]); @@ -755,7 +755,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)  	/* likely due to cancel; so suppress message unless verbose */  	if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) && -		dd->ipath_lastcancel > jiffies) { +		time_after(dd->ipath_lastcancel, jiffies)) {  		/* armlaunch takes precedence; it often causes both. */  		ipath_cdbg(VERBOSE,  			"Suppressed %s error (%llx) after sendbuf cancel\n", diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c index e346d3890a0..5e61e9bff69 100644 --- a/drivers/infiniband/hw/ipath/ipath_mr.c +++ b/drivers/infiniband/hw/ipath/ipath_mr.c @@ -188,8 +188,8 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  {  	struct ipath_mr *mr;  	struct ib_umem *umem; -	struct ib_umem_chunk *chunk; -	int n, m, i; +	int n, m, entry; +	struct scatterlist *sg;  	struct ib_mr *ret;  	if (length == 0) { @@ -202,10 +202,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	if (IS_ERR(umem))  		return (void *) umem; -	n = 0; -	list_for_each_entry(chunk, &umem->chunk_list, list) -		n += chunk->nents; - +	n = umem->nmap;  	mr = alloc_mr(n, &to_idev(pd->device)->lk_table);  	if (!mr) {  		ret = ERR_PTR(-ENOMEM); @@ -224,22 +221,20 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	m = 0;  	n = 0; -	list_for_each_entry(chunk, &umem->chunk_list, list) { -		for (i = 0; i < chunk->nents; i++) { -			void *vaddr; - -			vaddr = page_address(sg_page(&chunk->page_list[i])); -			if (!vaddr) { -				ret = ERR_PTR(-EINVAL); -				goto bail; -			} -			mr->mr.map[m]->segs[n].vaddr = vaddr; -			mr->mr.map[m]->segs[n].length = umem->page_size; -			n++; -			if (n == IPATH_SEGSZ) { -				m++; -				n = 0; -			} +	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { +		void *vaddr; + +		vaddr = page_address(sg_page(sg)); +		if (!vaddr) { +			ret = ERR_PTR(-EINVAL); +			goto bail; +		} +		mr->mr.map[m]->segs[n].vaddr = vaddr; +		mr->mr.map[m]->segs[n].length = umem->page_size; +		n++; +		if (n == IPATH_SEGSZ) { +			m++; +			n = 0;  		}  	}  	ret = &mr->ibmr; diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index 0857a9c3cd3..face87602dc 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -463,7 +463,7 
@@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,  	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;  	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, -				attr_mask)) +				attr_mask, IB_LINK_LAYER_UNSPECIFIED))  		goto inval;  	if (attr_mask & IB_QP_AV) { diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c index 98ac18ec977..17a517766ad 100644 --- a/drivers/infiniband/hw/ipath/ipath_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -247,7 +247,7 @@ static void sdma_abort_task(unsigned long opaque)  	/* ipath_sdma_abort() is done, waiting for interrupt */  	if (status == IPATH_SDMA_ABORT_DISARMED) { -		if (jiffies < dd->ipath_sdma_abort_intr_timeout) +		if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout))  			goto resched_noprint;  		/* give up, intr got lost somewhere */  		ipath_dbg("give up waiting for SDMADISABLED intr\n"); @@ -341,7 +341,7 @@ resched:  	 * JAG - this is bad to just have default be a loop without  	 * state change  	 */ -	if (jiffies > dd->ipath_sdma_abort_jiffies) { +	if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) {  		ipath_dbg("looping with status 0x%08lx\n",  			  dd->ipath_sdma_status);  		dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ; diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c index f5cb13b2144..cc04b7ba348 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c @@ -280,9 +280,7 @@ static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,  	int j;  	int ret; -	ret = get_user_pages(current, current->mm, addr, -			     npages, 0, 1, pages, NULL); - +	ret = get_user_pages_fast(addr, npages, 0, pages);  	if (ret != npages) {  		int i; @@ -811,10 +809,7 @@ int ipath_user_sdma_writev(struct ipath_devdata *dd,  	while (dim) {  		const int mxp = 8; -		down_write(¤t->mm->mmap_sem);  		ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp); -		up_write(¤t->mm->mmap_sem); -  		if (ret <= 0)  			goto done_unlock;  		else { diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig index 24ab11a9ad1..fc01deac1d3 100644 --- a/drivers/infiniband/hw/mlx4/Kconfig +++ b/drivers/infiniband/hw/mlx4/Kconfig @@ -1,6 +1,6 @@  config MLX4_INFINIBAND  	tristate "Mellanox ConnectX HCA support" -	depends on NETDEVICES && ETHERNET && PCI +	depends on NETDEVICES && ETHERNET && PCI && INET  	select NET_VENDOR_MELLANOX  	select MLX4_CORE  	---help--- diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index a251becdaa9..2d8c3397774 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -39,25 +39,6 @@  #include "mlx4_ib.h" -int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, -			u8 *mac, int *is_mcast, u8 port) -{ -	struct in6_addr in6; - -	*is_mcast = 0; - -	memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6); -	if (rdma_link_local_addr(&in6)) -		rdma_get_ll_mac(&in6, mac); -	else if (rdma_is_multicast_addr(&in6)) { -		rdma_get_mcast_mac(&in6, mac); -		*is_mcast = 1; -	} else -		return -EINVAL; - -	return 0; -} -  static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,  				  struct mlx4_ib_ah *ah)  { @@ -92,21 +73,18 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr  {  	struct mlx4_ib_dev *ibdev = to_mdev(pd->device);  	struct mlx4_dev *dev = ibdev->dev; -	union ib_gid 
sgid; -	u8 mac[6]; -	int err; -	int is_mcast; +	int is_mcast = 0; +	struct in6_addr in6;  	u16 vlan_tag; -	err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num); -	if (err) -		return ERR_PTR(err); - -	memcpy(ah->av.eth.mac, mac, 6); -	err = ib_get_cached_gid(pd->device, ah_attr->port_num, ah_attr->grh.sgid_index, &sgid); -	if (err) -		return ERR_PTR(err); -	vlan_tag = rdma_get_vlan_id(&sgid); +	memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); +	if (rdma_is_multicast_addr(&in6)) { +		is_mcast = 1; +		rdma_get_mcast_mac(&in6, ah->av.eth.mac); +	} else { +		memcpy(ah->av.eth.mac, ah_attr->dmac, ETH_ALEN); +	} +	vlan_tag = ah_attr->vlan_id;  	if (vlan_tag < 0x1000)  		vlan_tag |= (ah_attr->sl & 7) << 13;  	ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 2f215b93db6..0eb141c4141 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -154,7 +154,7 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,  			continue;  		slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; -		if (slave_id >= dev->dev->num_slaves) +		if (slave_id >= dev->dev->num_vfs + 1)  			return;  		tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE];  		form_cache_ag = get_cached_alias_guid(dev, port_num, diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c index d1f5f1dd77b..56a593e0ae5 100644 --- a/drivers/infiniband/hw/mlx4/cm.c +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -61,6 +61,11 @@ struct cm_generic_msg {  	__be32 remote_comm_id;  }; +struct cm_sidr_generic_msg { +	struct ib_mad_hdr hdr; +	__be32 request_id; +}; +  struct cm_req_msg {  	unsigned char unused[0x60];  	union ib_gid primary_path_sgid; @@ -69,28 +74,62 @@ struct cm_req_msg {  static void set_local_comm_id(struct ib_mad *mad, u32 cm_id)  { -	struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; -	msg->local_comm_id = cpu_to_be32(cm_id); +	if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { +		struct cm_sidr_generic_msg *msg = +			(struct cm_sidr_generic_msg *)mad; +		msg->request_id = cpu_to_be32(cm_id); +	} else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { +		pr_err("trying to set local_comm_id in SIDR_REP\n"); +		return; +	} else { +		struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; +		msg->local_comm_id = cpu_to_be32(cm_id); +	}  }  static u32 get_local_comm_id(struct ib_mad *mad)  { -	struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - -	return be32_to_cpu(msg->local_comm_id); +	if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { +		struct cm_sidr_generic_msg *msg = +			(struct cm_sidr_generic_msg *)mad; +		return be32_to_cpu(msg->request_id); +	} else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { +		pr_err("trying to set local_comm_id in SIDR_REP\n"); +		return -1; +	} else { +		struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; +		return be32_to_cpu(msg->local_comm_id); +	}  }  static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id)  { -	struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; -	msg->remote_comm_id = cpu_to_be32(cm_id); +	if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { +		struct cm_sidr_generic_msg *msg = +			(struct cm_sidr_generic_msg *)mad; +		msg->request_id = cpu_to_be32(cm_id); +	} else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { +		pr_err("trying to set remote_comm_id in SIDR_REQ\n"); +		return; +	} else { +		struct cm_generic_msg *msg 
= (struct cm_generic_msg *)mad; +		msg->remote_comm_id = cpu_to_be32(cm_id); +	}  }  static u32 get_remote_comm_id(struct ib_mad *mad)  { -	struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - -	return be32_to_cpu(msg->remote_comm_id); +	if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { +		struct cm_sidr_generic_msg *msg = +			(struct cm_sidr_generic_msg *)mad; +		return be32_to_cpu(msg->request_id); +	} else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { +		pr_err("trying to set remote_comm_id in SIDR_REQ\n"); +		return -1; +	} else { +		struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; +		return be32_to_cpu(msg->remote_comm_id); +	}  }  static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) @@ -282,19 +321,21 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id  	u32 sl_cm_id;  	int pv_cm_id = -1; -	sl_cm_id = get_local_comm_id(mad); -  	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || -			mad->mad_hdr.attr_id == CM_REP_ATTR_ID) { +			mad->mad_hdr.attr_id == CM_REP_ATTR_ID || +			mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { +		sl_cm_id = get_local_comm_id(mad);  		id = id_map_alloc(ibdev, slave_id, sl_cm_id);  		if (IS_ERR(id)) {  			mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n",  				__func__, slave_id, sl_cm_id);  			return PTR_ERR(id);  		} -	} else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID) { +	} else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || +		   mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {  		return 0;  	} else { +		sl_cm_id = get_local_comm_id(mad);  		id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);  	} @@ -315,14 +356,18 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id  }  int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, -							     struct ib_mad *mad) +			     struct ib_mad *mad)  {  	u32 pv_cm_id;  	struct id_map_entry *id; -	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID) { +	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || +	    mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {  		union ib_gid gid; +		if (!slave) +			return 0; +  		gid = gid_from_req_msg(ibdev, mad);  		*slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id);  		if (*slave < 0) { @@ -341,7 +386,8 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,  		return -ENOENT;  	} -	*slave = id->slave_id; +	if (slave) +		*slave = id->slave_id;  	set_remote_comm_id(mad, id->sl_cm_id);  	if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index d5e60f44ba5..1066eec854a 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -102,7 +102,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *  	int err;  	err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, -			     PAGE_SIZE * 2, &buf->buf); +			     PAGE_SIZE * 2, &buf->buf, GFP_KERNEL);  	if (err)  		goto out; @@ -113,7 +113,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *  	if (err)  		goto err_buf; -	err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf); +	err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL);  	if (err)  		goto err_mtt; @@ -209,7 +209,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector  		uar = &to_mucontext(context)->uar;  	} else { -		err = mlx4_db_alloc(dev->dev, &cq->db, 1); +		err = 
mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL);  		if (err)  			goto err_cq; @@ -324,7 +324,7 @@ static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq)  	u32 i;  	i = cq->mcq.cons_index; -	while (get_sw_cqe(cq, i & cq->ibcq.cqe)) +	while (get_sw_cqe(cq, i))  		++i;  	return i - cq->mcq.cons_index; @@ -365,7 +365,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)  	mutex_lock(&cq->resize_mutex); -	if (entries < 1 || entries > dev->dev->caps.max_cqes) { +	if (entries < 1) {  		err = -EINVAL;  		goto out;  	} @@ -376,6 +376,11 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)  		goto out;  	} +	if (entries > dev->dev->caps.max_cqes) { +		err = -EINVAL; +		goto out; +	} +  	if (ibcq->uobject) {  		err = mlx4_alloc_resize_umem(dev, cq, entries, udata);  		if (err) @@ -559,7 +564,7 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum)  }  static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, -			   unsigned tail, struct mlx4_cqe *cqe) +			   unsigned tail, struct mlx4_cqe *cqe, int is_eth)  {  	struct mlx4_ib_proxy_sqp_hdr *hdr; @@ -569,12 +574,20 @@ static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct  				   DMA_FROM_DEVICE);  	hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr);  	wc->pkey_index	= be16_to_cpu(hdr->tun.pkey_index); -	wc->slid	= be16_to_cpu(hdr->tun.slid_mac_47_32); -	wc->sl		= (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);  	wc->src_qp	= be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF;  	wc->wc_flags   |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0;  	wc->dlid_path_bits = 0; +	if (is_eth) { +		wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid); +		memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4); +		memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2); +		wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); +	} else { +		wc->slid        = be16_to_cpu(hdr->tun.slid_mac_47_32); +		wc->sl          = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); +	} +  	return 0;  } @@ -589,6 +602,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,  	struct mlx4_srq *msrq = NULL;  	int is_send;  	int is_error; +	int is_eth;  	u32 g_mlpath_rqpn;  	u16 wqe_ctr;  	unsigned tail = 0; @@ -773,11 +787,15 @@ repoll:  			break;  		} +		is_eth = (rdma_port_get_link_layer(wc->qp->device, +						  (*cur_qp)->port) == +			  IB_LINK_LAYER_ETHERNET);  		if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) {  			if ((*cur_qp)->mlx4_ib_qp_type &  			    (MLX4_IB_QPT_PROXY_SMI_OWNER |  			     MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) -				return use_tunnel_data(*cur_qp, cq, wc, tail, cqe); +				return use_tunnel_data(*cur_qp, cq, wc, tail, +						       cqe, is_eth);  		}  		wc->slid	   = be16_to_cpu(cqe->rlid); @@ -788,11 +806,21 @@ repoll:  		wc->pkey_index     = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f;  		wc->wc_flags	  |= mlx4_ib_ipoib_csum_ok(cqe->status,  					cqe->checksum) ? 
IB_WC_IP_CSUM_OK : 0; -		if (rdma_port_get_link_layer(wc->qp->device, -				(*cur_qp)->port) == IB_LINK_LAYER_ETHERNET) +		if (is_eth) {  			wc->sl  = be16_to_cpu(cqe->sl_vid) >> 13; -		else +			if (be32_to_cpu(cqe->vlan_my_qpn) & +					MLX4_CQE_VLAN_PRESENT_MASK) { +				wc->vlan_id = be16_to_cpu(cqe->sl_vid) & +					MLX4_CQE_VID_MASK; +			} else { +				wc->vlan_id = 0xffff; +			} +			memcpy(wc->smac, cqe->smac, ETH_ALEN); +			wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); +		} else {  			wc->sl  = be16_to_cpu(cqe->sl_vid) >> 12; +			wc->vlan_id = 0xffff; +		}  	}  	return 0; diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c index 8aee4233b38..c5174098636 100644 --- a/drivers/infiniband/hw/mlx4/doorbell.c +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -45,7 +45,6 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,  			struct mlx4_db *db)  {  	struct mlx4_ib_user_db_page *page; -	struct ib_umem_chunk *chunk;  	int err = 0;  	mutex_lock(&context->db_page_mutex); @@ -73,8 +72,7 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,  	list_add(&page->list, &context->db_page_list);  found: -	chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list); -	db->dma		= sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK); +	db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK);  	db->u.user_page = page;  	++page->refcnt; diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index f2a3f48107e..287ad0564ac 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -467,6 +467,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,  	int ret = 0;  	u16 tun_pkey_ix;  	u16 cached_pkey; +	u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;  	if (dest_qpt > IB_QPT_GSI)  		return -EINVAL; @@ -477,10 +478,6 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,  	if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE)  		return -EAGAIN; -	/* QP0 forwarding only for Dom0 */ -	if (!dest_qpt && (mlx4_master_func_num(dev->dev) != slave)) -		return -EINVAL; -  	if (!dest_qpt)  		tun_qp = &tun_ctx->qp[0];  	else @@ -509,6 +506,10 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,  	 * The driver will set the force loopback bit in post_send */  	memset(&attr, 0, sizeof attr);  	attr.port_num = port; +	if (is_eth) { +		memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16); +		attr.ah_flags = IB_AH_GRH; +	}  	ah = ib_create_ah(tun_ctx->pd, &attr);  	if (IS_ERR(ah))  		return -ENOMEM; @@ -540,11 +541,36 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,  	/* adjust tunnel data */  	tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); -	tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); -	tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid);  	tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF);  	tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; +	if (is_eth) { +		u16 vlan = 0; +		if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, +						NULL)) { +			/* VST mode */ +			if (vlan != wc->vlan_id) +				/* Packet vlan is not the VST-assigned vlan. +				 * Drop the packet. +				 */ +				goto out; +			 else +				/* Remove the vlan tag before forwarding +				 * the packet to the VF. 
+				 */ +				vlan = 0xffff; +		} else { +			vlan = wc->vlan_id; +		} + +		tun_mad->hdr.sl_vid = cpu_to_be16(vlan); +		memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); +		memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); +	} else { +		tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); +		tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); +	} +  	ib_dma_sync_single_for_device(&dev->ib_dev,  				      tun_qp->tx_ring[tun_tx_ix].buf.map,  				      sizeof (struct mlx4_rcv_tunnel_mad), @@ -580,6 +606,41 @@ static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port,  	int err;  	int slave;  	u8 *slave_id; +	int is_eth = 0; + +	if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) +		is_eth = 0; +	else +		is_eth = 1; + +	if (is_eth) { +		if (!(wc->wc_flags & IB_WC_GRH)) { +			mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); +			return -EINVAL; +		} +		if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { +			mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); +			return -EINVAL; +		} +		if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) { +			mlx4_ib_warn(ibdev, "failed matching grh\n"); +			return -ENOENT; +		} +		if (slave >= dev->dev->caps.sqp_demux) { +			mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", +				     slave, dev->dev->caps.sqp_demux); +			return -ENOENT; +		} + +		if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad)) +			return 0; + +		err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); +		if (err) +			pr_debug("failed sending to slave %d via tunnel qp (%d)\n", +				 slave, err); +		return 0; +	}  	/* Initially assume that this mad is for us */  	slave = mlx4_master_func_num(dev->dev); @@ -602,6 +663,21 @@ static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port,  	}  	/* Class-specific handling */  	switch (mad->mad_hdr.mgmt_class) { +	case IB_MGMT_CLASS_SUBN_LID_ROUTED: +	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: +		/* 255 indicates the dom0 */ +		if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) { +			if (!mlx4_vf_smi_enabled(dev->dev, slave, port)) +				return -EPERM; +			/* for a VF. drop unsolicited MADs */ +			if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) { +				mlx4_ib_warn(ibdev, "demux QP0. 
rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n", +					     slave, mad->mad_hdr.mgmt_class, +					     mad->mad_hdr.method); +				return -EINVAL; +			} +		} +		break;  	case IB_MGMT_CLASS_SUBN_ADM:  		if (mlx4_ib_demux_sa_handler(ibdev, port, slave,  					     (struct ib_sa_mad *) mad)) @@ -1076,8 +1152,9 @@ static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)  int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, -			 enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, -			 u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad) +			 enum ib_qp_type dest_qpt, u16 pkey_index, +			 u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, +			 u8 *s_mac, struct ib_mad *mad)  {  	struct ib_sge list;  	struct ib_send_wr wr, *bad_wr; @@ -1099,10 +1176,6 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,  	if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE)  		return -EAGAIN; -	/* QP0 forwarding only for Dom0 */ -	if (dest_qpt == IB_QPT_SMI && (mlx4_master_func_num(dev->dev) != slave)) -		return -EINVAL; -  	if (dest_qpt == IB_QPT_SMI) {  		src_qpnum = 0;  		sqp = &sqp_ctx->qp[0]; @@ -1166,6 +1239,9 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,  	wr.num_sge = 1;  	wr.opcode = IB_WR_SEND;  	wr.send_flags = IB_SEND_SIGNALED; +	if (s_mac) +		memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); +  	ret = ib_post_send(send_qp, &wr, &bad_wr);  out: @@ -1174,6 +1250,22 @@ out:  	return ret;  } +static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) +{ +	if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) +		return slave; +	return mlx4_get_base_gid_ix(dev->dev, slave, port); +} + +static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, +				    struct ib_ah_attr *ah_attr) +{ +	if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) +		ah_attr->grh.sgid_index = slave; +	else +		ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port); +} +  static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc)  {  	struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); @@ -1184,6 +1276,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc  	struct ib_ah_attr ah_attr;  	u8 *slave_id;  	int slave; +	int port;  	/* Get slave that sent this packet */  	if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn || @@ -1199,11 +1292,6 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc  			     "belongs to another slave\n", wc->src_qp);  		return;  	} -	if (slave != mlx4_master_func_num(dev->dev) && !(wc->src_qp & 0x2)) { -		mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " -			     "non-master trying to send QP0 packets\n", wc->src_qp); -		return; -	}  	/* Map transaction ID */  	ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, @@ -1231,6 +1319,12 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc  	/* Class-specific handling */  	switch (tunnel->mad.mad_hdr.mgmt_class) { +	case IB_MGMT_CLASS_SUBN_LID_ROUTED: +	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: +		if (slave != mlx4_master_func_num(dev->dev) && +		    !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port)) +			return; +		break;  	case IB_MGMT_CLASS_SUBN_ADM:  		if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave,  			      (struct ib_sa_mad *) &tunnel->mad)) @@ -1260,12 +1354,18 @@ static void mlx4_ib_multiplex_mad(struct 
mlx4_ib_demux_pv_ctx *ctx, struct ib_wc  	memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av));  	ah.ibah.device = ctx->ib_dev;  	mlx4_ib_query_ah(&ah.ibah, &ah_attr); -	if ((ah_attr.ah_flags & IB_AH_GRH) && -	    (ah_attr.grh.sgid_index != slave)) { -		mlx4_ib_warn(ctx->ib_dev, "slave:%d accessed invalid sgid_index:%d\n", -			     slave, ah_attr.grh.sgid_index); +	if (ah_attr.ah_flags & IB_AH_GRH) +		fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr); + +	port = mlx4_slave_convert_port(dev->dev, slave, ah_attr.port_num); +	if (port < 0)  		return; -	} +	ah_attr.port_num = port; +	memcpy(ah_attr.dmac, tunnel->hdr.mac, 6); +	ah_attr.vlan_id = be16_to_cpu(tunnel->hdr.vlan); +	/* if slave have default vlan use it */ +	mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, +				    &ah_attr.vlan_id, &ah_attr.sl);  	mlx4_ib_send_to_wire(dev, slave, ctx->port,  			     is_proxy_qp0(dev, wc->src_qp, slave) ? @@ -1273,7 +1373,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc  			     be16_to_cpu(tunnel->hdr.pkey_index),  			     be32_to_cpu(tunnel->hdr.remote_qpn),  			     be32_to_cpu(tunnel->hdr.qkey), -			     &ah_attr, &tunnel->mad); +			     &ah_attr, wc->smac, &tunnel->mad);  }  static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, @@ -1657,9 +1757,9 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,  		return -EEXIST;  	ctx->state = DEMUX_PV_STATE_STARTING; -	/* have QP0 only on port owner, and only if link layer is IB */ -	if (ctx->slave == mlx4_master_func_num(to_mdev(ctx->ib_dev)->dev) && -	    rdma_port_get_link_layer(ibdev, ctx->port) == IB_LINK_LAYER_INFINIBAND) +	/* have QP0 only if link layer is IB */ +	if (rdma_port_get_link_layer(ibdev, ctx->port) == +	    IB_LINK_LAYER_INFINIBAND)  		ctx->has_smi = 1;  	if (ctx->has_smi) { @@ -1850,7 +1950,15 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,  	ctx->port = port;  	ctx->ib_dev = &dev->ib_dev; -	for (i = 0; i < dev->dev->caps.sqp_demux; i++) { +	for (i = 0; +	     i < min(dev->dev->caps.sqp_demux, (u16)(dev->dev->num_vfs + 1)); +	     i++) { +		struct mlx4_active_ports actv_ports = +			mlx4_get_active_ports(dev->dev, i); + +		if (!test_bit(port - 1, actv_ports.ports)) +			continue; +  		ret = alloc_pv_object(dev, i, port, &ctx->tun[i]);  		if (ret) {  			ret = -ENOMEM; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d6c5a73becf..0f7027e7db1 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -39,6 +39,8 @@  #include <linux/inetdevice.h>  #include <linux/rtnetlink.h>  #include <linux/if_vlan.h> +#include <net/ipv6.h> +#include <net/addrconf.h>  #include <rdma/ib_smi.h>  #include <rdma/ib_user_verbs.h> @@ -46,15 +48,17 @@  #include <linux/mlx4/driver.h>  #include <linux/mlx4/cmd.h> +#include <linux/mlx4/qp.h>  #include "mlx4_ib.h"  #include "user.h"  #define DRV_NAME	MLX4_IB_DRV_NAME -#define DRV_VERSION	"1.0" -#define DRV_RELDATE	"April 4, 2008" +#define DRV_VERSION	"2.2-1" +#define DRV_RELDATE	"Feb 2014"  #define MLX4_IB_FLOW_MAX_PRIO 0xFFF +#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF  MODULE_AUTHOR("Roland Dreier");  MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); @@ -92,21 +96,27 @@ static union ib_gid zgid;  static int check_flow_steering_support(struct mlx4_dev *dev)  { +	int eth_num_ports = 0;  	int ib_num_ports = 0; -	int i; -	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) -		ib_num_ports++; - -	if (dev->caps.steering_mode == 
MLX4_STEERING_MODE_DEVICE_MANAGED) { -		if (ib_num_ports || mlx4_is_mfunc(dev)) { -			pr_warn("Device managed flow steering is unavailable " -				"for IB ports or in multifunction env.\n"); -			return 0; +	int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; + +	if (dmfs) { +		int i; +		mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) +			eth_num_ports++; +		mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) +			ib_num_ports++; +		dmfs &= (!ib_num_ports || +			 (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && +			(!eth_num_ports || +			 (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); +		if (ib_num_ports && mlx4_is_mfunc(dev)) { +			pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n"); +			dmfs = 0;  		} -		return 1;  	} -	return 0; +	return dmfs;  }  static int mlx4_ib_query_device(struct ib_device *ibdev, @@ -165,7 +175,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,  			props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B;  		else  			props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; -	if (check_flow_steering_support(dev->dev)) +	if (dev->steering_support ==  MLX4_STEERING_MODE_DEVICE_MANAGED)  		props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;  	} @@ -177,18 +187,18 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,  	props->max_mr_size	   = ~0ull;  	props->page_size_cap	   = dev->dev->caps.page_size_cap; -	props->max_qp		   = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps; +	props->max_qp		   = dev->dev->quotas.qp;  	props->max_qp_wr	   = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE;  	props->max_sge		   = min(dev->dev->caps.max_sq_sg,  					 dev->dev->caps.max_rq_sg); -	props->max_cq		   = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs; +	props->max_cq		   = dev->dev->quotas.cq;  	props->max_cqe		   = dev->dev->caps.max_cqes; -	props->max_mr		   = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws; +	props->max_mr		   = dev->dev->quotas.mpt;  	props->max_pd		   = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds;  	props->max_qp_rd_atom	   = dev->dev->caps.max_qp_dest_rdma;  	props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma;  	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp; -	props->max_srq		   = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs; +	props->max_srq		   = dev->dev->quotas.srq;  	props->max_srq_wr	   = dev->dev->caps.max_srq_wqes - 1;  	props->max_srq_sge	   = dev->dev->caps.max_srq_sge;  	props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; @@ -338,7 +348,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,  	props->active_width	=  (((u8 *)mailbox->buf)[5] == 0x40) ?  						
IB_WIDTH_4X : IB_WIDTH_1X;  	props->active_speed	= IB_SPEED_QDR; -	props->port_cap_flags	= IB_PORT_CM_SUP; +	props->port_cap_flags	= IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS;  	props->gid_tbl_len	= mdev->dev->caps.gid_table_len[port];  	props->max_msg_sz	= mdev->dev->caps.max_msg_sz;  	props->pkey_tbl_len	= 1; @@ -526,7 +536,6 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,  	if (IS_ERR(mailbox))  		return 0; -	memset(mailbox->buf, 0, 256);  	memcpy(mailbox->buf, props->node_desc, 64);  	mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,  		 MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); @@ -536,19 +545,16 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,  	return 0;  } -static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, -			 u32 cap_mask) +static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, +			    u32 cap_mask)  {  	struct mlx4_cmd_mailbox *mailbox;  	int err; -	u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;  	mailbox = mlx4_alloc_cmd_mailbox(dev->dev);  	if (IS_ERR(mailbox))  		return PTR_ERR(mailbox); -	memset(mailbox->buf, 0, 256); -  	if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {  		*(u8 *) mailbox->buf	     = !!reset_qkey_viols << 6;  		((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); @@ -557,8 +563,8 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,  		((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask);  	} -	err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, -		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +	err = mlx4_cmd(dev->dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, +		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);  	mlx4_free_cmd_mailbox(dev->dev, mailbox);  	return err; @@ -567,11 +573,20 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,  static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,  			       struct ib_port_modify *props)  { +	struct mlx4_ib_dev *mdev = to_mdev(ibdev); +	u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;  	struct ib_port_attr attr;  	u32 cap_mask;  	int err; -	mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); +	/* return OK if this is RoCE. CM calls ib_modify_port() regardless +	 * of whether port link layer is ETH or IB. For ETH ports, qkey +	 * violations and port capabilities are not meaningful. 
+	 */ +	if (is_eth) +		return 0; + +	mutex_lock(&mdev->cap_mask_mutex);  	err = mlx4_ib_query_port(ibdev, port, &attr);  	if (err) @@ -580,9 +595,9 @@ static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,  	cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &  		~props->clr_port_cap_mask; -	err = mlx4_SET_PORT(to_mdev(ibdev), port, -			    !!(mask & IB_PORT_RESET_QKEY_CNTR), -			    cap_mask); +	err = mlx4_ib_SET_PORT(mdev, port, +			       !!(mask & IB_PORT_RESET_QKEY_CNTR), +			       cap_mask);  out:  	mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); @@ -790,7 +805,6 @@ static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid)  int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,  		   union ib_gid *gid)  { -	u8 mac[6];  	struct net_device *ndev;  	int ret = 0; @@ -804,11 +818,7 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,  	spin_unlock(&mdev->iboe.lock);  	if (ndev) { -		rdma_get_mcast_mac((struct in6_addr *)gid, mac); -		rtnl_lock(); -		dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac);  		ret = 1; -		rtnl_unlock();  		dev_put(ndev);  	} @@ -822,6 +832,7 @@ struct mlx4_ib_steering {  };  static int parse_flow_attr(struct mlx4_dev *dev, +			   u32 qp_num,  			   union ib_flow_spec *ib_spec,  			   struct _rule_hw *mlx4_spec)  { @@ -837,6 +848,14 @@ static int parse_flow_attr(struct mlx4_dev *dev,  		mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag;  		mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag;  		break; +	case IB_FLOW_SPEC_IB: +		type = MLX4_NET_TRANS_RULE_ID_IB; +		mlx4_spec->ib.l3_qpn = +			cpu_to_be32(qp_num); +		mlx4_spec->ib.qpn_mask = +			cpu_to_be32(MLX4_IB_FLOW_QPN_MASK); +		break; +  	case IB_FLOW_SPEC_IPV4:  		type = MLX4_NET_TRANS_RULE_ID_IPV4; @@ -868,6 +887,115 @@ static int parse_flow_attr(struct mlx4_dev *dev,  	return mlx4_hw_rule_sz(dev, type);  } +struct default_rules { +	__u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; +	__u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; +	__u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS]; +	__u8  link_layer; +}; +static const struct default_rules default_table[] = { +	{ +		.mandatory_fields = {IB_FLOW_SPEC_IPV4}, +		.mandatory_not_fields = {IB_FLOW_SPEC_ETH}, +		.rules_create_list = {IB_FLOW_SPEC_IB}, +		.link_layer = IB_LINK_LAYER_INFINIBAND +	} +}; + +static int __mlx4_ib_default_rules_match(struct ib_qp *qp, +					 struct ib_flow_attr *flow_attr) +{ +	int i, j, k; +	void *ib_flow; +	const struct default_rules *pdefault_rules = default_table; +	u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port); + +	for (i = 0; i < sizeof(default_table)/sizeof(default_table[0]); i++, +	     pdefault_rules++) { +		__u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS]; +		memset(&field_types, 0, sizeof(field_types)); + +		if (link_layer != pdefault_rules->link_layer) +			continue; + +		ib_flow = flow_attr + 1; +		/* we assume the specs are sorted */ +		for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS && +		     j < flow_attr->num_of_specs; k++) { +			union ib_flow_spec *current_flow = +				(union ib_flow_spec *)ib_flow; + +			/* same layer but different type */ +			if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) == +			     (pdefault_rules->mandatory_fields[k] & +			      IB_FLOW_SPEC_LAYER_MASK)) && +			    (current_flow->type != +			     pdefault_rules->mandatory_fields[k])) +				goto out; + +			/* same layer, try match next one */ +			if (current_flow->type == +			    pdefault_rules->mandatory_fields[k]) { +				j++; +				
ib_flow += +					((union ib_flow_spec *)ib_flow)->size; +			} +		} + +		ib_flow = flow_attr + 1; +		for (j = 0; j < flow_attr->num_of_specs; +		     j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size) +			for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++) +				/* same layer and same type */ +				if (((union ib_flow_spec *)ib_flow)->type == +				    pdefault_rules->mandatory_not_fields[k]) +					goto out; + +		return i; +	} +out: +	return -1; +} + +static int __mlx4_ib_create_default_rules( +		struct mlx4_ib_dev *mdev, +		struct ib_qp *qp, +		const struct default_rules *pdefault_rules, +		struct _rule_hw *mlx4_spec) { +	int size = 0; +	int i; + +	for (i = 0; i < sizeof(pdefault_rules->rules_create_list)/ +			sizeof(pdefault_rules->rules_create_list[0]); i++) { +		int ret; +		union ib_flow_spec ib_spec; +		switch (pdefault_rules->rules_create_list[i]) { +		case 0: +			/* no rule */ +			continue; +		case IB_FLOW_SPEC_IB: +			ib_spec.type = IB_FLOW_SPEC_IB; +			ib_spec.size = sizeof(struct ib_flow_spec_ib); + +			break; +		default: +			/* invalid rule */ +			return -EINVAL; +		} +		/* We must put empty rule, qpn is being ignored */ +		ret = parse_flow_attr(mdev->dev, 0, &ib_spec, +				      mlx4_spec); +		if (ret < 0) { +			pr_info("invalid parsing\n"); +			return -EINVAL; +		} + +		mlx4_spec = (void *)mlx4_spec + ret; +		size += ret; +	} +	return size; +} +  static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr,  			  int domain,  			  enum mlx4_net_trans_promisc_mode flow_type, @@ -879,8 +1007,7 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att  	struct mlx4_ib_dev *mdev = to_mdev(qp->device);  	struct mlx4_cmd_mailbox *mailbox;  	struct mlx4_net_trans_rule_hw_ctrl *ctrl; -	size_t rule_size = sizeof(struct mlx4_net_trans_rule_hw_ctrl) + -			   (sizeof(struct _rule_hw) * flow_attr->num_of_specs); +	int default_flow;  	static const u16 __mlx4_domain[] = {  		[IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, @@ -905,7 +1032,6 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att  	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);  	if (IS_ERR(mailbox))  		return PTR_ERR(mailbox); -	memset(mailbox->buf, 0, rule_size);  	ctrl = mailbox->buf;  	ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | @@ -916,8 +1042,21 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att  	ib_flow = flow_attr + 1;  	size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); +	/* Add default flows */ +	default_flow = __mlx4_ib_default_rules_match(qp, flow_attr); +	if (default_flow >= 0) { +		ret = __mlx4_ib_create_default_rules( +				mdev, qp, default_table + default_flow, +				mailbox->buf + size); +		if (ret < 0) { +			mlx4_free_cmd_mailbox(mdev->dev, mailbox); +			return -EINVAL; +		} +		size += ret; +	}  	for (i = 0; i < flow_attr->num_of_specs; i++) { -		ret = parse_flow_attr(mdev->dev, ib_flow, mailbox->buf + size); +		ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow, +				      mailbox->buf + size);  		if (ret < 0) {  			mlx4_free_cmd_mailbox(mdev->dev, mailbox);  			return -EINVAL; @@ -1031,6 +1170,8 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)  	struct mlx4_ib_qp *mqp = to_mqp(ibqp);  	u64 reg_id;  	struct mlx4_ib_steering *ib_steering = NULL; +	enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? 
+		MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6;  	if (mdev->dev->caps.steering_mode ==  	    MLX4_STEERING_MODE_DEVICE_MANAGED) { @@ -1042,7 +1183,7 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)  	err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port,  				    !!(mqp->flags &  				       MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), -				    MLX4_PROT_IB_IPV6, ®_id); +				    prot, ®_id);  	if (err)  		goto err_malloc; @@ -1061,7 +1202,7 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)  err_add:  	mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, -			      MLX4_PROT_IB_IPV6, reg_id); +			      prot, reg_id);  err_malloc:  	kfree(ib_steering); @@ -1089,10 +1230,11 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)  	int err;  	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);  	struct mlx4_ib_qp *mqp = to_mqp(ibqp); -	u8 mac[6];  	struct net_device *ndev;  	struct mlx4_ib_gid_entry *ge;  	u64 reg_id = 0; +	enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? +		MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6;  	if (mdev->dev->caps.steering_mode ==  	    MLX4_STEERING_MODE_DEVICE_MANAGED) { @@ -1115,7 +1257,7 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)  	}  	err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, -				    MLX4_PROT_IB_IPV6, reg_id); +				    prot, reg_id);  	if (err)  		return err; @@ -1127,13 +1269,8 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)  		if (ndev)  			dev_hold(ndev);  		spin_unlock(&mdev->iboe.lock); -		rdma_get_mcast_mac((struct in6_addr *)gid, mac); -		if (ndev) { -			rtnl_lock(); -			dev_mc_del(mdev->iboe.netdevs[ge->port - 1], mac); -			rtnl_unlock(); +		if (ndev)  			dev_put(ndev); -		}  		list_del(&ge->list);  		kfree(ge);  	} else @@ -1229,7 +1366,8 @@ static struct device_attribute *mlx4_class_attributes[] = {  	&dev_attr_board_id  }; -static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev) +static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, +				     struct net_device *dev)  {  	memcpy(eui, dev->dev_addr, 3);  	memcpy(eui + 5, dev->dev_addr + 3, 3); @@ -1265,162 +1403,437 @@ static void update_gids_task(struct work_struct *work)  		       MLX4_CMD_WRAPPED);  	if (err)  		pr_warn("set port command failed\n"); -	else { -		memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids); +	else  		mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); + +	mlx4_free_cmd_mailbox(dev, mailbox); +	kfree(gw); +} + +static void reset_gids_task(struct work_struct *work) +{ +	struct update_gid_work *gw = +			container_of(work, struct update_gid_work, work); +	struct mlx4_cmd_mailbox *mailbox; +	union ib_gid *gids; +	int err; +	struct mlx4_dev	*dev = gw->dev->dev; + +	mailbox = mlx4_alloc_cmd_mailbox(dev); +	if (IS_ERR(mailbox)) { +		pr_warn("reset gid table failed\n"); +		goto free; +	} + +	gids = mailbox->buf; +	memcpy(gids, gw->gids, sizeof(gw->gids)); + +	if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) == +				    IB_LINK_LAYER_ETHERNET) { +		err = mlx4_cmd(dev, mailbox->dma, +			       MLX4_SET_PORT_GID_TABLE << 8 | gw->port, +			       1, MLX4_CMD_SET_PORT, +			       MLX4_CMD_TIME_CLASS_B, +			       MLX4_CMD_WRAPPED); +		if (err) +			pr_warn(KERN_WARNING +				"set port %d command failed\n", gw->port);  	}  	mlx4_free_cmd_mailbox(dev, mailbox); +free:  	kfree(gw);  } -static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int 
clear) +static int update_gid_table(struct mlx4_ib_dev *dev, int port, +			    union ib_gid *gid, int clear, +			    int default_gid)  { -	struct net_device *ndev = dev->iboe.netdevs[port - 1];  	struct update_gid_work *work; -	struct net_device *tmp;  	int i; -	u8 *hits; -	int ret; -	union ib_gid gid; -	int free; -	int found;  	int need_update = 0; -	u16 vid; - -	work = kzalloc(sizeof *work, GFP_ATOMIC); -	if (!work) -		return -ENOMEM; - -	hits = kzalloc(128, GFP_ATOMIC); -	if (!hits) { -		ret = -ENOMEM; -		goto out; -	} +	int free = -1; +	int found = -1; +	int max_gids; -	rcu_read_lock(); -	for_each_netdev_rcu(&init_net, tmp) { -		if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) { -			gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); -			vid = rdma_vlan_dev_vlan_id(tmp); -			mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev); -			found = 0; -			free = -1; -			for (i = 0; i < 128; ++i) { -				if (free < 0 && -				    !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) -					free = i; -				if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) { -					hits[i] = 1; -					found = 1; +	if (default_gid) { +		free = 0; +	} else { +		max_gids = dev->dev->caps.gid_table_len[port]; +		for (i = 1; i < max_gids; ++i) { +			if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid, +				    sizeof(*gid))) +				found = i; + +			if (clear) { +				if (found >= 0) { +					need_update = 1; +					dev->iboe.gid_table[port - 1][found] = +						zgid;  					break;  				} -			} +			} else { +				if (found >= 0) +					break; -			if (!found) { -				if (tmp == ndev && -				    (memcmp(&dev->iboe.gid_table[port - 1][0], -					    &gid, sizeof gid) || -				     !memcmp(&dev->iboe.gid_table[port - 1][0], -					     &zgid, sizeof gid))) { -					dev->iboe.gid_table[port - 1][0] = gid; -					++need_update; -					hits[0] = 1; -				} else if (free >= 0) { -					dev->iboe.gid_table[port - 1][free] = gid; -					hits[free] = 1; -					++need_update; -				} +				if (free < 0 && +				    !memcmp(&dev->iboe.gid_table[port - 1][i], +					    &zgid, sizeof(*gid))) +					free = i;  			}  		}  	} -	rcu_read_unlock(); -	for (i = 0; i < 128; ++i) -		if (!hits[i]) { -			if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) -				++need_update; -			dev->iboe.gid_table[port - 1][i] = zgid; -		} +	if (found == -1 && !clear && free >= 0) { +		dev->iboe.gid_table[port - 1][free] = *gid; +		need_update = 1; +	} -	if (need_update) { -		memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids); -		INIT_WORK(&work->work, update_gids_task); -		work->port = port; -		work->dev = dev; -		queue_work(wq, &work->work); -	} else -		kfree(work); +	if (!need_update) +		return 0; + +	work = kzalloc(sizeof(*work), GFP_ATOMIC); +	if (!work) +		return -ENOMEM; + +	memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids)); +	INIT_WORK(&work->work, update_gids_task); +	work->port = port; +	work->dev = dev; +	queue_work(wq, &work->work); -	kfree(hits);  	return 0; +} -out: -	kfree(work); -	return ret; +static void mlx4_make_default_gid(struct  net_device *dev, union ib_gid *gid) +{ +	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); +	mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev);  } -static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event) + +static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port)  { -	switch (event) { -	case NETDEV_UP: -	case NETDEV_CHANGEADDR: -		update_ipv6_gids(dev, port, 0); -		break; +	struct update_gid_work 
*work; -	case NETDEV_DOWN: -		update_ipv6_gids(dev, port, 1); -		dev->iboe.netdevs[port - 1] = NULL; -	} +	work = kzalloc(sizeof(*work), GFP_ATOMIC); +	if (!work) +		return -ENOMEM; + +	memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids)); +	memset(work->gids, 0, sizeof(work->gids)); +	INIT_WORK(&work->work, reset_gids_task); +	work->dev = dev; +	work->port = port; +	queue_work(wq, &work->work); +	return 0;  } -static void netdev_added(struct mlx4_ib_dev *dev, int port) +static int mlx4_ib_addr_event(int event, struct net_device *event_netdev, +			      struct mlx4_ib_dev *ibdev, union ib_gid *gid)  { -	update_ipv6_gids(dev, port, 0); +	struct mlx4_ib_iboe *iboe; +	int port = 0; +	struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ? +				rdma_vlan_dev_real_dev(event_netdev) : +				event_netdev; +	union ib_gid default_gid; + +	mlx4_make_default_gid(real_dev, &default_gid); + +	if (!memcmp(gid, &default_gid, sizeof(*gid))) +		return 0; + +	if (event != NETDEV_DOWN && event != NETDEV_UP) +		return 0; + +	if ((real_dev != event_netdev) && +	    (event == NETDEV_DOWN) && +	    rdma_link_local_addr((struct in6_addr *)gid)) +		return 0; + +	iboe = &ibdev->iboe; +	spin_lock(&iboe->lock); + +	for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) +		if ((netif_is_bond_master(real_dev) && +		     (real_dev == iboe->masters[port - 1])) || +		     (!netif_is_bond_master(real_dev) && +		     (real_dev == iboe->netdevs[port - 1]))) +			update_gid_table(ibdev, port, gid, +					 event == NETDEV_DOWN, 0); + +	spin_unlock(&iboe->lock); +	return 0; +  } -static void netdev_removed(struct mlx4_ib_dev *dev, int port) +static u8 mlx4_ib_get_dev_port(struct net_device *dev, +			       struct mlx4_ib_dev *ibdev)  { -	update_ipv6_gids(dev, port, 1); +	u8 port = 0; +	struct mlx4_ib_iboe *iboe; +	struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ? 
+				rdma_vlan_dev_real_dev(dev) : dev; + +	iboe = &ibdev->iboe; + +	for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) +		if ((netif_is_bond_master(real_dev) && +		     (real_dev == iboe->masters[port - 1])) || +		     (!netif_is_bond_master(real_dev) && +		     (real_dev == iboe->netdevs[port - 1]))) +			break; + +	if ((port == 0) || (port > ibdev->dev->caps.num_ports)) +		return 0; +	else +		return port;  } -static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, +static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event, +				void *ptr) +{ +	struct mlx4_ib_dev *ibdev; +	struct in_ifaddr *ifa = ptr; +	union ib_gid gid; +	struct net_device *event_netdev = ifa->ifa_dev->dev; + +	ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid); + +	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet); + +	mlx4_ib_addr_event(event, event_netdev, ibdev, &gid); +	return NOTIFY_DONE; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event,  				void *ptr)  { -	struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	struct mlx4_ib_dev *ibdev; -	struct net_device *oldnd; +	struct inet6_ifaddr *ifa = ptr; +	union  ib_gid *gid = (union ib_gid *)&ifa->addr; +	struct net_device *event_netdev = ifa->idev->dev; + +	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6); + +	mlx4_ib_addr_event(event, event_netdev, ibdev, gid); +	return NOTIFY_DONE; +} +#endif + +#define MLX4_IB_INVALID_MAC	((u64)-1) +static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, +			       struct net_device *dev, +			       int port) +{ +	u64 new_smac = 0; +	u64 release_mac = MLX4_IB_INVALID_MAC; +	struct mlx4_ib_qp *qp; + +	read_lock(&dev_base_lock); +	new_smac = mlx4_mac_to_u64(dev->dev_addr); +	read_unlock(&dev_base_lock); + +	mutex_lock(&ibdev->qp1_proxy_lock[port - 1]); +	qp = ibdev->qp1_proxy[port - 1]; +	if (qp) { +		int new_smac_index; +		u64 old_smac = qp->pri.smac; +		struct mlx4_update_qp_params update_params; + +		if (new_smac == old_smac) +			goto unlock; + +		new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac); + +		if (new_smac_index < 0) +			goto unlock; + +		update_params.smac_index = new_smac_index; +		if (mlx4_update_qp(ibdev->dev, &qp->mqp, MLX4_UPDATE_QP_SMAC, +				   &update_params)) { +			release_mac = new_smac; +			goto unlock; +		} + +		qp->pri.smac = new_smac; +		qp->pri.smac_index = new_smac_index; + +		release_mac = old_smac; +	} + +unlock: +	mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); +	if (release_mac != MLX4_IB_INVALID_MAC) +		mlx4_unregister_mac(ibdev->dev, port, release_mac); +} + +static void mlx4_ib_get_dev_addr(struct net_device *dev, +				 struct mlx4_ib_dev *ibdev, u8 port) +{ +	struct in_device *in_dev; +#if IS_ENABLED(CONFIG_IPV6) +	struct inet6_dev *in6_dev; +	union ib_gid  *pgid; +	struct inet6_ifaddr *ifp; +#endif +	union ib_gid gid; + + +	if ((port == 0) || (port > ibdev->dev->caps.num_ports)) +		return; + +	/* IPv4 gids */ +	in_dev = in_dev_get(dev); +	if (in_dev) { +		for_ifa(in_dev) { +			/*ifa->ifa_address;*/ +			ipv6_addr_set_v4mapped(ifa->ifa_address, +					       (struct in6_addr *)&gid); +			update_gid_table(ibdev, port, &gid, 0, 0); +		} +		endfor_ifa(in_dev); +		in_dev_put(in_dev); +	} +#if IS_ENABLED(CONFIG_IPV6) +	/* IPv6 gids */ +	in6_dev = in6_dev_get(dev); +	if (in6_dev) { +		read_lock_bh(&in6_dev->lock); +		list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { +			pgid = (union ib_gid *)&ifp->addr; +			
update_gid_table(ibdev, port, pgid, 0, 0); +		} +		read_unlock_bh(&in6_dev->lock); +		in6_dev_put(in6_dev); +	} +#endif +} + +static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev, +				 struct  net_device *dev, u8 port) +{ +	union ib_gid gid; +	mlx4_make_default_gid(dev, &gid); +	update_gid_table(ibdev, port, &gid, 0, 1); +} + +static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) +{ +	struct	net_device *dev; +	struct mlx4_ib_iboe *iboe = &ibdev->iboe; +	int i; + +	for (i = 1; i <= ibdev->num_ports; ++i) +		if (reset_gid_table(ibdev, i)) +			return -1; + +	read_lock(&dev_base_lock); +	spin_lock(&iboe->lock); + +	for_each_netdev(&init_net, dev) { +		u8 port = mlx4_ib_get_dev_port(dev, ibdev); +		if (port) +			mlx4_ib_get_dev_addr(dev, ibdev, port); +	} + +	spin_unlock(&iboe->lock); +	read_unlock(&dev_base_lock); + +	return 0; +} + +static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, +				 struct net_device *dev, +				 unsigned long event) + +{  	struct mlx4_ib_iboe *iboe; +	int update_qps_port = -1;  	int port; -	if (!net_eq(dev_net(dev), &init_net)) -		return NOTIFY_DONE; - -	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb);  	iboe = &ibdev->iboe;  	spin_lock(&iboe->lock);  	mlx4_foreach_ib_transport_port(port, ibdev->dev) { -		oldnd = iboe->netdevs[port - 1]; +		enum ib_port_state	port_state = IB_PORT_NOP; +		struct net_device *old_master = iboe->masters[port - 1]; +		struct net_device *curr_netdev; +		struct net_device *curr_master; +  		iboe->netdevs[port - 1] =  			mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); -		if (oldnd != iboe->netdevs[port - 1]) { -			if (iboe->netdevs[port - 1]) -				netdev_added(ibdev, port); -			else -				netdev_removed(ibdev, port); +		if (iboe->netdevs[port - 1]) +			mlx4_ib_set_default_gid(ibdev, +						iboe->netdevs[port - 1], port); +		curr_netdev = iboe->netdevs[port - 1]; + +		if (iboe->netdevs[port - 1] && +		    netif_is_bond_slave(iboe->netdevs[port - 1])) { +			iboe->masters[port - 1] = netdev_master_upper_dev_get( +				iboe->netdevs[port - 1]); +		} else { +			iboe->masters[port - 1] = NULL; +		} +		curr_master = iboe->masters[port - 1]; + +		if (dev == iboe->netdevs[port - 1] && +		    (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || +		     event == NETDEV_UP || event == NETDEV_CHANGE)) +			update_qps_port = port; + +		if (curr_netdev) { +			port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ? +						IB_PORT_ACTIVE : IB_PORT_DOWN; +			mlx4_ib_set_default_gid(ibdev, curr_netdev, port); +		} else { +			reset_gid_table(ibdev, port); +		} +		/* if using bonding/team and a slave port is down, we don't the bond IP +		 * based gids in the table since flows that select port by gid may get +		 * the down port. +		 */ +		if (curr_master && (port_state == IB_PORT_DOWN)) { +			reset_gid_table(ibdev, port); +			mlx4_ib_set_default_gid(ibdev, curr_netdev, port); +		} +		/* if bonding is used it is possible that we add it to masters +		 * only after IP address is assigned to the net bonding +		 * interface. 
+		*/ +		if (curr_master && (old_master != curr_master)) { +			reset_gid_table(ibdev, port); +			mlx4_ib_set_default_gid(ibdev, curr_netdev, port); +			mlx4_ib_get_dev_addr(curr_master, ibdev, port);  		} -	} -	if (dev == iboe->netdevs[0] || -	    (iboe->netdevs[0] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[0])) -		handle_en_event(ibdev, 1, event); -	else if (dev == iboe->netdevs[1] -		 || (iboe->netdevs[1] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[1])) -		handle_en_event(ibdev, 2, event); +		if (!curr_master && (old_master != curr_master)) { +			reset_gid_table(ibdev, port); +			mlx4_ib_set_default_gid(ibdev, curr_netdev, port); +			mlx4_ib_get_dev_addr(curr_netdev, ibdev, port); +		} +	}  	spin_unlock(&iboe->lock); +	if (update_qps_port > 0) +		mlx4_ib_update_qps(ibdev, dev, update_qps_port); +} + +static int mlx4_ib_netdev_event(struct notifier_block *this, +				unsigned long event, void *ptr) +{ +	struct net_device *dev = netdev_notifier_info_to_dev(ptr); +	struct mlx4_ib_dev *ibdev; + +	if (!net_eq(dev_net(dev), &init_net)) +		return NOTIFY_DONE; + +	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); +	mlx4_ib_scan_netdevs(ibdev, dev, event); +  	return NOTIFY_DONE;  } @@ -1458,7 +1871,7 @@ static void init_pkeys(struct mlx4_ib_dev *ibdev)  static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)  { -	char name[32]; +	char name[80];  	int eq_per_port = 0;  	int added_eqs = 0;  	int total_eqs = 0; @@ -1488,8 +1901,8 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)  	eq = 0;  	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) {  		for (j = 0; j < eq_per_port; j++) { -			sprintf(name, "mlx4-ib-%d-%d@%s", -				i, j, dev->pdev->bus->name); +			snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s", +				 i, j, dev->pdev->bus->name);  			/* Set IRQ for specific name (per ring) */  			if (mlx4_assign_eq(dev, name, NULL,  					   &ibdev->eq_table[eq])) { @@ -1539,17 +1952,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)  	int i, j;  	int err;  	struct mlx4_ib_iboe *iboe; +	int ib_num_ports = 0;  	pr_info_once("%s", mlx4_ib_version); -	mlx4_foreach_non_ib_transport_port(i, dev) -		num_ports++; - -	if (mlx4_is_mfunc(dev) && num_ports) { -		dev_err(&dev->pdev->dev, "RoCE is not supported over SRIOV as yet\n"); -		return NULL; -	} -  	num_ports = 0;  	mlx4_foreach_ib_transport_port(i, dev)  		num_ports++; @@ -1688,12 +2094,13 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)  	}  	if (check_flow_steering_support(dev)) { +		ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED;  		ibdev->ib_dev.create_flow	= mlx4_ib_create_flow;  		ibdev->ib_dev.destroy_flow	= mlx4_ib_destroy_flow; -		ibdev->ib_dev.uverbs_cmd_mask	|= -			(1ull << IB_USER_VERBS_CMD_CREATE_FLOW) | -			(1ull << IB_USER_VERBS_CMD_DESTROY_FLOW); +		ibdev->ib_dev.uverbs_ex_cmd_mask	|= +			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | +			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);  	}  	mlx4_ib_alloc_eqs(dev, ibdev); @@ -1704,20 +2111,53 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)  		goto err_map;  	for (i = 0; i < ibdev->num_ports; ++i) { +		mutex_init(&ibdev->qp1_proxy_lock[i]);  		if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==  						IB_LINK_LAYER_ETHERNET) {  			err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[i]);  			if (err)  				ibdev->counters[i] = -1; -		} else -				ibdev->counters[i] = -1; +		} else { +			ibdev->counters[i] = -1; +		}  	} +	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) +		ib_num_ports++; +  	spin_lock_init(&ibdev->sm_lock); 
 	mutex_init(&ibdev->cap_mask_mutex); +	if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && +	    ib_num_ports) { +		ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; +		err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, +					    MLX4_IB_UC_STEER_QPN_ALIGN, +					    &ibdev->steer_qpn_base); +		if (err) +			goto err_counter; + +		ibdev->ib_uc_qpns_bitmap = +			kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * +				sizeof(long), +				GFP_KERNEL); +		if (!ibdev->ib_uc_qpns_bitmap) { +			dev_err(&dev->pdev->dev, "bit map alloc failed\n"); +			goto err_steer_qp_release; +		} + +		bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); + +		err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE( +				dev, ibdev->steer_qpn_base, +				ibdev->steer_qpn_base + +				ibdev->steer_qpn_count - 1); +		if (err) +			goto err_steer_free_bitmap; +	} +  	if (ib_register_device(&ibdev->ib_dev, NULL)) -		goto err_counter; +		goto err_steer_free_bitmap;  	if (mlx4_ib_mad_init(ibdev))  		goto err_reg; @@ -1725,11 +2165,39 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)  	if (mlx4_ib_init_sriov(ibdev))  		goto err_mad; -	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) { -		iboe->nb.notifier_call = mlx4_ib_netdev_event; -		err = register_netdevice_notifier(&iboe->nb); -		if (err) -			goto err_sriov; +	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { +		if (!iboe->nb.notifier_call) { +			iboe->nb.notifier_call = mlx4_ib_netdev_event; +			err = register_netdevice_notifier(&iboe->nb); +			if (err) { +				iboe->nb.notifier_call = NULL; +				goto err_notif; +			} +		} +		if (!iboe->nb_inet.notifier_call) { +			iboe->nb_inet.notifier_call = mlx4_ib_inet_event; +			err = register_inetaddr_notifier(&iboe->nb_inet); +			if (err) { +				iboe->nb_inet.notifier_call = NULL; +				goto err_notif; +			} +		} +#if IS_ENABLED(CONFIG_IPV6) +		if (!iboe->nb_inet6.notifier_call) { +			iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event; +			err = register_inet6addr_notifier(&iboe->nb_inet6); +			if (err) { +				iboe->nb_inet6.notifier_call = NULL; +				goto err_notif; +			} +		} +#endif +		for (i = 1 ; i <= ibdev->num_ports ; ++i) +			reset_gid_table(ibdev, i); +		rtnl_lock(); +		mlx4_ib_scan_netdevs(ibdev, NULL, 0); +		rtnl_unlock(); +		mlx4_ib_init_gid_table(ibdev);  	}  	for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { @@ -1755,11 +2223,25 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)  	return ibdev;  err_notif: -	if (unregister_netdevice_notifier(&ibdev->iboe.nb)) -		pr_warn("failure unregistering notifier\n"); +	if (ibdev->iboe.nb.notifier_call) { +		if (unregister_netdevice_notifier(&ibdev->iboe.nb)) +			pr_warn("failure unregistering notifier\n"); +		ibdev->iboe.nb.notifier_call = NULL; +	} +	if (ibdev->iboe.nb_inet.notifier_call) { +		if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) +			pr_warn("failure unregistering notifier\n"); +		ibdev->iboe.nb_inet.notifier_call = NULL; +	} +#if IS_ENABLED(CONFIG_IPV6) +	if (ibdev->iboe.nb_inet6.notifier_call) { +		if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) +			pr_warn("failure unregistering notifier\n"); +		ibdev->iboe.nb_inet6.notifier_call = NULL; +	} +#endif  	flush_workqueue(wq); -err_sriov:  	mlx4_ib_close_sriov(ibdev);  err_mad: @@ -1768,6 +2250,13 @@ err_mad:  err_reg:  	ib_unregister_device(&ibdev->ib_dev); +err_steer_free_bitmap: +	kfree(ibdev->ib_uc_qpns_bitmap); + +err_steer_qp_release: +	if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) +		mlx4_qp_release_range(dev, ibdev->steer_qpn_base, 
+				      ibdev->steer_qpn_count);  err_counter:  	for (; i; --i)  		if (ibdev->counters[i - 1] != -1) @@ -1788,6 +2277,69 @@ err_dealloc:  	return NULL;  } +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) +{ +	int offset; + +	WARN_ON(!dev->ib_uc_qpns_bitmap); + +	offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, +					 dev->steer_qpn_count, +					 get_count_order(count)); +	if (offset < 0) +		return offset; + +	*qpn = dev->steer_qpn_base + offset; +	return 0; +} + +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) +{ +	if (!qpn || +	    dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED) +		return; + +	BUG_ON(qpn < dev->steer_qpn_base); + +	bitmap_release_region(dev->ib_uc_qpns_bitmap, +			      qpn - dev->steer_qpn_base, +			      get_count_order(count)); +} + +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, +			 int is_attach) +{ +	int err; +	size_t flow_size; +	struct ib_flow_attr *flow = NULL; +	struct ib_flow_spec_ib *ib_spec; + +	if (is_attach) { +		flow_size = sizeof(struct ib_flow_attr) + +			    sizeof(struct ib_flow_spec_ib); +		flow = kzalloc(flow_size, GFP_KERNEL); +		if (!flow) +			return -ENOMEM; +		flow->port = mqp->port; +		flow->num_of_specs = 1; +		flow->size = flow_size; +		ib_spec = (struct ib_flow_spec_ib *)(flow + 1); +		ib_spec->type = IB_FLOW_SPEC_IB; +		ib_spec->size = sizeof(struct ib_flow_spec_ib); +		/* Add an empty rule for IB L2 */ +		memset(&ib_spec->mask, 0, sizeof(ib_spec->mask)); + +		err = __mlx4_ib_create_flow(&mqp->ibqp, flow, +					    IB_FLOW_DOMAIN_NIC, +					    MLX4_FS_REGULAR, +					    &mqp->reg_id); +	} else { +		err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); +	} +	kfree(flow); +	return err; +} +  static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)  {  	struct mlx4_ib_dev *ibdev = ibdev_ptr; @@ -1801,6 +2353,26 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)  			pr_warn("failure unregistering notifier\n");  		ibdev->iboe.nb.notifier_call = NULL;  	} + +	if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) { +		mlx4_qp_release_range(dev, ibdev->steer_qpn_base, +				      ibdev->steer_qpn_count); +		kfree(ibdev->ib_uc_qpns_bitmap); +	} + +	if (ibdev->iboe.nb_inet.notifier_call) { +		if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) +			pr_warn("failure unregistering notifier\n"); +		ibdev->iboe.nb_inet.notifier_call = NULL; +	} +#if IS_ENABLED(CONFIG_IPV6) +	if (ibdev->iboe.nb_inet6.notifier_call) { +		if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) +			pr_warn("failure unregistering notifier\n"); +		ibdev->iboe.nb_inet6.notifier_call = NULL; +	} +#endif +  	iounmap(ibdev->uar_map);  	for (p = 0; p < ibdev->num_ports; ++p)  		if (ibdev->counters[p] != -1) @@ -1821,17 +2393,24 @@ static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init)  	struct mlx4_dev *dev = ibdev->dev;  	int i;  	unsigned long flags; +	struct mlx4_active_ports actv_ports; +	unsigned int ports; +	unsigned int first_port;  	if (!mlx4_is_master(dev))  		return; -	dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC); +	actv_ports = mlx4_get_active_ports(dev, slave); +	ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports); +	first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); + +	dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC);  	if (!dm) {  		pr_err("failed to allocate memory for tunneling qp update\n");  		goto out;  	} -	for (i = 0; i < dev->caps.num_ports; i++) { +	for (i = 0; 
i < ports; i++) {  		dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC);  		if (!dm[i]) {  			pr_err("failed to allocate memory for tunneling qp update work struct\n"); @@ -1843,9 +2422,9 @@ static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init)  		}  	}  	/* initialize or tear down tunnel QPs for the slave */ -	for (i = 0; i < dev->caps.num_ports; i++) { +	for (i = 0; i < ports; i++) {  		INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); -		dm[i]->port = i + 1; +		dm[i]->port = first_port + i + 1;  		dm[i]->slave = slave;  		dm[i]->do_init = do_init;  		dm[i]->dev = ibdev; diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c index 25b2cdff00f..ed327e6c8fd 100644 --- a/drivers/infiniband/hw/mlx4/mcg.c +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -215,8 +215,9 @@ static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)  	}  	mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);  	spin_unlock(&dev->sm_lock); -	return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port, -				    IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, mad); +	return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), +				    ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY, +				    &ah_attr, NULL, mad);  }  static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 036b663dd26..369da3ca5d6 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -68,6 +68,8 @@ enum {  /*module param to indicate if SM assigns the alias_GUID*/  extern int mlx4_ib_sm_guid_assign; +#define MLX4_IB_UC_STEER_QPN_ALIGN 1 +#define MLX4_IB_UC_MAX_NUM_QPS     256  struct mlx4_ib_ucontext {  	struct ib_ucontext	ibucontext;  	struct mlx4_uar		uar; @@ -153,6 +155,8 @@ struct mlx4_ib_wq {  enum mlx4_ib_qp_flags {  	MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO,  	MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, +	MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, +	MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO,  	MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30,  	MLX4_IB_SRIOV_SQP = 1 << 31,  }; @@ -238,6 +242,22 @@ struct mlx4_ib_proxy_sqp_hdr {  	struct mlx4_rcv_tunnel_hdr tun;  }  __packed; +struct mlx4_roce_smac_vlan_info { +	u64 smac; +	int smac_index; +	int smac_port; +	u64 candidate_smac; +	int candidate_smac_index; +	int candidate_smac_port; +	u16 vid; +	int vlan_index; +	int vlan_port; +	u16 candidate_vid; +	int candidate_vlan_index; +	int candidate_vlan_port; +	int update_vid; +}; +  struct mlx4_ib_qp {  	struct ib_qp		ibqp;  	struct mlx4_qp		mqp; @@ -270,7 +290,9 @@ struct mlx4_ib_qp {  	struct list_head	gid_list;  	struct list_head	steering_rules;  	struct mlx4_ib_buf	*sqp_proxy_rcv; - +	struct mlx4_roce_smac_vlan_info pri; +	struct mlx4_roce_smac_vlan_info alt; +	u64			reg_id;  };  struct mlx4_ib_srq { @@ -428,7 +450,10 @@ struct mlx4_ib_sriov {  struct mlx4_ib_iboe {  	spinlock_t		lock;  	struct net_device      *netdevs[MLX4_MAX_PORTS]; +	struct net_device      *masters[MLX4_MAX_PORTS];  	struct notifier_block 	nb; +	struct notifier_block	nb_inet; +	struct notifier_block	nb_inet6;  	union ib_gid		gid_table[MLX4_MAX_PORTS][128];  }; @@ -494,6 +519,13 @@ struct mlx4_ib_dev {  	struct kobject	       *dev_ports_parent[MLX4_MFUNC_MAX];  	struct mlx4_ib_iov_port	iov_ports[MLX4_MAX_PORTS];  	struct pkey_mgt		pkeys; +	unsigned long *ib_uc_qpns_bitmap; +	int steer_qpn_count; +	int 
steer_qpn_base; +	int steering_support; +	struct mlx4_ib_qp      *qp1_proxy[MLX4_MAX_PORTS]; +	/* lock when destroying qp1_proxy and getting netdev events */ +	struct mutex		qp1_proxy_lock[MLX4_MAX_PORTS];  };  struct ib_event_work { @@ -675,9 +707,6 @@ int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,  int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,  			union ib_gid *gid, int netw_view); -int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, -			u8 *mac, int *is_mcast, u8 port); -  static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)  {  	u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; @@ -712,9 +741,12 @@ void mlx4_ib_tunnels_update_work(struct work_struct *work);  int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,  			  enum ib_qp_type qpt, struct ib_wc *wc,  			  struct ib_grh *grh, struct ib_mad *mad); +  int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,  			 enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, -			 u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad); +			 u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, +			 struct ib_mad *mad); +  __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx);  int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, @@ -752,5 +784,9 @@ void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device);  __be64 mlx4_ib_gen_node_guid(void); +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, +			 int is_attach);  #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index e471f089ff0..cb2a8727f3f 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -90,11 +90,11 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,  			   struct ib_umem *umem)  {  	u64 *pages; -	struct ib_umem_chunk *chunk; -	int i, j, k; +	int i, k, entry;  	int n;  	int len;  	int err = 0; +	struct scatterlist *sg;  	pages = (u64 *) __get_free_page(GFP_KERNEL);  	if (!pages) @@ -102,26 +102,25 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,  	i = n = 0; -	list_for_each_entry(chunk, &umem->chunk_list, list) -		for (j = 0; j < chunk->nmap; ++j) { -			len = sg_dma_len(&chunk->page_list[j]) >> mtt->page_shift; -			for (k = 0; k < len; ++k) { -				pages[i++] = sg_dma_address(&chunk->page_list[j]) + -					umem->page_size * k; -				/* -				 * Be friendly to mlx4_write_mtt() and -				 * pass it chunks of appropriate size. -				 */ -				if (i == PAGE_SIZE / sizeof (u64)) { -					err = mlx4_write_mtt(dev->dev, mtt, n, -							     i, pages); -					if (err) -						goto out; -					n += i; -					i = 0; -				} +	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { +		len = sg_dma_len(sg) >> mtt->page_shift; +		for (k = 0; k < len; ++k) { +			pages[i++] = sg_dma_address(sg) + +				umem->page_size * k; +			/* +			 * Be friendly to mlx4_write_mtt() and +			 * pass it chunks of appropriate size. 
+			 */ +			if (i == PAGE_SIZE / sizeof (u64)) { +				err = mlx4_write_mtt(dev->dev, mtt, n, +						     i, pages); +				if (err) +					goto out; +				n += i; +				i = 0;  			}  		} +	}  	if (i)  		err = mlx4_write_mtt(dev->dev, mtt, n, i, pages); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 4f10af2905b..67780452f0c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -90,6 +90,21 @@ enum {  	MLX4_RAW_QP_MSGMAX	= 31,  }; +#ifndef ETH_ALEN +#define ETH_ALEN        6 +#endif +static inline u64 mlx4_mac_to_u64(u8 *addr) +{ +	u64 mac = 0; +	int i; + +	for (i = 0; i < ETH_ALEN; i++) { +		mac <<= 8; +		mac |= addr[i]; +	} +	return mac; +} +  static const __be32 mlx4_ib_opcode[] = {  	[IB_WR_SEND]				= cpu_to_be32(MLX4_OPCODE_SEND),  	[IB_WR_LSO]				= cpu_to_be32(MLX4_OPCODE_LSO), @@ -593,9 +608,20 @@ static int qp_has_rq(struct ib_qp_init_attr *attr)  	return !attr->srq;  } +static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn) +{ +	int i; +	for (i = 0; i < dev->caps.num_ports; i++) { +		if (qpn == dev->caps.qp0_proxy[i]) +			return !!dev->caps.qp0_qkey[i]; +	} +	return 0; +} +  static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,  			    struct ib_qp_init_attr *init_attr, -			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) +			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp, +			    gfp_t gfp)  {  	int qpn;  	int err; @@ -610,10 +636,13 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,  		     !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) {  			if (init_attr->qp_type == IB_QPT_GSI)  				qp_type = MLX4_IB_QPT_PROXY_GSI; -			else if (mlx4_is_master(dev->dev)) -				qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; -			else -				qp_type = MLX4_IB_QPT_PROXY_SMI; +			else { +				if (mlx4_is_master(dev->dev) || +				    qp0_enabled_vf(dev->dev, sqpn)) +					qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; +				else +					qp_type = MLX4_IB_QPT_PROXY_SMI; +			}  		}  		qpn = sqpn;  		/* add extra sg entry for tunneling */ @@ -628,7 +657,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,  			return -EINVAL;  		if (tnl_init->proxy_qp_type == IB_QPT_GSI)  			qp_type = MLX4_IB_QPT_TUN_GSI; -		else if (tnl_init->slave == mlx4_master_func_num(dev->dev)) +		else if (tnl_init->slave == mlx4_master_func_num(dev->dev) || +			 mlx4_vf_smi_enabled(dev->dev, tnl_init->slave, +					     tnl_init->port))  			qp_type = MLX4_IB_QPT_TUN_SMI_OWNER;  		else  			qp_type = MLX4_IB_QPT_TUN_SMI; @@ -643,14 +674,18 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,  		if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI ||  		    (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER |  				MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { -			sqp = kzalloc(sizeof (struct mlx4_ib_sqp), GFP_KERNEL); +			sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp);  			if (!sqp)  				return -ENOMEM;  			qp = &sqp->qp; +			qp->pri.vid = 0xFFFF; +			qp->alt.vid = 0xFFFF;  		} else { -			qp = kzalloc(sizeof (struct mlx4_ib_qp), GFP_KERNEL); +			qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp);  			if (!qp)  				return -ENOMEM; +			qp->pri.vid = 0xFFFF; +			qp->alt.vid = 0xFFFF;  		}  	} else  		qp = *caller_qp; @@ -716,19 +751,27 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,  		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)  			qp->flags |= MLX4_IB_QP_LSO; +		if (init_attr->create_flags & 
IB_QP_CREATE_NETIF_QP) { +			if (dev->steering_support == +			    MLX4_STEERING_MODE_DEVICE_MANAGED) +				qp->flags |= MLX4_IB_QP_NETIF; +			else +				goto err; +		} +  		err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);  		if (err)  			goto err;  		if (qp_has_rq(init_attr)) { -			err = mlx4_db_alloc(dev->dev, &qp->db, 0); +			err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp);  			if (err)  				goto err;  			*qp->db.db = 0;  		} -		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { +		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) {  			err = -ENOMEM;  			goto err_db;  		} @@ -738,13 +781,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,  		if (err)  			goto err_buf; -		err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); +		err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp);  		if (err)  			goto err_mtt; -		qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL); -		qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL); - +		qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); +		qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp);  		if (!qp->sq.wrid || !qp->rq.wrid) {  			err = -ENOMEM;  			goto err_wrid; @@ -765,12 +807,16 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,  		if (init_attr->qp_type == IB_QPT_RAW_PACKET)  			err = mlx4_qp_reserve_range(dev->dev, 1, 1 << 8, &qpn);  		else -			err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn); +			if (qp->flags & MLX4_IB_QP_NETIF) +				err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); +			else +				err = mlx4_qp_reserve_range(dev->dev, 1, 1, +							    &qpn);  		if (err)  			goto err_proxy;  	} -	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); +	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp);  	if (err)  		goto err_qpn; @@ -790,8 +836,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,  	return 0;  err_qpn: -	if (!sqpn) -		mlx4_qp_release_range(dev->dev, qpn, 1); +	if (!sqpn) { +		if (qp->flags & MLX4_IB_QP_NETIF) +			mlx4_ib_steer_qp_free(dev, qpn, 1); +		else +			mlx4_qp_release_range(dev->dev, qpn, 1); +	}  err_proxy:  	if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)  		free_proxy_bufs(pd->device, qp); @@ -909,11 +959,32 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,  {  	struct mlx4_ib_cq *send_cq, *recv_cq; -	if (qp->state != IB_QPS_RESET) +	if (qp->state != IB_QPS_RESET) {  		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),  				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))  			pr_warn("modify QP %06x to RESET failed.\n",  			       qp->mqp.qpn); +		if (qp->pri.smac) { +			mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); +			qp->pri.smac = 0; +		} +		if (qp->alt.smac) { +			mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); +			qp->alt.smac = 0; +		} +		if (qp->pri.vid < 0x1000) { +			mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); +			qp->pri.vid = 0xFFFF; +			qp->pri.candidate_vid = 0xFFFF; +			qp->pri.update_vid = 0; +		} +		if (qp->alt.vid < 0x1000) { +			mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); +			qp->alt.vid = 0xFFFF; +			qp->alt.candidate_vid = 0xFFFF; +			qp->alt.update_vid = 0; +		} +	}  	get_cqs(qp, &send_cq, &recv_cq); @@ -932,8 +1003,12 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,  	mlx4_qp_free(dev->dev, &qp->mqp); -	if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) -		mlx4_qp_release_range(dev->dev, 
qp->mqp.qpn, 1); +	if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { +		if (qp->flags & MLX4_IB_QP_NETIF) +			mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); +		else +			mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); +	}  	mlx4_mtt_cleanup(dev->dev, &qp->mtt); @@ -980,19 +1055,30 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,  	struct mlx4_ib_qp *qp = NULL;  	int err;  	u16 xrcdn = 0; +	gfp_t gfp; +	gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ? +		GFP_NOIO : GFP_KERNEL;  	/*  	 * We only support LSO, vendor flag1, and multicast loopback blocking,  	 * and only for kernel UD QPs.  	 */  	if (init_attr->create_flags & ~(MLX4_IB_QP_LSO |  					MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | -					MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP)) +					MLX4_IB_SRIOV_TUNNEL_QP | +					MLX4_IB_SRIOV_SQP | +					MLX4_IB_QP_NETIF | +					MLX4_IB_QP_CREATE_USE_GFP_NOIO))  		return ERR_PTR(-EINVAL); +	if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { +		if (init_attr->qp_type != IB_QPT_UD) +			return ERR_PTR(-EINVAL); +	} +  	if (init_attr->create_flags &&  	    (udata || -	     ((init_attr->create_flags & ~MLX4_IB_SRIOV_SQP) && +	     ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) &&  	      init_attr->qp_type != IB_QPT_UD) ||  	     ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) &&  	      init_attr->qp_type > IB_QPT_GSI))) @@ -1012,14 +1098,16 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,  	case IB_QPT_RC:  	case IB_QPT_UC:  	case IB_QPT_RAW_PACKET: -		qp = kzalloc(sizeof *qp, GFP_KERNEL); +		qp = kzalloc(sizeof *qp, gfp);  		if (!qp)  			return ERR_PTR(-ENOMEM); +		qp->pri.vid = 0xFFFF; +		qp->alt.vid = 0xFFFF;  		/* fall through */  	case IB_QPT_UD:  	{  		err = create_qp_common(to_mdev(pd->device), pd, init_attr, -				       udata, 0, &qp); +				       udata, 0, &qp, gfp);  		if (err)  			return ERR_PTR(err); @@ -1037,7 +1125,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,  		err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata,  				       get_sqp_num(to_mdev(pd->device), init_attr), -				       &qp); +				       &qp, gfp);  		if (err)  			return ERR_PTR(err); @@ -1063,6 +1151,12 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp)  	if (is_qp0(dev, mqp))  		mlx4_CLOSE_PORT(dev->dev, mqp->port); +	if (dev->qp1_proxy[mqp->port - 1] == mqp) { +		mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]); +		dev->qp1_proxy[mqp->port - 1] = NULL; +		mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]); +	} +  	pd = get_pd(mqp);  	destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); @@ -1144,16 +1238,16 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)  	path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);  } -static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, -			 struct mlx4_qp_path *path, u8 port) +static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, +			  u64 smac, u16 vlan_tag, struct mlx4_qp_path *path, +			  struct mlx4_roce_smac_vlan_info *smac_info, u8 port)  { -	int err;  	int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) ==  		IB_LINK_LAYER_ETHERNET; -	u8 mac[6]; -	int is_mcast; -	u16 vlan_tag;  	int vidx; +	int smac_index; +	int err; +  	path->grh_mylmc     = ah->src_path_bits & 0x7f;  	path->rlid	    = cpu_to_be16(ah->dlid); @@ -1182,36 +1276,105 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,  	}  	if (is_eth) { -		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | -			((port - 1) << 6) | ((ah->sl & 7) << 3); 
-  		if (!(ah->ah_flags & IB_AH_GRH))  			return -1; -		err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port); -		if (err) -			return err; - -		memcpy(path->dmac, mac, 6); -		path->ackto = MLX4_IB_LINK_TYPE_ETH; -		/* use index 0 into MAC table for IBoE */ -		path->grh_mylmc &= 0x80; +		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | +			((port - 1) << 6) | ((ah->sl & 7) << 3); -		vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]); +		path->feup |= MLX4_FEUP_FORCE_ETH_UP;  		if (vlan_tag < 0x1000) { -			if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx)) -				return -ENOENT; - -			path->vlan_index = vidx; +			if (smac_info->vid < 0x1000) { +				/* both valid vlan ids */ +				if (smac_info->vid != vlan_tag) { +					/* different VIDs.  unreg old and reg new */ +					err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); +					if (err) +						return err; +					smac_info->candidate_vid = vlan_tag; +					smac_info->candidate_vlan_index = vidx; +					smac_info->candidate_vlan_port = port; +					smac_info->update_vid = 1; +					path->vlan_index = vidx; +				} else { +					path->vlan_index = smac_info->vlan_index; +				} +			} else { +				/* no current vlan tag in qp */ +				err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); +				if (err) +					return err; +				smac_info->candidate_vid = vlan_tag; +				smac_info->candidate_vlan_index = vidx; +				smac_info->candidate_vlan_port = port; +				smac_info->update_vid = 1; +				path->vlan_index = vidx; +			} +			path->feup |= MLX4_FVL_FORCE_ETH_VLAN;  			path->fl = 1 << 6; +		} else { +			/* have current vlan tag. unregister it at modify-qp success */ +			if (smac_info->vid < 0x1000) { +				smac_info->candidate_vid = 0xFFFF; +				smac_info->update_vid = 1; +			}  		} -	} else + +		/* get smac_index for RoCE use. +		 * If no smac was yet assigned, register one. +		 * If one was already assigned, but the new mac differs, +		 * unregister the old one and register the new one. +		*/ +		if (!smac_info->smac || smac_info->smac != smac) { +			/* register candidate now, unreg if needed, after success */ +			smac_index = mlx4_register_mac(dev->dev, port, smac); +			if (smac_index >= 0) { +				smac_info->candidate_smac_index = smac_index; +				smac_info->candidate_smac = smac; +				smac_info->candidate_smac_port = port; +			} else { +				return -EINVAL; +			} +		} else { +			smac_index = smac_info->smac_index; +		} + +		memcpy(path->dmac, ah->dmac, 6); +		path->ackto = MLX4_IB_LINK_TYPE_ETH; +		/* put MAC table smac index for IBoE */ +		path->grh_mylmc = (u8) (smac_index) | 0x80; +	} else {  		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |  			((port - 1) << 6) | ((ah->sl & 0xf) << 2); +	}  	return 0;  } +static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, +			 enum ib_qp_attr_mask qp_attr_mask, +			 struct mlx4_ib_qp *mqp, +			 struct mlx4_qp_path *path, u8 port) +{ +	return _mlx4_set_path(dev, &qp->ah_attr, +			      mlx4_mac_to_u64((u8 *)qp->smac), +			      (qp_attr_mask & IB_QP_VID) ? qp->vlan_id : 0xffff, +			      path, &mqp->pri, port); +} + +static int mlx4_set_alt_path(struct mlx4_ib_dev *dev, +			     const struct ib_qp_attr *qp, +			     enum ib_qp_attr_mask qp_attr_mask, +			     struct mlx4_ib_qp *mqp, +			     struct mlx4_qp_path *path, u8 port) +{ +	return _mlx4_set_path(dev, &qp->alt_ah_attr, +			      mlx4_mac_to_u64((u8 *)qp->alt_smac), +			      (qp_attr_mask & IB_QP_ALT_VID) ? 
+			      qp->alt_vlan_id : 0xffff, +			      path, &mqp->alt, port); +} +  static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)  {  	struct mlx4_ib_gid_entry *ge, *tmp; @@ -1224,6 +1387,37 @@ static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)  	}  } +static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac, +				    struct mlx4_qp_context *context) +{ +	struct net_device *ndev; +	u64 u64_mac; +	int smac_index; + + +	ndev = dev->iboe.netdevs[qp->port - 1]; +	if (ndev) { +		smac = ndev->dev_addr; +		u64_mac = mlx4_mac_to_u64(smac); +	} else { +		u64_mac = dev->dev->caps.def_mac[qp->port]; +	} + +	context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); +	if (!qp->pri.smac) { +		smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); +		if (smac_index >= 0) { +			qp->pri.candidate_smac_index = smac_index; +			qp->pri.candidate_smac = u64_mac; +			qp->pri.candidate_smac_port = qp->port; +			context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; +		} else { +			return -ENOENT; +		} +	} +	return 0; +} +  static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,  			       const struct ib_qp_attr *attr, int attr_mask,  			       enum ib_qp_state cur_state, enum ib_qp_state new_state) @@ -1235,6 +1429,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,  	struct mlx4_qp_context *context;  	enum mlx4_qp_optpar optpar = 0;  	int sqd_event; +	int steer_qp = 0;  	int err = -EINVAL;  	context = kzalloc(sizeof *context, GFP_KERNEL); @@ -1319,6 +1514,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,  			optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;  		} else  			context->pri_path.counter_index = 0xff; + +		if (qp->flags & MLX4_IB_QP_NETIF) { +			mlx4_ib_steer_qp_reg(dev, qp, 1); +			steer_qp = 1; +		}  	}  	if (attr_mask & IB_QP_PKEY_INDEX) { @@ -1329,7 +1529,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,  	}  	if (attr_mask & IB_QP_AV) { -		if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path, +		if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path,  				  attr_mask & IB_QP_PORT ?  				  
attr->port_num : qp->port))  			goto out; @@ -1352,8 +1552,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,  		    dev->dev->caps.pkey_table_len[attr->alt_port_num])  			goto out; -		if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path, -				  attr->alt_port_num)) +		if (mlx4_set_alt_path(dev, attr, attr_mask, qp, +				      &context->alt_path, +				      attr->alt_port_num))  			goto out;  		context->alt_path.pkey_index = attr->alt_pkey_index; @@ -1458,12 +1659,39 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,  				context->pri_path.fl = 0x80;  			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;  		} +		if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) == +		    IB_LINK_LAYER_ETHERNET) { +			if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || +			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) +				context->pri_path.feup = 1 << 7; /* don't fsm */ +			/* handle smac_index */ +			if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || +			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || +			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { +				err = handle_eth_ud_smac_index(dev, qp, (u8 *)attr->smac, context); +				if (err) +					return -EINVAL; +				if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) +					dev->qp1_proxy[qp->port - 1] = qp; +			} +		}  	}  	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET)  		context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |  					MLX4_IB_LINK_TYPE_ETH; +	if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { +		int is_eth = rdma_port_get_link_layer( +				&dev->ib_dev, qp->port) == +				IB_LINK_LAYER_ETHERNET; +		if (is_eth) { +			context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH; +			optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH; +		} +	} + +  	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&  	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)  		sqd_event = 1; @@ -1534,23 +1762,113 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,  	 * If we moved a kernel QP to RESET, clean up all old CQ  	 * entries and reinitialize the QP.  	 */ -	if (new_state == IB_QPS_RESET && !ibqp->uobject) { -		mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, -				 ibqp->srq ? to_msrq(ibqp->srq): NULL); -		if (send_cq != recv_cq) -			mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); +	if (new_state == IB_QPS_RESET) { +		if (!ibqp->uobject) { +			mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, +					 ibqp->srq ? 
to_msrq(ibqp->srq) : NULL); +			if (send_cq != recv_cq) +				mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + +			qp->rq.head = 0; +			qp->rq.tail = 0; +			qp->sq.head = 0; +			qp->sq.tail = 0; +			qp->sq_next_wqe = 0; +			if (qp->rq.wqe_cnt) +				*qp->db.db  = 0; + +			if (qp->flags & MLX4_IB_QP_NETIF) +				mlx4_ib_steer_qp_reg(dev, qp, 0); +		} +		if (qp->pri.smac) { +			mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); +			qp->pri.smac = 0; +		} +		if (qp->alt.smac) { +			mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); +			qp->alt.smac = 0; +		} +		if (qp->pri.vid < 0x1000) { +			mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); +			qp->pri.vid = 0xFFFF; +			qp->pri.candidate_vid = 0xFFFF; +			qp->pri.update_vid = 0; +		} -		qp->rq.head = 0; -		qp->rq.tail = 0; -		qp->sq.head = 0; -		qp->sq.tail = 0; -		qp->sq_next_wqe = 0; -		if (qp->rq.wqe_cnt) -			*qp->db.db  = 0; +		if (qp->alt.vid < 0x1000) { +			mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); +			qp->alt.vid = 0xFFFF; +			qp->alt.candidate_vid = 0xFFFF; +			qp->alt.update_vid = 0; +		}  	} -  out: +	if (err && steer_qp) +		mlx4_ib_steer_qp_reg(dev, qp, 0);  	kfree(context); +	if (qp->pri.candidate_smac) { +		if (err) { +			mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); +		} else { +			if (qp->pri.smac) +				mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); +			qp->pri.smac = qp->pri.candidate_smac; +			qp->pri.smac_index = qp->pri.candidate_smac_index; +			qp->pri.smac_port = qp->pri.candidate_smac_port; +		} +		qp->pri.candidate_smac = 0; +		qp->pri.candidate_smac_index = 0; +		qp->pri.candidate_smac_port = 0; +	} +	if (qp->alt.candidate_smac) { +		if (err) { +			mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac); +		} else { +			if (qp->alt.smac) +				mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); +			qp->alt.smac = qp->alt.candidate_smac; +			qp->alt.smac_index = qp->alt.candidate_smac_index; +			qp->alt.smac_port = qp->alt.candidate_smac_port; +		} +		qp->alt.candidate_smac = 0; +		qp->alt.candidate_smac_index = 0; +		qp->alt.candidate_smac_port = 0; +	} + +	if (qp->pri.update_vid) { +		if (err) { +			if (qp->pri.candidate_vid < 0x1000) +				mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, +						     qp->pri.candidate_vid); +		} else { +			if (qp->pri.vid < 0x1000) +				mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, +						     qp->pri.vid); +			qp->pri.vid = qp->pri.candidate_vid; +			qp->pri.vlan_port = qp->pri.candidate_vlan_port; +			qp->pri.vlan_index =  qp->pri.candidate_vlan_index; +		} +		qp->pri.candidate_vid = 0xFFFF; +		qp->pri.update_vid = 0; +	} + +	if (qp->alt.update_vid) { +		if (err) { +			if (qp->alt.candidate_vid < 0x1000) +				mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, +						     qp->alt.candidate_vid); +		} else { +			if (qp->alt.vid < 0x1000) +				mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, +						     qp->alt.vid); +			qp->alt.vid = qp->alt.candidate_vid; +			qp->alt.vlan_port = qp->alt.candidate_vlan_port; +			qp->alt.vlan_index =  qp->alt.candidate_vlan_index; +		} +		qp->alt.candidate_vid = 0xFFFF; +		qp->alt.update_vid = 0; +	} +  	return err;  } @@ -1561,13 +1879,21 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,  	struct mlx4_ib_qp *qp = to_mqp(ibqp);  	enum ib_qp_state cur_state, new_state;  	int err = -EINVAL; - +	int ll;  	mutex_lock(&qp->mutex);  	cur_state = attr_mask & 
IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;  	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; -	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { +	if (cur_state == new_state && cur_state == IB_QPS_RESET) { +		ll = IB_LINK_LAYER_UNSPECIFIED; +	} else { +		int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; +		ll = rdma_port_get_link_layer(&dev->ib_dev, port); +	} + +	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, +				attr_mask, ll)) {  		pr_debug("qpn 0x%x: invalid attribute mask specified "  			 "for transition %d to %d. qp_type %d,"  			 " attr_mask 0x%x\n", @@ -1631,6 +1957,19 @@ out:  	return err;  } +static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) +{ +	int i; +	for (i = 0; i < dev->caps.num_ports; i++) { +		if (qpn == dev->caps.qp0_proxy[i] || +		    qpn == dev->caps.qp0_tunnel[i]) { +			*qkey = dev->caps.qp0_qkey[i]; +			return 0; +		} +	} +	return -EINVAL; +} +  static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,  				  struct ib_send_wr *wr,  				  void *wqe, unsigned *mlx_seg_len) @@ -1688,8 +2027,13 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,  			cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]);  	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); -	if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) -		return -EINVAL; +	if (mlx4_is_master(mdev->dev)) { +		if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) +			return -EINVAL; +	} else { +		if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) +			return -EINVAL; +	}  	sqp->ud_header.deth.qkey = cpu_to_be32(qkey);  	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); @@ -1744,9 +2088,9 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,  {  	struct ib_device *ib_dev = sqp->qp.ibqp.device;  	struct mlx4_wqe_mlx_seg *mlx = wqe; +	struct mlx4_wqe_ctrl_seg *ctrl = wqe;  	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;  	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); -	struct net_device *ndev;  	union ib_gid sgid;  	u16 pkey;  	int send_size; @@ -1770,12 +2114,11 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,  			/* When multi-function is enabled, the ib_core gid  			 * indexes don't necessarily match the hw ones, so  			 * we must use our own cache */ -			sgid.global.subnet_prefix = -				to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. -				subnet_prefix; -			sgid.global.interface_id = -				to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. 
-				guid_cache[ah->av.ib.gid_index]; +			err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, +							   be32_to_cpu(ah->av.ib.port_pd) >> 24, +							   ah->av.ib.gid_index, &sgid.raw[0]); +			if (err) +				return err;  		} else  {  			err = ib_get_cached_gid(ib_dev,  						be32_to_cpu(ah->av.ib.port_pd) >> 24, @@ -1784,8 +2127,10 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,  				return err;  		} -		vlan = rdma_get_vlan_id(&sgid); -		is_vlan = vlan < 0x1000; +		if (ah->av.eth.vlan != cpu_to_be16(0xffff)) { +			vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff; +			is_vlan = 1; +		}  	}  	ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); @@ -1802,6 +2147,9 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,  		sqp->ud_header.grh.flow_label    =  			ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);  		sqp->ud_header.grh.hop_limit     = ah->av.ib.hop_limit; +		if (is_eth) +			memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); +		else {  		if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {  			/* When multi-function is enabled, the ib_core gid  			 * indexes don't necessarily match the hw ones, so @@ -1817,6 +2165,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,  					  be32_to_cpu(ah->av.ib.port_pd) >> 24,  					  ah->av.ib.gid_index,  					  &sqp->ud_header.grh.source_gid); +		}  		memcpy(sqp->ud_header.grh.destination_gid.raw,  		       ah->av.ib.dgid, 16);  	} @@ -1849,16 +2198,23 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,  	if (is_eth) {  		u8 *smac; +		struct in6_addr in6; +  		u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;  		mlx->sched_prio = cpu_to_be16(pcp);  		memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);  		/* FIXME: cache smac value? 
*/ -		ndev = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]; -		if (!ndev) -			return -ENODEV; -		smac = ndev->dev_addr; +		memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); +		memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); +		memcpy(&in6, sgid.raw, sizeof(in6)); + +		if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) +			smac = to_mdev(sqp->qp.ibqp.device)-> +				iboe.netdevs[sqp->qp.port - 1]->dev_addr; +		else	/* use the src mac of the tunnel */ +			smac = ah->av.eth.s_mac;  		memcpy(sqp->ud_header.eth.smac_h, smac, 6);  		if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))  			mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); @@ -2059,7 +2415,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,  static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,  				    struct mlx4_wqe_datagram_seg *dseg, -				    struct ib_send_wr *wr, enum ib_qp_type qpt) +				    struct ib_send_wr *wr, +				    enum mlx4_ib_qp_type qpt)  {  	union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av;  	struct mlx4_av sqp_av = {0}; @@ -2072,8 +2429,10 @@ static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,  			cpu_to_be32(0xf0000000);  	memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); -	/* This function used only for sending on QP1 proxies */ -	dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); +	if (qpt == MLX4_IB_QPT_PROXY_GSI) +		dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); +	else +		dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]);  	/* Use QKEY from the QP context, which is set by master */  	dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY);  } @@ -2090,6 +2449,8 @@ static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_  	hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);  	hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index);  	hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); +	memcpy(hdr.mac, ah->av.eth.mac, 6); +	hdr.vlan = ah->av.eth.vlan;  	spc = MLX4_INLINE_ALIGN -  		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); @@ -2366,11 +2727,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  			break;  		case MLX4_IB_QPT_PROXY_SMI_OWNER: -			if (unlikely(!mlx4_is_master(to_mdev(ibqp->device)->dev))) { -				err = -ENOSYS; -				*bad_wr = wr; -				goto out; -			}  			err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);  			if (unlikely(err)) {  				*bad_wr = wr; @@ -2387,16 +2743,13 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  			size += seglen / 16;  			break;  		case MLX4_IB_QPT_PROXY_SMI: -			/* don't allow QP0 sends on guests */ -			err = -ENOSYS; -			*bad_wr = wr; -			goto out;  		case MLX4_IB_QPT_PROXY_GSI:  			/* If we are tunneling special qps, this is a UD qp.  			 * In this case we first add a UD segment targeting  			 * the tunnel qp, and then add a header with address  			 * information */ -			set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, ibqp->qp_type); +			set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, +						qp->mlx4_ib_qp_type);  			wqe  += sizeof (struct mlx4_wqe_datagram_seg);  			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;  			build_tunnel_header(wr, wqe, &seglen); @@ -2762,6 +3115,9 @@ done:  	if (qp->flags & MLX4_IB_QP_LSO)  		qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; +	if (qp->flags & MLX4_IB_QP_NETIF) +		qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP; +  	qp_init_attr->sq_sig_type =  		qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ?  		
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 60c5fb025fc..62d9285300a 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -134,13 +134,14 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,  		if (err)  			goto err_mtt;  	} else { -		err = mlx4_db_alloc(dev->dev, &srq->db, 0); +		err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL);  		if (err)  			goto err_srq;  		*srq->db.db = 0; -		if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) { +		if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf, +				   GFP_KERNEL)) {  			err = -ENOMEM;  			goto err_db;  		} @@ -165,7 +166,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,  		if (err)  			goto err_buf; -		err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf); +		err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL);  		if (err)  			goto err_mtt; diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index 97516eb363b..cb4c66e723b 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -389,8 +389,10 @@ struct mlx4_port {  	struct mlx4_ib_dev    *dev;  	struct attribute_group pkey_group;  	struct attribute_group gid_group; -	u8                     port_num; +	struct device_attribute	enable_smi_admin; +	struct device_attribute	smi_enabled;  	int		       slave; +	u8                     port_num;  }; @@ -558,6 +560,101 @@ err:  	return NULL;  } +static ssize_t sysfs_show_smi_enabled(struct device *dev, +				      struct device_attribute *attr, char *buf) +{ +	struct mlx4_port *p = +		container_of(attr, struct mlx4_port, smi_enabled); +	ssize_t len = 0; + +	if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num)) +		len = sprintf(buf, "%d\n", 1); +	else +		len = sprintf(buf, "%d\n", 0); + +	return len; +} + +static ssize_t sysfs_show_enable_smi_admin(struct device *dev, +					   struct device_attribute *attr, +					   char *buf) +{ +	struct mlx4_port *p = +		container_of(attr, struct mlx4_port, enable_smi_admin); +	ssize_t len = 0; + +	if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num)) +		len = sprintf(buf, "%d\n", 1); +	else +		len = sprintf(buf, "%d\n", 0); + +	return len; +} + +static ssize_t sysfs_store_enable_smi_admin(struct device *dev, +					    struct device_attribute *attr, +					    const char *buf, size_t count) +{ +	struct mlx4_port *p = +		container_of(attr, struct mlx4_port, enable_smi_admin); +	int enable; + +	if (sscanf(buf, "%i", &enable) != 1 || +	    enable < 0 || enable > 1) +		return -EINVAL; + +	if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable)) +		return -EINVAL; +	return count; +} + +static int add_vf_smi_entries(struct mlx4_port *p) +{ +	int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == +			IB_LINK_LAYER_ETHERNET; +	int ret; + +	/* do not display entries if eth transport, or if master */ +	if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) +		return 0; + +	sysfs_attr_init(&p->smi_enabled.attr); +	p->smi_enabled.show = sysfs_show_smi_enabled; +	p->smi_enabled.store = NULL; +	p->smi_enabled.attr.name = "smi_enabled"; +	p->smi_enabled.attr.mode = 0444; +	ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr); +	if (ret) { +		pr_err("failed to create smi_enabled\n"); +		return ret; +	} + +	sysfs_attr_init(&p->enable_smi_admin.attr); +	p->enable_smi_admin.show = sysfs_show_enable_smi_admin; +	
p->enable_smi_admin.store = sysfs_store_enable_smi_admin; +	p->enable_smi_admin.attr.name = "enable_smi_admin"; +	p->enable_smi_admin.attr.mode = 0644; +	ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr); +	if (ret) { +		pr_err("failed to create enable_smi_admin\n"); +		sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); +		return ret; +	} +	return 0; +} + +static void remove_vf_smi_entries(struct mlx4_port *p) +{ +	int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == +			IB_LINK_LAYER_ETHERNET; + +	if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) +		return; + +	sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); +	sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr); +} +  static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)  {  	struct mlx4_port *p; @@ -582,8 +679,10 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)  	p->pkey_group.attrs =  		alloc_group_attrs(show_port_pkey, store_port_pkey,  				  dev->dev->caps.pkey_table_len[port_num]); -	if (!p->pkey_group.attrs) +	if (!p->pkey_group.attrs) { +		ret = -ENOMEM;  		goto err_alloc; +	}  	ret = sysfs_create_group(&p->kobj, &p->pkey_group);  	if (ret) @@ -591,13 +690,19 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)  	p->gid_group.name  = "gid_idx";  	p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1); -	if (!p->gid_group.attrs) +	if (!p->gid_group.attrs) { +		ret = -ENOMEM;  		goto err_free_pkey; +	}  	ret = sysfs_create_group(&p->kobj, &p->gid_group);  	if (ret)  		goto err_free_gid; +	ret = add_vf_smi_entries(p); +	if (ret) +		goto err_free_gid; +  	list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]);  	return 0; @@ -623,6 +728,7 @@ static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave)  	int port;  	struct kobject *p, *t;  	struct mlx4_port *mport; +	struct mlx4_active_ports actv_ports;  	get_name(dev, name, slave, sizeof name); @@ -645,7 +751,11 @@ static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave)  		goto err_ports;  	} +	actv_ports = mlx4_get_active_ports(dev->dev, slave); +  	for (port = 1; port <= dev->dev->caps.num_ports; ++port) { +		if (!test_bit(port - 1, actv_ports.ports)) +			continue;  		err = add_port(dev, port, slave);  		if (err)  			goto err_add; @@ -660,6 +770,7 @@ err_add:  		mport = container_of(p, struct mlx4_port, kobj);  		sysfs_remove_group(p, &mport->pkey_group);  		sysfs_remove_group(p, &mport->gid_group); +		remove_vf_smi_entries(mport);  		kobject_put(p);  	}  	kobject_put(dev->dev_ports_parent[slave]); @@ -704,6 +815,7 @@ static void unregister_pkey_tree(struct mlx4_ib_dev *device)  			port = container_of(p, struct mlx4_port, kobj);  			sysfs_remove_group(p, &port->pkey_group);  			sysfs_remove_group(p, &port->gid_group); +			remove_vf_smi_entries(port);  			kobject_put(p);  			kobject_put(device->dev_ports_parent[slave]);  		} diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig index 8e6aebfaf8a..10df386c634 100644 --- a/drivers/infiniband/hw/mlx5/Kconfig +++ b/drivers/infiniband/hw/mlx5/Kconfig @@ -1,6 +1,6 @@  config MLX5_INFINIBAND  	tristate "Mellanox Connect-IB HCA support" -	depends on NETDEVICES && ETHERNET && PCI && X86 +	depends on NETDEVICES && ETHERNET && PCI  	select NET_VENDOR_MELLANOX  	select MLX5_CORE  	---help--- diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 344ab03948a..8ae4f896cb4 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ 
b/drivers/infiniband/hw/mlx5/cq.c @@ -32,6 +32,7 @@  #include <linux/kref.h>  #include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h>  #include "mlx5_ib.h"  #include "user.h" @@ -73,14 +74,24 @@ static void *get_cqe(struct mlx5_ib_cq *cq, int n)  	return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz);  } +static u8 sw_ownership_bit(int n, int nent) +{ +	return (n & nent) ? 1 : 0; +} +  static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n)  {  	void *cqe = get_cqe(cq, n & cq->ibcq.cqe);  	struct mlx5_cqe64 *cqe64;  	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; -	return ((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ -		!!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; + +	if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && +	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { +		return cqe; +	} else { +		return NULL; +	}  }  static void *next_cqe_sw(struct mlx5_ib_cq *cq) @@ -351,6 +362,43 @@ static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,  	qp->sq.last_poll = tail;  } +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ +	mlx5_buf_free(&dev->mdev, &buf->buf); +} + +static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, +			     struct ib_sig_err *item) +{ +	u16 syndrome = be16_to_cpu(cqe->syndrome); + +#define GUARD_ERR   (1 << 13) +#define APPTAG_ERR  (1 << 12) +#define REFTAG_ERR  (1 << 11) + +	if (syndrome & GUARD_ERR) { +		item->err_type = IB_SIG_BAD_GUARD; +		item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16; +		item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16; +	} else +	if (syndrome & REFTAG_ERR) { +		item->err_type = IB_SIG_BAD_REFTAG; +		item->expected = be32_to_cpu(cqe->expected_reftag); +		item->actual = be32_to_cpu(cqe->actual_reftag); +	} else +	if (syndrome & APPTAG_ERR) { +		item->err_type = IB_SIG_BAD_APPTAG; +		item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff; +		item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff; +	} else { +		pr_err("Got signature completion error with bad syndrome %04x\n", +		       syndrome); +	} + +	item->sig_err_offset = be64_to_cpu(cqe->err_offset); +	item->key = be32_to_cpu(cqe->mkey); +} +  static int mlx5_poll_one(struct mlx5_ib_cq *cq,  			 struct mlx5_ib_qp **cur_qp,  			 struct ib_wc *wc) @@ -360,12 +408,16 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,  	struct mlx5_cqe64 *cqe64;  	struct mlx5_core_qp *mqp;  	struct mlx5_ib_wq *wq; +	struct mlx5_sig_err_cqe *sig_err_cqe; +	struct mlx5_core_mr *mmr; +	struct mlx5_ib_mr *mr;  	uint8_t opcode;  	uint32_t qpn;  	u16 wqe_ctr;  	void *cqe;  	int idx; +repoll:  	cqe = next_cqe_sw(cq);  	if (!cqe)  		return -EAGAIN; @@ -379,7 +431,18 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,  	 */  	rmb(); -	/* TBD: resize CQ */ +	opcode = cqe64->op_own >> 4; +	if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { +		if (likely(cq->resize_buf)) { +			free_cq_buf(dev, &cq->buf); +			cq->buf = *cq->resize_buf; +			kfree(cq->resize_buf); +			cq->resize_buf = NULL; +			goto repoll; +		} else { +			mlx5_ib_warn(dev, "unexpected resize cqe\n"); +		} +	}  	qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff;  	if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { @@ -398,7 +461,6 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,  	}  	wc->qp  = &(*cur_qp)->ibqp; -	opcode = cqe64->op_own >> 4;  	switch (opcode) {  	case MLX5_CQE_REQ:  		wq = &(*cur_qp)->sq; @@ -449,6 +511,33 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,  			}  		}  		break; +	case MLX5_CQE_SIG_ERR: +		sig_err_cqe = (struct mlx5_sig_err_cqe 
*)cqe64; + +		read_lock(&dev->mdev.priv.mr_table.lock); +		mmr = __mlx5_mr_lookup(&dev->mdev, +				       mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); +		if (unlikely(!mmr)) { +			read_unlock(&dev->mdev.priv.mr_table.lock); +			mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", +				     cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); +			return -EINVAL; +		} + +		mr = to_mibmr(mmr); +		get_sig_err_item(sig_err_cqe, &mr->sig->err_item); +		mr->sig->sig_err_exists = true; +		mr->sig->sigerr_count++; + +		mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", +			     cq->mcq.cqn, mr->sig->err_item.key, +			     mr->sig->err_item.err_type, +			     mr->sig->err_item.sig_err_offset, +			     mr->sig->err_item.expected, +			     mr->sig->err_item.actual); + +		read_unlock(&dev->mdev.priv.mr_table.lock); +		goto repoll;  	}  	return 0; @@ -503,29 +592,35 @@ static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf,  		return err;  	buf->cqe_size = cqe_size; +	buf->nent = nent;  	return 0;  } -static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) -{ -	mlx5_buf_free(&dev->mdev, &buf->buf); -} -  static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,  			  struct ib_ucontext *context, struct mlx5_ib_cq *cq,  			  int entries, struct mlx5_create_cq_mbox_in **cqb,  			  int *cqe_size, int *index, int *inlen)  {  	struct mlx5_ib_create_cq ucmd; +	size_t ucmdlen;  	int page_shift;  	int npages;  	int ncont;  	int err; -	if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) +	ucmdlen = +		(udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < +		 sizeof(ucmd)) ? (sizeof(ucmd) - +				  sizeof(ucmd.reserved)) : sizeof(ucmd); + +	if (ib_copy_from_udata(&ucmd, udata, ucmdlen))  		return -EFAULT; +	if (ucmdlen == sizeof(ucmd) && +	    ucmd.reserved != 0) +		return -EINVAL; +  	if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128)  		return -EINVAL; @@ -556,7 +651,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,  		goto err_db;  	}  	mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); -	(*cqb)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; +	(*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;  	*index = to_mucontext(context)->uuari.uars[0].index; @@ -576,16 +671,16 @@ static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context)  	ib_umem_release(cq->buf.umem);  } -static void init_cq_buf(struct mlx5_ib_cq *cq, int nent) +static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf)  {  	int i;  	void *cqe;  	struct mlx5_cqe64 *cqe64; -	for (i = 0; i < nent; i++) { -		cqe = get_cqe(cq, i); -		cqe64 = (cq->buf.cqe_size == 64) ? cqe : cqe + 64; -		cqe64->op_own = 0xf1; +	for (i = 0; i < buf->nent; i++) { +		cqe = get_cqe_from_buf(buf, i, buf->cqe_size); +		cqe64 = buf->cqe_size == 64 ? 
cqe : cqe + 64; +		cqe64->op_own = MLX5_CQE_INVALID << 4;  	}  } @@ -610,7 +705,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,  	if (err)  		goto err_db; -	init_cq_buf(cq, entries); +	init_cq_buf(cq, &cq->buf);  	*inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages;  	*cqb = mlx5_vzalloc(*inlen); @@ -620,7 +715,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,  	}  	mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); -	(*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - PAGE_SHIFT; +	(*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT;  	*index = dev->mdev.priv.uuari.uars[0].index;  	return 0; @@ -653,8 +748,11 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,  	int eqn;  	int err; +	if (entries < 0) +		return ERR_PTR(-EINVAL); +  	entries = roundup_pow_of_two(entries + 1); -	if (entries < 1 || entries > dev->mdev.caps.max_cqes) +	if (entries > dev->mdev.caps.max_cqes)  		return ERR_PTR(-EINVAL);  	cq = kzalloc(sizeof(*cq), GFP_KERNEL); @@ -747,17 +845,9 @@ int mlx5_ib_destroy_cq(struct ib_cq *cq)  	return 0;  } -static int is_equal_rsn(struct mlx5_cqe64 *cqe64, struct mlx5_ib_srq *srq, -			u32 rsn) +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn)  { -	u32 lrsn; - -	if (srq) -		lrsn = be32_to_cpu(cqe64->srqn) & 0xffffff; -	else -		lrsn = be32_to_cpu(cqe64->sop_drop_qpn) & 0xffffff; - -	return rsn == lrsn; +	return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff);  }  void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) @@ -787,8 +877,8 @@ void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq)  	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {  		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);  		cqe64 = (cq->mcq.cqe_sz == 64) ? 
cqe : cqe + 64; -		if (is_equal_rsn(cqe64, srq, rsn)) { -			if (srq) +		if (is_equal_rsn(cqe64, rsn)) { +			if (srq && (ntohl(cqe64->srqn) & 0xffffff))  				mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter));  			++nfreed;  		} else if (nfreed) { @@ -823,12 +913,266 @@ void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq)  int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)  { -	return -ENOSYS; +	struct mlx5_modify_cq_mbox_in *in; +	struct mlx5_ib_dev *dev = to_mdev(cq->device); +	struct mlx5_ib_cq *mcq = to_mcq(cq); +	int err; +	u32 fsel; + +	if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_CQ_MODER)) +		return -ENOSYS; + +	in = kzalloc(sizeof(*in), GFP_KERNEL); +	if (!in) +		return -ENOMEM; + +	in->cqn = cpu_to_be32(mcq->mcq.cqn); +	fsel = (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); +	in->ctx.cq_period = cpu_to_be16(cq_period); +	in->ctx.cq_max_count = cpu_to_be16(cq_count); +	in->field_select = cpu_to_be32(fsel); +	err = mlx5_core_modify_cq(&dev->mdev, &mcq->mcq, in, sizeof(*in)); +	kfree(in); + +	if (err) +		mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); + +	return err; +} + +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, +		       int entries, struct ib_udata *udata, int *npas, +		       int *page_shift, int *cqe_size) +{ +	struct mlx5_ib_resize_cq ucmd; +	struct ib_umem *umem; +	int err; +	int npages; +	struct ib_ucontext *context = cq->buf.umem->context; + +	err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); +	if (err) +		return err; + +	if (ucmd.reserved0 || ucmd.reserved1) +		return -EINVAL; + +	umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size, +			   IB_ACCESS_LOCAL_WRITE, 1); +	if (IS_ERR(umem)) { +		err = PTR_ERR(umem); +		return err; +	} + +	mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift, +			   npas, NULL); + +	cq->resize_umem = umem; +	*cqe_size = ucmd.cqe_size; + +	return 0; +} + +static void un_resize_user(struct mlx5_ib_cq *cq) +{ +	ib_umem_release(cq->resize_umem); +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, +			 int entries, int cqe_size) +{ +	int err; + +	cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); +	if (!cq->resize_buf) +		return -ENOMEM; + +	err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size); +	if (err) +		goto ex; + +	init_cq_buf(cq, cq->resize_buf); + +	return 0; + +ex: +	kfree(cq->resize_buf); +	return err; +} + +static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ +	free_cq_buf(dev, cq->resize_buf); +	cq->resize_buf = NULL; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ +	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); +	struct mlx5_cqe64 *scqe64; +	struct mlx5_cqe64 *dcqe64; +	void *start_cqe; +	void *scqe; +	void *dcqe; +	int ssize; +	int dsize; +	int i; +	u8 sw_own; + +	ssize = cq->buf.cqe_size; +	dsize = cq->resize_buf->cqe_size; +	if (ssize != dsize) { +		mlx5_ib_warn(dev, "resize from different cqe size is not supported\n"); +		return -EINVAL; +	} + +	i = cq->mcq.cons_index; +	scqe = get_sw_cqe(cq, i); +	scqe64 = ssize == 64 ? scqe : scqe + 64; +	start_cqe = scqe; +	if (!scqe) { +		mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); +		return -EINVAL; +	} + +	while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { +		dcqe = get_cqe_from_buf(cq->resize_buf, +					(i + 1) & (cq->resize_buf->nent), +					dsize); +		dcqe64 = dsize == 64 ? 
dcqe : dcqe + 64; +		sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); +		memcpy(dcqe, scqe, dsize); +		dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + +		++i; +		scqe = get_sw_cqe(cq, i); +		scqe64 = ssize == 64 ? scqe : scqe + 64; +		if (!scqe) { +			mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); +			return -EINVAL; +		} + +		if (scqe == start_cqe) { +			pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", +				cq->mcq.cqn); +			return -ENOMEM; +		} +	} +	++cq->mcq.cons_index; +	return 0;  }  int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)  { -	return -ENOSYS; +	struct mlx5_ib_dev *dev = to_mdev(ibcq->device); +	struct mlx5_ib_cq *cq = to_mcq(ibcq); +	struct mlx5_modify_cq_mbox_in *in; +	int err; +	int npas; +	int page_shift; +	int inlen; +	int uninitialized_var(cqe_size); +	unsigned long flags; + +	if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { +		pr_info("Firmware does not support resize CQ\n"); +		return -ENOSYS; +	} + +	if (entries < 1) +		return -EINVAL; + +	entries = roundup_pow_of_two(entries + 1); +	if (entries > dev->mdev.caps.max_cqes + 1) +		return -EINVAL; + +	if (entries == ibcq->cqe + 1) +		return 0; + +	mutex_lock(&cq->resize_mutex); +	if (udata) { +		err = resize_user(dev, cq, entries, udata, &npas, &page_shift, +				  &cqe_size); +	} else { +		cqe_size = 64; +		err = resize_kernel(dev, cq, entries, cqe_size); +		if (!err) { +			npas = cq->resize_buf->buf.npages; +			page_shift = cq->resize_buf->buf.page_shift; +		} +	} + +	if (err) +		goto ex; + +	inlen = sizeof(*in) + npas * sizeof(in->pas[0]); +	in = mlx5_vzalloc(inlen); +	if (!in) { +		err = -ENOMEM; +		goto ex_resize; +	} + +	if (udata) +		mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, +				     in->pas, 0); +	else +		mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); + +	in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE  | +				       MLX5_MODIFY_CQ_MASK_PG_OFFSET | +				       MLX5_MODIFY_CQ_MASK_PG_SIZE); +	in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; +	in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; +	in->ctx.page_offset = 0; +	in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); +	in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); +	in->cqn = cpu_to_be32(cq->mcq.cqn); + +	err = mlx5_core_modify_cq(&dev->mdev, &cq->mcq, in, inlen); +	if (err) +		goto ex_alloc; + +	if (udata) { +		cq->ibcq.cqe = entries - 1; +		ib_umem_release(cq->buf.umem); +		cq->buf.umem = cq->resize_umem; +		cq->resize_umem = NULL; +	} else { +		struct mlx5_ib_cq_buf tbuf; +		int resized = 0; + +		spin_lock_irqsave(&cq->lock, flags); +		if (cq->resize_buf) { +			err = copy_resize_cqes(cq); +			if (!err) { +				tbuf = cq->buf; +				cq->buf = *cq->resize_buf; +				kfree(cq->resize_buf); +				cq->resize_buf = NULL; +				resized = 1; +			} +		} +		cq->ibcq.cqe = entries - 1; +		spin_unlock_irqrestore(&cq->lock, flags); +		if (resized) +			free_cq_buf(dev, &tbuf); +	} +	mutex_unlock(&cq->resize_mutex); + +	mlx5_vfree(in); +	return 0; + +ex_alloc: +	mlx5_vfree(in); + +ex_resize: +	if (udata) +		un_resize_user(cq); +	else +		un_resize_kernel(dev, cq); +ex: +	mutex_unlock(&cq->resize_mutex); +	return err;  }  int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c index 256a23344f2..ece028fc47d 100644 --- a/drivers/infiniband/hw/mlx5/doorbell.c +++ b/drivers/infiniband/hw/mlx5/doorbell.c @@ -47,7 +47,6 @@ int 
mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,  			struct mlx5_db *db)  {  	struct mlx5_ib_user_db_page *page; -	struct ib_umem_chunk *chunk;  	int err = 0;  	mutex_lock(&context->db_page_mutex); @@ -75,8 +74,7 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,  	list_add(&page->list, &context->db_page_list);  found: -	chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list); -	db->dma		= sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK); +	db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK);  	db->u.user_page = page;  	++page->refcnt; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 3f831de9a4d..364d4b6937f 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -46,8 +46,8 @@  #include "mlx5_ib.h"  #define DRIVER_NAME "mlx5_ib" -#define DRIVER_VERSION "1.0" -#define DRIVER_RELDATE	"June 2013" +#define DRIVER_VERSION "2.2-1" +#define DRIVER_RELDATE	"Feb 2014"  MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");  MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); @@ -164,6 +164,7 @@ int mlx5_vector2eqn(struct mlx5_ib_dev *dev, int vector, int *eqn, int *irqn)  static int alloc_comp_eqs(struct mlx5_ib_dev *dev)  {  	struct mlx5_eq_table *table = &dev->mdev.priv.eq_table; +	char name[MLX5_MAX_EQ_NAME];  	struct mlx5_eq *eq, *n;  	int ncomp_vec;  	int nent; @@ -180,11 +181,10 @@ static int alloc_comp_eqs(struct mlx5_ib_dev *dev)  			goto clean;  		} -		snprintf(eq->name, MLX5_MAX_EQ_NAME, "mlx5_comp%d", i); +		snprintf(name, MLX5_MAX_EQ_NAME, "mlx5_comp%d", i);  		err = mlx5_create_map_eq(&dev->mdev, eq,  					 i + MLX5_EQ_VEC_COMP_BASE, nent, 0, -					 eq->name, -					 &dev->mdev.priv.uuari.uars[0]); +					 name, &dev->mdev.priv.uuari.uars[0]);  		if (err) {  			kfree(eq);  			goto clean; @@ -261,8 +261,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,  	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |  		IB_DEVICE_PORT_ACTIVE_EVENT		|  		IB_DEVICE_SYS_IMAGE_GUID		| -		IB_DEVICE_RC_RNR_NAK_GEN		| -		IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; +		IB_DEVICE_RC_RNR_NAK_GEN;  	flags = dev->mdev.caps.flags;  	if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR)  		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; @@ -274,6 +273,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,  	if (flags & MLX5_DEV_CAP_FLAG_XRC)  		props->device_cap_flags |= IB_DEVICE_XRC;  	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; +	if (flags & MLX5_DEV_CAP_FLAG_SIG_HAND_OVER) { +		props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; +		/* At this stage no support for signature handover */ +		props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | +				      IB_PROT_T10DIF_TYPE_2 | +				      IB_PROT_T10DIF_TYPE_3; +		props->sig_guard_cap = IB_GUARD_T10DIF_CRC | +				       IB_GUARD_T10DIF_CSUM; +	} +	if (flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST) +		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;  	props->vendor_id	   = be32_to_cpup((__be32 *)(out_mad->data + 36)) &  		0xffffff; @@ -301,9 +311,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,  	props->max_srq_sge	   = max_rq_sg - 1;  	props->max_fast_reg_page_list_len = (unsigned int)-1;  	props->local_ca_ack_delay  = dev->mdev.caps.local_ca_ack_delay; -	props->atomic_cap	   = dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_ATOMIC ? 
-		IB_ATOMIC_HCA : IB_ATOMIC_NONE; -	props->masked_atomic_cap   = IB_ATOMIC_HCA; +	props->atomic_cap	   = IB_ATOMIC_NONE; +	props->masked_atomic_cap   = IB_ATOMIC_NONE;  	props->max_pkeys	   = be16_to_cpup((__be16 *)(out_mad->data + 28));  	props->max_mcast_grp	   = 1 << dev->mdev.caps.log_max_mcg;  	props->max_mcast_qp_attach = dev->mdev.caps.max_qp_mcg; @@ -537,34 +546,51 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,  						  struct ib_udata *udata)  {  	struct mlx5_ib_dev *dev = to_mdev(ibdev); -	struct mlx5_ib_alloc_ucontext_req req; +	struct mlx5_ib_alloc_ucontext_req_v2 req;  	struct mlx5_ib_alloc_ucontext_resp resp;  	struct mlx5_ib_ucontext *context;  	struct mlx5_uuar_info *uuari;  	struct mlx5_uar *uars; +	int gross_uuars;  	int num_uars; +	int ver;  	int uuarn;  	int err;  	int i; +	int reqlen;  	if (!dev->ib_active)  		return ERR_PTR(-EAGAIN); -	err = ib_copy_from_udata(&req, udata, sizeof(req)); +	memset(&req, 0, sizeof(req)); +	reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); +	if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) +		ver = 0; +	else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) +		ver = 2; +	else +		return ERR_PTR(-EINVAL); + +	err = ib_copy_from_udata(&req, udata, reqlen);  	if (err)  		return ERR_PTR(err); +	if (req.flags || req.reserved) +		return ERR_PTR(-EINVAL); +  	if (req.total_num_uuars > MLX5_MAX_UUARS)  		return ERR_PTR(-ENOMEM);  	if (req.total_num_uuars == 0)  		return ERR_PTR(-EINVAL); -	req.total_num_uuars = ALIGN(req.total_num_uuars, MLX5_BF_REGS_PER_PAGE); +	req.total_num_uuars = ALIGN(req.total_num_uuars, +				    MLX5_NON_FP_BF_REGS_PER_PAGE);  	if (req.num_low_latency_uuars > req.total_num_uuars - 1)  		return ERR_PTR(-EINVAL); -	num_uars = req.total_num_uuars / MLX5_BF_REGS_PER_PAGE; +	num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; +	gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;  	resp.qp_tab_size      = 1 << dev->mdev.caps.log_max_qp;  	resp.bf_reg_size      = dev->mdev.caps.bf_reg_size;  	resp.cache_line_size  = L1_CACHE_BYTES; @@ -586,7 +612,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,  		goto out_ctx;  	} -	uuari->bitmap = kcalloc(BITS_TO_LONGS(req.total_num_uuars), +	uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars),  				sizeof(*uuari->bitmap),  				GFP_KERNEL);  	if (!uuari->bitmap) { @@ -596,13 +622,13 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,  	/*  	 * clear all fast path uuars  	 */ -	for (i = 0; i < req.total_num_uuars; i++) { +	for (i = 0; i < gross_uuars; i++) {  		uuarn = i & 3;  		if (uuarn == 2 || uuarn == 3)  			set_bit(i, uuari->bitmap);  	} -	uuari->count = kcalloc(req.total_num_uuars, sizeof(*uuari->count), GFP_KERNEL); +	uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL);  	if (!uuari->count) {  		err = -ENOMEM;  		goto out_bitmap; @@ -624,6 +650,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,  	if (err)  		goto out_uars; +	uuari->ver = ver;  	uuari->num_low_latency_uuars = req.num_low_latency_uuars;  	uuari->uars = uars;  	uuari->num_uars = num_uars; @@ -746,7 +773,8 @@ static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn)  	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);  	seg->start_addr = 0; -	err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in)); +	err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in), +				    NULL, NULL, NULL);  	if (err) {  		mlx5_ib_warn(dev, "failed to create mkey, 
%d\n", err);  		goto err_in; @@ -1006,6 +1034,11 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,  	ibev.device	      = &ibdev->ib_dev;  	ibev.element.port_num = port; +	if (port < 1 || port > ibdev->num_ports) { +		mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); +		return; +	} +  	if (ibdev->ib_active)  		ib_dispatch_event(&ibev);  } @@ -1401,12 +1434,15 @@ static int init_one(struct pci_dev *pdev,  	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;  	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;  	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr; +	dev->ib_dev.destroy_mr		= mlx5_ib_destroy_mr;  	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;  	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;  	dev->ib_dev.process_mad		= mlx5_ib_process_mad; +	dev->ib_dev.create_mr		= mlx5_ib_create_mr;  	dev->ib_dev.alloc_fast_reg_mr	= mlx5_ib_alloc_fast_reg_mr;  	dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;  	dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list; +	dev->ib_dev.check_mr_status	= mlx5_ib_check_mr_status;  	if (mdev->caps.flags & MLX5_DEV_CAP_FLAG_XRC) {  		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 3a5322870b9..8499aec94db 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -44,16 +44,17 @@  void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,  			int *ncont, int *order)  { -	struct ib_umem_chunk *chunk;  	unsigned long tmp;  	unsigned long m; -	int i, j, k; +	int i, k;  	u64 base = 0;  	int p = 0;  	int skip;  	int mask;  	u64 len;  	u64 pfn; +	struct scatterlist *sg; +	int entry;  	addr = addr >> PAGE_SHIFT;  	tmp = (unsigned long)addr; @@ -61,32 +62,31 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,  	skip = 1 << m;  	mask = skip - 1;  	i = 0; -	list_for_each_entry(chunk, &umem->chunk_list, list) -		for (j = 0; j < chunk->nmap; j++) { -			len = sg_dma_len(&chunk->page_list[j]) >> PAGE_SHIFT; -			pfn = sg_dma_address(&chunk->page_list[j]) >> PAGE_SHIFT; -			for (k = 0; k < len; k++) { -				if (!(i & mask)) { -					tmp = (unsigned long)pfn; -					m = min(m, find_first_bit(&tmp, sizeof(tmp))); +	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { +		len = sg_dma_len(sg) >> PAGE_SHIFT; +		pfn = sg_dma_address(sg) >> PAGE_SHIFT; +		for (k = 0; k < len; k++) { +			if (!(i & mask)) { +				tmp = (unsigned long)pfn; +				m = min(m, find_first_bit(&tmp, sizeof(tmp))); +				skip = 1 << m; +				mask = skip - 1; +				base = pfn; +				p = 0; +			} else { +				if (base + p != pfn) { +					tmp = (unsigned long)p; +					m = find_first_bit(&tmp, sizeof(tmp));  					skip = 1 << m;  					mask = skip - 1;  					base = pfn;  					p = 0; -				} else { -					if (base + p != pfn) { -						tmp = (unsigned long)p; -						m = find_first_bit(&tmp, sizeof(tmp)); -						skip = 1 << m; -						mask = skip - 1; -						base = pfn; -						p = 0; -					}  				} -				p++; -				i++;  			} +			p++; +			i++;  		} +	}  	if (i) {  		m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); @@ -112,32 +112,32 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,  {  	int shift = page_shift - PAGE_SHIFT;  	int mask = (1 << shift) - 1; -	struct ib_umem_chunk *chunk; -	int i, j, k; +	int i, k;  	u64 cur = 0;  	u64 base;  	int len; +	struct scatterlist *sg; +	int entry;  	i = 0; -	list_for_each_entry(chunk, &umem->chunk_list, list) -		for (j = 0; 
j < chunk->nmap; j++) { -			len = sg_dma_len(&chunk->page_list[j]) >> PAGE_SHIFT; -			base = sg_dma_address(&chunk->page_list[j]); -			for (k = 0; k < len; k++) { -				if (!(i & mask)) { -					cur = base + (k << PAGE_SHIFT); -					if (umr) -						cur |= 3; +	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { +		len = sg_dma_len(sg) >> PAGE_SHIFT; +		base = sg_dma_address(sg); +		for (k = 0; k < len; k++) { +			if (!(i & mask)) { +				cur = base + (k << PAGE_SHIFT); +				if (umr) +					cur |= 3; -					pas[i >> shift] = cpu_to_be64(cur); -					mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", -						    i >> shift, be64_to_cpu(pas[i >> shift])); -				}  else -					mlx5_ib_dbg(dev, "=====> 0x%llx\n", -						    base + (k << PAGE_SHIFT)); -				i++; -			} +				pas[i >> shift] = cpu_to_be64(cur); +				mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", +					    i >> shift, be64_to_cpu(pas[i >> shift])); +			}  else +				mlx5_ib_dbg(dev, "=====> 0x%llx\n", +					    base + (k << PAGE_SHIFT)); +			i++;  		} +	}  }  int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 836be915724..f2ccf1a5a29 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -189,12 +189,16 @@ struct mlx5_ib_qp {  	int			create_type;  	u32			pa_lkey; + +	/* Store signature errors */ +	bool			signature_en;  };  struct mlx5_ib_cq_buf {  	struct mlx5_buf		buf;  	struct ib_umem		*umem;  	int			cqe_size; +	int			nent;  };  enum mlx5_ib_qp_flags { @@ -220,7 +224,7 @@ struct mlx5_ib_cq {  	/* protect resize cq  	 */  	struct mutex		resize_mutex; -	struct mlx5_ib_cq_resize *resize_buf; +	struct mlx5_ib_cq_buf  *resize_buf;  	struct ib_umem	       *resize_umem;  	int			cqe_size;  }; @@ -260,8 +264,9 @@ struct mlx5_ib_mr {  	__be64			*pas;  	dma_addr_t		dma;  	int			npages; -	struct completion	done; -	enum ib_wc_status	status; +	struct mlx5_ib_dev     *dev; +	struct mlx5_create_mkey_mbox_out out; +	struct mlx5_core_sig_ctx    *sig;  };  struct mlx5_ib_fast_reg_page_list { @@ -270,6 +275,17 @@ struct mlx5_ib_fast_reg_page_list {  	dma_addr_t			map;  }; +struct mlx5_ib_umr_context { +	enum ib_wc_status	status; +	struct completion	done; +}; + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ +	context->status = -1; +	init_completion(&context->done); +} +  struct umr_common {  	struct ib_pd	*pd;  	struct ib_cq	*cq; @@ -323,6 +339,7 @@ struct mlx5_cache_ent {  	struct mlx5_ib_dev     *dev;  	struct work_struct	work;  	struct delayed_work	dwork; +	int			pending;  };  struct mlx5_mr_cache { @@ -358,6 +375,8 @@ struct mlx5_ib_dev {  	spinlock_t			mr_lock;  	struct mlx5_ib_resources	devr;  	struct mlx5_mr_cache		cache; +	struct timer_list		delay_timer; +	int				fill_delay;  };  static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -390,6 +409,11 @@ static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp)  	return container_of(mqp, struct mlx5_ib_qp, mqp);  } +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) +{ +	return container_of(mmr, struct mlx5_ib_mr, mmr); +} +  static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd)  {  	return container_of(ibpd, struct mlx5_ib_pd, ibpd); @@ -489,6 +513,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  				  u64 virt_addr, int access_flags,  				  struct ib_udata *udata);  int mlx5_ib_dereg_mr(struct ib_mr *ibmr); +int mlx5_ib_destroy_mr(struct ib_mr *ibmr); +struct 
ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, +				struct ib_mr_init_attr *mr_init_attr);  struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,  					int max_page_list_len);  struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, @@ -524,6 +551,8 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);  int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);  int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift);  void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, +			    struct ib_mr_status *mr_status);  static inline void init_query_mad(struct ib_smp *mad)  { diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index bd41df95b6f..afa873bd028 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -35,11 +35,16 @@  #include <linux/random.h>  #include <linux/debugfs.h>  #include <linux/export.h> +#include <linux/delay.h>  #include <rdma/ib_umem.h>  #include "mlx5_ib.h"  enum { -	DEF_CACHE_SIZE	= 10, +	MAX_PENDING_REG_MR = 8, +}; + +enum { +	MLX5_UMR_ALIGN	= 2048  };  static __be64 *mr_align(__be64 *ptr, int align) @@ -59,15 +64,67 @@ static int order2idx(struct mlx5_ib_dev *dev, int order)  		return order - cache->ent[0].order;  } +static void reg_mr_callback(int status, void *context) +{ +	struct mlx5_ib_mr *mr = context; +	struct mlx5_ib_dev *dev = mr->dev; +	struct mlx5_mr_cache *cache = &dev->cache; +	int c = order2idx(dev, mr->order); +	struct mlx5_cache_ent *ent = &cache->ent[c]; +	u8 key; +	unsigned long flags; +	struct mlx5_mr_table *table = &dev->mdev.priv.mr_table; +	int err; + +	spin_lock_irqsave(&ent->lock, flags); +	ent->pending--; +	spin_unlock_irqrestore(&ent->lock, flags); +	if (status) { +		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); +		kfree(mr); +		dev->fill_delay = 1; +		mod_timer(&dev->delay_timer, jiffies + HZ); +		return; +	} + +	if (mr->out.hdr.status) { +		mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", +			     mr->out.hdr.status, +			     be32_to_cpu(mr->out.hdr.syndrome)); +		kfree(mr); +		dev->fill_delay = 1; +		mod_timer(&dev->delay_timer, jiffies + HZ); +		return; +	} + +	spin_lock_irqsave(&dev->mdev.priv.mkey_lock, flags); +	key = dev->mdev.priv.mkey_key++; +	spin_unlock_irqrestore(&dev->mdev.priv.mkey_lock, flags); +	mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + +	cache->last_add = jiffies; + +	spin_lock_irqsave(&ent->lock, flags); +	list_add_tail(&mr->list, &ent->head); +	ent->cur++; +	ent->size++; +	spin_unlock_irqrestore(&ent->lock, flags); + +	write_lock_irqsave(&table->lock, flags); +	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key), +				&mr->mmr); +	if (err) +		pr_err("Error inserting to mr tree. 
0x%x\n", -err); +	write_unlock_irqrestore(&table->lock, flags); +} +  static int add_keys(struct mlx5_ib_dev *dev, int c, int num)  { -	struct device *ddev = dev->ib_dev.dma_device;  	struct mlx5_mr_cache *cache = &dev->cache;  	struct mlx5_cache_ent *ent = &cache->ent[c];  	struct mlx5_create_mkey_mbox_in *in;  	struct mlx5_ib_mr *mr;  	int npages = 1 << ent->order; -	int size = sizeof(u64) * npages;  	int err = 0;  	int i; @@ -76,87 +133,66 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)  		return -ENOMEM;  	for (i = 0; i < num; i++) { +		if (ent->pending >= MAX_PENDING_REG_MR) { +			err = -EAGAIN; +			break; +		} +  		mr = kzalloc(sizeof(*mr), GFP_KERNEL);  		if (!mr) {  			err = -ENOMEM; -			goto out; +			break;  		}  		mr->order = ent->order;  		mr->umred = 1; -		mr->pas = kmalloc(size + 0x3f, GFP_KERNEL); -		if (!mr->pas) { -			kfree(mr); -			err = -ENOMEM; -			goto out; -		} -		mr->dma = dma_map_single(ddev, mr_align(mr->pas, 0x40), size, -					 DMA_TO_DEVICE); -		if (dma_mapping_error(ddev, mr->dma)) { -			kfree(mr->pas); -			kfree(mr); -			err = -ENOMEM; -			goto out; -		} - +		mr->dev = dev;  		in->seg.status = 1 << 6;  		in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);  		in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);  		in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;  		in->seg.log2_page_size = 12; +		spin_lock_irq(&ent->lock); +		ent->pending++; +		spin_unlock_irq(&ent->lock);  		err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, -					    sizeof(*in)); +					    sizeof(*in), reg_mr_callback, +					    mr, &mr->out);  		if (err) {  			mlx5_ib_warn(dev, "create mkey failed %d\n", err); -			dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); -			kfree(mr->pas);  			kfree(mr); -			goto out; +			break;  		} -		cache->last_add = jiffies; - -		spin_lock(&ent->lock); -		list_add_tail(&mr->list, &ent->head); -		ent->cur++; -		ent->size++; -		spin_unlock(&ent->lock);  	} -out:  	kfree(in);  	return err;  }  static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)  { -	struct device *ddev = dev->ib_dev.dma_device;  	struct mlx5_mr_cache *cache = &dev->cache;  	struct mlx5_cache_ent *ent = &cache->ent[c];  	struct mlx5_ib_mr *mr; -	int size;  	int err;  	int i;  	for (i = 0; i < num; i++) { -		spin_lock(&ent->lock); +		spin_lock_irq(&ent->lock);  		if (list_empty(&ent->head)) { -			spin_unlock(&ent->lock); +			spin_unlock_irq(&ent->lock);  			return;  		}  		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);  		list_del(&mr->list);  		ent->cur--;  		ent->size--; -		spin_unlock(&ent->lock); +		spin_unlock_irq(&ent->lock);  		err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); -		if (err) { +		if (err)  			mlx5_ib_warn(dev, "failed destroy mkey\n"); -		} else { -			size = ALIGN(sizeof(u64) * (1 << mr->order), 0x40); -			dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); -			kfree(mr->pas); +		else  			kfree(mr); -		}  	}  } @@ -183,9 +219,13 @@ static ssize_t size_write(struct file *filp, const char __user *buf,  		return -EINVAL;  	if (var > ent->size) { -		err = add_keys(dev, c, var - ent->size); -		if (err) -			return err; +		do { +			err = add_keys(dev, c, var - ent->size); +			if (err && err != -EAGAIN) +				return err; + +			usleep_range(3000, 5000); +		} while (err);  	} else if (var < ent->size) {  		remove_keys(dev, c, ent->size - var);  	} @@ -301,23 +341,37 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)  	struct mlx5_ib_dev *dev = ent->dev;  	struct mlx5_mr_cache *cache = &dev->cache;  	int i = 
order2idx(dev, ent->order); +	int err;  	if (cache->stopped)  		return;  	ent = &dev->cache.ent[i]; -	if (ent->cur < 2 * ent->limit) { -		add_keys(dev, i, 1); -		if (ent->cur < 2 * ent->limit) -			queue_work(cache->wq, &ent->work); +	if (ent->cur < 2 * ent->limit && !dev->fill_delay) { +		err = add_keys(dev, i, 1); +		if (ent->cur < 2 * ent->limit) { +			if (err == -EAGAIN) { +				mlx5_ib_dbg(dev, "returned eagain, order %d\n", +					    i + 2); +				queue_delayed_work(cache->wq, &ent->dwork, +						   msecs_to_jiffies(3)); +			} else if (err) { +				mlx5_ib_warn(dev, "command failed order %d, err %d\n", +					     i + 2, err); +				queue_delayed_work(cache->wq, &ent->dwork, +						   msecs_to_jiffies(1000)); +			} else { +				queue_work(cache->wq, &ent->work); +			} +		}  	} else if (ent->cur > 2 * ent->limit) {  		if (!someone_adding(cache) && -		    time_after(jiffies, cache->last_add + 60 * HZ)) { +		    time_after(jiffies, cache->last_add + 300 * HZ)) {  			remove_keys(dev, i, 1);  			if (ent->cur > ent->limit)  				queue_work(cache->wq, &ent->work);  		} else { -			queue_delayed_work(cache->wq, &ent->dwork, 60 * HZ); +			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);  		}  	}  } @@ -357,18 +411,18 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)  		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); -		spin_lock(&ent->lock); +		spin_lock_irq(&ent->lock);  		if (!list_empty(&ent->head)) {  			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,  					      list);  			list_del(&mr->list);  			ent->cur--; -			spin_unlock(&ent->lock); +			spin_unlock_irq(&ent->lock);  			if (ent->cur < ent->limit)  				queue_work(cache->wq, &ent->work);  			break;  		} -		spin_unlock(&ent->lock); +		spin_unlock_irq(&ent->lock);  		queue_work(cache->wq, &ent->work); @@ -395,12 +449,12 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)  		return;  	}  	ent = &cache->ent[c]; -	spin_lock(&ent->lock); +	spin_lock_irq(&ent->lock);  	list_add_tail(&mr->list, &ent->head);  	ent->cur++;  	if (ent->cur > 2 * ent->limit)  		shrink = 1; -	spin_unlock(&ent->lock); +	spin_unlock_irq(&ent->lock);  	if (shrink)  		queue_work(cache->wq, &ent->work); @@ -408,33 +462,28 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)  static void clean_keys(struct mlx5_ib_dev *dev, int c)  { -	struct device *ddev = dev->ib_dev.dma_device;  	struct mlx5_mr_cache *cache = &dev->cache;  	struct mlx5_cache_ent *ent = &cache->ent[c];  	struct mlx5_ib_mr *mr; -	int size;  	int err; +	cancel_delayed_work(&ent->dwork);  	while (1) { -		spin_lock(&ent->lock); +		spin_lock_irq(&ent->lock);  		if (list_empty(&ent->head)) { -			spin_unlock(&ent->lock); +			spin_unlock_irq(&ent->lock);  			return;  		}  		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);  		list_del(&mr->list);  		ent->cur--;  		ent->size--; -		spin_unlock(&ent->lock); +		spin_unlock_irq(&ent->lock);  		err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); -		if (err) { +		if (err)  			mlx5_ib_warn(dev, "failed destroy mkey\n"); -		} else { -			size = ALIGN(sizeof(u64) * (1 << mr->order), 0x40); -			dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); -			kfree(mr->pas); +		else  			kfree(mr); -		}  	}  } @@ -490,12 +539,18 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)  	debugfs_remove_recursive(dev->cache.root);  } +static void delay_time_func(unsigned long ctx) +{ +	struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; + +	dev->fill_delay = 0; +} 
+  int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)  {  	struct mlx5_mr_cache *cache = &dev->cache;  	struct mlx5_cache_ent *ent;  	int limit; -	int size;  	int err;  	int i; @@ -505,6 +560,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)  		return -ENOMEM;  	} +	setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);  	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {  		INIT_LIST_HEAD(&cache->ent[i].head);  		spin_lock_init(&cache->ent[i].lock); @@ -515,13 +571,11 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)  		ent->order = i + 2;  		ent->dev = dev; -		if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) { -			size = dev->mdev.profile->mr_cache[i].size; +		if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE)  			limit = dev->mdev.profile->mr_cache[i].limit; -		} else { -			size = DEF_CACHE_SIZE; +		else  			limit = 0; -		} +  		INIT_WORK(&ent->work, cache_work_func);  		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);  		ent->limit = limit; @@ -540,13 +594,16 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)  	int i;  	dev->cache.stopped = 1; -	destroy_workqueue(dev->cache.wq); +	flush_workqueue(dev->cache.wq);  	mlx5_mr_cache_debugfs_cleanup(dev);  	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)  		clean_keys(dev, i); +	destroy_workqueue(dev->cache.wq); +	del_timer_sync(&dev->delay_timer); +  	return 0;  } @@ -575,7 +632,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)  	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);  	seg->start_addr = 0; -	err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in)); +	err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, +				    NULL);  	if (err)  		goto err_in; @@ -650,7 +708,7 @@ static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,  void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context)  { -	struct mlx5_ib_mr *mr; +	struct mlx5_ib_umr_context *context;  	struct ib_wc wc;  	int err; @@ -663,9 +721,9 @@ void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context)  		if (err == 0)  			break; -		mr = (struct mlx5_ib_mr *)(unsigned long)wc.wr_id; -		mr->status = wc.status; -		complete(&mr->done); +		context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id; +		context->status = wc.status; +		complete(&context->done);  	}  	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);  } @@ -675,21 +733,24 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,  				  int page_shift, int order, int access_flags)  {  	struct mlx5_ib_dev *dev = to_mdev(pd->device); +	struct device *ddev = dev->ib_dev.dma_device;  	struct umr_common *umrc = &dev->umrc; +	struct mlx5_ib_umr_context umr_context;  	struct ib_send_wr wr, *bad;  	struct mlx5_ib_mr *mr;  	struct ib_sge sg; -	int err; +	int size = sizeof(u64) * npages; +	int err = 0;  	int i; -	for (i = 0; i < 10; i++) { +	for (i = 0; i < 1; i++) {  		mr = alloc_cached_mr(dev, order);  		if (mr)  			break;  		err = add_keys(dev, order2idx(dev, order), 1); -		if (err) { -			mlx5_ib_warn(dev, "add_keys failed\n"); +		if (err && err != -EAGAIN) { +			mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);  			break;  		}  	} @@ -697,38 +758,58 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,  	if (!mr)  		return ERR_PTR(-EAGAIN); -	mlx5_ib_populate_pas(dev, umem, page_shift, mr_align(mr->pas, 0x40), 1); +	mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); +	if (!mr->pas) { +		err = -ENOMEM; +		goto free_mr; +	} + +	mlx5_ib_populate_pas(dev, umem, page_shift, +			     mr_align(mr->pas, MLX5_UMR_ALIGN), 1); + 
+	mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size, +				 DMA_TO_DEVICE); +	if (dma_mapping_error(ddev, mr->dma)) { +		err = -ENOMEM; +		goto free_pas; +	}  	memset(&wr, 0, sizeof(wr)); -	wr.wr_id = (u64)(unsigned long)mr; +	wr.wr_id = (u64)(unsigned long)&umr_context;  	prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags); -	/* We serialize polls so one process does not kidnap another's -	 * completion. This is not a problem since wr is completed in -	 * around 1 usec -	 */ +	mlx5_ib_init_umr_context(&umr_context);  	down(&umrc->sem); -	init_completion(&mr->done);  	err = ib_post_send(umrc->qp, &wr, &bad);  	if (err) {  		mlx5_ib_warn(dev, "post send failed, err %d\n", err); -		up(&umrc->sem); -		goto error; +		goto unmap_dma; +	} else { +		wait_for_completion(&umr_context.done); +		if (umr_context.status != IB_WC_SUCCESS) { +			mlx5_ib_warn(dev, "reg umr failed\n"); +			err = -EFAULT; +		}  	} -	wait_for_completion(&mr->done); + +	mr->mmr.iova = virt_addr; +	mr->mmr.size = len; +	mr->mmr.pd = to_mpd(pd)->pdn; + +unmap_dma:  	up(&umrc->sem); +	dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); -	if (mr->status != IB_WC_SUCCESS) { -		mlx5_ib_warn(dev, "reg umr failed\n"); -		err = -EFAULT; -		goto error; +free_pas: +	kfree(mr->pas); + +free_mr: +	if (err) { +		free_cached_mr(dev, mr); +		return ERR_PTR(err);  	}  	return mr; - -error: -	free_cached_mr(dev, mr); -	return ERR_PTR(err);  }  static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, @@ -763,8 +844,10 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,  	in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));  	in->seg.log2_page_size = page_shift;  	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); -	in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); -	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen); +	in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, +							 1 << page_shift)); +	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen, NULL, +				    NULL, NULL);  	if (err) {  		mlx5_ib_warn(dev, "create mkey failed\n");  		goto err_2; @@ -855,24 +938,26 @@ error:  static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)  {  	struct umr_common *umrc = &dev->umrc; +	struct mlx5_ib_umr_context umr_context;  	struct ib_send_wr wr, *bad;  	int err;  	memset(&wr, 0, sizeof(wr)); -	wr.wr_id = (u64)(unsigned long)mr; +	wr.wr_id = (u64)(unsigned long)&umr_context;  	prep_umr_unreg_wqe(dev, &wr, mr->mmr.key); +	mlx5_ib_init_umr_context(&umr_context);  	down(&umrc->sem); -	init_completion(&mr->done);  	err = ib_post_send(umrc->qp, &wr, &bad);  	if (err) {  		up(&umrc->sem);  		mlx5_ib_dbg(dev, "err %d\n", err);  		goto error; +	} else { +		wait_for_completion(&umr_context.done); +		up(&umrc->sem);  	} -	wait_for_completion(&mr->done); -	up(&umrc->sem); -	if (mr->status != IB_WC_SUCCESS) { +	if (umr_context.status != IB_WC_SUCCESS) {  		mlx5_ib_warn(dev, "unreg umr failed\n");  		err = -EFAULT;  		goto error; @@ -921,6 +1006,122 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)  	return 0;  } +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, +				struct ib_mr_init_attr *mr_init_attr) +{ +	struct mlx5_ib_dev *dev = to_mdev(pd->device); +	struct mlx5_create_mkey_mbox_in *in; +	struct mlx5_ib_mr *mr; +	int access_mode, err; +	int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4); + +	mr = kzalloc(sizeof(*mr), GFP_KERNEL); +	
if (!mr) +		return ERR_PTR(-ENOMEM); + +	in = kzalloc(sizeof(*in), GFP_KERNEL); +	if (!in) { +		err = -ENOMEM; +		goto err_free; +	} + +	in->seg.status = 1 << 6; /* free */ +	in->seg.xlt_oct_size = cpu_to_be32(ndescs); +	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); +	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); +	access_mode = MLX5_ACCESS_MODE_MTT; + +	if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) { +		u32 psv_index[2]; + +		in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) | +							   MLX5_MKEY_BSF_EN); +		in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +		mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); +		if (!mr->sig) { +			err = -ENOMEM; +			goto err_free_in; +		} + +		/* create mem & wire PSVs */ +		err = mlx5_core_create_psv(&dev->mdev, to_mpd(pd)->pdn, +					   2, psv_index); +		if (err) +			goto err_free_sig; + +		access_mode = MLX5_ACCESS_MODE_KLM; +		mr->sig->psv_memory.psv_idx = psv_index[0]; +		mr->sig->psv_wire.psv_idx = psv_index[1]; + +		mr->sig->sig_status_checked = true; +		mr->sig->sig_err_exists = false; +		/* Next UMR, Arm SIGERR */ +		++mr->sig->sigerr_count; +	} + +	in->seg.flags = MLX5_PERM_UMR_EN | access_mode; +	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in), +				    NULL, NULL, NULL); +	if (err) +		goto err_destroy_psv; + +	mr->ibmr.lkey = mr->mmr.key; +	mr->ibmr.rkey = mr->mmr.key; +	mr->umem = NULL; +	kfree(in); + +	return &mr->ibmr; + +err_destroy_psv: +	if (mr->sig) { +		if (mlx5_core_destroy_psv(&dev->mdev, +					  mr->sig->psv_memory.psv_idx)) +			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", +				     mr->sig->psv_memory.psv_idx); +		if (mlx5_core_destroy_psv(&dev->mdev, +					  mr->sig->psv_wire.psv_idx)) +			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", +				     mr->sig->psv_wire.psv_idx); +	} +err_free_sig: +	kfree(mr->sig); +err_free_in: +	kfree(in); +err_free: +	kfree(mr); +	return ERR_PTR(err); +} + +int mlx5_ib_destroy_mr(struct ib_mr *ibmr) +{ +	struct mlx5_ib_dev *dev = to_mdev(ibmr->device); +	struct mlx5_ib_mr *mr = to_mmr(ibmr); +	int err; + +	if (mr->sig) { +		if (mlx5_core_destroy_psv(&dev->mdev, +					  mr->sig->psv_memory.psv_idx)) +			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", +				     mr->sig->psv_memory.psv_idx); +		if (mlx5_core_destroy_psv(&dev->mdev, +					  mr->sig->psv_wire.psv_idx)) +			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", +				     mr->sig->psv_wire.psv_idx); +		kfree(mr->sig); +	} + +	err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); +	if (err) { +		mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", +			     mr->mmr.key, err); +		return err; +	} + +	kfree(mr); + +	return err; +} +  struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,  					int max_page_list_len)  { @@ -948,7 +1149,8 @@ struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,  	 * TBD not needed - issue 197292 */  	in->seg.log2_page_size = PAGE_SHIFT; -	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in)); +	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in), NULL, +				    NULL, NULL);  	kfree(in);  	if (err)  		goto err_free; @@ -1005,3 +1207,44 @@ void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)  	kfree(mfrpl->ibfrpl.page_list);  	kfree(mfrpl);  } + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, +			    struct ib_mr_status *mr_status) +{ +	struct mlx5_ib_mr *mmr = to_mmr(ibmr); +	int ret = 0; + +	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { +		pr_err("Invalid status check 
mask\n"); +		ret = -EINVAL; +		goto done; +	} + +	mr_status->fail_status = 0; +	if (check_mask & IB_MR_CHECK_SIG_STATUS) { +		if (!mmr->sig) { +			ret = -EINVAL; +			pr_err("signature status check requested on a non-signature enabled MR\n"); +			goto done; +		} + +		mmr->sig->sig_status_checked = true; +		if (!mmr->sig->sig_err_exists) +			goto done; + +		if (ibmr->lkey == mmr->sig->err_item.key) +			memcpy(&mr_status->sig_err, &mmr->sig->err_item, +			       sizeof(mr_status->sig_err)); +		else { +			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; +			mr_status->sig_err.sig_err_offset = 0; +			mr_status->sig_err.key = mmr->sig->err_item.key; +		} + +		mmr->sig->sig_err_exists = false; +		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; +	} + +done: +	return ret; +} diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 045f8cdbd30..bbbcf389272 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -203,7 +203,7 @@ static int sq_overhead(enum ib_qp_type qp_type)  	switch (qp_type) {  	case IB_QPT_XRC_INI: -		size = sizeof(struct mlx5_wqe_xrc_seg); +		size += sizeof(struct mlx5_wqe_xrc_seg);  		/* fall through */  	case IB_QPT_RC:  		size += sizeof(struct mlx5_wqe_ctrl_seg) + @@ -211,20 +211,25 @@ static int sq_overhead(enum ib_qp_type qp_type)  			sizeof(struct mlx5_wqe_raddr_seg);  		break; +	case IB_QPT_XRC_TGT: +		return 0; +  	case IB_QPT_UC: -		size = sizeof(struct mlx5_wqe_ctrl_seg) + -			sizeof(struct mlx5_wqe_raddr_seg); +		size += sizeof(struct mlx5_wqe_ctrl_seg) + +			sizeof(struct mlx5_wqe_raddr_seg) + +			sizeof(struct mlx5_wqe_umr_ctrl_seg) + +			sizeof(struct mlx5_mkey_seg);  		break;  	case IB_QPT_UD:  	case IB_QPT_SMI:  	case IB_QPT_GSI: -		size = sizeof(struct mlx5_wqe_ctrl_seg) + +		size += sizeof(struct mlx5_wqe_ctrl_seg) +  			sizeof(struct mlx5_wqe_datagram_seg);  		break;  	case MLX5_IB_QPT_REG_UMR: -		size = sizeof(struct mlx5_wqe_ctrl_seg) + +		size += sizeof(struct mlx5_wqe_ctrl_seg) +  			sizeof(struct mlx5_wqe_umr_ctrl_seg) +  			sizeof(struct mlx5_mkey_seg);  		break; @@ -251,8 +256,11 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr)  	}  	size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); - -	return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); +	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && +	    ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) +			return MLX5_SIG_WQE_SIZE; +	else +		return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB);  }  static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, @@ -270,7 +278,8 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,  		return wqe_size;  	if (wqe_size > dev->mdev.caps.max_sq_desc_sz) { -		mlx5_ib_dbg(dev, "\n"); +		mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", +			    wqe_size, dev->mdev.caps.max_sq_desc_sz);  		return -EINVAL;  	} @@ -278,11 +287,20 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,  		sizeof(struct mlx5_wqe_inline_seg);  	attr->cap.max_inline_data = qp->max_inline_data; +	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) +		qp->signature_en = true; +  	wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size);  	qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; +	if (qp->sq.wqe_cnt > dev->mdev.caps.max_wqes) { +		mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n", +			    qp->sq.wqe_cnt, dev->mdev.caps.max_wqes); +		return -ENOMEM; +	}  	qp->sq.wqe_shift = 
ilog2(MLX5_SEND_WQE_BB);  	qp->sq.max_gs = attr->cap.max_send_sge; -	qp->sq.max_post = 1 << ilog2(wq_size / wqe_size); +	qp->sq.max_post = wq_size / wqe_size; +	attr->cap.max_send_wr = qp->sq.max_post;  	return wq_size;  } @@ -330,14 +348,57 @@ static int qp_has_rq(struct ib_qp_init_attr *attr)  	return 1;  } +static int first_med_uuar(void) +{ +	return 1; +} + +static int next_uuar(int n) +{ +	n++; + +	while (((n % 4) & 2)) +		n++; + +	return n; +} + +static int num_med_uuar(struct mlx5_uuar_info *uuari) +{ +	int n; + +	n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE - +		uuari->num_low_latency_uuars - 1; + +	return n >= 0 ? n : 0; +} + +static int max_uuari(struct mlx5_uuar_info *uuari) +{ +	return uuari->num_uars * 4; +} + +static int first_hi_uuar(struct mlx5_uuar_info *uuari) +{ +	int med; +	int i; +	int t; + +	med = num_med_uuar(uuari); +	for (t = 0, i = first_med_uuar();; i = next_uuar(i)) { +		t++; +		if (t == med) +			return next_uuar(i); +	} + +	return 0; +} +  static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari)  { -	int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; -	int start_uuar;  	int i; -	start_uuar = nuuars - uuari->num_low_latency_uuars; -	for (i = start_uuar; i < nuuars; i++) { +	for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) {  		if (!test_bit(i, uuari->bitmap)) {  			set_bit(i, uuari->bitmap);  			uuari->count[i]++; @@ -350,19 +411,10 @@ static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari)  static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari)  { -	int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; -	int minidx = 1; -	int uuarn; -	int end; +	int minidx = first_med_uuar();  	int i; -	end = nuuars - uuari->num_low_latency_uuars; - -	for (i = 1; i < end; i++) { -		uuarn = i & 3; -		if (uuarn == 2 || uuarn == 3) -			continue; - +	for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) {  		if (uuari->count[i] < uuari->count[minidx])  			minidx = i;  	} @@ -384,11 +436,17 @@ static int alloc_uuar(struct mlx5_uuar_info *uuari,  		break;  	case MLX5_IB_LATENCY_CLASS_MEDIUM: -		uuarn = alloc_med_class_uuar(uuari); +		if (uuari->ver < 2) +			uuarn = -ENOMEM; +		else +			uuarn = alloc_med_class_uuar(uuari);  		break;  	case MLX5_IB_LATENCY_CLASS_HIGH: -		uuarn = alloc_high_class_uuar(uuari); +		if (uuari->ver < 2) +			uuarn = -ENOMEM; +		else +			uuarn = alloc_high_class_uuar(uuari);  		break;  	case MLX5_IB_LATENCY_CLASS_FAST_PATH: @@ -479,12 +537,12 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,  {  	struct mlx5_ib_ucontext *context;  	struct mlx5_ib_create_qp ucmd; -	int page_shift; +	int page_shift = 0;  	int uar_index;  	int npages; -	u32 offset; +	u32 offset = 0;  	int uuarn; -	int ncont; +	int ncont = 0;  	int err;  	err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); @@ -500,38 +558,53 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,  	uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH);  	if (uuarn < 0) {  		mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); -		mlx5_ib_dbg(dev, "reverting to high latency\n"); -		uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); +		mlx5_ib_dbg(dev, "reverting to medium latency\n"); +		uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM);  		if (uuarn < 0) { -			mlx5_ib_dbg(dev, "uuar allocation failed\n"); -			return uuarn; +			mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); +			mlx5_ib_dbg(dev, "reverting to high latency\n"); +			uuarn = 
alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); +			if (uuarn < 0) { +				mlx5_ib_warn(dev, "uuar allocation failed\n"); +				return uuarn; +			}  		}  	}  	uar_index = uuarn_to_uar_index(&context->uuari, uuarn);  	mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); +	qp->rq.offset = 0; +	qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); +	qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; +  	err = set_user_buf_size(dev, qp, &ucmd);  	if (err)  		goto err_uuar; -	qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, -			       qp->buf_size, 0, 0); -	if (IS_ERR(qp->umem)) { -		mlx5_ib_dbg(dev, "umem_get failed\n"); -		err = PTR_ERR(qp->umem); -		goto err_uuar; +	if (ucmd.buf_addr && qp->buf_size) { +		qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, +				       qp->buf_size, 0, 0); +		if (IS_ERR(qp->umem)) { +			mlx5_ib_dbg(dev, "umem_get failed\n"); +			err = PTR_ERR(qp->umem); +			goto err_uuar; +		} +	} else { +		qp->umem = NULL;  	} -	mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift, -			   &ncont, NULL); -	err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset); -	if (err) { -		mlx5_ib_warn(dev, "bad offset\n"); -		goto err_umem; +	if (qp->umem) { +		mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift, +				   &ncont, NULL); +		err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset); +		if (err) { +			mlx5_ib_warn(dev, "bad offset\n"); +			goto err_umem; +		} +		mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", +			    ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset);  	} -	mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", -		    ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset);  	*inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;  	*in = mlx5_vzalloc(*inlen); @@ -539,9 +612,10 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,  		err = -ENOMEM;  		goto err_umem;  	} -	mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); +	if (qp->umem) +		mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0);  	(*in)->ctx.log_pg_sz_remote_qpn = -		cpu_to_be32((page_shift - PAGE_SHIFT) << 24); +		cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);  	(*in)->ctx.params2 = cpu_to_be32(offset << 6);  	(*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); @@ -570,7 +644,8 @@ err_free:  	mlx5_vfree(*in);  err_umem: -	ib_umem_release(qp->umem); +	if (qp->umem) +		ib_umem_release(qp->umem);  err_uuar:  	free_uuar(&context->uuari, uuarn); @@ -583,7 +658,8 @@ static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp)  	context = to_mucontext(pd->uobject->context);  	mlx5_ib_db_unmap_user(context, &qp->db); -	ib_umem_release(qp->umem); +	if (qp->umem) +		ib_umem_release(qp->umem);  	free_uuar(&context->uuari, qp->uuarn);  } @@ -599,8 +675,8 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,  	int err;  	uuari = &dev->mdev.priv.uuari; -	if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) -		qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; +	if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) +		return -EINVAL;  	if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)  		lc = MLX5_IB_LATENCY_CLASS_FAST_PATH; @@ -638,7 +714,8 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,  		goto err_buf;  	}  	(*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); -	(*in)->ctx.log_pg_sz_remote_qpn = 
cpu_to_be32((qp->buf.page_shift - PAGE_SHIFT) << 24); +	(*in)->ctx.log_pg_sz_remote_qpn = +		cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);  	/* Set "fast registration enabled" for all kernel QPs */  	(*in)->ctx.params1 |= cpu_to_be32(1 << 11);  	(*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); @@ -734,6 +811,15 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,  	spin_lock_init(&qp->sq.lock);  	spin_lock_init(&qp->rq.lock); +	if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { +		if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) { +			mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); +			return -EINVAL; +		} else { +			qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; +		} +	} +  	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)  		qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; @@ -805,6 +891,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,  	if (qp->wq_sig)  		in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG); +	if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) +		in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); +  	if (qp->scat_cqe && is_connected(init_attr->qp_type)) {  		int rcqe_sz;  		int scqe_sz; @@ -1280,6 +1369,11 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q  					  MLX5_QP_OPTPAR_Q_KEY,  			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX	|  					   MLX5_QP_OPTPAR_Q_KEY, +			[MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | +					  MLX5_QP_OPTPAR_RRE            | +					  MLX5_QP_OPTPAR_RAE            | +					  MLX5_QP_OPTPAR_RWE            | +					  MLX5_QP_OPTPAR_PKEY_INDEX,  		},  	},  	[MLX5_QP_STATE_RTR] = { @@ -1302,9 +1396,11 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q  					  MLX5_QP_OPTPAR_RAE		|  					  MLX5_QP_OPTPAR_RWE		|  					  MLX5_QP_OPTPAR_RNR_TIMEOUT	| -					  MLX5_QP_OPTPAR_PM_STATE, +					  MLX5_QP_OPTPAR_PM_STATE	| +					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,  			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		| -					  MLX5_QP_OPTPAR_PM_STATE, +					  MLX5_QP_OPTPAR_PM_STATE	| +					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,  			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY		|  					  MLX5_QP_OPTPAR_SRQN		|  					  MLX5_QP_OPTPAR_CQN_RCV, @@ -1314,6 +1410,11 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q  		[MLX5_QP_STATE_RTS] = {  			[MLX5_QP_ST_UD]	 = MLX5_QP_OPTPAR_Q_KEY,  			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, +			[MLX5_QP_ST_UC]	 = MLX5_QP_OPTPAR_RWE, +			[MLX5_QP_ST_RC]	 = MLX5_QP_OPTPAR_RNR_TIMEOUT	| +					   MLX5_QP_OPTPAR_RWE		| +					   MLX5_QP_OPTPAR_RAE		| +					   MLX5_QP_OPTPAR_RRE,  		},  	},  }; @@ -1530,7 +1631,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,  	mlx5_cur = to_mlx5_state(cur_state);  	mlx5_new = to_mlx5_state(new_state);  	mlx5_st = to_mlx5_st(ibqp->qp_type); -	if (mlx5_cur < 0 || mlx5_new < 0 || mlx5_st < 0) +	if (mlx5_st < 0)  		goto out;  	optpar = ib_mask_to_mlx5_opt(attr_mask); @@ -1593,7 +1694,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,  	new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state;  	if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && -	    !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) +	    !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, +				IB_LINK_LAYER_UNSPECIFIED))  		goto out;  	if ((attr_mask & IB_QP_PORT) && @@ -1651,29 +1753,6 @@ static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,  	rseg->reserved = 0;  } -static void set_atomic_seg(struct mlx5_wqe_atomic_seg *aseg, struct ib_send_wr *wr) -{ -	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { -		aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); -		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add); -	} else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) { -		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); -		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add_mask); -	} else { -		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); -		aseg->compare  = 0; -	} -} - -static void set_masked_atomic_seg(struct mlx5_wqe_masked_atomic_seg *aseg, -				  struct ib_send_wr *wr) -{ -	aseg->swap_add		= cpu_to_be64(wr->wr.atomic.swap); -	aseg->swap_add_mask	= cpu_to_be64(wr->wr.atomic.swap_mask); -	aseg->compare		= cpu_to_be64(wr->wr.atomic.compare_add); -	aseg->compare_mask	= cpu_to_be64(wr->wr.atomic.compare_add_mask); -} -  static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,  			     struct ib_send_wr *wr)  { @@ -1714,6 +1793,27 @@ static __be64 frwr_mkey_mask(void)  	return cpu_to_be64(result);  } +static __be64 sig_mkey_mask(void) +{ +	u64 result; + +	result = MLX5_MKEY_MASK_LEN		| +		MLX5_MKEY_MASK_PAGE_SIZE	| +		MLX5_MKEY_MASK_START_ADDR	| +		MLX5_MKEY_MASK_EN_SIGERR	| +		MLX5_MKEY_MASK_EN_RINVAL	| +		MLX5_MKEY_MASK_KEY		| +		MLX5_MKEY_MASK_LR		| +		MLX5_MKEY_MASK_LW		| +		MLX5_MKEY_MASK_RR		| +		MLX5_MKEY_MASK_RW		| +		MLX5_MKEY_MASK_SMALL_FENCE	| +		MLX5_MKEY_MASK_FREE		| +		MLX5_MKEY_MASK_BSF_EN; + +	return cpu_to_be64(result); +} +  static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,  				 struct ib_send_wr *wr, int li)  { @@ -1747,6 +1847,7 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,  			MLX5_MKEY_MASK_PD		|  			MLX5_MKEY_MASK_LR		|  			MLX5_MKEY_MASK_LW		| +			MLX5_MKEY_MASK_KEY		|  			MLX5_MKEY_MASK_RR		|  			MLX5_MKEY_MASK_RW		|  			MLX5_MKEY_MASK_A		| @@ -1768,7 +1869,7 @@ static u8 get_umr_flags(int acc)  	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |  	       (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |  	       (acc & IB_ACCESS_LOCAL_WRITE   ? 
MLX5_PERM_LOCAL_WRITE  : 0) | -		MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; +		MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN;  }  static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, @@ -1780,7 +1881,8 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,  		return;  	} -	seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags); +	seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) | +		     MLX5_ACCESS_MODE_MTT;  	*writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE);  	seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00);  	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); @@ -1803,7 +1905,8 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w  	seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);  	seg->len = cpu_to_be64(wr->wr.fast_reg.length);  	seg->log2_page_size = wr->wr.fast_reg.page_shift; -	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); +	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | +				       mlx5_mkey_variant(wr->wr.fast_reg.rkey));  }  static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, @@ -1895,6 +1998,342 @@ static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr,  	return 0;  } +static u16 prot_field_size(enum ib_signature_type type) +{ +	switch (type) { +	case IB_SIG_TYPE_T10_DIF: +		return MLX5_DIF_SIZE; +	default: +		return 0; +	} +} + +static u8 bs_selector(int block_size) +{ +	switch (block_size) { +	case 512:	    return 0x1; +	case 520:	    return 0x2; +	case 4096:	    return 0x3; +	case 4160:	    return 0x4; +	case 1073741824:    return 0x5; +	default:	    return 0; +	} +} + +static int format_selector(struct ib_sig_attrs *attr, +			   struct ib_sig_domain *domain, +			   int *selector) +{ + +#define FORMAT_DIF_NONE		0 +#define FORMAT_DIF_CRC_INC	8 +#define FORMAT_DIF_CRC_NO_INC	12 +#define FORMAT_DIF_CSUM_INC	13 +#define FORMAT_DIF_CSUM_NO_INC	14 + +	switch (domain->sig.dif.type) { +	case IB_T10DIF_NONE: +		/* No DIF */ +		*selector = FORMAT_DIF_NONE; +		break; +	case IB_T10DIF_TYPE1: /* Fall through */ +	case IB_T10DIF_TYPE2: +		switch (domain->sig.dif.bg_type) { +		case IB_T10DIF_CRC: +			*selector = FORMAT_DIF_CRC_INC; +			break; +		case IB_T10DIF_CSUM: +			*selector = FORMAT_DIF_CSUM_INC; +			break; +		default: +			return 1; +		} +		break; +	case IB_T10DIF_TYPE3: +		switch (domain->sig.dif.bg_type) { +		case IB_T10DIF_CRC: +			*selector = domain->sig.dif.type3_inc_reftag ? +					   FORMAT_DIF_CRC_INC : +					   FORMAT_DIF_CRC_NO_INC; +			break; +		case IB_T10DIF_CSUM: +			*selector = domain->sig.dif.type3_inc_reftag ? 
+					   FORMAT_DIF_CSUM_INC : +					   FORMAT_DIF_CSUM_NO_INC; +			break; +		default: +			return 1; +		} +		break; +	default: +		return 1; +	} + +	return 0; +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, +			struct ib_sig_attrs *sig_attrs, +			struct mlx5_bsf *bsf, u32 data_size) +{ +	struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; +	struct mlx5_bsf_basic *basic = &bsf->basic; +	struct ib_sig_domain *mem = &sig_attrs->mem; +	struct ib_sig_domain *wire = &sig_attrs->wire; +	int ret, selector; + +	memset(bsf, 0, sizeof(*bsf)); +	switch (sig_attrs->mem.sig_type) { +	case IB_SIG_TYPE_T10_DIF: +		if (sig_attrs->wire.sig_type != IB_SIG_TYPE_T10_DIF) +			return -EINVAL; + +		/* Input domain check byte mask */ +		basic->check_byte_mask = sig_attrs->check_mask; +		if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && +		    mem->sig.dif.type == wire->sig.dif.type) { +			/* Same block structure */ +			basic->bsf_size_sbs = 1 << 4; +			if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) +				basic->wire.copy_byte_mask |= 0xc0; +			if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) +				basic->wire.copy_byte_mask |= 0x30; +			if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) +				basic->wire.copy_byte_mask |= 0x0f; +		} else +			basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); + +		basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); +		basic->raw_data_size = cpu_to_be32(data_size); + +		ret = format_selector(sig_attrs, mem, &selector); +		if (ret) +			return -EINVAL; +		basic->m_bfs_psv = cpu_to_be32(selector << 24 | +					       msig->psv_memory.psv_idx); + +		ret = format_selector(sig_attrs, wire, &selector); +		if (ret) +			return -EINVAL; +		basic->w_bfs_psv = cpu_to_be32(selector << 24 | +					       msig->psv_wire.psv_idx); +		break; + +	default: +		return -EINVAL; +	} + +	return 0; +} + +static int set_sig_data_segment(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, +				void **seg, int *size) +{ +	struct ib_sig_attrs *sig_attrs = wr->wr.sig_handover.sig_attrs; +	struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; +	struct mlx5_bsf *bsf; +	u32 data_len = wr->sg_list->length; +	u32 data_key = wr->sg_list->lkey; +	u64 data_va = wr->sg_list->addr; +	int ret; +	int wqe_size; + +	if (!wr->wr.sig_handover.prot || +	    (data_key == wr->wr.sig_handover.prot->lkey && +	     data_va == wr->wr.sig_handover.prot->addr && +	     data_len == wr->wr.sig_handover.prot->length)) { +		/** +		 * Source domain doesn't contain signature information +		 * or data and protection are interleaved in memory. 
+		 * So need construct: +		 *                  ------------------ +		 *                 |     data_klm     | +		 *                  ------------------ +		 *                 |       BSF        | +		 *                  ------------------ +		 **/ +		struct mlx5_klm *data_klm = *seg; + +		data_klm->bcount = cpu_to_be32(data_len); +		data_klm->key = cpu_to_be32(data_key); +		data_klm->va = cpu_to_be64(data_va); +		wqe_size = ALIGN(sizeof(*data_klm), 64); +	} else { +		/** +		 * Source domain contains signature information +		 * So need construct a strided block format: +		 *               --------------------------- +		 *              |     stride_block_ctrl     | +		 *               --------------------------- +		 *              |          data_klm         | +		 *               --------------------------- +		 *              |          prot_klm         | +		 *               --------------------------- +		 *              |             BSF           | +		 *               --------------------------- +		 **/ +		struct mlx5_stride_block_ctrl_seg *sblock_ctrl; +		struct mlx5_stride_block_entry *data_sentry; +		struct mlx5_stride_block_entry *prot_sentry; +		u32 prot_key = wr->wr.sig_handover.prot->lkey; +		u64 prot_va = wr->wr.sig_handover.prot->addr; +		u16 block_size = sig_attrs->mem.sig.dif.pi_interval; +		int prot_size; + +		sblock_ctrl = *seg; +		data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); +		prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + +		prot_size = prot_field_size(sig_attrs->mem.sig_type); +		if (!prot_size) { +			pr_err("Bad block size given: %u\n", block_size); +			return -EINVAL; +		} +		sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + +							    prot_size); +		sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); +		sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); +		sblock_ctrl->num_entries = cpu_to_be16(2); + +		data_sentry->bcount = cpu_to_be16(block_size); +		data_sentry->key = cpu_to_be32(data_key); +		data_sentry->va = cpu_to_be64(data_va); +		data_sentry->stride = cpu_to_be16(block_size); + +		prot_sentry->bcount = cpu_to_be16(prot_size); +		prot_sentry->key = cpu_to_be32(prot_key); +		prot_sentry->va = cpu_to_be64(prot_va); +		prot_sentry->stride = cpu_to_be16(prot_size); + +		wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + +				 sizeof(*prot_sentry), 64); +	} + +	*seg += wqe_size; +	*size += wqe_size / 16; +	if (unlikely((*seg == qp->sq.qend))) +		*seg = mlx5_get_send_wqe(qp, 0); + +	bsf = *seg; +	ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); +	if (ret) +		return -EINVAL; + +	*seg += sizeof(*bsf); +	*size += sizeof(*bsf) / 16; +	if (unlikely((*seg == qp->sq.qend))) +		*seg = mlx5_get_send_wqe(qp, 0); + +	return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, +				 struct ib_send_wr *wr, u32 nelements, +				 u32 length, u32 pdn) +{ +	struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; +	u32 sig_key = sig_mr->rkey; +	u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + +	memset(seg, 0, sizeof(*seg)); + +	seg->flags = get_umr_flags(wr->wr.sig_handover.access_flags) | +				   MLX5_ACCESS_MODE_KLM; +	seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); +	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | +				    MLX5_MKEY_BSF_EN | pdn); +	seg->len = cpu_to_be64(length); +	seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements))); +	seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct 
mlx5_wqe_umr_ctrl_seg *umr, +				struct ib_send_wr *wr, u32 nelements) +{ +	memset(umr, 0, sizeof(*umr)); + +	umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; +	umr->klm_octowords = get_klm_octo(nelements); +	umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); +	umr->mkey_mask = sig_mkey_mask(); +} + + +static int set_sig_umr_wr(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, +			  void **seg, int *size) +{ +	struct mlx5_ib_mr *sig_mr = to_mmr(wr->wr.sig_handover.sig_mr); +	u32 pdn = get_pd(qp)->pdn; +	u32 klm_oct_size; +	int region_len, ret; + +	if (unlikely(wr->num_sge != 1) || +	    unlikely(wr->wr.sig_handover.access_flags & +		     IB_ACCESS_REMOTE_ATOMIC) || +	    unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || +	    unlikely(!sig_mr->sig->sig_status_checked)) +		return -EINVAL; + +	/* length of the protected region, data + protection */ +	region_len = wr->sg_list->length; +	if (wr->wr.sig_handover.prot && +	    (wr->wr.sig_handover.prot->lkey != wr->sg_list->lkey  || +	     wr->wr.sig_handover.prot->addr != wr->sg_list->addr  || +	     wr->wr.sig_handover.prot->length != wr->sg_list->length)) +		region_len += wr->wr.sig_handover.prot->length; + +	/** +	 * KLM octoword size - if protection was provided +	 * then we use strided block format (3 octowords), +	 * else we use single KLM (1 octoword) +	 **/ +	klm_oct_size = wr->wr.sig_handover.prot ? 3 : 1; + +	set_sig_umr_segment(*seg, wr, klm_oct_size); +	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); +	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; +	if (unlikely((*seg == qp->sq.qend))) +		*seg = mlx5_get_send_wqe(qp, 0); + +	set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn); +	*seg += sizeof(struct mlx5_mkey_seg); +	*size += sizeof(struct mlx5_mkey_seg) / 16; +	if (unlikely((*seg == qp->sq.qend))) +		*seg = mlx5_get_send_wqe(qp, 0); + +	ret = set_sig_data_segment(wr, qp, seg, size); +	if (ret) +		return ret; + +	sig_mr->sig->sig_status_checked = false; +	return 0; +} + +static int set_psv_wr(struct ib_sig_domain *domain, +		      u32 psv_idx, void **seg, int *size) +{ +	struct mlx5_seg_set_psv *psv_seg = *seg; + +	memset(psv_seg, 0, sizeof(*psv_seg)); +	psv_seg->psv_num = cpu_to_be32(psv_idx); +	switch (domain->sig_type) { +	case IB_SIG_TYPE_T10_DIF: +		psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | +						     domain->sig.dif.app_tag); +		psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + +		*seg += sizeof(*psv_seg); +		*size += sizeof(*psv_seg) / 16; +		break; + +	default: +		pr_err("Bad signature type given.\n"); +		return 1; +	} + +	return 0; +} +  static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size,  			  struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp)  { @@ -1916,6 +2355,10 @@ static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size,  	if (unlikely((*seg == qp->sq.qend)))  		*seg = mlx5_get_send_wqe(qp, 0);  	if (!li) { +		if (unlikely(wr->wr.fast_reg.page_list_len > +			     wr->wr.fast_reg.page_list->max_page_list_len)) +			return	-ENOMEM; +  		set_frwr_pages(*seg, wr, mdev, pd, writ);  		*seg += sizeof(struct mlx5_wqe_data_seg);  		*size += (sizeof(struct mlx5_wqe_data_seg) / 16); @@ -1978,6 +2421,59 @@ static u8 get_fence(u8 fence, struct ib_send_wr *wr)  	}  } +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, +		     struct mlx5_wqe_ctrl_seg **ctrl, +		     struct ib_send_wr *wr, int *idx, +		     int *size, int nreq) +{ +	int err = 0; + +	if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, 
qp->ibqp.send_cq))) { +		err = -ENOMEM; +		return err; +	} + +	*idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); +	*seg = mlx5_get_send_wqe(qp, *idx); +	*ctrl = *seg; +	*(uint32_t *)(*seg + 8) = 0; +	(*ctrl)->imm = send_ieth(wr); +	(*ctrl)->fm_ce_se = qp->sq_signal_bits | +		(wr->send_flags & IB_SEND_SIGNALED ? +		 MLX5_WQE_CTRL_CQ_UPDATE : 0) | +		(wr->send_flags & IB_SEND_SOLICITED ? +		 MLX5_WQE_CTRL_SOLICITED : 0); + +	*seg += sizeof(**ctrl); +	*size = sizeof(**ctrl) / 16; + +	return err; +} + +static void finish_wqe(struct mlx5_ib_qp *qp, +		       struct mlx5_wqe_ctrl_seg *ctrl, +		       u8 size, unsigned idx, u64 wr_id, +		       int nreq, u8 fence, u8 next_fence, +		       u32 mlx5_opcode) +{ +	u8 opmod = 0; + +	ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | +					     mlx5_opcode | ((u32)opmod << 24)); +	ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); +	ctrl->fm_ce_se |= fence; +	qp->fm_cache = next_fence; +	if (unlikely(qp->wq_sig)) +		ctrl->signature = wq_sig(ctrl); + +	qp->sq.wrid[idx] = wr_id; +	qp->sq.w_list[idx].opcode = mlx5_opcode; +	qp->sq.wqe_head[idx] = qp->sq.head + nreq; +	qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); +	qp->sq.w_list[idx].next = qp->sq.cur_post; +} + +  int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  		      struct ib_send_wr **bad_wr)  { @@ -1985,13 +2481,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);  	struct mlx5_core_dev *mdev = &dev->mdev;  	struct mlx5_ib_qp *qp = to_mqp(ibqp); +	struct mlx5_ib_mr *mr;  	struct mlx5_wqe_data_seg *dpseg;  	struct mlx5_wqe_xrc_seg *xrc;  	struct mlx5_bf *bf = qp->bf;  	int uninitialized_var(size);  	void *qend = qp->sq.qend;  	unsigned long flags; -	u32 mlx5_opcode;  	unsigned idx;  	int err = 0;  	int inl = 0; @@ -2000,7 +2496,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  	int nreq;  	int i;  	u8 next_fence = 0; -	u8 opmod = 0;  	u8 fence;  	spin_lock_irqsave(&qp->sq.lock, flags); @@ -2013,36 +2508,23 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  			goto out;  		} -		if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) { +		fence = qp->fm_cache; +		num_sge = wr->num_sge; +		if (unlikely(num_sge > qp->sq.max_gs)) {  			mlx5_ib_warn(dev, "\n");  			err = -ENOMEM;  			*bad_wr = wr;  			goto out;  		} -		fence = qp->fm_cache; -		num_sge = wr->num_sge; -		if (unlikely(num_sge > qp->sq.max_gs)) { +		err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); +		if (err) {  			mlx5_ib_warn(dev, "\n");  			err = -ENOMEM;  			*bad_wr = wr;  			goto out;  		} -		idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); -		seg = mlx5_get_send_wqe(qp, idx); -		ctrl = seg; -		*(uint32_t *)(seg + 8) = 0; -		ctrl->imm = send_ieth(wr); -		ctrl->fm_ce_se = qp->sq_signal_bits | -			(wr->send_flags & IB_SEND_SIGNALED ? -			 MLX5_WQE_CTRL_CQ_UPDATE : 0) | -			(wr->send_flags & IB_SEND_SOLICITED ? 
-			 MLX5_WQE_CTRL_SOLICITED : 0); - -		seg += sizeof(*ctrl); -		size = sizeof(*ctrl) / 16; -  		switch (ibqp->qp_type) {  		case IB_QPT_XRC_INI:  			xrc = seg; @@ -2063,28 +2545,11 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  			case IB_WR_ATOMIC_CMP_AND_SWP:  			case IB_WR_ATOMIC_FETCH_AND_ADD: -				set_raddr_seg(seg, wr->wr.atomic.remote_addr, -					      wr->wr.atomic.rkey); -				seg  += sizeof(struct mlx5_wqe_raddr_seg); - -				set_atomic_seg(seg, wr); -				seg  += sizeof(struct mlx5_wqe_atomic_seg); - -				size += (sizeof(struct mlx5_wqe_raddr_seg) + -					 sizeof(struct mlx5_wqe_atomic_seg)) / 16; -				break; -  			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: -				set_raddr_seg(seg, wr->wr.atomic.remote_addr, -					      wr->wr.atomic.rkey); -				seg  += sizeof(struct mlx5_wqe_raddr_seg); - -				set_masked_atomic_seg(seg, wr); -				seg  += sizeof(struct mlx5_wqe_masked_atomic_seg); - -				size += (sizeof(struct mlx5_wqe_raddr_seg) + -					 sizeof(struct mlx5_wqe_masked_atomic_seg)) / 16; -				break; +				mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); +				err = -ENOSYS; +				*bad_wr = wr; +				goto out;  			case IB_WR_LOCAL_INV:  				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; @@ -2112,6 +2577,73 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  				num_sge = 0;  				break; +			case IB_WR_REG_SIG_MR: +				qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; +				mr = to_mmr(wr->wr.sig_handover.sig_mr); + +				ctrl->imm = cpu_to_be32(mr->ibmr.rkey); +				err = set_sig_umr_wr(wr, qp, &seg, &size); +				if (err) { +					mlx5_ib_warn(dev, "\n"); +					*bad_wr = wr; +					goto out; +				} + +				finish_wqe(qp, ctrl, size, idx, wr->wr_id, +					   nreq, get_fence(fence, wr), +					   next_fence, MLX5_OPCODE_UMR); +				/* +				 * SET_PSV WQEs are not signaled and solicited +				 * on error +				 */ +				wr->send_flags &= ~IB_SEND_SIGNALED; +				wr->send_flags |= IB_SEND_SOLICITED; +				err = begin_wqe(qp, &seg, &ctrl, wr, +						&idx, &size, nreq); +				if (err) { +					mlx5_ib_warn(dev, "\n"); +					err = -ENOMEM; +					*bad_wr = wr; +					goto out; +				} + +				err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->mem, +						 mr->sig->psv_memory.psv_idx, &seg, +						 &size); +				if (err) { +					mlx5_ib_warn(dev, "\n"); +					*bad_wr = wr; +					goto out; +				} + +				finish_wqe(qp, ctrl, size, idx, wr->wr_id, +					   nreq, get_fence(fence, wr), +					   next_fence, MLX5_OPCODE_SET_PSV); +				err = begin_wqe(qp, &seg, &ctrl, wr, +						&idx, &size, nreq); +				if (err) { +					mlx5_ib_warn(dev, "\n"); +					err = -ENOMEM; +					*bad_wr = wr; +					goto out; +				} + +				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; +				err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->wire, +						 mr->sig->psv_wire.psv_idx, &seg, +						 &size); +				if (err) { +					mlx5_ib_warn(dev, "\n"); +					*bad_wr = wr; +					goto out; +				} + +				finish_wqe(qp, ctrl, size, idx, wr->wr_id, +					   nreq, get_fence(fence, wr), +					   next_fence, MLX5_OPCODE_SET_PSV); +				num_sge = 0; +				goto skip_psv; +  			default:  				break;  			} @@ -2192,22 +2724,10 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  			}  		} -		mlx5_opcode = mlx5_ib_opcode[wr->opcode]; -		ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8)	| -						     mlx5_opcode			| -						     ((u32)opmod << 24)); -		ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); -		ctrl->fm_ce_se |= get_fence(fence, wr); -		qp->fm_cache = next_fence; -		if (unlikely(qp->wq_sig)) 
-			ctrl->signature = wq_sig(ctrl); - -		qp->sq.wrid[idx] = wr->wr_id; -		qp->sq.w_list[idx].opcode = mlx5_opcode; -		qp->sq.wqe_head[idx] = qp->sq.head + nreq; -		qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); -		qp->sq.w_list[idx].next = qp->sq.cur_post; - +		finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, +			   get_fence(fence, wr), next_fence, +			   mlx5_ib_opcode[wr->opcode]); +skip_psv:  		if (0)  			dump_wqe(qp, idx, size);  	} @@ -2223,6 +2743,10 @@ out:  		qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); +		/* Make sure doorbell record is visible to the HCA before +		 * we hit doorbell */ +		wmb(); +  		if (bf->need_lock)  			spin_lock(&bf->lock); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 84d297afd6a..384af6dec5e 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -35,6 +35,7 @@  #include <linux/mlx5/srq.h>  #include <linux/slab.h>  #include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h>  #include "mlx5_ib.h"  #include "user.h" @@ -78,16 +79,27 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,  {  	struct mlx5_ib_dev *dev = to_mdev(pd->device);  	struct mlx5_ib_create_srq ucmd; +	size_t ucmdlen;  	int err;  	int npages;  	int page_shift;  	int ncont;  	u32 offset; -	if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { +	ucmdlen = +		(udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < +		 sizeof(ucmd)) ? (sizeof(ucmd) - +				  sizeof(ucmd.reserved)) : sizeof(ucmd); + +	if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) {  		mlx5_ib_dbg(dev, "failed copy udata\n");  		return -EFAULT;  	} + +	if (ucmdlen == sizeof(ucmd) && +	    ucmd.reserved != 0) +		return -EINVAL; +  	srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);  	srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, @@ -123,7 +135,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,  		goto err_in;  	} -	(*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; +	(*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;  	(*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);  	return 0; @@ -192,7 +204,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,  	}  	srq->wq_sig = !!srq_signature; -	(*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; +	(*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;  	return 0; @@ -295,7 +307,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,  	mlx5_vfree(in);  	if (err) {  		mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err); -		goto err_srq; +		goto err_usr_kern_srq;  	}  	mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn); @@ -316,6 +328,8 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,  err_core:  	mlx5_core_destroy_srq(&dev->mdev, &srq->msrq); + +err_usr_kern_srq:  	if (pd->uobject)  		destroy_srq_user(pd, srq);  	else @@ -388,9 +402,7 @@ int mlx5_ib_destroy_srq(struct ib_srq *srq)  		mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);  		ib_umem_release(msrq->umem);  	} else { -		kfree(msrq->wrid); -		mlx5_buf_free(&dev->mdev, &msrq->buf); -		mlx5_db_free(&dev->mdev, &msrq->db); +		destroy_srq_kernel(dev, msrq);  	}  	kfree(srq); diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index a886de3e593..d0ba264ac1e 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -62,6 +62,13 @@ struct mlx5_ib_alloc_ucontext_req {  	__u32	num_low_latency_uuars;  }; +struct 
mlx5_ib_alloc_ucontext_req_v2 { +	__u32	total_num_uuars; +	__u32	num_low_latency_uuars; +	__u32	flags; +	__u32	reserved; +}; +  struct mlx5_ib_alloc_ucontext_resp {  	__u32	qp_tab_size;  	__u32	bf_reg_size; @@ -84,6 +91,7 @@ struct mlx5_ib_create_cq {  	__u64	buf_addr;  	__u64	db_addr;  	__u32	cqe_size; +	__u32	reserved; /* explicit padding (optional on i386) */  };  struct mlx5_ib_create_cq_resp { @@ -93,12 +101,16 @@ struct mlx5_ib_create_cq_resp {  struct mlx5_ib_resize_cq {  	__u64	buf_addr; +	__u16	cqe_size; +	__u16	reserved0; +	__u32	reserved1;  };  struct mlx5_ib_create_srq {  	__u64	buf_addr;  	__u64	db_addr;  	__u32	flags; +	__u32	reserved; /* explicit padding (optional on i386) */  };  struct mlx5_ib_create_srq_resp { diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c index 7c9d35f39d7..69020173899 100644 --- a/drivers/infiniband/hw/mthca/mthca_eq.c +++ b/drivers/infiniband/hw/mthca/mthca_eq.c @@ -357,7 +357,7 @@ static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq)  			mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n",  				   eqe->type, eqe->subtype, eq->eqn);  			break; -		}; +		}  		set_eqe_hw(eqe);  		++eq->cons_index; diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 87897b95666..ded76c101dd 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -858,13 +858,9 @@ static int mthca_enable_msi_x(struct mthca_dev *mdev)  	entries[1].entry = 1;  	entries[2].entry = 2; -	err = pci_enable_msix(mdev->pdev, entries, ARRAY_SIZE(entries)); -	if (err) { -		if (err > 0) -			mthca_info(mdev, "Only %d MSI-X vectors available, " -				   "not using MSI-X\n", err); +	err = pci_enable_msix_exact(mdev->pdev, entries, ARRAY_SIZE(entries)); +	if (err)  		return err; -	}  	mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector;  	mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 5b71d43bd89..415f8e1a54d 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -695,6 +695,7 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries,  	if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) {  		mthca_free_cq(to_mdev(ibdev), cq); +		err = -EFAULT;  		goto err_free;  	} @@ -976,12 +977,12 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  				       u64 virt, int acc, struct ib_udata *udata)  {  	struct mthca_dev *dev = to_mdev(pd->device); -	struct ib_umem_chunk *chunk; +	struct scatterlist *sg;  	struct mthca_mr *mr;  	struct mthca_reg_mr ucmd;  	u64 *pages;  	int shift, n, len; -	int i, j, k; +	int i, k, entry;  	int err = 0;  	int write_mtt_size; @@ -1009,10 +1010,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	}  	shift = ffs(mr->umem->page_size) - 1; - -	n = 0; -	list_for_each_entry(chunk, &mr->umem->chunk_list, list) -		n += chunk->nents; +	n = mr->umem->nmap;  	mr->mtt = mthca_alloc_mtt(dev, n);  	if (IS_ERR(mr->mtt)) { @@ -1030,25 +1028,24 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); -	list_for_each_entry(chunk, &mr->umem->chunk_list, list) -		for (j = 0; j < chunk->nmap; ++j) { -			len = 
sg_dma_len(&chunk->page_list[j]) >> shift; -			for (k = 0; k < len; ++k) { -				pages[i++] = sg_dma_address(&chunk->page_list[j]) + -					mr->umem->page_size * k; -				/* -				 * Be friendly to write_mtt and pass it chunks -				 * of appropriate size. -				 */ -				if (i == write_mtt_size) { -					err = mthca_write_mtt(dev, mr->mtt, n, pages, i); -					if (err) -						goto mtt_done; -					n += i; -					i = 0; -				} +	for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { +		len = sg_dma_len(sg) >> shift; +		for (k = 0; k < len; ++k) { +			pages[i++] = sg_dma_address(sg) + +				mr->umem->page_size * k; +			/* +			 * Be friendly to write_mtt and pass it chunks +			 * of appropriate size. +			 */ +			if (i == write_mtt_size) { +				err = mthca_write_mtt(dev, mr->mtt, n, pages, i); +				if (err) +					goto mtt_done; +				n += i; +				i = 0;  			}  		} +	}  	if (i)  		err = mthca_write_mtt(dev, mr->mtt, n, pages, i); diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 26a68453610..e354b2f04ad 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -860,7 +860,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,  	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; -	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { +	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, +				IB_LINK_LAYER_UNSPECIFIED)) {  		mthca_dbg(dev, "Bad QP transition (transport %d) "  			  "%d->%d with attr 0x%08x\n",  			  qp->transport, cur_state, new_state, diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 429141078ee..3b2a6dc8ea9 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -68,7 +68,6 @@ MODULE_VERSION(DRV_VERSION);  int max_mtu = 9000;  int interrupt_mod_interval = 0; -  /* Interoperability */  int mpa_version = 1;  module_param(mpa_version, int, 0644); @@ -112,6 +111,16 @@ static struct pci_device_id nes_pci_table[] = {  MODULE_DEVICE_TABLE(pci, nes_pci_table); +/* registered nes netlink callbacks */ +static struct ibnl_client_cbs nes_nl_cb_table[] = { +	[RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, +	[RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, +	[RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, +	[RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, +	[RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, +	[RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; +  static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *);  static int nes_net_event(struct notifier_block *, unsigned long, void *);  static int nes_notifiers_registered; @@ -672,11 +681,25 @@ static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)  	}  	nes_notifiers_registered++; +	if (ibnl_add_client(RDMA_NL_NES, RDMA_NL_IWPM_NUM_OPS, nes_nl_cb_table)) +		printk(KERN_ERR PFX "%s[%u]: Failed to add netlink callback\n", +			__func__, __LINE__); + +	ret = iwpm_init(RDMA_NL_NES); +	if (ret) { +		printk(KERN_ERR PFX "%s: port mapper initialization failed\n", +				pci_name(pcidev)); +		goto bail7; +	} +  	INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status);  	/* Initialize network devices */ -	if ((netdev = nes_netdev_init(nesdev, mmio_regs)) == NULL) +	netdev = nes_netdev_init(nesdev, mmio_regs); +	if (netdev == NULL) { +		ret = -ENOMEM;  		goto bail7; +	}  	/* Register 
network device */  	ret = register_netdev(netdev); @@ -707,6 +730,7 @@ static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)  	nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n",  			nesdev->netdev_count, nesdev->nesadapter->netdev_count); +	ibnl_remove_client(RDMA_NL_NES);  	nes_notifiers_registered--;  	if (nes_notifiers_registered == 0) { @@ -770,6 +794,8 @@ static void nes_remove(struct pci_dev *pcidev)  				nesdev->nesadapter->netdev_count--;  			}  		} +	ibnl_remove_client(RDMA_NL_NES); +	iwpm_exit(RDMA_NL_NES);  	nes_notifiers_registered--;  	if (nes_notifiers_registered == 0) { diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index 33cc58941a3..bd9d132f11c 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -51,6 +51,8 @@  #include <rdma/ib_pack.h>  #include <rdma/rdma_cm.h>  #include <rdma/iw_cm.h> +#include <rdma/rdma_netlink.h> +#include <rdma/iw_portmap.h>  #define NES_SEND_FIRST_WRITE @@ -130,6 +132,7 @@  #define NES_DBG_IW_TX       0x00040000  #define NES_DBG_SHUTDOWN    0x00080000  #define NES_DBG_PAU         0x00100000 +#define NES_DBG_NLMSG       0x00200000  #define NES_DBG_RSVD1       0x10000000  #define NES_DBG_RSVD2       0x20000000  #define NES_DBG_RSVD3       0x40000000 diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 6b29249aa85..6f09a72e78d 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1,5 +1,5 @@  /* - * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved. + * Copyright (c) 2006 - 2014 Intel Corporation.  All rights reserved.   *   * This software is available to you under a choice of one of two   * licenses.  You may choose to be licensed under the terms of the GNU @@ -59,6 +59,7 @@  #include <net/route.h>  #include <net/ip_fib.h>  #include <net/tcp.h> +#include <linux/fcntl.h>  #include "nes.h" @@ -128,6 +129,7 @@ static void build_mpa_v1(struct nes_cm_node *, void *, u8);  static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **);  static void print_core(struct nes_cm_core *core); +static void record_ird_ord(struct nes_cm_node *, u16, u16);  /* External CM API Interface */  /* instance of function pointers for client API */ @@ -165,7 +167,6 @@ int nes_rem_ref_cm_node(struct nes_cm_node *cm_node)  {  	return rem_ref_cm_node(cm_node->cm_core, cm_node);  } -  /**   * create_event   */ @@ -317,7 +318,6 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type,  		}  	} -  	if (priv_data_len + mpa_hdr_len != len) {  		nes_debug(NES_DBG_CM, "The received ietf buffer was not right"  			" complete (%x + %x != %x)\n", @@ -356,25 +356,57 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type,  			/* send reset */  			return -EINVAL;  		} +		if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD) +			cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD; -		if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { +		if (cm_node->mpav2_ird_ord != IETF_NO_IRD_ORD) {  			/* responder */ -			if (cm_node->ord_size > ird_size) -				cm_node->ord_size = ird_size; -		} else { -			/* initiator */ -			if (cm_node->ord_size > ird_size) -				cm_node->ord_size = ird_size; - -			if (cm_node->ird_size < ord_size) { -				/* no resources available */ -				/* send terminate message */ -				return -EINVAL; +			if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { +				/* we are still negotiating */ +				if (ord_size > NES_MAX_IRD) { +					
cm_node->ird_size = NES_MAX_IRD; +				} else { +					cm_node->ird_size = ord_size; +					if (ord_size == 0 && +					(rtr_ctrl_ord & IETF_RDMA0_READ)) { +						cm_node->ird_size = 1; +						nes_debug(NES_DBG_CM, +						"%s: Remote peer doesn't support RDMA0_READ (ord=%u)\n", +							__func__, ord_size); +					} +				} +				if (ird_size > NES_MAX_ORD) +					cm_node->ord_size = NES_MAX_ORD; +				else +					cm_node->ord_size = ird_size; +			} else { /* initiator */ +				if (ord_size > NES_MAX_IRD) { +					nes_debug(NES_DBG_CM, +					"%s: Unable to support the requested (ord =%u)\n", +							__func__, ord_size); +					return -EINVAL; +				} +				cm_node->ird_size = ord_size; + +				if (ird_size > NES_MAX_ORD) { +					cm_node->ord_size = NES_MAX_ORD; +				} else { +					if (ird_size == 0 && +					(rtr_ctrl_ord & IETF_RDMA0_READ)) { +						nes_debug(NES_DBG_CM, +						"%s: Remote peer doesn't support RDMA0_READ (ird=%u)\n", +							__func__, ird_size); +						return -EINVAL; +					} else { +						cm_node->ord_size = ird_size; +					} +				}  			}  		}  		if (rtr_ctrl_ord & IETF_RDMA0_READ) {  			cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; +  		} else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) {  			cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO;  		} else {        /* Not supported RDMA0 operation */ @@ -450,11 +482,11 @@ static void form_cm_frame(struct sk_buff *skb,  	iph->ttl = 0x40;  	iph->protocol = 0x06;   /* IPPROTO_TCP */ -	iph->saddr = htonl(cm_node->loc_addr); -	iph->daddr = htonl(cm_node->rem_addr); +	iph->saddr = htonl(cm_node->mapped_loc_addr); +	iph->daddr = htonl(cm_node->mapped_rem_addr); -	tcph->source = htons(cm_node->loc_port); -	tcph->dest = htons(cm_node->rem_port); +	tcph->source = htons(cm_node->mapped_loc_port); +	tcph->dest = htons(cm_node->mapped_rem_port);  	tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num);  	if (flags & SET_ACK) { @@ -493,6 +525,100 @@ static void form_cm_frame(struct sk_buff *skb,  	cm_packets_created++;  } +/* + * nes_create_sockaddr - Record ip addr and tcp port in a sockaddr struct + */ +static void nes_create_sockaddr(__be32 ip_addr, __be16 port, +				struct sockaddr_storage *addr) +{ +	struct sockaddr_in *nes_sockaddr = (struct sockaddr_in *)addr; +	nes_sockaddr->sin_family = AF_INET; +	memcpy(&nes_sockaddr->sin_addr.s_addr, &ip_addr, sizeof(__be32)); +	nes_sockaddr->sin_port = port; +} + +/* + * nes_create_mapinfo - Create a mapinfo object in the port mapper data base + */ +static int nes_create_mapinfo(struct nes_cm_info *cm_info) +{ +	struct sockaddr_storage local_sockaddr; +	struct sockaddr_storage mapped_sockaddr; + +	nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), +				&local_sockaddr); +	nes_create_sockaddr(htonl(cm_info->mapped_loc_addr), +			htons(cm_info->mapped_loc_port), &mapped_sockaddr); + +	return iwpm_create_mapinfo(&local_sockaddr, +				&mapped_sockaddr, RDMA_NL_NES); +} + +/* + * nes_remove_mapinfo - Remove a mapinfo object from the port mapper data base + *                      and send a remove mapping op message to + *                      the userspace port mapper + */ +static int nes_remove_mapinfo(u32 loc_addr, u16 loc_port, +			u32 mapped_loc_addr, u16 mapped_loc_port) +{ +	struct sockaddr_storage local_sockaddr; +	struct sockaddr_storage mapped_sockaddr; + +	nes_create_sockaddr(htonl(loc_addr), htons(loc_port), &local_sockaddr); +	nes_create_sockaddr(htonl(mapped_loc_addr), htons(mapped_loc_port), +				&mapped_sockaddr); + +	iwpm_remove_mapinfo(&local_sockaddr, &mapped_sockaddr); +	return 
iwpm_remove_mapping(&local_sockaddr, RDMA_NL_NES); +} + +/* + * nes_form_pm_msg - Form a port mapper message with mapping info + */ +static void nes_form_pm_msg(struct nes_cm_info *cm_info, +				struct iwpm_sa_data *pm_msg) +{ +	nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), +				&pm_msg->loc_addr); +	nes_create_sockaddr(htonl(cm_info->rem_addr), htons(cm_info->rem_port), +				&pm_msg->rem_addr); +} + +/* + * nes_form_reg_msg - Form a port mapper message with dev info + */ +static void nes_form_reg_msg(struct nes_vnic *nesvnic, +			struct iwpm_dev_data *pm_msg) +{ +	memcpy(pm_msg->dev_name, nesvnic->nesibdev->ibdev.name, +				IWPM_DEVNAME_SIZE); +	memcpy(pm_msg->if_name, nesvnic->netdev->name, IWPM_IFNAME_SIZE); +} + +/* + * nes_record_pm_msg - Save the received mapping info + */ +static void nes_record_pm_msg(struct nes_cm_info *cm_info, +			struct iwpm_sa_data *pm_msg) +{ +	struct sockaddr_in *mapped_loc_addr = +			(struct sockaddr_in *)&pm_msg->mapped_loc_addr; +	struct sockaddr_in *mapped_rem_addr = +			(struct sockaddr_in *)&pm_msg->mapped_rem_addr; + +	if (mapped_loc_addr->sin_family == AF_INET) { +		cm_info->mapped_loc_addr = +			ntohl(mapped_loc_addr->sin_addr.s_addr); +		cm_info->mapped_loc_port = ntohs(mapped_loc_addr->sin_port); +	} +	if (mapped_rem_addr->sin_family == AF_INET) { +		cm_info->mapped_rem_addr = +			ntohl(mapped_rem_addr->sin_addr.s_addr); +		cm_info->mapped_rem_port = ntohs(mapped_rem_addr->sin_port); +	} +} +  /**   * print_core - dump a cm core   */ @@ -514,6 +640,19 @@ static void print_core(struct nes_cm_core *core)  	nes_debug(NES_DBG_CM, "-------------- end core ---------------\n");  } +static void record_ird_ord(struct nes_cm_node *cm_node, +					u16 conn_ird, u16 conn_ord) +{ +	if (conn_ird > NES_MAX_IRD) +		conn_ird = NES_MAX_IRD; + +	if (conn_ord > NES_MAX_ORD) +		conn_ord = NES_MAX_ORD; + +	cm_node->ird_size = conn_ird; +	cm_node->ord_size = conn_ord; +} +  /**   * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame   */ @@ -557,11 +696,13 @@ static void build_mpa_v2(struct nes_cm_node *cm_node,  	mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE);  	/* initialize RTR msg */ -	ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ? -			    IETF_NO_IRD_ORD : cm_node->ird_size; -	ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ? 
-			    IETF_NO_IRD_ORD : cm_node->ord_size; - +	if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { +		ctrl_ird = IETF_NO_IRD_ORD; +		ctrl_ord = IETF_NO_IRD_ORD; +	} else { +		ctrl_ird = cm_node->ird_size & IETF_NO_IRD_ORD; +		ctrl_ord = cm_node->ord_size & IETF_NO_IRD_ORD; +	}  	ctrl_ird |= IETF_PEER_TO_PEER;  	ctrl_ird |= IETF_FLPDU_ZERO_LEN; @@ -610,7 +751,7 @@ static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_a  	struct nes_qp *nesqp = *nesqp_addr;  	struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0]; -	u64temp = (unsigned long)nesqp; +	u64temp = (unsigned long)nesqp->nesuqp_addr;  	u64temp |= NES_SW_CONTEXT_ALIGN >> 1;  	set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp); @@ -1100,8 +1241,11 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core,  			  loc_addr, loc_port,  			  cm_node->rem_addr, cm_node->rem_port,  			  rem_addr, rem_port); -		if ((cm_node->loc_addr == loc_addr) && (cm_node->loc_port == loc_port) && -		    (cm_node->rem_addr == rem_addr) && (cm_node->rem_port == rem_port)) { +		if ((cm_node->mapped_loc_addr == loc_addr) && +			(cm_node->mapped_loc_port == loc_port) && +			(cm_node->mapped_rem_addr == rem_addr) && +			(cm_node->mapped_rem_port == rem_port)) { +  			add_ref_cm_node(cm_node);  			spin_unlock_irqrestore(&cm_core->ht_lock, flags);  			return cm_node; @@ -1118,18 +1262,28 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core,   * find_listener - find a cm node listening on this addr-port pair   */  static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, -					     nes_addr_t dst_addr, u16 dst_port, enum nes_cm_listener_state listener_state) +					nes_addr_t dst_addr, u16 dst_port, +					enum nes_cm_listener_state listener_state, int local)  {  	unsigned long flags;  	struct nes_cm_listener *listen_node; +	nes_addr_t listen_addr; +	u16 listen_port;  	/* walk list and find cm_node associated with this session ID */  	spin_lock_irqsave(&cm_core->listen_list_lock, flags);  	list_for_each_entry(listen_node, &cm_core->listen_list.list, list) { +		if (local) { +			listen_addr = listen_node->loc_addr; +			listen_port = listen_node->loc_port; +		} else { +			listen_addr = listen_node->mapped_loc_addr; +			listen_port = listen_node->mapped_loc_port; +		}  		/* compare node pair, return node handle if a match */ -		if (((listen_node->loc_addr == dst_addr) || -		     listen_node->loc_addr == 0x00000000) && -		    (listen_node->loc_port == dst_port) && +		if (((listen_addr == dst_addr) || +		     listen_addr == 0x00000000) && +		    (listen_port == dst_port) &&  		    (listener_state & listen_node->listener_state)) {  			atomic_inc(&listen_node->ref_count);  			spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); @@ -1142,7 +1296,6 @@ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core,  	return NULL;  } -  /**   * add_hte_node - add a cm node to the hash table   */ @@ -1263,9 +1416,20 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core,  		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); -		if (listener->nesvnic) -			nes_manage_apbvt(listener->nesvnic, listener->loc_port, -					 PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); +		if (listener->nesvnic) { +			nes_manage_apbvt(listener->nesvnic, +				listener->mapped_loc_port, +				PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), +				NES_MANAGE_APBVT_DEL); + +			nes_remove_mapinfo(listener->loc_addr, +					listener->loc_port, +					
listener->mapped_loc_addr, +					listener->mapped_loc_port); +			nes_debug(NES_DBG_NLMSG, +					"Delete APBVT mapped_loc_port = %04X\n", +					listener->mapped_loc_port); +		}  		nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener); @@ -1354,8 +1518,7 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi  				  neigh->ha, ntohl(rt->rt_gateway));  			if (arpindex >= 0) { -				if (!memcmp(nesadapter->arp_table[arpindex].mac_addr, -					    neigh->ha, ETH_ALEN)) { +				if (ether_addr_equal(nesadapter->arp_table[arpindex].mac_addr, neigh->ha)) {  					/* Mac address same as in nes_arp_table */  					goto out;  				} @@ -1408,10 +1571,16 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core,  	cm_node->loc_port = cm_info->loc_port;  	cm_node->rem_port = cm_info->rem_port; +	cm_node->mapped_loc_addr = cm_info->mapped_loc_addr; +	cm_node->mapped_rem_addr = cm_info->mapped_rem_addr; +	cm_node->mapped_loc_port = cm_info->mapped_loc_port; +	cm_node->mapped_rem_port = cm_info->mapped_rem_port; +  	cm_node->mpa_frame_rev = mpa_version;  	cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; -	cm_node->ird_size = IETF_NO_IRD_ORD; -	cm_node->ord_size = IETF_NO_IRD_ORD; +	cm_node->mpav2_ird_ord = 0; +	cm_node->ird_size = 0; +	cm_node->ord_size = 0;  	nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n",  		  &cm_node->loc_addr, cm_node->loc_port, @@ -1453,8 +1622,10 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core,  	cm_node->loopbackpartner = NULL;  	/* get the mac addr for the remote node */ -	oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); -	arpindex = nes_addr_resolve_neigh(nesvnic, cm_info->rem_addr, oldarpindex); +	oldarpindex = nes_arp_table(nesdev, cm_node->mapped_rem_addr, +				NULL, NES_ARP_RESOLVE); +	arpindex = nes_addr_resolve_neigh(nesvnic, +				cm_node->mapped_rem_addr, oldarpindex);  	if (arpindex < 0) {  		kfree(cm_node);  		return NULL; @@ -1516,11 +1687,14 @@ static int rem_ref_cm_node(struct nes_cm_core *cm_core,  		mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0);  	} else {  		if (cm_node->apbvt_set && cm_node->nesvnic) { -			nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port, -					 PCI_FUNC( -						 cm_node->nesvnic->nesdev->pcidev->devfn), +			nes_manage_apbvt(cm_node->nesvnic, cm_node->mapped_loc_port, +					 PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn),  					 NES_MANAGE_APBVT_DEL);  		} +		nes_debug(NES_DBG_NLMSG, "Delete APBVT mapped_loc_port = %04X\n", +					cm_node->mapped_loc_port); +		nes_remove_mapinfo(cm_node->loc_addr, cm_node->loc_port, +			cm_node->mapped_loc_addr, cm_node->mapped_loc_port);  	}  	atomic_dec(&cm_core->node_cnt); @@ -2188,17 +2362,21 @@ static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb,   * mini_cm_listen - create a listen node with params   */  static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, -					      struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) +			struct nes_vnic *nesvnic, struct nes_cm_info *cm_info)  {  	struct nes_cm_listener *listener; +	struct iwpm_dev_data pm_reg_msg; +	struct iwpm_sa_data pm_msg;  	unsigned long flags; +	int iwpm_err = 0;  	nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n",  		  cm_info->loc_addr, cm_info->loc_port);  	/* cannot have multiple matching listeners */ -	listener = find_listener(cm_core, htonl(cm_info->loc_addr), -				 htons(cm_info->loc_port), NES_CM_LISTENER_EITHER_STATE); +	listener = 
find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port, +				NES_CM_LISTENER_EITHER_STATE, 1); +  	if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) {  		/* find automatically incs ref count ??? */  		atomic_dec(&listener->ref_count); @@ -2207,6 +2385,22 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core,  	}  	if (!listener) { +		nes_form_reg_msg(nesvnic, &pm_reg_msg); +		iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); +		if (iwpm_err) { +			nes_debug(NES_DBG_NLMSG, +			"Port Mapper reg pid fail (err = %d).\n", iwpm_err); +		} +		if (iwpm_valid_pid() && !iwpm_err) { +			nes_form_pm_msg(cm_info, &pm_msg); +			iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_NES); +			if (iwpm_err) +				nes_debug(NES_DBG_NLMSG, +				"Port Mapper query fail (err = %d).\n", iwpm_err); +			else +				nes_record_pm_msg(cm_info, &pm_msg); +		} +  		/* create a CM listen node (1/2 node to compare incoming traffic to) */  		listener = kzalloc(sizeof(*listener), GFP_ATOMIC);  		if (!listener) { @@ -2214,8 +2408,10 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core,  			return NULL;  		} -		listener->loc_addr = htonl(cm_info->loc_addr); -		listener->loc_port = htons(cm_info->loc_port); +		listener->loc_addr = cm_info->loc_addr; +		listener->loc_port = cm_info->loc_port; +		listener->mapped_loc_addr = cm_info->mapped_loc_addr; +		listener->mapped_loc_port = cm_info->mapped_loc_port;  		listener->reused_node = 0;  		atomic_set(&listener->ref_count, 1); @@ -2277,14 +2473,18 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core,  	if (cm_info->loc_addr == cm_info->rem_addr) {  		loopbackremotelistener = find_listener(cm_core, -						       ntohl(nesvnic->local_ipaddr), cm_node->rem_port, -						       NES_CM_LISTENER_ACTIVE_STATE); +			cm_node->mapped_loc_addr, cm_node->mapped_rem_port, +			NES_CM_LISTENER_ACTIVE_STATE, 0);  		if (loopbackremotelistener == NULL) {  			create_event(cm_node, NES_CM_EVENT_ABORTED);  		} else {  			loopback_cm_info = *cm_info;  			loopback_cm_info.loc_port = cm_info->rem_port;  			loopback_cm_info.rem_port = cm_info->loc_port; +			loopback_cm_info.mapped_loc_port = +				cm_info->mapped_rem_port; +			loopback_cm_info.mapped_rem_port = +				cm_info->mapped_loc_port;  			loopback_cm_info.cm_id = loopbackremotelistener->cm_id;  			loopbackremotenode = make_cm_node(cm_core, nesvnic,  							  &loopback_cm_info, loopbackremotelistener); @@ -2513,6 +2713,12 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core,  	nfo.rem_addr = ntohl(iph->saddr);  	nfo.rem_port = ntohs(tcph->source); +	/* If port mapper is available these should be mapped address info */ +	nfo.mapped_loc_addr = ntohl(iph->daddr); +	nfo.mapped_loc_port = ntohs(tcph->dest); +	nfo.mapped_rem_addr = ntohl(iph->saddr); +	nfo.mapped_rem_port = ntohs(tcph->source); +  	tmp_daddr = cpu_to_be32(iph->daddr);  	tmp_saddr = cpu_to_be32(iph->saddr); @@ -2521,8 +2727,8 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core,  	do {  		cm_node = find_node(cm_core, -				    nfo.rem_port, nfo.rem_addr, -				    nfo.loc_port, nfo.loc_addr); +				    nfo.mapped_rem_port, nfo.mapped_rem_addr, +				    nfo.mapped_loc_port, nfo.mapped_loc_addr);  		if (!cm_node) {  			/* Only type of packet accepted are for */ @@ -2531,9 +2737,9 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core,  				skb_handled = 0;  				break;  			} -			listener = find_listener(cm_core, nfo.loc_addr, -						 nfo.loc_port, -						 NES_CM_LISTENER_ACTIVE_STATE); +			
listener = find_listener(cm_core, nfo.mapped_loc_addr, +					nfo.mapped_loc_port, +					NES_CM_LISTENER_ACTIVE_STATE, 0);  			if (!listener) {  				nfo.cm_id = NULL;  				nfo.conn_type = 0; @@ -3028,11 +3234,11 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  		rem_ref_cm_node(cm_node->cm_core, cm_node);  		return -ECONNRESET;  	} -  	/* associate the node with the QP */  	nesqp->cm_node = (void *)cm_node;  	cm_node->nesqp = nesqp; +  	nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu listener = %p\n",  		nesqp->hwqp.qp_id, cm_node, jiffies, cm_node->listener);  	atomic_inc(&cm_accepts); @@ -3055,6 +3261,11 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	if (cm_node->mpa_frame_rev == IETF_MPA_V1)  		mpa_frame_offset = 4; +	if (cm_node->mpa_frame_rev == IETF_MPA_V1 || +			cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { +		record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); +	} +  	memcpy(mpa_v2_frame->priv_data, conn_param->private_data,  	       conn_param->private_data_len); @@ -3118,7 +3329,6 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	}  	nesqp->skip_lsmm = 1; -  	/* Cache the cm_id in the qp */  	nesqp->cm_id = cm_id;  	cm_node->cm_id = cm_id; @@ -3133,10 +3343,12 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	nes_cm_init_tsa_conn(nesqp, cm_node); -	nesqp->nesqp_context->tcpPorts[0] = cpu_to_le16(ntohs(laddr->sin_port)); -	nesqp->nesqp_context->tcpPorts[1] = cpu_to_le16(ntohs(raddr->sin_port)); +	nesqp->nesqp_context->tcpPorts[0] = +				cpu_to_le16(cm_node->mapped_loc_port); +	nesqp->nesqp_context->tcpPorts[1] = +				cpu_to_le16(cm_node->mapped_rem_port); -	nesqp->nesqp_context->ip0 = cpu_to_le32(ntohl(raddr->sin_addr.s_addr)); +	nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr);  	nesqp->nesqp_context->misc2 |= cpu_to_le32(  		(u32)PCI_FUNC(nesdev->pcidev->devfn) << @@ -3155,14 +3367,14 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(  		((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT));  	nesqp->nesqp_context->ird_ord_sizes |= -		cpu_to_le32((u32)conn_param->ord); +		cpu_to_le32((u32)cm_node->ord_size);  	memset(&nes_quad, 0, sizeof(nes_quad));  	nes_quad.DstIpAdrIndex =  		cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); -	nes_quad.SrcIpadr = raddr->sin_addr.s_addr; -	nes_quad.TcpPorts[0] = raddr->sin_port; -	nes_quad.TcpPorts[1] = laddr->sin_port; +	nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); +	nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); +	nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port);  	/* Produce hash key */  	crc_value = get_crc_value(&nes_quad); @@ -3195,6 +3407,9 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	cm_event.remote_addr = cm_id->remote_addr;  	cm_event.private_data = NULL;  	cm_event.private_data_len = 0; +	cm_event.ird = cm_node->ird_size; +	cm_event.ord = cm_node->ord_size; +  	ret = cm_id->event_handler(cm_id, &cm_event);  	attr.qp_state = IB_QPS_RTS;  	nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); @@ -3261,6 +3476,9 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	int apbvt_set = 0;  	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;  	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; +	struct iwpm_dev_data pm_reg_msg; +	struct iwpm_sa_data pm_msg; +	int 
iwpm_err = 0;  	if (cm_id->remote_addr.ss_family != AF_INET)  		return -ENOSYS; @@ -3291,33 +3509,51 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  	/* cache the cm_id in the qp */  	nesqp->cm_id = cm_id; -  	cm_id->provider_data = nesqp; -  	nesqp->private_data_len = conn_param->private_data_len; -	nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((u32)conn_param->ord); -	/* space for rdma0 read msg */ -	if (conn_param->ord == 0) -		nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(1);  	nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord);  	nes_debug(NES_DBG_CM, "mpa private data len =%u\n",  		  conn_param->private_data_len); +	/* set up the connection params for the node */ +	cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr); +	cm_info.loc_port = ntohs(laddr->sin_port); +	cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr); +	cm_info.rem_port = ntohs(raddr->sin_port); +	cm_info.cm_id = cm_id; +	cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + +	/* No port mapper available, go with the specified peer information */ +	cm_info.mapped_loc_addr = cm_info.loc_addr; +	cm_info.mapped_loc_port = cm_info.loc_port; +	cm_info.mapped_rem_addr = cm_info.rem_addr; +	cm_info.mapped_rem_port = cm_info.rem_port; + +	nes_form_reg_msg(nesvnic, &pm_reg_msg); +	iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); +	if (iwpm_err) { +		nes_debug(NES_DBG_NLMSG, +			"Port Mapper reg pid fail (err = %d).\n", iwpm_err); +	} +	if (iwpm_valid_pid() && !iwpm_err) { +		nes_form_pm_msg(&cm_info, &pm_msg); +		iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_NES); +		if (iwpm_err) +			nes_debug(NES_DBG_NLMSG, +			"Port Mapper query fail (err = %d).\n", iwpm_err); +		else +			nes_record_pm_msg(&cm_info, &pm_msg); +	} +  	if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) { -		nes_manage_apbvt(nesvnic, ntohs(laddr->sin_port), -				 PCI_FUNC(nesdev->pcidev->devfn), -				 NES_MANAGE_APBVT_ADD); +		nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, +			PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD);  		apbvt_set = 1;  	} -	/* set up the connection params for the node */ -	cm_info.loc_addr = htonl(laddr->sin_addr.s_addr); -	cm_info.loc_port = htons(laddr->sin_port); -	cm_info.rem_addr = htonl(raddr->sin_addr.s_addr); -	cm_info.rem_port = htons(raddr->sin_port); -	cm_info.cm_id = cm_id; -	cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; +	if (nes_create_mapinfo(&cm_info)) +		return -ENOMEM;  	cm_id->add_ref(cm_id); @@ -3327,14 +3563,23 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)  					  &cm_info);  	if (!cm_node) {  		if (apbvt_set) -			nes_manage_apbvt(nesvnic, ntohs(laddr->sin_port), +			nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port,  					 PCI_FUNC(nesdev->pcidev->devfn),  					 NES_MANAGE_APBVT_DEL); +		nes_debug(NES_DBG_NLMSG, "Delete mapped_loc_port = %04X\n", +				cm_info.mapped_loc_port); +		nes_remove_mapinfo(cm_info.loc_addr, cm_info.loc_port, +			cm_info.mapped_loc_addr, cm_info.mapped_loc_port);  		cm_id->rem_ref(cm_id);  		return -ENOMEM;  	} +	record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); +	if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && +				cm_node->ord_size == 0) +		cm_node->ord_size = 1; +  	cm_node->apbvt_set = apbvt_set;  	nesqp->cm_node = cm_node;  	cm_node->nesqp = nesqp; @@ -3371,13 +3616,16 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog)  			nesvnic->local_ipaddr, laddr->sin_addr.s_addr);  	/* setup listen params in our api call struct */ -	cm_info.loc_addr = 
nesvnic->local_ipaddr; -	cm_info.loc_port = laddr->sin_port; +	cm_info.loc_addr = ntohl(nesvnic->local_ipaddr); +	cm_info.loc_port = ntohs(laddr->sin_port);  	cm_info.backlog = backlog;  	cm_info.cm_id = cm_id;  	cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; +	/* No port mapper available, go with the specified info */ +	cm_info.mapped_loc_addr = cm_info.loc_addr; +	cm_info.mapped_loc_port = cm_info.loc_port;  	cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info);  	if (!cm_node) { @@ -3389,7 +3637,10 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog)  	cm_id->provider_data = cm_node;  	if (!cm_node->reused_node) { -		err = nes_manage_apbvt(nesvnic, ntohs(laddr->sin_port), +		if (nes_create_mapinfo(&cm_info)) +			return -ENOMEM; + +		err = nes_manage_apbvt(nesvnic, cm_node->mapped_loc_port,  				       PCI_FUNC(nesvnic->nesdev->pcidev->devfn),  				       NES_MANAGE_APBVT_ADD);  		if (err) { @@ -3514,9 +3765,11 @@ static void cm_event_connected(struct nes_cm_event *event)  	nes_cm_init_tsa_conn(nesqp, cm_node);  	/* set the QP tsa context */ -	nesqp->nesqp_context->tcpPorts[0] = cpu_to_le16(ntohs(laddr->sin_port)); -	nesqp->nesqp_context->tcpPorts[1] = cpu_to_le16(ntohs(raddr->sin_port)); -	nesqp->nesqp_context->ip0 = cpu_to_le32(ntohl(raddr->sin_addr.s_addr)); +	nesqp->nesqp_context->tcpPorts[0] = +			cpu_to_le16(cm_node->mapped_loc_port); +	nesqp->nesqp_context->tcpPorts[1] = +			cpu_to_le16(cm_node->mapped_rem_port); +	nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr);  	nesqp->nesqp_context->misc2 |= cpu_to_le32(  			(u32)PCI_FUNC(nesdev->pcidev->devfn) << @@ -3531,6 +3784,8 @@ static void cm_event_connected(struct nes_cm_event *event)  	nesqp->nesqp_context->ird_ord_sizes |=  			cpu_to_le32((u32)1 <<  			NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT); +	nesqp->nesqp_context->ird_ord_sizes |= +			cpu_to_le32((u32)cm_node->ord_size);  	/* Adjust tail for not having a LSMM */  	/*nesqp->hwqp.sq_tail = 1;*/ @@ -3544,9 +3799,9 @@ static void cm_event_connected(struct nes_cm_event *event)  	nes_quad.DstIpAdrIndex =  		cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); -	nes_quad.SrcIpadr = raddr->sin_addr.s_addr; -	nes_quad.TcpPorts[0] = raddr->sin_port; -	nes_quad.TcpPorts[1] = laddr->sin_port; +	nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); +	nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); +	nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port);  	/* Produce hash key */  	crc_value = get_crc_value(&nes_quad); @@ -3574,7 +3829,7 @@ static void cm_event_connected(struct nes_cm_event *event)  	cm_event.ird = cm_node->ird_size;  	cm_event.ord = cm_node->ord_size; -	cm_event_laddr->sin_addr.s_addr = event->cm_info.rem_addr; +	cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);  	ret = cm_id->event_handler(cm_id, &cm_event);  	nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); @@ -3743,8 +3998,13 @@ static void cm_event_mpa_req(struct nes_cm_event *event)  	cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);  	cm_event.private_data = cm_node->mpa_frame_buf;  	cm_event.private_data_len = (u8)cm_node->mpa_frame_size; +	if (cm_node->mpa_frame_rev == IETF_MPA_V1) { +		cm_event.ird = NES_MAX_IRD; +		cm_event.ord = NES_MAX_ORD; +	} else {  	cm_event.ird = cm_node->ird_size;  	cm_event.ord = cm_node->ord_size; +	}  	ret = cm_id->event_handler(cm_id, &cm_event);  	if (ret) diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 4646e666608..f522cf63978 100644 --- 
a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -1,5 +1,5 @@  /* - * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved. + * Copyright (c) 2006 - 2014 Intel Corporation.  All rights reserved.   *   * This software is available to you under a choice of one of two   * licenses.  You may choose to be licensed under the terms of the GNU @@ -58,6 +58,8 @@  #define IETF_RDMA0_WRITE        0x8000  #define IETF_RDMA0_READ         0x4000  #define IETF_NO_IRD_ORD         0x3FFF +#define NES_MAX_IRD		 0x40 +#define NES_MAX_ORD		 0x7F  enum ietf_mpa_flags {  	IETF_MPA_FLAGS_MARKERS = 0x80,	/* receive Markers */ @@ -291,8 +293,8 @@ struct nes_cm_listener {  	struct list_head           list;  	struct nes_cm_core         *cm_core;  	u8                         loc_mac[ETH_ALEN]; -	nes_addr_t                 loc_addr; -	u16                        loc_port; +	nes_addr_t                 loc_addr, mapped_loc_addr; +	u16                        loc_port, mapped_loc_port;  	struct iw_cm_id            *cm_id;  	enum nes_cm_conn_type      conn_type;  	atomic_t                   ref_count; @@ -306,7 +308,9 @@ struct nes_cm_listener {  /* per connection node and node state information */  struct nes_cm_node {  	nes_addr_t                loc_addr, rem_addr; +	nes_addr_t                mapped_loc_addr, mapped_rem_addr;  	u16                       loc_port, rem_port; +	u16                       mapped_loc_port, mapped_rem_port;  	u8                        loc_mac[ETH_ALEN];  	u8                        rem_mac[ETH_ALEN]; @@ -333,6 +337,7 @@ struct nes_cm_node {  	enum mpa_frame_version    mpa_frame_rev;  	u16			  ird_size;  	u16                       ord_size; +	u16			  mpav2_ird_ord;  	u16                       mpa_frame_size;  	struct iw_cm_id           *cm_id; @@ -361,6 +366,10 @@ struct nes_cm_info {  	u16 rem_port;  	nes_addr_t loc_addr;  	nes_addr_t rem_addr; +	u16 mapped_loc_port; +	u16 mapped_rem_port; +	nes_addr_t mapped_loc_addr; +	nes_addr_t mapped_rem_addr;  	enum nes_cm_conn_type  conn_type;  	int backlog; diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h index 4926de74448..529c421bb15 100644 --- a/drivers/infiniband/hw/nes/nes_user.h +++ b/drivers/infiniband/hw/nes/nes_user.h @@ -39,8 +39,8 @@  #include <linux/types.h> -#define NES_ABI_USERSPACE_VER 1 -#define NES_ABI_KERNEL_VER    1 +#define NES_ABI_USERSPACE_VER 2 +#define NES_ABI_KERNEL_VER    2  /*   * Make sure that all structs defined in this file remain laid out so @@ -78,6 +78,7 @@ struct nes_create_cq_req {  struct nes_create_qp_req {  	__u64 user_wqe_buffers; +	__u64 user_qp_buffer;  };  enum iwnes_memreg_type { diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 5b53ca5a228..218dd357428 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1186,11 +1186,13 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,  					nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);  					kfree(nesqp->allocated_buffer);  					nes_debug(NES_DBG_QP, "ib_copy_from_udata() Failed \n"); -					return NULL; +					return ERR_PTR(-EFAULT);  				}  				if (req.user_wqe_buffers) {  					virt_wqs = 1;  				} +				if (req.user_qp_buffer) +					nesqp->nesuqp_addr = req.user_qp_buffer;  				if ((ibpd->uobject) && (ibpd->uobject->context)) {  					nesqp->user_mode = 1;  					nes_ucontext = to_nesucontext(ibpd->uobject->context); @@ -2307,7 +2309,7 @@ static struct ib_mr 
*nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	struct nes_device *nesdev = nesvnic->nesdev;  	struct nes_adapter *nesadapter = nesdev->nesadapter;  	struct ib_mr *ibmr = ERR_PTR(-EINVAL); -	struct ib_umem_chunk *chunk; +	struct scatterlist *sg;  	struct nes_ucontext *nes_ucontext;  	struct nes_pbl *nespbl;  	struct nes_mr *nesmr; @@ -2315,7 +2317,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	struct nes_mem_reg_req req;  	struct nes_vpbl vpbl;  	struct nes_root_vpbl root_vpbl; -	int nmap_index, page_index; +	int entry, page_index;  	int page_count = 0;  	int err, pbl_depth = 0;  	int chunk_pages; @@ -2330,6 +2332,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	u16 pbl_count;  	u8 single_page = 1;  	u8 stag_key; +	int first_page = 1;  	region = ib_umem_get(pd->uobject->context, start, length, acc, 0);  	if (IS_ERR(region)) { @@ -2380,128 +2383,125 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  			}  			nesmr->region = region; -			list_for_each_entry(chunk, ®ion->chunk_list, list) { -				nes_debug(NES_DBG_MR, "Chunk: nents = %u, nmap = %u .\n", -						chunk->nents, chunk->nmap); -				for (nmap_index = 0; nmap_index < chunk->nmap; ++nmap_index) { -					if (sg_dma_address(&chunk->page_list[nmap_index]) & ~PAGE_MASK) { -						ib_umem_release(region); -						nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); -						nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n", -								(unsigned int) sg_dma_address(&chunk->page_list[nmap_index])); -						ibmr = ERR_PTR(-EINVAL); -						kfree(nesmr); -						goto reg_user_mr_err; -					} +			for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { +				if (sg_dma_address(sg) & ~PAGE_MASK) { +					ib_umem_release(region); +					nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); +					nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n", +						  (unsigned int) sg_dma_address(sg)); +					ibmr = ERR_PTR(-EINVAL); +					kfree(nesmr); +					goto reg_user_mr_err; +				} -					if (!sg_dma_len(&chunk->page_list[nmap_index])) { -						ib_umem_release(region); -						nes_free_resource(nesadapter, nesadapter->allocated_mrs, -								stag_index); -						nes_debug(NES_DBG_MR, "Invalid Buffer Size\n"); -						ibmr = ERR_PTR(-EINVAL); -						kfree(nesmr); -						goto reg_user_mr_err; -					} +				if (!sg_dma_len(sg)) { +					ib_umem_release(region); +					nes_free_resource(nesadapter, nesadapter->allocated_mrs, +							  stag_index); +					nes_debug(NES_DBG_MR, "Invalid Buffer Size\n"); +					ibmr = ERR_PTR(-EINVAL); +					kfree(nesmr); +					goto reg_user_mr_err; +				} -					region_length += sg_dma_len(&chunk->page_list[nmap_index]); -					chunk_pages = sg_dma_len(&chunk->page_list[nmap_index]) >> 12; -					region_length -= skip_pages << 12; -					for (page_index=skip_pages; page_index < chunk_pages; page_index++) { -						skip_pages = 0; -						if ((page_count!=0)&&(page_count<<12)-(region->offset&(4096-1))>=region->length) -							goto enough_pages; -						if ((page_count&0x01FF) == 0) { -							if (page_count >= 1024 * 512) { +				region_length += sg_dma_len(sg); +				chunk_pages = sg_dma_len(sg) >> 12; +				region_length -= skip_pages << 12; +				for (page_index = skip_pages; page_index < chunk_pages; page_index++) { +					skip_pages = 0; +					if ((page_count != 0) && (page_count<<12)-(region->offset&(4096-1)) >= region->length) +						goto enough_pages; +					if ((page_count&0x01FF) == 0) { +					
	if (page_count >= 1024 * 512) { +							ib_umem_release(region); +							nes_free_resource(nesadapter, +									  nesadapter->allocated_mrs, stag_index); +							kfree(nesmr); +							ibmr = ERR_PTR(-E2BIG); +							goto reg_user_mr_err; +						} +						if (root_pbl_index == 1) { +							root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, +									8192, &root_vpbl.pbl_pbase); +							nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", +								  root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); +							if (!root_vpbl.pbl_vbase) {  								ib_umem_release(region); -								nes_free_resource(nesadapter, -										nesadapter->allocated_mrs, stag_index); +								pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, +										    vpbl.pbl_pbase); +								nes_free_resource(nesadapter, nesadapter->allocated_mrs, +										  stag_index);  								kfree(nesmr); -								ibmr = ERR_PTR(-E2BIG); +								ibmr = ERR_PTR(-ENOMEM);  								goto reg_user_mr_err;  							} -							if (root_pbl_index == 1) { -								root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, -										8192, &root_vpbl.pbl_pbase); -								nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", -										root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); -								if (!root_vpbl.pbl_vbase) { -									ib_umem_release(region); -									pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, -											vpbl.pbl_pbase); -									nes_free_resource(nesadapter, nesadapter->allocated_mrs, -											stag_index); -									kfree(nesmr); -									ibmr = ERR_PTR(-ENOMEM); -									goto reg_user_mr_err; -								} -								root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, -										GFP_KERNEL); -								if (!root_vpbl.leaf_vpbl) { -									ib_umem_release(region); -									pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, -											root_vpbl.pbl_pbase); -									pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, -											vpbl.pbl_pbase); -									nes_free_resource(nesadapter, nesadapter->allocated_mrs, -											stag_index); -									kfree(nesmr); -									ibmr = ERR_PTR(-ENOMEM); -									goto reg_user_mr_err; -								} -								root_vpbl.pbl_vbase[0].pa_low = -										cpu_to_le32((u32)vpbl.pbl_pbase); -								root_vpbl.pbl_vbase[0].pa_high = -										cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); -								root_vpbl.leaf_vpbl[0] = vpbl; -							} -							vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, -									&vpbl.pbl_pbase); -							nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n", -									vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase); -							if (!vpbl.pbl_vbase) { +							root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, +									GFP_KERNEL); +							if (!root_vpbl.leaf_vpbl) {  								ib_umem_release(region); -								nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); -								ibmr = ERR_PTR(-ENOMEM); +								pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, +										    root_vpbl.pbl_pbase); +								pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, +										    vpbl.pbl_pbase); +								nes_free_resource(nesadapter, nesadapter->allocated_mrs, +										  stag_index);  								kfree(nesmr); +								ibmr = ERR_PTR(-ENOMEM);  								goto reg_user_mr_err;  							} -							if (1 <= root_pbl_index) { -								root_vpbl.pbl_vbase[root_pbl_index].pa_low = -										cpu_to_le32((u32)vpbl.pbl_pbase); -						
		root_vpbl.pbl_vbase[root_pbl_index].pa_high = -										cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); -								root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; -							} -							root_pbl_index++; -							cur_pbl_index = 0; +							root_vpbl.pbl_vbase[0].pa_low = +									cpu_to_le32((u32)vpbl.pbl_pbase); +							root_vpbl.pbl_vbase[0].pa_high = +									cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); +							root_vpbl.leaf_vpbl[0] = vpbl;  						} -						if (single_page) { -							if (page_count != 0) { -								if ((last_dma_addr+4096) != -										(sg_dma_address(&chunk->page_list[nmap_index])+ -										(page_index*4096))) -									single_page = 0; -								last_dma_addr = sg_dma_address(&chunk->page_list[nmap_index])+ -										(page_index*4096); -							} else { -								first_dma_addr = sg_dma_address(&chunk->page_list[nmap_index])+ -										(page_index*4096); -								last_dma_addr = first_dma_addr; -							} +						vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, +								&vpbl.pbl_pbase); +						nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n", +							  vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase); +						if (!vpbl.pbl_vbase) { +							ib_umem_release(region); +							nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); +							ibmr = ERR_PTR(-ENOMEM); +							kfree(nesmr); +							goto reg_user_mr_err; +						} +						if (1 <= root_pbl_index) { +							root_vpbl.pbl_vbase[root_pbl_index].pa_low = +									cpu_to_le32((u32)vpbl.pbl_pbase); +							root_vpbl.pbl_vbase[root_pbl_index].pa_high = +									cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); +							root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; +						} +						root_pbl_index++; +						cur_pbl_index = 0; +					} +					if (single_page) { +						if (page_count != 0) { +							if ((last_dma_addr+4096) != +									(sg_dma_address(sg)+ +									(page_index*4096))) +								single_page = 0; +							last_dma_addr = sg_dma_address(sg)+ +									(page_index*4096); +						} else { +							first_dma_addr = sg_dma_address(sg)+ +									(page_index*4096); +							last_dma_addr = first_dma_addr;  						} - -						vpbl.pbl_vbase[cur_pbl_index].pa_low = -								cpu_to_le32((u32)(sg_dma_address(&chunk->page_list[nmap_index])+ -								(page_index*4096))); -						vpbl.pbl_vbase[cur_pbl_index].pa_high = -								cpu_to_le32((u32)((((u64)(sg_dma_address(&chunk->page_list[nmap_index])+ -								(page_index*4096))) >> 32))); -						cur_pbl_index++; -						page_count++;  					} + +					vpbl.pbl_vbase[cur_pbl_index].pa_low = +							cpu_to_le32((u32)(sg_dma_address(sg)+ +							(page_index*4096))); +					vpbl.pbl_vbase[cur_pbl_index].pa_high = +							cpu_to_le32((u32)((((u64)(sg_dma_address(sg)+ +							(page_index*4096))) >> 32))); +					cur_pbl_index++; +					page_count++;  				}  			} +  			enough_pages:  			nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x,"  					" stag_key=0x%08x\n", @@ -2613,25 +2613,28 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  				  nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase,  				  (void *) nespbl->pbl_vbase, nespbl->user_base); -			list_for_each_entry(chunk, ®ion->chunk_list, list) { -				for (nmap_index = 0; nmap_index < chunk->nmap; ++nmap_index) { -					chunk_pages = sg_dma_len(&chunk->page_list[nmap_index]) >> 12; -					chunk_pages += (sg_dma_len(&chunk->page_list[nmap_index]) & (4096-1)) ? 
1 : 0; -					nespbl->page = sg_page(&chunk->page_list[0]); -					for (page_index=0; page_index<chunk_pages; page_index++) { -						((__le32 *)pbl)[0] = cpu_to_le32((u32) -								(sg_dma_address(&chunk->page_list[nmap_index])+ -								(page_index*4096))); -						((__le32 *)pbl)[1] = cpu_to_le32(((u64) -								(sg_dma_address(&chunk->page_list[nmap_index])+ -								(page_index*4096)))>>32); -						nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl, -								(unsigned long long)*pbl, -								le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0])); -						pbl++; -					} +			for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { +				chunk_pages = sg_dma_len(sg) >> 12; +				chunk_pages += (sg_dma_len(sg) & (4096-1)) ? 1 : 0; +				if (first_page) { +					nespbl->page = sg_page(sg); +					first_page = 0; +				} + +				for (page_index = 0; page_index < chunk_pages; page_index++) { +					((__le32 *)pbl)[0] = cpu_to_le32((u32) +							(sg_dma_address(sg)+ +							(page_index*4096))); +					((__le32 *)pbl)[1] = cpu_to_le32(((u64) +							(sg_dma_address(sg)+ +							(page_index*4096)))>>32); +					nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl, +						  (unsigned long long)*pbl, +						  le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0])); +					pbl++;  				}  			} +  			if (req.reg_type == IWNES_MEMREG_TYPE_QP) {  				list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list);  			} else { @@ -2834,7 +2837,7 @@ static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,  	init_attr->qp_context = nesqp->ibqp.qp_context;  	init_attr->send_cq = nesqp->ibqp.send_cq;  	init_attr->recv_cq = nesqp->ibqp.recv_cq; -	init_attr->srq = nesqp->ibqp.srq = nesqp->ibqp.srq; +	init_attr->srq = nesqp->ibqp.srq;  	init_attr->cap = attr->cap;  	return 0; @@ -3134,9 +3137,7 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,  				" original_last_aeq = 0x%04X. 
last_aeq = 0x%04X.\n",  				nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),  				original_last_aeq, nesqp->last_aeq); -		if ((!ret) || -				((original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) && -				(ret))) { +		if (!ret || original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) {  			if (dont_wait) {  				if (nesqp->cm_id && nesqp->hw_tcp_state != 0) {  					nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d)," diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 0eff7c44d76..309b31c31ae 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -184,5 +184,6 @@ struct nes_qp {  	u8                    pau_busy;  	u8                    pau_pending;  	u8                    pau_state; +	__u64                 nesuqp_addr;  };  #endif			/* NES_VERBS_H */ diff --git a/drivers/infiniband/hw/ocrdma/Kconfig b/drivers/infiniband/hw/ocrdma/Kconfig index b5b6056c851..c0cddc0192d 100644 --- a/drivers/infiniband/hw/ocrdma/Kconfig +++ b/drivers/infiniband/hw/ocrdma/Kconfig @@ -1,6 +1,6 @@  config INFINIBAND_OCRDMA  	tristate "Emulex One Connect HCA support" -	depends on ETHERNET && NETDEVICES && PCI && (IPV6 || IPV6=n) +	depends on ETHERNET && NETDEVICES && PCI && INET && (IPV6 || IPV6=n)  	select NET_VENDOR_EMULEX  	select BE2NET  	---help--- diff --git a/drivers/infiniband/hw/ocrdma/Makefile b/drivers/infiniband/hw/ocrdma/Makefile index 06a5bed12e4..d1bfd4f4cdd 100644 --- a/drivers/infiniband/hw/ocrdma/Makefile +++ b/drivers/infiniband/hw/ocrdma/Makefile @@ -2,4 +2,4 @@ ccflags-y := -Idrivers/net/ethernet/emulex/benet  obj-$(CONFIG_INFINIBAND_OCRDMA)	+= ocrdma.o -ocrdma-y :=	ocrdma_main.o ocrdma_verbs.o ocrdma_hw.o ocrdma_ah.o +ocrdma-y :=	ocrdma_main.o ocrdma_verbs.o ocrdma_hw.o ocrdma_ah.o ocrdma_stats.o diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index adc11d14f87..19011dbb930 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -35,17 +35,27 @@  #include <rdma/ib_verbs.h>  #include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h>  #include <be_roce.h>  #include "ocrdma_sli.h" -#define OCRDMA_ROCE_DEV_VERSION "1.0.0" +#define OCRDMA_ROCE_DRV_VERSION "10.2.145.0u" + +#define OCRDMA_ROCE_DRV_DESC "Emulex OneConnect RoCE Driver"  #define OCRDMA_NODE_DESC "Emulex OneConnect RoCE HCA" +#define OC_NAME_SH	OCRDMA_NODE_DESC "(Skyhawk)" +#define OC_NAME_UNKNOWN OCRDMA_NODE_DESC "(Unknown)" + +#define OC_SKH_DEVICE_PF 0x720 +#define OC_SKH_DEVICE_VF 0x728  #define OCRDMA_MAX_AH 512  #define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) +#define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo) +  struct ocrdma_dev_attr {  	u8 fw_ver[32];  	u32 vendor_id; @@ -65,6 +75,7 @@ struct ocrdma_dev_attr {  	int max_mr;  	u64 max_mr_size;  	u32 max_num_mr_pbl; +	int max_mw;  	int max_fmr;  	int max_map_per_fmr;  	int max_pages_per_frmr; @@ -83,6 +94,12 @@ struct ocrdma_dev_attr {  	u8 num_ird_pages;  }; +struct ocrdma_dma_mem { +	void *va; +	dma_addr_t pa; +	u32 size; +}; +  struct ocrdma_pbl {  	void *va;  	dma_addr_t pa; @@ -122,6 +139,52 @@ struct mqe_ctx {  	bool cmd_done;  }; +struct ocrdma_hw_mr { +	u32 lkey; +	u8 fr_mr; +	u8 remote_atomic; +	u8 remote_rd; +	u8 remote_wr; +	u8 local_rd; +	u8 local_wr; +	u8 mw_bind; +	u8 rsvd; +	u64 len; +	struct ocrdma_pbl *pbl_table; +	u32 num_pbls; +	u32 num_pbes; +	u32 pbl_size; +	u32 pbe_size; +	u64 fbo; +	u64 va; +}; + +struct ocrdma_mr { +	
struct ib_mr ibmr; +	struct ib_umem *umem; +	struct ocrdma_hw_mr hwmr; +}; + +struct ocrdma_stats { +	u8 type; +	struct ocrdma_dev *dev; +}; + +struct stats_mem { +	struct ocrdma_mqe mqe; +	void *va; +	dma_addr_t pa; +	u32 size; +	char *debugfs_mem; +}; + +struct phy_info { +	u16 auto_speeds_supported; +	u16 fixed_speeds_supported; +	u16 phy_type; +	u16 interface_type; +}; +  struct ocrdma_dev {  	struct ib_device ibdev;  	struct ocrdma_dev_attr attr; @@ -165,12 +228,30 @@ struct ocrdma_dev {  	struct mqe_ctx mqe_ctx;  	struct be_dev_info nic_info; +	struct phy_info phy; +	char model_number[32]; +	u32 hba_port_num;  	struct list_head entry;  	struct rcu_head rcu;  	int id;  	u64 stag_arr[OCRDMA_MAX_STAG];  	u16 pvid; +	u32 asic_id; + +	ulong last_stats_time; +	struct mutex stats_lock; /* provide synch for debugfs operations */ +	struct stats_mem stats_mem; +	struct ocrdma_stats rsrc_stats; +	struct ocrdma_stats rx_stats; +	struct ocrdma_stats wqe_stats; +	struct ocrdma_stats tx_stats; +	struct ocrdma_stats db_err_stats; +	struct ocrdma_stats tx_qp_err_stats; +	struct ocrdma_stats rx_qp_err_stats; +	struct ocrdma_stats tx_dbg_stats; +	struct ocrdma_stats rx_dbg_stats; +	struct dentry *dir;  };  struct ocrdma_cq { @@ -183,8 +264,8 @@ struct ocrdma_cq {  			 */  	u32 max_hw_cqe;  	bool phase_change; -	bool armed, solicited; -	bool arm_needed; +	bool deferred_arm, deferred_sol; +	bool first_arm;  	spinlock_t cq_lock ____cacheline_aligned; /* provide synchronization  						   * to cq polling @@ -197,6 +278,7 @@ struct ocrdma_cq {  	struct ocrdma_ucontext *ucontext;  	dma_addr_t pa;  	u32 len; +	u32 cqe_cnt;  	/* head of all qp's sq and rq for which cqes need to be flushed  	 * by the software. @@ -206,7 +288,6 @@ struct ocrdma_cq {  struct ocrdma_pd {  	struct ib_pd ibpd; -	struct ocrdma_dev *dev;  	struct ocrdma_ucontext *uctx;  	u32 id;  	int num_dpp_qp; @@ -291,33 +372,6 @@ struct ocrdma_qp {  	bool dpp_enabled;  	u8 *ird_q_va;  	bool signaled; -	u16 db_cache; -}; - -struct ocrdma_hw_mr { -	u32 lkey; -	u8 fr_mr; -	u8 remote_atomic; -	u8 remote_rd; -	u8 remote_wr; -	u8 local_rd; -	u8 local_wr; -	u8 mw_bind; -	u8 rsvd; -	u64 len; -	struct ocrdma_pbl *pbl_table; -	u32 num_pbls; -	u32 num_pbes; -	u32 pbl_size; -	u32 pbe_size; -	u64 fbo; -	u64 va; -}; - -struct ocrdma_mr { -	struct ib_mr ibmr; -	struct ib_umem *umem; -	struct ocrdma_hw_mr hwmr;  };  struct ocrdma_ucontext { @@ -384,13 +438,6 @@ static inline struct ocrdma_srq *get_ocrdma_srq(struct ib_srq *ibsrq)  	return container_of(ibsrq, struct ocrdma_srq, ibsrq);  } - -static inline int ocrdma_get_num_posted_shift(struct ocrdma_qp *qp) -{ -	return ((qp->dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY && -		 qp->id < 128) ? 24 : 16); -} -  static inline int is_cqe_valid(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe)  {  	int cqe_valid; @@ -422,5 +469,53 @@ static inline int is_cqe_wr_imm(struct ocrdma_cqe *cqe)  		OCRDMA_CQE_WRITE_IMM) ? 
1 : 0;  } +static inline int ocrdma_resolve_dmac(struct ocrdma_dev *dev, +		struct ib_ah_attr *ah_attr, u8 *mac_addr) +{ +	struct in6_addr in6; + +	memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); +	if (rdma_is_multicast_addr(&in6)) +		rdma_get_mcast_mac(&in6, mac_addr); +	else +		memcpy(mac_addr, ah_attr->dmac, ETH_ALEN); +	return 0; +} + +static inline char *hca_name(struct ocrdma_dev *dev) +{ +	switch (dev->nic_info.pdev->device) { +	case OC_SKH_DEVICE_PF: +	case OC_SKH_DEVICE_VF: +		return OC_NAME_SH; +	default: +		return OC_NAME_UNKNOWN; +	} +} + +static inline int ocrdma_get_eq_table_index(struct ocrdma_dev *dev, +		int eqid) +{ +	int indx; + +	for (indx = 0; indx < dev->eq_cnt; indx++) { +		if (dev->eq_tbl[indx].q.id == eqid) +			return indx; +	} + +	return -EINVAL; +} + +static inline u8 ocrdma_get_asic_type(struct ocrdma_dev *dev) +{ +	if (dev->nic_info.dev_family == 0xF && !dev->asic_id) { +		pci_read_config_dword( +			dev->nic_info.pdev, +			OCRDMA_SLI_ASIC_ID_OFFSET, &dev->asic_id); +	} + +	return (dev->asic_id & OCRDMA_SLI_ASIC_GEN_NUM_MASK) >> +				OCRDMA_SLI_ASIC_GEN_NUM_SHIFT; +}  #endif diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h index fbac8eb4403..1554cca5712 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h @@ -28,7 +28,8 @@  #ifndef __OCRDMA_ABI_H__  #define __OCRDMA_ABI_H__ -#define OCRDMA_ABI_VERSION 1 +#define OCRDMA_ABI_VERSION 2 +#define OCRDMA_BE_ROCE_ABI_VERSION 1  /* user kernel communication data structures. */  struct ocrdma_alloc_ucontext_resp { @@ -107,9 +108,7 @@ struct ocrdma_create_qp_uresp {  	u32 db_sq_offset;  	u32 db_rq_offset;  	u32 db_shift; -	u64 rsvd1; -	u64 rsvd2; -	u64 rsvd3; +	u64 rsvd[11];  } __packed;  struct ocrdma_create_srq_uresp { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index ee499d94225..d4cc01f10c0 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -49,7 +49,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,  	ah->sgid_index = attr->grh.sgid_index; -	vlan_tag = rdma_get_vlan_id(&attr->grh.dgid); +	vlan_tag = attr->vlan_id;  	if (!vlan_tag || (vlan_tag > 0xFFF))  		vlan_tag = dev->pvid;  	if (vlan_tag && (vlan_tag < 0x1000)) { @@ -64,7 +64,8 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,  		eth_sz = sizeof(struct ocrdma_eth_basic);  	}  	memcpy(ð.smac[0], &dev->nic_info.mac_addr[0], ETH_ALEN); -	status = ocrdma_resolve_dgid(dev, &attr->grh.dgid, ð.dmac[0]); +	memcpy(ð.dmac[0], attr->dmac, ETH_ALEN); +	status = ocrdma_resolve_dmac(dev, attr, ð.dmac[0]);  	if (status)  		return status;  	status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index, @@ -84,6 +85,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,  	memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh));  	if (vlan_enabled)  		ah->av->valid |= OCRDMA_AV_VLAN_VALID; +	ah->av->valid = cpu_to_le32(ah->av->valid);  	return status;  } @@ -98,7 +100,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)  	if (!(attr->ah_flags & IB_AH_GRH))  		return ERR_PTR(-EINVAL); -	ah = kzalloc(sizeof *ah, GFP_ATOMIC); +	ah = kzalloc(sizeof(*ah), GFP_ATOMIC);  	if (!ah)  		return ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 4ed8235d2d3..3bbf2010a82 100644 --- 
a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -32,7 +32,6 @@  #include <rdma/ib_verbs.h>  #include <rdma/ib_user_verbs.h> -#include <rdma/ib_addr.h>  #include "ocrdma.h"  #include "ocrdma_hw.h" @@ -150,7 +149,7 @@ enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps)  		return IB_QPS_SQE;  	case OCRDMA_QPS_ERR:  		return IB_QPS_ERR; -	}; +	}  	return IB_QPS_ERR;  } @@ -171,7 +170,7 @@ static enum ocrdma_qp_state get_ocrdma_qp_state(enum ib_qp_state qps)  		return OCRDMA_QPS_SQE;  	case IB_QPS_ERR:  		return OCRDMA_QPS_ERR; -	}; +	}  	return OCRDMA_QPS_ERR;  } @@ -243,6 +242,23 @@ static int ocrdma_get_mbx_errno(u32 status)  	return err_num;  } +char *port_speed_string(struct ocrdma_dev *dev) +{ +	char *str = ""; +	u16 speeds_supported; + +	speeds_supported = dev->phy.fixed_speeds_supported | +				dev->phy.auto_speeds_supported; +	if (speeds_supported & OCRDMA_PHY_SPEED_40GBPS) +		str = "40Gbps "; +	else if (speeds_supported & OCRDMA_PHY_SPEED_10GBPS) +		str = "10Gbps "; +	else if (speeds_supported & OCRDMA_PHY_SPEED_1GBPS) +		str = "1Gbps "; + +	return str; +} +  static int ocrdma_get_mbx_cqe_errno(u16 cqe_status)  {  	int err_num = -EINVAL; @@ -332,6 +348,11 @@ static void *ocrdma_init_emb_mqe(u8 opcode, u32 cmd_len)  	return mqe;  } +static void *ocrdma_alloc_mqe(void) +{ +	return kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL); +} +  static void ocrdma_free_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q)  {  	dma_free_coherent(&dev->nic_info.pdev->dev, q->size, q->va, q->dma); @@ -364,8 +385,8 @@ static void ocrdma_build_q_pages(struct ocrdma_pa *q_pa, int cnt,  	}  } -static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q, -			       int queue_type) +static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev, +			       struct ocrdma_queue_info *q, int queue_type)  {  	u8 opcode = 0;  	int status; @@ -444,7 +465,7 @@ mbx_err:  	return status;  } -static int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)  {  	int irq; @@ -574,6 +595,7 @@ static int ocrdma_create_mq(struct ocrdma_dev *dev)  	if (status)  		goto alloc_err; +	dev->eq_tbl[0].cq_cnt++;  	status = ocrdma_mbx_mq_cq_create(dev, &dev->mq.cq, &dev->eq_tbl[0].q);  	if (status)  		goto mbx_cq_free; @@ -639,7 +661,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev,  {  	struct ocrdma_qp *qp = NULL;  	struct ocrdma_cq *cq = NULL; -	struct ib_event ib_evt; +	struct ib_event ib_evt = { 0 };  	int cq_event = 0;  	int qp_event = 1;  	int srq_event = 0; @@ -664,6 +686,8 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev,  	case OCRDMA_CQ_OVERRUN_ERROR:  		ib_evt.element.cq = &cq->ibcq;  		ib_evt.event = IB_EVENT_CQ_ERR; +		cq_event = 1; +		qp_event = 0;  		break;  	case OCRDMA_CQ_QPCAT_ERROR:  		ib_evt.element.qp = &qp->ibqp; @@ -725,6 +749,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev,  						     qp->srq->ibsrq.  						     
srq_context);  	} else if (dev_event) { +		pr_err("%s: Fatal event received\n", dev->ibdev.name);  		ib_dispatch_event(&ib_evt);  	} @@ -752,7 +777,6 @@ static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev,  	}  } -  static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe)  {  	/* async CQE processing */ @@ -799,8 +823,6 @@ static int ocrdma_mq_cq_handler(struct ocrdma_dev *dev, u16 cq_id)  			ocrdma_process_acqe(dev, cqe);  		else if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_CMPL_MASK)  			ocrdma_process_mcqe(dev, cqe); -		else -			pr_err("%s() cqe->compl is not set.\n", __func__);  		memset(cqe, 0, sizeof(struct ocrdma_mcqe));  		ocrdma_mcq_inc_tail(dev);  	} @@ -858,16 +880,8 @@ static void ocrdma_qp_cq_handler(struct ocrdma_dev *dev, u16 cq_idx)  		BUG();  	cq = dev->cq_tbl[cq_idx]; -	if (cq == NULL) { -		pr_err("%s%d invalid id=0x%x\n", __func__, dev->id, cq_idx); +	if (cq == NULL)  		return; -	} -	spin_lock_irqsave(&cq->cq_lock, flags); -	cq->armed = false; -	cq->solicited = false; -	spin_unlock_irqrestore(&cq->cq_lock, flags); - -	ocrdma_ring_cq_db(dev, cq->id, false, false, 0);  	if (cq->ibcq.comp_handler) {  		spin_lock_irqsave(&cq->comp_handler_lock, flags); @@ -892,27 +906,35 @@ static irqreturn_t ocrdma_irq_handler(int irq, void *handle)  	struct ocrdma_dev *dev = eq->dev;  	struct ocrdma_eqe eqe;  	struct ocrdma_eqe *ptr; -	u16 eqe_popped = 0;  	u16 cq_id; -	while (1) { +	int budget = eq->cq_cnt; + +	do {  		ptr = ocrdma_get_eqe(eq);  		eqe = *ptr;  		ocrdma_le32_to_cpu(&eqe, sizeof(eqe));  		if ((eqe.id_valid & OCRDMA_EQE_VALID_MASK) == 0)  			break; -		eqe_popped += 1; +  		ptr->id_valid = 0; +		/* ring eq doorbell as soon as its consumed. */ +		ocrdma_ring_eq_db(dev, eq->q.id, false, true, 1);  		/* check whether its CQE or not. */  		if ((eqe.id_valid & OCRDMA_EQE_FOR_CQE_MASK) == 0) {  			cq_id = eqe.id_valid >> OCRDMA_EQE_RESOURCE_ID_SHIFT;  			ocrdma_cq_handler(dev, cq_id);  		}  		ocrdma_eq_inc_tail(eq); -	} -	ocrdma_ring_eq_db(dev, eq->q.id, true, true, eqe_popped); -	/* Ring EQ doorbell with num_popped to 0 to enable interrupts again. */ -	if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) -		ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0); + +		/* There can be a stale EQE after the last bound CQ is +		 * destroyed. EQE valid and budget == 0 implies this. 
+		 */ +		if (budget) +			budget--; + +	} while (budget); + +	ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0);  	return IRQ_HANDLED;  } @@ -949,7 +971,8 @@ static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe)  {  	int status = 0;  	u16 cqe_status, ext_status; -	struct ocrdma_mqe *rsp; +	struct ocrdma_mqe *rsp_mqe; +	struct ocrdma_mbx_rsp *rsp = NULL;  	mutex_lock(&dev->mqe_ctx.lock);  	ocrdma_post_mqe(dev, mqe); @@ -958,23 +981,61 @@ static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe)  		goto mbx_err;  	cqe_status = dev->mqe_ctx.cqe_status;  	ext_status = dev->mqe_ctx.ext_status; -	rsp = ocrdma_get_mqe_rsp(dev); -	ocrdma_copy_le32_to_cpu(mqe, rsp, (sizeof(*mqe))); +	rsp_mqe = ocrdma_get_mqe_rsp(dev); +	ocrdma_copy_le32_to_cpu(mqe, rsp_mqe, (sizeof(*mqe))); +	if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> +				OCRDMA_MQE_HDR_EMB_SHIFT) +		rsp = &mqe->u.rsp; +  	if (cqe_status || ext_status) { -		pr_err("%s() opcode=0x%x, cqe_status=0x%x, ext_status=0x%x\n", -		       __func__, -		     (rsp->u.rsp.subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> -		     OCRDMA_MBX_RSP_OPCODE_SHIFT, cqe_status, ext_status); +		pr_err("%s() cqe_status=0x%x, ext_status=0x%x,", +		       __func__, cqe_status, ext_status); +		if (rsp) { +			/* This is for embedded cmds. */ +			pr_err("opcode=0x%x, subsystem=0x%x\n", +			       (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> +				OCRDMA_MBX_RSP_OPCODE_SHIFT, +				(rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> +				OCRDMA_MBX_RSP_SUBSYS_SHIFT); +		}  		status = ocrdma_get_mbx_cqe_errno(cqe_status);  		goto mbx_err;  	} -	if (mqe->u.rsp.status & OCRDMA_MBX_RSP_STATUS_MASK) +	/* For non embedded, rsp errors are handled in ocrdma_nonemb_mbx_cmd */ +	if (rsp && (mqe->u.rsp.status & OCRDMA_MBX_RSP_STATUS_MASK))  		status = ocrdma_get_mbx_errno(mqe->u.rsp.status);  mbx_err:  	mutex_unlock(&dev->mqe_ctx.lock);  	return status;  } +static int ocrdma_nonemb_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe, +				 void *payload_va) +{ +	int status = 0; +	struct ocrdma_mbx_rsp *rsp = payload_va; + +	if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> +				OCRDMA_MQE_HDR_EMB_SHIFT) +		BUG(); + +	status = ocrdma_mbx_cmd(dev, mqe); +	if (!status) +		/* For non embedded, only CQE failures are handled in +		 * ocrdma_mbx_cmd. We need to check for RSP errors. 
+		 */ +		if (rsp->status & OCRDMA_MBX_RSP_STATUS_MASK) +			status = ocrdma_get_mbx_errno(rsp->status); + +	if (status) +		pr_err("opcode=0x%x, subsystem=0x%x\n", +		       (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> +			OCRDMA_MBX_RSP_OPCODE_SHIFT, +			(rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> +			OCRDMA_MBX_RSP_SUBSYS_SHIFT); +	return status; +} +  static void ocrdma_get_attr(struct ocrdma_dev *dev,  			      struct ocrdma_dev_attr *attr,  			      struct ocrdma_mbx_query_config *rsp) @@ -985,6 +1046,9 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev,  	attr->max_qp =  	    (rsp->qp_srq_cq_ird_ord & OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK) >>  	    OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT; +	attr->max_srq = +		(rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >> +		OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET;  	attr->max_send_sge = ((rsp->max_write_send_sge &  			       OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >>  			      OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT); @@ -1000,9 +1064,6 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev,  	attr->max_ord_per_qp = (rsp->max_ird_ord_per_qp &  				OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK) >>  	    OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT; -	attr->max_srq = -		(rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >> -		OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET;  	attr->max_ird_per_qp = (rsp->max_ird_ord_per_qp &  				OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK) >>  	    OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT; @@ -1015,6 +1076,7 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev,  	attr->local_ca_ack_delay = (rsp->max_pd_ca_ack_delay &  				    OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK) >>  	    OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT; +	attr->max_mw = rsp->max_mw;  	attr->max_mr = rsp->max_mr;  	attr->max_mr_size = ~0ull;  	attr->max_fmr = 0; @@ -1036,7 +1098,7 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev,  	attr->max_inline_data =  	    attr->wqe_size - (sizeof(struct ocrdma_hdr_wqe) +  			      sizeof(struct ocrdma_sge)); -	if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { +	if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {  		attr->ird = 1;  		attr->ird_page_size = OCRDMA_MIN_Q_PAGE_SIZE;  		attr->num_ird_pages = MAX_OCRDMA_IRD_PAGES; @@ -1110,6 +1172,96 @@ mbx_err:  	return status;  } +int ocrdma_mbx_rdma_stats(struct ocrdma_dev *dev, bool reset) +{ +	struct ocrdma_rdma_stats_req *req = dev->stats_mem.va; +	struct ocrdma_mqe *mqe = &dev->stats_mem.mqe; +	struct ocrdma_rdma_stats_resp *old_stats = NULL; +	int status; + +	old_stats = kzalloc(sizeof(*old_stats), GFP_KERNEL); +	if (old_stats == NULL) +		return -ENOMEM; + +	memset(mqe, 0, sizeof(*mqe)); +	mqe->hdr.pyld_len = dev->stats_mem.size; +	mqe->hdr.spcl_sge_cnt_emb |= +			(1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & +				OCRDMA_MQE_HDR_SGE_CNT_MASK; +	mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dev->stats_mem.pa & 0xffffffff); +	mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dev->stats_mem.pa); +	mqe->u.nonemb_req.sge[0].len = dev->stats_mem.size; + +	/* Cache the old stats */ +	memcpy(old_stats, req, sizeof(struct ocrdma_rdma_stats_resp)); +	memset(req, 0, dev->stats_mem.size); + +	ocrdma_init_mch((struct ocrdma_mbx_hdr *)req, +			OCRDMA_CMD_GET_RDMA_STATS, +			OCRDMA_SUBSYS_ROCE, +			dev->stats_mem.size); +	if (reset) +		req->reset_stats = reset; + +	status = ocrdma_nonemb_mbx_cmd(dev, mqe, dev->stats_mem.va); +	if (status) +		/* Copy from cache, if mbox fails */ +		memcpy(req, old_stats, sizeof(struct ocrdma_rdma_stats_resp)); +	else +		ocrdma_le32_to_cpu(req, 
dev->stats_mem.size); + +	kfree(old_stats); +	return status; +} + +static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev) +{ +	int status = -ENOMEM; +	struct ocrdma_dma_mem dma; +	struct ocrdma_mqe *mqe; +	struct ocrdma_get_ctrl_attribs_rsp *ctrl_attr_rsp; +	struct mgmt_hba_attribs *hba_attribs; + +	mqe = ocrdma_alloc_mqe(); +	if (!mqe) +		return status; +	memset(mqe, 0, sizeof(*mqe)); + +	dma.size = sizeof(struct ocrdma_get_ctrl_attribs_rsp); +	dma.va	 = dma_alloc_coherent(&dev->nic_info.pdev->dev, +					dma.size, &dma.pa, GFP_KERNEL); +	if (!dma.va) +		goto free_mqe; + +	mqe->hdr.pyld_len = dma.size; +	mqe->hdr.spcl_sge_cnt_emb |= +			(1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & +			OCRDMA_MQE_HDR_SGE_CNT_MASK; +	mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dma.pa & 0xffffffff); +	mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dma.pa); +	mqe->u.nonemb_req.sge[0].len = dma.size; + +	memset(dma.va, 0, dma.size); +	ocrdma_init_mch((struct ocrdma_mbx_hdr *)dma.va, +			OCRDMA_CMD_GET_CTRL_ATTRIBUTES, +			OCRDMA_SUBSYS_COMMON, +			dma.size); + +	status = ocrdma_nonemb_mbx_cmd(dev, mqe, dma.va); +	if (!status) { +		ctrl_attr_rsp = (struct ocrdma_get_ctrl_attribs_rsp *)dma.va; +		hba_attribs = &ctrl_attr_rsp->ctrl_attribs.hba_attribs; + +		dev->hba_port_num = hba_attribs->phy_port; +		strncpy(dev->model_number, +			hba_attribs->controller_model_number, 31); +	} +	dma_free_coherent(&dev->nic_info.pdev->dev, dma.size, dma.va, dma.pa); +free_mqe: +	kfree(mqe); +	return status; +} +  static int ocrdma_mbx_query_dev(struct ocrdma_dev *dev)  {  	int status = -ENOMEM; @@ -1157,6 +1309,35 @@ mbx_err:  	return status;  } +static int ocrdma_mbx_get_phy_info(struct ocrdma_dev *dev) +{ +	int status = -ENOMEM; +	struct ocrdma_mqe *cmd; +	struct ocrdma_get_phy_info_rsp *rsp; + +	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_PHY_DETAILS, sizeof(*cmd)); +	if (!cmd) +		return status; + +	ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], +			OCRDMA_CMD_PHY_DETAILS, OCRDMA_SUBSYS_COMMON, +			sizeof(*cmd)); + +	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); +	if (status) +		goto mbx_err; + +	rsp = (struct ocrdma_get_phy_info_rsp *)cmd; +	dev->phy.phy_type = le16_to_cpu(rsp->phy_type); +	dev->phy.auto_speeds_supported  = +			le16_to_cpu(rsp->auto_speeds_supported); +	dev->phy.fixed_speeds_supported = +			le16_to_cpu(rsp->fixed_speeds_supported); +mbx_err: +	kfree(cmd); +	return status; +} +  int ocrdma_mbx_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd)  {  	int status = -ENOMEM; @@ -1226,7 +1407,7 @@ static int ocrdma_build_q_conf(u32 *num_entries, int entry_size,  static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev)  { -	int i ; +	int i;  	int status = 0;  	int max_ah;  	struct ocrdma_create_ah_tbl *cmd; @@ -1357,12 +1538,10 @@ static void ocrdma_unbind_eq(struct ocrdma_dev *dev, u16 eq_id)  	int i;  	mutex_lock(&dev->dev_lock); -	for (i = 0; i < dev->eq_cnt; i++) { -		if (dev->eq_tbl[i].q.id != eq_id) -			continue; -		dev->eq_tbl[i].cq_cnt -= 1; -		break; -	} +	i = ocrdma_get_eq_table_index(dev, eq_id); +	if (i == -EINVAL) +		BUG(); +	dev->eq_tbl[i].cq_cnt -= 1;  	mutex_unlock(&dev->dev_lock);  } @@ -1380,7 +1559,7 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq,  		       __func__, dev->id, dev->attr.max_cqe, entries);  		return -EINVAL;  	} -	if (dpp_cq && (dev->nic_info.dev_family != OCRDMA_GEN2_FAMILY)) +	if (dpp_cq && (ocrdma_get_asic_type(dev) != OCRDMA_ASIC_GEN_SKH_R))  		return -EINVAL;  	if (dpp_cq) { @@ -1417,6 +1596,7 @@ int ocrdma_mbx_create_cq(struct 
ocrdma_dev *dev, struct ocrdma_cq *cq,  	cq->eqn = ocrdma_bind_eq(dev);  	cmd->cmd.req.rsvd_version = OCRDMA_CREATE_CQ_VER3;  	cqe_count = cq->len / cqe_size; +	cq->cqe_cnt = cqe_count;  	if (cqe_count > 1024) {  		/* Set cnt to 3 to indicate more than 1024 cq entries */  		cmd->cmd.ev_cnt_flags |= (0x3 << OCRDMA_CREATE_CQ_CNT_SHIFT); @@ -1439,7 +1619,7 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq,  	}  	/* shared eq between all the consumer cqs. */  	cmd->cmd.eqn = cq->eqn; -	if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { +	if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {  		if (dpp_cq)  			cmd->cmd.pgsz_pgcnt |= OCRDMA_CREATE_CQ_DPP <<  				OCRDMA_CREATE_CQ_TYPE_SHIFT; @@ -1484,12 +1664,9 @@ int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq)  	    (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) &  	    OCRDMA_DESTROY_CQ_QID_MASK; -	ocrdma_unbind_eq(dev, cq->eqn);  	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); -	if (status) -		goto mbx_err; +	ocrdma_unbind_eq(dev, cq->eqn);  	dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa); -mbx_err:  	kfree(cmd);  	return status;  } @@ -1783,7 +1960,7 @@ static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd,  	u32 max_sges = attrs->cap.max_send_sge;  	/* QP1 may exceed 127 */ -	max_wqe_allocated = min_t(int, attrs->cap.max_send_wr + 1, +	max_wqe_allocated = min_t(u32, attrs->cap.max_send_wr + 1,  				dev->attr.max_wqe);  	status = ocrdma_build_q_conf(&max_wqe_allocated, @@ -1982,7 +2159,7 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs,  		break;  	default:  		return -EINVAL; -	}; +	}  	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_QP, sizeof(*cmd));  	if (!cmd) @@ -2029,8 +2206,7 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs,  				OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK;  	qp->rq_cq = cq; -	if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp && -	    (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) { +	if (pd->dpp_enabled && pd->num_dpp_qp) {  		ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq,  					     dpp_cq_id);  	} @@ -2076,23 +2252,6 @@ mbx_err:  	return status;  } -int ocrdma_resolve_dgid(struct ocrdma_dev *dev, union ib_gid *dgid, -			u8 *mac_addr) -{ -	struct in6_addr in6; - -	memcpy(&in6, dgid, sizeof in6); -	if (rdma_is_multicast_addr(&in6)) { -		rdma_get_mcast_mac(&in6, mac_addr); -	} else if (rdma_link_local_addr(&in6)) { -		rdma_get_ll_mac(&in6, mac_addr); -	} else { -		pr_err("%s() fail to resolve mac_addr.\n", __func__); -		return -EINVAL; -	} -	return 0; -} -  static int ocrdma_set_av_params(struct ocrdma_qp *qp,  				struct ocrdma_modify_qp *cmd,  				struct ib_qp_attr *attrs) @@ -2116,7 +2275,7 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,  	memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0],  	       sizeof(cmd->params.dgid));  	status = ocrdma_query_gid(&qp->dev->ibdev, 1, -			 ah_attr->grh.sgid_index, &sgid); +			ah_attr->grh.sgid_index, &sgid);  	if (status)  		return status; @@ -2126,14 +2285,14 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,  	qp->sgid_idx = ah_attr->grh.sgid_index;  	memcpy(&cmd->params.sgid[0], &sgid.raw[0], sizeof(cmd->params.sgid)); -	ocrdma_resolve_dgid(qp->dev, &ah_attr->grh.dgid, &mac_addr[0]); +	ocrdma_resolve_dmac(qp->dev, ah_attr, &mac_addr[0]);  	cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) |  				(mac_addr[2] << 16) | (mac_addr[3] << 24);  	/* convert them to LE 
format. */  	ocrdma_cpu_to_le32(&cmd->params.dgid[0], sizeof(cmd->params.dgid));  	ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid));  	cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8); -	vlan_id = rdma_get_vlan_id(&sgid); +	vlan_id = ah_attr->vlan_id;  	if (vlan_id && (vlan_id < 0x1000)) {  		cmd->params.vlan_dmac_b4_to_b5 |=  		    vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT; @@ -2144,8 +2303,7 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,  static int ocrdma_set_qp_params(struct ocrdma_qp *qp,  				struct ocrdma_modify_qp *cmd, -				struct ib_qp_attr *attrs, int attr_mask, -				enum ib_qp_state old_qps) +				struct ib_qp_attr *attrs, int attr_mask)  {  	int status = 0; @@ -2250,8 +2408,7 @@ pmtu_err:  }  int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp, -			 struct ib_qp_attr *attrs, int attr_mask, -			 enum ib_qp_state old_qps) +			 struct ib_qp_attr *attrs, int attr_mask)  {  	int status = -ENOMEM;  	struct ocrdma_modify_qp *cmd; @@ -2274,7 +2431,7 @@ int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp,  		    OCRDMA_QP_PARAMS_STATE_MASK;  	} -	status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask, old_qps); +	status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask);  	if (status)  		goto mbx_err;  	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); @@ -2505,7 +2662,7 @@ static int ocrdma_create_eqs(struct ocrdma_dev *dev)  	for (i = 0; i < num_eq; i++) {  		status = ocrdma_create_eq(dev, &dev->eq_tbl[i], -					  OCRDMA_EQ_LEN); +					OCRDMA_EQ_LEN);  		if (status) {  			status = -EINVAL;  			break; @@ -2550,6 +2707,13 @@ int ocrdma_init_hw(struct ocrdma_dev *dev)  	status = ocrdma_mbx_create_ah_tbl(dev);  	if (status)  		goto conf_err; +	status = ocrdma_mbx_get_phy_info(dev); +	if (status) +		goto conf_err; +	status = ocrdma_mbx_get_ctrl_attribs(dev); +	if (status) +		goto conf_err; +  	return 0;  conf_err: diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h index f2a89d4cc7c..e513f729314 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h @@ -94,7 +94,6 @@ void ocrdma_ring_cq_db(struct ocrdma_dev *, u16 cq_id, bool armed,  int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed);  int ocrdma_query_config(struct ocrdma_dev *,  			struct ocrdma_mbx_query_config *config); -int ocrdma_resolve_dgid(struct ocrdma_dev *, union ib_gid *dgid, u8 *mac_addr);  int ocrdma_mbx_alloc_pd(struct ocrdma_dev *, struct ocrdma_pd *);  int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *, struct ocrdma_pd *); @@ -113,8 +112,7 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs,  			 u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset,  			 u16 *dpp_credit_lmt);  int ocrdma_mbx_modify_qp(struct ocrdma_dev *, struct ocrdma_qp *, -			 struct ib_qp_attr *attrs, int attr_mask, -			 enum ib_qp_state old_qps); +			 struct ib_qp_attr *attrs, int attr_mask);  int ocrdma_mbx_query_qp(struct ocrdma_dev *, struct ocrdma_qp *,  			struct ocrdma_qp_params *param);  int ocrdma_mbx_destroy_qp(struct ocrdma_dev *, struct ocrdma_qp *); @@ -133,5 +131,8 @@ int ocrdma_qp_state_change(struct ocrdma_qp *, enum ib_qp_state new_state,  bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *);  bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *);  void ocrdma_flush_qp(struct ocrdma_qp *); +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq); +int 
ocrdma_mbx_rdma_stats(struct ocrdma_dev *, bool reset); +char *port_speed_string(struct ocrdma_dev *dev);  #endif				/* __OCRDMA_HW_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 56e004940f1..7c504e07974 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -39,10 +39,11 @@  #include "ocrdma_ah.h"  #include "be_roce.h"  #include "ocrdma_hw.h" +#include "ocrdma_stats.h"  #include "ocrdma_abi.h" -MODULE_VERSION(OCRDMA_ROCE_DEV_VERSION); -MODULE_DESCRIPTION("Emulex RoCE HCA Driver"); +MODULE_VERSION(OCRDMA_ROCE_DRV_VERSION); +MODULE_DESCRIPTION(OCRDMA_ROCE_DRV_DESC " " OCRDMA_ROCE_DRV_VERSION);  MODULE_AUTHOR("Emulex Corporation");  MODULE_LICENSE("GPL"); @@ -67,46 +68,24 @@ void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)  	guid[7] = mac_addr[5];  } -static void ocrdma_build_sgid_mac(union ib_gid *sgid, unsigned char *mac_addr, -				  bool is_vlan, u16 vlan_id) -{ -	sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); -	sgid->raw[8] = mac_addr[0] ^ 2; -	sgid->raw[9] = mac_addr[1]; -	sgid->raw[10] = mac_addr[2]; -	if (is_vlan) { -		sgid->raw[11] = vlan_id >> 8; -		sgid->raw[12] = vlan_id & 0xff; -	} else { -		sgid->raw[11] = 0xff; -		sgid->raw[12] = 0xfe; -	} -	sgid->raw[13] = mac_addr[3]; -	sgid->raw[14] = mac_addr[4]; -	sgid->raw[15] = mac_addr[5]; -} - -static bool ocrdma_add_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr, -			    bool is_vlan, u16 vlan_id) +static bool ocrdma_add_sgid(struct ocrdma_dev *dev, union ib_gid *new_sgid)  {  	int i; -	union ib_gid new_sgid;  	unsigned long flags;  	memset(&ocrdma_zero_sgid, 0, sizeof(union ib_gid)); -	ocrdma_build_sgid_mac(&new_sgid, mac_addr, is_vlan, vlan_id);  	spin_lock_irqsave(&dev->sgid_lock, flags);  	for (i = 0; i < OCRDMA_MAX_SGID; i++) {  		if (!memcmp(&dev->sgid_tbl[i], &ocrdma_zero_sgid,  			    sizeof(union ib_gid))) {  			/* found free entry */ -			memcpy(&dev->sgid_tbl[i], &new_sgid, +			memcpy(&dev->sgid_tbl[i], new_sgid,  			       sizeof(union ib_gid));  			spin_unlock_irqrestore(&dev->sgid_lock, flags);  			return true; -		} else if (!memcmp(&dev->sgid_tbl[i], &new_sgid, +		} else if (!memcmp(&dev->sgid_tbl[i], new_sgid,  				   sizeof(union ib_gid))) {  			/* entry already present, no addition is required. */  			spin_unlock_irqrestore(&dev->sgid_lock, flags); @@ -117,20 +96,17 @@ static bool ocrdma_add_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr,  	return false;  } -static bool ocrdma_del_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr, -			    bool is_vlan, u16 vlan_id) +static bool ocrdma_del_sgid(struct ocrdma_dev *dev, union ib_gid *sgid)  {  	int found = false;  	int i; -	union ib_gid sgid;  	unsigned long flags; -	ocrdma_build_sgid_mac(&sgid, mac_addr, is_vlan, vlan_id);  	spin_lock_irqsave(&dev->sgid_lock, flags);  	/* first is default sgid, which cannot be deleted. 
*/  	for (i = 1; i < OCRDMA_MAX_SGID; i++) { -		if (!memcmp(&dev->sgid_tbl[i], &sgid, sizeof(union ib_gid))) { +		if (!memcmp(&dev->sgid_tbl[i], sgid, sizeof(union ib_gid))) {  			/* found matching entry */  			memset(&dev->sgid_tbl[i], 0, sizeof(union ib_gid));  			found = true; @@ -141,75 +117,18 @@ static bool ocrdma_del_sgid(struct ocrdma_dev *dev, unsigned char *mac_addr,  	return found;  } -static void ocrdma_add_default_sgid(struct ocrdma_dev *dev) -{ -	/* GID Index 0 - Invariant manufacturer-assigned EUI-64 */ -	union ib_gid *sgid = &dev->sgid_tbl[0]; - -	sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); -	ocrdma_get_guid(dev, &sgid->raw[8]); -} - -#if IS_ENABLED(CONFIG_VLAN_8021Q) -static void ocrdma_add_vlan_sgids(struct ocrdma_dev *dev) -{ -	struct net_device *netdev, *tmp; -	u16 vlan_id; -	bool is_vlan; - -	netdev = dev->nic_info.netdev; - -	rcu_read_lock(); -	for_each_netdev_rcu(&init_net, tmp) { -		if (netdev == tmp || vlan_dev_real_dev(tmp) == netdev) { -			if (!netif_running(tmp) || !netif_oper_up(tmp)) -				continue; -			if (netdev != tmp) { -				vlan_id = vlan_dev_vlan_id(tmp); -				is_vlan = true; -			} else { -				is_vlan = false; -				vlan_id = 0; -				tmp = netdev; -			} -			ocrdma_add_sgid(dev, tmp->dev_addr, is_vlan, vlan_id); -		} -	} -	rcu_read_unlock(); -} -#else -static void ocrdma_add_vlan_sgids(struct ocrdma_dev *dev) -{ - -} -#endif /* VLAN */ - -static int ocrdma_build_sgid_tbl(struct ocrdma_dev *dev) -{ -	ocrdma_add_default_sgid(dev); -	ocrdma_add_vlan_sgids(dev); -	return 0; -} - -#if IS_ENABLED(CONFIG_IPV6) - -static int ocrdma_inet6addr_event(struct notifier_block *notifier, -				  unsigned long event, void *ptr) +static int ocrdma_addr_event(unsigned long event, struct net_device *netdev, +			     union ib_gid *gid)  { -	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; -	struct net_device *netdev = ifa->idev->dev;  	struct ib_event gid_event;  	struct ocrdma_dev *dev;  	bool found = false;  	bool updated = false;  	bool is_vlan = false; -	u16 vid = 0;  	is_vlan = netdev->priv_flags & IFF_802_1Q_VLAN; -	if (is_vlan) { -		vid = vlan_dev_vlan_id(netdev); -		netdev = vlan_dev_real_dev(netdev); -	} +	if (is_vlan) +		netdev = rdma_vlan_dev_real_dev(netdev);  	rcu_read_lock();  	list_for_each_entry_rcu(dev, &ocrdma_dev_list, entry) { @@ -222,16 +141,14 @@ static int ocrdma_inet6addr_event(struct notifier_block *notifier,  	if (!found)  		return NOTIFY_DONE; -	if (!rdma_link_local_addr((struct in6_addr *)&ifa->addr)) -		return NOTIFY_DONE;  	mutex_lock(&dev->dev_lock);  	switch (event) {  	case NETDEV_UP: -		updated = ocrdma_add_sgid(dev, netdev->dev_addr, is_vlan, vid); +		updated = ocrdma_add_sgid(dev, gid);  		break;  	case NETDEV_DOWN: -		updated = ocrdma_del_sgid(dev, netdev->dev_addr, is_vlan, vid); +		updated = ocrdma_del_sgid(dev, gid);  		break;  	default:  		break; @@ -247,6 +164,32 @@ static int ocrdma_inet6addr_event(struct notifier_block *notifier,  	return NOTIFY_OK;  } +static int ocrdma_inetaddr_event(struct notifier_block *notifier, +				  unsigned long event, void *ptr) +{ +	struct in_ifaddr *ifa = ptr; +	union ib_gid gid; +	struct net_device *netdev = ifa->ifa_dev->dev; + +	ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid); +	return ocrdma_addr_event(event, netdev, &gid); +} + +static struct notifier_block ocrdma_inetaddr_notifier = { +	.notifier_call = ocrdma_inetaddr_event +}; + +#if IS_ENABLED(CONFIG_IPV6) + +static int ocrdma_inet6addr_event(struct notifier_block *notifier, +				  unsigned long event, 
void *ptr) +{ +	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; +	union  ib_gid *gid = (union ib_gid *)&ifa->addr; +	struct net_device *netdev = ifa->idev->dev; +	return ocrdma_addr_event(event, netdev, gid); +} +  static struct notifier_block ocrdma_inet6addr_notifier = {  	.notifier_call = ocrdma_inet6addr_event  }; @@ -344,7 +287,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)  	dev->ibdev.process_mad = ocrdma_process_mad; -	if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { +	if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {  		dev->ibdev.uverbs_cmd_mask |=  		     OCRDMA_UVERBS(CREATE_SRQ) |  		     OCRDMA_UVERBS(MODIFY_SRQ) | @@ -396,9 +339,42 @@ static void ocrdma_free_resources(struct ocrdma_dev *dev)  	kfree(dev->sgid_tbl);  } +/* OCRDMA sysfs interface */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, +			char *buf) +{ +	struct ocrdma_dev *dev = dev_get_drvdata(device); + +	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, +			char *buf) +{ +	struct ocrdma_dev *dev = dev_get_drvdata(device); + +	return scnprintf(buf, PAGE_SIZE, "%s", &dev->attr.fw_ver[0]); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); + +static struct device_attribute *ocrdma_attributes[] = { +	&dev_attr_hw_rev, +	&dev_attr_fw_ver +}; + +static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) +{ +	int i; + +	for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) +		device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]); +} +  static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)  { -	int status = 0; +	int status = 0, i;  	struct ocrdma_dev *dev;  	dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev)); @@ -423,19 +399,29 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)  	if (status)  		goto alloc_err; -	status = ocrdma_build_sgid_tbl(dev); -	if (status) -		goto alloc_err; -  	status = ocrdma_register_device(dev);  	if (status)  		goto alloc_err; +	for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) +		if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i])) +			goto sysfs_err;  	spin_lock(&ocrdma_devlist_lock);  	list_add_tail_rcu(&dev->entry, &ocrdma_dev_list);  	spin_unlock(&ocrdma_devlist_lock); +	/* Init stats */ +	ocrdma_add_port_stats(dev); + +	pr_info("%s %s: %s \"%s\" port %d\n", +		dev_name(&dev->nic_info.pdev->dev), hca_name(dev), +		port_speed_string(dev), dev->model_number, +		dev->hba_port_num); +	pr_info("%s ocrdma%d driver loaded successfully\n", +		dev_name(&dev->nic_info.pdev->dev), dev->id);  	return dev; +sysfs_err: +	ocrdma_remove_sysfiles(dev);  alloc_err:  	ocrdma_free_resources(dev);  	ocrdma_cleanup_hw(dev); @@ -452,9 +438,6 @@ static void ocrdma_remove_free(struct rcu_head *rcu)  {  	struct ocrdma_dev *dev = container_of(rcu, struct ocrdma_dev, rcu); -	ocrdma_free_resources(dev); -	ocrdma_cleanup_hw(dev); -  	idr_remove(&ocrdma_dev_id, dev->id);  	kfree(dev->mbx_cmd);  	ib_dealloc_device(&dev->ibdev); @@ -465,11 +448,18 @@ static void ocrdma_remove(struct ocrdma_dev *dev)  	/* first unregister with stack to stop all the active traffic  	 * of the registered clients.  	 
*/ +	ocrdma_rem_port_stats(dev); +	ocrdma_remove_sysfiles(dev); +  	ib_unregister_device(&dev->ibdev);  	spin_lock(&ocrdma_devlist_lock);  	list_del_rcu(&dev->entry);  	spin_unlock(&ocrdma_devlist_lock); + +	ocrdma_free_resources(dev); +	ocrdma_cleanup_hw(dev); +  	call_rcu(&dev->rcu, ocrdma_remove_free);  } @@ -498,7 +488,7 @@ static int ocrdma_close(struct ocrdma_dev *dev)  		cur_qp = dev->qp_tbl;  		for (i = 0; i < OCRDMA_MAX_QP; i++) {  			qp = cur_qp[i]; -			if (qp) { +			if (qp && qp->ibqp.qp_type != IB_QPT_GSI) {  				/* change the QP state to ERROR */  				_ocrdma_modify_qp(&qp->ibqp, &attrs, attr_mask); @@ -531,7 +521,7 @@ static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event)  	case BE_DEV_DOWN:  		ocrdma_close(dev);  		break; -	}; +	}  }  static struct ocrdma_driver ocrdma_drv = { @@ -539,6 +529,7 @@ static struct ocrdma_driver ocrdma_drv = {  	.add			= ocrdma_add,  	.remove			= ocrdma_remove,  	.state_change_handler	= ocrdma_event_handler, +	.be_abi_version		= OCRDMA_BE_ROCE_ABI_VERSION,  };  static void ocrdma_unregister_inet6addr_notifier(void) @@ -548,20 +539,37 @@ static void ocrdma_unregister_inet6addr_notifier(void)  #endif  } +static void ocrdma_unregister_inetaddr_notifier(void) +{ +	unregister_inetaddr_notifier(&ocrdma_inetaddr_notifier); +} +  static int __init ocrdma_init_module(void)  {  	int status; +	ocrdma_init_debugfs(); + +	status = register_inetaddr_notifier(&ocrdma_inetaddr_notifier); +	if (status) +		return status; +  #if IS_ENABLED(CONFIG_IPV6)  	status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier);  	if (status) -		return status; +		goto err_notifier6;  #endif  	status = be_roce_register_driver(&ocrdma_drv);  	if (status) -		ocrdma_unregister_inet6addr_notifier(); +		goto err_be_reg; +	return 0; + +err_be_reg: +	ocrdma_unregister_inet6addr_notifier(); +err_notifier6: +	ocrdma_unregister_inetaddr_notifier();  	return status;  } @@ -569,6 +577,8 @@ static void __exit ocrdma_exit_module(void)  {  	be_roce_unregister_driver(&ocrdma_drv);  	ocrdma_unregister_inet6addr_notifier(); +	ocrdma_unregister_inetaddr_notifier(); +	ocrdma_rem_debugfs();  }  module_init(ocrdma_init_module); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 9f9570ec3c2..96c9ee602ba 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -30,8 +30,16 @@  #define Bit(_b) (1 << (_b)) -#define OCRDMA_GEN1_FAMILY	0xB -#define OCRDMA_GEN2_FAMILY	0x2 +enum { +	OCRDMA_ASIC_GEN_SKH_R = 0x04, +	OCRDMA_ASIC_GEN_LANCER = 0x0B +}; + +enum { +	OCRDMA_ASIC_REV_A0 = 0x00, +	OCRDMA_ASIC_REV_B0 = 0x10, +	OCRDMA_ASIC_REV_C0 = 0x20 +};  #define OCRDMA_SUBSYS_ROCE 10  enum { @@ -64,6 +72,7 @@ enum {  	OCRDMA_CMD_ATTACH_MCAST,  	OCRDMA_CMD_DETACH_MCAST, +	OCRDMA_CMD_GET_RDMA_STATS,  	OCRDMA_CMD_MAX  }; @@ -74,12 +83,14 @@ enum {  	OCRDMA_CMD_CREATE_CQ		= 12,  	OCRDMA_CMD_CREATE_EQ		= 13,  	OCRDMA_CMD_CREATE_MQ		= 21, +	OCRDMA_CMD_GET_CTRL_ATTRIBUTES  = 32,  	OCRDMA_CMD_GET_FW_VER		= 35,  	OCRDMA_CMD_DELETE_MQ		= 53,  	OCRDMA_CMD_DELETE_CQ		= 54,  	OCRDMA_CMD_DELETE_EQ		= 55,  	OCRDMA_CMD_GET_FW_CONFIG	= 58, -	OCRDMA_CMD_CREATE_MQ_EXT	= 90 +	OCRDMA_CMD_CREATE_MQ_EXT	= 90, +	OCRDMA_CMD_PHY_DETAILS		= 102  };  enum { @@ -103,7 +114,10 @@ enum {  	OCRDMA_DB_GEN2_SRQ_OFFSET	= OCRDMA_DB_GEN2_RQ_OFFSET,  	OCRDMA_DB_CQ_OFFSET		= 0x120,  	OCRDMA_DB_EQ_OFFSET		= OCRDMA_DB_CQ_OFFSET, -	OCRDMA_DB_MQ_OFFSET		= 0x140 +	OCRDMA_DB_MQ_OFFSET		= 0x140, + +	OCRDMA_DB_SQ_SHIFT		= 16, +	
OCRDMA_DB_RQ_SHIFT		= 24  };  #define OCRDMA_DB_CQ_RING_ID_MASK       0x3FF	/* bits 0 - 9 */ @@ -138,6 +152,10 @@ enum {  #define OCRDMA_MIN_Q_PAGE_SIZE (4096)  #define OCRDMA_MAX_Q_PAGES     (8) +#define OCRDMA_SLI_ASIC_ID_OFFSET	0x9C +#define OCRDMA_SLI_ASIC_REV_MASK	0x000000FF +#define OCRDMA_SLI_ASIC_GEN_NUM_MASK	0x0000FF00 +#define OCRDMA_SLI_ASIC_GEN_NUM_SHIFT	0x08  /*  # 0: 4K Bytes  # 1: 8K Bytes @@ -562,6 +580,30 @@ enum {  	OCRDMA_FN_MODE_RDMA	= 0x4  }; +struct ocrdma_get_phy_info_rsp { +	struct ocrdma_mqe_hdr hdr; +	struct ocrdma_mbx_rsp rsp; + +	u16 phy_type; +	u16 interface_type; +	u32 misc_params; +	u16 ext_phy_details; +	u16 rsvd; +	u16 auto_speeds_supported; +	u16 fixed_speeds_supported; +	u32 future_use[2]; +}; + +enum { +	OCRDMA_PHY_SPEED_ZERO = 0x0, +	OCRDMA_PHY_SPEED_10MBPS = 0x1, +	OCRDMA_PHY_SPEED_100MBPS = 0x2, +	OCRDMA_PHY_SPEED_1GBPS = 0x4, +	OCRDMA_PHY_SPEED_10GBPS = 0x8, +	OCRDMA_PHY_SPEED_40GBPS = 0x20 +}; + +  struct ocrdma_get_link_speed_rsp {  	struct ocrdma_mqe_hdr hdr;  	struct ocrdma_mbx_rsp rsp; @@ -590,7 +632,7 @@ enum {  enum {  	OCRDMA_CREATE_CQ_VER2			= 2, -	OCRDMA_CREATE_CQ_VER3                   = 3, +	OCRDMA_CREATE_CQ_VER3			= 3,  	OCRDMA_CREATE_CQ_PAGE_CNT_MASK		= 0xFFFF,  	OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT	= 16, @@ -1050,6 +1092,7 @@ enum {  	OCRDMA_MODIFY_QP_RSP_MAX_ORD_MASK	= 0xFFFF <<  					OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT  }; +  struct ocrdma_modify_qp_rsp {  	struct ocrdma_mqe_hdr hdr;  	struct ocrdma_mbx_rsp rsp; @@ -1062,8 +1105,8 @@ struct ocrdma_query_qp {  	struct ocrdma_mqe_hdr hdr;  	struct ocrdma_mbx_hdr req; -#define OCRDMA_QUERY_UP_QP_ID_SHIFT 0 -#define OCRDMA_QUERY_UP_QP_ID_MASK   0xFFFFFF +#define OCRDMA_QUERY_UP_QP_ID_SHIFT	0 +#define OCRDMA_QUERY_UP_QP_ID_MASK	0xFFFFFF  	u32 qp_id;  }; @@ -1694,7 +1737,7 @@ struct ocrdma_grh {  	u16	rsvd;  } __packed; -#define OCRDMA_AV_VALID		Bit(0) +#define OCRDMA_AV_VALID		Bit(7)  #define OCRDMA_AV_VLAN_VALID	Bit(1)  struct ocrdma_av { @@ -1703,4 +1746,208 @@ struct ocrdma_av {  	u32 valid;  } __packed; +struct ocrdma_rsrc_stats { +	u32 dpp_pds; +	u32 non_dpp_pds; +	u32 rc_dpp_qps; +	u32 uc_dpp_qps; +	u32 ud_dpp_qps; +	u32 rc_non_dpp_qps; +	u32 rsvd; +	u32 uc_non_dpp_qps; +	u32 ud_non_dpp_qps; +	u32 rsvd1; +	u32 srqs; +	u32 rbqs; +	u32 r64K_nsmr; +	u32 r64K_to_2M_nsmr; +	u32 r2M_to_44M_nsmr; +	u32 r44M_to_1G_nsmr; +	u32 r1G_to_4G_nsmr; +	u32 nsmr_count_4G_to_32G; +	u32 r32G_to_64G_nsmr; +	u32 r64G_to_128G_nsmr; +	u32 r128G_to_higher_nsmr; +	u32 embedded_nsmr; +	u32 frmr; +	u32 prefetch_qps; +	u32 ondemand_qps; +	u32 phy_mr; +	u32 mw; +	u32 rsvd2[7]; +}; + +struct ocrdma_db_err_stats { +	u32 sq_doorbell_errors; +	u32 cq_doorbell_errors; +	u32 rq_srq_doorbell_errors; +	u32 cq_overflow_errors; +	u32 rsvd[4]; +}; + +struct ocrdma_wqe_stats { +	u32 large_send_rc_wqes_lo; +	u32 large_send_rc_wqes_hi; +	u32 large_write_rc_wqes_lo; +	u32 large_write_rc_wqes_hi; +	u32 rsvd[4]; +	u32 read_wqes_lo; +	u32 read_wqes_hi; +	u32 frmr_wqes_lo; +	u32 frmr_wqes_hi; +	u32 mw_bind_wqes_lo; +	u32 mw_bind_wqes_hi; +	u32 invalidate_wqes_lo; +	u32 invalidate_wqes_hi; +	u32 rsvd1[2]; +	u32 dpp_wqe_drops; +	u32 rsvd2[5]; +}; + +struct ocrdma_tx_stats { +	u32 send_pkts_lo; +	u32 send_pkts_hi; +	u32 write_pkts_lo; +	u32 write_pkts_hi; +	u32 read_pkts_lo; +	u32 read_pkts_hi; +	u32 read_rsp_pkts_lo; +	u32 read_rsp_pkts_hi; +	u32 ack_pkts_lo; +	u32 ack_pkts_hi; +	u32 send_bytes_lo; +	u32 send_bytes_hi; +	u32 write_bytes_lo; +	u32 write_bytes_hi; +	u32 read_req_bytes_lo; +	u32 read_req_bytes_hi; +	u32 read_rsp_bytes_lo; +	
u32 read_rsp_bytes_hi; +	u32 ack_timeouts; +	u32 rsvd[5]; +}; + + +struct ocrdma_tx_qp_err_stats { +	u32 local_length_errors; +	u32 local_protection_errors; +	u32 local_qp_operation_errors; +	u32 retry_count_exceeded_errors; +	u32 rnr_retry_count_exceeded_errors; +	u32 rsvd[3]; +}; + +struct ocrdma_rx_stats { +	u32 roce_frame_bytes_lo; +	u32 roce_frame_bytes_hi; +	u32 roce_frame_icrc_drops; +	u32 roce_frame_payload_len_drops; +	u32 ud_drops; +	u32 qp1_drops; +	u32 psn_error_request_packets; +	u32 psn_error_resp_packets; +	u32 rnr_nak_timeouts; +	u32 rnr_nak_receives; +	u32 roce_frame_rxmt_drops; +	u32 nak_count_psn_sequence_errors; +	u32 rc_drop_count_lookup_errors; +	u32 rq_rnr_naks; +	u32 srq_rnr_naks; +	u32 roce_frames_lo; +	u32 roce_frames_hi; +	u32 rsvd; +}; + +struct ocrdma_rx_qp_err_stats { +	u32 nak_invalid_requst_errors; +	u32 nak_remote_operation_errors; +	u32 nak_count_remote_access_errors; +	u32 local_length_errors; +	u32 local_protection_errors; +	u32 local_qp_operation_errors; +	u32 rsvd[2]; +}; + +struct ocrdma_tx_dbg_stats { +	u32 data[100]; +}; + +struct ocrdma_rx_dbg_stats { +	u32 data[200]; +}; + +struct ocrdma_rdma_stats_req { +	struct ocrdma_mbx_hdr hdr; +	u8 reset_stats; +	u8 rsvd[3]; +} __packed; + +struct ocrdma_rdma_stats_resp { +	struct ocrdma_mbx_hdr hdr; +	struct ocrdma_rsrc_stats act_rsrc_stats; +	struct ocrdma_rsrc_stats th_rsrc_stats; +	struct ocrdma_db_err_stats	db_err_stats; +	struct ocrdma_wqe_stats		wqe_stats; +	struct ocrdma_tx_stats		tx_stats; +	struct ocrdma_tx_qp_err_stats	tx_qp_err_stats; +	struct ocrdma_rx_stats		rx_stats; +	struct ocrdma_rx_qp_err_stats	rx_qp_err_stats; +	struct ocrdma_tx_dbg_stats	tx_dbg_stats; +	struct ocrdma_rx_dbg_stats	rx_dbg_stats; +} __packed; + + +struct mgmt_hba_attribs { +	u8 flashrom_version_string[32]; +	u8 manufacturer_name[32]; +	u32 supported_modes; +	u32 rsvd0[3]; +	u8 ncsi_ver_string[12]; +	u32 default_extended_timeout; +	u8 controller_model_number[32]; +	u8 controller_description[64]; +	u8 controller_serial_number[32]; +	u8 ip_version_string[32]; +	u8 firmware_version_string[32]; +	u8 bios_version_string[32]; +	u8 redboot_version_string[32]; +	u8 driver_version_string[32]; +	u8 fw_on_flash_version_string[32]; +	u32 functionalities_supported; +	u16 max_cdblength; +	u8 asic_revision; +	u8 generational_guid[16]; +	u8 hba_port_count; +	u16 default_link_down_timeout; +	u8 iscsi_ver_min_max; +	u8 multifunction_device; +	u8 cache_valid; +	u8 hba_status; +	u8 max_domains_supported; +	u8 phy_port; +	u32 firmware_post_status; +	u32 hba_mtu[8]; +	u32 rsvd1[4]; +}; + +struct mgmt_controller_attrib { +	struct mgmt_hba_attribs hba_attribs; +	u16 pci_vendor_id; +	u16 pci_device_id; +	u16 pci_sub_vendor_id; +	u16 pci_sub_system_id; +	u8 pci_bus_number; +	u8 pci_device_number; +	u8 pci_function_number; +	u8 interface_type; +	u64 unique_identifier; +	u32 rsvd0[5]; +}; + +struct ocrdma_get_ctrl_attribs_rsp { +	struct ocrdma_mbx_hdr hdr; +	struct mgmt_controller_attrib ctrl_attribs; +}; + +  #endif				/* __OCRDMA_SLI_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c new file mode 100644 index 00000000000..41a9aec9998 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -0,0 +1,616 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for          * + * RoCE (RDMA over Converged Ethernet) adapters.                   * + * Copyright (C) 2008-2014 Emulex. All rights reserved.            
* + * EMULEX and SLI are trademarks of Emulex.                        * + * www.emulex.com                                                  * + *                                                                 * + * This program is free software; you can redistribute it and/or   * + * modify it under the terms of version 2 of the GNU General       * + * Public License as published by the Free Software Foundation.    * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID.  See the GNU General Public License for  * + * more details, a copy of which can be found in the file COPYING  * + * included with this package.                                     * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include <rdma/ib_addr.h> +#include "ocrdma_stats.h" + +static struct dentry *ocrdma_dbgfs_dir; + +static int ocrdma_add_stat(char *start, char *pcur, +				char *name, u64 count) +{ +	char buff[128] = {0}; +	int cpy_len = 0; + +	snprintf(buff, 128, "%s: %llu\n", name, count); +	cpy_len = strlen(buff); + +	if (pcur + cpy_len > start + OCRDMA_MAX_DBGFS_MEM) { +		pr_err("%s: No space in stats buff\n", __func__); +		return 0; +	} + +	memcpy(pcur, buff, cpy_len); +	return cpy_len; +} + +static bool ocrdma_alloc_stats_mem(struct ocrdma_dev *dev) +{ +	struct stats_mem *mem = &dev->stats_mem; + +	/* Alloc mbox command mem*/ +	mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req), +			sizeof(struct ocrdma_rdma_stats_resp)); + +	mem->va   = dma_alloc_coherent(&dev->nic_info.pdev->dev, mem->size, +					 &mem->pa, GFP_KERNEL); +	if (!mem->va) { +		pr_err("%s: stats mbox allocation failed\n", __func__); +		return false; +	} + +	memset(mem->va, 0, mem->size); + +	/* Alloc debugfs mem */ +	mem->debugfs_mem = kzalloc(OCRDMA_MAX_DBGFS_MEM, GFP_KERNEL); +	if (!mem->debugfs_mem) { +		pr_err("%s: stats debugfs mem allocation failed\n", __func__); +		return false; +	} + +	return true; +} + +static void ocrdma_release_stats_mem(struct ocrdma_dev *dev) +{ +	struct stats_mem *mem = &dev->stats_mem; + +	if (mem->va) +		dma_free_coherent(&dev->nic_info.pdev->dev, mem->size, +				  mem->va, mem->pa); +	kfree(mem->debugfs_mem); +} + +static char *ocrdma_resource_stats(struct ocrdma_dev *dev) +{ +	char *stats = dev->stats_mem.debugfs_mem, *pcur; +	struct ocrdma_rdma_stats_resp *rdma_stats = +			(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; +	struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats; + +	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + +	pcur = stats; +	pcur += ocrdma_add_stat(stats, pcur, "active_dpp_pds", +				(u64)rsrc_stats->dpp_pds); +	pcur += ocrdma_add_stat(stats, pcur, "active_non_dpp_pds", +				(u64)rsrc_stats->non_dpp_pds); +	pcur += ocrdma_add_stat(stats, pcur, "active_rc_dpp_qps", +				(u64)rsrc_stats->rc_dpp_qps); +	pcur += ocrdma_add_stat(stats, pcur, "active_uc_dpp_qps", +				(u64)rsrc_stats->uc_dpp_qps); +	pcur += ocrdma_add_stat(stats, pcur, "active_ud_dpp_qps", +				(u64)rsrc_stats->ud_dpp_qps); +	pcur += ocrdma_add_stat(stats, pcur, "active_rc_non_dpp_qps", +				(u64)rsrc_stats->rc_non_dpp_qps); +	pcur += 
ocrdma_add_stat(stats, pcur, "active_uc_non_dpp_qps", +				(u64)rsrc_stats->uc_non_dpp_qps); +	pcur += ocrdma_add_stat(stats, pcur, "active_ud_non_dpp_qps", +				(u64)rsrc_stats->ud_non_dpp_qps); +	pcur += ocrdma_add_stat(stats, pcur, "active_srqs", +				(u64)rsrc_stats->srqs); +	pcur += ocrdma_add_stat(stats, pcur, "active_rbqs", +				(u64)rsrc_stats->rbqs); +	pcur += ocrdma_add_stat(stats, pcur, "active_64K_nsmr", +				(u64)rsrc_stats->r64K_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "active_64K_to_2M_nsmr", +				(u64)rsrc_stats->r64K_to_2M_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "active_2M_to_44M_nsmr", +				(u64)rsrc_stats->r2M_to_44M_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "active_44M_to_1G_nsmr", +				(u64)rsrc_stats->r44M_to_1G_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "active_1G_to_4G_nsmr", +				(u64)rsrc_stats->r1G_to_4G_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "active_nsmr_count_4G_to_32G", +				(u64)rsrc_stats->nsmr_count_4G_to_32G); +	pcur += ocrdma_add_stat(stats, pcur, "active_32G_to_64G_nsmr", +				(u64)rsrc_stats->r32G_to_64G_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "active_64G_to_128G_nsmr", +				(u64)rsrc_stats->r64G_to_128G_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "active_128G_to_higher_nsmr", +				(u64)rsrc_stats->r128G_to_higher_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "active_embedded_nsmr", +				(u64)rsrc_stats->embedded_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "active_frmr", +				(u64)rsrc_stats->frmr); +	pcur += ocrdma_add_stat(stats, pcur, "active_prefetch_qps", +				(u64)rsrc_stats->prefetch_qps); +	pcur += ocrdma_add_stat(stats, pcur, "active_ondemand_qps", +				(u64)rsrc_stats->ondemand_qps); +	pcur += ocrdma_add_stat(stats, pcur, "active_phy_mr", +				(u64)rsrc_stats->phy_mr); +	pcur += ocrdma_add_stat(stats, pcur, "active_mw", +				(u64)rsrc_stats->mw); + +	/* Print the threshold stats */ +	rsrc_stats = &rdma_stats->th_rsrc_stats; + +	pcur += ocrdma_add_stat(stats, pcur, "threshold_dpp_pds", +				(u64)rsrc_stats->dpp_pds); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_non_dpp_pds", +				(u64)rsrc_stats->non_dpp_pds); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_dpp_qps", +				(u64)rsrc_stats->rc_dpp_qps); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_dpp_qps", +				(u64)rsrc_stats->uc_dpp_qps); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_dpp_qps", +				(u64)rsrc_stats->ud_dpp_qps); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_non_dpp_qps", +				(u64)rsrc_stats->rc_non_dpp_qps); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_non_dpp_qps", +				(u64)rsrc_stats->uc_non_dpp_qps); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_non_dpp_qps", +				(u64)rsrc_stats->ud_non_dpp_qps); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_srqs", +				(u64)rsrc_stats->srqs); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_rbqs", +				(u64)rsrc_stats->rbqs); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_nsmr", +				(u64)rsrc_stats->r64K_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_to_2M_nsmr", +				(u64)rsrc_stats->r64K_to_2M_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_2M_to_44M_nsmr", +				(u64)rsrc_stats->r2M_to_44M_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_44M_to_1G_nsmr", +				(u64)rsrc_stats->r44M_to_1G_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_1G_to_4G_nsmr", +				(u64)rsrc_stats->r1G_to_4G_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_nsmr_count_4G_to_32G", +				(u64)rsrc_stats->nsmr_count_4G_to_32G); 
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_32G_to_64G_nsmr", +				(u64)rsrc_stats->r32G_to_64G_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_64G_to_128G_nsmr", +				(u64)rsrc_stats->r64G_to_128G_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_128G_to_higher_nsmr", +				(u64)rsrc_stats->r128G_to_higher_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_embedded_nsmr", +				(u64)rsrc_stats->embedded_nsmr); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_frmr", +				(u64)rsrc_stats->frmr); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_prefetch_qps", +				(u64)rsrc_stats->prefetch_qps); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_ondemand_qps", +				(u64)rsrc_stats->ondemand_qps); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_phy_mr", +				(u64)rsrc_stats->phy_mr); +	pcur += ocrdma_add_stat(stats, pcur, "threshold_mw", +				(u64)rsrc_stats->mw); +	return stats; +} + +static char *ocrdma_rx_stats(struct ocrdma_dev *dev) +{ +	char *stats = dev->stats_mem.debugfs_mem, *pcur; +	struct ocrdma_rdma_stats_resp *rdma_stats = +		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; +	struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats; + +	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + +	pcur = stats; +	pcur += ocrdma_add_stat +		(stats, pcur, "roce_frame_bytes", +		 convert_to_64bit(rx_stats->roce_frame_bytes_lo, +		 rx_stats->roce_frame_bytes_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "roce_frame_icrc_drops", +				(u64)rx_stats->roce_frame_icrc_drops); +	pcur += ocrdma_add_stat(stats, pcur, "roce_frame_payload_len_drops", +				(u64)rx_stats->roce_frame_payload_len_drops); +	pcur += ocrdma_add_stat(stats, pcur, "ud_drops", +				(u64)rx_stats->ud_drops); +	pcur += ocrdma_add_stat(stats, pcur, "qp1_drops", +				(u64)rx_stats->qp1_drops); +	pcur += ocrdma_add_stat(stats, pcur, "psn_error_request_packets", +				(u64)rx_stats->psn_error_request_packets); +	pcur += ocrdma_add_stat(stats, pcur, "psn_error_resp_packets", +				(u64)rx_stats->psn_error_resp_packets); +	pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_timeouts", +				(u64)rx_stats->rnr_nak_timeouts); +	pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_receives", +				(u64)rx_stats->rnr_nak_receives); +	pcur += ocrdma_add_stat(stats, pcur, "roce_frame_rxmt_drops", +				(u64)rx_stats->roce_frame_rxmt_drops); +	pcur += ocrdma_add_stat(stats, pcur, "nak_count_psn_sequence_errors", +				(u64)rx_stats->nak_count_psn_sequence_errors); +	pcur += ocrdma_add_stat(stats, pcur, "rc_drop_count_lookup_errors", +				(u64)rx_stats->rc_drop_count_lookup_errors); +	pcur += ocrdma_add_stat(stats, pcur, "rq_rnr_naks", +				(u64)rx_stats->rq_rnr_naks); +	pcur += ocrdma_add_stat(stats, pcur, "srq_rnr_naks", +				(u64)rx_stats->srq_rnr_naks); +	pcur += ocrdma_add_stat(stats, pcur, "roce_frames", +				convert_to_64bit(rx_stats->roce_frames_lo, +						 rx_stats->roce_frames_hi)); + +	return stats; +} + +static char *ocrdma_tx_stats(struct ocrdma_dev *dev) +{ +	char *stats = dev->stats_mem.debugfs_mem, *pcur; +	struct ocrdma_rdma_stats_resp *rdma_stats = +		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; +	struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats; + +	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + +	pcur = stats; +	pcur += ocrdma_add_stat(stats, pcur, "send_pkts", +				convert_to_64bit(tx_stats->send_pkts_lo, +						 tx_stats->send_pkts_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "write_pkts", +				convert_to_64bit(tx_stats->write_pkts_lo, +						 tx_stats->write_pkts_hi)); +	pcur += ocrdma_add_stat(stats, 
pcur, "read_pkts", +				convert_to_64bit(tx_stats->read_pkts_lo, +						 tx_stats->read_pkts_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "read_rsp_pkts", +				convert_to_64bit(tx_stats->read_rsp_pkts_lo, +						 tx_stats->read_rsp_pkts_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "ack_pkts", +				convert_to_64bit(tx_stats->ack_pkts_lo, +						 tx_stats->ack_pkts_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "send_bytes", +				convert_to_64bit(tx_stats->send_bytes_lo, +						 tx_stats->send_bytes_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "write_bytes", +				convert_to_64bit(tx_stats->write_bytes_lo, +						 tx_stats->write_bytes_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "read_req_bytes", +				convert_to_64bit(tx_stats->read_req_bytes_lo, +						 tx_stats->read_req_bytes_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "read_rsp_bytes", +				convert_to_64bit(tx_stats->read_rsp_bytes_lo, +						 tx_stats->read_rsp_bytes_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "ack_timeouts", +				(u64)tx_stats->ack_timeouts); + +	return stats; +} + +static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) +{ +	char *stats = dev->stats_mem.debugfs_mem, *pcur; +	struct ocrdma_rdma_stats_resp *rdma_stats = +		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; +	struct ocrdma_wqe_stats	*wqe_stats = &rdma_stats->wqe_stats; + +	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + +	pcur = stats; +	pcur += ocrdma_add_stat(stats, pcur, "large_send_rc_wqes", +		convert_to_64bit(wqe_stats->large_send_rc_wqes_lo, +				 wqe_stats->large_send_rc_wqes_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "large_write_rc_wqes", +		convert_to_64bit(wqe_stats->large_write_rc_wqes_lo, +				 wqe_stats->large_write_rc_wqes_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "read_wqes", +				convert_to_64bit(wqe_stats->read_wqes_lo, +						 wqe_stats->read_wqes_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "frmr_wqes", +				convert_to_64bit(wqe_stats->frmr_wqes_lo, +						 wqe_stats->frmr_wqes_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "mw_bind_wqes", +				convert_to_64bit(wqe_stats->mw_bind_wqes_lo, +						 wqe_stats->mw_bind_wqes_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "invalidate_wqes", +		convert_to_64bit(wqe_stats->invalidate_wqes_lo, +				 wqe_stats->invalidate_wqes_hi)); +	pcur += ocrdma_add_stat(stats, pcur, "dpp_wqe_drops", +				(u64)wqe_stats->dpp_wqe_drops); +	return stats; +} + +static char *ocrdma_db_errstats(struct ocrdma_dev *dev) +{ +	char *stats = dev->stats_mem.debugfs_mem, *pcur; +	struct ocrdma_rdma_stats_resp *rdma_stats = +		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; +	struct ocrdma_db_err_stats *db_err_stats = &rdma_stats->db_err_stats; + +	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + +	pcur = stats; +	pcur += ocrdma_add_stat(stats, pcur, "sq_doorbell_errors", +				(u64)db_err_stats->sq_doorbell_errors); +	pcur += ocrdma_add_stat(stats, pcur, "cq_doorbell_errors", +				(u64)db_err_stats->cq_doorbell_errors); +	pcur += ocrdma_add_stat(stats, pcur, "rq_srq_doorbell_errors", +				(u64)db_err_stats->rq_srq_doorbell_errors); +	pcur += ocrdma_add_stat(stats, pcur, "cq_overflow_errors", +				(u64)db_err_stats->cq_overflow_errors); +	return stats; +} + +static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) +{ +	char *stats = dev->stats_mem.debugfs_mem, *pcur; +	struct ocrdma_rdma_stats_resp *rdma_stats = +		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; +	struct ocrdma_rx_qp_err_stats *rx_qp_err_stats = +		 &rdma_stats->rx_qp_err_stats; + +	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + +	pcur = stats; +	pcur += 
ocrdma_add_stat(stats, pcur, "nak_invalid_requst_errors", +			(u64)rx_qp_err_stats->nak_invalid_requst_errors); +	pcur += ocrdma_add_stat(stats, pcur, "nak_remote_operation_errors", +			(u64)rx_qp_err_stats->nak_remote_operation_errors); +	pcur += ocrdma_add_stat(stats, pcur, "nak_count_remote_access_errors", +			(u64)rx_qp_err_stats->nak_count_remote_access_errors); +	pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", +			(u64)rx_qp_err_stats->local_length_errors); +	pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", +			(u64)rx_qp_err_stats->local_protection_errors); +	pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", +			(u64)rx_qp_err_stats->local_qp_operation_errors); +	return stats; +} + +static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) +{ +	char *stats = dev->stats_mem.debugfs_mem, *pcur; +	struct ocrdma_rdma_stats_resp *rdma_stats = +		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; +	struct ocrdma_tx_qp_err_stats *tx_qp_err_stats = +		&rdma_stats->tx_qp_err_stats; + +	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + +	pcur = stats; +	pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", +			(u64)tx_qp_err_stats->local_length_errors); +	pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", +			(u64)tx_qp_err_stats->local_protection_errors); +	pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", +			(u64)tx_qp_err_stats->local_qp_operation_errors); +	pcur += ocrdma_add_stat(stats, pcur, "retry_count_exceeded_errors", +			(u64)tx_qp_err_stats->retry_count_exceeded_errors); +	pcur += ocrdma_add_stat(stats, pcur, "rnr_retry_count_exceeded_errors", +			(u64)tx_qp_err_stats->rnr_retry_count_exceeded_errors); +	return stats; +} + +static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) +{ +	int i; +	char *pstats = dev->stats_mem.debugfs_mem; +	struct ocrdma_rdma_stats_resp *rdma_stats = +		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; +	struct ocrdma_tx_dbg_stats *tx_dbg_stats = +		&rdma_stats->tx_dbg_stats; + +	memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + +	for (i = 0; i < 100; i++) +		pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, +				 tx_dbg_stats->data[i]); + +	return dev->stats_mem.debugfs_mem; +} + +static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) +{ +	int i; +	char *pstats = dev->stats_mem.debugfs_mem; +	struct ocrdma_rdma_stats_resp *rdma_stats = +		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; +	struct ocrdma_rx_dbg_stats *rx_dbg_stats = +		&rdma_stats->rx_dbg_stats; + +	memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + +	for (i = 0; i < 200; i++) +		pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, +				 rx_dbg_stats->data[i]); + +	return dev->stats_mem.debugfs_mem; +} + +static void ocrdma_update_stats(struct ocrdma_dev *dev) +{ +	ulong now = jiffies, secs; +	int status = 0; + +	secs = jiffies_to_msecs(now - dev->last_stats_time) / 1000U; +	if (secs) { +		/* update */ +		status = ocrdma_mbx_rdma_stats(dev, false); +		if (status) +			pr_err("%s: stats mbox failed with status = %d\n", +			       __func__, status); +		dev->last_stats_time = jiffies; +	} +} + +static ssize_t ocrdma_dbgfs_ops_read(struct file *filp, char __user *buffer, +					size_t usr_buf_len, loff_t *ppos) +{ +	struct ocrdma_stats *pstats = filp->private_data; +	struct ocrdma_dev *dev = pstats->dev; +	ssize_t status = 0; +	char *data = NULL; + +	/* No partial reads */ +	if (*ppos != 0) +		return 0; + +	mutex_lock(&dev->stats_lock); + +	ocrdma_update_stats(dev); + +	switch (pstats->type) { +	case OCRDMA_RSRC_STATS: 
+		data = ocrdma_resource_stats(dev); +		break; +	case OCRDMA_RXSTATS: +		data = ocrdma_rx_stats(dev); +		break; +	case OCRDMA_WQESTATS: +		data = ocrdma_wqe_stats(dev); +		break; +	case OCRDMA_TXSTATS: +		data = ocrdma_tx_stats(dev); +		break; +	case OCRDMA_DB_ERRSTATS: +		data = ocrdma_db_errstats(dev); +		break; +	case OCRDMA_RXQP_ERRSTATS: +		data = ocrdma_rxqp_errstats(dev); +		break; +	case OCRDMA_TXQP_ERRSTATS: +		data = ocrdma_txqp_errstats(dev); +		break; +	case OCRDMA_TX_DBG_STATS: +		data = ocrdma_tx_dbg_stats(dev); +		break; +	case OCRDMA_RX_DBG_STATS: +		data = ocrdma_rx_dbg_stats(dev); +		break; + +	default: +		status = -EFAULT; +		goto exit; +	} + +	if (usr_buf_len < strlen(data)) { +		status = -ENOSPC; +		goto exit; +	} + +	status = simple_read_from_buffer(buffer, usr_buf_len, ppos, data, +					 strlen(data)); +exit: +	mutex_unlock(&dev->stats_lock); +	return status; +} + +static const struct file_operations ocrdma_dbg_ops = { +	.owner = THIS_MODULE, +	.open = simple_open, +	.read = ocrdma_dbgfs_ops_read, +}; + +void ocrdma_add_port_stats(struct ocrdma_dev *dev) +{ +	if (!ocrdma_dbgfs_dir) +		return; + +	/* Create post stats base dir */ +	dev->dir = debugfs_create_dir(dev->ibdev.name, ocrdma_dbgfs_dir); +	if (!dev->dir) +		goto err; + +	dev->rsrc_stats.type = OCRDMA_RSRC_STATS; +	dev->rsrc_stats.dev = dev; +	if (!debugfs_create_file("resource_stats", S_IRUSR, dev->dir, +				 &dev->rsrc_stats, &ocrdma_dbg_ops)) +		goto err; + +	dev->rx_stats.type = OCRDMA_RXSTATS; +	dev->rx_stats.dev = dev; +	if (!debugfs_create_file("rx_stats", S_IRUSR, dev->dir, +				 &dev->rx_stats, &ocrdma_dbg_ops)) +		goto err; + +	dev->wqe_stats.type = OCRDMA_WQESTATS; +	dev->wqe_stats.dev = dev; +	if (!debugfs_create_file("wqe_stats", S_IRUSR, dev->dir, +				 &dev->wqe_stats, &ocrdma_dbg_ops)) +		goto err; + +	dev->tx_stats.type = OCRDMA_TXSTATS; +	dev->tx_stats.dev = dev; +	if (!debugfs_create_file("tx_stats", S_IRUSR, dev->dir, +				 &dev->tx_stats, &ocrdma_dbg_ops)) +		goto err; + +	dev->db_err_stats.type = OCRDMA_DB_ERRSTATS; +	dev->db_err_stats.dev = dev; +	if (!debugfs_create_file("db_err_stats", S_IRUSR, dev->dir, +				 &dev->db_err_stats, &ocrdma_dbg_ops)) +		goto err; + + +	dev->tx_qp_err_stats.type = OCRDMA_TXQP_ERRSTATS; +	dev->tx_qp_err_stats.dev = dev; +	if (!debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir, +				 &dev->tx_qp_err_stats, &ocrdma_dbg_ops)) +		goto err; + +	dev->rx_qp_err_stats.type = OCRDMA_RXQP_ERRSTATS; +	dev->rx_qp_err_stats.dev = dev; +	if (!debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir, +				 &dev->rx_qp_err_stats, &ocrdma_dbg_ops)) +		goto err; + + +	dev->tx_dbg_stats.type = OCRDMA_TX_DBG_STATS; +	dev->tx_dbg_stats.dev = dev; +	if (!debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir, +				 &dev->tx_dbg_stats, &ocrdma_dbg_ops)) +		goto err; + +	dev->rx_dbg_stats.type = OCRDMA_RX_DBG_STATS; +	dev->rx_dbg_stats.dev = dev; +	if (!debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir, +				 &dev->rx_dbg_stats, &ocrdma_dbg_ops)) +		goto err; + +	/* Now create dma_mem for stats mbx command */ +	if (!ocrdma_alloc_stats_mem(dev)) +		goto err; + +	mutex_init(&dev->stats_lock); + +	return; +err: +	ocrdma_release_stats_mem(dev); +	debugfs_remove_recursive(dev->dir); +	dev->dir = NULL; +} + +void ocrdma_rem_port_stats(struct ocrdma_dev *dev) +{ +	if (!dev->dir) +		return; +	mutex_destroy(&dev->stats_lock); +	ocrdma_release_stats_mem(dev); +	debugfs_remove(dev->dir); +} + +void ocrdma_init_debugfs(void) +{ +	/* Create base dir in debugfs root dir */ +	
ocrdma_dbgfs_dir = debugfs_create_dir("ocrdma", NULL); +} + +void ocrdma_rem_debugfs(void) +{ +	debugfs_remove_recursive(ocrdma_dbgfs_dir); +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.h b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h new file mode 100644 index 00000000000..5f5e20c46d7 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h @@ -0,0 +1,54 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for          * + * RoCE (RDMA over Converged Ethernet) adapters.                   * + * Copyright (C) 2008-2014 Emulex. All rights reserved.            * + * EMULEX and SLI are trademarks of Emulex.                        * + * www.emulex.com                                                  * + *                                                                 * + * This program is free software; you can redistribute it and/or   * + * modify it under the terms of version 2 of the GNU General       * + * Public License as published by the Free Software Foundation.    * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID.  See the GNU General Public License for  * + * more details, a copy of which can be found in the file COPYING  * + * included with this package.                                     * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_STATS_H__ +#define __OCRDMA_STATS_H__ + +#include <linux/debugfs.h> +#include "ocrdma.h" +#include "ocrdma_hw.h" + +#define OCRDMA_MAX_DBGFS_MEM 4096 + +enum OCRDMA_STATS_TYPE { +	OCRDMA_RSRC_STATS, +	OCRDMA_RXSTATS, +	OCRDMA_WQESTATS, +	OCRDMA_TXSTATS, +	OCRDMA_DB_ERRSTATS, +	OCRDMA_RXQP_ERRSTATS, +	OCRDMA_TXQP_ERRSTATS, +	OCRDMA_TX_DBG_STATS, +	OCRDMA_RX_DBG_STATS +}; + +void ocrdma_rem_debugfs(void); +void ocrdma_init_debugfs(void); +void ocrdma_rem_port_stats(struct ocrdma_dev *dev); +void ocrdma_add_port_stats(struct ocrdma_dev *dev); + +#endif	/* __OCRDMA_STATS_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 6e982bb43c3..edf6211d84b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -53,7 +53,7 @@ int ocrdma_query_gid(struct ib_device *ibdev, u8 port,  	dev = get_ocrdma_dev(ibdev);  	memset(sgid, 0, sizeof(*sgid)); -	if (index >= OCRDMA_MAX_SGID) +	if (index > OCRDMA_MAX_SGID)  		return -EINVAL;  	memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid)); @@ -89,7 +89,7 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr)  	attr->max_cq = dev->attr.max_cq;  	attr->max_cqe = dev->attr.max_cqe;  	attr->max_mr = dev->attr.max_mr; -	attr->max_mw = 0; +	attr->max_mw = dev->attr.max_mw;  	attr->max_pd = dev->attr.max_pd;  	attr->atomic_cap = 0;  	attr->max_fmr = 0; @@ -141,10 +141,9 @@ static inline void get_link_speed_and_width(struct ocrdma_dev *dev,  		/* Unsupported */  		*ib_speed = IB_SPEED_SDR;  		*ib_width = IB_WIDTH_1X; -	}; +	}  } -  int ocrdma_query_port(struct ib_device *ibdev,  		      u8 port, struct ib_port_attr 
*props)  { @@ -176,7 +175,7 @@ int ocrdma_query_port(struct ib_device *ibdev,  	props->port_cap_flags =  	    IB_PORT_CM_SUP |  	    IB_PORT_REINIT_SUP | -	    IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP; +	    IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS;  	props->gid_tbl_len = OCRDMA_MAX_SGID;  	props->pkey_tbl_len = 1;  	props->bad_pkey_cntr = 0; @@ -267,7 +266,7 @@ static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev,  	if (udata && uctx) {  		pd->dpp_enabled = -			dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY; +			ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R;  		pd->num_dpp_qp =  			pd->dpp_enabled ? OCRDMA_PD_MAX_DPP_ENABLED_QP : 0;  	} @@ -726,10 +725,10 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr,  			    u32 num_pbes)  {  	struct ocrdma_pbe *pbe; -	struct ib_umem_chunk *chunk; +	struct scatterlist *sg;  	struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table;  	struct ib_umem *umem = mr->umem; -	int i, shift, pg_cnt, pages, pbe_cnt, total_num_pbes = 0; +	int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0;  	if (!mr->hwmr.num_pbes)  		return; @@ -739,39 +738,37 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr,  	shift = ilog2(umem->page_size); -	list_for_each_entry(chunk, &umem->chunk_list, list) { -		/* get all the dma regions from the chunk. */ -		for (i = 0; i < chunk->nmap; i++) { -			pages = sg_dma_len(&chunk->page_list[i]) >> shift; -			for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { -				/* store the page address in pbe */ -				pbe->pa_lo = -				    cpu_to_le32(sg_dma_address -						(&chunk->page_list[i]) + -						(umem->page_size * pg_cnt)); -				pbe->pa_hi = -				    cpu_to_le32(upper_32_bits -						((sg_dma_address -						  (&chunk->page_list[i]) + -						  umem->page_size * pg_cnt))); -				pbe_cnt += 1; -				total_num_pbes += 1; -				pbe++; - -				/* if done building pbes, issue the mbx cmd. */ -				if (total_num_pbes == num_pbes) -					return; - -				/* if the given pbl is full storing the pbes, -				 * move to next pbl. -				 */ -				if (pbe_cnt == -					(mr->hwmr.pbl_size / sizeof(u64))) { -					pbl_tbl++; -					pbe = (struct ocrdma_pbe *)pbl_tbl->va; -					pbe_cnt = 0; -				} +	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { +		pages = sg_dma_len(sg) >> shift; +		for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { +			/* store the page address in pbe */ +			pbe->pa_lo = +			    cpu_to_le32(sg_dma_address +					(sg) + +					(umem->page_size * pg_cnt)); +			pbe->pa_hi = +			    cpu_to_le32(upper_32_bits +					((sg_dma_address +					  (sg) + +					  umem->page_size * pg_cnt))); +			pbe_cnt += 1; +			total_num_pbes += 1; +			pbe++; + +			/* if done building pbes, issue the mbx cmd. */ +			if (total_num_pbes == num_pbes) +				return; + +			/* if the given pbl is full storing the pbes, +			 * move to next pbl. +			 */ +			if (pbe_cnt == +				(mr->hwmr.pbl_size / sizeof(u64))) { +				pbl_tbl++; +				pbe = (struct ocrdma_pbe *)pbl_tbl->va; +				pbe_cnt = 0;  			} +  		}  	}  } @@ -840,8 +837,7 @@ int ocrdma_dereg_mr(struct ib_mr *ib_mr)  	status = ocrdma_mbx_dealloc_lkey(dev, mr->hwmr.fr_mr, mr->hwmr.lkey); -	if (mr->hwmr.fr_mr == 0) -		ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); +	ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);  	/* it could be user registered memory. 
*/  	if (mr->umem) @@ -910,6 +906,7 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, int entries, int vector,  	spin_lock_init(&cq->comp_handler_lock);  	INIT_LIST_HEAD(&cq->sq_head);  	INIT_LIST_HEAD(&cq->rq_head); +	cq->first_arm = true;  	if (ib_ctx) {  		uctx = get_ocrdma_ucontext(ib_ctx); @@ -927,9 +924,7 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, int entries, int vector,  			goto ctx_err;  	}  	cq->phase = OCRDMA_CQE_VALID; -	cq->arm_needed = true;  	dev->cq_tbl[cq->id] = cq; -  	return &cq->ibcq;  ctx_err: @@ -952,15 +947,52 @@ int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt,  	return status;  } +static void ocrdma_flush_cq(struct ocrdma_cq *cq) +{ +	int cqe_cnt; +	int valid_count = 0; +	unsigned long flags; + +	struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device); +	struct ocrdma_cqe *cqe = NULL; + +	cqe = cq->va; +	cqe_cnt = cq->cqe_cnt; + +	/* Last irq might have scheduled a polling thread +	 * sync-up with it before hard flushing. +	 */ +	spin_lock_irqsave(&cq->cq_lock, flags); +	while (cqe_cnt) { +		if (is_cqe_valid(cq, cqe)) +			valid_count++; +		cqe++; +		cqe_cnt--; +	} +	ocrdma_ring_cq_db(dev, cq->id, false, false, valid_count); +	spin_unlock_irqrestore(&cq->cq_lock, flags); +} +  int ocrdma_destroy_cq(struct ib_cq *ibcq)  {  	int status;  	struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); +	struct ocrdma_eq *eq = NULL;  	struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device);  	int pdid = 0; +	u32 irq, indx; -	status = ocrdma_mbx_destroy_cq(dev, cq); +	dev->cq_tbl[cq->id] = NULL; +	indx = ocrdma_get_eq_table_index(dev, cq->eqn); +	if (indx == -EINVAL) +		BUG(); +	eq = &dev->eq_tbl[indx]; +	irq = ocrdma_get_irq(dev, eq); +	synchronize_irq(irq); +	ocrdma_flush_cq(cq); + +	status = ocrdma_mbx_destroy_cq(dev, cq);  	if (cq->ucontext) {  		pdid = cq->ucontext->cntxt_pd->id;  		ocrdma_del_mmap(cq->ucontext, (u64) cq->pa, @@ -969,7 +1001,6 @@ int ocrdma_destroy_cq(struct ib_cq *ibcq)  				ocrdma_get_db_addr(dev, pdid),  				dev->nic_info.db_page_size);  	} -	dev->cq_tbl[cq->id] = NULL;  	kfree(cq);  	return status; @@ -1092,15 +1123,9 @@ static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp,  	}  	uresp.db_page_addr = usr_db;  	uresp.db_page_size = dev->nic_info.db_page_size; -	if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { -		uresp.db_sq_offset = OCRDMA_DB_GEN2_SQ_OFFSET; -		uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; -		uresp.db_shift = 24; -	} else { -		uresp.db_sq_offset = OCRDMA_DB_SQ_OFFSET; -		uresp.db_rq_offset = OCRDMA_DB_RQ_OFFSET; -		uresp.db_shift = 16; -	} +	uresp.db_sq_offset = OCRDMA_DB_GEN2_SQ_OFFSET; +	uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; +	uresp.db_shift = OCRDMA_DB_RQ_SHIFT;  	if (qp->dpp_enabled) {  		uresp.dpp_credit = dpp_credit_lmt; @@ -1132,7 +1157,7 @@ err:  static void ocrdma_set_qp_db(struct ocrdma_dev *dev, struct ocrdma_qp *qp,  			     struct ocrdma_pd *pd)  { -	if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { +	if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {  		qp->sq_db = dev->nic_info.db +  			(pd->id * dev->nic_info.db_page_size) +  			OCRDMA_DB_GEN2_SQ_OFFSET; @@ -1182,7 +1207,6 @@ static void ocrdma_set_qp_init_params(struct ocrdma_qp *qp,  	qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? 
true : false;  } -  static void ocrdma_store_gsi_qp_cq(struct ocrdma_dev *dev,  				   struct ib_qp_init_attr *attrs)  { @@ -1268,17 +1292,6 @@ gen_err:  	return ERR_PTR(status);  } - -static void ocrdma_flush_rq_db(struct ocrdma_qp *qp) -{ -	if (qp->db_cache) { -		u32 val = qp->rq.dbid | (qp->db_cache << -				ocrdma_get_num_posted_shift(qp)); -		iowrite32(val, qp->rq_db); -		qp->db_cache = 0; -	} -} -  int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,  		      int attr_mask)  { @@ -1296,9 +1309,7 @@ int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,  	 */  	if (status < 0)  		return status; -	status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask, old_qps); -	if (!status && attr_mask & IB_QP_STATE && attr->qp_state == IB_QPS_RTR) -		ocrdma_flush_rq_db(qp); +	status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask);  	return status;  } @@ -1326,7 +1337,8 @@ int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,  		new_qps = old_qps;  	spin_unlock_irqrestore(&qp->q_lock, flags); -	if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask)) { +	if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask, +				IB_LINK_LAYER_ETHERNET)) {  		pr_err("%s(%d) invalid attribute mask=0x%x specified for\n"  		       "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n",  		       __func__, dev->id, attr_mask, qp->id, ibqp->qp_type, @@ -1415,7 +1427,7 @@ int ocrdma_query_qp(struct ib_qp *ibqp,  					  OCRDMA_QP_PARAMS_HOP_LMT_MASK) >>  						OCRDMA_QP_PARAMS_HOP_LMT_SHIFT;  	qp_attr->ah_attr.grh.traffic_class = (params.tclass_sq_psn & -					      OCRDMA_QP_PARAMS_SQ_PSN_MASK) >> +					      OCRDMA_QP_PARAMS_TCLASS_MASK) >>  						OCRDMA_QP_PARAMS_TCLASS_SHIFT;  	qp_attr->ah_attr.ah_flags = IB_AH_GRH; @@ -1509,7 +1521,7 @@ static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq)  	int discard_cnt = 0;  	u32 cur_getp, stop_getp;  	struct ocrdma_cqe *cqe; -	u32 qpn = 0; +	u32 qpn = 0, wqe_idx = 0;  	spin_lock_irqsave(&cq->cq_lock, cq_flags); @@ -1538,24 +1550,29 @@ static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq)  		if (qpn == 0 || qpn != qp->id)  			goto skip_cqe; -		/* mark cqe discarded so that it is not picked up later -		 * in the poll_cq(). -		 */ -		discard_cnt += 1; -		cqe->cmn.qpn = 0;  		if (is_cqe_for_sq(cqe)) {  			ocrdma_hwq_inc_tail(&qp->sq);  		} else {  			if (qp->srq) { +				wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >> +					OCRDMA_CQE_BUFTAG_SHIFT) & +					qp->srq->rq.max_wqe_idx; +				if (wqe_idx < 1) +					BUG();  				spin_lock_irqsave(&qp->srq->q_lock, flags);  				ocrdma_hwq_inc_tail(&qp->srq->rq); -				ocrdma_srq_toggle_bit(qp->srq, cur_getp); +				ocrdma_srq_toggle_bit(qp->srq, wqe_idx - 1);  				spin_unlock_irqrestore(&qp->srq->q_lock, flags);  			} else {  				ocrdma_hwq_inc_tail(&qp->rq);  			}  		} +		/* mark cqe discarded so that it is not picked up later +		 * in the poll_cq(). 
+		 */ +		discard_cnt += 1; +		cqe->cmn.qpn = 0;  skip_cqe:  		cur_getp = (cur_getp + 1) % cq->max_hw_cqe;  	} while (cur_getp != stop_getp); @@ -1658,7 +1675,7 @@ static int ocrdma_copy_srq_uresp(struct ocrdma_dev *dev, struct ocrdma_srq *srq,  	    (srq->pd->id * dev->nic_info.db_page_size);  	uresp.db_page_size = dev->nic_info.db_page_size;  	uresp.num_rqe_allocated = srq->rq.max_cnt; -	if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { +	if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {  		uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET;  		uresp.db_shift = 24;  	} else { @@ -1981,9 +1998,7 @@ static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr,  	wqe_size = roundup(wqe_size, OCRDMA_WQE_ALIGN_BYTES); -	if ((wr->wr.fast_reg.page_list_len > -		qp->dev->attr.max_pages_per_frmr) || -		(wr->wr.fast_reg.length > 0xffffffffULL)) +	if (wr->wr.fast_reg.page_list_len > qp->dev->attr.max_pages_per_frmr)  		return -EINVAL;  	hdr->cw |= (OCRDMA_FR_MR << OCRDMA_WQE_OPCODE_SHIFT); @@ -2010,15 +2025,15 @@ static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr,  	fast_reg->num_sges = wr->wr.fast_reg.page_list_len;  	fast_reg->size_sge =  		get_encoded_page_size(1 << wr->wr.fast_reg.page_shift); -	mr = (struct ocrdma_mr *) (unsigned long) qp->dev->stag_arr[(hdr->lkey >> 8) & -		(OCRDMA_MAX_STAG - 1)]; +	mr = (struct ocrdma_mr *) (unsigned long) +		qp->dev->stag_arr[(hdr->lkey >> 8) & (OCRDMA_MAX_STAG - 1)];  	build_frmr_pbes(wr, mr->hwmr.pbl_table, &mr->hwmr);  	return 0;  }  static void ocrdma_ring_sq_db(struct ocrdma_qp *qp)  { -	u32 val = qp->sq.dbid | (1 << 16); +	u32 val = qp->sq.dbid | (1 << OCRDMA_DB_SQ_SHIFT);  	iowrite32(val, qp->sq_db);  } @@ -2123,12 +2138,9 @@ int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,  static void ocrdma_ring_rq_db(struct ocrdma_qp *qp)  { -	u32 val = qp->rq.dbid | (1 << ocrdma_get_num_posted_shift(qp)); +	u32 val = qp->rq.dbid | (1 << OCRDMA_DB_RQ_SHIFT); -	if (qp->state != OCRDMA_QPS_INIT) -		iowrite32(val, qp->rq_db); -	else -		qp->db_cache++; +	iowrite32(val, qp->rq_db);  }  static void ocrdma_build_rqe(struct ocrdma_hdr_wqe *rqe, struct ib_recv_wr *wr, @@ -2214,7 +2226,7 @@ static int ocrdma_srq_get_idx(struct ocrdma_srq *srq)  	if (row == srq->bit_fields_len)  		BUG(); -	return indx; +	return indx + 1; /* Use from index 1 */  }  static void ocrdma_ring_srq_db(struct ocrdma_srq *srq) @@ -2331,7 +2343,7 @@ static enum ib_wc_status ocrdma_to_ibwc_err(u16 status)  	default:  		ibwc_status = IB_WC_GENERAL_ERR;  		break; -	}; +	}  	return ibwc_status;  } @@ -2370,7 +2382,7 @@ static void ocrdma_update_wc(struct ocrdma_qp *qp, struct ib_wc *ibwc,  		pr_err("%s() invalid opcode received = 0x%x\n",  		       __func__, hdr->cw & OCRDMA_WQE_OPCODE_MASK);  		break; -	}; +	}  }  static void ocrdma_set_cqe_status_flushed(struct ocrdma_qp *qp, @@ -2551,10 +2563,13 @@ static void ocrdma_update_free_srq_cqe(struct ib_wc *ibwc,  	srq = get_ocrdma_srq(qp->ibqp.srq);  	wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >> -			OCRDMA_CQE_BUFTAG_SHIFT) & srq->rq.max_wqe_idx; +		OCRDMA_CQE_BUFTAG_SHIFT) & srq->rq.max_wqe_idx; +	if (wqe_idx < 1) +		BUG(); +  	ibwc->wr_id = srq->rqe_wr_id_tbl[wqe_idx];  	spin_lock_irqsave(&srq->q_lock, flags); -	ocrdma_srq_toggle_bit(srq, wqe_idx); +	ocrdma_srq_toggle_bit(srq, wqe_idx - 1);  	spin_unlock_irqrestore(&srq->q_lock, flags);  	ocrdma_hwq_inc_tail(&srq->rq);  } @@ -2706,10 +2721,18 @@ expand_cqe:  	}  stop_cqe:  	cq->getp = cur_getp; -	if (polled_hw_cqes || expand || stop) { -		
ocrdma_ring_cq_db(dev, cq->id, cq->armed, cq->solicited, +	if (cq->deferred_arm) { +		ocrdma_ring_cq_db(dev, cq->id, true, cq->deferred_sol,  				  polled_hw_cqes); +		cq->deferred_arm = false; +		cq->deferred_sol = false; +	} else { +		/* We need to pop the CQE. No need to arm */ +		ocrdma_ring_cq_db(dev, cq->id, false, cq->deferred_sol, +				  polled_hw_cqes); +		cq->deferred_sol = false;  	} +  	return i;  } @@ -2781,30 +2804,28 @@ int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags)  	struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);  	struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device);  	u16 cq_id; -	u16 cur_getp; -	struct ocrdma_cqe *cqe;  	unsigned long flags; +	bool arm_needed = false, sol_needed = false;  	cq_id = cq->id;  	spin_lock_irqsave(&cq->cq_lock, flags);  	if (cq_flags & IB_CQ_NEXT_COMP || cq_flags & IB_CQ_SOLICITED) -		cq->armed = true; +		arm_needed = true;  	if (cq_flags & IB_CQ_SOLICITED) -		cq->solicited = true; - -	cur_getp = cq->getp; -	cqe = cq->va + cur_getp; +		sol_needed = true; -	/* check whether any valid cqe exist or not, if not then safe to -	 * arm. If cqe is not yet consumed, then let it get consumed and then -	 * we arm it to avoid false interrupts. -	 */ -	if (!is_cqe_valid(cq, cqe) || cq->arm_needed) { -		cq->arm_needed = false; -		ocrdma_ring_cq_db(dev, cq_id, cq->armed, cq->solicited, 0); +	if (cq->first_arm) { +		ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0); +		cq->first_arm = false; +		goto skip_defer;  	} +	cq->deferred_arm = true; + +skip_defer: +	cq->deferred_sol = sol_needed;  	spin_unlock_irqrestore(&cq->cq_lock, flags); +  	return 0;  } @@ -2839,7 +2860,8 @@ struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len)  		goto mbx_err;  	mr->ibmr.rkey = mr->hwmr.lkey;  	mr->ibmr.lkey = mr->hwmr.lkey; -	dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = (unsigned long) mr; +	dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = +		(unsigned long) mr;  	return &mr->ibmr;  mbx_err:  	ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 1946101419a..c00ae093b6f 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -868,8 +868,10 @@ struct qib_devdata {  	/* last buffer for user use */  	u32 lastctxt_piobuf; -	/* saturating counter of (non-port-specific) device interrupts */ -	u32 int_counter; +	/* reset value */ +	u64 z_int_counter; +	/* percpu intcounter */ +	u64 __percpu *int_counter;  	/* pio bufs allocated per ctxt */  	u32 pbufsctxt; @@ -1184,7 +1186,7 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *);  void qib_set_ctxtcnt(struct qib_devdata *);  int qib_create_ctxts(struct qib_devdata *dd);  struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32, int); -void qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8); +int qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8);  void qib_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *);  u32 qib_kreceive(struct qib_ctxtdata *, u32 *, u32 *); @@ -1449,6 +1451,10 @@ void qib_nomsi(struct qib_devdata *);  void qib_nomsix(struct qib_devdata *);  void qib_pcie_getcmd(struct qib_devdata *, u16 *, u8 *, u8 *);  void qib_pcie_reenable(struct qib_devdata *, u16, u8, u8); +/* interrupts for device */ +u64 qib_int_counter(struct qib_devdata *); +/* interrupt for all devices */ +u64 qib_sps_ints(void);  /*   * dma_addr wrappers - all 0's invalid for hw diff --git 
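The ocrdma_poll_cq()/ocrdma_arm_cq() changes above replace the old armed/solicited flags with a deferred-arm handshake: the very first arm request rings the doorbell immediately, later requests are only recorded, and the poll path performs the actual arm once the consumer has drained its CQEs, so the CQ is never re-armed while stale entries are still pending. A reduced sketch of that handshake (struct and function names are illustrative stand-ins for the driver's own):

#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_cq {			/* illustrative stand-in for struct ocrdma_cq */
	spinlock_t lock;
	bool first_arm;
	bool deferred_arm;
	bool deferred_sol;
};

/* Stand-in for ocrdma_ring_cq_db(); assumed provided elsewhere. */
void ring_cq_db(struct demo_cq *cq, bool arm, bool sol, int popped);

static void demo_arm_cq(struct demo_cq *cq, bool solicited)
{
	unsigned long flags;

	spin_lock_irqsave(&cq->lock, flags);
	if (cq->first_arm) {			/* nothing polled yet: arm right away */
		ring_cq_db(cq, true, solicited, 0);
		cq->first_arm = false;
	} else {
		cq->deferred_arm = true;	/* the poll path will do the arm */
	}
	cq->deferred_sol = solicited;
	spin_unlock_irqrestore(&cq->lock, flags);
}

static void demo_poll_done(struct demo_cq *cq, int polled_cqes)
{
	if (cq->deferred_arm) {			/* re-arm only after draining */
		ring_cq_db(cq, true, cq->deferred_sol, polled_cqes);
		cq->deferred_arm = false;
	} else {				/* just pop the consumed CQEs */
		ring_cq_db(cq, false, cq->deferred_sol, polled_cqes);
	}
	cq->deferred_sol = false;
}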
a/drivers/infiniband/hw/qib/qib_diag.c b/drivers/infiniband/hw/qib/qib_diag.c index 1686fd4bda8..5dfda4c5cc9 100644 --- a/drivers/infiniband/hw/qib/qib_diag.c +++ b/drivers/infiniband/hw/qib/qib_diag.c @@ -546,7 +546,7 @@ static ssize_t qib_diagpkt_write(struct file *fp,  				 size_t count, loff_t *off)  {  	u32 __iomem *piobuf; -	u32 plen, clen, pbufn; +	u32 plen, pbufn, maxlen_reserve;  	struct qib_diag_xpkt dp;  	u32 *tmpbuf = NULL;  	struct qib_devdata *dd; @@ -590,15 +590,20 @@ static ssize_t qib_diagpkt_write(struct file *fp,  	}  	ppd = &dd->pport[dp.port - 1]; -	/* need total length before first word written */ -	/* +1 word is for the qword padding */ -	plen = sizeof(u32) + dp.len; -	clen = dp.len >> 2; - -	if ((plen + 4) > ppd->ibmaxlen) { +	/* +	 * need total length before first word written, plus 2 Dwords. One Dword +	 * is for padding so we get the full user data when not aligned on +	 * a word boundary. The other Dword is to make sure we have room for the +	 * ICRC which gets tacked on later. +	 */ +	maxlen_reserve = 2 * sizeof(u32); +	if (dp.len > ppd->ibmaxlen - maxlen_reserve) {  		ret = -EINVAL; -		goto bail;      /* before writing pbc */ +		goto bail;  	} + +	plen = sizeof(u32) + dp.len; +  	tmpbuf = vmalloc(plen);  	if (!tmpbuf) {  		qib_devinfo(dd->pcidev, @@ -638,11 +643,11 @@ static ssize_t qib_diagpkt_write(struct file *fp,  	 */  	if (dd->flags & QIB_PIO_FLUSH_WC) {  		qib_flush_wc(); -		qib_pio_copy(piobuf + 2, tmpbuf, clen - 1); +		qib_pio_copy(piobuf + 2, tmpbuf, plen - 1);  		qib_flush_wc(); -		__raw_writel(tmpbuf[clen - 1], piobuf + clen + 1); +		__raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);  	} else -		qib_pio_copy(piobuf + 2, tmpbuf, clen); +		qib_pio_copy(piobuf + 2, tmpbuf, plen);  	if (dd->flags & QIB_USE_SPCL_TRIG) {  		u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 
2047 : 1023; @@ -689,28 +694,23 @@ int qib_register_observer(struct qib_devdata *dd,  			  const struct diag_observer *op)  {  	struct diag_observer_list_elt *olp; -	int ret = -EINVAL; +	unsigned long flags;  	if (!dd || !op) -		goto bail; -	ret = -ENOMEM; +		return -EINVAL;  	olp = vmalloc(sizeof *olp);  	if (!olp) {  		pr_err("vmalloc for observer failed\n"); -		goto bail; +		return -ENOMEM;  	} -	if (olp) { -		unsigned long flags; -		spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); -		olp->op = op; -		olp->next = dd->diag_observer_list; -		dd->diag_observer_list = olp; -		spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); -		ret = 0; -	} -bail: -	return ret; +	spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); +	olp->op = op; +	olp->next = dd->diag_observer_list; +	dd->diag_observer_list = olp; +	spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + +	return 0;  }  /* Remove all registered observers when device is closed */ diff --git a/drivers/infiniband/hw/qib/qib_dma.c b/drivers/infiniband/hw/qib/qib_dma.c index 2920bb39a65..59fe092b4b0 100644 --- a/drivers/infiniband/hw/qib/qib_dma.c +++ b/drivers/infiniband/hw/qib/qib_dma.c @@ -108,6 +108,10 @@ static int qib_map_sg(struct ib_device *dev, struct scatterlist *sgl,  			ret = 0;  			break;  		} +		sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH +		sg->dma_length = sg->length; +#endif  	}  	return ret;  } @@ -119,21 +123,6 @@ static void qib_unmap_sg(struct ib_device *dev,  	BUG_ON(!valid_dma_direction(direction));  } -static u64 qib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg) -{ -	u64 addr = (u64) page_address(sg_page(sg)); - -	if (addr) -		addr += sg->offset; -	return addr; -} - -static unsigned int qib_sg_dma_len(struct ib_device *dev, -				   struct scatterlist *sg) -{ -	return sg->length; -} -  static void qib_sync_single_for_cpu(struct ib_device *dev, u64 addr,  				    size_t size, enum dma_data_direction dir)  { @@ -173,8 +162,6 @@ struct ib_dma_mapping_ops qib_dma_mapping_ops = {  	.unmap_page = qib_dma_unmap_page,  	.map_sg = qib_map_sg,  	.unmap_sg = qib_unmap_sg, -	.dma_address = qib_sg_dma_address, -	.dma_len = qib_sg_dma_len,  	.sync_single_for_cpu = qib_sync_single_for_cpu,  	.sync_single_for_device = qib_sync_single_for_device,  	.alloc_coherent = qib_dma_alloc_coherent, diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 275f247f9fc..b15e34eeef6 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -1459,7 +1459,7 @@ static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo,  					cused++;  				else  					cfree++; -			if (pusable && cfree && cused < inuse) { +			if (cfree && cused < inuse) {  				udd = dd;  				inuse = cused;  			} @@ -1578,7 +1578,7 @@ static int do_qib_user_sdma_queue_create(struct file *fp)  	struct qib_ctxtdata *rcd = fd->rcd;  	struct qib_devdata *dd = rcd->dd; -	if (dd->flags & QIB_HAS_SEND_DMA) +	if (dd->flags & QIB_HAS_SEND_DMA) {  		fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev,  						    dd->unit, @@ -1586,6 +1586,7 @@ static int do_qib_user_sdma_queue_create(struct file *fp)  						    fd->subctxt);  		if (!fd->pq)  			return -ENOMEM; +	}  	return 0;  } diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index f247fc6e618..cab610ccd50 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -105,6 +105,7 @@ static int create_file(const char *name, 
umode_t mode,  static ssize_t driver_stats_read(struct file *file, char __user *buf,  				 size_t count, loff_t *ppos)  { +	qib_stats.sps_ints = qib_sps_ints();  	return simple_read_from_buffer(buf, count, ppos, &qib_stats,  				       sizeof qib_stats);  } @@ -456,13 +457,13 @@ static int remove_file(struct dentry *parent, char *name)  	spin_lock(&tmp->d_lock);  	if (!(d_unhashed(tmp) && tmp->d_inode)) { -		dget_dlock(tmp);  		__d_drop(tmp);  		spin_unlock(&tmp->d_lock);  		simple_unlink(parent->d_inode, tmp);  	} else {  		spin_unlock(&tmp->d_lock);  	} +	dput(tmp);  	ret = 0;  bail: @@ -491,6 +492,7 @@ static int remove_device_files(struct super_block *sb,  		goto bail;  	} +	mutex_lock(&dir->d_inode->i_mutex);  	remove_file(dir, "counters");  	remove_file(dir, "counter_names");  	remove_file(dir, "portcounter_names"); @@ -505,8 +507,10 @@ static int remove_device_files(struct super_block *sb,  		}  	}  	remove_file(dir, "flash"); -	d_delete(dir); +	mutex_unlock(&dir->d_inode->i_mutex);  	ret = simple_rmdir(root->d_inode, dir); +	d_delete(dir); +	dput(dir);  bail:  	mutex_unlock(&root->d_inode->i_mutex); diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c index 84e593d6007..d68266ac761 100644 --- a/drivers/infiniband/hw/qib/qib_iba6120.c +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -1634,9 +1634,7 @@ static irqreturn_t qib_6120intr(int irq, void *data)  		goto bail;  	} -	qib_stats.sps_ints++; -	if (dd->int_counter != (u32) -1) -		dd->int_counter++; +	this_cpu_inc(*dd->int_counter);  	if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT |  			      QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR))) @@ -1808,7 +1806,8 @@ static int qib_6120_setup_reset(struct qib_devdata *dd)  	 * isn't set.  	 */  	dd->flags &= ~(QIB_INITTED | QIB_PRESENT); -	dd->int_counter = 0; /* so we check interrupts work again */ +	/* so we check interrupts work again */ +	dd->z_int_counter = qib_int_counter(dd);  	val = dd->control | QLOGIC_IB_C_RESET;  	writeq(val, &dd->kregbase[kr_control]);  	mb(); /* prevent compiler re-ordering around actual reset */ @@ -3266,7 +3265,9 @@ static int init_6120_variables(struct qib_devdata *dd)  	dd->eep_st_masks[2].errs_to_log = ERR_MASK(ResetNegated); -	qib_init_pportdata(ppd, dd, 0, 1); +	ret = qib_init_pportdata(ppd, dd, 0, 1); +	if (ret) +		goto bail;  	ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;  	ppd->link_speed_supported = QIB_IB_SDR;  	ppd->link_width_enabled = IB_WIDTH_4X; diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index 454c2e7668f..7dec89fdc12 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -1962,10 +1962,7 @@ static irqreturn_t qib_7220intr(int irq, void *data)  		goto bail;  	} -	qib_stats.sps_ints++; -	if (dd->int_counter != (u32) -1) -		dd->int_counter++; - +	this_cpu_inc(*dd->int_counter);  	if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT |  			      QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR)))  		unlikely_7220_intr(dd, istat); @@ -2120,7 +2117,8 @@ static int qib_setup_7220_reset(struct qib_devdata *dd)  	 * isn't set.  	 
*/  	dd->flags &= ~(QIB_INITTED | QIB_PRESENT); -	dd->int_counter = 0; /* so we check interrupts work again */ +	/* so we check interrupts work again */ +	dd->z_int_counter = qib_int_counter(dd);  	val = dd->control | QLOGIC_IB_C_RESET;  	writeq(val, &dd->kregbase[kr_control]);  	mb(); /* prevent compiler reordering around actual reset */ @@ -4061,7 +4059,9 @@ static int qib_init_7220_variables(struct qib_devdata *dd)  	init_waitqueue_head(&cpspec->autoneg_wait);  	INIT_DELAYED_WORK(&cpspec->autoneg_work, autoneg_7220_work); -	qib_init_pportdata(ppd, dd, 0, 1); +	ret = qib_init_pportdata(ppd, dd, 0, 1); +	if (ret) +		goto bail;  	ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;  	ppd->link_speed_supported = QIB_IB_SDR | QIB_IB_DDR; diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 016e7429adf..a7eb32517a0 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2395,6 +2395,11 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd)  	qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a);  	qib_write_kreg(dd, kr_scratch, 0ULL); +	/* ensure previous Tx parameters are not still forced */ +	qib_write_kreg_port(ppd, krp_tx_deemph_override, +		SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, +		reset_tx_deemphasis_override)); +  	if (qib_compat_ddr_negotiate) {  		ppd->cpspec->ibdeltainprog = 1;  		ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd, @@ -3110,9 +3115,7 @@ static irqreturn_t qib_7322intr(int irq, void *data)  		goto bail;  	} -	qib_stats.sps_ints++; -	if (dd->int_counter != (u32) -1) -		dd->int_counter++; +	this_cpu_inc(*dd->int_counter);  	/* handle "errors" of various kinds first, device ahead of port */  	if (unlikely(istat & (~QIB_I_BITSEXTANT | QIB_I_GPIO | @@ -3181,9 +3184,7 @@ static irqreturn_t qib_7322pintr(int irq, void *data)  		 */  		return IRQ_HANDLED; -	qib_stats.sps_ints++; -	if (dd->int_counter != (u32) -1) -		dd->int_counter++; +	this_cpu_inc(*dd->int_counter);  	/* Clear the interrupt bit we expect to be set. */  	qib_write_kreg(dd, kr_intclear, ((1ULL << QIB_I_RCVAVAIL_LSB) | @@ -3210,9 +3211,7 @@ static irqreturn_t qib_7322bufavail(int irq, void *data)  		 */  		return IRQ_HANDLED; -	qib_stats.sps_ints++; -	if (dd->int_counter != (u32) -1) -		dd->int_counter++; +	this_cpu_inc(*dd->int_counter);  	/* Clear the interrupt bit we expect to be set. */  	qib_write_kreg(dd, kr_intclear, QIB_I_SPIOBUFAVAIL); @@ -3243,9 +3242,7 @@ static irqreturn_t sdma_intr(int irq, void *data)  		 */  		return IRQ_HANDLED; -	qib_stats.sps_ints++; -	if (dd->int_counter != (u32) -1) -		dd->int_counter++; +	this_cpu_inc(*dd->int_counter);  	/* Clear the interrupt bit we expect to be set. */  	qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? @@ -3272,9 +3269,7 @@ static irqreturn_t sdma_idle_intr(int irq, void *data)  		 */  		return IRQ_HANDLED; -	qib_stats.sps_ints++; -	if (dd->int_counter != (u32) -1) -		dd->int_counter++; +	this_cpu_inc(*dd->int_counter);  	/* Clear the interrupt bit we expect to be set. */  	qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? @@ -3301,9 +3296,7 @@ static irqreturn_t sdma_progress_intr(int irq, void *data)  		 */  		return IRQ_HANDLED; -	qib_stats.sps_ints++; -	if (dd->int_counter != (u32) -1) -		dd->int_counter++; +	this_cpu_inc(*dd->int_counter);  	/* Clear the interrupt bit we expect to be set. */  	qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? 
@@ -3331,9 +3324,7 @@ static irqreturn_t sdma_cleanup_intr(int irq, void *data)  		 */  		return IRQ_HANDLED; -	qib_stats.sps_ints++; -	if (dd->int_counter != (u32) -1) -		dd->int_counter++; +	this_cpu_inc(*dd->int_counter);  	/* Clear the interrupt bit we expect to be set. */  	qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? @@ -3718,7 +3709,8 @@ static int qib_do_7322_reset(struct qib_devdata *dd)  	dd->pport->cpspec->ibsymdelta = 0;  	dd->pport->cpspec->iblnkerrdelta = 0;  	dd->pport->cpspec->ibmalfdelta = 0; -	dd->int_counter = 0; /* so we check interrupts work again */ +	/* so we check interrupts work again */ +	dd->z_int_counter = qib_int_counter(dd);  	/*  	 * Keep chip from being accessed until we are ready.  Use @@ -6190,21 +6182,20 @@ static int setup_txselect(const char *str, struct kernel_param *kp)  {  	struct qib_devdata *dd;  	unsigned long val; -	int ret; - +	char *n;  	if (strlen(str) >= MAX_ATTEN_LEN) {  		pr_info("txselect_values string too long\n");  		return -ENOSPC;  	} -	ret = kstrtoul(str, 0, &val); -	if (ret || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + +	val = simple_strtoul(str, &n, 0); +	if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ +  				TXDDS_MFG_SZ)) {  		pr_info("txselect_values must start with a number < %d\n",  			TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ); -		return ret ? ret : -EINVAL; +		return -EINVAL;  	} -  	strcpy(txselect_list, str); +  	list_for_each_entry(dd, &qib_dev_list, list)  		if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322)  			set_no_qsfp_atten(dd, 1); @@ -6553,7 +6544,11 @@ static int qib_init_7322_variables(struct qib_devdata *dd)  		}  		dd->num_pports++; -		qib_init_pportdata(ppd, dd, pidx, dd->num_pports); +		ret = qib_init_pportdata(ppd, dd, pidx, dd->num_pports); +		if (ret) { +			dd->num_pports--; +			goto bail; +		}  		ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;  		ppd->link_width_enabled = IB_WIDTH_4X; diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 24e802f4ea2..8d3c78ddc90 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -130,7 +130,6 @@ void qib_set_ctxtcnt(struct qib_devdata *dd)  int qib_create_ctxts(struct qib_devdata *dd)  {  	unsigned i; -	int ret;  	int local_node_id = pcibus_to_node(dd->pcidev->bus);  	if (local_node_id < 0) @@ -145,8 +144,7 @@ int qib_create_ctxts(struct qib_devdata *dd)  	if (!dd->rcd) {  		qib_dev_err(dd,  			"Unable to allocate ctxtdata array, failing\n"); -		ret = -ENOMEM; -		goto done; +		return -ENOMEM;  	}  	/* create (one or more) kctxt */ @@ -163,15 +161,14 @@ int qib_create_ctxts(struct qib_devdata *dd)  		if (!rcd) {  			qib_dev_err(dd,  				"Unable to allocate ctxtdata for Kernel ctxt, failing\n"); -			ret = -ENOMEM; -			goto done; +			kfree(dd->rcd); +			dd->rcd = NULL; +			return -ENOMEM;  		}  		rcd->pkeys[0] = QIB_DEFAULT_P_KEY;  		rcd->seq_cnt = 1;  	} -	ret = 0; -done: -	return ret; +	return 0;  }  /* @@ -233,7 +230,7 @@ struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt,  /*   * Common code for initializing the physical port structure.   
*/ -void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, +int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd,  			u8 hw_pidx, u8 port)  {  	int size; @@ -243,6 +240,7 @@ void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd,  	spin_lock_init(&ppd->sdma_lock);  	spin_lock_init(&ppd->lflags_lock); +	spin_lock_init(&ppd->cc_shadow_lock);  	init_waitqueue_head(&ppd->state_wait);  	init_timer(&ppd->symerr_clear_timer); @@ -250,8 +248,10 @@ void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd,  	ppd->symerr_clear_timer.data = (unsigned long)ppd;  	ppd->qib_wq = NULL; - -	spin_lock_init(&ppd->cc_shadow_lock); +	ppd->ibport_data.pmastats = +		alloc_percpu(struct qib_pma_counters); +	if (!ppd->ibport_data.pmastats) +		return -ENOMEM;  	if (qib_cc_table_size < IB_CCT_MIN_ENTRIES)  		goto bail; @@ -299,7 +299,7 @@ void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd,  		goto bail_3;  	} -	return; +	return 0;  bail_3:  	kfree(ppd->ccti_entries_shadow); @@ -313,7 +313,7 @@ bail_1:  bail:  	/* User is intentionally disabling the congestion control agent */  	if (!qib_cc_table_size) -		return; +		return 0;  	if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) {  		qib_cc_table_size = 0; @@ -324,7 +324,7 @@ bail:  	qib_dev_err(dd, "Congestion Control Agent disabled for port %d\n",  		port); -	return; +	return 0;  }  static int init_pioavailregs(struct qib_devdata *dd) @@ -525,6 +525,7 @@ static void enable_chip(struct qib_devdata *dd)  static void verify_interrupt(unsigned long opaque)  {  	struct qib_devdata *dd = (struct qib_devdata *) opaque; +	u64 int_counter;  	if (!dd)  		return; /* being torn down */ @@ -533,7 +534,8 @@ static void verify_interrupt(unsigned long opaque)  	 * If we don't have a lid or any interrupts, let the user know and  	 * don't bother checking again.  	 */ -	if (dd->int_counter == 0) { +	int_counter = qib_int_counter(dd) - dd->z_int_counter; +	if (int_counter == 0) {  		if (!dd->f_intr_fallback(dd))  			dev_err(&dd->pcidev->dev,  				"No interrupts detected, not usable.\n"); @@ -633,6 +635,12 @@ wq_error:  	return -ENOMEM;  } +static void qib_free_pportdata(struct qib_pportdata *ppd) +{ +	free_percpu(ppd->ibport_data.pmastats); +	ppd->ibport_data.pmastats = NULL; +} +  /**   * qib_init - do the actual initialization sequence on the chip   * @dd: the qlogic_ib device @@ -920,6 +928,7 @@ static void qib_shutdown_device(struct qib_devdata *dd)  			destroy_workqueue(ppd->qib_wq);  			ppd->qib_wq = NULL;  		} +		qib_free_pportdata(ppd);  	}  	qib_update_eeprom_log(dd); @@ -1079,9 +1088,34 @@ void qib_free_devdata(struct qib_devdata *dd)  #ifdef CONFIG_DEBUG_FS  	qib_dbg_ibdev_exit(&dd->verbs_dev);  #endif +	free_percpu(dd->int_counter);  	ib_dealloc_device(&dd->verbs_dev.ibdev);  } +u64 qib_int_counter(struct qib_devdata *dd) +{ +	int cpu; +	u64 int_counter = 0; + +	for_each_possible_cpu(cpu) +		int_counter += *per_cpu_ptr(dd->int_counter, cpu); +	return int_counter; +} + +u64 qib_sps_ints(void) +{ +	unsigned long flags; +	struct qib_devdata *dd; +	u64 sps_ints = 0; + +	spin_lock_irqsave(&qib_devs_lock, flags); +	list_for_each_entry(dd, &qib_dev_list, list) { +		sps_ints += qib_int_counter(dd); +	} +	spin_unlock_irqrestore(&qib_devs_lock, flags); +	return sps_ints; +} +  /*   * Allocate our primary per-unit data structure.  
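qib_int_counter() and qib_sps_ints() above complete the switch from a saturating u32 to the usual percpu-counter idiom: interrupt handlers do a lock-free this_cpu_inc(), readers sum over all possible CPUs, and a "reset" simply records the current total in z_int_counter so later readers (such as verify_interrupt()) report the delta. A stand-alone sketch of the idiom, with illustrative names:

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/types.h>

struct demo_dev {
	u64 __percpu *int_counter;	/* allocated with alloc_percpu(u64) */
	u64 z_int_counter;		/* baseline recorded at "reset" time */
};

static inline void demo_irq_hit(struct demo_dev *dd)
{
	this_cpu_inc(*dd->int_counter);	/* hot path: no lock, no shared cache line */
}

static u64 demo_int_total(struct demo_dev *dd)
{
	u64 sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += *per_cpu_ptr(dd->int_counter, cpu);
	return sum;
}

/* "Reset": remember the running total and report deltas from it. */
static void demo_int_reset(struct demo_dev *dd)
{
	dd->z_int_counter = demo_int_total(dd);
}

static u64 demo_int_since_reset(struct demo_dev *dd)
{
	return demo_int_total(dd) - dd->z_int_counter;
}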
Must be done via verbs   * allocator, because the verbs cleanup process both does cleanup and @@ -1097,14 +1131,10 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)  	int ret;  	dd = (struct qib_devdata *) ib_alloc_device(sizeof(*dd) + extra); -	if (!dd) { -		dd = ERR_PTR(-ENOMEM); -		goto bail; -	} +	if (!dd) +		return ERR_PTR(-ENOMEM); -#ifdef CONFIG_DEBUG_FS -	qib_dbg_ibdev_init(&dd->verbs_dev); -#endif +	INIT_LIST_HEAD(&dd->list);  	idr_preload(GFP_KERNEL);  	spin_lock_irqsave(&qib_devs_lock, flags); @@ -1121,11 +1151,13 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)  	if (ret < 0) {  		qib_early_err(&pdev->dev,  			      "Could not allocate unit ID: error %d\n", -ret); -#ifdef CONFIG_DEBUG_FS -		qib_dbg_ibdev_exit(&dd->verbs_dev); -#endif -		ib_dealloc_device(&dd->verbs_dev.ibdev); -		dd = ERR_PTR(ret); +		goto bail; +	} +	dd->int_counter = alloc_percpu(u64); +	if (!dd->int_counter) { +		ret = -ENOMEM; +		qib_early_err(&pdev->dev, +			      "Could not allocate per-cpu int_counter\n");  		goto bail;  	} @@ -1139,9 +1171,15 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)  			qib_early_err(&pdev->dev,  				"Could not alloc cpulist info, cpu affinity might be wrong\n");  	} - -bail: +#ifdef CONFIG_DEBUG_FS +	qib_dbg_ibdev_init(&dd->verbs_dev); +#endif  	return dd; +bail: +	if (!list_empty(&dd->list)) +		list_del_init(&dd->list); +	ib_dealloc_device(&dd->verbs_dev.ibdev); +	return ERR_PTR(ret);;  }  /* @@ -1234,7 +1272,7 @@ static int qib_notify_dca(struct notifier_block *nb, unsigned long event,   * Do all the generic driver unit- and chip-independent memory   * allocation and initialization.   */ -static int __init qlogic_ib_init(void) +static int __init qib_ib_init(void)  {  	int ret; @@ -1278,12 +1316,12 @@ bail:  	return ret;  } -module_init(qlogic_ib_init); +module_init(qib_ib_init);  /*   * Do the non-unit driver cleanup, memory free, etc. at unload.   
*/ -static void __exit qlogic_ib_cleanup(void) +static void __exit qib_ib_cleanup(void)  {  	int ret; @@ -1308,7 +1346,7 @@ static void __exit qlogic_ib_cleanup(void)  	qib_dev_cleanup();  } -module_exit(qlogic_ib_cleanup); +module_exit(qib_ib_cleanup);  /* this can only be called after a successful initialization */  static void cleanup_device_data(struct qib_devdata *dd) diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index ccb119143d2..22c720e5740 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -1028,7 +1028,7 @@ static int set_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys)  		event.event = IB_EVENT_PKEY_CHANGE;  		event.device = &dd->verbs_dev.ibdev; -		event.element.port_num = 1; +		event.element.port_num = port;  		ib_dispatch_event(&event);  	}  	return 0; @@ -1634,6 +1634,23 @@ static int pma_get_portcounters_cong(struct ib_pma_mad *pmp,  	return reply((struct ib_smp *)pmp);  } +static void qib_snapshot_pmacounters( +	struct qib_ibport *ibp, +	struct qib_pma_counters *pmacounters) +{ +	struct qib_pma_counters *p; +	int cpu; + +	memset(pmacounters, 0, sizeof(*pmacounters)); +	for_each_possible_cpu(cpu) { +		p = per_cpu_ptr(ibp->pmastats, cpu); +		pmacounters->n_unicast_xmit += p->n_unicast_xmit; +		pmacounters->n_unicast_rcv += p->n_unicast_rcv; +		pmacounters->n_multicast_xmit += p->n_multicast_xmit; +		pmacounters->n_multicast_rcv += p->n_multicast_rcv; +	} +} +  static int pma_get_portcounters_ext(struct ib_pma_mad *pmp,  				    struct ib_device *ibdev, u8 port)  { @@ -1642,6 +1659,7 @@ static int pma_get_portcounters_ext(struct ib_pma_mad *pmp,  	struct qib_ibport *ibp = to_iport(ibdev, port);  	struct qib_pportdata *ppd = ppd_from_ibp(ibp);  	u64 swords, rwords, spkts, rpkts, xwait; +	struct qib_pma_counters pma;  	u8 port_select = p->port_select;  	memset(pmp->data, 0, sizeof(pmp->data)); @@ -1664,10 +1682,17 @@ static int pma_get_portcounters_ext(struct ib_pma_mad *pmp,  	p->port_rcv_data = cpu_to_be64(rwords);  	p->port_xmit_packets = cpu_to_be64(spkts);  	p->port_rcv_packets = cpu_to_be64(rpkts); -	p->port_unicast_xmit_packets = cpu_to_be64(ibp->n_unicast_xmit); -	p->port_unicast_rcv_packets = cpu_to_be64(ibp->n_unicast_rcv); -	p->port_multicast_xmit_packets = cpu_to_be64(ibp->n_multicast_xmit); -	p->port_multicast_rcv_packets = cpu_to_be64(ibp->n_multicast_rcv); + +	qib_snapshot_pmacounters(ibp, &pma); + +	p->port_unicast_xmit_packets = cpu_to_be64(pma.n_unicast_xmit +		- ibp->z_unicast_xmit); +	p->port_unicast_rcv_packets = cpu_to_be64(pma.n_unicast_rcv +		- ibp->z_unicast_rcv); +	p->port_multicast_xmit_packets = cpu_to_be64(pma.n_multicast_xmit +		- ibp->z_multicast_xmit); +	p->port_multicast_rcv_packets = cpu_to_be64(pma.n_multicast_rcv +		- ibp->z_multicast_rcv);  bail:  	return reply((struct ib_smp *) pmp); @@ -1795,6 +1820,7 @@ static int pma_set_portcounters_ext(struct ib_pma_mad *pmp,  	struct qib_ibport *ibp = to_iport(ibdev, port);  	struct qib_pportdata *ppd = ppd_from_ibp(ibp);  	u64 swords, rwords, spkts, rpkts, xwait; +	struct qib_pma_counters pma;  	qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait); @@ -1810,17 +1836,19 @@ static int pma_set_portcounters_ext(struct ib_pma_mad *pmp,  	if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)  		ibp->z_port_rcv_packets = rpkts; +	qib_snapshot_pmacounters(ibp, &pma); +  	if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS) -		ibp->n_unicast_xmit = 0; +		ibp->z_unicast_xmit = pma.n_unicast_xmit;  	if 
(p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS) -		ibp->n_unicast_rcv = 0; +		ibp->z_unicast_rcv = pma.n_unicast_rcv;  	if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS) -		ibp->n_multicast_xmit = 0; +		ibp->z_multicast_xmit = pma.n_multicast_xmit;  	if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS) -		ibp->n_multicast_rcv = 0; +		ibp->z_multicast_rcv = pma.n_multicast_rcv;  	return pma_get_portcounters_ext(pmp, ibdev, port);  } diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h index 28874f8606f..941d4d50d8e 100644 --- a/drivers/infiniband/hw/qib/qib_mad.h +++ b/drivers/infiniband/hw/qib/qib_mad.h @@ -54,7 +54,7 @@ struct ib_node_info {  	__be32 revision;  	u8 local_port_num;  	u8 vendor_id[3]; -} __attribute__ ((packed)); +} __packed;  struct ib_mad_notice_attr {  	u8 generic_type; @@ -73,7 +73,7 @@ struct ib_mad_notice_attr {  			__be16	reserved;  			__be16	lid;		/* where violation happened */  			u8	port_num;	/* where violation happened */ -		} __attribute__ ((packed)) ntc_129_131; +		} __packed ntc_129_131;  		struct {  			__be16	reserved; @@ -83,14 +83,14 @@ struct ib_mad_notice_attr {  			__be32	new_cap_mask;	/* new capability mask */  			u8	reserved3;  			u8	change_flags;	/* low 3 bits only */ -		} __attribute__ ((packed)) ntc_144; +		} __packed ntc_144;  		struct {  			__be16	reserved;  			__be16	lid;		/* lid where sys guid changed */  			__be16	reserved2;  			__be64	new_sys_guid; -		} __attribute__ ((packed)) ntc_145; +		} __packed ntc_145;  		struct {  			__be16	reserved; @@ -104,7 +104,7 @@ struct ib_mad_notice_attr {  			u8	reserved3;  			u8	dr_trunc_hop;  			u8	dr_rtn_path[30]; -		} __attribute__ ((packed)) ntc_256; +		} __packed ntc_256;  		struct {  			__be16		reserved; @@ -115,7 +115,7 @@ struct ib_mad_notice_attr {  			__be32		qp2;	/* high 8 bits reserved */  			union ib_gid	gid1;  			union ib_gid	gid2; -		} __attribute__ ((packed)) ntc_257_258; +		} __packed ntc_257_258;  	} details;  }; @@ -209,7 +209,7 @@ struct ib_pma_portcounters_cong {  	__be64 port_rcv_packets;  	__be64 port_xmit_wait;  	__be64 port_adr_events; -} __attribute__ ((packed)); +} __packed;  #define IB_PMA_CONG_HW_CONTROL_TIMER            0x00  #define IB_PMA_CONG_HW_CONTROL_SAMPLE           0x01 diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c index e6687ded821..9bbb55347cc 100644 --- a/drivers/infiniband/hw/qib/qib_mr.c +++ b/drivers/infiniband/hw/qib/qib_mr.c @@ -232,8 +232,8 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  {  	struct qib_mr *mr;  	struct ib_umem *umem; -	struct ib_umem_chunk *chunk; -	int n, m, i; +	struct scatterlist *sg; +	int n, m, entry;  	struct ib_mr *ret;  	if (length == 0) { @@ -246,9 +246,7 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  	if (IS_ERR(umem))  		return (void *) umem; -	n = 0; -	list_for_each_entry(chunk, &umem->chunk_list, list) -		n += chunk->nents; +	n = umem->nmap;  	mr = alloc_mr(n, pd);  	if (IS_ERR(mr)) { @@ -268,11 +266,10 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  		mr->mr.page_shift = ilog2(umem->page_size);  	m = 0;  	n = 0; -	list_for_each_entry(chunk, &umem->chunk_list, list) { -		for (i = 0; i < chunk->nents; i++) { +	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {  			void *vaddr; -			vaddr = page_address(sg_page(&chunk->page_list[i])); +			vaddr = page_address(sg_page(sg));  			if (!vaddr) {  				ret = ERR_PTR(-EINVAL);  				goto bail; @@ -284,7 +281,6 @@ 
struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,  				m++;  				n = 0;  			} -		}  	}  	ret = &mr->ibmr; diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 3f14009fb66..61a0046efb7 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -51,8 +51,8 @@   * file calls, even though this violates some   * expectations of harmlessness.   */ -static int qib_tune_pcie_caps(struct qib_devdata *); -static int qib_tune_pcie_coalesce(struct qib_devdata *); +static void qib_tune_pcie_caps(struct qib_devdata *); +static void qib_tune_pcie_coalesce(struct qib_devdata *);  /*   * Do all the common PCIe setup and initialization. @@ -197,46 +197,47 @@ static void qib_msix_setup(struct qib_devdata *dd, int pos, u32 *msixcnt,  			   struct qib_msix_entry *qib_msix_entry)  {  	int ret; -	u32 tabsize = 0; -	u16 msix_flags; +	int nvec = *msixcnt;  	struct msix_entry *msix_entry;  	int i; +	ret = pci_msix_vec_count(dd->pcidev); +	if (ret < 0) +		goto do_intx; + +	nvec = min(nvec, ret); +  	/* We can't pass qib_msix_entry array to qib_msix_setup  	 * so use a dummy msix_entry array and copy the allocated  	 * irq back to the qib_msix_entry array. */ -	msix_entry = kmalloc(*msixcnt * sizeof(*msix_entry), GFP_KERNEL); -	if (!msix_entry) { -		ret = -ENOMEM; +	msix_entry = kmalloc(nvec * sizeof(*msix_entry), GFP_KERNEL); +	if (!msix_entry)  		goto do_intx; -	} -	for (i = 0; i < *msixcnt; i++) + +	for (i = 0; i < nvec; i++)  		msix_entry[i] = qib_msix_entry[i].msix; -	pci_read_config_word(dd->pcidev, pos + PCI_MSIX_FLAGS, &msix_flags); -	tabsize = 1 + (msix_flags & PCI_MSIX_FLAGS_QSIZE); -	if (tabsize > *msixcnt) -		tabsize = *msixcnt; -	ret = pci_enable_msix(dd->pcidev, msix_entry, tabsize); -	if (ret > 0) { -		tabsize = ret; -		ret = pci_enable_msix(dd->pcidev, msix_entry, tabsize); -	} -do_intx: -	if (ret) { -		qib_dev_err(dd, -			"pci_enable_msix %d vectors failed: %d, falling back to INTx\n", -			tabsize, ret); -		tabsize = 0; -	} -	for (i = 0; i < tabsize; i++) +	ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec); +	if (ret < 0) +		goto free_msix_entry; +	else +		nvec = ret; + +	for (i = 0; i < nvec; i++)  		qib_msix_entry[i].msix = msix_entry[i]; +  	kfree(msix_entry); -	*msixcnt = tabsize; +	*msixcnt = nvec; +	return; -	if (ret) -		qib_enable_intx(dd->pcidev); +free_msix_entry: +	kfree(msix_entry); +do_intx: +	qib_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, " +			"falling back to INTx\n", nvec, ret); +	*msixcnt = 0; +	qib_enable_intx(dd->pcidev);  }  /** @@ -476,30 +477,6 @@ void qib_pcie_reenable(struct qib_devdata *dd, u16 cmd, u8 iline, u8 cline)  			"pci_enable_device failed after reset: %d\n", r);  } -/* code to adjust PCIe capabilities. */ - -static int fld2val(int wd, int mask) -{ -	int lsbmask; - -	if (!mask) -		return 0; -	wd &= mask; -	lsbmask = mask ^ (mask & (mask - 1)); -	wd /= lsbmask; -	return wd; -} - -static int val2fld(int wd, int mask) -{ -	int lsbmask; - -	if (!mask) -		return 0; -	lsbmask = mask ^ (mask & (mask - 1)); -	wd *= lsbmask; -	return wd; -}  static int qib_pcie_coalesce;  module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO); @@ -511,7 +488,7 @@ MODULE_PARM_DESC(pcie_coalesce, "tune PCIe colescing on some Intel chipsets");   * of these chipsets, with some BIOS settings, and enabling it on those   * systems may result in the system crashing, and/or data corruption.   
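qib_msix_setup() above drops the deprecated retry loop around pci_enable_msix() in favour of pci_enable_msix_range(), which returns either the number of vectors actually granted (anywhere between the requested minimum and maximum) or a negative error, after which the driver falls back to INTx. A minimal sketch of that allocation pattern (names are illustrative; entries[i].entry is assumed to be pre-filled by the caller):

#include <linux/kernel.h>
#include <linux/pci.h>

/*
 * Illustrative MSI-X setup: ask for up to 'want' vectors, accept any
 * number >= 1, fall back to legacy INTx otherwise.  A reduced form of
 * the converted qib_msix_setup() above.
 */
static int demo_setup_irqs(struct pci_dev *pdev, struct msix_entry *entries,
			   int want)
{
	int nvec;

	nvec = pci_msix_vec_count(pdev);	/* vectors the function advertises */
	if (nvec > 0) {
		nvec = min(nvec, want);
		nvec = pci_enable_msix_range(pdev, entries, 1, nvec);
		if (nvec > 0)
			return nvec;		/* got between 1 and 'want' vectors */
	}

	/* MSI-X unavailable or allocation failed: run with one INTx line. */
	pci_intx(pdev, 1);
	return 0;
}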
*/ -static int qib_tune_pcie_coalesce(struct qib_devdata *dd) +static void qib_tune_pcie_coalesce(struct qib_devdata *dd)  {  	int r;  	struct pci_dev *parent; @@ -519,18 +496,18 @@ static int qib_tune_pcie_coalesce(struct qib_devdata *dd)  	u32 mask, bits, val;  	if (!qib_pcie_coalesce) -		return 0; +		return;  	/* Find out supported and configured values for parent (root) */  	parent = dd->pcidev->bus->self;  	if (parent->bus->parent) {  		qib_devinfo(dd->pcidev, "Parent not root\n"); -		return 1; +		return;  	}  	if (!pci_is_pcie(parent)) -		return 1; +		return;  	if (parent->vendor != 0x8086) -		return 1; +		return;  	/*  	 *  - bit 12: Max_rdcmp_Imt_EN: need to set to 1 @@ -563,13 +540,12 @@ static int qib_tune_pcie_coalesce(struct qib_devdata *dd)  		mask = (3U << 24) | (7U << 10);  	} else {  		/* not one of the chipsets that we know about */ -		return 1; +		return;  	}  	pci_read_config_dword(parent, 0x48, &val);  	val &= ~mask;  	val |= bits;  	r = pci_write_config_dword(parent, 0x48, val); -	return 0;  }  /* @@ -580,55 +556,44 @@ static int qib_pcie_caps;  module_param_named(pcie_caps, qib_pcie_caps, int, S_IRUGO);  MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); -static int qib_tune_pcie_caps(struct qib_devdata *dd) +static void qib_tune_pcie_caps(struct qib_devdata *dd)  { -	int ret = 1; /* Assume the worst */  	struct pci_dev *parent; -	u16 pcaps, pctl, ecaps, ectl; -	int rc_sup, ep_sup; -	int rc_cur, ep_cur; +	u16 rc_mpss, rc_mps, ep_mpss, ep_mps; +	u16 rc_mrrs, ep_mrrs, max_mrrs;  	/* Find out supported and configured values for parent (root) */  	parent = dd->pcidev->bus->self; -	if (parent->bus->parent) { +	if (!pci_is_root_bus(parent->bus)) {  		qib_devinfo(dd->pcidev, "Parent not root\n"); -		goto bail; +		return;  	}  	if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) -		goto bail; -	pcie_capability_read_word(parent, PCI_EXP_DEVCAP, &pcaps); -	pcie_capability_read_word(parent, PCI_EXP_DEVCTL, &pctl); +		return; + +	rc_mpss = parent->pcie_mpss; +	rc_mps = ffs(pcie_get_mps(parent)) - 8;  	/* Find out supported and configured values for endpoint (us) */ -	pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCAP, &ecaps); -	pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); +	ep_mpss = dd->pcidev->pcie_mpss; +	ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8; -	ret = 0;  	/* Find max payload supported by root, endpoint */ -	rc_sup = fld2val(pcaps, PCI_EXP_DEVCAP_PAYLOAD); -	ep_sup = fld2val(ecaps, PCI_EXP_DEVCAP_PAYLOAD); -	if (rc_sup > ep_sup) -		rc_sup = ep_sup; - -	rc_cur = fld2val(pctl, PCI_EXP_DEVCTL_PAYLOAD); -	ep_cur = fld2val(ectl, PCI_EXP_DEVCTL_PAYLOAD); +	if (rc_mpss > ep_mpss) +		rc_mpss = ep_mpss;  	/* If Supported greater than limit in module param, limit it */ -	if (rc_sup > (qib_pcie_caps & 7)) -		rc_sup = qib_pcie_caps & 7; +	if (rc_mpss > (qib_pcie_caps & 7)) +		rc_mpss = qib_pcie_caps & 7;  	/* If less than (allowed, supported), bump root payload */ -	if (rc_sup > rc_cur) { -		rc_cur = rc_sup; -		pctl = (pctl & ~PCI_EXP_DEVCTL_PAYLOAD) | -			val2fld(rc_cur, PCI_EXP_DEVCTL_PAYLOAD); -		pcie_capability_write_word(parent, PCI_EXP_DEVCTL, pctl); +	if (rc_mpss > rc_mps) { +		rc_mps = rc_mpss; +		pcie_set_mps(parent, 128 << rc_mps);  	}  	/* If less than (allowed, supported), bump endpoint payload */ -	if (rc_sup > ep_cur) { -		ep_cur = rc_sup; -		ectl = (ectl & ~PCI_EXP_DEVCTL_PAYLOAD) | -			val2fld(ep_cur, PCI_EXP_DEVCTL_PAYLOAD); -		pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); +	if (rc_mpss > ep_mps) { +		
ep_mps = rc_mpss; +		pcie_set_mps(dd->pcidev, 128 << ep_mps);  	}  	/* @@ -636,26 +601,22 @@ static int qib_tune_pcie_caps(struct qib_devdata *dd)  	 * No field for max supported, but PCIe spec limits it to 4096,  	 * which is code '5' (log2(4096) - 7)  	 */ -	rc_sup = 5; -	if (rc_sup > ((qib_pcie_caps >> 4) & 7)) -		rc_sup = (qib_pcie_caps >> 4) & 7; -	rc_cur = fld2val(pctl, PCI_EXP_DEVCTL_READRQ); -	ep_cur = fld2val(ectl, PCI_EXP_DEVCTL_READRQ); - -	if (rc_sup > rc_cur) { -		rc_cur = rc_sup; -		pctl = (pctl & ~PCI_EXP_DEVCTL_READRQ) | -			val2fld(rc_cur, PCI_EXP_DEVCTL_READRQ); -		pcie_capability_write_word(parent, PCI_EXP_DEVCTL, pctl); +	max_mrrs = 5; +	if (max_mrrs > ((qib_pcie_caps >> 4) & 7)) +		max_mrrs = (qib_pcie_caps >> 4) & 7; + +	max_mrrs = 128 << max_mrrs; +	rc_mrrs = pcie_get_readrq(parent); +	ep_mrrs = pcie_get_readrq(dd->pcidev); + +	if (max_mrrs > rc_mrrs) { +		rc_mrrs = max_mrrs; +		pcie_set_readrq(parent, rc_mrrs);  	} -	if (rc_sup > ep_cur) { -		ep_cur = rc_sup; -		ectl = (ectl & ~PCI_EXP_DEVCTL_READRQ) | -			val2fld(ep_cur, PCI_EXP_DEVCTL_READRQ); -		pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); +	if (max_mrrs > ep_mrrs) { +		ep_mrrs = max_mrrs; +		pcie_set_readrq(dd->pcidev, ep_mrrs);  	} -bail: -	return ret;  }  /* End of PCIe capability tuning */ diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 3cca55b51e5..7fcc150d603 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -585,7 +585,7 @@ int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,  	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;  	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, -				attr_mask)) +				attr_mask, IB_LINK_LAYER_UNSPECIFIED))  		goto inval;  	if (attr_mask & IB_QP_AV) { @@ -985,7 +985,8 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd,  	struct ib_qp *ret;  	if (init_attr->cap.max_send_sge > ib_qib_max_sges || -	    init_attr->cap.max_send_wr > ib_qib_max_qp_wrs) { +	    init_attr->cap.max_send_wr > ib_qib_max_qp_wrs || +	    init_attr->create_flags) {  		ret = ERR_PTR(-EINVAL);  		goto bail;  	} diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 3ab341320ea..2f2501890c4 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -752,7 +752,7 @@ void qib_send_rc_ack(struct qib_qp *qp)  	qib_flush_wc();  	qib_sendbuf_done(dd, pbufn); -	ibp->n_unicast_xmit++; +	this_cpu_inc(ibp->pmastats->n_unicast_xmit);  	goto done;  queue_ack: diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index 357b6cfcd46..4c07a8b34ff 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -703,6 +703,7 @@ void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr,  	ohdr->bth[0] = cpu_to_be32(bth0);  	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);  	ohdr->bth[2] = cpu_to_be32(bth2); +	this_cpu_inc(ibp->pmastats->n_unicast_xmit);  }  /** diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index d6c7fe7f88d..aaf7039f8ed 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -57,13 +57,20 @@ static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe)  	struct qib_sge *sge;  	struct ib_wc wc;  	u32 length; +	enum ib_qp_type sqptype, dqptype;  	qp = qib_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn);  	if (!qp) {  		ibp->n_pkt_drops++;  		return;  	} -	if 
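The qib_tune_pcie_caps() rewrite above replaces the open-coded DEVCTL field shifting (fld2val()/val2fld()) with the PCI core helpers, which work in bytes: pcie_get_mps()/pcie_set_mps() for MaxPayload and pcie_get_readrq()/pcie_set_readrq() for MaxReadReq, the encoded exponent being recovered as ffs(bytes) - 8 where still needed (128 << code == bytes). A reduced sketch of the same tuning, with the qib_pcie_caps module-parameter handling folded into a single 'cap' argument and error checks omitted:

#include <linux/pci.h>
#include <linux/types.h>

/* Illustrative payload/read-request tuning between a root port and an endpoint. */
static void demo_tune_pcie(struct pci_dev *ep, int cap)
{
	struct pci_dev *rc = ep->bus->self;	/* parent, assumed to be the root port */
	u16 mpss = rc->pcie_mpss;		/* supported payload code: 128 << code bytes */
	int mrrs = 4096;			/* spec maximum for MaxReadReq */

	if (mpss > ep->pcie_mpss)
		mpss = ep->pcie_mpss;		/* common denominator of both ends */
	if (mpss > cap)
		mpss = cap;			/* caller-imposed limit (0..3) */

	/* Raise MaxPayload on either end if it is below what is allowed. */
	if ((128 << mpss) > pcie_get_mps(rc))
		pcie_set_mps(rc, 128 << mpss);
	if ((128 << mpss) > pcie_get_mps(ep))
		pcie_set_mps(ep, 128 << mpss);

	/* Likewise MaxReadReq, capped at the 4096-byte spec limit. */
	if (mrrs > pcie_get_readrq(rc))
		pcie_set_readrq(rc, mrrs);
	if (mrrs > pcie_get_readrq(ep))
		pcie_set_readrq(ep, mrrs);
}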
(qp->ibqp.qp_type != sqp->ibqp.qp_type || + +	sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ? +			IB_QPT_UD : sqp->ibqp.qp_type; +	dqptype = qp->ibqp.qp_type == IB_QPT_GSI ? +			IB_QPT_UD : qp->ibqp.qp_type; + +	if (dqptype != sqptype ||  	    !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) {  		ibp->n_pkt_drops++;  		goto drop; @@ -273,11 +280,11 @@ int qib_make_ud_req(struct qib_qp *qp)  	ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;  	if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE) {  		if (ah_attr->dlid != QIB_PERMISSIVE_LID) -			ibp->n_multicast_xmit++; +			this_cpu_inc(ibp->pmastats->n_multicast_xmit);  		else -			ibp->n_unicast_xmit++; +			this_cpu_inc(ibp->pmastats->n_unicast_xmit);  	} else { -		ibp->n_unicast_xmit++; +		this_cpu_inc(ibp->pmastats->n_unicast_xmit);  		lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);  		if (unlikely(lid == ppd->lid)) {  			/* diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index d0a0ea0c14d..d2806cae234 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -52,6 +52,17 @@  /* attempt to drain the queue for 5secs */  #define QIB_USER_SDMA_DRAIN_TIMEOUT 500 +/* + * track how many times a process open this driver. + */ +static struct rb_root qib_user_sdma_rb_root = RB_ROOT; + +struct qib_user_sdma_rb_node { +	struct rb_node node; +	int refcount; +	pid_t pid; +}; +  struct qib_user_sdma_pkt {  	struct list_head list;  /* list element */ @@ -120,15 +131,60 @@ struct qib_user_sdma_queue {  	/* dma page table */  	struct rb_root dma_pages_root; +	struct qib_user_sdma_rb_node *sdma_rb_node; +  	/* protect everything above... */  	struct mutex lock;  }; +static struct qib_user_sdma_rb_node * +qib_user_sdma_rb_search(struct rb_root *root, pid_t pid) +{ +	struct qib_user_sdma_rb_node *sdma_rb_node; +	struct rb_node *node = root->rb_node; + +	while (node) { +		sdma_rb_node = container_of(node, +			struct qib_user_sdma_rb_node, node); +		if (pid < sdma_rb_node->pid) +			node = node->rb_left; +		else if (pid > sdma_rb_node->pid) +			node = node->rb_right; +		else +			return sdma_rb_node; +	} +	return NULL; +} + +static int +qib_user_sdma_rb_insert(struct rb_root *root, struct qib_user_sdma_rb_node *new) +{ +	struct rb_node **node = &(root->rb_node); +	struct rb_node *parent = NULL; +	struct qib_user_sdma_rb_node *got; + +	while (*node) { +		got = container_of(*node, struct qib_user_sdma_rb_node, node); +		parent = *node; +		if (new->pid < got->pid) +			node = &((*node)->rb_left); +		else if (new->pid > got->pid) +			node = &((*node)->rb_right); +		else +			return 0; +	} + +	rb_link_node(&new->node, parent, node); +	rb_insert_color(&new->node, root); +	return 1; +} +  struct qib_user_sdma_queue *  qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt)  {  	struct qib_user_sdma_queue *pq =  		kmalloc(sizeof(struct qib_user_sdma_queue), GFP_KERNEL); +	struct qib_user_sdma_rb_node *sdma_rb_node;  	if (!pq)  		goto done; @@ -138,6 +194,7 @@ qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt)  	pq->num_pending = 0;  	pq->num_sending = 0;  	pq->added = 0; +	pq->sdma_rb_node = NULL;  	INIT_LIST_HEAD(&pq->sent);  	spin_lock_init(&pq->sent_lock); @@ -163,8 +220,30 @@ qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt)  	pq->dma_pages_root = RB_ROOT; +	sdma_rb_node = qib_user_sdma_rb_search(&qib_user_sdma_rb_root, +					current->pid); +	if (sdma_rb_node) { +		sdma_rb_node->refcount++; +	} else { +		int 
ret; +		sdma_rb_node = kmalloc(sizeof( +			struct qib_user_sdma_rb_node), GFP_KERNEL); +		if (!sdma_rb_node) +			goto err_rb; + +		sdma_rb_node->refcount = 1; +		sdma_rb_node->pid = current->pid; + +		ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root, +					sdma_rb_node); +		BUG_ON(ret == 0); +	} +	pq->sdma_rb_node = sdma_rb_node; +  	goto done; +err_rb: +	dma_pool_destroy(pq->header_cache);  err_slab:  	kmem_cache_destroy(pq->pkt_slab);  err_kfree: @@ -594,8 +673,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,  		else  			j = npages; -		ret = get_user_pages(current, current->mm, addr, -			     j, 0, 1, pages, NULL); +		ret = get_user_pages_fast(addr, j, 0, pages);  		if (ret != j) {  			i = 0;  			j = ret; @@ -1021,8 +1099,13 @@ void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq)  	if (!pq)  		return; -	kmem_cache_destroy(pq->pkt_slab); +	pq->sdma_rb_node->refcount--; +	if (pq->sdma_rb_node->refcount == 0) { +		rb_erase(&pq->sdma_rb_node->node, &qib_user_sdma_rb_root); +		kfree(pq->sdma_rb_node); +	}  	dma_pool_destroy(pq->header_cache); +	kmem_cache_destroy(pq->pkt_slab);  	kfree(pq);  } @@ -1242,26 +1325,52 @@ static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd,  				 struct qib_user_sdma_queue *pq,  				 struct list_head *pktlist, int count)  { -	int ret = 0;  	unsigned long flags;  	if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE)))  		return -ECOMM; -	spin_lock_irqsave(&ppd->sdma_lock, flags); - -	if (unlikely(!__qib_sdma_running(ppd))) { -		ret = -ECOMM; -		goto unlock; +	/* non-blocking mode */ +	if (pq->sdma_rb_node->refcount > 1) { +		spin_lock_irqsave(&ppd->sdma_lock, flags); +		if (unlikely(!__qib_sdma_running(ppd))) { +			spin_unlock_irqrestore(&ppd->sdma_lock, flags); +			return -ECOMM; +		} +		pq->num_pending += count; +		list_splice_tail_init(pktlist, &ppd->sdma_userpending); +		qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); +		spin_unlock_irqrestore(&ppd->sdma_lock, flags); +		return 0;  	} +	/* In this case, descriptors from this process are not +	 * linked to ppd pending queue, interrupt handler +	 * won't update this process, it is OK to directly +	 * modify without sdma lock. +	 */ + +  	pq->num_pending += count; -	list_splice_tail_init(pktlist, &ppd->sdma_userpending); -	qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); +	/* +	 * Blocking mode for single rail process, we must +	 * release/regain sdma_lock to give other process +	 * chance to make progress. This is important for +	 * performance. 
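The per-pid rb-tree added above lets every SDMA queue opened by the same process share one refcounted node, and that refcount later selects between the blocking single-rail path and the non-blocking multi-rail path in qib_user_sdma_push_pkts(). A get-or-create/put pair equivalent to what qib_user_sdma_queue_create() and qib_user_sdma_queue_destroy() open-code (illustrative only; it reuses the static helpers defined above and leaves serialization to the caller, as the driver does):

#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/types.h>

static struct qib_user_sdma_rb_node *
demo_sdma_rb_get(struct rb_root *root, pid_t pid)
{
	struct qib_user_sdma_rb_node *node;

	node = qib_user_sdma_rb_search(root, pid);
	if (node) {
		node->refcount++;		/* same process opened another queue */
		return node;
	}

	node = kmalloc(sizeof(*node), GFP_KERNEL);
	if (!node)
		return NULL;
	node->refcount = 1;
	node->pid = pid;
	qib_user_sdma_rb_insert(root, node);	/* cannot collide: we just searched */
	return node;
}

static void demo_sdma_rb_put(struct rb_root *root,
			     struct qib_user_sdma_rb_node *node)
{
	if (--node->refcount == 0) {
		rb_erase(&node->node, root);
		kfree(node);
	}
}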
+	 */ +	do { +		spin_lock_irqsave(&ppd->sdma_lock, flags); +		if (unlikely(!__qib_sdma_running(ppd))) { +			spin_unlock_irqrestore(&ppd->sdma_lock, flags); +			return -ECOMM; +		} +		qib_user_sdma_send_desc(ppd, pktlist); +		if (!list_empty(pktlist)) +			qib_sdma_make_progress(ppd); +		spin_unlock_irqrestore(&ppd->sdma_lock, flags); +	} while (!list_empty(pktlist)); -unlock: -	spin_unlock_irqrestore(&ppd->sdma_lock, flags); -	return ret; +	return 0;  }  int qib_user_sdma_writev(struct qib_ctxtdata *rcd, @@ -1291,14 +1400,11 @@ int qib_user_sdma_writev(struct qib_ctxtdata *rcd,  		qib_user_sdma_queue_clean(ppd, pq);  	while (dim) { -		int mxp = 8; +		int mxp = 1;  		int ndesc = 0; -		down_write(¤t->mm->mmap_sem);  		ret = qib_user_sdma_queue_pkts(dd, ppd, pq,  				iov, dim, &list, &mxp, &ndesc); -		up_write(¤t->mm->mmap_sem); -  		if (ret < 0)  			goto done_unlock;  		else { diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 092b0bb1bb7..9bcfbd84298 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -662,7 +662,7 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen)  		mcast = qib_mcast_find(ibp, &hdr->u.l.grh.dgid);  		if (mcast == NULL)  			goto drop; -		ibp->n_multicast_rcv++; +		this_cpu_inc(ibp->pmastats->n_multicast_rcv);  		list_for_each_entry_rcu(p, &mcast->qp_list, list)  			qib_qp_rcv(rcd, hdr, 1, data, tlen, p->qp);  		/* @@ -678,8 +678,8 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen)  					&rcd->lookaside_qp->refcount))  					wake_up(  					 &rcd->lookaside_qp->wait); -					rcd->lookaside_qp = NULL; -				} +				rcd->lookaside_qp = NULL; +			}  		}  		if (!rcd->lookaside_qp) {  			qp = qib_lookup_qpn(ibp, qp_num); @@ -689,7 +689,7 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen)  			rcd->lookaside_qpn = qp_num;  		} else  			qp = rcd->lookaside_qp; -		ibp->n_unicast_rcv++; +		this_cpu_inc(ibp->pmastats->n_unicast_rcv);  		qib_qp_rcv(rcd, hdr, lnh == QIB_LRH_GRH, data, tlen, qp);  	}  	return; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 012e2c7575a..bfc8948fdd3 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -150,14 +150,14 @@ struct ib_reth {  	__be64 vaddr;  	__be32 rkey;  	__be32 length; -} __attribute__ ((packed)); +} __packed;  struct ib_atomic_eth {  	__be32 vaddr[2];        /* unaligned so access as 2 32-bit words */  	__be32 rkey;  	__be64 swap_data;  	__be64 compare_data; -} __attribute__ ((packed)); +} __packed;  struct qib_other_headers {  	__be32 bth[3]; @@ -178,7 +178,7 @@ struct qib_other_headers {  		__be32 aeth;  		struct ib_atomic_eth atomic_eth;  	} u; -} __attribute__ ((packed)); +} __packed;  /*   * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes @@ -195,12 +195,12 @@ struct qib_ib_header {  		} l;  		struct qib_other_headers oth;  	} u; -} __attribute__ ((packed)); +} __packed;  struct qib_pio_header {  	__le32 pbc[2];  	struct qib_ib_header hdr; -} __attribute__ ((packed)); +} __packed;  /*   * There is one struct qib_mcast for each multicast GID. 
@@ -664,6 +664,13 @@ struct qib_opcode_stats_perctx {  	struct qib_opcode_stats stats[128];  }; +struct qib_pma_counters { +	u64 n_unicast_xmit;     /* total unicast packets sent */ +	u64 n_unicast_rcv;      /* total unicast packets received */ +	u64 n_multicast_xmit;   /* total multicast packets sent */ +	u64 n_multicast_rcv;    /* total multicast packets received */ +}; +  struct qib_ibport {  	struct qib_qp __rcu *qp0;  	struct qib_qp __rcu *qp1; @@ -680,10 +687,11 @@ struct qib_ibport {  	__be64 mkey;  	__be64 guids[QIB_GUIDS_PER_PORT	- 1];	/* writable GUIDs */  	u64 tid;		/* TID for traps */ -	u64 n_unicast_xmit;     /* total unicast packets sent */ -	u64 n_unicast_rcv;      /* total unicast packets received */ -	u64 n_multicast_xmit;   /* total multicast packets sent */ -	u64 n_multicast_rcv;    /* total multicast packets received */ +	struct qib_pma_counters __percpu *pmastats; +	u64 z_unicast_xmit;     /* starting count for PMA */ +	u64 z_unicast_rcv;      /* starting count for PMA */ +	u64 z_multicast_xmit;   /* starting count for PMA */ +	u64 z_multicast_rcv;    /* starting count for PMA */  	u64 z_symbol_error_counter;             /* starting count for PMA */  	u64 z_link_error_recovery_counter;      /* starting count for PMA */  	u64 z_link_downed_counter;              /* starting count for PMA */ diff --git a/drivers/infiniband/hw/usnic/Kconfig b/drivers/infiniband/hw/usnic/Kconfig new file mode 100644 index 00000000000..29ab11c34f3 --- /dev/null +++ b/drivers/infiniband/hw/usnic/Kconfig @@ -0,0 +1,10 @@ +config INFINIBAND_USNIC +	tristate "Verbs support for Cisco VIC" +	depends on NETDEVICES && ETHERNET && INET && PCI && INTEL_IOMMU +	select ENIC +	select NET_VENDOR_CISCO +	select PCI_IOV +	select INFINIBAND_USER_ACCESS +	---help--- +	  This is a low-level driver for Cisco's Virtual Interface +	  Cards (VICs), including the VIC 1240 and 1280 cards. diff --git a/drivers/infiniband/hw/usnic/Makefile b/drivers/infiniband/hw/usnic/Makefile new file mode 100644 index 00000000000..99fb2db47cd --- /dev/null +++ b/drivers/infiniband/hw/usnic/Makefile @@ -0,0 +1,15 @@ +ccflags-y := -Idrivers/net/ethernet/cisco/enic + +obj-$(CONFIG_INFINIBAND_USNIC)+= usnic_verbs.o + +usnic_verbs-y=\ +usnic_fwd.o \ +usnic_transport.o \ +usnic_uiom.o \ +usnic_uiom_interval_tree.o \ +usnic_vnic.o \ +usnic_ib_main.o \ +usnic_ib_qp_grp.o \ +usnic_ib_sysfs.o \ +usnic_ib_verbs.o \ +usnic_debugfs.o \ diff --git a/drivers/infiniband/hw/usnic/usnic.h b/drivers/infiniband/hw/usnic/usnic.h new file mode 100644 index 00000000000..5be13d8991b --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
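The qib_ibport hunk above pairs the new percpu counters with z_* "starting count for PMA" fields; as those comments suggest, a PMA read presumably reports the running total minus the baseline, and clearing a counter just re-snapshots the baseline. A tiny standalone sketch of that bookkeeping (hw_unicast_rcv stands in for the summed percpu total; the function names are illustrative, not from this patch):

#include <stdint.h>
#include <stdio.h>

static uint64_t hw_unicast_rcv;	/* stands in for the summed percpu total */
static uint64_t z_unicast_rcv;	/* baseline captured when the PMA clears */

static uint64_t pma_unicast_rcv(void)
{
	return hw_unicast_rcv - z_unicast_rcv;
}

static void pma_clear_unicast_rcv(void)
{
	z_unicast_rcv = hw_unicast_rcv;
}

int main(void)
{
	hw_unicast_rcv = 100;
	printf("%llu\n", (unsigned long long)pma_unicast_rcv());	/* 100 */
	pma_clear_unicast_rcv();
	hw_unicast_rcv += 5;
	printf("%llu\n", (unsigned long long)pma_unicast_rcv());	/* 5 */
	return 0;
}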
+ * + */ + +#ifndef USNIC_H_ +#define USNIC_H_ + +#define DRV_NAME	"usnic_verbs" + +#define PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC	0x00cf	/* User space NIC */ + +#define DRV_VERSION    "1.0.3" +#define DRV_RELDATE    "December 19, 2013" + +#endif /* USNIC_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_abi.h b/drivers/infiniband/hw/usnic/usnic_abi.h new file mode 100644 index 00000000000..04a66229584 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_abi.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#ifndef USNIC_ABI_H +#define USNIC_ABI_H + +/* ABI between userspace and kernel */ +#define USNIC_UVERBS_ABI_VERSION	4 + +#define USNIC_QP_GRP_MAX_WQS		8 +#define USNIC_QP_GRP_MAX_RQS		8 +#define USNIC_QP_GRP_MAX_CQS		16 + +enum usnic_transport_type { +	USNIC_TRANSPORT_UNKNOWN		= 0, +	USNIC_TRANSPORT_ROCE_CUSTOM	= 1, +	USNIC_TRANSPORT_IPV4_UDP	= 2, +	USNIC_TRANSPORT_MAX		= 3, +}; + +struct usnic_transport_spec { +	enum usnic_transport_type	trans_type; +	union { +		struct { +			uint16_t	port_num; +		} usnic_roce; +		struct { +			uint32_t	sock_fd; +		} udp; +	}; +}; + +struct usnic_ib_create_qp_cmd { +	struct usnic_transport_spec	spec; +}; + +/*TODO: Future - usnic_modify_qp needs to pass in generic filters */ +struct usnic_ib_create_qp_resp { +	u32				vfid; +	u32				qp_grp_id; +	u64				bar_bus_addr; +	u32				bar_len; +/* + * WQ, RQ, CQ are explicity specified bc exposing a generic resources inteface + * expands the scope of ABI to many files. + */ +	u32				wq_cnt; +	u32				rq_cnt; +	u32				cq_cnt; +	u32				wq_idx[USNIC_QP_GRP_MAX_WQS]; +	u32				rq_idx[USNIC_QP_GRP_MAX_RQS]; +	u32				cq_idx[USNIC_QP_GRP_MAX_CQS]; +	u32				transport; +	u32				reserved[9]; +}; + +#endif /* USNIC_ABI_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h new file mode 100644 index 00000000000..39356726614 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
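usnic_abi.h above defines the private command a userspace verbs provider passes with create-QP: a usnic_transport_spec that selects either the custom RoCE transport (a port number) or IPv4/UDP (an already-bound socket fd). A standalone sketch of building that command; the local struct copies mirror usnic_abi.h, while fill_create_qp_cmd() and the example fd are illustrative assumptions, not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* local mirror of the structures in usnic_abi.h above */
enum usnic_transport_type {
	USNIC_TRANSPORT_UNKNOWN		= 0,
	USNIC_TRANSPORT_ROCE_CUSTOM	= 1,
	USNIC_TRANSPORT_IPV4_UDP	= 2,
};

struct usnic_transport_spec {
	enum usnic_transport_type	trans_type;
	union {
		struct { uint16_t port_num; } usnic_roce;
		struct { uint32_t sock_fd; } udp;
	};
};

struct usnic_ib_create_qp_cmd {
	struct usnic_transport_spec	spec;
};

/* what a userspace provider would stuff into the create-QP private data;
 * sock_fd is a UDP socket the caller has already bound */
static void fill_create_qp_cmd(struct usnic_ib_create_qp_cmd *cmd, int sock_fd)
{
	cmd->spec.trans_type = USNIC_TRANSPORT_IPV4_UDP;
	cmd->spec.udp.sock_fd = (uint32_t)sock_fd;
}

int main(void)
{
	struct usnic_ib_create_qp_cmd cmd;

	fill_create_qp_cmd(&cmd, 3 /* example fd */);
	printf("trans_type=%d sock_fd=%u\n",
	       cmd.spec.trans_type, cmd.spec.udp.sock_fd);
	return 0;
}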
+ * + */ + +#ifndef USNIC_CMN_PKT_HDR_H +#define USNIC_CMN_PKT_HDR_H + +#define USNIC_ROCE_ETHERTYPE		(0x8915) +#define USNIC_ROCE_GRH_VER              (8) +#define USNIC_PROTO_VER                 (1) +#define USNIC_ROCE_GRH_VER_SHIFT        (4) + +#endif /* USNIC_COMMON_PKT_HDR_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_common_util.h b/drivers/infiniband/hw/usnic/usnic_common_util.h new file mode 100644 index 00000000000..9d737ed5e55 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_common_util.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_CMN_UTIL_H +#define USNIC_CMN_UTIL_H + +static inline void +usnic_mac_to_gid(const char *const mac, char *raw_gid) +{ +	raw_gid[0] = 0xfe; +	raw_gid[1] = 0x80; +	memset(&raw_gid[2], 0, 6); +	raw_gid[8] = mac[0]^2; +	raw_gid[9] = mac[1]; +	raw_gid[10] = mac[2]; +	raw_gid[11] = 0xff; +	raw_gid[12] = 0xfe; +	raw_gid[13] = mac[3]; +	raw_gid[14] = mac[4]; +	raw_gid[15] = mac[5]; +} + +static inline void +usnic_mac_ip_to_gid(const char *const mac, const __be32 inaddr, char *raw_gid) +{ +	raw_gid[0] = 0xfe; +	raw_gid[1] = 0x80; +	memset(&raw_gid[2], 0, 2); +	memcpy(&raw_gid[4], &inaddr, 4); +	raw_gid[8] = mac[0]^2; +	raw_gid[9] = mac[1]; +	raw_gid[10] = mac[2]; +	raw_gid[11] = 0xff; +	raw_gid[12] = 0xfe; +	raw_gid[13] = mac[3]; +	raw_gid[14] = mac[4]; +	raw_gid[15] = mac[5]; +} + +static inline void +usnic_write_gid_if_id_from_mac(char *mac, char *raw_gid) +{ +	raw_gid[8] = mac[0]^2; +	raw_gid[9] = mac[1]; +	raw_gid[10] = mac[2]; +	raw_gid[11] = 0xff; +	raw_gid[12] = 0xfe; +	raw_gid[13] = mac[3]; +	raw_gid[14] = mac[4]; +	raw_gid[15] = mac[5]; +} + +#endif /* USNIC_COMMON_UTIL_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c new file mode 100644 index 00000000000..5d13860161a --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
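usnic_common_util.h above builds the port GID in the usual EUI-64 style: an fe80:: prefix, the IPv4 address in bytes 4-7 for the UDP variant, the MAC split around ff:fe, and the universal/local bit flipped via mac[0]^2. A standalone check of that layout (main() and the sample MAC/IP are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

/* same layout as usnic_mac_ip_to_gid() above */
static void mac_ip_to_gid(const uint8_t *mac, uint32_t inaddr_be, uint8_t *gid)
{
	gid[0] = 0xfe;
	gid[1] = 0x80;
	memset(&gid[2], 0, 2);
	memcpy(&gid[4], &inaddr_be, 4);
	gid[8]  = mac[0] ^ 2;		/* flip the universal/local bit */
	gid[9]  = mac[1];
	gid[10] = mac[2];
	gid[11] = 0xff;
	gid[12] = 0xfe;
	gid[13] = mac[3];
	gid[14] = mac[4];
	gid[15] = mac[5];
}

int main(void)
{
	uint8_t mac[6] = { 0x00, 0x25, 0xb5, 0x01, 0x02, 0x03 };
	uint8_t gid[16];
	int i;

	mac_ip_to_gid(mac, inet_addr("192.0.2.10"), gid);
	for (i = 0; i < 16; i++)
		printf("%02x%s", gid[i], i == 15 ? "\n" : (i & 1) ? ":" : "");
	return 0;
}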
+ * + */ + +#include <linux/debugfs.h> +#include <linux/module.h> + +#include "usnic.h" +#include "usnic_log.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_transport.h" + +static struct dentry *debugfs_root; +static struct dentry *flows_dentry; + +static ssize_t usnic_debugfs_buildinfo_read(struct file *f, char __user *data, +						size_t count, loff_t *ppos) +{ +	char buf[500]; +	int res; + +	if (*ppos > 0) +		return 0; + +	res = scnprintf(buf, sizeof(buf), +			"version:       %s\n" +			"build date:    %s\n", +			DRV_VERSION, DRV_RELDATE); + +	return simple_read_from_buffer(data, count, ppos, buf, res); +} + +static const struct file_operations usnic_debugfs_buildinfo_ops = { +	.owner = THIS_MODULE, +	.open = simple_open, +	.read = usnic_debugfs_buildinfo_read +}; + +static ssize_t flowinfo_read(struct file *f, char __user *data, +				size_t count, loff_t *ppos) +{ +	struct usnic_ib_qp_grp_flow *qp_flow; +	int n; +	int left; +	char *ptr; +	char buf[512]; + +	qp_flow = f->private_data; +	ptr = buf; +	left = count; + +	if (*ppos > 0) +		return 0; + +	spin_lock(&qp_flow->qp_grp->lock); +	n = scnprintf(ptr, left, +			"QP Grp ID: %d Transport: %s ", +			qp_flow->qp_grp->grp_id, +			usnic_transport_to_str(qp_flow->trans_type)); +	UPDATE_PTR_LEFT(n, ptr, left); +	if (qp_flow->trans_type == USNIC_TRANSPORT_ROCE_CUSTOM) { +		n = scnprintf(ptr, left, "Port_Num:%hu\n", +					qp_flow->usnic_roce.port_num); +		UPDATE_PTR_LEFT(n, ptr, left); +	} else if (qp_flow->trans_type == USNIC_TRANSPORT_IPV4_UDP) { +		n = usnic_transport_sock_to_str(ptr, left, +				qp_flow->udp.sock); +		UPDATE_PTR_LEFT(n, ptr, left); +		n = scnprintf(ptr, left, "\n"); +		UPDATE_PTR_LEFT(n, ptr, left); +	} +	spin_unlock(&qp_flow->qp_grp->lock); + +	return simple_read_from_buffer(data, count, ppos, buf, ptr - buf); +} + +static const struct file_operations flowinfo_ops = { +	.owner = THIS_MODULE, +	.open = simple_open, +	.read = flowinfo_read, +}; + +void usnic_debugfs_init(void) +{ +	debugfs_root = debugfs_create_dir(DRV_NAME, NULL); +	if (IS_ERR(debugfs_root)) { +		usnic_err("Failed to create debugfs root dir, check if debugfs is enabled in kernel configuration\n"); +		goto out_clear_root; +	} + +	flows_dentry = debugfs_create_dir("flows", debugfs_root); +	if (IS_ERR_OR_NULL(flows_dentry)) { +		usnic_err("Failed to create debugfs flow dir with err %ld\n", +				PTR_ERR(flows_dentry)); +		goto out_free_root; +	} + +	debugfs_create_file("build-info", S_IRUGO, debugfs_root, +				NULL, &usnic_debugfs_buildinfo_ops); +	return; + +out_free_root: +	debugfs_remove_recursive(debugfs_root); +out_clear_root: +	debugfs_root = NULL; +} + +void usnic_debugfs_exit(void) +{ +	if (!debugfs_root) +		return; + +	debugfs_remove_recursive(debugfs_root); +	debugfs_root = NULL; +} + +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow) +{ +	if (IS_ERR_OR_NULL(flows_dentry)) +		return; + +	scnprintf(qp_flow->dentry_name, sizeof(qp_flow->dentry_name), +			"%u", qp_flow->flow->flow_id); +	qp_flow->dbgfs_dentry = debugfs_create_file(qp_flow->dentry_name, +							S_IRUGO, +							flows_dentry, +							qp_flow, +							&flowinfo_ops); +	if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) { +		usnic_err("Failed to create dbg fs entry for flow %u\n", +				qp_flow->flow->flow_id); +	} +} + +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow) +{ +	if (!IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) +		debugfs_remove(qp_flow->dbgfs_dentry); +} diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.h 
b/drivers/infiniband/hw/usnic/usnic_debugfs.h new file mode 100644 index 00000000000..4087d24a88f --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#ifndef USNIC_DEBUGFS_H_ +#define USNIC_DEBUGFS_H_ + +#include "usnic_ib_qp_grp.h" + +void usnic_debugfs_init(void); + +void usnic_debugfs_exit(void); +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow); +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow); + +#endif /*!USNIC_DEBUGFS_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.c b/drivers/infiniband/hw/usnic/usnic_fwd.c new file mode 100644 index 00000000000..e3c9bd9d3ba --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_fwd.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
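flowinfo_read() in usnic_debugfs.c above assembles its one-line report with repeated scnprintf() calls, advancing the buffer pointer and shrinking the remaining space through the UPDATE_PTR_LEFT macro (defined in usnic_ib.h later in this patch). A standalone userspace sketch of the same bounded-append idiom; scnprintf_like() only approximates the kernel's scnprintf() return-value semantics and is not a real libc function:

#include <stdarg.h>
#include <stdio.h>

/* approximates scnprintf(): returns the number of characters stored */
static int scnprintf_like(char *buf, size_t size, const char *fmt, ...)
{
	va_list ap;
	int n;

	if (size == 0)
		return 0;
	va_start(ap, fmt);
	n = vsnprintf(buf, size, fmt, ap);
	va_end(ap);
	if (n < 0)
		return 0;
	return (size_t)n >= size ? (int)(size - 1) : n;
}

#define UPDATE_PTR_LEFT(N, P, L)	\
do {					\
	(L) -= (N);			\
	(P) += (N);			\
} while (0)

int main(void)
{
	char buf[64];
	char *ptr = buf;
	int left = sizeof(buf);
	int n;

	n = scnprintf_like(ptr, left, "QP Grp ID: %d ", 7);
	UPDATE_PTR_LEFT(n, ptr, left);
	n = scnprintf_like(ptr, left, "Transport: %s\n", "PROTO_UDP");
	UPDATE_PTR_LEFT(n, ptr, left);
	printf("%s", buf);
	return 0;
}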
+ * + */ +#include <linux/netdevice.h> +#include <linux/pci.h> + +#include "enic_api.h" +#include "usnic_common_pkt_hdr.h" +#include "usnic_fwd.h" +#include "usnic_log.h" + +static int usnic_fwd_devcmd_locked(struct usnic_fwd_dev *ufdev, int vnic_idx, +					enum vnic_devcmd_cmd cmd, u64 *a0, +					u64 *a1) +{ +	int status; +	struct net_device *netdev = ufdev->netdev; + +	lockdep_assert_held(&ufdev->lock); + +	status = enic_api_devcmd_proxy_by_index(netdev, +			vnic_idx, +			cmd, +			a0, a1, +			1000); +	if (status) { +		if (status == ERR_EINVAL && cmd == CMD_DEL_FILTER) { +			usnic_dbg("Dev %s vnic idx %u cmd %u already deleted", +					ufdev->name, vnic_idx, cmd); +		} else { +			usnic_err("Dev %s vnic idx %u cmd %u failed with status %d\n", +					ufdev->name, vnic_idx, cmd, +					status); +		} +	} else { +		usnic_dbg("Dev %s vnic idx %u cmd %u success", +				ufdev->name, vnic_idx, cmd); +	} + +	return status; +} + +static int usnic_fwd_devcmd(struct usnic_fwd_dev *ufdev, int vnic_idx, +				enum vnic_devcmd_cmd cmd, u64 *a0, u64 *a1) +{ +	int status; + +	spin_lock(&ufdev->lock); +	status = usnic_fwd_devcmd_locked(ufdev, vnic_idx, cmd, a0, a1); +	spin_unlock(&ufdev->lock); + +	return status; +} + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev) +{ +	struct usnic_fwd_dev *ufdev; + +	ufdev = kzalloc(sizeof(*ufdev), GFP_KERNEL); +	if (!ufdev) +		return NULL; + +	ufdev->pdev = pdev; +	ufdev->netdev = pci_get_drvdata(pdev); +	spin_lock_init(&ufdev->lock); +	strncpy(ufdev->name, netdev_name(ufdev->netdev), +			sizeof(ufdev->name) - 1); + +	return ufdev; +} + +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev) +{ +	kfree(ufdev); +} + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]) +{ +	spin_lock(&ufdev->lock); +	memcpy(&ufdev->mac, mac, sizeof(ufdev->mac)); +	spin_unlock(&ufdev->lock); +} + +int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr) +{ +	int status; + +	spin_lock(&ufdev->lock); +	if (ufdev->inaddr == 0) { +		ufdev->inaddr = inaddr; +		status = 0; +	} else { +		status = -EFAULT; +	} +	spin_unlock(&ufdev->lock); + +	return status; +} + +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev) +{ +	spin_lock(&ufdev->lock); +	ufdev->inaddr = 0; +	spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev) +{ +	spin_lock(&ufdev->lock); +	ufdev->link_up = 1; +	spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev) +{ +	spin_lock(&ufdev->lock); +	ufdev->link_up = 0; +	spin_unlock(&ufdev->lock); +} + +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu) +{ +	spin_lock(&ufdev->lock); +	ufdev->mtu = mtu; +	spin_unlock(&ufdev->lock); +} + +static int usnic_fwd_dev_ready_locked(struct usnic_fwd_dev *ufdev) +{ +	lockdep_assert_held(&ufdev->lock); + +	if (!ufdev->link_up) +		return -EPERM; + +	return 0; +} + +static int validate_filter_locked(struct usnic_fwd_dev *ufdev, +					struct filter *filter) +{ + +	lockdep_assert_held(&ufdev->lock); + +	if (filter->type == FILTER_IPV4_5TUPLE) { +		if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_AD)) +			return -EACCES; +		if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_PT)) +			return -EBUSY; +		else if (ufdev->inaddr == 0) +			return -EINVAL; +		else if (filter->u.ipv4.dst_port == 0) +			return -ERANGE; +		else if (ntohl(ufdev->inaddr) != filter->u.ipv4.dst_addr) +			return -EFAULT; +		else +			return 0; +	} + +	return 0; +} + +static void fill_tlv(struct filter_tlv *tlv, struct filter *filter, +		struct filter_action 
*action) +{ +	tlv->type = CLSF_TLV_FILTER; +	tlv->length = sizeof(struct filter); +	*((struct filter *)&tlv->val) = *filter; + +	tlv = (struct filter_tlv *)((char *)tlv + sizeof(struct filter_tlv) + +			sizeof(struct filter)); +	tlv->type = CLSF_TLV_ACTION; +	tlv->length = sizeof(struct filter_action); +	*((struct filter_action *)&tlv->val) = *action; +} + +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, +				struct usnic_filter_action *uaction) +{ +	struct filter_tlv *tlv; +	struct pci_dev *pdev; +	struct usnic_fwd_flow *flow; +	uint64_t a0, a1; +	uint64_t tlv_size; +	dma_addr_t tlv_pa; +	int status; + +	pdev = ufdev->pdev; +	tlv_size = (2*sizeof(struct filter_tlv) + sizeof(struct filter) + +			sizeof(struct filter_action)); + +	flow = kzalloc(sizeof(*flow), GFP_ATOMIC); +	if (!flow) +		return ERR_PTR(-ENOMEM); + +	tlv = pci_alloc_consistent(pdev, tlv_size, &tlv_pa); +	if (!tlv) { +		usnic_err("Failed to allocate memory\n"); +		status = -ENOMEM; +		goto out_free_flow; +	} + +	fill_tlv(tlv, filter, &uaction->action); + +	spin_lock(&ufdev->lock); +	status = usnic_fwd_dev_ready_locked(ufdev); +	if (status) { +		usnic_err("Forwarding dev %s not ready with status %d\n", +				ufdev->name, status); +		goto out_free_tlv; +	} + +	status = validate_filter_locked(ufdev, filter); +	if (status) { +		usnic_err("Failed to validate filter with status %d\n", +				status); +		goto out_free_tlv; +	} + +	/* Issue Devcmd */ +	a0 = tlv_pa; +	a1 = tlv_size; +	status = usnic_fwd_devcmd_locked(ufdev, uaction->vnic_idx, +						CMD_ADD_FILTER, &a0, &a1); +	if (status) { +		usnic_err("VF %s Filter add failed with status:%d", +				ufdev->name, status); +		status = -EFAULT; +		goto out_free_tlv; +	} else { +		usnic_dbg("VF %s FILTER ID:%llu", ufdev->name, a0); +	} + +	flow->flow_id = (uint32_t) a0; +	flow->vnic_idx = uaction->vnic_idx; +	flow->ufdev = ufdev; + +out_free_tlv: +	spin_unlock(&ufdev->lock); +	pci_free_consistent(pdev, tlv_size, tlv, tlv_pa); +	if (!status) +		return flow; +out_free_flow: +	kfree(flow); +	return ERR_PTR(status); +} + +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow) +{ +	int status; +	u64 a0, a1; + +	a0 = flow->flow_id; + +	status = usnic_fwd_devcmd(flow->ufdev, flow->vnic_idx, +					CMD_DEL_FILTER, &a0, &a1); +	if (status) { +		if (status == ERR_EINVAL) { +			usnic_dbg("Filter %u already deleted for VF Idx %u pf: %s status: %d", +					flow->flow_id, flow->vnic_idx, +					flow->ufdev->name, status); +		} else { +			usnic_err("PF %s VF Idx %u Filter: %u FILTER DELETE failed with status %d", +					flow->ufdev->name, flow->vnic_idx, +					flow->flow_id, status); +		} +		status = 0; +		/* +		 * Log the error and fake success to the caller because if +		 * a flow fails to be deleted in the firmware, it is an +		 * unrecoverable error. 
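usnic_fwd_alloc_flow() above issues CMD_ADD_FILTER with a single DMA-mapped buffer that fill_tlv() packs as a filter TLV immediately followed by an action TLV, passing the bus address and total size as the devcmd arguments. The standalone sketch below shows only that packing; the struct filter/filter_tlv/filter_action definitions here are simplified placeholders for the real enic driver types, which may differ:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* simplified stand-ins for the enic classifier types; the real definitions
 * live under drivers/net/ethernet/cisco/enic and may differ */
enum { CLSF_TLV_FILTER = 0, CLSF_TLV_ACTION = 1 };

struct filter		{ uint32_t type; uint32_t data[4]; };
struct filter_action	{ uint32_t type; uint32_t rq_idx; };

struct filter_tlv {
	uint32_t type;
	uint32_t length;
	uint32_t val[];			/* payload follows the header */
};

/* same packing as fill_tlv(): [tlv hdr][filter][tlv hdr][action] */
static void *pack_filter_tlvs(const struct filter *f,
			      const struct filter_action *a, size_t *sz)
{
	struct filter_tlv *tlv;
	char *buf;

	*sz = 2 * sizeof(struct filter_tlv) + sizeof(*f) + sizeof(*a);
	buf = calloc(1, *sz);
	if (!buf)
		return NULL;

	tlv = (struct filter_tlv *)buf;
	tlv->type = CLSF_TLV_FILTER;
	tlv->length = sizeof(*f);
	memcpy(tlv->val, f, sizeof(*f));

	tlv = (struct filter_tlv *)(buf + sizeof(*tlv) + sizeof(*f));
	tlv->type = CLSF_TLV_ACTION;
	tlv->length = sizeof(*a);
	memcpy(tlv->val, a, sizeof(*a));

	return buf;
}

int main(void)
{
	struct filter f = { .type = 1 };
	struct filter_action a = { .rq_idx = 0 };
	size_t sz;
	void *buf = pack_filter_tlvs(&f, &a, &sz);

	printf("devcmd buffer: %zu bytes\n", sz);
	free(buf);
	return 0;
}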
+		 */ +	} else { +		usnic_dbg("PF %s VF Idx %u Filter: %u FILTER DELETED", +				flow->ufdev->name, flow->vnic_idx, +				flow->flow_id); +	} + +	kfree(flow); +	return status; +} + +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ +	int status; +	struct net_device *pf_netdev; +	u64 a0, a1; + +	pf_netdev = ufdev->netdev; +	a0 = qp_idx; +	a1 = CMD_QP_RQWQ; + +	status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_ENABLE, +						&a0, &a1); +	if (status) { +		usnic_err("PF %s VNIC Index %u RQ Index: %u ENABLE Failed with status %d", +				netdev_name(pf_netdev), +				vnic_idx, +				qp_idx, +				status); +	} else { +		usnic_dbg("PF %s VNIC Index %u RQ Index: %u ENABLED", +				netdev_name(pf_netdev), +				vnic_idx, qp_idx); +	} + +	return status; +} + +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ +	int status; +	u64 a0, a1; +	struct net_device *pf_netdev; + +	pf_netdev = ufdev->netdev; +	a0 = qp_idx; +	a1 = CMD_QP_RQWQ; + +	status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_DISABLE, +			&a0, &a1); +	if (status) { +		usnic_err("PF %s VNIC Index %u RQ Index: %u DISABLE Failed with status %d", +				netdev_name(pf_netdev), +				vnic_idx, +				qp_idx, +				status); +	} else { +		usnic_dbg("PF %s VNIC Index %u RQ Index: %u DISABLED", +				netdev_name(pf_netdev), +				vnic_idx, +				qp_idx); +	} + +	return status; +} diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.h b/drivers/infiniband/hw/usnic/usnic_fwd.h new file mode 100644 index 00000000000..93713a2230b --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_fwd.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_FWD_H_ +#define USNIC_FWD_H_ + +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/pci.h> +#include <linux/in.h> + +#include "usnic_abi.h" +#include "usnic_common_pkt_hdr.h" +#include "vnic_devcmd.h" + +struct usnic_fwd_dev { +	struct pci_dev			*pdev; +	struct net_device		*netdev; +	spinlock_t			lock; +	/* +	 * The following fields can be read directly off the device. +	 * However, they should be set by a accessor function, except name, +	 * which cannot be changed. 
+	 */ +	bool				link_up; +	char				mac[ETH_ALEN]; +	unsigned int			mtu; +	__be32				inaddr; +	char				name[IFNAMSIZ+1]; +}; + +struct usnic_fwd_flow { +	uint32_t			flow_id; +	struct usnic_fwd_dev		*ufdev; +	unsigned int			vnic_idx; +}; + +struct usnic_filter_action { +	int				vnic_idx; +	struct filter_action		action; +}; + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev); +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev); + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]); +int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr); +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev); +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu); + +/* + * Allocate a flow on this forwarding device. Whoever calls this function, + * must monitor netdev events on ufdev's netdevice. If NETDEV_REBOOT or + * NETDEV_DOWN is seen, flow will no longer function and must be + * immediately freed by calling usnic_dealloc_flow. + */ +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, +				struct usnic_filter_action *action); +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow); +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); + +static inline void usnic_fwd_init_usnic_filter(struct filter *filter, +						uint32_t usnic_id) +{ +	filter->type = FILTER_USNIC_ID; +	filter->u.usnic.ethtype = USNIC_ROCE_ETHERTYPE; +	filter->u.usnic.flags = FILTER_FIELD_USNIC_ETHTYPE | +				FILTER_FIELD_USNIC_ID | +				FILTER_FIELD_USNIC_PROTO; +	filter->u.usnic.proto_version = (USNIC_ROCE_GRH_VER << +					 USNIC_ROCE_GRH_VER_SHIFT) | +					 USNIC_PROTO_VER; +	filter->u.usnic.usnic_id = usnic_id; +} + +static inline void usnic_fwd_init_udp_filter(struct filter *filter, +						uint32_t daddr, uint16_t dport) +{ +	filter->type = FILTER_IPV4_5TUPLE; +	filter->u.ipv4.flags = FILTER_FIELD_5TUP_PROTO; +	filter->u.ipv4.protocol = PROTO_UDP; + +	if (daddr) { +		filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_AD; +		filter->u.ipv4.dst_addr = daddr; +	} + +	if (dport) { +		filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_PT; +		filter->u.ipv4.dst_port = dport; +	} +} + +#endif /* !USNIC_FWD_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h new file mode 100644 index 00000000000..e5a9297dd1b --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
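usnic_fwd_init_usnic_filter() above packs the RoCE GRH version into the high nibble and the usNIC protocol version into the low nibble of the filter's proto_version byte; with the constants from usnic_common_pkt_hdr.h that works out to 0x81. A trivial standalone check:

#include <assert.h>
#include <stdio.h>

#define USNIC_ROCE_GRH_VER		8
#define USNIC_PROTO_VER			1
#define USNIC_ROCE_GRH_VER_SHIFT	4

int main(void)
{
	unsigned int proto_version =
		(USNIC_ROCE_GRH_VER << USNIC_ROCE_GRH_VER_SHIFT) |
		USNIC_PROTO_VER;

	assert(proto_version == 0x81);
	printf("usnic filter proto_version byte: 0x%02x\n", proto_version);
	return 0;
}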
+ * + */ + +#ifndef USNIC_IB_H_ +#define USNIC_IB_H_ + +#include <linux/iommu.h> +#include <linux/netdevice.h> + +#include <rdma/ib_verbs.h> + + +#include "usnic.h" +#include "usnic_abi.h" +#include "usnic_vnic.h" + +#define USNIC_IB_PORT_CNT		1 +#define USNIC_IB_NUM_COMP_VECTORS	1 + +extern unsigned int usnic_ib_share_vf; + +struct usnic_ib_ucontext { +	struct ib_ucontext		ibucontext; +	/* Protected by usnic_ib_dev->usdev_lock */ +	struct list_head		qp_grp_list; +	struct list_head		link; +}; + +struct usnic_ib_pd { +	struct ib_pd			ibpd; +	struct usnic_uiom_pd		*umem_pd; +}; + +struct usnic_ib_mr { +	struct ib_mr			ibmr; +	struct usnic_uiom_reg		*umem; +}; + +struct usnic_ib_dev { +	struct ib_device		ib_dev; +	struct pci_dev			*pdev; +	struct net_device		*netdev; +	struct usnic_fwd_dev		*ufdev; +	struct list_head		ib_dev_link; +	struct list_head		vf_dev_list; +	struct list_head		ctx_list; +	struct mutex			usdev_lock; + +	/* provisioning information */ +	struct kref			vf_cnt; +	unsigned int			vf_res_cnt[USNIC_VNIC_RES_TYPE_MAX]; + +	/* sysfs vars for QPN reporting */ +	struct kobject *qpn_kobj; +}; + +struct usnic_ib_vf { +	struct usnic_ib_dev		*pf; +	spinlock_t			lock; +	struct usnic_vnic		*vnic; +	unsigned int			qp_grp_ref_cnt; +	struct usnic_ib_pd		*pd; +	struct list_head		link; +}; + +static inline +struct usnic_ib_dev *to_usdev(struct ib_device *ibdev) +{ +	return container_of(ibdev, struct usnic_ib_dev, ib_dev); +} + +static inline +struct usnic_ib_ucontext *to_ucontext(struct ib_ucontext *ibucontext) +{ +	return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_pd *to_upd(struct ib_pd *ibpd) +{ +	return container_of(ibpd, struct usnic_ib_pd, ibpd); +} + +static inline +struct usnic_ib_ucontext *to_uucontext(struct ib_ucontext *ibucontext) +{ +	return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_mr *to_umr(struct ib_mr *ibmr) +{ +	return container_of(ibmr, struct usnic_ib_mr, ibmr); +} +void usnic_ib_log_vf(struct usnic_ib_vf *vf); + +#define UPDATE_PTR_LEFT(N, P, L)			\ +do {							\ +	L -= (N);					\ +	P += (N);					\ +} while (0) + +#endif /* USNIC_IB_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c new file mode 100644 index 00000000000..fb6d026f92c --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -0,0 +1,682 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
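The to_usdev()/to_upd()/to_umr() helpers above are the usual container_of() idiom: the IB core only ever holds a pointer to the embedded ib_* member, and the driver recovers its wrapper struct by subtracting the member offset. A standalone illustration; struct ib_pd here is a stub, not the real verbs type:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* stub for the IB core object that gets embedded in the driver wrapper */
struct ib_pd { int dummy; };

struct usnic_ib_pd {
	struct ib_pd	ibpd;
	void		*umem_pd;
};

static struct usnic_ib_pd *to_upd(struct ib_pd *ibpd)
{
	return container_of(ibpd, struct usnic_ib_pd, ibpd);
}

int main(void)
{
	struct usnic_ib_pd upd = { .umem_pd = NULL };
	struct ib_pd *core_handle = &upd.ibpd;

	/* the core hands back only &upd.ibpd; recover the wrapper */
	printf("recovered wrapper: %s\n",
	       to_upd(core_handle) == &upd ? "yes" : "no");
	return 0;
}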
+ * + * Author: Upinder Malhi <umalhi@cisco.com> + * Author: Anant Deepak <anadeepa@cisco.com> + * Author: Cesare Cantu' <cantuc@cisco.com> + * Author: Jeff Squyres <jsquyres@cisco.com> + * Author: Kiran Thirumalai <kithirum@cisco.com> + * Author: Xuyang Wang <xuywang@cisco.com> + * Author: Reese Faucette <rfaucett@cisco.com> + * + */ + +#include <linux/module.h> +#include <linux/inetdevice.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/errno.h> +#include <linux/pci.h> +#include <linux/netdevice.h> + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> + +#include "usnic_abi.h" +#include "usnic_common_util.h" +#include "usnic_ib.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_log.h" +#include "usnic_fwd.h" +#include "usnic_debugfs.h" +#include "usnic_ib_verbs.h" +#include "usnic_transport.h" +#include "usnic_uiom.h" +#include "usnic_ib_sysfs.h" + +unsigned int usnic_log_lvl = USNIC_LOG_LVL_ERR; +unsigned int usnic_ib_share_vf = 1; + +static const char usnic_version[] = +	DRV_NAME ": Cisco VIC (USNIC) Verbs Driver v" +	DRV_VERSION " (" DRV_RELDATE ")\n"; + +static DEFINE_MUTEX(usnic_ib_ibdev_list_lock); +static LIST_HEAD(usnic_ib_ibdev_list); + +/* Callback dump funcs */ +static int usnic_ib_dump_vf_hdr(void *obj, char *buf, int buf_sz) +{ +	struct usnic_ib_vf *vf = obj; +	return scnprintf(buf, buf_sz, "PF: %s ", vf->pf->ib_dev.name); +} +/* End callback dump funcs */ + +static void usnic_ib_dump_vf(struct usnic_ib_vf *vf, char *buf, int buf_sz) +{ +	usnic_vnic_dump(vf->vnic, buf, buf_sz, vf, +			usnic_ib_dump_vf_hdr, +			usnic_ib_qp_grp_dump_hdr, usnic_ib_qp_grp_dump_rows); +} + +void usnic_ib_log_vf(struct usnic_ib_vf *vf) +{ +	char buf[1000]; +	usnic_ib_dump_vf(vf, buf, sizeof(buf)); +	usnic_dbg("%s\n", buf); +} + +/* Start of netdev section */ +static inline const char *usnic_ib_netdev_event_to_string(unsigned long event) +{ +	const char *event2str[] = {"NETDEV_NONE", "NETDEV_UP", "NETDEV_DOWN", +		"NETDEV_REBOOT", "NETDEV_CHANGE", +		"NETDEV_REGISTER", "NETDEV_UNREGISTER", "NETDEV_CHANGEMTU", +		"NETDEV_CHANGEADDR", "NETDEV_GOING_DOWN", "NETDEV_FEAT_CHANGE", +		"NETDEV_BONDING_FAILOVER", "NETDEV_PRE_UP", +		"NETDEV_PRE_TYPE_CHANGE", "NETDEV_POST_TYPE_CHANGE", +		"NETDEV_POST_INT", "NETDEV_UNREGISTER_FINAL", "NETDEV_RELEASE", +		"NETDEV_NOTIFY_PEERS", "NETDEV_JOIN" +	}; + +	if (event >= ARRAY_SIZE(event2str)) +		return "UNKNOWN_NETDEV_EVENT"; +	else +		return event2str[event]; +} + +static void usnic_ib_qp_grp_modify_active_to_err(struct usnic_ib_dev *us_ibdev) +{ +	struct usnic_ib_ucontext *ctx; +	struct usnic_ib_qp_grp *qp_grp; +	enum ib_qp_state cur_state; +	int status; + +	BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + +	list_for_each_entry(ctx, &us_ibdev->ctx_list, link) { +		list_for_each_entry(qp_grp, &ctx->qp_grp_list, link) { +			cur_state = qp_grp->state; +			if (cur_state == IB_QPS_INIT || +				cur_state == IB_QPS_RTR || +				cur_state == IB_QPS_RTS) { +				status = usnic_ib_qp_grp_modify(qp_grp, +								IB_QPS_ERR, +								NULL); +				if (status) { +					usnic_err("Failed to transistion qp grp %u from %s to %s\n", +						qp_grp->grp_id, +						usnic_ib_qp_grp_state_to_string +						(cur_state), +						usnic_ib_qp_grp_state_to_string +						(IB_QPS_ERR)); +				} +			} +		} +	} +} + +static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, +					unsigned long event) +{ +	struct net_device *netdev; +	struct ib_event ib_event; + +	memset(&ib_event, 0, sizeof(ib_event)); + +	mutex_lock(&us_ibdev->usdev_lock); +	netdev = 
us_ibdev->netdev; +	switch (event) { +	case NETDEV_REBOOT: +		usnic_info("PF Reset on %s\n", us_ibdev->ib_dev.name); +		usnic_ib_qp_grp_modify_active_to_err(us_ibdev); +		ib_event.event = IB_EVENT_PORT_ERR; +		ib_event.device = &us_ibdev->ib_dev; +		ib_event.element.port_num = 1; +		ib_dispatch_event(&ib_event); +		break; +	case NETDEV_UP: +	case NETDEV_DOWN: +	case NETDEV_CHANGE: +		if (!us_ibdev->ufdev->link_up && +				netif_carrier_ok(netdev)) { +			usnic_fwd_carrier_up(us_ibdev->ufdev); +			usnic_info("Link UP on %s\n", us_ibdev->ib_dev.name); +			ib_event.event = IB_EVENT_PORT_ACTIVE; +			ib_event.device = &us_ibdev->ib_dev; +			ib_event.element.port_num = 1; +			ib_dispatch_event(&ib_event); +		} else if (us_ibdev->ufdev->link_up && +				!netif_carrier_ok(netdev)) { +			usnic_fwd_carrier_down(us_ibdev->ufdev); +			usnic_info("Link DOWN on %s\n", us_ibdev->ib_dev.name); +			usnic_ib_qp_grp_modify_active_to_err(us_ibdev); +			ib_event.event = IB_EVENT_PORT_ERR; +			ib_event.device = &us_ibdev->ib_dev; +			ib_event.element.port_num = 1; +			ib_dispatch_event(&ib_event); +		} else { +			usnic_dbg("Ignoring %s on %s\n", +					usnic_ib_netdev_event_to_string(event), +					us_ibdev->ib_dev.name); +		} +		break; +	case NETDEV_CHANGEADDR: +		if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr, +				sizeof(us_ibdev->ufdev->mac))) { +			usnic_dbg("Ignoring addr change on %s\n", +					us_ibdev->ib_dev.name); +		} else { +			usnic_info(" %s old mac: %pM new mac: %pM\n", +					us_ibdev->ib_dev.name, +					us_ibdev->ufdev->mac, +					netdev->dev_addr); +			usnic_fwd_set_mac(us_ibdev->ufdev, netdev->dev_addr); +			usnic_ib_qp_grp_modify_active_to_err(us_ibdev); +			ib_event.event = IB_EVENT_GID_CHANGE; +			ib_event.device = &us_ibdev->ib_dev; +			ib_event.element.port_num = 1; +			ib_dispatch_event(&ib_event); +		} + +		break; +	case NETDEV_CHANGEMTU: +		if (us_ibdev->ufdev->mtu != netdev->mtu) { +			usnic_info("MTU Change on %s old: %u new: %u\n", +					us_ibdev->ib_dev.name, +					us_ibdev->ufdev->mtu, netdev->mtu); +			usnic_fwd_set_mtu(us_ibdev->ufdev, netdev->mtu); +			usnic_ib_qp_grp_modify_active_to_err(us_ibdev); +		} else { +			usnic_dbg("Ignoring MTU change on %s\n", +					us_ibdev->ib_dev.name); +		} +		break; +	default: +		usnic_dbg("Ignoring event %s on %s", +				usnic_ib_netdev_event_to_string(event), +				us_ibdev->ib_dev.name); +	} +	mutex_unlock(&us_ibdev->usdev_lock); +} + +static int usnic_ib_netdevice_event(struct notifier_block *notifier, +					unsigned long event, void *ptr) +{ +	struct usnic_ib_dev *us_ibdev; + +	struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + +	mutex_lock(&usnic_ib_ibdev_list_lock); +	list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { +		if (us_ibdev->netdev == netdev) { +			usnic_ib_handle_usdev_event(us_ibdev, event); +			break; +		} +	} +	mutex_unlock(&usnic_ib_ibdev_list_lock); + +	return NOTIFY_DONE; +} + +static struct notifier_block usnic_ib_netdevice_notifier = { +	.notifier_call = usnic_ib_netdevice_event +}; +/* End of netdev section */ + +/* Start of inet section */ +static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev, +					unsigned long event, void *ptr) +{ +	struct in_ifaddr *ifa = ptr; +	struct ib_event ib_event; + +	mutex_lock(&us_ibdev->usdev_lock); + +	switch (event) { +	case NETDEV_DOWN: +		usnic_info("%s via ip notifiers", +				usnic_ib_netdev_event_to_string(event)); +		usnic_fwd_del_ipaddr(us_ibdev->ufdev); +		usnic_ib_qp_grp_modify_active_to_err(us_ibdev); +		ib_event.event = 
IB_EVENT_GID_CHANGE; +		ib_event.device = &us_ibdev->ib_dev; +		ib_event.element.port_num = 1; +		ib_dispatch_event(&ib_event); +		break; +	case NETDEV_UP: +		usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address); +		usnic_info("%s via ip notifiers: ip %pI4", +				usnic_ib_netdev_event_to_string(event), +				&us_ibdev->ufdev->inaddr); +		ib_event.event = IB_EVENT_GID_CHANGE; +		ib_event.device = &us_ibdev->ib_dev; +		ib_event.element.port_num = 1; +		ib_dispatch_event(&ib_event); +		break; +	default: +		usnic_info("Ignoring event %s on %s", +				usnic_ib_netdev_event_to_string(event), +				us_ibdev->ib_dev.name); +	} +	mutex_unlock(&us_ibdev->usdev_lock); + +	return NOTIFY_DONE; +} + +static int usnic_ib_inetaddr_event(struct notifier_block *notifier, +					unsigned long event, void *ptr) +{ +	struct usnic_ib_dev *us_ibdev; +	struct in_ifaddr *ifa = ptr; +	struct net_device *netdev = ifa->ifa_dev->dev; + +	mutex_lock(&usnic_ib_ibdev_list_lock); +	list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { +		if (us_ibdev->netdev == netdev) { +			usnic_ib_handle_inet_event(us_ibdev, event, ptr); +			break; +		} +	} +	mutex_unlock(&usnic_ib_ibdev_list_lock); + +	return NOTIFY_DONE; +} +static struct notifier_block usnic_ib_inetaddr_notifier = { +	.notifier_call = usnic_ib_inetaddr_event +}; +/* End of inet section*/ + +/* Start of PF discovery section */ +static void *usnic_ib_device_add(struct pci_dev *dev) +{ +	struct usnic_ib_dev *us_ibdev; +	union ib_gid gid; +	struct in_ifaddr *in; +	struct net_device *netdev; + +	usnic_dbg("\n"); +	netdev = pci_get_drvdata(dev); + +	us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev)); +	if (IS_ERR_OR_NULL(us_ibdev)) { +		usnic_err("Device %s context alloc failed\n", +				netdev_name(pci_get_drvdata(dev))); +		return ERR_PTR(us_ibdev ? 
PTR_ERR(us_ibdev) : -EFAULT); +	} + +	us_ibdev->ufdev = usnic_fwd_dev_alloc(dev); +	if (IS_ERR_OR_NULL(us_ibdev->ufdev)) { +		usnic_err("Failed to alloc ufdev for %s with err %ld\n", +				pci_name(dev), PTR_ERR(us_ibdev->ufdev)); +		goto err_dealloc; +	} + +	mutex_init(&us_ibdev->usdev_lock); +	INIT_LIST_HEAD(&us_ibdev->vf_dev_list); +	INIT_LIST_HEAD(&us_ibdev->ctx_list); + +	us_ibdev->pdev = dev; +	us_ibdev->netdev = pci_get_drvdata(dev); +	us_ibdev->ib_dev.owner = THIS_MODULE; +	us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP; +	us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT; +	us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; +	us_ibdev->ib_dev.dma_device = &dev->dev; +	us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; +	strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX); + +	us_ibdev->ib_dev.uverbs_cmd_mask = +		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | +		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | +		(1ull << IB_USER_VERBS_CMD_QUERY_PORT) | +		(1ull << IB_USER_VERBS_CMD_ALLOC_PD) | +		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | +		(1ull << IB_USER_VERBS_CMD_REG_MR) | +		(1ull << IB_USER_VERBS_CMD_DEREG_MR) | +		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | +		(1ull << IB_USER_VERBS_CMD_CREATE_CQ) | +		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | +		(1ull << IB_USER_VERBS_CMD_CREATE_QP) | +		(1ull << IB_USER_VERBS_CMD_MODIFY_QP) | +		(1ull << IB_USER_VERBS_CMD_QUERY_QP) | +		(1ull << IB_USER_VERBS_CMD_DESTROY_QP) | +		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | +		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | +		(1ull << IB_USER_VERBS_CMD_OPEN_QP); + +	us_ibdev->ib_dev.query_device = usnic_ib_query_device; +	us_ibdev->ib_dev.query_port = usnic_ib_query_port; +	us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey; +	us_ibdev->ib_dev.query_gid = usnic_ib_query_gid; +	us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer; +	us_ibdev->ib_dev.alloc_pd = usnic_ib_alloc_pd; +	us_ibdev->ib_dev.dealloc_pd = usnic_ib_dealloc_pd; +	us_ibdev->ib_dev.create_qp = usnic_ib_create_qp; +	us_ibdev->ib_dev.modify_qp = usnic_ib_modify_qp; +	us_ibdev->ib_dev.query_qp = usnic_ib_query_qp; +	us_ibdev->ib_dev.destroy_qp = usnic_ib_destroy_qp; +	us_ibdev->ib_dev.create_cq = usnic_ib_create_cq; +	us_ibdev->ib_dev.destroy_cq = usnic_ib_destroy_cq; +	us_ibdev->ib_dev.reg_user_mr = usnic_ib_reg_mr; +	us_ibdev->ib_dev.dereg_mr = usnic_ib_dereg_mr; +	us_ibdev->ib_dev.alloc_ucontext = usnic_ib_alloc_ucontext; +	us_ibdev->ib_dev.dealloc_ucontext = usnic_ib_dealloc_ucontext; +	us_ibdev->ib_dev.mmap = usnic_ib_mmap; +	us_ibdev->ib_dev.create_ah = usnic_ib_create_ah; +	us_ibdev->ib_dev.destroy_ah = usnic_ib_destroy_ah; +	us_ibdev->ib_dev.post_send = usnic_ib_post_send; +	us_ibdev->ib_dev.post_recv = usnic_ib_post_recv; +	us_ibdev->ib_dev.poll_cq = usnic_ib_poll_cq; +	us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq; +	us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr; + + +	if (ib_register_device(&us_ibdev->ib_dev, NULL)) +		goto err_fwd_dealloc; + +	usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu); +	usnic_fwd_set_mac(us_ibdev->ufdev, us_ibdev->netdev->dev_addr); +	if (netif_carrier_ok(us_ibdev->netdev)) +		usnic_fwd_carrier_up(us_ibdev->ufdev); + +	in = ((struct in_device *)(netdev->ip_ptr))->ifa_list; +	if (in != NULL) +		usnic_fwd_add_ipaddr(us_ibdev->ufdev, in->ifa_address); + +	usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr, +				us_ibdev->ufdev->inaddr, &gid.raw[0]); +	memcpy(&us_ibdev->ib_dev.node_guid, &gid.global.interface_id, +		
sizeof(gid.global.interface_id)); +	kref_init(&us_ibdev->vf_cnt); + +	usnic_info("Added ibdev: %s netdev: %s with mac %pM Link: %u MTU: %u\n", +			us_ibdev->ib_dev.name, netdev_name(us_ibdev->netdev), +			us_ibdev->ufdev->mac, us_ibdev->ufdev->link_up, +			us_ibdev->ufdev->mtu); +	return us_ibdev; + +err_fwd_dealloc: +	usnic_fwd_dev_free(us_ibdev->ufdev); +err_dealloc: +	usnic_err("failed -- deallocing device\n"); +	ib_dealloc_device(&us_ibdev->ib_dev); +	return NULL; +} + +static void usnic_ib_device_remove(struct usnic_ib_dev *us_ibdev) +{ +	usnic_info("Unregistering %s\n", us_ibdev->ib_dev.name); +	usnic_ib_sysfs_unregister_usdev(us_ibdev); +	usnic_fwd_dev_free(us_ibdev->ufdev); +	ib_unregister_device(&us_ibdev->ib_dev); +	ib_dealloc_device(&us_ibdev->ib_dev); +} + +static void usnic_ib_undiscover_pf(struct kref *kref) +{ +	struct usnic_ib_dev *us_ibdev, *tmp; +	struct pci_dev *dev; +	bool found = false; + +	dev = container_of(kref, struct usnic_ib_dev, vf_cnt)->pdev; +	mutex_lock(&usnic_ib_ibdev_list_lock); +	list_for_each_entry_safe(us_ibdev, tmp, +				&usnic_ib_ibdev_list, ib_dev_link) { +		if (us_ibdev->pdev == dev) { +			list_del(&us_ibdev->ib_dev_link); +			usnic_ib_device_remove(us_ibdev); +			found = true; +			break; +		} +	} + +	WARN(!found, "Failed to remove PF %s\n", pci_name(dev)); + +	mutex_unlock(&usnic_ib_ibdev_list_lock); +} + +static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic) +{ +	struct usnic_ib_dev *us_ibdev; +	struct pci_dev *parent_pci, *vf_pci; +	int err; + +	vf_pci = usnic_vnic_get_pdev(vnic); +	parent_pci = pci_physfn(vf_pci); + +	BUG_ON(!parent_pci); + +	mutex_lock(&usnic_ib_ibdev_list_lock); +	list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { +		if (us_ibdev->pdev == parent_pci) { +			kref_get(&us_ibdev->vf_cnt); +			goto out; +		} +	} + +	us_ibdev = usnic_ib_device_add(parent_pci); +	if (IS_ERR_OR_NULL(us_ibdev)) { +		us_ibdev = us_ibdev ? us_ibdev : ERR_PTR(-EFAULT); +		goto out; +	} + +	err = usnic_ib_sysfs_register_usdev(us_ibdev); +	if (err) { +		usnic_ib_device_remove(us_ibdev); +		us_ibdev = ERR_PTR(err); +		goto out; +	} + +	list_add(&us_ibdev->ib_dev_link, &usnic_ib_ibdev_list); +out: +	mutex_unlock(&usnic_ib_ibdev_list_lock); +	return us_ibdev; +} +/* End of PF discovery section */ + +/* Start of PCI section */ + +static DEFINE_PCI_DEVICE_TABLE(usnic_ib_pci_ids) = { +	{PCI_DEVICE(PCI_VENDOR_ID_CISCO, PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC)}, +	{0,} +}; + +static int usnic_ib_pci_probe(struct pci_dev *pdev, +				const struct pci_device_id *id) +{ +	int err; +	struct usnic_ib_dev *pf; +	struct usnic_ib_vf *vf; +	enum usnic_vnic_res_type res_type; + +	vf = kzalloc(sizeof(*vf), GFP_KERNEL); +	if (!vf) +		return -ENOMEM; + +	err = pci_enable_device(pdev); +	if (err) { +		usnic_err("Failed to enable %s with err %d\n", +				pci_name(pdev), err); +		goto out_clean_vf; +	} + +	err = pci_request_regions(pdev, DRV_NAME); +	if (err) { +		usnic_err("Failed to request region for %s with err %d\n", +				pci_name(pdev), err); +		goto out_disable_device; +	} + +	pci_set_master(pdev); +	pci_set_drvdata(pdev, vf); + +	vf->vnic = usnic_vnic_alloc(pdev); +	if (IS_ERR_OR_NULL(vf->vnic)) { +		err = vf->vnic ? PTR_ERR(vf->vnic) : -ENOMEM; +		usnic_err("Failed to alloc vnic for %s with err %d\n", +				pci_name(pdev), err); +		goto out_release_regions; +	} + +	pf = usnic_ib_discover_pf(vf->vnic); +	if (IS_ERR_OR_NULL(pf)) { +		usnic_err("Failed to discover pf of vnic %s with err%ld\n", +				pci_name(pdev), PTR_ERR(pf)); +		err = pf ? 
PTR_ERR(pf) : -EFAULT; +		goto out_clean_vnic; +	} + +	vf->pf = pf; +	spin_lock_init(&vf->lock); +	mutex_lock(&pf->usdev_lock); +	list_add_tail(&vf->link, &pf->vf_dev_list); +	/* +	 * Save max settings (will be same for each VF, easier to re-write than +	 * to say "if (!set) { set_values(); set=1; } +	 */ +	for (res_type = USNIC_VNIC_RES_TYPE_EOL+1; +			res_type < USNIC_VNIC_RES_TYPE_MAX; +			res_type++) { +		pf->vf_res_cnt[res_type] = usnic_vnic_res_cnt(vf->vnic, +								res_type); +	} + +	mutex_unlock(&pf->usdev_lock); + +	usnic_info("Registering usnic VF %s into PF %s\n", pci_name(pdev), +			pf->ib_dev.name); +	usnic_ib_log_vf(vf); +	return 0; + +out_clean_vnic: +	usnic_vnic_free(vf->vnic); +out_release_regions: +	pci_set_drvdata(pdev, NULL); +	pci_clear_master(pdev); +	pci_release_regions(pdev); +out_disable_device: +	pci_disable_device(pdev); +out_clean_vf: +	kfree(vf); +	return err; +} + +static void usnic_ib_pci_remove(struct pci_dev *pdev) +{ +	struct usnic_ib_vf *vf = pci_get_drvdata(pdev); +	struct usnic_ib_dev *pf = vf->pf; + +	mutex_lock(&pf->usdev_lock); +	list_del(&vf->link); +	mutex_unlock(&pf->usdev_lock); + +	kref_put(&pf->vf_cnt, usnic_ib_undiscover_pf); +	usnic_vnic_free(vf->vnic); +	pci_set_drvdata(pdev, NULL); +	pci_clear_master(pdev); +	pci_release_regions(pdev); +	pci_disable_device(pdev); +	kfree(vf); + +	usnic_info("Removed VF %s\n", pci_name(pdev)); +} + +/* PCI driver entry points */ +static struct pci_driver usnic_ib_pci_driver = { +	.name = DRV_NAME, +	.id_table = usnic_ib_pci_ids, +	.probe = usnic_ib_pci_probe, +	.remove = usnic_ib_pci_remove, +}; +/* End of PCI section */ + +/* Start of module section */ +static int __init usnic_ib_init(void) +{ +	int err; + +	printk_once(KERN_INFO "%s", usnic_version); + +	err = usnic_uiom_init(DRV_NAME); +	if (err) { +		usnic_err("Unable to initalize umem with err %d\n", err); +		return err; +	} + +	if (pci_register_driver(&usnic_ib_pci_driver)) { +		usnic_err("Unable to register with PCI\n"); +		goto out_umem_fini; +	} + +	err = register_netdevice_notifier(&usnic_ib_netdevice_notifier); +	if (err) { +		usnic_err("Failed to register netdev notifier\n"); +		goto out_pci_unreg; +	} + +	err = register_inetaddr_notifier(&usnic_ib_inetaddr_notifier); +	if (err) { +		usnic_err("Failed to register inet addr notifier\n"); +		goto out_unreg_netdev_notifier; +	} + +	err = usnic_transport_init(); +	if (err) { +		usnic_err("Failed to initialize transport\n"); +		goto out_unreg_inetaddr_notifier; +	} + +	usnic_debugfs_init(); + +	return 0; + +out_unreg_inetaddr_notifier: +	unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); +out_unreg_netdev_notifier: +	unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); +out_pci_unreg: +	pci_unregister_driver(&usnic_ib_pci_driver); +out_umem_fini: +	usnic_uiom_fini(); + +	return err; +} + +static void __exit usnic_ib_destroy(void) +{ +	usnic_dbg("\n"); +	usnic_debugfs_exit(); +	usnic_transport_fini(); +	unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); +	unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); +	pci_unregister_driver(&usnic_ib_pci_driver); +	usnic_uiom_fini(); +} + +MODULE_DESCRIPTION("Cisco VIC (usNIC) Verbs Driver"); +MODULE_AUTHOR("Upinder Malhi <umalhi@cisco.com>"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); +module_param(usnic_log_lvl, uint, S_IRUGO | S_IWUSR); +module_param(usnic_ib_share_vf, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(usnic_log_lvl, " Off=0, Err=1, Info=2, Debug=3"); +MODULE_PARM_DESC(usnic_ib_share_vf, "Off=0, 
On=1 VF sharing amongst QPs"); +MODULE_DEVICE_TABLE(pci, usnic_ib_pci_ids); + +module_init(usnic_ib_init); +module_exit(usnic_ib_destroy); +/* End of module section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c new file mode 100644 index 00000000000..f8dfd76be89 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/bug.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/spinlock.h> + +#include "usnic_log.h" +#include "usnic_vnic.h" +#include "usnic_fwd.h" +#include "usnic_uiom.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_ib_sysfs.h" +#include "usnic_transport.h" + +#define DFLT_RQ_IDX	0 + +const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state) +{ +	switch (state) { +	case IB_QPS_RESET: +		return "Rst"; +	case IB_QPS_INIT: +		return "Init"; +	case IB_QPS_RTR: +		return "RTR"; +	case IB_QPS_RTS: +		return "RTS"; +	case IB_QPS_SQD: +		return "SQD"; +	case IB_QPS_SQE: +		return "SQE"; +	case IB_QPS_ERR: +		return "ERR"; +	default: +		return "UNKOWN STATE"; + +	} +} + +int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz) +{ +	return scnprintf(buf, buf_sz, "|QPN\t|State\t|PID\t|VF Idx\t|Fil ID"); +} + +int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz) +{ +	struct usnic_ib_qp_grp *qp_grp = obj; +	struct usnic_ib_qp_grp_flow *default_flow; +	if (obj) { +		default_flow = list_first_entry(&qp_grp->flows_lst, +					struct usnic_ib_qp_grp_flow, link); +		return scnprintf(buf, buf_sz, "|%d\t|%s\t|%d\t|%hu\t|%d", +					qp_grp->ibqp.qp_num, +					usnic_ib_qp_grp_state_to_string( +							qp_grp->state), +					qp_grp->owner_pid, +					usnic_vnic_get_index(qp_grp->vf->vnic), +					default_flow->flow->flow_id); +	} else { +		return scnprintf(buf, buf_sz, "|N/A\t|N/A\t|N/A\t|N/A\t|N/A"); +	} +} + +static struct usnic_vnic_res_chunk * +get_qp_res_chunk(struct usnic_ib_qp_grp *qp_grp) +{ +	lockdep_assert_held(&qp_grp->lock); +	/* +	 * The QP res chunk, used to derive qp indices, +	 * are just indices of the RQs +	 */ +	return usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); +} + +static int enable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ + +	int status; +	int i, vnic_idx; +	struct usnic_vnic_res_chunk *res_chunk; +	struct usnic_vnic_res *res; + +	lockdep_assert_held(&qp_grp->lock); + +	vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + +	res_chunk = get_qp_res_chunk(qp_grp); +	if (IS_ERR_OR_NULL(res_chunk)) { +		usnic_err("Unable to get qp res with err %ld\n", +				PTR_ERR(res_chunk)); +		return res_chunk ? 
PTR_ERR(res_chunk) : -ENOMEM; +	} + +	for (i = 0; i < res_chunk->cnt; i++) { +		res = res_chunk->res[i]; +		status = usnic_fwd_enable_qp(qp_grp->ufdev, vnic_idx, +						res->vnic_idx); +		if (status) { +			usnic_err("Failed to enable qp %d of %s:%d\n with err %d\n", +					res->vnic_idx, qp_grp->ufdev->name, +					vnic_idx, status); +			goto out_err; +		} +	} + +	return 0; + +out_err: +	for (i--; i >= 0; i--) { +		res = res_chunk->res[i]; +		usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, +					res->vnic_idx); +	} + +	return status; +} + +static int disable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ +	int i, vnic_idx; +	struct usnic_vnic_res_chunk *res_chunk; +	struct usnic_vnic_res *res; +	int status = 0; + +	lockdep_assert_held(&qp_grp->lock); +	vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + +	res_chunk = get_qp_res_chunk(qp_grp); +	if (IS_ERR_OR_NULL(res_chunk)) { +		usnic_err("Unable to get qp res with err %ld\n", +			PTR_ERR(res_chunk)); +		return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; +	} + +	for (i = 0; i < res_chunk->cnt; i++) { +		res = res_chunk->res[i]; +		status = usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, +						res->vnic_idx); +		if (status) { +			usnic_err("Failed to disable rq %d of %s:%d\n with err %d\n", +					res->vnic_idx, +					qp_grp->ufdev->name, +					vnic_idx, status); +		} +	} + +	return status; + +} + +static int init_filter_action(struct usnic_ib_qp_grp *qp_grp, +				struct usnic_filter_action *uaction) +{ +	struct usnic_vnic_res_chunk *res_chunk; + +	res_chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); +	if (IS_ERR_OR_NULL(res_chunk)) { +		usnic_err("Unable to get %s with err %ld\n", +			usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), +			PTR_ERR(res_chunk)); +		return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; +	} + +	uaction->vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); +	uaction->action.type = FILTER_ACTION_RQ_STEERING; +	uaction->action.u.rq_idx = res_chunk->res[DFLT_RQ_IDX]->vnic_idx; + +	return 0; +} + +static struct usnic_ib_qp_grp_flow* +create_roce_custom_flow(struct usnic_ib_qp_grp *qp_grp, +			struct usnic_transport_spec *trans_spec) +{ +	uint16_t port_num; +	int err; +	struct filter filter; +	struct usnic_filter_action uaction; +	struct usnic_ib_qp_grp_flow *qp_flow; +	struct usnic_fwd_flow *flow; +	enum usnic_transport_type trans_type; + +	trans_type = trans_spec->trans_type; +	port_num = trans_spec->usnic_roce.port_num; + +	/* Reserve Port */ +	port_num = usnic_transport_rsrv_port(trans_type, port_num); +	if (port_num == 0) +		return ERR_PTR(-EINVAL); + +	/* Create Flow */ +	usnic_fwd_init_usnic_filter(&filter, port_num); +	err = init_filter_action(qp_grp, &uaction); +	if (err) +		goto out_unreserve_port; + +	flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); +	if (IS_ERR_OR_NULL(flow)) { +		usnic_err("Unable to alloc flow failed with err %ld\n", +				PTR_ERR(flow)); +		err = flow ? PTR_ERR(flow) : -EFAULT; +		goto out_unreserve_port; +	} + +	/* Create Flow Handle */ +	qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); +	if (IS_ERR_OR_NULL(qp_flow)) { +		err = qp_flow ? 
PTR_ERR(qp_flow) : -ENOMEM; +		goto out_dealloc_flow; +	} +	qp_flow->flow = flow; +	qp_flow->trans_type = trans_type; +	qp_flow->usnic_roce.port_num = port_num; +	qp_flow->qp_grp = qp_grp; +	return qp_flow; + +out_dealloc_flow: +	usnic_fwd_dealloc_flow(flow); +out_unreserve_port: +	usnic_transport_unrsrv_port(trans_type, port_num); +	return ERR_PTR(err); +} + +static void release_roce_custom_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ +	usnic_fwd_dealloc_flow(qp_flow->flow); +	usnic_transport_unrsrv_port(qp_flow->trans_type, +					qp_flow->usnic_roce.port_num); +	kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_udp_flow(struct usnic_ib_qp_grp *qp_grp, +		struct usnic_transport_spec *trans_spec) +{ +	struct socket *sock; +	int sock_fd; +	int err; +	struct filter filter; +	struct usnic_filter_action uaction; +	struct usnic_ib_qp_grp_flow *qp_flow; +	struct usnic_fwd_flow *flow; +	enum usnic_transport_type trans_type; +	uint32_t addr; +	uint16_t port_num; +	int proto; + +	trans_type = trans_spec->trans_type; +	sock_fd = trans_spec->udp.sock_fd; + +	/* Get and check socket */ +	sock = usnic_transport_get_socket(sock_fd); +	if (IS_ERR_OR_NULL(sock)) +		return ERR_CAST(sock); + +	err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port_num); +	if (err) +		goto out_put_sock; + +	if (proto != IPPROTO_UDP) { +		usnic_err("Protocol for fd %d is not UDP", sock_fd); +		err = -EPERM; +		goto out_put_sock; +	} + +	/* Create flow */ +	usnic_fwd_init_udp_filter(&filter, addr, port_num); +	err = init_filter_action(qp_grp, &uaction); +	if (err) +		goto out_put_sock; + +	flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); +	if (IS_ERR_OR_NULL(flow)) { +		usnic_err("Unable to alloc flow failed with err %ld\n", +				PTR_ERR(flow)); +		err = flow ? PTR_ERR(flow) : -EFAULT; +		goto out_put_sock; +	} + +	/* Create qp_flow */ +	qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); +	if (IS_ERR_OR_NULL(qp_flow)) { +		err = qp_flow ? 
PTR_ERR(qp_flow) : -ENOMEM; +		goto out_dealloc_flow; +	} +	qp_flow->flow = flow; +	qp_flow->trans_type = trans_type; +	qp_flow->udp.sock = sock; +	qp_flow->qp_grp = qp_grp; +	return qp_flow; + +out_dealloc_flow: +	usnic_fwd_dealloc_flow(flow); +out_put_sock: +	usnic_transport_put_socket(sock); +	return ERR_PTR(err); +} + +static void release_udp_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ +	usnic_fwd_dealloc_flow(qp_flow->flow); +	usnic_transport_put_socket(qp_flow->udp.sock); +	kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_and_add_flow(struct usnic_ib_qp_grp *qp_grp, +			struct usnic_transport_spec *trans_spec) +{ +	struct usnic_ib_qp_grp_flow *qp_flow; +	enum usnic_transport_type trans_type; + +	trans_type = trans_spec->trans_type; +	switch (trans_type) { +	case USNIC_TRANSPORT_ROCE_CUSTOM: +		qp_flow = create_roce_custom_flow(qp_grp, trans_spec); +		break; +	case USNIC_TRANSPORT_IPV4_UDP: +		qp_flow = create_udp_flow(qp_grp, trans_spec); +		break; +	default: +		usnic_err("Unsupported transport %u\n", +				trans_spec->trans_type); +		return ERR_PTR(-EINVAL); +	} + +	if (!IS_ERR_OR_NULL(qp_flow)) { +		list_add_tail(&qp_flow->link, &qp_grp->flows_lst); +		usnic_debugfs_flow_add(qp_flow); +	} + + +	return qp_flow; +} + +static void release_and_remove_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ +	usnic_debugfs_flow_remove(qp_flow); +	list_del(&qp_flow->link); + +	switch (qp_flow->trans_type) { +	case USNIC_TRANSPORT_ROCE_CUSTOM: +		release_roce_custom_flow(qp_flow); +		break; +	case USNIC_TRANSPORT_IPV4_UDP: +		release_udp_flow(qp_flow); +		break; +	default: +		WARN(1, "Unsupported transport %u\n", +				qp_flow->trans_type); +		break; +	} +} + +static void release_and_remove_all_flows(struct usnic_ib_qp_grp *qp_grp) +{ +	struct usnic_ib_qp_grp_flow *qp_flow, *tmp; +	list_for_each_entry_safe(qp_flow, tmp, &qp_grp->flows_lst, link) +		release_and_remove_flow(qp_flow); +} + +int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, +				enum ib_qp_state new_state, +				void *data) +{ +	int status = 0; +	int vnic_idx; +	struct ib_event ib_event; +	enum ib_qp_state old_state; +	struct usnic_transport_spec *trans_spec; +	struct usnic_ib_qp_grp_flow *qp_flow; + +	old_state = qp_grp->state; +	vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); +	trans_spec = (struct usnic_transport_spec *) data; + +	spin_lock(&qp_grp->lock); +	switch (new_state) { +	case IB_QPS_RESET: +		switch (old_state) { +		case IB_QPS_RESET: +			/* NO-OP */ +			break; +		case IB_QPS_INIT: +			release_and_remove_all_flows(qp_grp); +			status = 0; +			break; +		case IB_QPS_RTR: +		case IB_QPS_RTS: +		case IB_QPS_ERR: +			status = disable_qp_grp(qp_grp); +			release_and_remove_all_flows(qp_grp); +			break; +		default: +			status = -EINVAL; +		} +		break; +	case IB_QPS_INIT: +		switch (old_state) { +		case IB_QPS_RESET: +			if (trans_spec) { +				qp_flow = create_and_add_flow(qp_grp, +								trans_spec); +				if (IS_ERR_OR_NULL(qp_flow)) { +					status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; +					break; +				} +			} else { +				/* +				 * Optional to specify filters. +				 */ +				status = 0; +			} +			break; +		case IB_QPS_INIT: +			if (trans_spec) { +				qp_flow = create_and_add_flow(qp_grp, +								trans_spec); +				if (IS_ERR_OR_NULL(qp_flow)) { +					status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; +					break; +				} +			} else { +				/* +				 * Doesn't make sense to go into INIT state +				 * from INIT state w/o adding filters. 
+				 */ +				status = -EINVAL; +			} +			break; +		case IB_QPS_RTR: +			status = disable_qp_grp(qp_grp); +			break; +		case IB_QPS_RTS: +			status = disable_qp_grp(qp_grp); +			break; +		default: +			status = -EINVAL; +		} +		break; +	case IB_QPS_RTR: +		switch (old_state) { +		case IB_QPS_INIT: +			status = enable_qp_grp(qp_grp); +			break; +		default: +			status = -EINVAL; +		} +		break; +	case IB_QPS_RTS: +		switch (old_state) { +		case IB_QPS_RTR: +			/* NO-OP FOR NOW */ +			break; +		default: +			status = -EINVAL; +		} +		break; +	case IB_QPS_ERR: +		ib_event.device = &qp_grp->vf->pf->ib_dev; +		ib_event.element.qp = &qp_grp->ibqp; +		ib_event.event = IB_EVENT_QP_FATAL; + +		switch (old_state) { +		case IB_QPS_RESET: +			qp_grp->ibqp.event_handler(&ib_event, +					qp_grp->ibqp.qp_context); +			break; +		case IB_QPS_INIT: +			release_and_remove_all_flows(qp_grp); +			qp_grp->ibqp.event_handler(&ib_event, +					qp_grp->ibqp.qp_context); +			break; +		case IB_QPS_RTR: +		case IB_QPS_RTS: +			status = disable_qp_grp(qp_grp); +			release_and_remove_all_flows(qp_grp); +			qp_grp->ibqp.event_handler(&ib_event, +					qp_grp->ibqp.qp_context); +			break; +		default: +			status = -EINVAL; +		} +		break; +	default: +		status = -EINVAL; +	} +	spin_unlock(&qp_grp->lock); + +	if (!status) { +		qp_grp->state = new_state; +		usnic_info("Transistioned %u from %s to %s", +		qp_grp->grp_id, +		usnic_ib_qp_grp_state_to_string(old_state), +		usnic_ib_qp_grp_state_to_string(new_state)); +	} else { +		usnic_err("Failed to transistion %u from %s to %s", +		qp_grp->grp_id, +		usnic_ib_qp_grp_state_to_string(old_state), +		usnic_ib_qp_grp_state_to_string(new_state)); +	} + +	return status; +} + +static struct usnic_vnic_res_chunk** +alloc_res_chunk_list(struct usnic_vnic *vnic, +			struct usnic_vnic_res_spec *res_spec, void *owner_obj) +{ +	enum usnic_vnic_res_type res_type; +	struct usnic_vnic_res_chunk **res_chunk_list; +	int err, i, res_cnt, res_lst_sz; + +	for (res_lst_sz = 0; +		res_spec->resources[res_lst_sz].type != USNIC_VNIC_RES_TYPE_EOL; +		res_lst_sz++) { +		/* Do Nothing */ +	} + +	res_chunk_list = kzalloc(sizeof(*res_chunk_list)*(res_lst_sz+1), +					GFP_ATOMIC); +	if (!res_chunk_list) +		return ERR_PTR(-ENOMEM); + +	for (i = 0; res_spec->resources[i].type != USNIC_VNIC_RES_TYPE_EOL; +		i++) { +		res_type = res_spec->resources[i].type; +		res_cnt = res_spec->resources[i].cnt; + +		res_chunk_list[i] = usnic_vnic_get_resources(vnic, res_type, +					res_cnt, owner_obj); +		if (IS_ERR_OR_NULL(res_chunk_list[i])) { +			err = res_chunk_list[i] ? 
+					PTR_ERR(res_chunk_list[i]) : -ENOMEM; +			usnic_err("Failed to get %s from %s with err %d\n", +				usnic_vnic_res_type_to_str(res_type), +				usnic_vnic_pci_name(vnic), +				err); +			goto out_free_res; +		} +	} + +	return res_chunk_list; + +out_free_res: +	for (i--; i > 0; i--) +		usnic_vnic_put_resources(res_chunk_list[i]); +	kfree(res_chunk_list); +	return ERR_PTR(err); +} + +static void free_qp_grp_res(struct usnic_vnic_res_chunk **res_chunk_list) +{ +	int i; +	for (i = 0; res_chunk_list[i]; i++) +		usnic_vnic_put_resources(res_chunk_list[i]); +	kfree(res_chunk_list); +} + +static int qp_grp_and_vf_bind(struct usnic_ib_vf *vf, +				struct usnic_ib_pd *pd, +				struct usnic_ib_qp_grp *qp_grp) +{ +	int err; +	struct pci_dev *pdev; + +	lockdep_assert_held(&vf->lock); + +	pdev = usnic_vnic_get_pdev(vf->vnic); +	if (vf->qp_grp_ref_cnt == 0) { +		err = usnic_uiom_attach_dev_to_pd(pd->umem_pd, &pdev->dev); +		if (err) { +			usnic_err("Failed to attach %s to domain\n", +					pci_name(pdev)); +			return err; +		} +		vf->pd = pd; +	} +	vf->qp_grp_ref_cnt++; + +	WARN_ON(vf->pd != pd); +	qp_grp->vf = vf; + +	return 0; +} + +static void qp_grp_and_vf_unbind(struct usnic_ib_qp_grp *qp_grp) +{ +	struct pci_dev *pdev; +	struct usnic_ib_pd *pd; + +	lockdep_assert_held(&qp_grp->vf->lock); + +	pd = qp_grp->vf->pd; +	pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); +	if (--qp_grp->vf->qp_grp_ref_cnt == 0) { +		qp_grp->vf->pd = NULL; +		usnic_uiom_detach_dev_from_pd(pd->umem_pd, &pdev->dev); +	} +	qp_grp->vf = NULL; +} + +static void log_spec(struct usnic_vnic_res_spec *res_spec) +{ +	char buf[512]; +	usnic_vnic_spec_dump(buf, sizeof(buf), res_spec); +	usnic_dbg("%s\n", buf); +} + +static int qp_grp_id_from_flow(struct usnic_ib_qp_grp_flow *qp_flow, +				uint32_t *id) +{ +	enum usnic_transport_type trans_type = qp_flow->trans_type; +	int err; +	uint16_t port_num = 0; + +	switch (trans_type) { +	case USNIC_TRANSPORT_ROCE_CUSTOM: +		*id = qp_flow->usnic_roce.port_num; +		break; +	case USNIC_TRANSPORT_IPV4_UDP: +		err = usnic_transport_sock_get_addr(qp_flow->udp.sock, +							NULL, NULL, +							&port_num); +		if (err) +			return err; +		/* +		 * Copy port_num to stack first and then to *id, +		 * so that the short to int cast works for little +		 * and big endian systems. +		 */ +		*id = port_num; +		break; +	default: +		usnic_err("Unsupported transport %u\n", trans_type); +		return -EINVAL; +	} + +	return 0; +} + +struct usnic_ib_qp_grp * +usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf, +			struct usnic_ib_pd *pd, +			struct usnic_vnic_res_spec *res_spec, +			struct usnic_transport_spec *transport_spec) +{ +	struct usnic_ib_qp_grp *qp_grp; +	int err; +	enum usnic_transport_type transport = transport_spec->trans_type; +	struct usnic_ib_qp_grp_flow *qp_flow; + +	lockdep_assert_held(&vf->lock); + +	err = usnic_vnic_res_spec_satisfied(&min_transport_spec[transport], +						res_spec); +	if (err) { +		usnic_err("Spec does not meet miniumum req for transport %d\n", +				transport); +		log_spec(res_spec); +		return ERR_PTR(err); +	} + +	qp_grp = kzalloc(sizeof(*qp_grp), GFP_ATOMIC); +	if (!qp_grp) { +		usnic_err("Unable to alloc qp_grp - Out of memory\n"); +		return NULL; +	} + +	qp_grp->res_chunk_list = alloc_res_chunk_list(vf->vnic, res_spec, +							qp_grp); +	if (IS_ERR_OR_NULL(qp_grp->res_chunk_list)) { +		err = qp_grp->res_chunk_list ? 
+				PTR_ERR(qp_grp->res_chunk_list) : -ENOMEM; +		usnic_err("Unable to alloc res for %d with err %d\n", +				qp_grp->grp_id, err); +		goto out_free_qp_grp; +	} + +	err = qp_grp_and_vf_bind(vf, pd, qp_grp); +	if (err) +		goto out_free_res; + +	INIT_LIST_HEAD(&qp_grp->flows_lst); +	spin_lock_init(&qp_grp->lock); +	qp_grp->ufdev = ufdev; +	qp_grp->state = IB_QPS_RESET; +	qp_grp->owner_pid = current->pid; + +	qp_flow = create_and_add_flow(qp_grp, transport_spec); +	if (IS_ERR_OR_NULL(qp_flow)) { +		usnic_err("Unable to create and add flow with err %ld\n", +				PTR_ERR(qp_flow)); +		err = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; +		goto out_qp_grp_vf_unbind; +	} + +	err = qp_grp_id_from_flow(qp_flow, &qp_grp->grp_id); +	if (err) +		goto out_release_flow; +	qp_grp->ibqp.qp_num = qp_grp->grp_id; + +	usnic_ib_sysfs_qpn_add(qp_grp); + +	return qp_grp; + +out_release_flow: +	release_and_remove_flow(qp_flow); +out_qp_grp_vf_unbind: +	qp_grp_and_vf_unbind(qp_grp); +out_free_res: +	free_qp_grp_res(qp_grp->res_chunk_list); +out_free_qp_grp: +	kfree(qp_grp); + +	return ERR_PTR(err); +} + +void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ + +	WARN_ON(qp_grp->state != IB_QPS_RESET); +	lockdep_assert_held(&qp_grp->vf->lock); + +	release_and_remove_all_flows(qp_grp); +	usnic_ib_sysfs_qpn_remove(qp_grp); +	qp_grp_and_vf_unbind(qp_grp); +	free_qp_grp_res(qp_grp->res_chunk_list); +	kfree(qp_grp); +} + +struct usnic_vnic_res_chunk* +usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp, +				enum usnic_vnic_res_type res_type) +{ +	int i; + +	for (i = 0; qp_grp->res_chunk_list[i]; i++) { +		if (qp_grp->res_chunk_list[i]->type == res_type) +			return qp_grp->res_chunk_list[i]; +	} + +	return ERR_PTR(-EINVAL); +} diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h new file mode 100644 index 00000000000..b0aafe8db0c --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_IB_QP_GRP_H_ +#define USNIC_IB_QP_GRP_H_ + +#include <linux/debugfs.h> +#include <rdma/ib_verbs.h> + +#include "usnic_ib.h" +#include "usnic_abi.h" +#include "usnic_fwd.h" +#include "usnic_vnic.h" + +/* + * The qp group struct represents all the hw resources needed to present a ib_qp + */ +struct usnic_ib_qp_grp { +	struct ib_qp				ibqp; +	enum ib_qp_state			state; +	int					grp_id; + +	struct usnic_fwd_dev			*ufdev; +	struct usnic_ib_ucontext		*ctx; +	struct list_head			flows_lst; + +	struct usnic_vnic_res_chunk		**res_chunk_list; + +	pid_t					owner_pid; +	struct usnic_ib_vf			*vf; +	struct list_head			link; + +	spinlock_t				lock; + +	struct kobject				kobj; +}; + +struct usnic_ib_qp_grp_flow { +	struct usnic_fwd_flow		*flow; +	enum usnic_transport_type	trans_type; +	union { +		struct { +			uint16_t	port_num; +		} usnic_roce; +		struct { +			struct socket	*sock; +		} udp; +	}; +	struct usnic_ib_qp_grp		*qp_grp; +	struct list_head		link; + +	/* Debug FS */ +	struct dentry			*dbgfs_dentry; +	char				dentry_name[32]; +}; + +static const struct +usnic_vnic_res_spec min_transport_spec[USNIC_TRANSPORT_MAX] = { +	{ /*USNIC_TRANSPORT_UNKNOWN*/ +		.resources = { +			{.type = USNIC_VNIC_RES_TYPE_EOL,	.cnt = 0,}, +		}, +	}, +	{ /*USNIC_TRANSPORT_ROCE_CUSTOM*/ +		.resources = { +			{.type = USNIC_VNIC_RES_TYPE_WQ,	.cnt = 1,}, +			{.type = USNIC_VNIC_RES_TYPE_RQ,	.cnt = 1,}, +			{.type = USNIC_VNIC_RES_TYPE_CQ,	.cnt = 1,}, +			{.type = USNIC_VNIC_RES_TYPE_EOL,	.cnt = 0,}, +		}, +	}, +	{ /*USNIC_TRANSPORT_IPV4_UDP*/ +		.resources = { +			{.type = USNIC_VNIC_RES_TYPE_WQ,	.cnt = 1,}, +			{.type = USNIC_VNIC_RES_TYPE_RQ,	.cnt = 1,}, +			{.type = USNIC_VNIC_RES_TYPE_CQ,	.cnt = 1,}, +			{.type = USNIC_VNIC_RES_TYPE_EOL,	.cnt = 0,}, +		}, +	}, +}; + +const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state); +int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz); +int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz); +struct usnic_ib_qp_grp * +usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf, +			struct usnic_ib_pd *pd, +			struct usnic_vnic_res_spec *res_spec, +			struct usnic_transport_spec *trans_spec); +void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp); +int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, +				enum ib_qp_state new_state, +				void *data); +struct usnic_vnic_res_chunk +*usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp, +				enum usnic_vnic_res_type type); +static inline +struct usnic_ib_qp_grp *to_uqp_grp(struct ib_qp *ibqp) +{ +	return container_of(ibqp, struct usnic_ib_qp_grp, ibqp); +} +#endif /* USNIC_IB_QP_GRP_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c new file mode 100644 index 00000000000..27dc67c1689 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> + +#include "usnic_common_util.h" +#include "usnic_ib.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_vnic.h" +#include "usnic_ib_verbs.h" +#include "usnic_log.h" + +static ssize_t usnic_ib_show_fw_ver(struct device *device, +					struct device_attribute *attr, +					char *buf) +{ +	struct usnic_ib_dev *us_ibdev = +		container_of(device, struct usnic_ib_dev, ib_dev.dev); +	struct ethtool_drvinfo info; + +	mutex_lock(&us_ibdev->usdev_lock); +	us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); +	mutex_unlock(&us_ibdev->usdev_lock); + +	return scnprintf(buf, PAGE_SIZE, "%s\n", info.fw_version); +} + +static ssize_t usnic_ib_show_board(struct device *device, +					struct device_attribute *attr, +					char *buf) +{ +	struct usnic_ib_dev *us_ibdev = +		container_of(device, struct usnic_ib_dev, ib_dev.dev); +	unsigned short subsystem_device_id; + +	mutex_lock(&us_ibdev->usdev_lock); +	subsystem_device_id = us_ibdev->pdev->subsystem_device; +	mutex_unlock(&us_ibdev->usdev_lock); + +	return scnprintf(buf, PAGE_SIZE, "%hu\n", subsystem_device_id); +} + +/* + * Report the configuration for this PF + */ +static ssize_t +usnic_ib_show_config(struct device *device, struct device_attribute *attr, +			char *buf) +{ +	struct usnic_ib_dev *us_ibdev; +	char *ptr; +	unsigned left; +	unsigned n; +	enum usnic_vnic_res_type res_type; + +	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + +	/* Buffer space limit is 1 page */ +	ptr = buf; +	left = PAGE_SIZE; + +	mutex_lock(&us_ibdev->usdev_lock); +	if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) { +		char *busname; + +		/* +		 * bus name seems to come with annoying prefix. +		 * Remove it if it is predictable +		 */ +		busname = us_ibdev->pdev->bus->name; +		if (strncmp(busname, "PCI Bus ", 8) == 0) +			busname += 8; + +		n = scnprintf(ptr, left, +			"%s: %s:%d.%d, %s, %pM, %u VFs\n Per VF:", +			us_ibdev->ib_dev.name, +			busname, +			PCI_SLOT(us_ibdev->pdev->devfn), +			PCI_FUNC(us_ibdev->pdev->devfn), +			netdev_name(us_ibdev->netdev), +			us_ibdev->ufdev->mac, +			atomic_read(&us_ibdev->vf_cnt.refcount)); +		UPDATE_PTR_LEFT(n, ptr, left); + +		for (res_type = USNIC_VNIC_RES_TYPE_EOL; +				res_type < USNIC_VNIC_RES_TYPE_MAX; +				res_type++) { +			if (us_ibdev->vf_res_cnt[res_type] == 0) +				continue; +			n = scnprintf(ptr, left, " %d %s%s", +				us_ibdev->vf_res_cnt[res_type], +				usnic_vnic_res_type_to_str(res_type), +				(res_type < (USNIC_VNIC_RES_TYPE_MAX - 1)) ? 
+				 "," : ""); +			UPDATE_PTR_LEFT(n, ptr, left); +		} +		n = scnprintf(ptr, left, "\n"); +		UPDATE_PTR_LEFT(n, ptr, left); +	} else { +		n = scnprintf(ptr, left, "%s: no VFs\n", +				us_ibdev->ib_dev.name); +		UPDATE_PTR_LEFT(n, ptr, left); +	} +	mutex_unlock(&us_ibdev->usdev_lock); + +	return ptr - buf; +} + +static ssize_t +usnic_ib_show_iface(struct device *device, struct device_attribute *attr, +			char *buf) +{ +	struct usnic_ib_dev *us_ibdev; + +	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + +	return scnprintf(buf, PAGE_SIZE, "%s\n", +			netdev_name(us_ibdev->netdev)); +} + +static ssize_t +usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr, +			char *buf) +{ +	struct usnic_ib_dev *us_ibdev; + +	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + +	return scnprintf(buf, PAGE_SIZE, "%u\n", +			atomic_read(&us_ibdev->vf_cnt.refcount)); +} + +static ssize_t +usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr, +			char *buf) +{ +	struct usnic_ib_dev *us_ibdev; +	int qp_per_vf; + +	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); +	qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], +			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); + +	return scnprintf(buf, PAGE_SIZE, +				"%d\n", qp_per_vf); +} + +static ssize_t +usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr, +			char *buf) +{ +	struct usnic_ib_dev *us_ibdev; + +	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + +	return scnprintf(buf, PAGE_SIZE, "%d\n", +			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]); +} + +static DEVICE_ATTR(fw_ver, S_IRUGO, usnic_ib_show_fw_ver, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL); +static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL); +static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL); +static DEVICE_ATTR(max_vf, S_IRUGO, usnic_ib_show_max_vf, NULL); +static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL); +static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL); + +static struct device_attribute *usnic_class_attributes[] = { +	&dev_attr_fw_ver, +	&dev_attr_board_id, +	&dev_attr_config, +	&dev_attr_iface, +	&dev_attr_max_vf, +	&dev_attr_qp_per_vf, +	&dev_attr_cq_per_vf, +}; + +struct qpn_attribute { +	struct attribute attr; +	ssize_t (*show)(struct usnic_ib_qp_grp *, char *buf); +}; + +/* + * Definitions for supporting QPN entries in sysfs + */ +static ssize_t +usnic_ib_qpn_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ +	struct usnic_ib_qp_grp *qp_grp; +	struct qpn_attribute *qpn_attr; + +	qp_grp = container_of(kobj, struct usnic_ib_qp_grp, kobj); +	qpn_attr = container_of(attr, struct qpn_attribute, attr); + +	return qpn_attr->show(qp_grp, buf); +} + +static const struct sysfs_ops usnic_ib_qpn_sysfs_ops = { +	.show = usnic_ib_qpn_attr_show +}; + +#define QPN_ATTR_RO(NAME) \ +struct qpn_attribute qpn_attr_##NAME = __ATTR_RO(NAME) + +static ssize_t context_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ +	return scnprintf(buf, PAGE_SIZE, "0x%p\n", qp_grp->ctx); +} + +static ssize_t summary_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ +	int i, j, n; +	int left; +	char *ptr; +	struct usnic_vnic_res_chunk *res_chunk; +	struct usnic_vnic_res *vnic_res; + +	left = PAGE_SIZE; +	ptr = buf; + +	n = scnprintf(ptr, left, +			"QPN: %d State: (%s) PID: %u VF Idx: %hu ", +			qp_grp->ibqp.qp_num, +			
usnic_ib_qp_grp_state_to_string(qp_grp->state), +			qp_grp->owner_pid, +			usnic_vnic_get_index(qp_grp->vf->vnic)); +	UPDATE_PTR_LEFT(n, ptr, left); + +	for (i = 0; qp_grp->res_chunk_list[i]; i++) { +		res_chunk = qp_grp->res_chunk_list[i]; +		for (j = 0; j < res_chunk->cnt; j++) { +			vnic_res = res_chunk->res[j]; +			n = scnprintf(ptr, left, "%s[%d] ", +				usnic_vnic_res_type_to_str(vnic_res->type), +				vnic_res->vnic_idx); +			UPDATE_PTR_LEFT(n, ptr, left); +		} +	} + +	n = scnprintf(ptr, left, "\n"); +	UPDATE_PTR_LEFT(n, ptr, left); + +	return ptr - buf; +} + +static QPN_ATTR_RO(context); +static QPN_ATTR_RO(summary); + +static struct attribute *usnic_ib_qpn_default_attrs[] = { +	&qpn_attr_context.attr, +	&qpn_attr_summary.attr, +	NULL +}; + +static struct kobj_type usnic_ib_qpn_type = { +	.sysfs_ops = &usnic_ib_qpn_sysfs_ops, +	.default_attrs = usnic_ib_qpn_default_attrs +}; + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev) +{ +	int i; +	int err; +	for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { +		err = device_create_file(&us_ibdev->ib_dev.dev, +						usnic_class_attributes[i]); +		if (err) { +			usnic_err("Failed to create device file %d for %s eith err %d", +				i, us_ibdev->ib_dev.name, err); +			return -EINVAL; +		} +	} + +	/* create kernel object for looking at individual QPs */ +	kobject_get(&us_ibdev->ib_dev.dev.kobj); +	us_ibdev->qpn_kobj = kobject_create_and_add("qpn", +			&us_ibdev->ib_dev.dev.kobj); +	if (us_ibdev->qpn_kobj == NULL) { +		kobject_put(&us_ibdev->ib_dev.dev.kobj); +		return -ENOMEM; +	} + +	return 0; +} + +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev) +{ +	int i; +	for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { +		device_remove_file(&us_ibdev->ib_dev.dev, +					usnic_class_attributes[i]); +	} + +	kobject_put(us_ibdev->qpn_kobj); +} + +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp) +{ +	struct usnic_ib_dev *us_ibdev; +	int err; + +	us_ibdev = qp_grp->vf->pf; + +	err = kobject_init_and_add(&qp_grp->kobj, &usnic_ib_qpn_type, +			kobject_get(us_ibdev->qpn_kobj), +			"%d", qp_grp->grp_id); +	if (err) { +		kobject_put(us_ibdev->qpn_kobj); +		return; +	} +} + +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp) +{ +	struct usnic_ib_dev *us_ibdev; + +	us_ibdev = qp_grp->vf->pf; + +	kobject_put(&qp_grp->kobj); +	kobject_put(us_ibdev->qpn_kobj); +} diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h new file mode 100644 index 00000000000..0d09b493cd0 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_IB_SYSFS_H_ +#define USNIC_IB_SYSFS_H_ + +#include "usnic_ib.h" + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp); +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp); + +#endif /* !USNIC_IB_SYSFS_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c new file mode 100644 index 00000000000..53bd6a2d9cd --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -0,0 +1,768 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/errno.h> + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> + +#include "usnic_abi.h" +#include "usnic_ib.h" +#include "usnic_common_util.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_fwd.h" +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_transport.h" + +#define USNIC_DEFAULT_TRANSPORT USNIC_TRANSPORT_ROCE_CUSTOM + +static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver) +{ +	*fw_ver = (u64) *fw_ver_str; +} + +static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp, +					struct ib_udata *udata) +{ +	struct usnic_ib_dev *us_ibdev; +	struct usnic_ib_create_qp_resp resp; +	struct pci_dev *pdev; +	struct vnic_dev_bar *bar; +	struct usnic_vnic_res_chunk *chunk; +	struct usnic_ib_qp_grp_flow *default_flow; +	int i, err; + +	memset(&resp, 0, sizeof(resp)); + +	us_ibdev = qp_grp->vf->pf; +	pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); +	if (!pdev) { +		usnic_err("Failed to get pdev of qp_grp %d\n", +				qp_grp->grp_id); +		return -EFAULT; +	} + +	bar = usnic_vnic_get_bar(qp_grp->vf->vnic, 0); +	if (!bar) { +		usnic_err("Failed to get bar0 of qp_grp %d vf %s", +				qp_grp->grp_id, pci_name(pdev)); +		return -EFAULT; +	} + +	resp.vfid = usnic_vnic_get_index(qp_grp->vf->vnic); +	resp.bar_bus_addr = bar->bus_addr; +	resp.bar_len = bar->len; + +	chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); +	if (IS_ERR_OR_NULL(chunk)) { +		usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", +			usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), +			qp_grp->grp_id, +			PTR_ERR(chunk)); +		return chunk ? 
PTR_ERR(chunk) : -ENOMEM; +	} + +	WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_RQ); +	resp.rq_cnt = chunk->cnt; +	for (i = 0; i < chunk->cnt; i++) +		resp.rq_idx[i] = chunk->res[i]->vnic_idx; + +	chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_WQ); +	if (IS_ERR_OR_NULL(chunk)) { +		usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", +			usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_WQ), +			qp_grp->grp_id, +			PTR_ERR(chunk)); +		return chunk ? PTR_ERR(chunk) : -ENOMEM; +	} + +	WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_WQ); +	resp.wq_cnt = chunk->cnt; +	for (i = 0; i < chunk->cnt; i++) +		resp.wq_idx[i] = chunk->res[i]->vnic_idx; + +	chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_CQ); +	if (IS_ERR_OR_NULL(chunk)) { +		usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", +			usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_CQ), +			qp_grp->grp_id, +			PTR_ERR(chunk)); +		return chunk ? PTR_ERR(chunk) : -ENOMEM; +	} + +	WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_CQ); +	resp.cq_cnt = chunk->cnt; +	for (i = 0; i < chunk->cnt; i++) +		resp.cq_idx[i] = chunk->res[i]->vnic_idx; + +	default_flow = list_first_entry(&qp_grp->flows_lst, +					struct usnic_ib_qp_grp_flow, link); +	resp.transport = default_flow->trans_type; + +	err = ib_copy_to_udata(udata, &resp, sizeof(resp)); +	if (err) { +		usnic_err("Failed to copy udata for %s", us_ibdev->ib_dev.name); +		return err; +	} + +	return 0; +} + +static struct usnic_ib_qp_grp* +find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, +				struct usnic_ib_pd *pd, +				struct usnic_transport_spec *trans_spec, +				struct usnic_vnic_res_spec *res_spec) +{ +	struct usnic_ib_vf *vf; +	struct usnic_vnic *vnic; +	struct usnic_ib_qp_grp *qp_grp; +	struct device *dev, **dev_list; +	int i, found = 0; + +	BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + +	if (list_empty(&us_ibdev->vf_dev_list)) { +		usnic_info("No vfs to allocate\n"); +		return NULL; +	} + +	if (usnic_ib_share_vf) { +		/* Try to find resouces on a used vf which is in pd */ +		dev_list = usnic_uiom_get_dev_list(pd->umem_pd); +		for (i = 0; dev_list[i]; i++) { +			dev = dev_list[i]; +			vf = pci_get_drvdata(to_pci_dev(dev)); +			spin_lock(&vf->lock); +			vnic = vf->vnic; +			if (!usnic_vnic_check_room(vnic, res_spec)) { +				usnic_dbg("Found used vnic %s from %s\n", +						us_ibdev->ib_dev.name, +						pci_name(usnic_vnic_get_pdev( +									vnic))); +				found = 1; +				break; +			} +			spin_unlock(&vf->lock); + +		} +		usnic_uiom_free_dev_list(dev_list); +	} + +	if (!found) { +		/* Try to find resources on an unused vf */ +		list_for_each_entry(vf, &us_ibdev->vf_dev_list, link) { +			spin_lock(&vf->lock); +			vnic = vf->vnic; +			if (vf->qp_grp_ref_cnt == 0 && +				usnic_vnic_check_room(vnic, res_spec) == 0) { +				found = 1; +				break; +			} +			spin_unlock(&vf->lock); +		} +	} + +	if (!found) { +		usnic_info("No free qp grp found on %s\n", +				us_ibdev->ib_dev.name); +		return ERR_PTR(-ENOMEM); +	} + +	qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev, vf, pd, res_spec, +						trans_spec); +	spin_unlock(&vf->lock); +	if (IS_ERR_OR_NULL(qp_grp)) { +		usnic_err("Failed to allocate qp_grp\n"); +		return ERR_PTR(qp_grp ? 
PTR_ERR(qp_grp) : -ENOMEM); +	} + +	return qp_grp; +} + +static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ +	struct usnic_ib_vf *vf = qp_grp->vf; + +	WARN_ON(qp_grp->state != IB_QPS_RESET); + +	spin_lock(&vf->lock); +	usnic_ib_qp_grp_destroy(qp_grp); +	spin_unlock(&vf->lock); +} + +static void eth_speed_to_ib_speed(int speed, u8 *active_speed, +					u8 *active_width) +{ +	if (speed <= 10000) { +		*active_width = IB_WIDTH_1X; +		*active_speed = IB_SPEED_FDR10; +	} else if (speed <= 20000) { +		*active_width = IB_WIDTH_4X; +		*active_speed = IB_SPEED_DDR; +	} else if (speed <= 30000) { +		*active_width = IB_WIDTH_4X; +		*active_speed = IB_SPEED_QDR; +	} else if (speed <= 40000) { +		*active_width = IB_WIDTH_4X; +		*active_speed = IB_SPEED_FDR10; +	} else { +		*active_width = IB_WIDTH_4X; +		*active_speed = IB_SPEED_EDR; +	} +} + +static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd) +{ +	if (cmd.spec.trans_type <= USNIC_TRANSPORT_UNKNOWN || +			cmd.spec.trans_type >= USNIC_TRANSPORT_MAX) +		return -EINVAL; + +	return 0; +} + +/* Start of ib callback functions */ + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, +						u8 port_num) +{ +	return IB_LINK_LAYER_ETHERNET; +} + +int usnic_ib_query_device(struct ib_device *ibdev, +				struct ib_device_attr *props) +{ +	struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); +	union ib_gid gid; +	struct ethtool_drvinfo info; +	struct ethtool_cmd cmd; +	int qp_per_vf; + +	usnic_dbg("\n"); +	mutex_lock(&us_ibdev->usdev_lock); +	us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); +	us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd); +	memset(props, 0, sizeof(*props)); +	usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, +			&gid.raw[0]); +	memcpy(&props->sys_image_guid, &gid.global.interface_id, +		sizeof(gid.global.interface_id)); +	usnic_ib_fw_string_to_u64(&info.fw_version[0], &props->fw_ver); +	props->max_mr_size = USNIC_UIOM_MAX_MR_SIZE; +	props->page_size_cap = USNIC_UIOM_PAGE_SIZE; +	props->vendor_id = PCI_VENDOR_ID_CISCO; +	props->vendor_part_id = PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC; +	props->hw_ver = us_ibdev->pdev->subsystem_device; +	qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], +			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); +	props->max_qp = qp_per_vf * +		atomic_read(&us_ibdev->vf_cnt.refcount); +	props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT | +		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; +	props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] * +		atomic_read(&us_ibdev->vf_cnt.refcount); +	props->max_pd = USNIC_UIOM_MAX_PD_CNT; +	props->max_mr = USNIC_UIOM_MAX_MR_CNT; +	props->local_ca_ack_delay = 0; +	props->max_pkeys = 0; +	props->atomic_cap = IB_ATOMIC_NONE; +	props->masked_atomic_cap = props->atomic_cap; +	props->max_qp_rd_atom = 0; +	props->max_qp_init_rd_atom = 0; +	props->max_res_rd_atom = 0; +	props->max_srq = 0; +	props->max_srq_wr = 0; +	props->max_srq_sge = 0; +	props->max_fast_reg_page_list_len = 0; +	props->max_mcast_grp = 0; +	props->max_mcast_qp_attach = 0; +	props->max_total_mcast_qp_attach = 0; +	props->max_map_per_fmr = 0; +	/* Owned by Userspace +	 * max_qp_wr, max_sge, max_sge_rd, max_cqe */ +	mutex_unlock(&us_ibdev->usdev_lock); + +	return 0; +} + +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, +				struct ib_port_attr *props) +{ +	struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); +	struct ethtool_cmd cmd; + +	usnic_dbg("\n"); + +	
mutex_lock(&us_ibdev->usdev_lock); +	us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd); +	memset(props, 0, sizeof(*props)); + +	props->lid = 0; +	props->lmc = 1; +	props->sm_lid = 0; +	props->sm_sl = 0; + +	if (!us_ibdev->ufdev->link_up) { +		props->state = IB_PORT_DOWN; +		props->phys_state = 3; +	} else if (!us_ibdev->ufdev->inaddr) { +		props->state = IB_PORT_INIT; +		props->phys_state = 4; +	} else { +		props->state = IB_PORT_ACTIVE; +		props->phys_state = 5; +	} + +	props->port_cap_flags = 0; +	props->gid_tbl_len = 1; +	props->pkey_tbl_len = 1; +	props->bad_pkey_cntr = 0; +	props->qkey_viol_cntr = 0; +	eth_speed_to_ib_speed(cmd.speed, &props->active_speed, +				&props->active_width); +	props->max_mtu = IB_MTU_4096; +	props->active_mtu = iboe_get_mtu(us_ibdev->ufdev->mtu); +	/* Userspace will adjust for hdrs */ +	props->max_msg_sz = us_ibdev->ufdev->mtu; +	props->max_vl_num = 1; +	mutex_unlock(&us_ibdev->usdev_lock); + +	return 0; +} + +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, +				int qp_attr_mask, +				struct ib_qp_init_attr *qp_init_attr) +{ +	struct usnic_ib_qp_grp *qp_grp; +	struct usnic_ib_vf *vf; +	int err; + +	usnic_dbg("\n"); + +	memset(qp_attr, 0, sizeof(*qp_attr)); +	memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + +	qp_grp = to_uqp_grp(qp); +	vf = qp_grp->vf; +	mutex_lock(&vf->pf->usdev_lock); +	usnic_dbg("\n"); +	qp_attr->qp_state = qp_grp->state; +	qp_attr->cur_qp_state = qp_grp->state; + +	switch (qp_grp->ibqp.qp_type) { +	case IB_QPT_UD: +		qp_attr->qkey = 0; +		break; +	default: +		usnic_err("Unexpected qp_type %d\n", qp_grp->ibqp.qp_type); +		err = -EINVAL; +		goto err_out; +	} + +	mutex_unlock(&vf->pf->usdev_lock); +	return 0; + +err_out: +	mutex_unlock(&vf->pf->usdev_lock); +	return err; +} + +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, +				union ib_gid *gid) +{ + +	struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); +	usnic_dbg("\n"); + +	if (index > 1) +		return -EINVAL; + +	mutex_lock(&us_ibdev->usdev_lock); +	memset(&(gid->raw[0]), 0, sizeof(gid->raw)); +	usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, +			&gid->raw[0]); +	mutex_unlock(&us_ibdev->usdev_lock); + +	return 0; +} + +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, +				u16 *pkey) +{ +	if (index > 1) +		return -EINVAL; + +	*pkey = 0xffff; +	return 0; +} + +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, +					struct ib_ucontext *context, +					struct ib_udata *udata) +{ +	struct usnic_ib_pd *pd; +	void *umem_pd; + +	usnic_dbg("\n"); + +	pd = kzalloc(sizeof(*pd), GFP_KERNEL); +	if (!pd) +		return ERR_PTR(-ENOMEM); + +	umem_pd = pd->umem_pd = usnic_uiom_alloc_pd(); +	if (IS_ERR_OR_NULL(umem_pd)) { +		kfree(pd); +		return ERR_PTR(umem_pd ? 
PTR_ERR(umem_pd) : -ENOMEM); +	} + +	usnic_info("domain 0x%p allocated for context 0x%p and device %s\n", +			pd, context, ibdev->name); +	return &pd->ibpd; +} + +int usnic_ib_dealloc_pd(struct ib_pd *pd) +{ +	usnic_info("freeing domain 0x%p\n", pd); + +	usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd); +	kfree(pd); +	return 0; +} + +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, +					struct ib_qp_init_attr *init_attr, +					struct ib_udata *udata) +{ +	int err; +	struct usnic_ib_dev *us_ibdev; +	struct usnic_ib_qp_grp *qp_grp; +	struct usnic_ib_ucontext *ucontext; +	int cq_cnt; +	struct usnic_vnic_res_spec res_spec; +	struct usnic_ib_create_qp_cmd cmd; +	struct usnic_transport_spec trans_spec; + +	usnic_dbg("\n"); + +	ucontext = to_uucontext(pd->uobject->context); +	us_ibdev = to_usdev(pd->device); + +	if (init_attr->create_flags) +		return ERR_PTR(-EINVAL); + +	err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); +	if (err) { +		usnic_err("%s: cannot copy udata for create_qp\n", +				us_ibdev->ib_dev.name); +		return ERR_PTR(-EINVAL); +	} + +	err = create_qp_validate_user_data(cmd); +	if (err) { +		usnic_err("%s: Failed to validate user data\n", +				us_ibdev->ib_dev.name); +		return ERR_PTR(-EINVAL); +	} + +	if (init_attr->qp_type != IB_QPT_UD) { +		usnic_err("%s asked to make a non-UD QP: %d\n", +				us_ibdev->ib_dev.name, init_attr->qp_type); +		return ERR_PTR(-EINVAL); +	} + +	trans_spec = cmd.spec; +	mutex_lock(&us_ibdev->usdev_lock); +	cq_cnt = (init_attr->send_cq == init_attr->recv_cq) ? 1 : 2; +	res_spec = min_transport_spec[trans_spec.trans_type]; +	usnic_vnic_res_spec_update(&res_spec, USNIC_VNIC_RES_TYPE_CQ, cq_cnt); +	qp_grp = find_free_vf_and_create_qp_grp(us_ibdev, to_upd(pd), +						&trans_spec, +						&res_spec); +	if (IS_ERR_OR_NULL(qp_grp)) { +		err = qp_grp ? 
PTR_ERR(qp_grp) : -ENOMEM; +		goto out_release_mutex; +	} + +	err = usnic_ib_fill_create_qp_resp(qp_grp, udata); +	if (err) { +		err = -EBUSY; +		goto out_release_qp_grp; +	} + +	qp_grp->ctx = ucontext; +	list_add_tail(&qp_grp->link, &ucontext->qp_grp_list); +	usnic_ib_log_vf(qp_grp->vf); +	mutex_unlock(&us_ibdev->usdev_lock); +	return &qp_grp->ibqp; + +out_release_qp_grp: +	qp_grp_destroy(qp_grp); +out_release_mutex: +	mutex_unlock(&us_ibdev->usdev_lock); +	return ERR_PTR(err); +} + +int usnic_ib_destroy_qp(struct ib_qp *qp) +{ +	struct usnic_ib_qp_grp *qp_grp; +	struct usnic_ib_vf *vf; + +	usnic_dbg("\n"); + +	qp_grp = to_uqp_grp(qp); +	vf = qp_grp->vf; +	mutex_lock(&vf->pf->usdev_lock); +	if (usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RESET, NULL)) { +		usnic_err("Failed to move qp grp %u to reset\n", +				qp_grp->grp_id); +	} + +	list_del(&qp_grp->link); +	qp_grp_destroy(qp_grp); +	mutex_unlock(&vf->pf->usdev_lock); + +	return 0; +} + +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, +				int attr_mask, struct ib_udata *udata) +{ +	struct usnic_ib_qp_grp *qp_grp; +	int status; +	usnic_dbg("\n"); + +	qp_grp = to_uqp_grp(ibqp); + +	/* TODO: Future Support All States */ +	mutex_lock(&qp_grp->vf->pf->usdev_lock); +	if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT) { +		status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_INIT, NULL); +	} else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTR) { +		status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTR, NULL); +	} else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTS) { +		status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTS, NULL); +	} else { +		usnic_err("Unexpected combination mask: %u state: %u\n", +				attr_mask & IB_QP_STATE, attr->qp_state); +		status = -EINVAL; +	} + +	mutex_unlock(&qp_grp->vf->pf->usdev_lock); +	return status; +} + +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries, +					int vector, struct ib_ucontext *context, +					struct ib_udata *udata) +{ +	struct ib_cq *cq; + +	usnic_dbg("\n"); +	cq = kzalloc(sizeof(*cq), GFP_KERNEL); +	if (!cq) +		return ERR_PTR(-EBUSY); + +	return cq; +} + +int usnic_ib_destroy_cq(struct ib_cq *cq) +{ +	usnic_dbg("\n"); +	kfree(cq); +	return 0; +} + +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, +					u64 virt_addr, int access_flags, +					struct ib_udata *udata) +{ +	struct usnic_ib_mr *mr; +	int err; + +	usnic_dbg("start 0x%llx va 0x%llx length 0x%llx\n", start, +			virt_addr, length); + +	mr = kzalloc(sizeof(*mr), GFP_KERNEL); +	if (IS_ERR_OR_NULL(mr)) +		return ERR_PTR(mr ? PTR_ERR(mr) : -ENOMEM); + +	mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length, +					access_flags, 0); +	if (IS_ERR_OR_NULL(mr->umem)) { +		err = mr->umem ? 
PTR_ERR(mr->umem) : -EFAULT; +		goto err_free; +	} + +	mr->ibmr.lkey = mr->ibmr.rkey = 0; +	return &mr->ibmr; + +err_free: +	kfree(mr); +	return ERR_PTR(err); +} + +int usnic_ib_dereg_mr(struct ib_mr *ibmr) +{ +	struct usnic_ib_mr *mr = to_umr(ibmr); + +	usnic_dbg("va 0x%lx length 0x%zx\n", mr->umem->va, mr->umem->length); + +	usnic_uiom_reg_release(mr->umem, ibmr->pd->uobject->context->closing); +	kfree(mr); +	return 0; +} + +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, +							struct ib_udata *udata) +{ +	struct usnic_ib_ucontext *context; +	struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); +	usnic_dbg("\n"); + +	context = kmalloc(sizeof(*context), GFP_KERNEL); +	if (!context) +		return ERR_PTR(-ENOMEM); + +	INIT_LIST_HEAD(&context->qp_grp_list); +	mutex_lock(&us_ibdev->usdev_lock); +	list_add_tail(&context->link, &us_ibdev->ctx_list); +	mutex_unlock(&us_ibdev->usdev_lock); + +	return &context->ibucontext; +} + +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ +	struct usnic_ib_ucontext *context = to_uucontext(ibcontext); +	struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device); +	usnic_dbg("\n"); + +	mutex_lock(&us_ibdev->usdev_lock); +	BUG_ON(!list_empty(&context->qp_grp_list)); +	list_del(&context->link); +	mutex_unlock(&us_ibdev->usdev_lock); +	kfree(context); +	return 0; +} + +int usnic_ib_mmap(struct ib_ucontext *context, +				struct vm_area_struct *vma) +{ +	struct usnic_ib_ucontext *uctx = to_ucontext(context); +	struct usnic_ib_dev *us_ibdev; +	struct usnic_ib_qp_grp *qp_grp; +	struct usnic_ib_vf *vf; +	struct vnic_dev_bar *bar; +	dma_addr_t bus_addr; +	unsigned int len; +	unsigned int vfid; + +	usnic_dbg("\n"); + +	us_ibdev = to_usdev(context->device); +	vma->vm_flags |= VM_IO; +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); +	vfid = vma->vm_pgoff; +	usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n", +			vma->vm_pgoff, PAGE_SHIFT, vfid); + +	mutex_lock(&us_ibdev->usdev_lock); +	list_for_each_entry(qp_grp, &uctx->qp_grp_list, link) { +		vf = qp_grp->vf; +		if (usnic_vnic_get_index(vf->vnic) == vfid) { +			bar = usnic_vnic_get_bar(vf->vnic, 0); +			if ((vma->vm_end - vma->vm_start) != bar->len) { +				usnic_err("Bar0 Len %lu - Request map %lu\n", +						bar->len, +						vma->vm_end - vma->vm_start); +				mutex_unlock(&us_ibdev->usdev_lock); +				return -EINVAL; +			} +			bus_addr = bar->bus_addr; +			len = bar->len; +			usnic_dbg("bus: %pa vaddr: %p size: %ld\n", +					&bus_addr, bar->vaddr, bar->len); +			mutex_unlock(&us_ibdev->usdev_lock); + +			return remap_pfn_range(vma, +						vma->vm_start, +						bus_addr >> PAGE_SHIFT, +						len, vma->vm_page_prot); +		} +	} + +	mutex_unlock(&us_ibdev->usdev_lock); +	usnic_err("No VF %u found\n", vfid); +	return -EINVAL; +} + +/* In ib callbacks section -  Start of stub funcs */ +struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, +					struct ib_ah_attr *ah_attr) +{ +	usnic_dbg("\n"); +	return ERR_PTR(-EPERM); +} + +int usnic_ib_destroy_ah(struct ib_ah *ah) +{ +	usnic_dbg("\n"); +	return -EINVAL; +} + +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, +				struct ib_send_wr **bad_wr) +{ +	usnic_dbg("\n"); +	return -EINVAL; +} + +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, +				struct ib_recv_wr **bad_wr) +{ +	usnic_dbg("\n"); +	return -EINVAL; +} + +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, +				struct ib_wc *wc) +{ +	usnic_dbg("\n"); +	return -EINVAL; +} + +int usnic_ib_req_notify_cq(struct ib_cq *cq, +					enum 
ib_cq_notify_flags flags) +{ +	usnic_dbg("\n"); +	return -EINVAL; +} + +struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ +	usnic_dbg("\n"); +	return ERR_PTR(-ENOMEM); +} + + +/* In ib callbacks section - End of stub funcs */ +/* End of ib callbacks section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h new file mode 100644 index 00000000000..bb864f5aed7 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_VERBS_H_ +#define USNIC_IB_VERBS_H_ + +#include "usnic_ib.h" + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, +						u8 port_num); +int usnic_ib_query_device(struct ib_device *ibdev, +				struct ib_device_attr *props); +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, +				struct ib_port_attr *props); +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, +				int qp_attr_mask, +				struct ib_qp_init_attr *qp_init_attr); +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, +				union ib_gid *gid); +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, +				u16 *pkey); +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, +				struct ib_ucontext *context, +				struct ib_udata *udata); +int usnic_ib_dealloc_pd(struct ib_pd *pd); +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, +					struct ib_qp_init_attr *init_attr, +					struct ib_udata *udata); +int usnic_ib_destroy_qp(struct ib_qp *qp); +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, +				int attr_mask, struct ib_udata *udata); +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries, +					int vector, struct ib_ucontext *context, +					struct ib_udata *udata); +int usnic_ib_destroy_cq(struct ib_cq *cq); +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, +				u64 virt_addr, int access_flags, +				struct ib_udata *udata); +int usnic_ib_dereg_mr(struct ib_mr *ibmr); +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, +						struct ib_udata *udata); +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext); +int usnic_ib_mmap(struct ib_ucontext *context, +			struct vm_area_struct *vma); +struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, +					struct ib_ah_attr *ah_attr); +int usnic_ib_destroy_ah(struct ib_ah *ah); +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, +			struct ib_send_wr **bad_wr); +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, +			struct ib_recv_wr **bad_wr); +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, +			struct ib_wc *wc); +int usnic_ib_req_notify_cq(struct ib_cq *cq, +				enum ib_cq_notify_flags flags); +struct 
ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc); +#endif /* !USNIC_IB_VERBS_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_log.h b/drivers/infiniband/hw/usnic/usnic_log.h new file mode 100644 index 00000000000..75777a66c68 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_log.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_LOG_H_ +#define USNIC_LOG_H_ + +#include "usnic.h" + +extern unsigned int usnic_log_lvl; + +#define USNIC_LOG_LVL_NONE		(0) +#define USNIC_LOG_LVL_ERR		(1) +#define USNIC_LOG_LVL_INFO		(2) +#define USNIC_LOG_LVL_DBG		(3) + +#define usnic_printk(lvl, args...) \ +	do { \ +		printk(lvl "%s:%s:%d: ", DRV_NAME, __func__, \ +				__LINE__); \ +		printk(args); \ +	} while (0) + +#define usnic_dbg(args...) \ +	do { \ +		if (unlikely(usnic_log_lvl >= USNIC_LOG_LVL_DBG)) { \ +			usnic_printk(KERN_INFO, args); \ +	} \ +} while (0) + +#define usnic_info(args...) \ +do { \ +	if (usnic_log_lvl >= USNIC_LOG_LVL_INFO) { \ +			usnic_printk(KERN_INFO, args); \ +	} \ +} while (0) + +#define usnic_err(args...) \ +	do { \ +		if (usnic_log_lvl >= USNIC_LOG_LVL_ERR) { \ +			usnic_printk(KERN_ERR, args); \ +		} \ +	} while (0) +#endif /* !USNIC_LOG_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c new file mode 100644 index 00000000000..ddef6f77a78 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_transport.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <linux/bitmap.h> +#include <linux/file.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <net/inet_sock.h> + +#include "usnic_transport.h" +#include "usnic_log.h" + +/* ROCE */ +static unsigned long *roce_bitmap; +static u16 roce_next_port = 1; +#define ROCE_BITMAP_SZ ((1 << (8 /*CHAR_BIT*/ * sizeof(u16)))/8 /*CHAR BIT*/) +static DEFINE_SPINLOCK(roce_bitmap_lock); + +const char *usnic_transport_to_str(enum usnic_transport_type type) +{ +	switch (type) { +	case USNIC_TRANSPORT_UNKNOWN: +		return "Unknown"; +	case USNIC_TRANSPORT_ROCE_CUSTOM: +		return "roce custom"; +	case USNIC_TRANSPORT_IPV4_UDP: +		return "IPv4 UDP"; +	case USNIC_TRANSPORT_MAX: +		return "Max?"; +	default: +		return "Not known"; +	} +} + +int usnic_transport_sock_to_str(char *buf, int buf_sz, +					struct socket *sock) +{ +	int err; +	uint32_t addr; +	uint16_t port; +	int proto; + +	memset(buf, 0, buf_sz); +	err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port); +	if (err) +		return 0; + +	return scnprintf(buf, buf_sz, "Proto:%u Addr:%pI4h Port:%hu", +			proto, &addr, port); +} + +/* + * reserve a port number.  if "0" specified, we will try to pick one + * starting at roce_next_port.  roce_next_port will take on the values + * 1..4096 + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num) +{ +	if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { +		spin_lock(&roce_bitmap_lock); +		if (!port_num) { +			port_num = bitmap_find_next_zero_area(roce_bitmap, +						ROCE_BITMAP_SZ, +						roce_next_port /* start */, +						1 /* nr */, +						0 /* align */); +			roce_next_port = (port_num & 4095) + 1; +		} else if (test_bit(port_num, roce_bitmap)) { +			usnic_err("Failed to allocate port for %s\n", +					usnic_transport_to_str(type)); +			spin_unlock(&roce_bitmap_lock); +			goto out_fail; +		} +		bitmap_set(roce_bitmap, port_num, 1); +		spin_unlock(&roce_bitmap_lock); +	} else { +		usnic_err("Failed to allocate port - transport %s unsupported\n", +				usnic_transport_to_str(type)); +		goto out_fail; +	} + +	usnic_dbg("Allocating port %hu for %s\n", port_num, +			usnic_transport_to_str(type)); +	return port_num; + +out_fail: +	return 0; +} + +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num) +{ +	if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { +		spin_lock(&roce_bitmap_lock); +		if (!port_num) { +			usnic_err("Unreserved unvalid port num 0 for %s\n", +					usnic_transport_to_str(type)); +			goto out_roce_custom; +		} + +		if (!test_bit(port_num, roce_bitmap)) { +			usnic_err("Unreserving invalid %hu for %s\n", +					port_num, +					usnic_transport_to_str(type)); +			goto out_roce_custom; +		} +		bitmap_clear(roce_bitmap, port_num, 1); +		usnic_dbg("Freeing port %hu for %s\n", port_num, +				usnic_transport_to_str(type)); +out_roce_custom: +		spin_unlock(&roce_bitmap_lock); +	} else { +		usnic_err("Freeing invalid port %hu for %d\n", port_num, type); +	} +} + +struct socket *usnic_transport_get_socket(int sock_fd) +{ +	struct socket *sock; +	int err; +	char buf[25]; + +	/* sockfd_lookup will internally do a fget */ +	sock = sockfd_lookup(sock_fd, &err); +	if (!sock) { +		usnic_err("Unable to lookup socket for fd %d with err %d\n", +				sock_fd, err); +		return ERR_PTR(-ENOENT); +	} + +	usnic_transport_sock_to_str(buf, sizeof(buf), sock); +	usnic_dbg("Get sock %s\n", buf); + +	return sock; +} + +void usnic_transport_put_socket(struct socket *sock) +{ +	char buf[100]; + +	usnic_transport_sock_to_str(buf, sizeof(buf), sock); +	usnic_dbg("Put sock 
%s\n", buf); +	sockfd_put(sock); +} + +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, +					uint32_t *addr, uint16_t *port) +{ +	int len; +	int err; +	struct sockaddr_in sock_addr; + +	err = sock->ops->getname(sock, +				(struct sockaddr *)&sock_addr, +				&len, 0); +	if (err) +		return err; + +	if (sock_addr.sin_family != AF_INET) +		return -EINVAL; + +	if (proto) +		*proto = sock->sk->sk_protocol; +	if (port) +		*port = ntohs(((struct sockaddr_in *)&sock_addr)->sin_port); +	if (addr) +		*addr = ntohl(((struct sockaddr_in *) +					&sock_addr)->sin_addr.s_addr); + +	return 0; +} + +int usnic_transport_init(void) +{ +	roce_bitmap = kzalloc(ROCE_BITMAP_SZ, GFP_KERNEL); +	if (!roce_bitmap) { +		usnic_err("Failed to allocate bit map"); +		return -ENOMEM; +	} + +	/* Do not ever allocate bit 0, hence set it here */ +	bitmap_set(roce_bitmap, 0, 1); +	return 0; +} + +void usnic_transport_fini(void) +{ +	kfree(roce_bitmap); +} diff --git a/drivers/infiniband/hw/usnic/usnic_transport.h b/drivers/infiniband/hw/usnic/usnic_transport.h new file mode 100644 index 00000000000..7e5dc6d9f46 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_transport.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_TRANSPORT_H_ +#define USNIC_TRANSPORT_H_ + +#include "usnic_abi.h" + +const char *usnic_transport_to_str(enum usnic_transport_type trans_type); +/* + * Returns number of bytes written, excluding null terminator. If + * nothing was written, the function returns 0. + */ +int usnic_transport_sock_to_str(char *buf, int buf_sz, +					struct socket *sock); +/* + * Reserve a port. If "port_num" is set, then the function will try + * to reserve that particular port. + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num); +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num); +/* + * Do a fget on the socket refered to by sock_fd and returns the socket. + * Socket will not be destroyed before usnic_transport_put_socket has + * been called. + */ +struct socket *usnic_transport_get_socket(int sock_fd); +void usnic_transport_put_socket(struct socket *sock); +/* + * Call usnic_transport_get_socket before calling *_sock_get_addr + */ +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, +					uint32_t *addr, uint16_t *port); +int usnic_transport_init(void); +void usnic_transport_fini(void); +#endif /* !USNIC_TRANSPORT_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c new file mode 100644 index 00000000000..801a1d6937e --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -0,0 +1,604 @@ +/* + * Copyright (c) 2005 Topspin Communications.  All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. 
All rights reserved. + * Copyright (c) 2013 Cisco Systems.  All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses.  You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + *     Redistribution and use in source and binary forms, with or + *     without modification, are permitted provided that the following + *     conditions are met: + * + *      - Redistributions of source code must retain the above + *        copyright notice, this list of conditions and the following + *        disclaimer. + * + *      - Redistributions in binary form must reproduce the above + *        copyright notice, this list of conditions and the following + *        disclaimer in the documentation and/or other materials + *        provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mm.h> +#include <linux/dma-mapping.h> +#include <linux/sched.h> +#include <linux/hugetlb.h> +#include <linux/dma-attrs.h> +#include <linux/iommu.h> +#include <linux/workqueue.h> +#include <linux/list.h> +#include <linux/pci.h> + +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_uiom_interval_tree.h" + +static struct workqueue_struct *usnic_uiom_wq; + +#define USNIC_UIOM_PAGE_CHUNK						\ +	((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list))	/\ +	((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] -	\ +	(void *) &((struct usnic_uiom_chunk *) 0)->page_list[0])) + +static void usnic_uiom_reg_account(struct work_struct *work) +{ +	struct usnic_uiom_reg *umem = container_of(work, +						struct usnic_uiom_reg, work); + +	down_write(&umem->mm->mmap_sem); +	umem->mm->locked_vm -= umem->diff; +	up_write(&umem->mm->mmap_sem); +	mmput(umem->mm); +	kfree(umem); +} + +static int usnic_uiom_dma_fault(struct iommu_domain *domain, +				struct device *dev, +				unsigned long iova, int flags, +				void *token) +{ +	usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n", +		dev_name(dev), +		domain, iova, flags); +	return -ENOSYS; +} + +static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) +{ +	struct usnic_uiom_chunk *chunk, *tmp; +	struct page *page; +	struct scatterlist *sg; +	int i; +	dma_addr_t pa; + +	list_for_each_entry_safe(chunk, tmp, chunk_list, list) { +		for_each_sg(chunk->page_list, sg, chunk->nents, i) { +			page = sg_page(sg); +			pa = sg_phys(sg); +			if (dirty) +				set_page_dirty_lock(page); +			put_page(page); +			usnic_dbg("pa: %pa\n", &pa); +		} +		kfree(chunk); +	} +} + +static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, +				int dmasync, struct list_head *chunk_list) +{ +	struct page **page_list; +	struct scatterlist *sg; +	struct usnic_uiom_chunk *chunk; +	unsigned long locked; +	unsigned long lock_limit; +	unsigned long cur_base; +	unsigned long npages; +	int ret; +	int off; +	int i; +	int flags; +	dma_addr_t pa; +	
DEFINE_DMA_ATTRS(attrs); + +	if (dmasync) +		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); + +	if (!can_do_mlock()) +		return -EPERM; + +	INIT_LIST_HEAD(chunk_list); + +	page_list = (struct page **) __get_free_page(GFP_KERNEL); +	if (!page_list) +		return -ENOMEM; + +	npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT; + +	down_write(&current->mm->mmap_sem); + +	locked = npages + current->mm->locked_vm; +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + +	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { +		ret = -ENOMEM; +		goto out; +	} + +	flags = IOMMU_READ | IOMMU_CACHE; +	flags |= (writable) ? IOMMU_WRITE : 0; +	cur_base = addr & PAGE_MASK; +	ret = 0; + +	while (npages) { +		ret = get_user_pages(current, current->mm, cur_base, +					min_t(unsigned long, npages, +					PAGE_SIZE / sizeof(struct page *)), +					1, !writable, page_list, NULL); + +		if (ret < 0) +			goto out; + +		npages -= ret; +		off = 0; + +		while (ret) { +			chunk = kmalloc(sizeof(*chunk) + +					sizeof(struct scatterlist) * +					min_t(int, ret, USNIC_UIOM_PAGE_CHUNK), +					GFP_KERNEL); +			if (!chunk) { +				ret = -ENOMEM; +				goto out; +			} + +			chunk->nents = min_t(int, ret, USNIC_UIOM_PAGE_CHUNK); +			sg_init_table(chunk->page_list, chunk->nents); +			for_each_sg(chunk->page_list, sg, chunk->nents, i) { +				sg_set_page(sg, page_list[i + off], +						PAGE_SIZE, 0); +				pa = sg_phys(sg); +				usnic_dbg("va: 0x%lx pa: %pa\n", +						cur_base + i*PAGE_SIZE, &pa); +			} +			cur_base += chunk->nents * PAGE_SIZE; +			ret -= chunk->nents; +			off += chunk->nents; +			list_add_tail(&chunk->list, chunk_list); +		} + +		ret = 0; +	} + +out: +	if (ret < 0) +		usnic_uiom_put_pages(chunk_list, 0); +	else +		current->mm->locked_vm = locked; + +	up_write(&current->mm->mmap_sem); +	free_page((unsigned long) page_list); +	return ret; +} + +static void usnic_uiom_unmap_sorted_intervals(struct list_head *intervals, +						struct usnic_uiom_pd *pd) +{ +	struct usnic_uiom_interval_node *interval, *tmp; +	long unsigned va, size; + +	list_for_each_entry_safe(interval, tmp, intervals, link) { +		va = interval->start << PAGE_SHIFT; +		size = ((interval->last - interval->start) + 1) << PAGE_SHIFT; +		while (size > 0) { +			/* Workaround for RH 970401 */ +			usnic_dbg("va 0x%lx size 0x%lx", va, PAGE_SIZE); +			iommu_unmap(pd->domain, va, PAGE_SIZE); +			va += PAGE_SIZE; +			size -= PAGE_SIZE; +		} +	} +} + +static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd, +					struct usnic_uiom_reg *uiomr, +					int dirty) +{ +	int npages; +	unsigned long vpn_start, vpn_last; +	struct usnic_uiom_interval_node *interval, *tmp; +	int writable = 0; +	LIST_HEAD(rm_intervals); + +	npages = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; +	vpn_start = (uiomr->va & PAGE_MASK) >> PAGE_SHIFT; +	vpn_last = vpn_start + npages - 1; + +	spin_lock(&pd->lock); +	usnic_uiom_remove_interval(&pd->rb_root, vpn_start, +					vpn_last, &rm_intervals); +	usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd); + +	list_for_each_entry_safe(interval, tmp, &rm_intervals, link) { +		if (interval->flags & IOMMU_WRITE) +			writable = 1; +		list_del(&interval->link); +		kfree(interval); +	} + +	usnic_uiom_put_pages(&uiomr->chunk_list, dirty & writable); +	spin_unlock(&pd->lock); +} + +static int usnic_uiom_map_sorted_intervals(struct list_head *intervals, +						struct usnic_uiom_reg *uiomr) +{ +	int i, err; +	size_t size; +	struct usnic_uiom_chunk *chunk; +	struct usnic_uiom_interval_node *interval_node; +	dma_addr_t pa; +	dma_addr_t pa_start = 0; +	
dma_addr_t pa_end = 0; +	long int va_start = -EINVAL; +	struct usnic_uiom_pd *pd = uiomr->pd; +	long int va = uiomr->va & PAGE_MASK; +	int flags = IOMMU_READ | IOMMU_CACHE; + +	flags |= (uiomr->writable) ? IOMMU_WRITE : 0; +	chunk = list_first_entry(&uiomr->chunk_list, struct usnic_uiom_chunk, +									list); +	list_for_each_entry(interval_node, intervals, link) { +iter_chunk: +		for (i = 0; i < chunk->nents; i++, va += PAGE_SIZE) { +			pa = sg_phys(&chunk->page_list[i]); +			if ((va >> PAGE_SHIFT) < interval_node->start) +				continue; + +			if ((va >> PAGE_SHIFT) == interval_node->start) { +				/* First page of the interval */ +				va_start = va; +				pa_start = pa; +				pa_end = pa; +			} + +			WARN_ON(va_start == -EINVAL); + +			if ((pa_end + PAGE_SIZE != pa) && +					(pa != pa_start)) { +				/* PAs are not contiguous */ +				size = pa_end - pa_start + PAGE_SIZE; +				usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x", +					va_start, &pa_start, size, flags); +				err = iommu_map(pd->domain, va_start, pa_start, +							size, flags); +				if (err) { +					usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", +						va_start, &pa_start, size, err); +					goto err_out; +				} +				va_start = va; +				pa_start = pa; +				pa_end = pa; +			} + +			if ((va >> PAGE_SHIFT) == interval_node->last) { +				/* Last page of the interval */ +				size = pa - pa_start + PAGE_SIZE; +				usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n", +					va_start, &pa_start, size, flags); +				err = iommu_map(pd->domain, va_start, pa_start, +						size, flags); +				if (err) { +					usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", +						va_start, &pa_start, size, err); +					goto err_out; +				} +				break; +			} + +			if (pa != pa_start) +				pa_end += PAGE_SIZE; +		} + +		if (i == chunk->nents) { +			/* +			 * Hit last entry of the chunk, +			 * hence advance to next chunk +			 */ +			chunk = list_first_entry(&chunk->list, +							struct usnic_uiom_chunk, +							list); +			goto iter_chunk; +		} +	} + +	return 0; + +err_out: +	usnic_uiom_unmap_sorted_intervals(intervals, pd); +	return err; +} + +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, +						unsigned long addr, size_t size, +						int writable, int dmasync) +{ +	struct usnic_uiom_reg *uiomr; +	unsigned long va_base, vpn_start, vpn_last; +	unsigned long npages; +	int offset, err; +	LIST_HEAD(sorted_diff_intervals); + +	/* +	 * Intel IOMMU map throws an error if a translation entry is +	 * changed from read to write.  This module may not unmap +	 * and then remap the entry after fixing the permission +	 * b/c this open up a small windows where hw DMA may page fault +	 * Hence, make all entries to be writable. +	 */ +	writable = 1; + +	va_base = addr & PAGE_MASK; +	offset = addr & ~PAGE_MASK; +	npages = PAGE_ALIGN(size + offset) >> PAGE_SHIFT; +	vpn_start = (addr & PAGE_MASK) >> PAGE_SHIFT; +	vpn_last = vpn_start + npages - 1; + +	uiomr = kmalloc(sizeof(*uiomr), GFP_KERNEL); +	if (!uiomr) +		return ERR_PTR(-ENOMEM); + +	uiomr->va = va_base; +	uiomr->offset = offset; +	uiomr->length = size; +	uiomr->writable = writable; +	uiomr->pd = pd; + +	err = usnic_uiom_get_pages(addr, size, writable, dmasync, +					&uiomr->chunk_list); +	if (err) { +		usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n", +				vpn_start, vpn_last, err); +		goto out_free_uiomr; +	} + +	spin_lock(&pd->lock); +	err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last, +						(writable) ? 
IOMMU_WRITE : 0, +						IOMMU_WRITE, +						&pd->rb_root, +						&sorted_diff_intervals); +	if (err) { +		usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n", +						vpn_start, vpn_last, err); +		goto out_put_pages; +	} + +	err = usnic_uiom_map_sorted_intervals(&sorted_diff_intervals, uiomr); +	if (err) { +		usnic_err("Failed map interval vpn [0x%lx,0x%lx] err %d\n", +						vpn_start, vpn_last, err); +		goto out_put_intervals; + +	} + +	err = usnic_uiom_insert_interval(&pd->rb_root, vpn_start, vpn_last, +					(writable) ? IOMMU_WRITE : 0); +	if (err) { +		usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n", +						vpn_start, vpn_last, err); +		goto out_unmap_intervals; +	} + +	usnic_uiom_put_interval_set(&sorted_diff_intervals); +	spin_unlock(&pd->lock); + +	return uiomr; + +out_unmap_intervals: +	usnic_uiom_unmap_sorted_intervals(&sorted_diff_intervals, pd); +out_put_intervals: +	usnic_uiom_put_interval_set(&sorted_diff_intervals); +out_put_pages: +	usnic_uiom_put_pages(&uiomr->chunk_list, 0); +	spin_unlock(&pd->lock); +out_free_uiomr: +	kfree(uiomr); +	return ERR_PTR(err); +} + +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing) +{ +	struct mm_struct *mm; +	unsigned long diff; + +	__usnic_uiom_reg_release(uiomr->pd, uiomr, 1); + +	mm = get_task_mm(current); +	if (!mm) { +		kfree(uiomr); +		return; +	} + +	diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + +	/* +	 * We may be called with the mm's mmap_sem already held.  This +	 * can happen when a userspace munmap() is the call that drops +	 * the last reference to our file and calls our release +	 * method.  If there are memory regions to destroy, we'll end +	 * up here and not be able to take the mmap_sem.  In that case +	 * we defer the vm_locked accounting to the system workqueue. +	 */ +	if (closing) { +		if (!down_write_trylock(&mm->mmap_sem)) { +			INIT_WORK(&uiomr->work, usnic_uiom_reg_account); +			uiomr->mm = mm; +			uiomr->diff = diff; + +			queue_work(usnic_uiom_wq, &uiomr->work); +			return; +		} +	} else +		down_write(&mm->mmap_sem); + +	current->mm->locked_vm -= diff; +	up_write(&mm->mmap_sem); +	mmput(mm); +	kfree(uiomr); +} + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void) +{ +	struct usnic_uiom_pd *pd; +	void *domain; + +	pd = kzalloc(sizeof(*pd), GFP_KERNEL); +	if (!pd) +		return ERR_PTR(-ENOMEM); + +	pd->domain = domain = iommu_domain_alloc(&pci_bus_type); +	if (IS_ERR_OR_NULL(domain)) { +		usnic_err("Failed to allocate IOMMU domain with err %ld\n", +				PTR_ERR(pd->domain)); +		kfree(pd); +		return ERR_PTR(domain ? 
PTR_ERR(domain) : -ENOMEM); +	} + +	iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL); + +	spin_lock_init(&pd->lock); +	INIT_LIST_HEAD(&pd->devs); + +	return pd; +} + +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd) +{ +	iommu_domain_free(pd->domain); +	kfree(pd); +} + +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev) +{ +	struct usnic_uiom_dev *uiom_dev; +	int err; + +	uiom_dev = kzalloc(sizeof(*uiom_dev), GFP_ATOMIC); +	if (!uiom_dev) +		return -ENOMEM; +	uiom_dev->dev = dev; + +	err = iommu_attach_device(pd->domain, dev); +	if (err) +		goto out_free_dev; + +	if (!iommu_domain_has_cap(pd->domain, IOMMU_CAP_CACHE_COHERENCY)) { +		usnic_err("IOMMU of %s does not support cache coherency\n", +				dev_name(dev)); +		err = -EINVAL; +		goto out_detach_device; +	} + +	spin_lock(&pd->lock); +	list_add_tail(&uiom_dev->link, &pd->devs); +	pd->dev_cnt++; +	spin_unlock(&pd->lock); + +	return 0; + +out_detach_device: +	iommu_detach_device(pd->domain, dev); +out_free_dev: +	kfree(uiom_dev); +	return err; +} + +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, struct device *dev) +{ +	struct usnic_uiom_dev *uiom_dev; +	int found = 0; + +	spin_lock(&pd->lock); +	list_for_each_entry(uiom_dev, &pd->devs, link) { +		if (uiom_dev->dev == dev) { +			found = 1; +			break; +		} +	} + +	if (!found) { +		usnic_err("Unable to free dev %s - not found\n", +				dev_name(dev)); +		spin_unlock(&pd->lock); +		return; +	} + +	list_del(&uiom_dev->link); +	pd->dev_cnt--; +	spin_unlock(&pd->lock); + +	return iommu_detach_device(pd->domain, dev); +} + +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd) +{ +	struct usnic_uiom_dev *uiom_dev; +	struct device **devs; +	int i = 0; + +	spin_lock(&pd->lock); +	devs = kcalloc(pd->dev_cnt + 1, sizeof(*devs), GFP_ATOMIC); +	if (!devs) { +		devs = ERR_PTR(-ENOMEM); +		goto out; +	} + +	list_for_each_entry(uiom_dev, &pd->devs, link) { +		devs[i++] = uiom_dev->dev; +	} +out: +	spin_unlock(&pd->lock); +	return devs; +} + +void usnic_uiom_free_dev_list(struct device **devs) +{ +	kfree(devs); +} + +int usnic_uiom_init(char *drv_name) +{ +	if (!iommu_present(&pci_bus_type)) { +		usnic_err("IOMMU required but not present or enabled.  USNIC QPs will not function w/o enabling IOMMU\n"); +		return -EPERM; +	} + +	usnic_uiom_wq = create_workqueue(drv_name); +	if (!usnic_uiom_wq) { +		usnic_err("Unable to alloc wq for drv %s\n", drv_name); +		return -ENOMEM; +	} + +	return 0; +} + +void usnic_uiom_fini(void) +{ +	flush_workqueue(usnic_uiom_wq); +	destroy_workqueue(usnic_uiom_wq); +} diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h new file mode 100644 index 00000000000..70440996e8f --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_UIOM_H_ +#define USNIC_UIOM_H_ + +#include <linux/list.h> +#include <linux/scatterlist.h> + +#include "usnic_uiom_interval_tree.h" + +#define USNIC_UIOM_READ			(1) +#define USNIC_UIOM_WRITE		(2) + +#define USNIC_UIOM_MAX_PD_CNT		(1000) +#define USNIC_UIOM_MAX_MR_CNT		(1000000) +#define USNIC_UIOM_MAX_MR_SIZE		(~0UL) +#define USNIC_UIOM_PAGE_SIZE		(PAGE_SIZE) + +struct usnic_uiom_dev { +	struct device			*dev; +	struct list_head		link; +}; + +struct usnic_uiom_pd { +	struct iommu_domain		*domain; +	spinlock_t			lock; +	struct rb_root			rb_root; +	struct list_head		devs; +	int				dev_cnt; +}; + +struct usnic_uiom_reg { +	struct usnic_uiom_pd		*pd; +	unsigned long			va; +	size_t				length; +	int				offset; +	int				page_size; +	int				writable; +	struct list_head		chunk_list; +	struct work_struct		work; +	struct mm_struct		*mm; +	unsigned long			diff; +}; + +struct usnic_uiom_chunk { +	struct list_head		list; +	int				nents; +	struct scatterlist		page_list[0]; +}; + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void); +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd); +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev); +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, +					struct device *dev); +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd); +void usnic_uiom_free_dev_list(struct device **devs); +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, +						unsigned long addr, size_t size, +						int access, int dmasync); +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing); +int usnic_uiom_init(char *drv_name); +void usnic_uiom_fini(void); +#endif /* USNIC_UIOM_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c new file mode 100644 index 00000000000..3a4288e0fba --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <linux/init.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/list_sort.h> + +#include <linux/interval_tree_generic.h> +#include "usnic_uiom_interval_tree.h" + +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) + +#define MAKE_NODE(node, start, end, ref_cnt, flags, err, err_out)	\ +		do {							\ +			node = usnic_uiom_interval_node_alloc(start,	\ +					end, ref_cnt, flags);		\ +				if (!node) {				\ +					err = -ENOMEM;			\ +					goto err_out;			\ +				}					\ +		} while (0) + +#define MARK_FOR_ADD(node, list) (list_add_tail(&node->link, list)) + +#define MAKE_NODE_AND_APPEND(node, start, end, ref_cnt, flags, err,	\ +				err_out, list)				\ +				do {					\ +					MAKE_NODE(node, start, end,	\ +						ref_cnt, flags, err,	\ +						err_out);		\ +					MARK_FOR_ADD(node, list);	\ +				} while (0) + +#define FLAGS_EQUAL(flags1, flags2, mask)				\ +			(((flags1) & (mask)) == ((flags2) & (mask))) + +static struct usnic_uiom_interval_node* +usnic_uiom_interval_node_alloc(long int start, long int last, int ref_cnt, +				int flags) +{ +	struct usnic_uiom_interval_node *interval = kzalloc(sizeof(*interval), +								GFP_ATOMIC); +	if (!interval) +		return NULL; + +	interval->start = start; +	interval->last = last; +	interval->flags = flags; +	interval->ref_cnt = ref_cnt; + +	return interval; +} + +static int interval_cmp(void *priv, struct list_head *a, struct list_head *b) +{ +	struct usnic_uiom_interval_node *node_a, *node_b; + +	node_a = list_entry(a, struct usnic_uiom_interval_node, link); +	node_b = list_entry(b, struct usnic_uiom_interval_node, link); + +	/* long to int */ +	if (node_a->start < node_b->start) +		return -1; +	else if (node_a->start > node_b->start) +		return 1; + +	return 0; +} + +static void +find_intervals_intersection_sorted(struct rb_root *root, unsigned long start, +					unsigned long last, +					struct list_head *list) +{ +	struct usnic_uiom_interval_node *node; + +	INIT_LIST_HEAD(list); + +	for (node = usnic_uiom_interval_tree_iter_first(root, start, last); +		node; +		node = usnic_uiom_interval_tree_iter_next(node, start, last)) +		list_add_tail(&node->link, list); + +	list_sort(NULL, list, interval_cmp); +} + +int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last, +					int flags, int flag_mask, +					struct rb_root *root, +					struct list_head *diff_set) +{ +	struct usnic_uiom_interval_node *interval, *tmp; +	int err = 0; +	long int pivot = start; +	LIST_HEAD(intersection_set); + +	INIT_LIST_HEAD(diff_set); + +	find_intervals_intersection_sorted(root, start, last, +						&intersection_set); + +	list_for_each_entry(interval, &intersection_set, link) { +		if (pivot < interval->start) { +			MAKE_NODE_AND_APPEND(tmp, pivot, interval->start - 1, +						1, flags, err, err_out, +						diff_set); +			pivot = interval->start; +		} + +		/* +		 * Invariant: Set [start, pivot] is either in diff_set or root, +		 * but not in both. 
+		 */ + +		if (pivot > interval->last) { +			continue; +		} else if (pivot <= interval->last && +				FLAGS_EQUAL(interval->flags, flags, +				flag_mask)) { +			pivot = interval->last + 1; +		} +	} + +	if (pivot <= last) +		MAKE_NODE_AND_APPEND(tmp, pivot, last, 1, flags, err, err_out, +					diff_set); + +	return 0; + +err_out: +	list_for_each_entry_safe(interval, tmp, diff_set, link) { +		list_del(&interval->link); +		kfree(interval); +	} + +	return err; +} + +void usnic_uiom_put_interval_set(struct list_head *intervals) +{ +	struct usnic_uiom_interval_node *interval, *tmp; +	list_for_each_entry_safe(interval, tmp, intervals, link) +		kfree(interval); +} + +int usnic_uiom_insert_interval(struct rb_root *root, unsigned long start, +				unsigned long last, int flags) +{ +	struct usnic_uiom_interval_node *interval, *tmp; +	unsigned long istart, ilast; +	int iref_cnt, iflags; +	unsigned long lpivot = start; +	int err = 0; +	LIST_HEAD(to_add); +	LIST_HEAD(intersection_set); + +	find_intervals_intersection_sorted(root, start, last, +						&intersection_set); + +	list_for_each_entry(interval, &intersection_set, link) { +		/* +		 * Invariant - lpivot is the left edge of next interval to be +		 * inserted +		 */ +		istart = interval->start; +		ilast = interval->last; +		iref_cnt = interval->ref_cnt; +		iflags = interval->flags; + +		if (istart < lpivot) { +			MAKE_NODE_AND_APPEND(tmp, istart, lpivot - 1, iref_cnt, +						iflags, err, err_out, &to_add); +		} else if (istart > lpivot) { +			MAKE_NODE_AND_APPEND(tmp, lpivot, istart - 1, 1, flags, +						err, err_out, &to_add); +			lpivot = istart; +		} else { +			lpivot = istart; +		} + +		if (ilast > last) { +			MAKE_NODE_AND_APPEND(tmp, lpivot, last, iref_cnt + 1, +						iflags | flags, err, err_out, +						&to_add); +			MAKE_NODE_AND_APPEND(tmp, last + 1, ilast, iref_cnt, +						iflags, err, err_out, &to_add); +		} else { +			MAKE_NODE_AND_APPEND(tmp, lpivot, ilast, iref_cnt + 1, +						iflags | flags, err, err_out, +						&to_add); +		} + +		lpivot = ilast + 1; +	} + +	if (lpivot <= last) +		MAKE_NODE_AND_APPEND(tmp, lpivot, last, 1, flags, err, err_out, +					&to_add); + +	list_for_each_entry_safe(interval, tmp, &intersection_set, link) { +		usnic_uiom_interval_tree_remove(interval, root); +		kfree(interval); +	} + +	list_for_each_entry(interval, &to_add, link) +		usnic_uiom_interval_tree_insert(interval, root); + +	return 0; + +err_out: +	list_for_each_entry_safe(interval, tmp, &to_add, link) +		kfree(interval); + +	return err; +} + +void usnic_uiom_remove_interval(struct rb_root *root, unsigned long start, +				unsigned long last, struct list_head *removed) +{ +	struct usnic_uiom_interval_node *interval; + +	for (interval = usnic_uiom_interval_tree_iter_first(root, start, last); +			interval; +			interval = usnic_uiom_interval_tree_iter_next(interval, +									start, +									last)) { +		if (--interval->ref_cnt == 0) +			list_add_tail(&interval->link, removed); +	} + +	list_for_each_entry(interval, removed, link) +		usnic_uiom_interval_tree_remove(interval, root); +} + +INTERVAL_TREE_DEFINE(struct usnic_uiom_interval_node, rb, +			unsigned long, __subtree_last, +			START, LAST, , usnic_uiom_interval_tree) diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h new file mode 100644 index 00000000000..d4f752e258f --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. 
+ * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_UIOM_INTERVAL_TREE_H_ +#define USNIC_UIOM_INTERVAL_TREE_H_ + +#include <linux/rbtree.h> + +struct usnic_uiom_interval_node { +	struct rb_node			rb; +	struct list_head		link; +	unsigned long			start; +	unsigned long			last; +	unsigned long			__subtree_last; +	unsigned int			ref_cnt; +	int				flags; +}; + +extern void +usnic_uiom_interval_tree_insert(struct usnic_uiom_interval_node *node, +					struct rb_root *root); +extern void +usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node, +					struct rb_root *root); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_first(struct rb_root *root, +					unsigned long start, +					unsigned long last); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_next(struct usnic_uiom_interval_node *node, +			unsigned long start, unsigned long last); +/* + * Inserts {start...last} into {root}.  If there are overlaps, + * nodes will be broken up and merged + */ +int usnic_uiom_insert_interval(struct rb_root *root, +				unsigned long start, unsigned long last, +				int flags); +/* + * Removed {start...last} from {root}.  The nodes removed are returned in + * 'removed.' The caller is responsibile for freeing memory of nodes in + * 'removed.' + */ +void usnic_uiom_remove_interval(struct rb_root *root, +				unsigned long start, unsigned long last, +				struct list_head *removed); +/* + * Returns {start...last} - {root} (relative complement of {start...last} in + * {root}) in diff_set sorted ascendingly + */ +int usnic_uiom_get_intervals_diff(unsigned long start, +					unsigned long last, int flags, +					int flag_mask, +					struct rb_root *root, +					struct list_head *diff_set); +/* Call this to free diff_set returned by usnic_uiom_get_intervals_diff */ +void usnic_uiom_put_interval_set(struct list_head *intervals); +#endif /* USNIC_UIOM_INTERVAL_TREE_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.c b/drivers/infiniband/hw/usnic/usnic_vnic.c new file mode 100644 index 00000000000..656b88c39ed --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_vnic.c @@ -0,0 +1,467 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include "usnic_ib.h" +#include "vnic_resource.h" +#include "usnic_log.h" +#include "usnic_vnic.h" + +struct usnic_vnic { +	struct vnic_dev			*vdev; +	struct vnic_dev_bar		bar[PCI_NUM_RESOURCES]; +	struct usnic_vnic_res_chunk	chunks[USNIC_VNIC_RES_TYPE_MAX]; +	spinlock_t			res_lock; +}; + +static enum vnic_res_type _to_vnic_res_type(enum usnic_vnic_res_type res_type) +{ +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ +		vnic_res_type, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ +		vnic_res_type, +	static enum vnic_res_type usnic_vnic_type_2_vnic_type[] = { +						USNIC_VNIC_RES_TYPES}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + +	if (res_type >= USNIC_VNIC_RES_TYPE_MAX) +		return RES_TYPE_MAX; + +	return usnic_vnic_type_2_vnic_type[res_type]; +} + +const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type) +{ +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ +		desc, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ +		desc, +	static const char * const usnic_vnic_res_type_desc[] = { +						USNIC_VNIC_RES_TYPES}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + +	if (res_type >= USNIC_VNIC_RES_TYPE_MAX) +		return "unknown"; + +	return usnic_vnic_res_type_desc[res_type]; + +} + +const char *usnic_vnic_pci_name(struct usnic_vnic *vnic) +{ +	return pci_name(usnic_vnic_get_pdev(vnic)); +} + +int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, +			int buf_sz, +			void *hdr_obj, +			int (*printtitle)(void *, char*, int), +			int (*printcols)(char *, int), +			int (*printrow)(void *, char *, int)) +{ +	struct usnic_vnic_res_chunk *chunk; +	struct usnic_vnic_res *res; +	struct vnic_dev_bar *bar0; +	int i, j, offset; + +	offset = 0; +	bar0 = usnic_vnic_get_bar(vnic, 0); +	offset += scnprintf(buf + offset, buf_sz - offset, +			"VF:%hu BAR0 bus_addr=%pa vaddr=0x%p size=%ld ", +			usnic_vnic_get_index(vnic), +			&bar0->bus_addr, +			bar0->vaddr, bar0->len); +	if (printtitle) +		offset += printtitle(hdr_obj, buf + offset, buf_sz - offset); +	offset += scnprintf(buf + offset, buf_sz - offset, "\n"); +	offset += scnprintf(buf + offset, buf_sz - offset, +			"|RES\t|CTRL_PIN\t\t|IN_USE\t"); +	if (printcols) +		offset += printcols(buf + offset, buf_sz - offset); +	offset += scnprintf(buf + offset, buf_sz - offset, "\n"); + +	spin_lock(&vnic->res_lock); +	for (i = 0; i < ARRAY_SIZE(vnic->chunks); i++) { +		chunk = &vnic->chunks[i]; +		for (j = 0; j < chunk->cnt; j++) { +			res = chunk->res[j]; +			offset += scnprintf(buf + offset, buf_sz - offset, +					"|%s[%u]\t|0x%p\t|%u\t", +					usnic_vnic_res_type_to_str(res->type), +					res->vnic_idx, res->ctrl, !!res->owner); +			if (printrow) { +				offset += printrow(res->owner, buf + offset, +							buf_sz - offset); +			} +			offset += scnprintf(buf + offset, buf_sz - offset, +						"\n"); +		} +	} +	spin_unlock(&vnic->res_lock); +	return offset; +} + +void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec, +				enum usnic_vnic_res_type trgt_type, +				u16 cnt) +{ +	int i; + +	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { +		if 
(spec->resources[i].type == trgt_type) { +			spec->resources[i].cnt = cnt; +			return; +		} +	} + +	WARN_ON(1); +} + +int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec, +					struct usnic_vnic_res_spec *res_spec) +{ +	int found, i, j; + +	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { +		found = 0; + +		for (j = 0; j < USNIC_VNIC_RES_TYPE_MAX; j++) { +			if (res_spec->resources[i].type != +				min_spec->resources[i].type) +				continue; +			found = 1; +			if (min_spec->resources[i].cnt > +					res_spec->resources[i].cnt) +				return -EINVAL; +			break; +		} + +		if (!found) +			return -EINVAL; +	} +	return 0; +} + +int usnic_vnic_spec_dump(char *buf, int buf_sz, +				struct usnic_vnic_res_spec *res_spec) +{ +	enum usnic_vnic_res_type res_type; +	int res_cnt; +	int i; +	int offset = 0; + +	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { +		res_type = res_spec->resources[i].type; +		res_cnt = res_spec->resources[i].cnt; +		offset += scnprintf(buf + offset, buf_sz - offset, +				"Res: %s Cnt: %d ", +				usnic_vnic_res_type_to_str(res_type), +				res_cnt); +	} + +	return offset; +} + +int usnic_vnic_check_room(struct usnic_vnic *vnic, +				struct usnic_vnic_res_spec *res_spec) +{ +	int i; +	enum usnic_vnic_res_type res_type; +	int res_cnt; + +	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { +		res_type = res_spec->resources[i].type; +		res_cnt = res_spec->resources[i].cnt; + +		if (res_type == USNIC_VNIC_RES_TYPE_EOL) +			break; + +		if (res_cnt > usnic_vnic_res_free_cnt(vnic, res_type)) +			return -EBUSY; +	} + +	return 0; +} + +int usnic_vnic_res_cnt(struct usnic_vnic *vnic, +				enum usnic_vnic_res_type type) +{ +	return vnic->chunks[type].cnt; +} + +int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic, +				enum usnic_vnic_res_type type) +{ +	return vnic->chunks[type].free_cnt; +} + +struct usnic_vnic_res_chunk * +usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type, +				int cnt, void *owner) +{ +	struct usnic_vnic_res_chunk *src, *ret; +	struct usnic_vnic_res *res; +	int i; + +	if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 1 || !owner) +		return ERR_PTR(-EINVAL); + +	ret = kzalloc(sizeof(*ret), GFP_ATOMIC); +	if (!ret) { +		usnic_err("Failed to allocate chunk for %s - Out of memory\n", +				usnic_vnic_pci_name(vnic)); +		return ERR_PTR(-ENOMEM); +	} + +	ret->res = kzalloc(sizeof(*(ret->res))*cnt, GFP_ATOMIC); +	if (!ret->res) { +		usnic_err("Failed to allocate resources for %s. 
Out of memory\n", +				usnic_vnic_pci_name(vnic)); +		kfree(ret); +		return ERR_PTR(-ENOMEM); +	} + +	spin_lock(&vnic->res_lock); +	src = &vnic->chunks[type]; +	for (i = 0; i < src->cnt && ret->cnt < cnt; i++) { +		res = src->res[i]; +		if (!res->owner) { +			src->free_cnt--; +			res->owner = owner; +			ret->res[ret->cnt++] = res; +		} +	} + +	spin_unlock(&vnic->res_lock); +	ret->type = type; +	ret->vnic = vnic; +	WARN_ON(ret->cnt != cnt); + +	return ret; +} + +void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk) +{ + +	struct usnic_vnic_res *res; +	int i; +	struct usnic_vnic *vnic = chunk->vnic; + +	spin_lock(&vnic->res_lock); +	while ((i = --chunk->cnt) >= 0) { +		res = chunk->res[i]; +		chunk->res[i] = NULL; +		res->owner = NULL; +		vnic->chunks[res->type].free_cnt++; +	} +	spin_unlock(&vnic->res_lock); + +	kfree(chunk->res); +	kfree(chunk); +} + +u16 usnic_vnic_get_index(struct usnic_vnic *vnic) +{ +	return usnic_vnic_get_pdev(vnic)->devfn - 1; +} + +static int usnic_vnic_alloc_res_chunk(struct usnic_vnic *vnic, +					enum usnic_vnic_res_type type, +					struct usnic_vnic_res_chunk *chunk) +{ +	int cnt, err, i; +	struct usnic_vnic_res *res; + +	cnt = vnic_dev_get_res_count(vnic->vdev, _to_vnic_res_type(type)); +	if (cnt < 1) +		return -EINVAL; + +	chunk->cnt = chunk->free_cnt = cnt; +	chunk->res = kzalloc(sizeof(*(chunk->res))*cnt, GFP_KERNEL); +	if (!chunk->res) +		return -ENOMEM; + +	for (i = 0; i < cnt; i++) { +		res = kzalloc(sizeof(*res), GFP_KERNEL); +		if (!res) { +			err = -ENOMEM; +			goto fail; +		} +		res->type = type; +		res->vnic_idx = i; +		res->vnic = vnic; +		res->ctrl = vnic_dev_get_res(vnic->vdev, +						_to_vnic_res_type(type), i); +		chunk->res[i] = res; +	} + +	chunk->vnic = vnic; +	return 0; +fail: +	for (i--; i >= 0; i--) +		kfree(chunk->res[i]); +	kfree(chunk->res); +	return err; +} + +static void usnic_vnic_free_res_chunk(struct usnic_vnic_res_chunk *chunk) +{ +	int i; +	for (i = 0; i < chunk->cnt; i++) +		kfree(chunk->res[i]); +	kfree(chunk->res); +} + +static int usnic_vnic_discover_resources(struct pci_dev *pdev, +						struct usnic_vnic *vnic) +{ +	enum usnic_vnic_res_type res_type; +	int i; +	int err = 0; + +	for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { +		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) +			continue; +		vnic->bar[i].len = pci_resource_len(pdev, i); +		vnic->bar[i].vaddr = pci_iomap(pdev, i, vnic->bar[i].len); +		if (!vnic->bar[i].vaddr) { +			usnic_err("Cannot memory-map BAR %d, aborting\n", +					i); +			err = -ENODEV; +			goto out_clean_bar; +		} +		vnic->bar[i].bus_addr = pci_resource_start(pdev, i); +	} + +	vnic->vdev = vnic_dev_register(NULL, pdev, pdev, vnic->bar, +			ARRAY_SIZE(vnic->bar)); +	if (!vnic->vdev) { +		usnic_err("Failed to register device %s\n", +				pci_name(pdev)); +		err = -EINVAL; +		goto out_clean_bar; +	} + +	for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1; +			res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) { +		err = usnic_vnic_alloc_res_chunk(vnic, res_type, +						&vnic->chunks[res_type]); +		if (err) { +			usnic_err("Failed to alloc res %s with err %d\n", +					usnic_vnic_res_type_to_str(res_type), +					err); +			goto out_clean_chunks; +		} +	} + +	return 0; + +out_clean_chunks: +	for (res_type--; res_type > USNIC_VNIC_RES_TYPE_EOL; res_type--) +		usnic_vnic_free_res_chunk(&vnic->chunks[res_type]); +	vnic_dev_unregister(vnic->vdev); +out_clean_bar: +	for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { +		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) +			continue; +		if (!vnic->bar[i].vaddr) +		
	break; + +		iounmap(vnic->bar[i].vaddr); +	} + +	return err; +} + +struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic) +{ +	return vnic_dev_get_pdev(vnic->vdev); +} + +struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic, +				int bar_num) +{ +	return (bar_num < ARRAY_SIZE(vnic->bar)) ? &vnic->bar[bar_num] : NULL; +} + +static void usnic_vnic_release_resources(struct usnic_vnic *vnic) +{ +	int i; +	struct pci_dev *pdev; +	enum usnic_vnic_res_type res_type; + +	pdev = usnic_vnic_get_pdev(vnic); + +	for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1; +			res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) +		usnic_vnic_free_res_chunk(&vnic->chunks[res_type]); + +	vnic_dev_unregister(vnic->vdev); + +	for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { +		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) +			continue; +		iounmap(vnic->bar[i].vaddr); +	} +} + +struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev) +{ +	struct usnic_vnic *vnic; +	int err = 0; + +	if (!pci_is_enabled(pdev)) { +		usnic_err("PCI dev %s is disabled\n", pci_name(pdev)); +		return ERR_PTR(-EINVAL); +	} + +	vnic = kzalloc(sizeof(*vnic), GFP_KERNEL); +	if (!vnic) { +		usnic_err("Failed to alloc vnic for %s - out of memory\n", +				pci_name(pdev)); +		return ERR_PTR(-ENOMEM); +	} + +	spin_lock_init(&vnic->res_lock); + +	err = usnic_vnic_discover_resources(pdev, vnic); +	if (err) { +		usnic_err("Failed to discover %s resources with err %d\n", +				pci_name(pdev), err); +		goto out_free_vnic; +	} + +	usnic_dbg("Allocated vnic for %s\n", usnic_vnic_pci_name(vnic)); + +	return vnic; + +out_free_vnic: +	kfree(vnic); + +	return ERR_PTR(err); +} + +void usnic_vnic_free(struct usnic_vnic *vnic) +{ +	usnic_vnic_release_resources(vnic); +	kfree(vnic); +} diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.h b/drivers/infiniband/hw/usnic/usnic_vnic.h new file mode 100644 index 00000000000..14d931a8829 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_vnic.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_VNIC_H_ +#define USNIC_VNIC_H_ + +#include <linux/pci.h> + +#include "vnic_dev.h" + +/*                      =USNIC_VNIC_RES_TYPE= =VNIC_RES=   =DESC= */ +#define USNIC_VNIC_RES_TYPES \ +	DEFINE_USNIC_VNIC_RES_AT(EOL, RES_TYPE_EOL, "EOL", 0) \ +	DEFINE_USNIC_VNIC_RES(WQ, RES_TYPE_WQ, "WQ") \ +	DEFINE_USNIC_VNIC_RES(RQ, RES_TYPE_RQ, "RQ") \ +	DEFINE_USNIC_VNIC_RES(CQ, RES_TYPE_CQ, "CQ") \ +	DEFINE_USNIC_VNIC_RES(INTR, RES_TYPE_INTR_CTRL, "INT") \ +	DEFINE_USNIC_VNIC_RES(MAX, RES_TYPE_MAX, "MAX")\ + +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ +	USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t = val, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ +	USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t, +enum usnic_vnic_res_type { +	USNIC_VNIC_RES_TYPES +}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + +struct usnic_vnic_res { +	enum usnic_vnic_res_type	type; +	unsigned int			vnic_idx; +	struct usnic_vnic		*vnic; +	void __iomem			*ctrl; +	void				*owner; +}; + +struct usnic_vnic_res_chunk { +	enum usnic_vnic_res_type	type; +	int				cnt; +	int				free_cnt; +	struct usnic_vnic_res		**res; +	struct usnic_vnic		*vnic; +}; + +struct usnic_vnic_res_desc { +	enum usnic_vnic_res_type	type; +	uint16_t			cnt; +}; + +struct usnic_vnic_res_spec { +	struct usnic_vnic_res_desc resources[USNIC_VNIC_RES_TYPE_MAX]; +}; + +const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type); +const char *usnic_vnic_pci_name(struct usnic_vnic *vnic); +int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, int buf_sz, +			void *hdr_obj, +			int (*printtitle)(void *, char*, int), +			int (*printcols)(char *, int), +			int (*printrow)(void *, char *, int)); +void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec, +				enum usnic_vnic_res_type trgt_type, +				u16 cnt); +int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec, +					struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_spec_dump(char *buf, int buf_sz, +				struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_check_room(struct usnic_vnic *vnic, +				struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_res_cnt(struct usnic_vnic *vnic, +				enum usnic_vnic_res_type type); +int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic, +				enum usnic_vnic_res_type type); +struct usnic_vnic_res_chunk * +usnic_vnic_get_resources(struct usnic_vnic *vnic, +				enum usnic_vnic_res_type type, +				int cnt, +				void *owner); +void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk); +struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic); +struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic, +				int bar_num); +struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev); +void usnic_vnic_free(struct usnic_vnic *vnic); +u16 usnic_vnic_get_index(struct usnic_vnic *vnic); + +#endif /*!USNIC_VNIC_H_*/  | 
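The USNIC_VNIC_RES_TYPES table in usnic_vnic.h above is an X-macro: the same list of DEFINE_USNIC_VNIC_RES*() entries is expanded once to generate enum usnic_vnic_res_type and again in usnic_vnic.c to generate the matching vnic_res_type and description tables, so all three stay in sync from a single definition. A minimal stand-alone sketch of the same expansion technique follows; the COLOR_* names are invented for illustration and are not part of the driver.

#include <stdio.h>

/*                   =ENUM=      =DESC=   */
#define COLOR_TYPES \
	DEFINE_COLOR(RED,   "red")   \
	DEFINE_COLOR(GREEN, "green") \
	DEFINE_COLOR(MAX,   "max")

/* First expansion: build the enum from the table. */
#define DEFINE_COLOR(name, desc) COLOR_##name,
enum color { COLOR_TYPES };
#undef DEFINE_COLOR

/* Second expansion: build a parallel description table from the same list. */
#define DEFINE_COLOR(name, desc) desc,
static const char * const color_desc[] = { COLOR_TYPES };
#undef DEFINE_COLOR

int main(void)
{
	int c;

	/* Enum values and strings line up because both came from one table. */
	for (c = 0; c < COLOR_MAX; c++)
		printf("%d -> %s\n", c, color_desc[c]);
	return 0;
}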

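usnic_uiom_get_intervals_diff() above computes a relative complement over the registered-interval tree by walking the sorted intersection with a pivot that marks the left edge of the part of [start, last] not yet accounted for, emitting a node for each gap (and for each piece whose flags differ). Below is a simplified userspace sketch of that pivot walk, ignoring flags and reference counts and using a plain sorted array in place of the rb-tree; all names are illustrative, not taken from the driver.

#include <stdio.h>

struct ival { unsigned long start, last; };	/* inclusive, sorted, disjoint */

/* Print [start, last] minus the given sorted interval set (pivot walk). */
static void print_diff(unsigned long start, unsigned long last,
		       const struct ival *set, int n)
{
	unsigned long pivot = start;
	int i;

	for (i = 0; i < n && pivot <= last; i++) {
		if (pivot < set[i].start)
			printf("[%lu, %lu]\n", pivot,
			       set[i].start - 1 < last ? set[i].start - 1 : last);
		/* Advance the pivot past the covered interval. */
		if (set[i].last + 1 > pivot)
			pivot = set[i].last + 1;
	}
	if (pivot <= last)
		printf("[%lu, %lu]\n", pivot, last);
}

int main(void)
{
	const struct ival set[] = { { 3, 5 }, { 9, 9 } };

	/* Prints the gaps: [0, 2], [6, 8], [10, 12] */
	print_diff(0, 12, set, 2);
	return 0;
}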