Diffstat (limited to 'drivers/infiniband/ulp')
30 files changed, 12838 insertions, 1809 deletions
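Much of this diffstat comes from IPoIB dropping its reliance on the core neighbour structure (and on the LRO code, which gives way to napi_gro_receive) in favor of a private RCU-protected neighbor hash table keyed directly on the 20-byte IPoIB hardware address, with the destination address carried in skb->cb via the new struct ipoib_cb. The sketch below is a rough userspace illustration of the bucket-selection idea only, not the driver code: mix3() stands in for the kernel's jhash_3words(), addr_hash_sketch() and the sample address are made up for the example, and byte-order handling of the QPN mask is glossed over; the authoritative implementation is ipoib_addr_hash() further down in ipoib_main.c.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define INFINIBAND_ALEN 20

/* Stand-in mixer for the kernel's jhash_3words(); hypothetical, not the real jhash. */
static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c)
{
	return (a * 0x9e3779b1u) ^ (b * 0x85ebca77u) ^ (c * 0xc2b2ae3du);
}

/*
 * Bucket selection idea from the new neighbor table: hash only the address
 * parts that spread entries -- the QPN octets in the first 32-bit word and
 * the port GUID words (octets 12..19) -- and ignore the subnet prefix,
 * mirroring ipoib_addr_hash() in the diff below.  Byte order of the QPN
 * mask is simplified here; the kernel uses a be32 IPOIB_QPN_MASK.
 */
static uint32_t addr_hash_sketch(const uint8_t daddr[INFINIBAND_ALEN], uint32_t mask)
{
	uint32_t d32[INFINIBAND_ALEN / 4];
	uint32_t qpn_mask = 0x00ffffff;

	memcpy(d32, daddr, sizeof(d32));
	return mix3(d32[3], d32[4], d32[0] & qpn_mask) & mask;
}

int main(void)
{
	uint8_t hwaddr[INFINIBAND_ALEN] = { 0x00, 0x12, 0x34, 0x56 };	/* made-up address */

	/* mask is table size - 1 for a power-of-two bucket count */
	printf("bucket %u\n", (unsigned) addr_hash_sketch(hwaddr, 1023));
	return 0;
}

In the diff itself the bucket count is roundup_pow_of_two(arp_tbl.gc_thresh3), so the mask used by ipoib_addr_hash() is simply size - 1.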
diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile new file mode 100644 index 00000000000..f3c7dcf0309 --- /dev/null +++ b/drivers/infiniband/ulp/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_INFINIBAND_IPOIB)		+= ipoib/ +obj-$(CONFIG_INFINIBAND_SRP)		+= srp/ +obj-$(CONFIG_INFINIBAND_SRPT)		+= srpt/ +obj-$(CONFIG_INFINIBAND_ISER)		+= iser/ +obj-$(CONFIG_INFINIBAND_ISERT)		+= isert/ diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig index 9d9a9dc51f1..cda8eac55ff 100644 --- a/drivers/infiniband/ulp/ipoib/Kconfig +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -1,7 +1,6 @@  config INFINIBAND_IPOIB  	tristate "IP-over-InfiniBand"  	depends on NETDEVICES && INET && (IPV6 || IPV6=n) -	select INET_LRO  	---help---  	  Support for the IP-over-InfiniBand protocol (IPoIB). This  	  transports IP packets over InfiniBand so you can use your IB @@ -25,7 +24,7 @@ config INFINIBAND_IPOIB_CM  	  unless you limit mtu for these destinations to 2044.  config INFINIBAND_IPOIB_DEBUG -	bool "IP-over-InfiniBand debugging" if EMBEDDED +	bool "IP-over-InfiniBand debugging" if EXPERT  	depends on INFINIBAND_IPOIB  	default y  	---help--- diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile index 3090100f0de..e5430dd5076 100644 --- a/drivers/infiniband/ulp/ipoib/Makefile +++ b/drivers/infiniband/ulp/ipoib/Makefile @@ -5,7 +5,8 @@ ib_ipoib-y					:= ipoib_main.o \  						   ipoib_multicast.o \  						   ipoib_verbs.o \  						   ipoib_vlan.o \ -						   ipoib_ethtool.o +						   ipoib_ethtool.o \ +						   ipoib_netlink.o  ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM)		+= ipoib_cm.o  ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG)	+= ipoib_fs.o diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 753a983a5fd..c639f90cfda 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -44,13 +44,14 @@  #include <linux/mutex.h>  #include <net/neighbour.h> +#include <net/sch_generic.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  #include <rdma/ib_verbs.h>  #include <rdma/ib_pack.h>  #include <rdma/ib_sa.h> -#include <linux/inet_lro.h> +#include <linux/sched.h>  /* constants */ @@ -91,7 +92,8 @@ enum {  	IPOIB_STOP_REAPER	  = 7,  	IPOIB_FLAG_ADMIN_CM	  = 9,  	IPOIB_FLAG_UMCAST	  = 10, -	IPOIB_FLAG_CSUM		  = 11, +	IPOIB_STOP_NEIGH_GC	  = 11, +	IPOIB_NEIGH_TBL_FLUSH	  = 12,  	IPOIB_MAX_BACKOFF_SECONDS = 16, @@ -99,12 +101,14 @@ enum {  	IPOIB_MCAST_FLAG_SENDONLY = 1,  	IPOIB_MCAST_FLAG_BUSY	  = 2,	/* joining or already joined */  	IPOIB_MCAST_FLAG_ATTACHED = 3, - -	IPOIB_MAX_LRO_DESCRIPTORS = 8, -	IPOIB_LRO_MAX_AGGR 	  = 64, +	IPOIB_MCAST_JOIN_STARTED  = 4,  	MAX_SEND_CQE		  = 16,  	IPOIB_CM_COPYBREAK	  = 256, + +	IPOIB_NON_CHILD		  = 0, +	IPOIB_LEGACY_CHILD	  = 1, +	IPOIB_RTNL_CHILD	  = 2,  };  #define	IPOIB_OP_RECV   (1ul << 31) @@ -114,6 +118,8 @@ enum {  #define	IPOIB_OP_CM     (0)  #endif +#define IPOIB_QPN_MASK ((__force u32) cpu_to_be32(0xFFFFFF)) +  /* structs */  struct ipoib_header { @@ -121,8 +127,9 @@ struct ipoib_header {  	u16	reserved;  }; -struct ipoib_pseudoheader { -	u8  hwaddr[INFINIBAND_ALEN]; +struct ipoib_cb { +	struct qdisc_skb_cb	qdisc_cb; +	u8			hwaddr[INFINIBAND_ALEN];  };  /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ @@ -145,6 +152,7 @@ struct ipoib_mcast {  	struct sk_buff_head pkt_queue;  	struct net_device *dev; +	struct completion done;  };  struct ipoib_rx_buf { @@ -262,9 +270,21 @@ struct 
ipoib_ethtool_st {  	u16     max_coalesced_frames;  }; -struct ipoib_lro { -	struct net_lro_mgr lro_mgr; -	struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS]; +struct ipoib_neigh_table; + +struct ipoib_neigh_hash { +	struct ipoib_neigh_table       *ntbl; +	struct ipoib_neigh __rcu      **buckets; +	struct rcu_head			rcu; +	u32				mask; +	u32				size; +}; + +struct ipoib_neigh_table { +	struct ipoib_neigh_hash __rcu  *htbl; +	atomic_t			entries; +	struct completion		flushed; +	struct completion		deleted;  };  /* @@ -281,11 +301,13 @@ struct ipoib_dev_priv {  	unsigned long flags; -	struct mutex vlan_mutex; +	struct rw_semaphore vlan_rwsem;  	struct rb_root  path_tree;  	struct list_head path_list; +	struct ipoib_neigh_table ntbl; +  	struct ipoib_mcast *broadcast;  	struct list_head multicast_list;  	struct rb_root multicast_tree; @@ -298,7 +320,7 @@ struct ipoib_dev_priv {  	struct work_struct flush_heavy;  	struct work_struct restart_task;  	struct delayed_work ah_reap_task; - +	struct delayed_work neigh_reap_task;  	struct ib_device *ca;  	u8		  port;  	u16		  pkey; @@ -339,6 +361,7 @@ struct ipoib_dev_priv {  	struct net_device *parent;  	struct list_head child_intfs;  	struct list_head list; +	int    child_type;  #ifdef CONFIG_INFINIBAND_IPOIB_CM  	struct ipoib_cm_dev_priv cm; @@ -352,8 +375,6 @@ struct ipoib_dev_priv {  	int	hca_caps;  	struct ipoib_ethtool_st ethtool;  	struct timer_list poll_timer; - -	struct ipoib_lro lro;  };  struct ipoib_ah { @@ -386,13 +407,16 @@ struct ipoib_neigh {  #ifdef CONFIG_INFINIBAND_IPOIB_CM  	struct ipoib_cm_tx *cm;  #endif -	union ib_gid	    dgid; +	u8     daddr[INFINIBAND_ALEN];  	struct sk_buff_head queue; -	struct neighbour   *neighbour;  	struct net_device *dev;  	struct list_head    list; +	struct ipoib_neigh __rcu *hnext; +	struct rcu_head     rcu; +	atomic_t	    refcnt; +	unsigned long       alive;  };  #define IPOIB_UD_MTU(ib_mtu)		(ib_mtu - IPOIB_ENCAP_LEN) @@ -403,21 +427,17 @@ static inline int ipoib_ud_need_sg(unsigned int ib_mtu)  	return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;  } -/* - * We stash a pointer to our private neighbour information after our - * hardware address in neigh->ha.  The ALIGN() expression here makes - * sure that this pointer is stored aligned so that an unaligned - * load is not needed to dereference it. 
- */ -static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh) +void ipoib_neigh_dtor(struct ipoib_neigh *neigh); +static inline void ipoib_neigh_put(struct ipoib_neigh *neigh)  { -	return (void*) neigh + ALIGN(offsetof(struct neighbour, ha) + -				     INFINIBAND_ALEN, sizeof(void *)); +	if (atomic_dec_and_test(&neigh->refcnt)) +		ipoib_neigh_dtor(neigh);  } - -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh, +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr); +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,  				      struct net_device *dev); -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh); +void ipoib_neigh_free(struct ipoib_neigh *neigh); +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid);  extern struct workqueue_struct *ipoib_workqueue; @@ -434,7 +454,6 @@ static inline void ipoib_put_ah(struct ipoib_ah *ah)  {  	kref_put(&ah->ref, ipoib_free_ah);  } -  int ipoib_open(struct net_device *dev);  int ipoib_add_pkey_attr(struct net_device *dev);  int ipoib_add_umcast_attr(struct net_device *dev); @@ -464,7 +483,7 @@ void ipoib_dev_cleanup(struct net_device *dev);  void ipoib_mcast_join_task(struct work_struct *work);  void ipoib_mcast_carrier_on_task(struct work_struct *work); -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb); +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);  void ipoib_mcast_restart_task(struct work_struct *work);  int ipoib_mcast_start_thread(struct net_device *dev); @@ -502,6 +521,17 @@ void ipoib_event(struct ib_event_handler *handler,  int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey);  int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, +		     u16 pkey, int child_type); + +int  __init ipoib_netlink_init(void); +void __exit ipoib_netlink_fini(void); + +void ipoib_set_umcast(struct net_device *ndev, int umcast_val); +int  ipoib_set_mode(struct net_device *dev, const char *buf); + +void ipoib_setup(struct net_device *dev); +  void ipoib_pkey_poll(struct work_struct *work);  int ipoib_pkey_dev_delay_open(struct net_device *dev);  void ipoib_drain_cq(struct net_device *dev); @@ -509,14 +539,14 @@ void ipoib_drain_cq(struct net_device *dev);  void ipoib_set_ethtool_ops(struct net_device *dev);  int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); -#ifdef CONFIG_INFINIBAND_IPOIB_CM -  #define IPOIB_FLAGS_RC		0x80  #define IPOIB_FLAGS_UC		0x40  /* We don't support UC connections at the moment */  #define IPOIB_CM_SUPPORTED(ha)   (ha[0] & (IPOIB_FLAGS_RC)) +#ifdef CONFIG_INFINIBAND_IPOIB_CM +  extern int ipoib_max_conn_qp;  static inline int ipoib_cm_admin_enabled(struct net_device *dev) @@ -526,10 +556,10 @@ static inline int ipoib_cm_admin_enabled(struct net_device *dev)  		test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);  } -static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr)  {  	struct ipoib_dev_priv *priv = netdev_priv(dev); -	return IPOIB_CM_SUPPORTED(n->ha) && +	return IPOIB_CM_SUPPORTED(hwaddr) &&  		test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);  } @@ -584,7 +614,7 @@ static inline int ipoib_cm_admin_enabled(struct net_device *dev)  {  	return 0;  } -static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) +static inline int ipoib_cm_enabled(struct net_device 
*dev, u8 *hwaddr)  {  	return 0; @@ -734,4 +764,6 @@ extern int ipoib_debug_level;  #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) +extern const char ipoib_driver_version[]; +  #endif /* _IPOIB_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index bb1004114de..933efcea0d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -37,6 +37,7 @@  #include <linux/delay.h>  #include <linux/slab.h>  #include <linux/vmalloc.h> +#include <linux/moduleparam.h>  #include "ipoib.h" @@ -84,7 +85,7 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,  	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);  	for (i = 0; i < frags; ++i) -		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); +		ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);  }  static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) @@ -139,7 +140,8 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,  static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,  					     struct ipoib_cm_rx_buf *rx_ring,  					     int id, int frags, -					     u64 mapping[IPOIB_CM_RX_SG]) +					     u64 mapping[IPOIB_CM_RX_SG], +					     gfp_t gfp)  {  	struct ipoib_dev_priv *priv = netdev_priv(dev);  	struct sk_buff *skb; @@ -163,13 +165,13 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,  	}  	for (i = 0; i < frags; i++) { -		struct page *page = alloc_page(GFP_ATOMIC); +		struct page *page = alloc_page(gfp);  		if (!page)  			goto partial_error;  		skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE); -		mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page, +		mapping[i + 1] = ib_dma_map_page(priv->ca, page,  						 0, PAGE_SIZE, DMA_FROM_DEVICE);  		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))  			goto partial_error; @@ -183,7 +185,7 @@ partial_error:  	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);  	for (; i > 0; --i) -		ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); +		ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);  	dev_kfree_skb_any(skb);  	return NULL; @@ -352,15 +354,13 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i  	int ret;  	int i; -	rx->rx_ring = vmalloc(ipoib_recvq_size * sizeof *rx->rx_ring); +	rx->rx_ring = vzalloc(ipoib_recvq_size * sizeof *rx->rx_ring);  	if (!rx->rx_ring) {  		printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n",  		       priv->ca->name, ipoib_recvq_size);  		return -ENOMEM;  	} -	memset(rx->rx_ring, 0, ipoib_recvq_size * sizeof *rx->rx_ring); -  	t = kmalloc(sizeof *t, GFP_KERNEL);  	if (!t) {  		ret = -ENOMEM; @@ -383,7 +383,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i  	for (i = 0; i < ipoib_recvq_size; ++i) {  		if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, -					   rx->rx_ring[i].mapping)) { +					   rx->rx_ring[i].mapping, +					   GFP_KERNEL)) {  			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);  				ret = -ENOMEM;  				goto err_count; @@ -461,7 +462,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even  		goto err_qp;  	} -	psn = random32() & 0xffffff; +	psn = prandom_u32() & 0xffffff;  	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);  	if (ret)  		goto 
err_modify; @@ -539,12 +540,13 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,  		if (length == 0) {  			/* don't need this page */ -			skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE); +			skb_fill_page_desc(toskb, i, skb_frag_page(frag), +					   0, PAGE_SIZE);  			--skb_shinfo(skb)->nr_frags;  		} else {  			size = min(length, (unsigned) PAGE_SIZE); -			frag->size = size; +			skb_frag_size_set(frag, size);  			skb->data_len += size;  			skb->truesize += size;  			skb->len += size; @@ -639,7 +641,8 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)  	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,  					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; -	newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping); +	newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, +				       mapping, GFP_ATOMIC);  	if (unlikely(!newskb)) {  		/*  		 * If we can't allocate a new RX buffer, dump @@ -741,6 +744,9 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_  	tx_req->mapping = addr; +	skb_orphan(skb); +	skb_dst_drop(skb); +  	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),  		       addr, skb->len);  	if (unlikely(rc)) { @@ -755,9 +761,13 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_  		if (++priv->tx_outstanding == ipoib_sendq_size) {  			ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",  				  tx->qp->qp_num); -			if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) -				ipoib_warn(priv, "request notify on send CQ failed\n");  			netif_stop_queue(dev); +			rc = ib_req_notify_cq(priv->send_cq, +				IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); +			if (rc < 0) +				ipoib_warn(priv, "request notify on send CQ failed\n"); +			else if (rc) +				ipoib_send_comp_handler(priv->send_cq, dev);  		}  	}  } @@ -810,10 +820,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)  		if (neigh) {  			neigh->cm = NULL; -			list_del(&neigh->list); -			if (neigh->ah) -				ipoib_put_ah(neigh->ah); -			ipoib_neigh_free(dev, neigh); +			ipoib_neigh_free(neigh);  			tx->neigh = NULL;  		} @@ -1023,10 +1030,20 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_  		.cap.max_send_sge	= 1,  		.sq_sig_type		= IB_SIGNAL_ALL_WR,  		.qp_type		= IB_QPT_RC, -		.qp_context		= tx +		.qp_context		= tx, +		.create_flags		= IB_QP_CREATE_USE_GFP_NOIO  	}; -	return ib_create_qp(priv->pd, &attr); +	struct ib_qp *tx_qp; + +	tx_qp = ib_create_qp(priv->pd, &attr); +	if (PTR_ERR(tx_qp) == -EINVAL) { +		ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n", +			   priv->ca->name); +		attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; +		tx_qp = ib_create_qp(priv->pd, &attr); +	} +	return tx_qp;  }  static int ipoib_cm_send_req(struct net_device *dev, @@ -1097,7 +1114,8 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,  	struct ipoib_dev_priv *priv = netdev_priv(p->dev);  	int ret; -	p->tx_ring = vmalloc(ipoib_sendq_size * sizeof *p->tx_ring); +	p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring, +			       GFP_NOIO, PAGE_KERNEL);  	if (!p->tx_ring) {  		ipoib_warn(priv, "failed to allocate tx ring\n");  		ret = -ENOMEM; @@ -1230,10 +1248,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,  		if (neigh) {  			neigh->cm = NULL; -			list_del(&neigh->list); -			if (neigh->ah) -				ipoib_put_ah(neigh->ah); -			ipoib_neigh_free(dev, neigh); +			ipoib_neigh_free(neigh);  	
		tx->neigh = NULL;  		} @@ -1276,12 +1291,15 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path  void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)  {  	struct ipoib_dev_priv *priv = netdev_priv(tx->dev); +	unsigned long flags;  	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { +		spin_lock_irqsave(&priv->lock, flags);  		list_move(&tx->list, &priv->cm.reap_list);  		queue_work(ipoib_workqueue, &priv->cm.reap_task);  		ipoib_dbg(priv, "Reap connection for gid %pI6\n", -			  tx->neigh->dgid.raw); +			  tx->neigh->daddr + 4);  		tx->neigh = NULL; +		spin_unlock_irqrestore(&priv->lock, flags);  	}  } @@ -1305,7 +1323,7 @@ static void ipoib_cm_tx_start(struct work_struct *work)  		p = list_entry(priv->cm.start_list.next, typeof(*p), list);  		list_del_init(&p->list);  		neigh = p->neigh; -		qpn = IPOIB_QPN(neigh->neighbour->ha); +		qpn = IPOIB_QPN(neigh->daddr);  		memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);  		spin_unlock_irqrestore(&priv->lock, flags); @@ -1320,10 +1338,7 @@ static void ipoib_cm_tx_start(struct work_struct *work)  			neigh = p->neigh;  			if (neigh) {  				neigh->cm = NULL; -				list_del(&neigh->list); -				if (neigh->ah) -					ipoib_put_ah(neigh->ah); -				ipoib_neigh_free(dev, neigh); +				ipoib_neigh_free(neigh);  			}  			list_del(&p->list);  			kfree(p); @@ -1377,7 +1392,7 @@ static void ipoib_cm_skb_reap(struct work_struct *work)  		if (skb->protocol == htons(ETH_P_IP))  			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  		else if (skb->protocol == htons(ETH_P_IPV6))  			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);  #endif @@ -1398,7 +1413,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,  	int e = skb_queue_empty(&priv->cm.skb_queue);  	if (skb_dst(skb)) -		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); +		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);  	skb_queue_tail(&priv->cm.skb_queue, skb);  	if (e) @@ -1456,42 +1471,19 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,  			const char *buf, size_t count)  {  	struct net_device *dev = to_net_dev(d); -	struct ipoib_dev_priv *priv = netdev_priv(dev); +	int ret;  	if (!rtnl_trylock())  		return restart_syscall(); -	/* flush paths if we switch modes so that connections are restarted */ -	if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { -		set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); -		ipoib_warn(priv, "enabling connected mode " -			   "will cause multicast packet drops\n"); - -		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO); -		rtnl_unlock(); -		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; - -		ipoib_flush_paths(dev); -		return count; -	} - -	if (!strcmp(buf, "datagram\n")) { -		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +	ret = ipoib_set_mode(dev, buf); -		if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) { -			dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG; -			if (priv->hca_caps & IB_DEVICE_UD_TSO) -				dev->features |= NETIF_F_TSO; -		} -		dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); -		rtnl_unlock(); -		ipoib_flush_paths(dev); +	rtnl_unlock(); +	if (!ret)  		return count; -	} -	rtnl_unlock(); -	return -EINVAL; +	return ret;  }  static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode); @@ -1505,6 +1497,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)  {  	struct ipoib_dev_priv *priv = netdev_priv(dev);  	struct 
ib_srq_init_attr srq_init_attr = { +		.srq_type = IB_SRQT_BASIC,  		.attr = {  			.max_wr  = ipoib_recvq_size,  			.max_sge = max_sge @@ -1520,7 +1513,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)  		return;  	} -	priv->cm.srq_ring = vmalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring); +	priv->cm.srq_ring = vzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring);  	if (!priv->cm.srq_ring) {  		printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",  		       priv->ca->name, ipoib_recvq_size); @@ -1529,7 +1522,6 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)  		return;  	} -	memset(priv->cm.srq_ring, 0, ipoib_recvq_size * sizeof *priv->cm.srq_ring);  }  int ipoib_cm_dev_init(struct net_device *dev) @@ -1579,7 +1571,8 @@ int ipoib_cm_dev_init(struct net_device *dev)  		for (i = 0; i < ipoib_recvq_size; ++i) {  			if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,  						   priv->cm.num_frags - 1, -						   priv->cm.srq_ring[i].mapping)) { +						   priv->cm.srq_ring[i].mapping, +						   GFP_KERNEL)) {  				ipoib_warn(priv, "failed to allocate "  					   "receive buffer %d\n", i);  				ipoib_cm_dev_cleanup(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 1a1657c82ed..078cadd6c79 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -39,33 +39,24 @@  static void ipoib_get_drvinfo(struct net_device *netdev,  			      struct ethtool_drvinfo *drvinfo)  { -	strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1); -} +	struct ipoib_dev_priv *priv = netdev_priv(netdev); +	struct ib_device_attr *attr; -static u32 ipoib_get_rx_csum(struct net_device *dev) -{ -	struct ipoib_dev_priv *priv = netdev_priv(dev); -	return test_bit(IPOIB_FLAG_CSUM, &priv->flags) && -		!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); -} +	attr = kmalloc(sizeof(*attr), GFP_KERNEL); +	if (attr && !ib_query_device(priv->ca, attr)) +		snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), +			 "%d.%d.%d", (int)(attr->fw_ver >> 32), +			 (int)(attr->fw_ver >> 16) & 0xffff, +			 (int)attr->fw_ver & 0xffff); +	kfree(attr); -static int ipoib_set_tso(struct net_device *dev, u32 data) -{ -	struct ipoib_dev_priv *priv = netdev_priv(dev); +	strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device), +		sizeof(drvinfo->bus_info)); -	if (data) { -		if (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && -		    (dev->features & NETIF_F_SG) && -		    (priv->hca_caps & IB_DEVICE_UD_TSO)) { -			dev->features |= NETIF_F_TSO; -		} else { -			ipoib_warn(priv, "can't set TSO on\n"); -			return -EOPNOTSUPP; -		} -	} else -		dev->features &= ~NETIF_F_TSO; +	strlcpy(drvinfo->version, ipoib_driver_version, +		sizeof(drvinfo->version)); -	return 0; +	strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver));  }  static int ipoib_get_coalesce(struct net_device *dev, @@ -106,66 +97,13 @@ static int ipoib_set_coalesce(struct net_device *dev,  	return 0;  } -static const char ipoib_stats_keys[][ETH_GSTRING_LEN] = { -	"LRO aggregated", "LRO flushed", -	"LRO avg aggr", "LRO no desc" -}; - -static void ipoib_get_strings(struct net_device *netdev, u32 stringset, u8 *data) -{ -	switch (stringset) { -	case ETH_SS_STATS: -		memcpy(data, *ipoib_stats_keys,	sizeof(ipoib_stats_keys)); -		break; -	} -} - -static int ipoib_get_sset_count(struct net_device *dev, int sset) -{ -	switch (sset) { -	case ETH_SS_STATS: -		return ARRAY_SIZE(ipoib_stats_keys); -	default: -		
return -EOPNOTSUPP; -	} -} - -static void ipoib_get_ethtool_stats(struct net_device *dev, -				struct ethtool_stats *stats, uint64_t *data) -{ -	struct ipoib_dev_priv *priv = netdev_priv(dev); -	int index = 0; - -	/* Get LRO statistics */ -	data[index++] = priv->lro.lro_mgr.stats.aggregated; -	data[index++] = priv->lro.lro_mgr.stats.flushed; -	if (priv->lro.lro_mgr.stats.flushed) -		data[index++] = priv->lro.lro_mgr.stats.aggregated / -				priv->lro.lro_mgr.stats.flushed; -	else -		data[index++] = 0; -	data[index++] = priv->lro.lro_mgr.stats.no_desc; -} - -static int ipoib_set_flags(struct net_device *dev, u32 flags) -{ -	return ethtool_op_set_flags(dev, flags, ETH_FLAG_LRO); -} -  static const struct ethtool_ops ipoib_ethtool_ops = {  	.get_drvinfo		= ipoib_get_drvinfo, -	.get_rx_csum		= ipoib_get_rx_csum, -	.set_tso		= ipoib_set_tso,  	.get_coalesce		= ipoib_get_coalesce,  	.set_coalesce		= ipoib_set_coalesce, -	.get_flags		= ethtool_op_get_flags, -	.set_flags		= ipoib_set_flags, -	.get_strings		= ipoib_get_strings, -	.get_sset_count		= ipoib_get_sset_count, -	.get_ethtool_stats	= ipoib_get_ethtool_stats,  };  void ipoib_set_ethtool_ops(struct net_device *dev)  { -	SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops); +	dev->ethtool_ops = &ipoib_ethtool_ops;  } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index 86eae229dc4..50061854616 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -37,6 +37,7 @@  struct file_operations;  #include <linux/debugfs.h> +#include <linux/export.h>  #include "ipoib.h" @@ -212,16 +213,15 @@ static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr)  		   gid_buf, path.pathrec.dlid ? "yes" : "no");  	if (path.pathrec.dlid) { -		rate = ib_rate_to_mult(path.pathrec.rate) * 25; +		rate = ib_rate_to_mbps(path.pathrec.rate);  		seq_printf(file,  			   "  DLID:     0x%04x\n"  			   "  SL: %12d\n" -			   "  rate: %*d%s Gb/sec\n", +			   "  rate: %8d.%d Gb/sec\n",  			   be16_to_cpu(path.pathrec.dlid),  			   path.pathrec.sl, -			   10 - ((rate % 10) ? 2 : 0), -			   rate / 10, rate % 10 ? 
".5" : ""); +			   rate / 1000, rate % 1000);  	}  	seq_putc(file, '\n'); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index dfa71903d6e..6a7003ddb0b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -34,6 +34,7 @@   */  #include <linux/delay.h> +#include <linux/moduleparam.h>  #include <linux/dma-mapping.h>  #include <linux/slab.h> @@ -56,21 +57,24 @@ struct ipoib_ah *ipoib_create_ah(struct net_device *dev,  				 struct ib_pd *pd, struct ib_ah_attr *attr)  {  	struct ipoib_ah *ah; +	struct ib_ah *vah;  	ah = kmalloc(sizeof *ah, GFP_KERNEL);  	if (!ah) -		return NULL; +		return ERR_PTR(-ENOMEM);  	ah->dev       = dev;  	ah->last_send = 0;  	kref_init(&ah->ref); -	ah->ah = ib_create_ah(pd, attr); -	if (IS_ERR(ah->ah)) { +	vah = ib_create_ah(pd, attr); +	if (IS_ERR(vah)) {  		kfree(ah); -		ah = NULL; -	} else +		ah = (struct ipoib_ah *)vah; +	} else { +		ah->ah = vah;  		ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah); +	}  	return ah;  } @@ -117,9 +121,9 @@ static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,  		size = length - IPOIB_UD_HEAD_SIZE; -		frag->size     = size; +		skb_frag_size_set(frag, size);  		skb->data_len += size; -		skb->truesize += size; +		skb->truesize += PAGE_SIZE;  	} else  		skb_put(skb, length); @@ -152,14 +156,18 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)  	struct ipoib_dev_priv *priv = netdev_priv(dev);  	struct sk_buff *skb;  	int buf_size; +	int tailroom;  	u64 *mapping; -	if (ipoib_ud_need_sg(priv->max_ib_mtu)) +	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {  		buf_size = IPOIB_UD_HEAD_SIZE; -	else +		tailroom = 128; /* reserve some tailroom for IP/TCP headers */ +	} else {  		buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); +		tailroom = 0; +	} -	skb = dev_alloc_skb(buf_size + 4); +	skb = dev_alloc_skb(buf_size + tailroom + 4);  	if (unlikely(!skb))  		return NULL; @@ -182,7 +190,7 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)  			goto partial_error;  		skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);  		mapping[1] = -			ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[0].page, +			ib_dma_map_page(priv->ca, page,  					0, PAGE_SIZE, DMA_FROM_DEVICE);  		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))  			goto partial_error; @@ -292,13 +300,11 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)  	dev->stats.rx_bytes += skb->len;  	skb->dev = dev; -	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) +	if ((dev->features & NETIF_F_RXCSUM) && +			likely(wc->wc_flags & IB_WC_IP_CSUM_OK))  		skb->ip_summed = CHECKSUM_UNNECESSARY; -	if (dev->features & NETIF_F_LRO) -		lro_receive_skb(&priv->lro.lro_mgr, skb, NULL); -	else -		netif_receive_skb(skb); +	napi_gro_receive(&priv->napi, skb);  repost:  	if (unlikely(ipoib_ib_post_receive(dev, wr_id))) @@ -325,9 +331,10 @@ static int ipoib_dma_map_tx(struct ib_device *ca,  		off = 0;  	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { -		skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; -		mapping[i + off] = ib_dma_map_page(ca, frag->page, -						 frag->page_offset, frag->size, +		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; +		mapping[i + off] = ib_dma_map_page(ca, +						 skb_frag_page(frag), +						 frag->page_offset, skb_frag_size(frag),  						 DMA_TO_DEVICE);  		if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))  			goto partial_error; @@ -336,8 +343,9 @@ static int 
ipoib_dma_map_tx(struct ib_device *ca,  partial_error:  	for (; i > 0; --i) { -		skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; -		ib_dma_unmap_page(ca, mapping[i - !off], frag->size, DMA_TO_DEVICE); +		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; + +		ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE);  	}  	if (off) @@ -361,8 +369,9 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,  		off = 0;  	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { -		skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; -		ib_dma_unmap_page(ca, mapping[i + off], frag->size, +		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + +		ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag),  				  DMA_TO_DEVICE);  	}  } @@ -450,9 +459,6 @@ poll_more:  	}  	if (done < budget) { -		if (dev->features & NETIF_F_LRO) -			lro_flush_all(&priv->lro.lro_mgr); -  		napi_complete(napi);  		if (unlikely(ib_req_notify_cq(priv->recv_cq,  					      IB_CQ_NEXT_COMP | @@ -515,7 +521,7 @@ static inline int post_send(struct ipoib_dev_priv *priv,  	for (i = 0; i < nr_frags; ++i) {  		priv->tx_sge[i + off].addr = mapping[i + off]; -		priv->tx_sge[i + off].length = frags[i].size; +		priv->tx_sge[i + off].length = skb_frag_size(&frags[i]);  	}  	priv->tx_wr.num_sge	     = nr_frags + off;  	priv->tx_wr.wr_id 	     = wr_id; @@ -594,6 +600,9 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,  		netif_stop_queue(dev);  	} +	skb_orphan(skb); +	skb_dst_drop(skb); +  	rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),  		       address->ah, qpn, tx_req, phead, hlen);  	if (unlikely(rc)) { @@ -609,8 +618,6 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,  		address->last_send = priv->tx_head;  		++priv->tx_head; -		skb_orphan(skb); -  	}  	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) @@ -678,15 +685,13 @@ int ipoib_ib_dev_open(struct net_device *dev)  	ret = ipoib_ib_post_receives(dev);  	if (ret) {  		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); -		ipoib_ib_dev_stop(dev, 1); -		return -1; +		goto dev_stop;  	}  	ret = ipoib_cm_dev_open(dev);  	if (ret) {  		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); -		ipoib_ib_dev_stop(dev, 1); -		return -1; +		goto dev_stop;  	}  	clear_bit(IPOIB_STOP_REAPER, &priv->flags); @@ -697,6 +702,11 @@ int ipoib_ib_dev_open(struct net_device *dev)  		napi_enable(&priv->napi);  	return 0; +dev_stop: +	if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) +		napi_enable(&priv->napi); +	ipoib_ib_dev_stop(dev, 1); +	return -1;  }  static void ipoib_pkey_dev_check_presence(struct net_device *dev) @@ -739,10 +749,8 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush)  	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {  		mutex_lock(&pkey_mutex);  		set_bit(IPOIB_PKEY_STOP, &priv->flags); -		cancel_delayed_work(&priv->pkey_poll_task); +		cancel_delayed_work_sync(&priv->pkey_poll_task);  		mutex_unlock(&pkey_mutex); -		if (flush) -			flush_workqueue(ipoib_workqueue);  	}  	ipoib_mcast_stop_thread(dev, flush); @@ -925,14 +933,49 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)  	return 0;  } +/* + * Takes whatever value which is in pkey index 0 and updates priv->pkey + * returns 0 if the pkey value was changed. 
+ */ +static inline int update_parent_pkey(struct ipoib_dev_priv *priv) +{ +	int result; +	u16 prev_pkey; + +	prev_pkey = priv->pkey; +	result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); +	if (result) { +		ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n", +			   priv->port, result); +		return result; +	} + +	priv->pkey |= 0x8000; + +	if (prev_pkey != priv->pkey) { +		ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n", +			  prev_pkey, priv->pkey); +		/* +		 * Update the pkey in the broadcast address, while making sure to set +		 * the full membership bit, so that we join the right broadcast group. +		 */ +		priv->dev->broadcast[8] = priv->pkey >> 8; +		priv->dev->broadcast[9] = priv->pkey & 0xff; +		return 0; +	} + +	return 1; +} +  static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,  				enum ipoib_flush_level level)  {  	struct ipoib_dev_priv *cpriv;  	struct net_device *dev = priv->dev;  	u16 new_index; +	int result; -	mutex_lock(&priv->vlan_mutex); +	down_read(&priv->vlan_rwsem);  	/*  	 * Flush any child interfaces too -- they might be up even if @@ -941,9 +984,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,  	list_for_each_entry(cpriv, &priv->child_intfs, list)  		__ipoib_ib_dev_flush(cpriv, level); -	mutex_unlock(&priv->vlan_mutex); +	up_read(&priv->vlan_rwsem);  	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { +		/* for non-child devices must check/update the pkey value here */ +		if (level == IPOIB_FLUSH_HEAVY && +		    !test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) +			update_parent_pkey(priv);  		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");  		return;  	} @@ -954,21 +1001,32 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,  	}  	if (level == IPOIB_FLUSH_HEAVY) { -		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { -			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); -			ipoib_ib_dev_down(dev, 0); -			ipoib_ib_dev_stop(dev, 0); -			if (ipoib_pkey_dev_delay_open(dev)) +		/* child devices chase their origin pkey value, while non-child +		 * (parent) devices should always takes what present in pkey index 0 +		 */ +		if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { +			if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { +				clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); +				ipoib_ib_dev_down(dev, 0); +				ipoib_ib_dev_stop(dev, 0); +				if (ipoib_pkey_dev_delay_open(dev)) +					return; +			} +			/* restart QP only if P_Key index is changed */ +			if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && +			    new_index == priv->pkey_index) { +				ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");  				return; +			} +			priv->pkey_index = new_index; +		} else { +			result = update_parent_pkey(priv); +			/* restart QP only if P_Key value changed */ +			if (result) { +				ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n"); +				return; +			}  		} - -		/* restart QP only if P_Key index is changed */ -		if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && -		    new_index == priv->pkey_index) { -			ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); -			return; -		} -		priv->pkey_index = new_index;  	}  	if (level == IPOIB_FLUSH_LIGHT) { @@ -1024,6 +1082,11 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)  	struct ipoib_dev_priv *priv = netdev_priv(dev);  	ipoib_dbg(priv, "cleaning up ib_dev\n"); +	/* +	 * We must make sure there are no more (path) completions +	 * that may wish to touch priv fields that are no 
longer valid +	 */ +	ipoib_flush_paths(dev);  	ipoib_mcast_stop_thread(dev, 1);  	ipoib_mcast_dev_flush(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 9ff7bc73ed9..5786a78ff8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -46,11 +46,17 @@  #include <linux/ip.h>  #include <linux/in.h> -#include <net/dst.h> +#include <linux/jhash.h> +#include <net/arp.h> + +#define DRV_VERSION "1.0.0" + +const char ipoib_driver_version[] = DRV_VERSION;  MODULE_AUTHOR("Roland Dreier");  MODULE_DESCRIPTION("IP-over-InfiniBand net driver");  MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION);  int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;  int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; @@ -60,15 +66,6 @@ MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");  module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);  MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); -static int lro; -module_param(lro, bool, 0444); -MODULE_PARM_DESC(lro,  "Enable LRO (Large Receive Offload)"); - -static int lro_max_aggr = IPOIB_LRO_MAX_AGGR; -module_param(lro_max_aggr, int, 0644); -MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated " -		"(default = 64)"); -  #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG  int ipoib_debug_level; @@ -93,6 +90,7 @@ struct ib_sa_client ipoib_sa_client;  static void ipoib_add_one(struct ib_device *device);  static void ipoib_remove_one(struct ib_device *device); +static void ipoib_neigh_reclaim(struct rcu_head *rp);  static struct ib_client ipoib_client = {  	.name   = "ipoib", @@ -106,6 +104,8 @@ int ipoib_open(struct net_device *dev)  	ipoib_dbg(priv, "bringing up interface\n"); +	netif_carrier_off(dev); +  	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);  	if (ipoib_pkey_dev_delay_open(dev)) @@ -121,7 +121,7 @@ int ipoib_open(struct net_device *dev)  		struct ipoib_dev_priv *cpriv;  		/* Bring up any child interfaces too */ -		mutex_lock(&priv->vlan_mutex); +		down_read(&priv->vlan_rwsem);  		list_for_each_entry(cpriv, &priv->child_intfs, list) {  			int flags; @@ -131,7 +131,7 @@ int ipoib_open(struct net_device *dev)  			dev_change_flags(cpriv->dev, flags | IFF_UP);  		} -		mutex_unlock(&priv->vlan_mutex); +		up_read(&priv->vlan_rwsem);  	}  	netif_start_queue(dev); @@ -157,14 +157,14 @@ static int ipoib_stop(struct net_device *dev)  	netif_stop_queue(dev); -	ipoib_ib_dev_down(dev, 0); +	ipoib_ib_dev_down(dev, 1);  	ipoib_ib_dev_stop(dev, 0);  	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {  		struct ipoib_dev_priv *cpriv;  		/* Bring down any child interfaces too */ -		mutex_lock(&priv->vlan_mutex); +		down_read(&priv->vlan_rwsem);  		list_for_each_entry(cpriv, &priv->child_intfs, list) {  			int flags; @@ -174,12 +174,27 @@ static int ipoib_stop(struct net_device *dev)  			dev_change_flags(cpriv->dev, flags & ~IFF_UP);  		} -		mutex_unlock(&priv->vlan_mutex); +		up_read(&priv->vlan_rwsem);  	}  	return 0;  } +static void ipoib_uninit(struct net_device *dev) +{ +	ipoib_dev_cleanup(dev); +} + +static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) +{ +	struct ipoib_dev_priv *priv = netdev_priv(dev); + +	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) +		features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO); + +	return features; +} +  static int ipoib_change_mtu(struct net_device *dev, int new_mtu)  {  	struct ipoib_dev_priv *priv = 
netdev_priv(dev); @@ -207,6 +222,37 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)  	return 0;  } +int ipoib_set_mode(struct net_device *dev, const char *buf) +{ +	struct ipoib_dev_priv *priv = netdev_priv(dev); + +	/* flush paths if we switch modes so that connections are restarted */ +	if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { +		set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +		ipoib_warn(priv, "enabling connected mode " +			   "will cause multicast packet drops\n"); +		netdev_update_features(dev); +		rtnl_unlock(); +		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + +		ipoib_flush_paths(dev); +		rtnl_lock(); +		return 0; +	} + +	if (!strcmp(buf, "datagram\n")) { +		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +		netdev_update_features(dev); +		dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); +		rtnl_unlock(); +		ipoib_flush_paths(dev); +		rtnl_lock(); +		return 0; +	} + +	return -EINVAL; +} +  static struct ipoib_path *__path_find(struct net_device *dev, void *gid)  {  	struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -263,30 +309,15 @@ static int __path_add(struct net_device *dev, struct ipoib_path *path)  static void path_free(struct net_device *dev, struct ipoib_path *path)  { -	struct ipoib_dev_priv *priv = netdev_priv(dev); -	struct ipoib_neigh *neigh, *tn;  	struct sk_buff *skb; -	unsigned long flags;  	while ((skb = __skb_dequeue(&path->queue)))  		dev_kfree_skb_irq(skb); -	spin_lock_irqsave(&priv->lock, flags); - -	list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { -		/* -		 * It's safe to call ipoib_put_ah() inside priv->lock -		 * here, because we know that path->ah will always -		 * hold one more reference, so ipoib_put_ah() will -		 * never do more than decrement the ref count. 
-		 */ -		if (neigh->ah) -			ipoib_put_ah(neigh->ah); - -		ipoib_neigh_free(dev, neigh); -	} +	ipoib_dbg(netdev_priv(dev), "path_free\n"); -	spin_unlock_irqrestore(&priv->lock, flags); +	/* remove all neigh connected to this path */ +	ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);  	if (path->ah)  		ipoib_put_ah(path->ah); @@ -431,7 +462,7 @@ static void path_rec_completion(int status,  	spin_lock_irqsave(&priv->lock, flags); -	if (ah) { +	if (!IS_ERR_OR_NULL(ah)) {  		path->pathrec = *pathrec;  		old_ah   = path->ah; @@ -457,19 +488,14 @@ static void path_rec_completion(int status,  			}  			kref_get(&path->ah->ref);  			neigh->ah = path->ah; -			memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, -			       sizeof(union ib_gid)); -			if (ipoib_cm_enabled(dev, neigh->neighbour)) { +			if (ipoib_cm_enabled(dev, neigh->daddr)) {  				if (!ipoib_cm_get(neigh))  					ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,  									       path,  									       neigh));  				if (!ipoib_cm_get(neigh)) { -					list_del(&neigh->list); -					if (neigh->ah) -						ipoib_put_ah(neigh->ah); -					ipoib_neigh_free(dev, neigh); +					ipoib_neigh_free(neigh);  					continue;  				}  			} @@ -485,6 +511,9 @@ static void path_rec_completion(int status,  	spin_unlock_irqrestore(&priv->lock, flags); +	if (IS_ERR_OR_NULL(ah)) +		ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); +  	if (old_ah)  		ipoib_put_ah(old_ah); @@ -554,25 +583,26 @@ static int path_rec_start(struct net_device *dev,  	return 0;  } -static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) +static void neigh_add_path(struct sk_buff *skb, u8 *daddr, +			   struct net_device *dev)  {  	struct ipoib_dev_priv *priv = netdev_priv(dev);  	struct ipoib_path *path;  	struct ipoib_neigh *neigh;  	unsigned long flags; -	neigh = ipoib_neigh_alloc(skb_dst(skb)->neighbour, skb->dev); +	spin_lock_irqsave(&priv->lock, flags); +	neigh = ipoib_neigh_alloc(daddr, dev);  	if (!neigh) { +		spin_unlock_irqrestore(&priv->lock, flags);  		++dev->stats.tx_dropped;  		dev_kfree_skb_any(skb);  		return;  	} -	spin_lock_irqsave(&priv->lock, flags); - -	path = __path_find(dev, skb_dst(skb)->neighbour->ha + 4); +	path = __path_find(dev, daddr + 4);  	if (!path) { -		path = path_rec_create(dev, skb_dst(skb)->neighbour->ha + 4); +		path = path_rec_create(dev, daddr + 4);  		if (!path)  			goto err_path; @@ -584,17 +614,12 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)  	if (path->ah) {  		kref_get(&path->ah->ref);  		neigh->ah = path->ah; -		memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, -		       sizeof(union ib_gid)); -		if (ipoib_cm_enabled(dev, neigh->neighbour)) { +		if (ipoib_cm_enabled(dev, neigh->daddr)) {  			if (!ipoib_cm_get(neigh))  				ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));  			if (!ipoib_cm_get(neigh)) { -				list_del(&neigh->list); -				if (neigh->ah) -					ipoib_put_ah(neigh->ah); -				ipoib_neigh_free(dev, neigh); +				ipoib_neigh_free(neigh);  				goto err_drop;  			}  			if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) @@ -606,51 +631,35 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)  			}  		} else {  			spin_unlock_irqrestore(&priv->lock, flags); -			ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb_dst(skb)->neighbour->ha)); +			ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr)); +			ipoib_neigh_put(neigh);  			return;  		}  	} else {  		neigh->ah  = NULL;  		if (!path->query && path_rec_start(dev, path)) -			goto err_list; +			goto err_path;  		
__skb_queue_tail(&neigh->queue, skb);  	}  	spin_unlock_irqrestore(&priv->lock, flags); +	ipoib_neigh_put(neigh);  	return; -err_list: -	list_del(&neigh->list); -  err_path: -	ipoib_neigh_free(dev, neigh); +	ipoib_neigh_free(neigh);  err_drop:  	++dev->stats.tx_dropped;  	dev_kfree_skb_any(skb);  	spin_unlock_irqrestore(&priv->lock, flags); -} - -static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) -{ -	struct ipoib_dev_priv *priv = netdev_priv(skb->dev); - -	/* Look up path record for unicasts */ -	if (skb_dst(skb)->neighbour->ha[4] != 0xff) { -		neigh_add_path(skb, dev); -		return; -	} - -	/* Add in the P_Key for multicasts */ -	skb_dst(skb)->neighbour->ha[8] = (priv->pkey >> 8) & 0xff; -	skb_dst(skb)->neighbour->ha[9] = priv->pkey & 0xff; -	ipoib_mcast_send(dev, skb_dst(skb)->neighbour->ha + 4, skb); +	ipoib_neigh_put(neigh);  }  static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, -			     struct ipoib_pseudoheader *phdr) +			     struct ipoib_cb *cb)  {  	struct ipoib_dev_priv *priv = netdev_priv(dev);  	struct ipoib_path *path; @@ -658,17 +667,15 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,  	spin_lock_irqsave(&priv->lock, flags); -	path = __path_find(dev, phdr->hwaddr + 4); +	path = __path_find(dev, cb->hwaddr + 4);  	if (!path || !path->valid) {  		int new_path = 0;  		if (!path) { -			path = path_rec_create(dev, phdr->hwaddr + 4); +			path = path_rec_create(dev, cb->hwaddr + 4);  			new_path = 1;  		}  		if (path) { -			/* put pseudoheader back on for next time */ -			skb_push(skb, sizeof *phdr);  			__skb_queue_tail(&path->queue, skb);  			if (!path->query && path_rec_start(dev, path)) { @@ -692,12 +699,10 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,  			  be16_to_cpu(path->pathrec.dlid));  		spin_unlock_irqrestore(&priv->lock, flags); -		ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr)); +		ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));  		return;  	} else if ((path->query || !path_rec_start(dev, path)) &&  		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { -		/* put pseudoheader back on for next time */ -		skb_push(skb, sizeof *phdr);  		__skb_queue_tail(&path->queue, skb);  	} else {  		++dev->stats.tx_dropped; @@ -711,85 +716,82 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)  {  	struct ipoib_dev_priv *priv = netdev_priv(dev);  	struct ipoib_neigh *neigh; +	struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; +	struct ipoib_header *header;  	unsigned long flags; -	if (likely(skb_dst(skb) && skb_dst(skb)->neighbour)) { -		if (unlikely(!*to_ipoib_neigh(skb_dst(skb)->neighbour))) { -			ipoib_path_lookup(skb, dev); -			return NETDEV_TX_OK; -		} +	header = (struct ipoib_header *) skb->data; -		neigh = *to_ipoib_neigh(skb_dst(skb)->neighbour); - -		if (unlikely((memcmp(&neigh->dgid.raw, -				     skb_dst(skb)->neighbour->ha + 4, -				     sizeof(union ib_gid))) || -			     (neigh->dev != dev))) { -			spin_lock_irqsave(&priv->lock, flags); -			/* -			 * It's safe to call ipoib_put_ah() inside -			 * priv->lock here, because we know that -			 * path->ah will always hold one more reference, -			 * so ipoib_put_ah() will never do more than -			 * decrement the ref count. 
-			 */ -			if (neigh->ah) -				ipoib_put_ah(neigh->ah); -			list_del(&neigh->list); -			ipoib_neigh_free(dev, neigh); -			spin_unlock_irqrestore(&priv->lock, flags); -			ipoib_path_lookup(skb, dev); +	if (unlikely(cb->hwaddr[4] == 0xff)) { +		/* multicast, arrange "if" according to probability */ +		if ((header->proto != htons(ETH_P_IP)) && +		    (header->proto != htons(ETH_P_IPV6)) && +		    (header->proto != htons(ETH_P_ARP)) && +		    (header->proto != htons(ETH_P_RARP)) && +		    (header->proto != htons(ETH_P_TIPC))) { +			/* ethertype not supported by IPoIB */ +			++dev->stats.tx_dropped; +			dev_kfree_skb_any(skb);  			return NETDEV_TX_OK;  		} +		/* Add in the P_Key for multicast*/ +		cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; +		cb->hwaddr[9] = priv->pkey & 0xff; + +		neigh = ipoib_neigh_get(dev, cb->hwaddr); +		if (likely(neigh)) +			goto send_using_neigh; +		ipoib_mcast_send(dev, cb->hwaddr, skb); +		return NETDEV_TX_OK; +	} -		if (ipoib_cm_get(neigh)) { -			if (ipoib_cm_up(neigh)) { -				ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); -				return NETDEV_TX_OK; -			} -		} else if (neigh->ah) { -			ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb_dst(skb)->neighbour->ha)); +	/* unicast, arrange "switch" according to probability */ +	switch (header->proto) { +	case htons(ETH_P_IP): +	case htons(ETH_P_IPV6): +	case htons(ETH_P_TIPC): +		neigh = ipoib_neigh_get(dev, cb->hwaddr); +		if (unlikely(!neigh)) { +			neigh_add_path(skb, cb->hwaddr, dev);  			return NETDEV_TX_OK;  		} +		break; +	case htons(ETH_P_ARP): +	case htons(ETH_P_RARP): +		/* for unicast ARP and RARP should always perform path find */ +		unicast_arp_send(skb, dev, cb); +		return NETDEV_TX_OK; +	default: +		/* ethertype not supported by IPoIB */ +		++dev->stats.tx_dropped; +		dev_kfree_skb_any(skb); +		return NETDEV_TX_OK; +	} -		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { -			spin_lock_irqsave(&priv->lock, flags); -			__skb_queue_tail(&neigh->queue, skb); -			spin_unlock_irqrestore(&priv->lock, flags); -		} else { -			++dev->stats.tx_dropped; -			dev_kfree_skb_any(skb); +send_using_neigh: +	/* note we now hold a ref to neigh */ +	if (ipoib_cm_get(neigh)) { +		if (ipoib_cm_up(neigh)) { +			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); +			goto unref;  		} -	} else { -		struct ipoib_pseudoheader *phdr = -			(struct ipoib_pseudoheader *) skb->data; -		skb_pull(skb, sizeof *phdr); - -		if (phdr->hwaddr[4] == 0xff) { -			/* Add in the P_Key for multicast*/ -			phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; -			phdr->hwaddr[9] = priv->pkey & 0xff; - -			ipoib_mcast_send(dev, phdr->hwaddr + 4, skb); -		} else { -			/* unicast GID -- should be ARP or RARP reply */ - -			if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) && -			    (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) { -				ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n", -					   skb_dst(skb) ? 
"neigh" : "dst", -					   be16_to_cpup((__be16 *) skb->data), -					   IPOIB_QPN(phdr->hwaddr), -					   phdr->hwaddr + 4); -				dev_kfree_skb_any(skb); -				++dev->stats.tx_dropped; -				return NETDEV_TX_OK; -			} +	} else if (neigh->ah) { +		ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr)); +		goto unref; +	} -			unicast_arp_send(skb, dev, phdr); -		} +	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { +		spin_lock_irqsave(&priv->lock, flags); +		__skb_queue_tail(&neigh->queue, skb); +		spin_unlock_irqrestore(&priv->lock, flags); +	} else { +		++dev->stats.tx_dropped; +		dev_kfree_skb_any(skb);  	} +unref: +	ipoib_neigh_put(neigh); +  	return NETDEV_TX_OK;  } @@ -811,6 +813,7 @@ static int ipoib_hard_header(struct sk_buff *skb,  			     const void *daddr, const void *saddr, unsigned len)  {  	struct ipoib_header *header; +	struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;  	header = (struct ipoib_header *) skb_push(skb, sizeof *header); @@ -818,17 +821,13 @@ static int ipoib_hard_header(struct sk_buff *skb,  	header->reserved = 0;  	/* -	 * If we don't have a neighbour structure, stuff the -	 * destination address onto the front of the skb so we can -	 * figure out where to send the packet later. +	 * we don't rely on dst_entry structure,  always stuff the +	 * destination address into skb->cb so we can figure out where +	 * to send the packet later.  	 */ -	if ((!skb_dst(skb) || !skb_dst(skb)->neighbour) && daddr) { -		struct ipoib_pseudoheader *phdr = -			(struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr); -		memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); -	} +	memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); -	return 0; +	return sizeof *header;  }  static void ipoib_set_mcast_list(struct net_device *dev) @@ -843,95 +842,443 @@ static void ipoib_set_mcast_list(struct net_device *dev)  	queue_work(ipoib_workqueue, &priv->restart_task);  } -static void ipoib_neigh_cleanup(struct neighbour *n) +static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)  { -	struct ipoib_neigh *neigh; -	struct ipoib_dev_priv *priv = netdev_priv(n->dev); +	/* +	 * Use only the address parts that contributes to spreading +	 * The subnet prefix is not used as one can not connect to +	 * same remote port (GUID) using the same remote QPN via two +	 * different subnets. 
+	 */ +	 /* qpn octets[1:4) & port GUID octets[12:20) */ +	u32 *d32 = (u32 *) daddr; +	u32 hv; + +	hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); +	return hv & htbl->mask; +} + +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) +{ +	struct ipoib_dev_priv *priv = netdev_priv(dev); +	struct ipoib_neigh_table *ntbl = &priv->ntbl; +	struct ipoib_neigh_hash *htbl; +	struct ipoib_neigh *neigh = NULL; +	u32 hash_val; + +	rcu_read_lock_bh(); + +	htbl = rcu_dereference_bh(ntbl->htbl); + +	if (!htbl) +		goto out_unlock; + +	hash_val = ipoib_addr_hash(htbl, daddr); +	for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); +	     neigh != NULL; +	     neigh = rcu_dereference_bh(neigh->hnext)) { +		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { +			/* found, take one ref on behalf of the caller */ +			if (!atomic_inc_not_zero(&neigh->refcnt)) { +				/* deleted */ +				neigh = NULL; +				goto out_unlock; +			} +			neigh->alive = jiffies; +			goto out_unlock; +		} +	} + +out_unlock: +	rcu_read_unlock_bh(); +	return neigh; +} + +static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) +{ +	struct ipoib_neigh_table *ntbl = &priv->ntbl; +	struct ipoib_neigh_hash *htbl; +	unsigned long neigh_obsolete; +	unsigned long dt;  	unsigned long flags; -	struct ipoib_ah *ah = NULL; +	int i; -	neigh = *to_ipoib_neigh(n); -	if (neigh) -		priv = netdev_priv(neigh->dev); -	else +	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))  		return; -	ipoib_dbg(priv, -		  "neigh_cleanup for %06x %pI6\n", -		  IPOIB_QPN(n->ha), -		  n->ha + 4);  	spin_lock_irqsave(&priv->lock, flags); -	if (neigh->ah) -		ah = neigh->ah; -	list_del(&neigh->list); -	ipoib_neigh_free(n->dev, neigh); +	htbl = rcu_dereference_protected(ntbl->htbl, +					 lockdep_is_held(&priv->lock)); + +	if (!htbl) +		goto out_unlock; + +	/* neigh is obsolete if it was idle for two GC periods */ +	dt = 2 * arp_tbl.gc_interval; +	neigh_obsolete = jiffies - dt; +	/* handle possible race condition */ +	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) +		goto out_unlock; + +	for (i = 0; i < htbl->size; i++) { +		struct ipoib_neigh *neigh; +		struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + +		while ((neigh = rcu_dereference_protected(*np, +							  lockdep_is_held(&priv->lock))) != NULL) { +			/* was the neigh idle for two GC periods */ +			if (time_after(neigh_obsolete, neigh->alive)) { +				rcu_assign_pointer(*np, +						   rcu_dereference_protected(neigh->hnext, +									     lockdep_is_held(&priv->lock))); +				/* remove from path/mc list */ +				list_del(&neigh->list); +				call_rcu(&neigh->rcu, ipoib_neigh_reclaim); +			} else { +				np = &neigh->hnext; +			} +		} +	} + +out_unlock:  	spin_unlock_irqrestore(&priv->lock, flags); +} -	if (ah) -		ipoib_put_ah(ah); +static void ipoib_reap_neigh(struct work_struct *work) +{ +	struct ipoib_dev_priv *priv = +		container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); + +	__ipoib_reap_neigh(priv); + +	if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) +		queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, +				   arp_tbl.gc_interval);  } -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, + +static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,  				      struct net_device *dev)  {  	struct ipoib_neigh *neigh; -	neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); +	neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);  	if (!neigh)  		return NULL; -	neigh->neighbour = neighbour;  	neigh->dev = dev; -	memset(&neigh->dgid.raw, 0, sizeof (union ib_gid)); -	
*to_ipoib_neigh(neighbour) = neigh; +	memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));  	skb_queue_head_init(&neigh->queue); +	INIT_LIST_HEAD(&neigh->list);  	ipoib_cm_set(neigh, NULL); +	/* one ref on behalf of the caller */ +	atomic_set(&neigh->refcnt, 1);  	return neigh;  } -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, +				      struct net_device *dev)  { +	struct ipoib_dev_priv *priv = netdev_priv(dev); +	struct ipoib_neigh_table *ntbl = &priv->ntbl; +	struct ipoib_neigh_hash *htbl; +	struct ipoib_neigh *neigh; +	u32 hash_val; + +	htbl = rcu_dereference_protected(ntbl->htbl, +					 lockdep_is_held(&priv->lock)); +	if (!htbl) { +		neigh = NULL; +		goto out_unlock; +	} + +	/* need to add a new neigh, but maybe some other thread succeeded? +	 * recalc hash, maybe hash resize took place so we do a search +	 */ +	hash_val = ipoib_addr_hash(htbl, daddr); +	for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], +					       lockdep_is_held(&priv->lock)); +	     neigh != NULL; +	     neigh = rcu_dereference_protected(neigh->hnext, +					       lockdep_is_held(&priv->lock))) { +		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { +			/* found, take one ref on behalf of the caller */ +			if (!atomic_inc_not_zero(&neigh->refcnt)) { +				/* deleted */ +				neigh = NULL; +				break; +			} +			neigh->alive = jiffies; +			goto out_unlock; +		} +	} + +	neigh = ipoib_neigh_ctor(daddr, dev); +	if (!neigh) +		goto out_unlock; + +	/* one ref on behalf of the hash table */ +	atomic_inc(&neigh->refcnt); +	neigh->alive = jiffies; +	/* put in hash */ +	rcu_assign_pointer(neigh->hnext, +			   rcu_dereference_protected(htbl->buckets[hash_val], +						     lockdep_is_held(&priv->lock))); +	rcu_assign_pointer(htbl->buckets[hash_val], neigh); +	atomic_inc(&ntbl->entries); + +out_unlock: + +	return neigh; +} + +void ipoib_neigh_dtor(struct ipoib_neigh *neigh) +{ +	/* neigh reference count was dropprd to zero */ +	struct net_device *dev = neigh->dev; +	struct ipoib_dev_priv *priv = netdev_priv(dev);  	struct sk_buff *skb; -	*to_ipoib_neigh(neigh->neighbour) = NULL; +	if (neigh->ah) +		ipoib_put_ah(neigh->ah);  	while ((skb = __skb_dequeue(&neigh->queue))) {  		++dev->stats.tx_dropped;  		dev_kfree_skb_any(skb);  	}  	if (ipoib_cm_get(neigh))  		ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); +	ipoib_dbg(netdev_priv(dev), +		  "neigh free for %06x %pI6\n", +		  IPOIB_QPN(neigh->daddr), +		  neigh->daddr + 4);  	kfree(neigh); +	if (atomic_dec_and_test(&priv->ntbl.entries)) { +		if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) +			complete(&priv->ntbl.flushed); +	}  } -static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) +static void ipoib_neigh_reclaim(struct rcu_head *rp)  { -	parms->neigh_cleanup = ipoib_neigh_cleanup; +	/* Called as a result of removal from hash table */ +	struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); +	/* note TX context may hold another ref */ +	ipoib_neigh_put(neigh); +} + +void ipoib_neigh_free(struct ipoib_neigh *neigh) +{ +	struct net_device *dev = neigh->dev; +	struct ipoib_dev_priv *priv = netdev_priv(dev); +	struct ipoib_neigh_table *ntbl = &priv->ntbl; +	struct ipoib_neigh_hash *htbl; +	struct ipoib_neigh __rcu **np; +	struct ipoib_neigh *n; +	u32 hash_val; + +	htbl = rcu_dereference_protected(ntbl->htbl, +					lockdep_is_held(&priv->lock)); +	if (!htbl) +		return; + +	hash_val = ipoib_addr_hash(htbl, neigh->daddr); +	np = 
&htbl->buckets[hash_val]; +	for (n = rcu_dereference_protected(*np, +					    lockdep_is_held(&priv->lock)); +	     n != NULL; +	     n = rcu_dereference_protected(*np, +					lockdep_is_held(&priv->lock))) { +		if (n == neigh) { +			/* found */ +			rcu_assign_pointer(*np, +					   rcu_dereference_protected(neigh->hnext, +								     lockdep_is_held(&priv->lock))); +			/* remove from parent list */ +			list_del(&neigh->list); +			call_rcu(&neigh->rcu, ipoib_neigh_reclaim); +			return; +		} else { +			np = &n->hnext; +		} +	} +} + +static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) +{ +	struct ipoib_neigh_table *ntbl = &priv->ntbl; +	struct ipoib_neigh_hash *htbl; +	struct ipoib_neigh **buckets; +	u32 size; + +	clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); +	ntbl->htbl = NULL; +	htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); +	if (!htbl) +		return -ENOMEM; +	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); +	size = roundup_pow_of_two(arp_tbl.gc_thresh3); +	buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL); +	if (!buckets) { +		kfree(htbl); +		return -ENOMEM; +	} +	htbl->size = size; +	htbl->mask = (size - 1); +	htbl->buckets = buckets; +	ntbl->htbl = htbl; +	htbl->ntbl = ntbl; +	atomic_set(&ntbl->entries, 0); + +	/* start garbage collection */ +	clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); +	queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, +			   arp_tbl.gc_interval);  	return 0;  } +static void neigh_hash_free_rcu(struct rcu_head *head) +{ +	struct ipoib_neigh_hash *htbl = container_of(head, +						    struct ipoib_neigh_hash, +						    rcu); +	struct ipoib_neigh __rcu **buckets = htbl->buckets; +	struct ipoib_neigh_table *ntbl = htbl->ntbl; + +	kfree(buckets); +	kfree(htbl); +	complete(&ntbl->deleted); +} + +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) +{ +	struct ipoib_dev_priv *priv = netdev_priv(dev); +	struct ipoib_neigh_table *ntbl = &priv->ntbl; +	struct ipoib_neigh_hash *htbl; +	unsigned long flags; +	int i; + +	/* remove all neigh connected to a given path or mcast */ +	spin_lock_irqsave(&priv->lock, flags); + +	htbl = rcu_dereference_protected(ntbl->htbl, +					 lockdep_is_held(&priv->lock)); + +	if (!htbl) +		goto out_unlock; + +	for (i = 0; i < htbl->size; i++) { +		struct ipoib_neigh *neigh; +		struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + +		while ((neigh = rcu_dereference_protected(*np, +							  lockdep_is_held(&priv->lock))) != NULL) { +			/* delete neighs belong to this parent */ +			if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { +				rcu_assign_pointer(*np, +						   rcu_dereference_protected(neigh->hnext, +									     lockdep_is_held(&priv->lock))); +				/* remove from parent list */ +				list_del(&neigh->list); +				call_rcu(&neigh->rcu, ipoib_neigh_reclaim); +			} else { +				np = &neigh->hnext; +			} + +		} +	} +out_unlock: +	spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) +{ +	struct ipoib_neigh_table *ntbl = &priv->ntbl; +	struct ipoib_neigh_hash *htbl; +	unsigned long flags; +	int i, wait_flushed = 0; + +	init_completion(&priv->ntbl.flushed); + +	spin_lock_irqsave(&priv->lock, flags); + +	htbl = rcu_dereference_protected(ntbl->htbl, +					lockdep_is_held(&priv->lock)); +	if (!htbl) +		goto out_unlock; + +	wait_flushed = atomic_read(&priv->ntbl.entries); +	if (!wait_flushed) +		goto free_htbl; + +	for (i = 0; i < htbl->size; i++) { +		struct ipoib_neigh *neigh; +		struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + +		while ((neigh = 
rcu_dereference_protected(*np, +				       lockdep_is_held(&priv->lock))) != NULL) { +			rcu_assign_pointer(*np, +					   rcu_dereference_protected(neigh->hnext, +								     lockdep_is_held(&priv->lock))); +			/* remove from path/mc list */ +			list_del(&neigh->list); +			call_rcu(&neigh->rcu, ipoib_neigh_reclaim); +		} +	} + +free_htbl: +	rcu_assign_pointer(ntbl->htbl, NULL); +	call_rcu(&htbl->rcu, neigh_hash_free_rcu); + +out_unlock: +	spin_unlock_irqrestore(&priv->lock, flags); +	if (wait_flushed) +		wait_for_completion(&priv->ntbl.flushed); +} + +static void ipoib_neigh_hash_uninit(struct net_device *dev) +{ +	struct ipoib_dev_priv *priv = netdev_priv(dev); +	int stopped; + +	ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); +	init_completion(&priv->ntbl.deleted); +	set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + +	/* Stop GC if called at init fail need to cancel work */ +	stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); +	if (!stopped) +		cancel_delayed_work(&priv->neigh_reap_task); + +	ipoib_flush_neighs(priv); + +	wait_for_completion(&priv->ntbl.deleted); +} + +  int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)  {  	struct ipoib_dev_priv *priv = netdev_priv(dev); +	if (ipoib_neigh_hash_init(priv) < 0) +		goto out;  	/* Allocate RX/TX "rings" to hold queued skbs */  	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,  				GFP_KERNEL);  	if (!priv->rx_ring) {  		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",  		       ca->name, ipoib_recvq_size); -		goto out; +		goto out_neigh_hash_cleanup;  	} -	priv->tx_ring = vmalloc(ipoib_sendq_size * sizeof *priv->tx_ring); +	priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);  	if (!priv->tx_ring) {  		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",  		       ca->name, ipoib_sendq_size);  		goto out_rx_ring_cleanup;  	} -	memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring);  	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */ @@ -946,6 +1293,8 @@ out_tx_ring_cleanup:  out_rx_ring_cleanup:  	kfree(priv->rx_ring); +out_neigh_hash_cleanup: +	ipoib_neigh_hash_uninit(dev);  out:  	return -ENOMEM;  } @@ -953,15 +1302,20 @@ out:  void ipoib_dev_cleanup(struct net_device *dev)  {  	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; +	LIST_HEAD(head); + +	ASSERT_RTNL();  	ipoib_delete_debug_files(dev);  	/* Delete any child interfaces first */  	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { -		unregister_netdev(cpriv->dev); -		ipoib_dev_cleanup(cpriv->dev); -		free_netdev(cpriv->dev); +		/* Stop GC on child */ +		set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags); +		cancel_delayed_work(&cpriv->neigh_reap_task); +		unregister_netdevice_queue(cpriv->dev, &head);  	} +	unregister_netdevice_many(&head);  	ipoib_ib_dev_cleanup(dev); @@ -970,71 +1324,26 @@ void ipoib_dev_cleanup(struct net_device *dev)  	priv->rx_ring = NULL;  	priv->tx_ring = NULL; + +	ipoib_neigh_hash_uninit(dev);  }  static const struct header_ops ipoib_header_ops = {  	.create	= ipoib_hard_header,  }; -static int get_skb_hdr(struct sk_buff *skb, void **iphdr, -		       void **tcph, u64 *hdr_flags, void *priv) -{ -	unsigned int ip_len; -	struct iphdr *iph; - -	if (unlikely(skb->protocol != htons(ETH_P_IP))) -		return -1; - -	/* -	 * In the future we may add an else clause that verifies the -	 * checksum and allows devices which do not calculate checksum -	 * to use LRO. 
-	 */ -	if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) -		return -1; - -	/* Check for non-TCP packet */ -	skb_reset_network_header(skb); -	iph = ip_hdr(skb); -	if (iph->protocol != IPPROTO_TCP) -		return -1; - -	ip_len = ip_hdrlen(skb); -	skb_set_transport_header(skb, ip_len); -	*tcph = tcp_hdr(skb); - -	/* check if IP header and TCP header are complete */ -	if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb)) -		return -1; - -	*hdr_flags = LRO_IPV4 | LRO_TCP; -	*iphdr = iph; - -	return 0; -} - -static void ipoib_lro_setup(struct ipoib_dev_priv *priv) -{ -	priv->lro.lro_mgr.max_aggr	 = lro_max_aggr; -	priv->lro.lro_mgr.max_desc	 = IPOIB_MAX_LRO_DESCRIPTORS; -	priv->lro.lro_mgr.lro_arr	 = priv->lro.lro_desc; -	priv->lro.lro_mgr.get_skb_header = get_skb_hdr; -	priv->lro.lro_mgr.features	 = LRO_F_NAPI; -	priv->lro.lro_mgr.dev		 = priv->dev; -	priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; -} -  static const struct net_device_ops ipoib_netdev_ops = { +	.ndo_uninit		 = ipoib_uninit,  	.ndo_open		 = ipoib_open,  	.ndo_stop		 = ipoib_stop,  	.ndo_change_mtu		 = ipoib_change_mtu, +	.ndo_fix_features	 = ipoib_fix_features,  	.ndo_start_xmit	 	 = ipoib_start_xmit,  	.ndo_tx_timeout		 = ipoib_timeout, -	.ndo_set_multicast_list	 = ipoib_set_mcast_list, -	.ndo_neigh_setup	 = ipoib_neigh_setup_dev, +	.ndo_set_rx_mode	 = ipoib_set_mcast_list,  }; -static void ipoib_setup(struct net_device *dev) +void ipoib_setup(struct net_device *dev)  {  	struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -1043,17 +1352,13 @@ static void ipoib_setup(struct net_device *dev)  	ipoib_set_ethtool_ops(dev); -	netif_napi_add(dev, &priv->napi, ipoib_poll, 100); +	netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT);  	dev->watchdog_timeo	 = HZ;  	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST; -	/* -	 * We add in INFINIBAND_ALEN to allow for the destination -	 * address "pseudoheader" for skbs without neighbour struct. 
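
The lines removed here pair with the ipoib_hard_header() rework earlier in this file: the destination address is no longer pushed onto the front of the skb as a pseudoheader but stored in skb->cb via struct ipoib_cb, which is why hard_header_len can drop back to IPOIB_ENCAP_LEN just below. A minimal sketch of the transmit-side counterpart, following the pattern already used in ipoib_start_xmit():

	/* Sketch only: reading back the address saved by ipoib_hard_header(). */
	struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
	u32 dqpn   = IPOIB_QPN(cb->hwaddr);	/* 24-bit destination QPN */
	void *dgid = cb->hwaddr + 4;		/* 16-byte destination GID */
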
-	 */ -	dev->hard_header_len	 = IPOIB_ENCAP_LEN + INFINIBAND_ALEN; +	dev->hard_header_len	 = IPOIB_ENCAP_LEN;  	dev->addr_len		 = INFINIBAND_ALEN;  	dev->type		 = ARPHRD_INFINIBAND;  	dev->tx_queue_len	 = ipoib_sendq_size * 2; @@ -1063,15 +1368,11 @@ static void ipoib_setup(struct net_device *dev)  	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); -	netif_carrier_off(dev); -  	priv->dev = dev; -	ipoib_lro_setup(priv); -  	spin_lock_init(&priv->lock); -	mutex_init(&priv->vlan_mutex); +	init_rwsem(&priv->vlan_rwsem);  	INIT_LIST_HEAD(&priv->path_list);  	INIT_LIST_HEAD(&priv->child_intfs); @@ -1086,6 +1387,7 @@ static void ipoib_setup(struct net_device *dev)  	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);  	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);  	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); +	INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);  }  struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) @@ -1117,12 +1419,9 @@ static ssize_t show_umcast(struct device *dev,  	return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));  } -static ssize_t set_umcast(struct device *dev, -			  struct device_attribute *attr, -			  const char *buf, size_t count) +void ipoib_set_umcast(struct net_device *ndev, int umcast_val)  { -	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); -	unsigned long umcast_val = simple_strtoul(buf, NULL, 0); +	struct ipoib_dev_priv *priv = netdev_priv(ndev);  	if (umcast_val > 0) {  		set_bit(IPOIB_FLAG_UMCAST, &priv->flags); @@ -1130,6 +1429,15 @@ static ssize_t set_umcast(struct device *dev,  				"by userspace\n");  	} else  		clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); +} + +static ssize_t set_umcast(struct device *dev, +			  struct device_attribute *attr, +			  const char *buf, size_t count) +{ +	unsigned long umcast_val = simple_strtoul(buf, NULL, 0); + +	ipoib_set_umcast(to_net_dev(dev), umcast_val);  	return count;  } @@ -1150,7 +1458,7 @@ static ssize_t create_child(struct device *dev,  	if (sscanf(buf, "%i", &pkey) != 1)  		return -EINVAL; -	if (pkey < 0 || pkey > 0xffff) +	if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)  		return -EINVAL;  	/* @@ -1214,20 +1522,18 @@ int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)  	kfree(device_attr);  	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { -		set_bit(IPOIB_FLAG_CSUM, &priv->flags); -		priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; -	} +		priv->dev->hw_features = NETIF_F_SG | +			NETIF_F_IP_CSUM | NETIF_F_RXCSUM; -	if (lro) -		priv->dev->features |= NETIF_F_LRO; +		if (priv->hca_caps & IB_DEVICE_UD_TSO) +			priv->dev->hw_features |= NETIF_F_TSO; -	if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) -		priv->dev->features |= NETIF_F_TSO; +		priv->dev->features |= priv->dev->hw_features; +	}  	return 0;  } -  static struct net_device *ipoib_add_port(const char *format,  					 struct ib_device *hca, u8 port)  { @@ -1254,6 +1560,8 @@ static struct net_device *ipoib_add_port(const char *format,  	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);  	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu; +	priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh); +  	result = ib_query_pkey(hca, port, 0, &priv->pkey);  	if (result) {  		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", @@ -1326,6 +1634,9 @@ sysfs_failed:  register_failed:  	ib_unregister_event_handler(&priv->event_handler); +	/* Stop GC if started before flush */ +	set_bit(IPOIB_STOP_NEIGH_GC, 
&priv->flags); +	cancel_delayed_work(&priv->neigh_reap_task);  	flush_workqueue(ipoib_workqueue);  event_failed: @@ -1384,6 +1695,8 @@ static void ipoib_remove_one(struct ib_device *device)  		return;  	dev_list = ib_get_client_data(device, &ipoib_client); +	if (!dev_list) +		return;  	list_for_each_entry_safe(priv, tmp, dev_list, list) {  		ib_unregister_event_handler(&priv->event_handler); @@ -1392,10 +1705,12 @@ static void ipoib_remove_one(struct ib_device *device)  		dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);  		rtnl_unlock(); +		/* Stop GC */ +		set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); +		cancel_delayed_work(&priv->neigh_reap_task);  		flush_workqueue(ipoib_workqueue);  		unregister_netdev(priv->dev); -		ipoib_dev_cleanup(priv->dev);  		free_netdev(priv->dev);  	} @@ -1447,8 +1762,15 @@ static int __init ipoib_init_module(void)  	if (ret)  		goto err_sa; +	ret = ipoib_netlink_init(); +	if (ret) +		goto err_client; +  	return 0; +err_client: +	ib_unregister_client(&ipoib_client); +  err_sa:  	ib_sa_unregister_client(&ipoib_sa_client);  	destroy_workqueue(ipoib_workqueue); @@ -1461,6 +1783,7 @@ err_fs:  static void __exit ipoib_cleanup_module(void)  { +	ipoib_netlink_fini();  	ib_unregister_client(&ipoib_client);  	ib_sa_unregister_client(&ipoib_sa_client);  	ipoib_unregister_debugfs(); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 3871ac66355..d4e005720d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -34,6 +34,7 @@  #include <linux/skbuff.h>  #include <linux/rtnetlink.h> +#include <linux/moduleparam.h>  #include <linux/ip.h>  #include <linux/in.h>  #include <linux/igmp.h> @@ -68,28 +69,13 @@ struct ipoib_mcast_iter {  static void ipoib_mcast_free(struct ipoib_mcast *mcast)  {  	struct net_device *dev = mcast->dev; -	struct ipoib_dev_priv *priv = netdev_priv(dev); -	struct ipoib_neigh *neigh, *tmp;  	int tx_dropped = 0;  	ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n",  			mcast->mcmember.mgid.raw); -	spin_lock_irq(&priv->lock); - -	list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) { -		/* -		 * It's safe to call ipoib_put_ah() inside priv->lock -		 * here, because we know that mcast->ah will always -		 * hold one more reference, so ipoib_put_ah() will -		 * never do more than decrement the ref count. -		 */ -		if (neigh->ah) -			ipoib_put_ah(neigh->ah); -		ipoib_neigh_free(dev, neigh); -	} - -	spin_unlock_irq(&priv->lock); +	/* remove all neigh connected to this mcast */ +	ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw);  	if (mcast->ah)  		ipoib_put_ah(mcast->ah); @@ -189,7 +175,9 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,  	mcast->mcmember = *mcmember; -	/* Set the cached Q_Key before we attach if it's the broadcast group */ +	/* Set the multicast MTU and cached Q_Key before we attach if it's +	 * the broadcast group. 
+	 */  	if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4,  		    sizeof (union ib_gid))) {  		spin_lock_irq(&priv->lock); @@ -197,10 +185,17 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,  			spin_unlock_irq(&priv->lock);  			return -EAGAIN;  		} +		priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));  		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);  		spin_unlock_irq(&priv->lock);  		priv->tx_wr.wr.ud.remote_qkey = priv->qkey;  		set_qkey = 1; + +		if (!ipoib_cm_admin_enabled(dev)) { +			rtnl_lock(); +			dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); +			rtnl_unlock(); +		}  	}  	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { @@ -239,8 +234,11 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,  		av.grh.dgid = mcast->mcmember.mgid;  		ah = ipoib_create_ah(dev, priv->pd, &av); -		if (!ah) { -			ipoib_warn(priv, "ib_address_create failed\n"); +		if (IS_ERR(ah)) { +			ipoib_warn(priv, "ib_address_create failed %ld\n", +				-PTR_ERR(ah)); +			/* use original error */ +			return PTR_ERR(ah);  		} else {  			spin_lock_irq(&priv->lock);  			mcast->ah = ah; @@ -258,17 +256,13 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,  	netif_tx_lock_bh(dev);  	while (!skb_queue_empty(&mcast->pkt_queue)) {  		struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); +  		netif_tx_unlock_bh(dev);  		skb->dev = dev; - -		if (!skb_dst(skb) || !skb_dst(skb)->neighbour) { -			/* put pseudoheader back on for next time */ -			skb_push(skb, sizeof (struct ipoib_pseudoheader)); -		} -  		if (dev_queue_xmit(skb))  			ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); +  		netif_tx_lock_bh(dev);  	}  	netif_tx_unlock_bh(dev); @@ -392,8 +386,10 @@ static int ipoib_mcast_join_complete(int status,  			mcast->mcmember.mgid.raw, status);  	/* We trap for port events ourselves. 
*/ -	if (status == -ENETRESET) -		return 0; +	if (status == -ENETRESET) { +		status = 0; +		goto out; +	}  	if (!status)  		status = ipoib_mcast_join_finish(mcast, &multicast->rec); @@ -413,7 +409,8 @@ static int ipoib_mcast_join_complete(int status,  		if (mcast == priv->broadcast)  			queue_work(ipoib_workqueue, &priv->carrier_on_task); -		return 0; +		status = 0; +		goto out;  	}  	if (mcast->logcount++ < 20) { @@ -440,7 +437,8 @@ static int ipoib_mcast_join_complete(int status,  				   mcast->backoff * HZ);  	spin_unlock_irq(&priv->lock);  	mutex_unlock(&mcast_mutex); - +out: +	complete(&mcast->done);  	return status;  } @@ -490,11 +488,15 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,  	}  	set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); +	init_completion(&mcast->done); +	set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); +  	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,  					 &rec, comp_mask, GFP_KERNEL,  					 ipoib_mcast_join_complete, mcast);  	if (IS_ERR(mcast->mc)) {  		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); +		complete(&mcast->done);  		ret = PTR_ERR(mcast->mc);  		ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); @@ -516,10 +518,18 @@ void ipoib_mcast_join_task(struct work_struct *work)  	struct ipoib_dev_priv *priv =  		container_of(work, struct ipoib_dev_priv, mcast_task.work);  	struct net_device *dev = priv->dev; +	struct ib_port_attr port_attr;  	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))  		return; +	if (ib_query_port(priv->ca, priv->port, &port_attr) || +	    port_attr.state != IB_PORT_ACTIVE) { +		ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n", +			  port_attr.state); +		return; +	} +  	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))  		ipoib_warn(priv, "ib_query_gid() failed\n");  	else @@ -589,14 +599,6 @@ void ipoib_mcast_join_task(struct work_struct *work)  		return;  	} -	priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); - -	if (!ipoib_cm_admin_enabled(dev)) { -		rtnl_lock(); -		dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); -		rtnl_unlock(); -	} -  	ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");  	clear_bit(IPOIB_MCAST_RUN, &priv->flags); @@ -655,11 +657,12 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)  	return 0;  } -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)  {  	struct ipoib_dev_priv *priv = netdev_priv(dev);  	struct ipoib_mcast *mcast;  	unsigned long flags; +	void *mgid = daddr + 4;  	spin_lock_irqsave(&priv->lock, flags); @@ -715,21 +718,23 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)  out:  	if (mcast && mcast->ah) { -		if (skb_dst(skb)		&& -		    skb_dst(skb)->neighbour && -		    !*to_ipoib_neigh(skb_dst(skb)->neighbour)) { -			struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb_dst(skb)->neighbour, -									skb->dev); +		struct ipoib_neigh *neigh; +		spin_unlock_irqrestore(&priv->lock, flags); +		neigh = ipoib_neigh_get(dev, daddr); +		spin_lock_irqsave(&priv->lock, flags); +		if (!neigh) { +			neigh = ipoib_neigh_alloc(daddr, dev);  			if (neigh) {  				kref_get(&mcast->ah->ref);  				neigh->ah	= mcast->ah;  				list_add_tail(&neigh->list, &mcast->neigh_list);  			}  		} -  		spin_unlock_irqrestore(&priv->lock, flags);  		ipoib_send(dev, skb, mcast->ah, 
IB_MULTICAST_QPN); +		if (neigh) +			ipoib_neigh_put(neigh);  		return;  	} @@ -762,6 +767,11 @@ void ipoib_mcast_dev_flush(struct net_device *dev)  	spin_unlock_irqrestore(&priv->lock, flags); +	/* seperate between the wait to the leave*/ +	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) +		if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) +			wait_for_completion(&mcast->done); +  	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {  		ipoib_mcast_leave(dev, mcast);  		ipoib_mcast_free(mcast); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c new file mode 100644 index 00000000000..cdc7df4fdb8 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. -  All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses.  You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + *     Redistribution and use in source and binary forms, with or + *     without modification, are permitted provided that the following + *     conditions are met: + * + *      - Redistributions of source code must retain the above + *        copyright notice, this list of conditions and the following + *        disclaimer. + * + *      - Redistributions in binary form must reproduce the above + *        copyright notice, this list of conditions and the following + *        disclaimer in the documentation and/or other materials + *        provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/netdevice.h> +#include <linux/if_arp.h>      /* For ARPHRD_xxx */ +#include <linux/module.h> +#include <net/rtnetlink.h> +#include "ipoib.h" + +static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = { +	[IFLA_IPOIB_PKEY]	= { .type = NLA_U16 }, +	[IFLA_IPOIB_MODE]	= { .type = NLA_U16 }, +	[IFLA_IPOIB_UMCAST]	= { .type = NLA_U16 }, +}; + +static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ +	struct ipoib_dev_priv *priv = netdev_priv(dev); +	u16 val; + +	if (nla_put_u16(skb, IFLA_IPOIB_PKEY, priv->pkey)) +		goto nla_put_failure; + +	val = test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +	if (nla_put_u16(skb, IFLA_IPOIB_MODE, val)) +		goto nla_put_failure; + +	val = test_bit(IPOIB_FLAG_UMCAST, &priv->flags); +	if (nla_put_u16(skb, IFLA_IPOIB_UMCAST, val)) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	return -EMSGSIZE; +} + +static int ipoib_changelink(struct net_device *dev, +			    struct nlattr *tb[], struct nlattr *data[]) +{ +	u16 mode, umcast; +	int ret = 0; + +	if (data[IFLA_IPOIB_MODE]) { +		mode  = nla_get_u16(data[IFLA_IPOIB_MODE]); +		if (mode == IPOIB_MODE_DATAGRAM) +			ret = ipoib_set_mode(dev, "datagram\n"); +		else if (mode == IPOIB_MODE_CONNECTED) +			ret = ipoib_set_mode(dev, "connected\n"); +		else +			ret = -EINVAL; + +		if (ret < 0) +			goto out_err; +	} + +	if (data[IFLA_IPOIB_UMCAST]) { +		umcast = nla_get_u16(data[IFLA_IPOIB_UMCAST]); +		ipoib_set_umcast(dev, umcast); +	} + +out_err: +	return ret; +} + +static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, +			       struct nlattr *tb[], struct nlattr *data[]) +{ +	struct net_device *pdev; +	struct ipoib_dev_priv *ppriv; +	u16 child_pkey; +	int err; + +	if (!tb[IFLA_LINK]) +		return -EINVAL; + +	pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); +	if (!pdev || pdev->type != ARPHRD_INFINIBAND) +		return -ENODEV; + +	ppriv = netdev_priv(pdev); + +	if (test_bit(IPOIB_FLAG_SUBINTERFACE, &ppriv->flags)) { +		ipoib_warn(ppriv, "child creation disallowed for child devices\n"); +		return -EINVAL; +	} + +	if (!data || !data[IFLA_IPOIB_PKEY]) { +		ipoib_dbg(ppriv, "no pkey specified, using parent pkey\n"); +		child_pkey  = ppriv->pkey; +	} else +		child_pkey  = nla_get_u16(data[IFLA_IPOIB_PKEY]); + +	if (child_pkey == 0 || child_pkey == 0x8000) +		return -EINVAL; + +	/* +	 * Set the full membership bit, so that we join the right +	 * broadcast group, etc. 
+	 */ +	child_pkey |= 0x8000; + +	err = __ipoib_vlan_add(ppriv, netdev_priv(dev), child_pkey, IPOIB_RTNL_CHILD); + +	if (!err && data) +		err = ipoib_changelink(dev, tb, data); +	return err; +} + +static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head) +{ +	struct ipoib_dev_priv *priv, *ppriv; + +	priv = netdev_priv(dev); +	ppriv = netdev_priv(priv->parent); + +	down_write(&ppriv->vlan_rwsem); +	unregister_netdevice_queue(dev, head); +	list_del(&priv->list); +	up_write(&ppriv->vlan_rwsem); +} + +static size_t ipoib_get_size(const struct net_device *dev) +{ +	return nla_total_size(2) +	/* IFLA_IPOIB_PKEY   */ +		nla_total_size(2) +	/* IFLA_IPOIB_MODE   */ +		nla_total_size(2);	/* IFLA_IPOIB_UMCAST */ +} + +static struct rtnl_link_ops ipoib_link_ops __read_mostly = { +	.kind		= "ipoib", +	.maxtype	= IFLA_IPOIB_MAX, +	.policy		= ipoib_policy, +	.priv_size	= sizeof(struct ipoib_dev_priv), +	.setup		= ipoib_setup, +	.newlink	= ipoib_new_child_link, +	.changelink	= ipoib_changelink, +	.dellink	= ipoib_unregister_child_dev, +	.get_size	= ipoib_get_size, +	.fill_info	= ipoib_fill_info, +}; + +int __init ipoib_netlink_init(void) +{ +	return rtnl_link_register(&ipoib_link_ops); +} + +void __exit ipoib_netlink_fini(void) +{ +	rtnl_link_unregister(&ipoib_link_ops); +} + +MODULE_ALIAS_RTNL_LINK("ipoib"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 049a997caff..c56d5d44c53 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -192,6 +192,9 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)  	if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK)  		init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; +	if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) +		init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; +  	if (dev->features & NETIF_F_SG)  		init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index d7e9740c724..9fad7b5ac8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -49,47 +49,11 @@ static ssize_t show_parent(struct device *d, struct device_attribute *attr,  }  static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); -int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, +		     u16 pkey, int type)  { -	struct ipoib_dev_priv *ppriv, *priv; -	char intf_name[IFNAMSIZ];  	int result; -	if (!capable(CAP_NET_ADMIN)) -		return -EPERM; - -	ppriv = netdev_priv(pdev); - -	if (!rtnl_trylock()) -		return restart_syscall(); -	mutex_lock(&ppriv->vlan_mutex); - -	/* -	 * First ensure this isn't a duplicate. We check the parent device and -	 * then all of the child interfaces to make sure the Pkey doesn't match. 
-	 */ -	if (ppriv->pkey == pkey) { -		result = -ENOTUNIQ; -		priv = NULL; -		goto err; -	} - -	list_for_each_entry(priv, &ppriv->child_intfs, list) { -		if (priv->pkey == pkey) { -			result = -ENOTUNIQ; -			priv = NULL; -			goto err; -		} -	} - -	snprintf(intf_name, sizeof intf_name, "%s.%04x", -		 ppriv->dev->name, pkey); -	priv = ipoib_intf_alloc(intf_name); -	if (!priv) { -		result = -ENOMEM; -		goto err; -	} -  	priv->max_ib_mtu = ppriv->max_ib_mtu;  	/* MTU will be reset when mcast join happens */  	priv->dev->mtu   = IPOIB_UD_MTU(priv->max_ib_mtu); @@ -124,24 +88,27 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)  	ipoib_create_debug_files(priv->dev); -	if (ipoib_cm_add_mode_attr(priv->dev)) -		goto sysfs_failed; -	if (ipoib_add_pkey_attr(priv->dev)) -		goto sysfs_failed; -	if (ipoib_add_umcast_attr(priv->dev)) -		goto sysfs_failed; - -	if (device_create_file(&priv->dev->dev, &dev_attr_parent)) -		goto sysfs_failed; +	/* RTNL childs don't need proprietary sysfs entries */ +	if (type == IPOIB_LEGACY_CHILD) { +		if (ipoib_cm_add_mode_attr(priv->dev)) +			goto sysfs_failed; +		if (ipoib_add_pkey_attr(priv->dev)) +			goto sysfs_failed; +		if (ipoib_add_umcast_attr(priv->dev)) +			goto sysfs_failed; + +		if (device_create_file(&priv->dev->dev, &dev_attr_parent)) +			goto sysfs_failed; +	} +	priv->child_type  = type; +	priv->dev->iflink = ppriv->dev->ifindex;  	list_add_tail(&priv->list, &ppriv->child_intfs); -	mutex_unlock(&ppriv->vlan_mutex); -	rtnl_unlock(); -  	return 0;  sysfs_failed: +	result = -ENOMEM;  	ipoib_delete_debug_files(priv->dev);  	unregister_netdevice(priv->dev); @@ -149,11 +116,60 @@ register_failed:  	ipoib_dev_cleanup(priv->dev);  err: -	mutex_unlock(&ppriv->vlan_mutex); -	rtnl_unlock(); -	if (priv) +	return result; +} + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +{ +	struct ipoib_dev_priv *ppriv, *priv; +	char intf_name[IFNAMSIZ]; +	struct ipoib_dev_priv *tpriv; +	int result; + +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; + +	ppriv = netdev_priv(pdev); + +	snprintf(intf_name, sizeof intf_name, "%s.%04x", +		 ppriv->dev->name, pkey); +	priv = ipoib_intf_alloc(intf_name); +	if (!priv) +		return -ENOMEM; + +	if (!rtnl_trylock()) +		return restart_syscall(); + +	down_write(&ppriv->vlan_rwsem); + +	/* +	 * First ensure this isn't a duplicate. We check the parent device and +	 * then all of the legacy child interfaces to make sure the Pkey +	 * doesn't match. 
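
With the netlink support added above, there are now two ways to create a P_Key child interface: the legacy path through this sysfs code (IPOIB_LEGACY_CHILD, which keeps the proprietary sysfs attributes) and the new rtnetlink path (IPOIB_RTNL_CHILD). Assuming an iproute2 build that knows the "ipoib" link kind, the two would be exercised roughly as follows; the interface names and P_Key value are only examples:

	# legacy path, via the create_child attribute shown earlier
	echo 0x8001 > /sys/class/net/ib0/create_child

	# rtnetlink path, via the "ipoib" rtnl_link_ops registered above
	ip link add link ib0 name ib0.8001 type ipoib pkey 0x8001
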
+	 */ +	if (ppriv->pkey == pkey) { +		result = -ENOTUNIQ; +		goto out; +	} + +	list_for_each_entry(tpriv, &ppriv->child_intfs, list) { +		if (tpriv->pkey == pkey && +		    tpriv->child_type == IPOIB_LEGACY_CHILD) { +			result = -ENOTUNIQ; +			goto out; +		} +	} + +	result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); + +out: +	up_write(&ppriv->vlan_rwsem); + +	if (result)  		free_netdev(priv->dev); +	rtnl_unlock(); +  	return result;  } @@ -169,17 +185,19 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)  	if (!rtnl_trylock())  		return restart_syscall(); -	mutex_lock(&ppriv->vlan_mutex); + +	down_write(&ppriv->vlan_rwsem);  	list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { -		if (priv->pkey == pkey) { +		if (priv->pkey == pkey && +		    priv->child_type == IPOIB_LEGACY_CHILD) {  			unregister_netdevice(priv->dev); -			ipoib_dev_cleanup(priv->dev);  			list_del(&priv->list);  			dev = priv->dev;  			break;  		}  	} -	mutex_unlock(&ppriv->vlan_mutex); +	up_write(&ppriv->vlan_rwsem); +  	rtnl_unlock();  	if (dev) { diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 7b2fc98e2f2..eb7973957a6 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -5,6 +5,7 @@   * Copyright (C) 2004 Alex Aizman   * Copyright (C) 2005 Mike Christie   * Copyright (c) 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.   * maintained by openib-general@openib.org   *   * This software is available to you under a choice of one of two @@ -57,6 +58,7 @@  #include <linux/scatterlist.h>  #include <linux/delay.h>  #include <linux/slab.h> +#include <linux/module.h>  #include <net/sock.h> @@ -80,15 +82,24 @@ static unsigned int iscsi_max_lun = 512;  module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);  int iser_debug_level = 0; +bool iser_pi_enable = false; +int iser_pi_guard = 0; -MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover " -		   "v" DRV_VER " (" DRV_DATE ")"); +MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover");  MODULE_LICENSE("Dual BSD/GPL");  MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); +MODULE_VERSION(DRV_VER);  module_param_named(debug_level, iser_debug_level, int, 0644);  MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)"); +module_param_named(pi_enable, iser_pi_enable, bool, 0644); +MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); + +module_param_named(pi_guard, iser_pi_guard, int, 0644); +MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:CRC)"); + +static struct workqueue_struct *release_wq;  struct iser_global ig;  void @@ -101,13 +112,17 @@ iscsi_iser_recv(struct iscsi_conn *conn,  	/* verify PDU length */  	datalen = ntoh24(hdr->dlength); -	if (datalen != rx_data_len) { -		printk(KERN_ERR "iscsi_iser: datalen %d (hdr) != %d (IB) \n", -		       datalen, rx_data_len); +	if (datalen > rx_data_len || (datalen + 4) < rx_data_len) { +		iser_err("wrong datalen %d (hdr), %d (IB)\n", +			datalen, rx_data_len);  		rc = ISCSI_ERR_DATALEN;  		goto error;  	} +	if (datalen != rx_data_len) +		iser_dbg("aligned datalen (%d) hdr, %d (IB)\n", +			datalen, rx_data_len); +  	/* read AHS */  	ahslen = hdr->hlength * 4; @@ -132,8 +147,8 @@ static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)  int iser_initialize_task_headers(struct iscsi_task *task,  						struct 
iser_tx_desc *tx_desc)  { -	struct iscsi_iser_conn *iser_conn = task->conn->dd_data; -	struct iser_device     *device    = iser_conn->ib_conn->device; +	struct iser_conn       *ib_conn   = task->conn->dd_data; +	struct iser_device     *device    = ib_conn->device;  	struct iscsi_iser_task *iser_task = task->dd_data;  	u64 dma_addr; @@ -147,8 +162,7 @@ int iser_initialize_task_headers(struct iscsi_task *task,  	tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;  	tx_desc->tx_sg[0].lkey   = device->mr->lkey; -	iser_task->headers_initialized	= 1; -	iser_task->iser_conn		= iser_conn; +	iser_task->ib_conn = ib_conn;  	return 0;  }  /** @@ -162,8 +176,7 @@ iscsi_iser_task_init(struct iscsi_task *task)  {  	struct iscsi_iser_task *iser_task = task->dd_data; -	if (!iser_task->headers_initialized) -		if (iser_initialize_task_headers(task, &iser_task->desc)) +	if (iser_initialize_task_headers(task, &iser_task->desc))  			return -ENOMEM;  	/* mgmt task */ @@ -172,6 +185,8 @@ iscsi_iser_task_init(struct iscsi_task *task)  	iser_task->command_sent = 0;  	iser_task_rdma_init(iser_task); +	iser_task->sc = task->sc; +  	return 0;  } @@ -274,6 +289,12 @@ iscsi_iser_task_xmit(struct iscsi_task *task)  static void iscsi_iser_cleanup_task(struct iscsi_task *task)  {  	struct iscsi_iser_task *iser_task = task->dd_data; +	struct iser_tx_desc    *tx_desc   = &iser_task->desc; +	struct iser_conn       *ib_conn	  = task->conn->dd_data; +	struct iser_device     *device	  = ib_conn->device; + +	ib_dma_unmap_single(device->ib_device, +		tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);  	/* mgmt tasks do not need special cleanup */  	if (!task->sc) @@ -285,14 +306,25 @@ static void iscsi_iser_cleanup_task(struct iscsi_task *task)  	}  } +static u8 iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector) +{ +	struct iscsi_iser_task *iser_task = task->dd_data; + +	if (iser_task->dir[ISER_DIR_IN]) +		return iser_check_task_pi_status(iser_task, ISER_DIR_IN, +						 sector); +	else +		return iser_check_task_pi_status(iser_task, ISER_DIR_OUT, +						 sector); +} +  static struct iscsi_cls_conn *  iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx)  {  	struct iscsi_conn *conn;  	struct iscsi_cls_conn *cls_conn; -	struct iscsi_iser_conn *iser_conn; -	cls_conn = iscsi_conn_setup(cls_session, sizeof(*iser_conn), conn_idx); +	cls_conn = iscsi_conn_setup(cls_session, 0, conn_idx);  	if (!cls_conn)  		return NULL;  	conn = cls_conn->dd_data; @@ -303,39 +335,16 @@ iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx)  	 */  	conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN; -	iser_conn = conn->dd_data; -	conn->dd_data = iser_conn; -	iser_conn->iscsi_conn = conn; -  	return cls_conn;  } -static void -iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn) -{ -	struct iscsi_conn *conn = cls_conn->dd_data; -	struct iscsi_iser_conn *iser_conn = conn->dd_data; -	struct iser_conn *ib_conn = iser_conn->ib_conn; - -	iscsi_conn_teardown(cls_conn); -	/* -	 * Userspace will normally call the stop callback and -	 * already have freed the ib_conn, but if it goofed up then -	 * we free it here. 
-	 */ -	if (ib_conn) { -		ib_conn->iser_conn = NULL; -		iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ -	} -} -  static int  iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,  		     struct iscsi_cls_conn *cls_conn, uint64_t transport_eph,  		     int is_leading)  {  	struct iscsi_conn *conn = cls_conn->dd_data; -	struct iscsi_iser_conn *iser_conn; +	struct iscsi_session *session;  	struct iser_conn *ib_conn;  	struct iscsi_endpoint *ep;  	int error; @@ -354,51 +363,51 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,  	}  	ib_conn = ep->dd_data; +	session = conn->session; +	if (iser_alloc_rx_descriptors(ib_conn, session)) +		return -ENOMEM; +  	/* binds the iSER connection retrieved from the previously  	 * connected ep_handle to the iSCSI layer connection. exchanges  	 * connection pointers */ -	iser_err("binding iscsi/iser conn %p %p to ib_conn %p\n", -					conn, conn->dd_data, ib_conn); -	iser_conn = conn->dd_data; -	ib_conn->iser_conn = iser_conn; -	iser_conn->ib_conn  = ib_conn; -	iser_conn_get(ib_conn); /* ref iscsi/ib conn binding */ +	iser_info("binding iscsi conn %p to ib_conn %p\n", conn, ib_conn); + +	conn->dd_data = ib_conn; +	ib_conn->iscsi_conn = conn; +  	return 0;  } +static int +iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) +{ +	struct iscsi_conn *iscsi_conn; +	struct iser_conn *ib_conn; + +	iscsi_conn = cls_conn->dd_data; +	ib_conn = iscsi_conn->dd_data; +	reinit_completion(&ib_conn->stop_completion); + +	return iscsi_conn_start(cls_conn); +} +  static void  iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)  {  	struct iscsi_conn *conn = cls_conn->dd_data; -	struct iscsi_iser_conn *iser_conn = conn->dd_data; -	struct iser_conn *ib_conn = iser_conn->ib_conn; +	struct iser_conn *ib_conn = conn->dd_data; + +	iser_dbg("stopping iscsi_conn: %p, ib_conn: %p\n", conn, ib_conn); +	iscsi_conn_stop(cls_conn, flag);  	/*  	 * Userspace may have goofed up and not bound the connection or  	 * might have only partially setup the connection.  	 */  	if (ib_conn) { -		iscsi_conn_stop(cls_conn, flag); -		/* -		 * There is no unbind event so the stop callback -		 * must release the ref from the bind. -		 */ -		iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ +		conn->dd_data = NULL; +		complete(&ib_conn->stop_completion);  	} -	iser_conn->ib_conn = NULL; -} - -static int -iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) -{ -	struct iscsi_conn *conn = cls_conn->dd_data; -	int err; - -	err = iser_conn_set_full_featured_mode(conn); -	if (err) -		return err; - -	return iscsi_conn_start(cls_conn);  }  static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) @@ -410,6 +419,17 @@ static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session)  	iscsi_host_free(shost);  } +static inline unsigned int +iser_dif_prot_caps(int prot_caps) +{ +	return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? SHOST_DIF_TYPE1_PROTECTION | +						      SHOST_DIX_TYPE1_PROTECTION : 0) | +	       ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? SHOST_DIF_TYPE2_PROTECTION | +						      SHOST_DIX_TYPE2_PROTECTION : 0) | +	       ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? 
SHOST_DIF_TYPE3_PROTECTION | +						      SHOST_DIX_TYPE3_PROTECTION : 0); +} +  static struct iscsi_cls_session *  iscsi_iser_session_create(struct iscsi_endpoint *ep,  			  uint16_t cmds_max, uint16_t qdepth, @@ -418,12 +438,13 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,  	struct iscsi_cls_session *cls_session;  	struct iscsi_session *session;  	struct Scsi_Host *shost; -	struct iser_conn *ib_conn; +	struct iser_conn *ib_conn = NULL;  	shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);  	if (!shost)  		return NULL;  	shost->transportt = iscsi_iser_scsi_transport; +	shost->cmd_per_lun = qdepth;  	shost->max_lun = iscsi_max_lun;  	shost->max_id = 0;  	shost->max_channel = 0; @@ -433,19 +454,31 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,  	 * older userspace tools (before 2.0-870) did not pass us  	 * the leading conn's ep so this will be NULL;  	 */ -	if (ep) +	if (ep) {  		ib_conn = ep->dd_data; +		if (ib_conn->pi_support) { +			u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; + +			scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); +			if (iser_pi_guard) +				scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); +			else +				scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); +		} +	}  	if (iscsi_host_add(shost,  			   ep ? ib_conn->device->ib_device->dma_device : NULL))  		goto free_host; -	/* -	 * we do not support setting can_queue cmd_per_lun from userspace yet -	 * because we preallocate so many resources -	 */ +	if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) { +		iser_info("cmds_max changed from %u to %u\n", +			  cmds_max, ISER_DEF_XMIT_CMDS_MAX); +		cmds_max = ISER_DEF_XMIT_CMDS_MAX; +	} +  	cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, -					  ISCSI_DEF_XMIT_CMDS_MAX, 0, +					  cmds_max, 0,  					  sizeof(struct iscsi_iser_task),  					  initial_cmdsn, 0);  	if (!cls_session) @@ -475,28 +508,28 @@ iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn,  	case ISCSI_PARAM_HDRDGST_EN:  		sscanf(buf, "%d", &value);  		if (value) { -			printk(KERN_ERR "DataDigest wasn't negotiated to None"); +			iser_err("DataDigest wasn't negotiated to None\n");  			return -EPROTO;  		}  		break;  	case ISCSI_PARAM_DATADGST_EN:  		sscanf(buf, "%d", &value);  		if (value) { -			printk(KERN_ERR "DataDigest wasn't negotiated to None"); +			iser_err("DataDigest wasn't negotiated to None\n");  			return -EPROTO;  		}  		break;  	case ISCSI_PARAM_IFMARKER_EN:  		sscanf(buf, "%d", &value);  		if (value) { -			printk(KERN_ERR "IFMarker wasn't negotiated to No"); +			iser_err("IFMarker wasn't negotiated to No\n");  			return -EPROTO;  		}  		break;  	case ISCSI_PARAM_OFMARKER_EN:  		sscanf(buf, "%d", &value);  		if (value) { -			printk(KERN_ERR "OFMarker wasn't negotiated to No"); +			iser_err("OFMarker wasn't negotiated to No\n");  			return -EPROTO;  		}  		break; @@ -532,6 +565,29 @@ iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *s  	stats->custom[3].value = conn->fmr_unalign_cnt;  } +static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep, +				   enum iscsi_param param, char *buf) +{ +	struct iser_conn *ib_conn = ep->dd_data; +	int len; + +	switch (param) { +	case ISCSI_PARAM_CONN_PORT: +	case ISCSI_PARAM_CONN_ADDRESS: +		if (!ib_conn || !ib_conn->cma_id) +			return -ENOTCONN; + +		return iscsi_conn_get_addr_param((struct sockaddr_storage *) +					&ib_conn->cma_id->route.addr.dst_addr, +					param, buf); +		break; +	default: +		return -ENOSYS; +	} + +	return len; +} +  static struct iscsi_endpoint *  iscsi_iser_ep_connect(struct 
Scsi_Host *shost, struct sockaddr *dst_addr,  		      int non_blocking) @@ -550,10 +606,9 @@ iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr,  	err = iser_connect(ib_conn, NULL, (struct sockaddr_in *)dst_addr,  			   non_blocking); -	if (err) { -		iscsi_destroy_endpoint(ep); +	if (err)  		return ERR_PTR(err); -	} +  	return ep;  } @@ -574,7 +629,7 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)  	     ib_conn->state == ISER_CONN_DOWN))  		rc = -1; -	iser_err("ib conn %p rc = %d\n", ib_conn, rc); +	iser_info("ib conn %p rc = %d\n", ib_conn, rc);  	if (rc > 0)  		return 1; /* success, this is the equivalent of POLLOUT */ @@ -590,24 +645,79 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)  	struct iser_conn *ib_conn;  	ib_conn = ep->dd_data; -	if (ib_conn->iser_conn) -		/* -		 * Must suspend xmit path if the ep is bound to the -		 * iscsi_conn, so we know we are not accessing the ib_conn -		 * when we free it. -		 * -		 * This may not be bound if the ep poll failed. -		 */ -		iscsi_suspend_tx(ib_conn->iser_conn->iscsi_conn); - - -	iser_err("ib conn %p state %d\n",ib_conn, ib_conn->state); +	iser_info("ep %p ib conn %p state %d\n", ep, ib_conn, ib_conn->state);  	iser_conn_terminate(ib_conn); + +	/* +	 * if iser_conn and iscsi_conn are bound, we must wait iscsi_conn_stop +	 * call and ISER_CONN_DOWN state before freeing the iser resources. +	 * otherwise we are safe to free resources immediately. +	 */ +	if (ib_conn->iscsi_conn) { +		INIT_WORK(&ib_conn->release_work, iser_release_work); +		queue_work(release_wq, &ib_conn->release_work); +	} else { +		iser_conn_release(ib_conn); +	} +} + +static umode_t iser_attr_is_visible(int param_type, int param) +{ +	switch (param_type) { +	case ISCSI_HOST_PARAM: +		switch (param) { +		case ISCSI_HOST_PARAM_NETDEV_NAME: +		case ISCSI_HOST_PARAM_HWADDRESS: +		case ISCSI_HOST_PARAM_INITIATOR_NAME: +			return S_IRUGO; +		default: +			return 0; +		} +	case ISCSI_PARAM: +		switch (param) { +		case ISCSI_PARAM_MAX_RECV_DLENGTH: +		case ISCSI_PARAM_MAX_XMIT_DLENGTH: +		case ISCSI_PARAM_HDRDGST_EN: +		case ISCSI_PARAM_DATADGST_EN: +		case ISCSI_PARAM_CONN_ADDRESS: +		case ISCSI_PARAM_CONN_PORT: +		case ISCSI_PARAM_EXP_STATSN: +		case ISCSI_PARAM_PERSISTENT_ADDRESS: +		case ISCSI_PARAM_PERSISTENT_PORT: +		case ISCSI_PARAM_PING_TMO: +		case ISCSI_PARAM_RECV_TMO: +		case ISCSI_PARAM_INITIAL_R2T_EN: +		case ISCSI_PARAM_MAX_R2T: +		case ISCSI_PARAM_IMM_DATA_EN: +		case ISCSI_PARAM_FIRST_BURST: +		case ISCSI_PARAM_MAX_BURST: +		case ISCSI_PARAM_PDU_INORDER_EN: +		case ISCSI_PARAM_DATASEQ_INORDER_EN: +		case ISCSI_PARAM_TARGET_NAME: +		case ISCSI_PARAM_TPGT: +		case ISCSI_PARAM_USERNAME: +		case ISCSI_PARAM_PASSWORD: +		case ISCSI_PARAM_USERNAME_IN: +		case ISCSI_PARAM_PASSWORD_IN: +		case ISCSI_PARAM_FAST_ABORT: +		case ISCSI_PARAM_ABORT_TMO: +		case ISCSI_PARAM_LU_RESET_TMO: +		case ISCSI_PARAM_TGT_RESET_TMO: +		case ISCSI_PARAM_IFACE_NAME: +		case ISCSI_PARAM_INITIATOR_NAME: +		case ISCSI_PARAM_DISCOVERY_SESS: +			return S_IRUGO; +		default: +			return 0; +		} +	} + +	return 0;  }  static struct scsi_host_template iscsi_iser_sht = {  	.module                 = THIS_MODULE, -	.name                   = "iSCSI Initiator over iSER, v." 
DRV_VER, +	.name                   = "iSCSI Initiator over iSER",  	.queuecommand           = iscsi_queuecommand,  	.change_queue_depth	= iscsi_change_queue_depth,  	.sg_tablesize           = ISCSI_ISER_SG_TABLESIZE, @@ -625,40 +735,18 @@ static struct scsi_host_template iscsi_iser_sht = {  static struct iscsi_transport iscsi_iser_transport = {  	.owner                  = THIS_MODULE,  	.name                   = "iser", -	.caps                   = CAP_RECOVERY_L0 | CAP_MULTI_R2T, -	.param_mask		= ISCSI_MAX_RECV_DLENGTH | -				  ISCSI_MAX_XMIT_DLENGTH | -				  ISCSI_HDRDGST_EN | -				  ISCSI_DATADGST_EN | -				  ISCSI_INITIAL_R2T_EN | -				  ISCSI_MAX_R2T | -				  ISCSI_IMM_DATA_EN | -				  ISCSI_FIRST_BURST | -				  ISCSI_MAX_BURST | -				  ISCSI_PDU_INORDER_EN | -				  ISCSI_DATASEQ_INORDER_EN | -				  ISCSI_EXP_STATSN | -				  ISCSI_PERSISTENT_PORT | -				  ISCSI_PERSISTENT_ADDRESS | -				  ISCSI_TARGET_NAME | ISCSI_TPGT | -				  ISCSI_USERNAME | ISCSI_PASSWORD | -				  ISCSI_USERNAME_IN | ISCSI_PASSWORD_IN | -				  ISCSI_FAST_ABORT | ISCSI_ABORT_TMO | -				  ISCSI_LU_RESET_TMO | ISCSI_TGT_RESET_TMO | -				  ISCSI_PING_TMO | ISCSI_RECV_TMO | -				  ISCSI_IFACE_NAME | ISCSI_INITIATOR_NAME, -	.host_param_mask	= ISCSI_HOST_HWADDRESS | -				  ISCSI_HOST_NETDEV_NAME | -				  ISCSI_HOST_INITIATOR_NAME, +	.caps                   = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO,  	/* session management */  	.create_session         = iscsi_iser_session_create,  	.destroy_session        = iscsi_iser_session_destroy,  	/* connection management */  	.create_conn            = iscsi_iser_conn_create,  	.bind_conn              = iscsi_iser_conn_bind, -	.destroy_conn           = iscsi_iser_conn_destroy, +	.destroy_conn           = iscsi_conn_teardown, +	.attr_is_visible	= iser_attr_is_visible,  	.set_param              = iscsi_iser_set_param,  	.get_conn_param		= iscsi_conn_get_param, +	.get_ep_param		= iscsi_iser_get_ep_param,  	.get_session_param	= iscsi_session_get_param,  	.start_conn             = iscsi_iser_conn_start,  	.stop_conn              = iscsi_iser_conn_stop, @@ -672,6 +760,7 @@ static struct iscsi_transport iscsi_iser_transport = {  	.xmit_task		= iscsi_iser_task_xmit,  	.cleanup_task		= iscsi_iser_cleanup_task,  	.alloc_pdu		= iscsi_iser_pdu_alloc, +	.check_protection	= iscsi_iser_check_protection,  	/* recovery */  	.session_recovery_timedout = iscsi_session_recovery_timedout, @@ -687,7 +776,7 @@ static int __init iser_init(void)  	iser_dbg("Starting iSER datamover...\n");  	if (iscsi_max_lun < 1) { -		printk(KERN_ERR "Invalid max_lun value of %u\n", iscsi_max_lun); +		iser_err("Invalid max_lun value of %u\n", iscsi_max_lun);  		return -EINVAL;  	} @@ -706,6 +795,12 @@ static int __init iser_init(void)  	mutex_init(&ig.connlist_mutex);  	INIT_LIST_HEAD(&ig.connlist); +	release_wq = alloc_workqueue("release workqueue", 0, 0); +	if (!release_wq) { +		iser_err("failed to allocate release workqueue\n"); +		return -ENOMEM; +	} +  	iscsi_iser_scsi_transport = iscsi_register_transport(  							&iscsi_iser_transport);  	if (!iscsi_iser_scsi_transport) { @@ -724,7 +819,24 @@ register_transport_failure:  static void __exit iser_exit(void)  { +	struct iser_conn *ib_conn, *n; +	int connlist_empty; +  	iser_dbg("Removing iSER datamover...\n"); +	destroy_workqueue(release_wq); + +	mutex_lock(&ig.connlist_mutex); +	connlist_empty = list_empty(&ig.connlist); +	mutex_unlock(&ig.connlist_mutex); + +	if (!connlist_empty) { +		iser_err("Error cleanup stage completed but we still have iser " +			 
"connections, destroying them anyway.\n"); +		list_for_each_entry_safe(ib_conn, n, &ig.connlist, conn_list) { +			iser_conn_release(ib_conn); +		} +	} +  	iscsi_unregister_transport(&iscsi_iser_transport);  	kmem_cache_destroy(ig.desc_cache);  } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index f1df01567bb..97cd385bf7f 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -8,6 +8,7 @@   *   * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.   * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.   *   * This software is available to you under a choice of one of two   * licenses.  You may choose to be licensed under the terms of the GNU @@ -42,9 +43,13 @@  #include <linux/types.h>  #include <linux/net.h> +#include <linux/printk.h>  #include <scsi/libiscsi.h>  #include <scsi/scsi_transport_iscsi.h> +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_device.h> +#include <linux/interrupt.h>  #include <linux/wait.h>  #include <linux/sched.h>  #include <linux/list.h> @@ -64,12 +69,11 @@  #define DRV_NAME	"iser"  #define PFX		DRV_NAME ": " -#define DRV_VER		"0.1" -#define DRV_DATE	"May 7th, 2006" +#define DRV_VER		"1.4"  #define iser_dbg(fmt, arg...)				\  	do {						\ -		if (iser_debug_level > 1)		\ +		if (iser_debug_level > 2)		\  			printk(KERN_DEBUG PFX "%s:" fmt,\  				__func__ , ## arg);	\  	} while (0) @@ -77,7 +81,14 @@  #define iser_warn(fmt, arg...)				\  	do {						\  		if (iser_debug_level > 0)		\ -			printk(KERN_DEBUG PFX "%s:" fmt,\ +			pr_warn(PFX "%s:" fmt,          \ +				__func__ , ## arg);	\ +	} while (0) + +#define iser_info(fmt, arg...)				\ +	do {						\ +		if (iser_debug_level > 1)		\ +			pr_info(PFX "%s:" fmt,          \  				__func__ , ## arg);	\  	} while (0) @@ -88,12 +99,18 @@  	} while (0)  #define SHIFT_4K	12 -#define SIZE_4K	(1UL << SHIFT_4K) +#define SIZE_4K	(1ULL << SHIFT_4K)  #define MASK_4K	(~(SIZE_4K-1)) -					/* support upto 512KB in one RDMA */ +					/* support up to 512KB in one RDMA */  #define ISCSI_ISER_SG_TABLESIZE         (0x80000 >> SHIFT_4K) -#define ISER_DEF_CMD_PER_LUN		128 +#define ISER_DEF_XMIT_CMDS_DEFAULT		512 +#if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT +	#define ISER_DEF_XMIT_CMDS_MAX		ISCSI_DEF_XMIT_CMDS_MAX +#else +	#define ISER_DEF_XMIT_CMDS_MAX		ISER_DEF_XMIT_CMDS_DEFAULT +#endif +#define ISER_DEF_CMD_PER_LUN		ISER_DEF_XMIT_CMDS_MAX  /* QP settings */  /* Maximal bounds on received asynchronous PDUs */ @@ -102,9 +119,9 @@  #define ISER_MAX_TX_MISC_PDUS		6 /* NOOP_OUT(2), TEXT(1),         *  					   * SCSI_TMFUNC(2), LOGOUT(1) */ -#define ISER_QP_MAX_RECV_DTOS		(ISCSI_DEF_XMIT_CMDS_MAX) +#define ISER_QP_MAX_RECV_DTOS		(ISER_DEF_XMIT_CMDS_MAX) -#define ISER_MIN_POSTED_RX		(ISCSI_DEF_XMIT_CMDS_MAX >> 2) +#define ISER_MIN_POSTED_RX		(ISER_DEF_XMIT_CMDS_MAX >> 2)  /* the max TX (send) WR supported by the iSER QP is defined by                 *   * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect   * @@ -114,15 +131,26 @@  #define ISER_INFLIGHT_DATAOUTS		8 -#define ISER_QP_MAX_REQ_DTOS		(ISCSI_DEF_XMIT_CMDS_MAX *    \ +#define ISER_QP_MAX_REQ_DTOS		(ISER_DEF_XMIT_CMDS_MAX *    \  					(1 + ISER_INFLIGHT_DATAOUTS) + \  					ISER_MAX_TX_MISC_PDUS        + \  					ISER_MAX_RX_MISC_PDUS) +/* Max registration work requests per command */ +#define ISER_MAX_REG_WR_PER_CMD		5 + +/* For Signature we don't support 
DATAOUTs so no need to make room for them */ +#define ISER_QP_SIG_MAX_REQ_DTOS	(ISER_DEF_XMIT_CMDS_MAX	*       \ +					(1 + ISER_MAX_REG_WR_PER_CMD) + \ +					ISER_MAX_TX_MISC_PDUS         + \ +					ISER_MAX_RX_MISC_PDUS) +  #define ISER_VER			0x10  #define ISER_WSV			0x08  #define ISER_RSV			0x04 +#define ISER_FASTREG_LI_WRID		0xffffffffffffffffULL +  struct iser_hdr {  	u8      flags;  	u8      rsvd[3]; @@ -132,6 +160,15 @@ struct iser_hdr {  	__be64  read_va;  } __attribute__((packed)); + +#define ISER_ZBVA_NOT_SUPPORTED		0x80 +#define ISER_SEND_W_INV_NOT_SUPPORTED	0x40 + +struct iser_cm_hdr { +	u8      flags; +	u8      rsvd[3]; +} __packed; +  /* Constant PDU lengths calculations */  #define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) @@ -176,7 +213,7 @@ struct iser_data_buf {  /* fwd declarations */  struct iser_device; -struct iscsi_iser_conn; +struct iser_cq_desc;  struct iscsi_iser_task;  struct iscsi_endpoint; @@ -186,7 +223,7 @@ struct iser_mem_reg {  	u64  va;  	u64  len;  	void *mem_h; -	int  is_fmr; +	int  is_mr;  };  struct iser_regd_buf { @@ -225,20 +262,63 @@ struct iser_rx_desc {  	char		             pad[ISER_RX_PAD_SIZE];  } __attribute__((packed)); +#define ISER_MAX_CQ 4 + +struct iser_conn; +struct iscsi_iser_task; +  struct iser_device {  	struct ib_device             *ib_device;  	struct ib_pd	             *pd; -	struct ib_cq	             *rx_cq; -	struct ib_cq	             *tx_cq; +	struct ib_device_attr	     dev_attr; +	struct ib_cq	             *rx_cq[ISER_MAX_CQ]; +	struct ib_cq	             *tx_cq[ISER_MAX_CQ];  	struct ib_mr	             *mr; -	struct tasklet_struct	     cq_tasklet; +	struct tasklet_struct	     cq_tasklet[ISER_MAX_CQ];  	struct ib_event_handler      event_handler;  	struct list_head             ig_list; /* entry in ig devices list */  	int                          refcount; +	int                          cq_active_qps[ISER_MAX_CQ]; +	int			     cqs_used; +	struct iser_cq_desc	     *cq_desc; +	int                          (*iser_alloc_rdma_reg_res)(struct iser_conn *ib_conn, +								unsigned cmds_max); +	void                         (*iser_free_rdma_reg_res)(struct iser_conn *ib_conn); +	int                          (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task, +							  enum iser_data_dir cmd_dir); +	void                         (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task, +							    enum iser_data_dir cmd_dir); +}; + +#define ISER_CHECK_GUARD	0xc0 +#define ISER_CHECK_REFTAG	0x0f +#define ISER_CHECK_APPTAG	0x30 + +enum iser_reg_indicator { +	ISER_DATA_KEY_VALID	= 1 << 0, +	ISER_PROT_KEY_VALID	= 1 << 1, +	ISER_SIG_KEY_VALID	= 1 << 2, +	ISER_FASTREG_PROTECTED	= 1 << 3, +}; + +struct iser_pi_context { +	struct ib_mr                   *prot_mr; +	struct ib_fast_reg_page_list   *prot_frpl; +	struct ib_mr                   *sig_mr; +}; + +struct fast_reg_descriptor { +	struct list_head		  list; +	/* For fast registration - FRWR */ +	struct ib_mr			 *data_mr; +	struct ib_fast_reg_page_list     *data_frpl; +	struct iser_pi_context		 *pi_ctx; +	/* registration indicators container */ +	u8				  reg_indicators;  };  struct iser_conn { -	struct iscsi_iser_conn       *iser_conn; /* iser conn for upcalls  */ +	struct iscsi_conn	     *iscsi_conn;  	struct iscsi_endpoint	     *ep;  	enum iser_ib_conn_state	     state;	    /* rdma connection state   */  	atomic_t		     refcount; @@ -246,37 +326,51 @@ struct iser_conn {  	struct iser_device           *device;       /* device context          */  	struct 
rdma_cm_id            *cma_id;       /* CMA ID		       */  	struct ib_qp	             *qp;           /* QP 		       */ -	struct ib_fmr_pool           *fmr_pool;     /* pool of IB FMRs         */  	wait_queue_head_t	     wait;          /* waitq for conn/disconn  */ +	unsigned		     qp_max_recv_dtos; /* num of rx buffers */ +	unsigned		     qp_max_recv_dtos_mask; /* above minus 1 */ +	unsigned		     min_posted_rx; /* qp_max_recv_dtos >> 2 */  	int                          post_recv_buf_count; /* posted rx count  */  	atomic_t                     post_send_buf_count; /* posted tx count   */  	char 			     name[ISER_OBJECT_NAME_SIZE]; -	struct iser_page_vec         *page_vec;     /* represents SG to fmr maps* -						     * maps serialized as tx is*/ +	struct work_struct	     release_work; +	struct completion	     stop_completion;  	struct list_head	     conn_list;       /* entry in ig conn list */  	char  			     *login_buf; -	u64 			     login_dma; +	char			     *login_req_buf, *login_resp_buf; +	u64			     login_req_dma, login_resp_dma;  	unsigned int 		     rx_desc_head;  	struct iser_rx_desc	     *rx_descs;  	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX]; -}; - -struct iscsi_iser_conn { -	struct iscsi_conn            *iscsi_conn;/* ptr to iscsi conn */ -	struct iser_conn             *ib_conn;   /* iSER IB conn      */ +	bool			     pi_support; + +	/* Connection memory registration pool */ +	union { +		struct { +			struct ib_fmr_pool      *pool;	   /* pool of IB FMRs         */ +			struct iser_page_vec	*page_vec; /* represents SG to fmr maps* +							    * maps serialized as tx is*/ +		} fmr; +		struct { +			struct list_head	pool; +			int			pool_size; +		} fastreg; +	};  };  struct iscsi_iser_task {  	struct iser_tx_desc          desc; -	struct iscsi_iser_conn	     *iser_conn; +	struct iser_conn	     *ib_conn;  	enum iser_task_status 	     status; +	struct scsi_cmnd	     *sc;  	int                          command_sent;  /* set if command  sent  */  	int                          dir[ISER_DIRS_NUM];      /* set if dir use*/  	struct iser_regd_buf         rdma_regd[ISER_DIRS_NUM];/* regd rdma buf */  	struct iser_data_buf         data[ISER_DIRS_NUM];     /* orig. data des*/  	struct iser_data_buf         data_copy[ISER_DIRS_NUM];/* contig. 
copy  */ -	int                          headers_initialized; +	struct iser_data_buf         prot[ISER_DIRS_NUM];     /* prot desc     */ +	struct iser_data_buf         prot_copy[ISER_DIRS_NUM];/* prot copy     */  };  struct iser_page_vec { @@ -286,6 +380,11 @@ struct iser_page_vec {  	int data_size;  }; +struct iser_cq_desc { +	struct iser_device           *device; +	int                          cq_index; +}; +  struct iser_global {  	struct mutex      device_list_mutex;/*                   */  	struct list_head  device_list;	     /* all iSER devices */ @@ -297,6 +396,8 @@ struct iser_global {  extern struct iser_global ig;  extern int iser_debug_level; +extern bool iser_pi_enable; +extern int iser_pi_guard;  /* allocate connection resources needed for rdma functionality */  int iser_conn_set_full_featured_mode(struct iscsi_conn *conn); @@ -318,12 +419,12 @@ void iscsi_iser_recv(struct iscsi_conn *conn,  void iser_conn_init(struct iser_conn *ib_conn); -void iser_conn_get(struct iser_conn *ib_conn); - -int iser_conn_put(struct iser_conn *ib_conn, int destroy_cma_id_allowed); +void iser_conn_release(struct iser_conn *ib_conn);  void iser_conn_terminate(struct iser_conn *ib_conn); +void iser_release_work(struct work_struct *work); +  void iser_rcv_completion(struct iser_rx_desc *desc,  			 unsigned long    dto_xfer_len,  			struct iser_conn *ib_conn); @@ -336,11 +437,15 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *task);  void iser_free_rx_descriptors(struct iser_conn *ib_conn); -void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *task, -				     enum iser_data_dir         cmd_dir); +void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, +				     struct iser_data_buf *mem, +				     struct iser_data_buf *mem_copy, +				     enum iser_data_dir cmd_dir); -int  iser_reg_rdma_mem(struct iscsi_iser_task *task, -		       enum   iser_data_dir        cmd_dir); +int  iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, +			   enum iser_data_dir cmd_dir); +int  iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task, +			       enum iser_data_dir cmd_dir);  int  iser_connect(struct iser_conn   *ib_conn,  		  struct sockaddr_in *src_addr, @@ -351,7 +456,10 @@ int  iser_reg_page_vec(struct iser_conn     *ib_conn,  		       struct iser_page_vec *page_vec,  		       struct iser_mem_reg  *mem_reg); -void iser_unreg_mem(struct iser_mem_reg *mem_reg); +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, +			enum iser_data_dir cmd_dir); +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, +			    enum iser_data_dir cmd_dir);  int  iser_post_recvl(struct iser_conn *ib_conn);  int  iser_post_recvm(struct iser_conn *ib_conn, int count); @@ -362,7 +470,15 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,  			    enum   iser_data_dir       iser_dir,  			    enum   dma_data_direction  dma_dir); -void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task); +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, +			      struct iser_data_buf *data);  int  iser_initialize_task_headers(struct iscsi_task *task,  			struct iser_tx_desc *tx_desc); +int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session); +int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max); +void iser_free_fmr_pool(struct iser_conn *ib_conn); +int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max); +void iser_free_fastreg_pool(struct iser_conn *ib_conn); +u8 iser_check_task_pi_status(struct 
iscsi_iser_task *iser_task, +			     enum iser_data_dir cmd_dir, sector_t *sector);  #endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 95a08a8ca8a..8d44a406063 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -1,5 +1,6 @@  /*   * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.   *   * This software is available to you under a choice of one of two   * licenses.  You may choose to be licensed under the terms of the GNU @@ -40,14 +41,15 @@  #include "iscsi_iser.h"  /* Register user buffer memory and initialize passive rdma - *  dto descriptor. Total data size is stored in - *  iser_task->data[ISER_DIR_IN].data_len + *  dto descriptor. Data size is stored in + *  task->data[ISER_DIR_IN].data_len, Protection size + *  os stored in task->prot[ISER_DIR_IN].data_len   */ -static int iser_prepare_read_cmd(struct iscsi_task *task, -				 unsigned int edtl) +static int iser_prepare_read_cmd(struct iscsi_task *task)  {  	struct iscsi_iser_task *iser_task = task->dd_data; +	struct iser_device  *device = iser_task->ib_conn->device;  	struct iser_regd_buf *regd_buf;  	int err;  	struct iser_hdr *hdr = &iser_task->desc.iser_header; @@ -60,15 +62,18 @@ static int iser_prepare_read_cmd(struct iscsi_task *task,  	if (err)  		return err; -	if (edtl > iser_task->data[ISER_DIR_IN].data_len) { -		iser_err("Total data length: %ld, less than EDTL: " -			 "%d, in READ cmd BHS itt: %d, conn: 0x%p\n", -			 iser_task->data[ISER_DIR_IN].data_len, edtl, -			 task->itt, iser_task->iser_conn); -		return -EINVAL; +	if (scsi_prot_sg_count(iser_task->sc)) { +		struct iser_data_buf *pbuf_in = &iser_task->prot[ISER_DIR_IN]; + +		err = iser_dma_map_task_data(iser_task, +					     pbuf_in, +					     ISER_DIR_IN, +					     DMA_FROM_DEVICE); +		if (err) +			return err;  	} -	err = iser_reg_rdma_mem(iser_task,ISER_DIR_IN); +	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN);  	if (err) {  		iser_err("Failed to set up Data-IN RDMA\n");  		return err; @@ -87,8 +92,9 @@ static int iser_prepare_read_cmd(struct iscsi_task *task,  }  /* Register user buffer memory and initialize passive rdma - *  dto descriptor. Total data size is stored in - *  task->data[ISER_DIR_OUT].data_len + *  dto descriptor. 
Data size is stored in + *  task->data[ISER_DIR_OUT].data_len, Protection size + *  is stored at task->prot[ISER_DIR_OUT].data_len   */  static int  iser_prepare_write_cmd(struct iscsi_task *task, @@ -97,6 +103,7 @@ iser_prepare_write_cmd(struct iscsi_task *task,  		       unsigned int edtl)  {  	struct iscsi_iser_task *iser_task = task->dd_data; +	struct iser_device  *device = iser_task->ib_conn->device;  	struct iser_regd_buf *regd_buf;  	int err;  	struct iser_hdr *hdr = &iser_task->desc.iser_header; @@ -110,15 +117,18 @@ iser_prepare_write_cmd(struct iscsi_task *task,  	if (err)  		return err; -	if (edtl > iser_task->data[ISER_DIR_OUT].data_len) { -		iser_err("Total data length: %ld, less than EDTL: %d, " -			 "in WRITE cmd BHS itt: %d, conn: 0x%p\n", -			 iser_task->data[ISER_DIR_OUT].data_len, -			 edtl, task->itt, task->conn); -		return -EINVAL; +	if (scsi_prot_sg_count(iser_task->sc)) { +		struct iser_data_buf *pbuf_out = &iser_task->prot[ISER_DIR_OUT]; + +		err = iser_dma_map_task_data(iser_task, +					     pbuf_out, +					     ISER_DIR_OUT, +					     DMA_TO_DEVICE); +		if (err) +			return err;  	} -	err = iser_reg_rdma_mem(iser_task,ISER_DIR_OUT); +	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);  	if (err != 0) {  		iser_err("Failed to register write cmd RDMA mem\n");  		return err; @@ -169,8 +179,78 @@ static void iser_create_send_desc(struct iser_conn	*ib_conn,  	}  } +static void iser_free_login_buf(struct iser_conn *ib_conn) +{ +	if (!ib_conn->login_buf) +		return; + +	if (ib_conn->login_req_dma) +		ib_dma_unmap_single(ib_conn->device->ib_device, +				    ib_conn->login_req_dma, +				    ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + +	if (ib_conn->login_resp_dma) +		ib_dma_unmap_single(ib_conn->device->ib_device, +				    ib_conn->login_resp_dma, +				    ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); -static int iser_alloc_rx_descriptors(struct iser_conn *ib_conn) +	kfree(ib_conn->login_buf); + +	/* make sure we never redo any unmapping */ +	ib_conn->login_req_dma = 0; +	ib_conn->login_resp_dma = 0; +	ib_conn->login_buf = NULL; +} + +static int iser_alloc_login_buf(struct iser_conn *ib_conn) +{ +	struct iser_device	*device; +	int			req_err, resp_err; + +	BUG_ON(ib_conn->device == NULL); + +	device = ib_conn->device; + +	ib_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + +				     ISER_RX_LOGIN_SIZE, GFP_KERNEL); +	if (!ib_conn->login_buf) +		goto out_err; + +	ib_conn->login_req_buf  = ib_conn->login_buf; +	ib_conn->login_resp_buf = ib_conn->login_buf + +						ISCSI_DEF_MAX_RECV_SEG_LEN; + +	ib_conn->login_req_dma = ib_dma_map_single(ib_conn->device->ib_device, +				(void *)ib_conn->login_req_buf, +				ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + +	ib_conn->login_resp_dma = ib_dma_map_single(ib_conn->device->ib_device, +				(void *)ib_conn->login_resp_buf, +				ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + +	req_err  = ib_dma_mapping_error(device->ib_device, +					ib_conn->login_req_dma); +	resp_err = ib_dma_mapping_error(device->ib_device, +					ib_conn->login_resp_dma); + +	if (req_err || resp_err) { +		if (req_err) +			ib_conn->login_req_dma = 0; +		if (resp_err) +			ib_conn->login_resp_dma = 0; +		goto free_login_buf; +	} +	return 0; + +free_login_buf: +	iser_free_login_buf(ib_conn); + +out_err: +	iser_err("unable to alloc or map login buf\n"); +	return -ENOMEM; +} + +int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session)  {  	int i, j;  	u64 dma_addr; @@ -178,14 +258,24 @@ static int iser_alloc_rx_descriptors(struct iser_conn 
*ib_conn)  	struct ib_sge       *rx_sg;  	struct iser_device  *device = ib_conn->device; -	ib_conn->rx_descs = kmalloc(ISER_QP_MAX_RECV_DTOS * +	ib_conn->qp_max_recv_dtos = session->cmds_max; +	ib_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ +	ib_conn->min_posted_rx = ib_conn->qp_max_recv_dtos >> 2; + +	if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max)) +		goto create_rdma_reg_res_failed; + +	if (iser_alloc_login_buf(ib_conn)) +		goto alloc_login_buf_fail; + +	ib_conn->rx_descs = kmalloc(session->cmds_max *  				sizeof(struct iser_rx_desc), GFP_KERNEL);  	if (!ib_conn->rx_descs)  		goto rx_desc_alloc_fail;  	rx_desc = ib_conn->rx_descs; -	for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++)  { +	for (i = 0; i < ib_conn->qp_max_recv_dtos; i++, rx_desc++)  {  		dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc,  					ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);  		if (ib_dma_mapping_error(device->ib_device, dma_addr)) @@ -206,10 +296,14 @@ rx_desc_dma_map_failed:  	rx_desc = ib_conn->rx_descs;  	for (j = 0; j < i; j++, rx_desc++)  		ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, -			ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); +				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);  	kfree(ib_conn->rx_descs);  	ib_conn->rx_descs = NULL;  rx_desc_alloc_fail: +	iser_free_login_buf(ib_conn); +alloc_login_buf_fail: +	device->iser_free_rdma_reg_res(ib_conn); +create_rdma_reg_res_failed:  	iser_err("failed allocating rx descriptors / data buffers\n");  	return -ENOMEM;  } @@ -220,41 +314,51 @@ void iser_free_rx_descriptors(struct iser_conn *ib_conn)  	struct iser_rx_desc *rx_desc;  	struct iser_device *device = ib_conn->device; -	if (ib_conn->login_buf) { -		ib_dma_unmap_single(device->ib_device, ib_conn->login_dma, -			ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); -		kfree(ib_conn->login_buf); -	} -  	if (!ib_conn->rx_descs) -		return; +		goto free_login_buf; + +	if (device->iser_free_rdma_reg_res) +		device->iser_free_rdma_reg_res(ib_conn);  	rx_desc = ib_conn->rx_descs; -	for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++) +	for (i = 0; i < ib_conn->qp_max_recv_dtos; i++, rx_desc++)  		ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, -			ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); +				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);  	kfree(ib_conn->rx_descs); +	/* make sure we never redo any unmapping */ +	ib_conn->rx_descs = NULL; + +free_login_buf: +	iser_free_login_buf(ib_conn);  } -/** - *  iser_conn_set_full_featured_mode - (iSER API) - */ -int iser_conn_set_full_featured_mode(struct iscsi_conn *conn) +static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)  { -	struct iscsi_iser_conn *iser_conn = conn->dd_data; +	struct iser_conn *ib_conn = conn->dd_data; +	struct iscsi_session *session = conn->session; -	iser_dbg("Initially post: %d\n", ISER_MIN_POSTED_RX); +	iser_dbg("req op %x flags %x\n", req->opcode, req->flags); +	/* check if this is the last login - going to full feature phase */ +	if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE) +		return 0; -	/* Check that there is no posted recv or send buffers left - */ -	/* they must be consumed during the login phase */ -	BUG_ON(iser_conn->ib_conn->post_recv_buf_count != 0); -	BUG_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0); +	/* +	 * Check that there is one posted recv buffer (for the last login +	 * response) and no posted send buffers left - they must have been +	 * consumed during previous login phases. 
+	 */ +	WARN_ON(ib_conn->post_recv_buf_count != 1); +	WARN_ON(atomic_read(&ib_conn->post_send_buf_count) != 0); -	if (iser_alloc_rx_descriptors(iser_conn->ib_conn)) -		return -ENOMEM; +	if (session->discovery_sess) { +		iser_info("Discovery session, re-using login RX buffer\n"); +		return 0; +	} else +		iser_info("Normal session, posting batch of RX %d buffers\n", +			  ib_conn->min_posted_rx);  	/* Initial post receive buffers */ -	if (iser_post_recvm(iser_conn->ib_conn, ISER_MIN_POSTED_RX)) +	if (iser_post_recvm(ib_conn, ib_conn->min_posted_rx))  		return -ENOMEM;  	return 0; @@ -266,12 +370,12 @@ int iser_conn_set_full_featured_mode(struct iscsi_conn *conn)  int iser_send_command(struct iscsi_conn *conn,  		      struct iscsi_task *task)  { -	struct iscsi_iser_conn *iser_conn = conn->dd_data; +	struct iser_conn *ib_conn = conn->dd_data;  	struct iscsi_iser_task *iser_task = task->dd_data;  	unsigned long edtl;  	int err; -	struct iser_data_buf *data_buf; -	struct iscsi_cmd *hdr =  (struct iscsi_cmd *)task->hdr; +	struct iser_data_buf *data_buf, *prot_buf; +	struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr;  	struct scsi_cmnd *sc  =  task->sc;  	struct iser_tx_desc *tx_desc = &iser_task->desc; @@ -279,22 +383,31 @@ int iser_send_command(struct iscsi_conn *conn,  	/* build the tx desc regd header and add it to the tx desc dto */  	tx_desc->type = ISCSI_TX_SCSI_COMMAND; -	iser_create_send_desc(iser_conn->ib_conn, tx_desc); +	iser_create_send_desc(ib_conn, tx_desc); -	if (hdr->flags & ISCSI_FLAG_CMD_READ) +	if (hdr->flags & ISCSI_FLAG_CMD_READ) {  		data_buf = &iser_task->data[ISER_DIR_IN]; -	else +		prot_buf = &iser_task->prot[ISER_DIR_IN]; +	} else {  		data_buf = &iser_task->data[ISER_DIR_OUT]; +		prot_buf = &iser_task->prot[ISER_DIR_OUT]; +	}  	if (scsi_sg_count(sc)) { /* using a scatter list */  		data_buf->buf  = scsi_sglist(sc);  		data_buf->size = scsi_sg_count(sc);  	} -  	data_buf->data_len = scsi_bufflen(sc); +	if (scsi_prot_sg_count(sc)) { +		prot_buf->buf  = scsi_prot_sglist(sc); +		prot_buf->size = scsi_prot_sg_count(sc); +		prot_buf->data_len = data_buf->data_len >> +				     ilog2(sc->device->sector_size) * 8; +	} +  	if (hdr->flags & ISCSI_FLAG_CMD_READ) { -		err = iser_prepare_read_cmd(task, edtl); +		err = iser_prepare_read_cmd(task);  		if (err)  			goto send_command_error;  	} @@ -310,7 +423,7 @@ int iser_send_command(struct iscsi_conn *conn,  	iser_task->status = ISER_TASK_STATUS_STARTED; -	err = iser_post_send(iser_conn->ib_conn, tx_desc); +	err = iser_post_send(ib_conn, tx_desc);  	if (!err)  		return 0; @@ -326,7 +439,7 @@ int iser_send_data_out(struct iscsi_conn *conn,  		       struct iscsi_task *task,  		       struct iscsi_data *hdr)  { -	struct iscsi_iser_conn *iser_conn = conn->dd_data; +	struct iser_conn *ib_conn = conn->dd_data;  	struct iscsi_iser_task *iser_task = task->dd_data;  	struct iser_tx_desc *tx_desc = NULL;  	struct iser_regd_buf *regd_buf; @@ -375,7 +488,7 @@ int iser_send_data_out(struct iscsi_conn *conn,  		 itt, buf_offset, data_seg_len); -	err = iser_post_send(iser_conn->ib_conn, tx_desc); +	err = iser_post_send(ib_conn, tx_desc);  	if (!err)  		return 0; @@ -388,7 +501,7 @@ send_data_out_error:  int iser_send_control(struct iscsi_conn *conn,  		      struct iscsi_task *task)  { -	struct iscsi_iser_conn *iser_conn = conn->dd_data; +	struct iser_conn *ib_conn = conn->dd_data;  	struct iscsi_iser_task *iser_task = task->dd_data;  	struct iser_tx_desc *mdesc = &iser_task->desc;  	unsigned long data_seg_len; @@ -397,9 +510,9 @@ 
int iser_send_control(struct iscsi_conn *conn,  	/* build the tx desc regd header and add it to the tx desc dto */  	mdesc->type = ISCSI_TX_CONTROL; -	iser_create_send_desc(iser_conn->ib_conn, mdesc); +	iser_create_send_desc(ib_conn, mdesc); -	device = iser_conn->ib_conn->device; +	device = ib_conn->device;  	data_seg_len = ntoh24(task->hdr->dlength); @@ -409,21 +522,35 @@ int iser_send_control(struct iscsi_conn *conn,  			iser_err("data present on non login task!!!\n");  			goto send_control_error;  		} -		memcpy(iser_conn->ib_conn->login_buf, task->data, -							task->data_count); -		tx_dsg->addr    = iser_conn->ib_conn->login_dma; -		tx_dsg->length  = data_seg_len; + +		ib_dma_sync_single_for_cpu(device->ib_device, +			ib_conn->login_req_dma, task->data_count, +			DMA_TO_DEVICE); + +		memcpy(ib_conn->login_req_buf, task->data, task->data_count); + +		ib_dma_sync_single_for_device(device->ib_device, +			ib_conn->login_req_dma, task->data_count, +			DMA_TO_DEVICE); + +		tx_dsg->addr    = ib_conn->login_req_dma; +		tx_dsg->length  = task->data_count;  		tx_dsg->lkey    = device->mr->lkey;  		mdesc->num_sge = 2;  	}  	if (task == conn->login_task) { -		err = iser_post_recvl(iser_conn->ib_conn); +		iser_dbg("op %x dsl %lx, posting login rx buffer\n", +			 task->hdr->opcode, data_seg_len); +		err = iser_post_recvl(ib_conn); +		if (err) +			goto send_control_error; +		err = iser_post_rx_bufs(conn, task->hdr);  		if (err)  			goto send_control_error;  	} -	err = iser_post_send(iser_conn->ib_conn, mdesc); +	err = iser_post_send(ib_conn, mdesc);  	if (!err)  		return 0; @@ -439,14 +566,13 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,  			 unsigned long rx_xfer_len,  			 struct iser_conn *ib_conn)  { -	struct iscsi_iser_conn *conn = ib_conn->iser_conn;  	struct iscsi_hdr *hdr;  	u64 rx_dma;  	int rx_buflen, outstanding, count, err;  	/* differentiate between login to all other PDUs */ -	if ((char *)rx_desc == ib_conn->login_buf) { -		rx_dma = ib_conn->login_dma; +	if ((char *)rx_desc == ib_conn->login_resp_buf) { +		rx_dma = ib_conn->login_resp_dma;  		rx_buflen = ISER_RX_LOGIN_SIZE;  	} else {  		rx_dma = rx_desc->dma_addr; @@ -461,25 +587,25 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,  	iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,  			hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN)); -	iscsi_iser_recv(conn->iscsi_conn, hdr, -		rx_desc->data, rx_xfer_len - ISER_HEADERS_LEN); +	iscsi_iser_recv(ib_conn->iscsi_conn, hdr, rx_desc->data, +			rx_xfer_len - ISER_HEADERS_LEN);  	ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma, -			rx_buflen, DMA_FROM_DEVICE); +				      rx_buflen, DMA_FROM_DEVICE);  	/* decrementing conn->post_recv_buf_count only --after-- freeing the   *  	 * task eliminates the need to worry on tasks which are completed in   *  	 * parallel to the execution of iser_conn_term. 
So the code that waits *  	 * for the posted rx bufs refcount to become zero handles everything   */ -	conn->ib_conn->post_recv_buf_count--; +	ib_conn->post_recv_buf_count--; -	if (rx_dma == ib_conn->login_dma) +	if (rx_dma == ib_conn->login_resp_dma)  		return;  	outstanding = ib_conn->post_recv_buf_count; -	if (outstanding + ISER_MIN_POSTED_RX <= ISER_QP_MAX_RECV_DTOS) { -		count = min(ISER_QP_MAX_RECV_DTOS - outstanding, -						ISER_MIN_POSTED_RX); +	if (outstanding + ib_conn->min_posted_rx <= ib_conn->qp_max_recv_dtos) { +		count = min(ib_conn->qp_max_recv_dtos - outstanding, +						ib_conn->min_posted_rx);  		err = iser_post_recvm(ib_conn, count);  		if (err)  			iser_err("posting %d rx bufs err %d\n", count, err); @@ -496,11 +622,12 @@ void iser_snd_completion(struct iser_tx_desc *tx_desc,  		ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,  					ISER_HEADERS_LEN, DMA_TO_DEVICE);  		kmem_cache_free(ig.desc_cache, tx_desc); +		tx_desc = NULL;  	}  	atomic_dec(&ib_conn->post_send_buf_count); -	if (tx_desc->type == ISCSI_TX_CONTROL) { +	if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) {  		/* this arithmetic is legal by libiscsi dd_data allocation */  		task = (void *) ((long)(void *)tx_desc -  				  sizeof(struct iscsi_task)); @@ -520,6 +647,9 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task)  	iser_task->data[ISER_DIR_IN].data_len  = 0;  	iser_task->data[ISER_DIR_OUT].data_len = 0; +	iser_task->prot[ISER_DIR_IN].data_len  = 0; +	iser_task->prot[ISER_DIR_OUT].data_len = 0; +  	memset(&iser_task->rdma_regd[ISER_DIR_IN], 0,  	       sizeof(struct iser_regd_buf));  	memset(&iser_task->rdma_regd[ISER_DIR_OUT], 0, @@ -528,34 +658,63 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task)  void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)  { -	int is_rdma_aligned = 1; -	struct iser_regd_buf *regd; +	struct iser_device *device = iser_task->ib_conn->device; +	int is_rdma_data_aligned = 1; +	int is_rdma_prot_aligned = 1; +	int prot_count = scsi_prot_sg_count(iser_task->sc);  	/* if we were reading, copy back to unaligned sglist,  	 * anyway dma_unmap and free the copy  	 */  	if (iser_task->data_copy[ISER_DIR_IN].copy_buf != NULL) { -		is_rdma_aligned = 0; -		iser_finalize_rdma_unaligned_sg(iser_task, ISER_DIR_IN); +		is_rdma_data_aligned = 0; +		iser_finalize_rdma_unaligned_sg(iser_task, +						&iser_task->data[ISER_DIR_IN], +						&iser_task->data_copy[ISER_DIR_IN], +						ISER_DIR_IN);  	} +  	if (iser_task->data_copy[ISER_DIR_OUT].copy_buf != NULL) { -		is_rdma_aligned = 0; -		iser_finalize_rdma_unaligned_sg(iser_task, ISER_DIR_OUT); +		is_rdma_data_aligned = 0; +		iser_finalize_rdma_unaligned_sg(iser_task, +						&iser_task->data[ISER_DIR_OUT], +						&iser_task->data_copy[ISER_DIR_OUT], +						ISER_DIR_OUT); +	} + +	if (iser_task->prot_copy[ISER_DIR_IN].copy_buf != NULL) { +		is_rdma_prot_aligned = 0; +		iser_finalize_rdma_unaligned_sg(iser_task, +						&iser_task->prot[ISER_DIR_IN], +						&iser_task->prot_copy[ISER_DIR_IN], +						ISER_DIR_IN); +	} + +	if (iser_task->prot_copy[ISER_DIR_OUT].copy_buf != NULL) { +		is_rdma_prot_aligned = 0; +		iser_finalize_rdma_unaligned_sg(iser_task, +						&iser_task->prot[ISER_DIR_OUT], +						&iser_task->prot_copy[ISER_DIR_OUT], +						ISER_DIR_OUT);  	}  	if (iser_task->dir[ISER_DIR_IN]) { -		regd = &iser_task->rdma_regd[ISER_DIR_IN]; -		if (regd->reg.is_fmr) -			iser_unreg_mem(&regd->reg); +		device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN); +		if (is_rdma_data_aligned) +			
iser_dma_unmap_task_data(iser_task, +						 &iser_task->data[ISER_DIR_IN]); +		if (prot_count && is_rdma_prot_aligned) +			iser_dma_unmap_task_data(iser_task, +						 &iser_task->prot[ISER_DIR_IN]);  	}  	if (iser_task->dir[ISER_DIR_OUT]) { -		regd = &iser_task->rdma_regd[ISER_DIR_OUT]; -		if (regd->reg.is_fmr) -			iser_unreg_mem(&regd->reg); +		device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT); +		if (is_rdma_data_aligned) +			iser_dma_unmap_task_data(iser_task, +						 &iser_task->data[ISER_DIR_OUT]); +		if (prot_count && is_rdma_prot_aligned) +			iser_dma_unmap_task_data(iser_task, +						 &iser_task->prot[ISER_DIR_OUT]);  	} - -       /* if the data was unaligned, it was already unmapped and then copied */ -       if (is_rdma_aligned) -		iser_dma_unmap_task_data(iser_task);  } diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index fb88d6896b6..47acd3ad3a1 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -1,5 +1,6 @@  /*   * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.   *   * This software is available to you under a choice of one of two   * licenses.  You may choose to be licensed under the terms of the GNU @@ -44,13 +45,19 @@   * iser_start_rdma_unaligned_sg   */  static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, +					struct iser_data_buf *data, +					struct iser_data_buf *data_copy,  					enum iser_data_dir cmd_dir)  { -	int dma_nents; -	struct ib_device *dev; +	struct ib_device *dev = iser_task->ib_conn->device->ib_device; +	struct scatterlist *sgl = (struct scatterlist *)data->buf; +	struct scatterlist *sg;  	char *mem = NULL; -	struct iser_data_buf *data = &iser_task->data[cmd_dir]; -	unsigned long  cmd_data_len = data->data_len; +	unsigned long  cmd_data_len = 0; +	int dma_nents, i; + +	for_each_sg(sgl, sg, data->size, i) +		cmd_data_len += ib_sg_dma_len(dev, sg);  	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)  		mem = (void *)__get_free_pages(GFP_ATOMIC, @@ -60,61 +67,58 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,  	if (mem == NULL) {  		iser_err("Failed to allocate mem size %d %d for copying sglist\n", -			 data->size,(int)cmd_data_len); +			 data->size, (int)cmd_data_len);  		return -ENOMEM;  	}  	if (cmd_dir == ISER_DIR_OUT) {  		/* copy the unaligned sg the buffer which is used for RDMA */ -		struct scatterlist *sgl = (struct scatterlist *)data->buf; -		struct scatterlist *sg;  		int i;  		char *p, *from; +		sgl = (struct scatterlist *)data->buf;  		p = mem;  		for_each_sg(sgl, sg, data->size, i) { -			from = kmap_atomic(sg_page(sg), KM_USER0); +			from = kmap_atomic(sg_page(sg));  			memcpy(p,  			       from + sg->offset,  			       sg->length); -			kunmap_atomic(from, KM_USER0); +			kunmap_atomic(from);  			p += sg->length;  		}  	} -	sg_init_one(&iser_task->data_copy[cmd_dir].sg_single, mem, cmd_data_len); -	iser_task->data_copy[cmd_dir].buf  = -		&iser_task->data_copy[cmd_dir].sg_single; -	iser_task->data_copy[cmd_dir].size = 1; +	sg_init_one(&data_copy->sg_single, mem, cmd_data_len); +	data_copy->buf = &data_copy->sg_single; +	data_copy->size = 1; +	data_copy->copy_buf = mem; -	iser_task->data_copy[cmd_dir].copy_buf  = mem; - -	dev = iser_task->iser_conn->ib_conn->device->ib_device; -	dma_nents = ib_dma_map_sg(dev, -				  &iser_task->data_copy[cmd_dir].sg_single, -				  1, +	dma_nents = ib_dma_map_sg(dev, 
&data_copy->sg_single, 1,  				  (cmd_dir == ISER_DIR_OUT) ?  				  DMA_TO_DEVICE : DMA_FROM_DEVICE);  	BUG_ON(dma_nents == 0); -	iser_task->data_copy[cmd_dir].dma_nents = dma_nents; +	data_copy->dma_nents = dma_nents; +	data_copy->data_len = cmd_data_len; +  	return 0;  }  /**   * iser_finalize_rdma_unaligned_sg   */ +  void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, -				     enum iser_data_dir         cmd_dir) +				     struct iser_data_buf *data, +				     struct iser_data_buf *data_copy, +				     enum iser_data_dir cmd_dir)  {  	struct ib_device *dev; -	struct iser_data_buf *mem_copy;  	unsigned long  cmd_data_len; -	dev = iser_task->iser_conn->ib_conn->device->ib_device; -	mem_copy = &iser_task->data_copy[cmd_dir]; +	dev = iser_task->ib_conn->device->ib_device; -	ib_dma_unmap_sg(dev, &mem_copy->sg_single, 1, +	ib_dma_unmap_sg(dev, &data_copy->sg_single, 1,  			(cmd_dir == ISER_DIR_OUT) ?  			DMA_TO_DEVICE : DMA_FROM_DEVICE); @@ -126,31 +130,31 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,  		int i;  		/* copy back read RDMA to unaligned sg */ -		mem	= mem_copy->copy_buf; +		mem = data_copy->copy_buf; -		sgl	= (struct scatterlist *)iser_task->data[ISER_DIR_IN].buf; -		sg_size = iser_task->data[ISER_DIR_IN].size; +		sgl = (struct scatterlist *)data->buf; +		sg_size = data->size;  		p = mem;  		for_each_sg(sgl, sg, sg_size, i) { -			to = kmap_atomic(sg_page(sg), KM_SOFTIRQ0); +			to = kmap_atomic(sg_page(sg));  			memcpy(to + sg->offset,  			       p,  			       sg->length); -			kunmap_atomic(to, KM_SOFTIRQ0); +			kunmap_atomic(to);  			p += sg->length;  		}  	} -	cmd_data_len = iser_task->data[cmd_dir].data_len; +	cmd_data_len = data->data_len;  	if (cmd_data_len > ISER_KMALLOC_THRESHOLD) -		free_pages((unsigned long)mem_copy->copy_buf, +		free_pages((unsigned long)data_copy->copy_buf,  			   ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);  	else -		kfree(mem_copy->copy_buf); +		kfree(data_copy->copy_buf); -	mem_copy->copy_buf = NULL; +	data_copy->copy_buf = NULL;  }  #define IS_4K_ALIGNED(addr)	((((unsigned long)addr) & ~MASK_4K) == 0) @@ -169,8 +173,8 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,   */  static int iser_sg_to_page_vec(struct iser_data_buf *data, -			       struct iser_page_vec *page_vec, -			       struct ib_device *ibdev) +			       struct ib_device *ibdev, u64 *pages, +			       int *offset, int *data_size)  {  	struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf;  	u64 start_addr, end_addr, page, chunk_start = 0; @@ -179,7 +183,7 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data,  	int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;  	/* compute the offset of first element */ -	page_vec->offset = (u64) sgl[0].offset & ~MASK_4K; +	*offset = (u64) sgl[0].offset & ~MASK_4K;  	new_chunk = 1;  	cur_page  = 0; @@ -203,13 +207,14 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data,  		   which might be unaligned */  		page = chunk_start & MASK_4K;  		do { -			page_vec->pages[cur_page++] = page; +			pages[cur_page++] = page;  			page += SIZE_4K;  		} while (page < end_addr);  	} -	page_vec->data_size = total_sz; -	iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); +	*data_size = total_sz; +	iser_dbg("page_vec->data_size:%d cur_page %d\n", +		 *data_size, cur_page);  	return cur_page;  } @@ -266,11 +271,8 @@ static void iser_data_buf_dump(struct iser_data_buf *data,  	struct scatterlist *sg;  	int i; -	if 
(iser_debug_level == 0) -		return; -  	for_each_sg(sgl, sg, data->dma_nents, i) -		iser_warn("sg[%d] dma_addr:0x%lX page:0x%p " +		iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p "  			 "off:0x%x sz:0x%x dma_len:0x%x\n",  			 i, (unsigned long)ib_sg_dma_address(ibdev, sg),  			 sg_page(sg), sg->offset, @@ -297,8 +299,10 @@ static void iser_page_vec_build(struct iser_data_buf *data,  	page_vec->offset = 0;  	iser_dbg("Translating sg sz: %d\n", data->dma_nents); -	page_vec_len = iser_sg_to_page_vec(data, page_vec, ibdev); -	iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents,page_vec_len); +	page_vec_len = iser_sg_to_page_vec(data, ibdev, page_vec->pages, +					   &page_vec->offset, +					   &page_vec->data_size); +	iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len);  	page_vec->length = page_vec_len; @@ -318,7 +322,7 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,  	struct ib_device *dev;  	iser_task->dir[iser_dir] = 1; -	dev = iser_task->iser_conn->ib_conn->device->ib_device; +	dev = iser_task->ib_conn->device->ib_device;  	data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir);  	if (data->dma_nents == 0) { @@ -328,35 +332,52 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,  	return 0;  } -void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task) +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, +			      struct iser_data_buf *data)  {  	struct ib_device *dev; -	struct iser_data_buf *data; -	dev = iser_task->iser_conn->ib_conn->device->ib_device; +	dev = iser_task->ib_conn->device->ib_device; +	ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE); +} -	if (iser_task->dir[ISER_DIR_IN]) { -		data = &iser_task->data[ISER_DIR_IN]; -		ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE); -	} +static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, +			      struct ib_device *ibdev, +			      struct iser_data_buf *mem, +			      struct iser_data_buf *mem_copy, +			      enum iser_data_dir cmd_dir, +			      int aligned_len) +{ +	struct iscsi_conn    *iscsi_conn = iser_task->ib_conn->iscsi_conn; -	if (iser_task->dir[ISER_DIR_OUT]) { -		data = &iser_task->data[ISER_DIR_OUT]; -		ib_dma_unmap_sg(dev, data->buf, data->size, DMA_TO_DEVICE); -	} +	iscsi_conn->fmr_unalign_cnt++; +	iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n", +		  aligned_len, mem->size); + +	if (iser_debug_level > 0) +		iser_data_buf_dump(mem, ibdev); + +	/* unmap the command data before accessing it */ +	iser_dma_unmap_task_data(iser_task, mem); + +	/* allocate copy buf, if we are writing, copy the */ +	/* unaligned scatterlist, dma map the copy        */ +	if (iser_start_rdma_unaligned_sg(iser_task, mem, mem_copy, cmd_dir) != 0) +		return -ENOMEM; + +	return 0;  }  /** - * iser_reg_rdma_mem - Registers memory intended for RDMA, - * obtaining rkey and va + * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA, + * using FMR (if possible) obtaining rkey and va   *   * returns 0 on success, errno code on failure   */ -int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task, -		      enum   iser_data_dir        cmd_dir) +int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, +			  enum iser_data_dir cmd_dir)  { -	struct iscsi_conn    *iscsi_conn = iser_task->iser_conn->iscsi_conn; -	struct iser_conn     *ib_conn = iser_task->iser_conn->ib_conn; +	struct iser_conn     *ib_conn = iser_task->ib_conn;  	struct iser_device   *device = ib_conn->device;  	struct ib_device     *ibdev = 
device->ib_device;  	struct iser_data_buf *mem = &iser_task->data[cmd_dir]; @@ -370,18 +391,13 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,  	aligned_len = iser_data_buf_aligned_len(mem, ibdev);  	if (aligned_len != mem->dma_nents) { -		iscsi_conn->fmr_unalign_cnt++; -		iser_warn("rdma alignment violation %d/%d aligned\n", -			 aligned_len, mem->size); -		iser_data_buf_dump(mem, ibdev); - -		/* unmap the command data before accessing it */ -		iser_dma_unmap_task_data(iser_task); - -		/* allocate copy buf, if we are writing, copy the */ -		/* unaligned scatterlist, dma map the copy        */ -		if (iser_start_rdma_unaligned_sg(iser_task, cmd_dir) != 0) -				return -ENOMEM; +		err = fall_to_bounce_buf(iser_task, ibdev, mem, +					 &iser_task->data_copy[cmd_dir], +					 cmd_dir, aligned_len); +		if (err) { +			iser_err("failed to allocate bounce buffer\n"); +			return err; +		}  		mem = &iser_task->data_copy[cmd_dir];  	} @@ -393,7 +409,7 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,  		regd_buf->reg.rkey = device->mr->rkey;  		regd_buf->reg.len  = ib_sg_dma_len(ibdev, &sg[0]);  		regd_buf->reg.va   = ib_sg_dma_address(ibdev, &sg[0]); -		regd_buf->reg.is_fmr = 0; +		regd_buf->reg.is_mr = 0;  		iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X  "  			 "va: 0x%08lX sz: %ld]\n", @@ -402,21 +418,383 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,  			 (unsigned long)regd_buf->reg.va,  			 (unsigned long)regd_buf->reg.len);  	} else { /* use FMR for multiple dma entries */ -		iser_page_vec_build(mem, ib_conn->page_vec, ibdev); -		err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, &regd_buf->reg); -		if (err) { +		iser_page_vec_build(mem, ib_conn->fmr.page_vec, ibdev); +		err = iser_reg_page_vec(ib_conn, ib_conn->fmr.page_vec, +					&regd_buf->reg); +		if (err && err != -EAGAIN) {  			iser_data_buf_dump(mem, ibdev);  			iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",  				 mem->dma_nents,  				 ntoh24(iser_task->desc.iscsi_header.dlength));  			iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n", -				 ib_conn->page_vec->data_size, ib_conn->page_vec->length, -				 ib_conn->page_vec->offset); -			for (i=0 ; i<ib_conn->page_vec->length ; i++) +				 ib_conn->fmr.page_vec->data_size, +				 ib_conn->fmr.page_vec->length, +				 ib_conn->fmr.page_vec->offset); +			for (i = 0; i < ib_conn->fmr.page_vec->length; i++)  				iser_err("page_vec[%d] = 0x%llx\n", i, -					 (unsigned long long) ib_conn->page_vec->pages[i]); +					 (unsigned long long) ib_conn->fmr.page_vec->pages[i]); +		} +		if (err)  			return err; +	} +	return 0; +} + +static inline enum ib_t10_dif_type +scsi2ib_prot_type(unsigned char prot_type) +{ +	switch (prot_type) { +	case SCSI_PROT_DIF_TYPE0: +		return IB_T10DIF_NONE; +	case SCSI_PROT_DIF_TYPE1: +		return IB_T10DIF_TYPE1; +	case SCSI_PROT_DIF_TYPE2: +		return IB_T10DIF_TYPE2; +	case SCSI_PROT_DIF_TYPE3: +		return IB_T10DIF_TYPE3; +	default: +		return IB_T10DIF_NONE; +	} +} + + +static int +iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) +{ +	unsigned char scsi_ptype = scsi_get_prot_type(sc); + +	sig_attrs->mem.sig_type = IB_SIG_TYPE_T10_DIF; +	sig_attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF; +	sig_attrs->mem.sig.dif.pi_interval = sc->device->sector_size; +	sig_attrs->wire.sig.dif.pi_interval = sc->device->sector_size; + +	switch (scsi_get_prot_op(sc)) { +	case SCSI_PROT_WRITE_INSERT: +	case SCSI_PROT_READ_STRIP: +		sig_attrs->mem.sig.dif.type = IB_T10DIF_NONE; +		sig_attrs->wire.sig.dif.type = 
scsi2ib_prot_type(scsi_ptype); +		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; +		sig_attrs->wire.sig.dif.ref_tag = scsi_get_lba(sc) & +						  0xffffffff; +		break; +	case SCSI_PROT_READ_INSERT: +	case SCSI_PROT_WRITE_STRIP: +		sig_attrs->mem.sig.dif.type = scsi2ib_prot_type(scsi_ptype); +		sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; +		sig_attrs->mem.sig.dif.ref_tag = scsi_get_lba(sc) & +						 0xffffffff; +		sig_attrs->wire.sig.dif.type = IB_T10DIF_NONE; +		break; +	case SCSI_PROT_READ_PASS: +	case SCSI_PROT_WRITE_PASS: +		sig_attrs->mem.sig.dif.type = scsi2ib_prot_type(scsi_ptype); +		sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; +		sig_attrs->mem.sig.dif.ref_tag = scsi_get_lba(sc) & +						 0xffffffff; +		sig_attrs->wire.sig.dif.type = scsi2ib_prot_type(scsi_ptype); +		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; +		sig_attrs->wire.sig.dif.ref_tag = scsi_get_lba(sc) & +						  0xffffffff; +		break; +	default: +		iser_err("Unsupported PI operation %d\n", +			 scsi_get_prot_op(sc)); +		return -EINVAL; +	} +	return 0; +} + + +static int +iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) +{ +	switch (scsi_get_prot_type(sc)) { +	case SCSI_PROT_DIF_TYPE0: +		*mask = 0x0; +		break; +	case SCSI_PROT_DIF_TYPE1: +	case SCSI_PROT_DIF_TYPE2: +		*mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG; +		break; +	case SCSI_PROT_DIF_TYPE3: +		*mask = ISER_CHECK_GUARD; +		break; +	default: +		iser_err("Unsupported protection type %d\n", +			 scsi_get_prot_type(sc)); +		return -EINVAL; +	} + +	return 0; +} + +static int +iser_reg_sig_mr(struct iscsi_iser_task *iser_task, +		struct fast_reg_descriptor *desc, struct ib_sge *data_sge, +		struct ib_sge *prot_sge, struct ib_sge *sig_sge) +{ +	struct iser_conn *ib_conn = iser_task->ib_conn; +	struct iser_pi_context *pi_ctx = desc->pi_ctx; +	struct ib_send_wr sig_wr, inv_wr; +	struct ib_send_wr *bad_wr, *wr = NULL; +	struct ib_sig_attrs sig_attrs; +	int ret; +	u32 key; + +	memset(&sig_attrs, 0, sizeof(sig_attrs)); +	ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); +	if (ret) +		goto err; + +	ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); +	if (ret) +		goto err; + +	if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { +		memset(&inv_wr, 0, sizeof(inv_wr)); +		inv_wr.opcode = IB_WR_LOCAL_INV; +		inv_wr.wr_id = ISER_FASTREG_LI_WRID; +		inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; +		wr = &inv_wr; +		/* Bump the key */ +		key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); +		ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); +	} + +	memset(&sig_wr, 0, sizeof(sig_wr)); +	sig_wr.opcode = IB_WR_REG_SIG_MR; +	sig_wr.wr_id = ISER_FASTREG_LI_WRID; +	sig_wr.sg_list = data_sge; +	sig_wr.num_sge = 1; +	sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; +	sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; +	if (scsi_prot_sg_count(iser_task->sc)) +		sig_wr.wr.sig_handover.prot = prot_sge; +	sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE | +					      IB_ACCESS_REMOTE_READ | +					      IB_ACCESS_REMOTE_WRITE; + +	if (!wr) +		wr = &sig_wr; +	else +		wr->next = &sig_wr; + +	ret = ib_post_send(ib_conn->qp, wr, &bad_wr); +	if (ret) { +		iser_err("reg_sig_mr failed, ret:%d\n", ret); +		goto err; +	} +	desc->reg_indicators &= ~ISER_SIG_KEY_VALID; + +	sig_sge->lkey = pi_ctx->sig_mr->lkey; +	sig_sge->addr = 0; +	sig_sge->length = data_sge->length + prot_sge->length; +	if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT || +	    scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) { +		sig_sge->length += (data_sge->length / +				   
iser_task->sc->device->sector_size) * 8; +	} + +	iser_dbg("sig_sge: addr: 0x%llx  length: %u lkey: 0x%x\n", +		 sig_sge->addr, sig_sge->length, +		 sig_sge->lkey); +err: +	return ret; +} + +static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, +			    struct iser_regd_buf *regd_buf, +			    struct iser_data_buf *mem, +			    enum iser_reg_indicator ind, +			    struct ib_sge *sge) +{ +	struct fast_reg_descriptor *desc = regd_buf->reg.mem_h; +	struct iser_conn *ib_conn = iser_task->ib_conn; +	struct iser_device *device = ib_conn->device; +	struct ib_device *ibdev = device->ib_device; +	struct ib_mr *mr; +	struct ib_fast_reg_page_list *frpl; +	struct ib_send_wr fastreg_wr, inv_wr; +	struct ib_send_wr *bad_wr, *wr = NULL; +	u8 key; +	int ret, offset, size, plen; + +	/* if there a single dma entry, dma mr suffices */ +	if (mem->dma_nents == 1) { +		struct scatterlist *sg = (struct scatterlist *)mem->buf; + +		sge->lkey = device->mr->lkey; +		sge->addr   = ib_sg_dma_address(ibdev, &sg[0]); +		sge->length  = ib_sg_dma_len(ibdev, &sg[0]); + +		iser_dbg("Single DMA entry: lkey=0x%x, addr=0x%llx, length=0x%x\n", +			 sge->lkey, sge->addr, sge->length); +		return 0; +	} + +	if (ind == ISER_DATA_KEY_VALID) { +		mr = desc->data_mr; +		frpl = desc->data_frpl; +	} else { +		mr = desc->pi_ctx->prot_mr; +		frpl = desc->pi_ctx->prot_frpl; +	} + +	plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list, +				   &offset, &size); +	if (plen * SIZE_4K < size) { +		iser_err("fast reg page_list too short to hold this SG\n"); +		return -EINVAL; +	} + +	if (!(desc->reg_indicators & ind)) { +		memset(&inv_wr, 0, sizeof(inv_wr)); +		inv_wr.wr_id = ISER_FASTREG_LI_WRID; +		inv_wr.opcode = IB_WR_LOCAL_INV; +		inv_wr.ex.invalidate_rkey = mr->rkey; +		wr = &inv_wr; +		/* Bump the key */ +		key = (u8)(mr->rkey & 0x000000FF); +		ib_update_fast_reg_key(mr, ++key); +	} + +	/* Prepare FASTREG WR */ +	memset(&fastreg_wr, 0, sizeof(fastreg_wr)); +	fastreg_wr.wr_id = ISER_FASTREG_LI_WRID; +	fastreg_wr.opcode = IB_WR_FAST_REG_MR; +	fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset; +	fastreg_wr.wr.fast_reg.page_list = frpl; +	fastreg_wr.wr.fast_reg.page_list_len = plen; +	fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K; +	fastreg_wr.wr.fast_reg.length = size; +	fastreg_wr.wr.fast_reg.rkey = mr->rkey; +	fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE  | +					       IB_ACCESS_REMOTE_WRITE | +					       IB_ACCESS_REMOTE_READ); + +	if (!wr) +		wr = &fastreg_wr; +	else +		wr->next = &fastreg_wr; + +	ret = ib_post_send(ib_conn->qp, wr, &bad_wr); +	if (ret) { +		iser_err("fast registration failed, ret:%d\n", ret); +		return ret; +	} +	desc->reg_indicators &= ~ind; + +	sge->lkey = mr->lkey; +	sge->addr = frpl->page_list[0] + offset; +	sge->length = size; + +	return ret; +} + +/** + * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA, + * using Fast Registration WR (if possible) obtaining rkey and va + * + * returns 0 on success, errno code on failure + */ +int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task, +			      enum iser_data_dir cmd_dir) +{ +	struct iser_conn *ib_conn = iser_task->ib_conn; +	struct iser_device *device = ib_conn->device; +	struct ib_device *ibdev = device->ib_device; +	struct iser_data_buf *mem = &iser_task->data[cmd_dir]; +	struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir]; +	struct fast_reg_descriptor *desc = NULL; +	struct ib_sge data_sge; +	int err, aligned_len; +	unsigned long flags; + +	aligned_len = 
iser_data_buf_aligned_len(mem, ibdev); +	if (aligned_len != mem->dma_nents) { +		err = fall_to_bounce_buf(iser_task, ibdev, mem, +					 &iser_task->data_copy[cmd_dir], +					 cmd_dir, aligned_len); +		if (err) { +			iser_err("failed to allocate bounce buffer\n"); +			return err; +		} +		mem = &iser_task->data_copy[cmd_dir]; +	} + +	if (mem->dma_nents != 1 || +	    scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { +		spin_lock_irqsave(&ib_conn->lock, flags); +		desc = list_first_entry(&ib_conn->fastreg.pool, +					struct fast_reg_descriptor, list); +		list_del(&desc->list); +		spin_unlock_irqrestore(&ib_conn->lock, flags); +		regd_buf->reg.mem_h = desc; +	} + +	err = iser_fast_reg_mr(iser_task, regd_buf, mem, +			       ISER_DATA_KEY_VALID, &data_sge); +	if (err) +		goto err_reg; + +	if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { +		struct ib_sge prot_sge, sig_sge; + +		memset(&prot_sge, 0, sizeof(prot_sge)); +		if (scsi_prot_sg_count(iser_task->sc)) { +			mem = &iser_task->prot[cmd_dir]; +			aligned_len = iser_data_buf_aligned_len(mem, ibdev); +			if (aligned_len != mem->dma_nents) { +				err = fall_to_bounce_buf(iser_task, ibdev, mem, +							 &iser_task->prot_copy[cmd_dir], +							 cmd_dir, aligned_len); +				if (err) { +					iser_err("failed to allocate bounce buffer\n"); +					return err; +				} +				mem = &iser_task->prot_copy[cmd_dir]; +			} + +			err = iser_fast_reg_mr(iser_task, regd_buf, mem, +					       ISER_PROT_KEY_VALID, &prot_sge); +			if (err) +				goto err_reg;  		} + +		err = iser_reg_sig_mr(iser_task, desc, &data_sge, +				      &prot_sge, &sig_sge); +		if (err) { +			iser_err("Failed to register signature mr\n"); +			return err; +		} +		desc->reg_indicators |= ISER_FASTREG_PROTECTED; + +		regd_buf->reg.lkey = sig_sge.lkey; +		regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; +		regd_buf->reg.va = sig_sge.addr; +		regd_buf->reg.len = sig_sge.length; +		regd_buf->reg.is_mr = 1; +	} else { +		if (desc) { +			regd_buf->reg.rkey = desc->data_mr->rkey; +			regd_buf->reg.is_mr = 1; +		} else { +			regd_buf->reg.rkey = device->mr->rkey; +			regd_buf->reg.is_mr = 0; +		} + +		regd_buf->reg.lkey = data_sge.lkey; +		regd_buf->reg.va = data_sge.addr; +		regd_buf->reg.len = data_sge.length;  	} +  	return 0; +err_reg: +	if (desc) { +		spin_lock_irqsave(&ib_conn->lock, flags); +		list_add_tail(&desc->list, &ib_conn->fastreg.pool); +		spin_unlock_irqrestore(&ib_conn->lock, flags); +	} + +	return err;  } diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 9876865732f..ea01075f9f9 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -1,6 +1,7 @@  /*   * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.   * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.   *   * This software is available to you under a choice of one of two   * licenses.  
You may choose to be licensed under the terms of the GNU @@ -70,32 +71,78 @@ static void iser_event_handler(struct ib_event_handler *handler,   */  static int iser_create_device_ib_res(struct iser_device *device)  { +	struct iser_cq_desc *cq_desc; +	struct ib_device_attr *dev_attr = &device->dev_attr; +	int ret, i, j; + +	ret = ib_query_device(device->ib_device, dev_attr); +	if (ret) { +		pr_warn("Query device failed for %s\n", device->ib_device->name); +		return ret; +	} + +	/* Assign function handles  - based on FMR support */ +	if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr && +	    device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) { +		iser_info("FMR supported, using FMR for registration\n"); +		device->iser_alloc_rdma_reg_res = iser_create_fmr_pool; +		device->iser_free_rdma_reg_res = iser_free_fmr_pool; +		device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr; +		device->iser_unreg_rdma_mem = iser_unreg_mem_fmr; +	} else +	if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { +		iser_info("FastReg supported, using FastReg for registration\n"); +		device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool; +		device->iser_free_rdma_reg_res = iser_free_fastreg_pool; +		device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg; +		device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg; +	} else { +		iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); +		return -1; +	} + +	device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors); +	iser_info("using %d CQs, device %s supports %d vectors\n", +		  device->cqs_used, device->ib_device->name, +		  device->ib_device->num_comp_vectors); + +	device->cq_desc = kmalloc(sizeof(struct iser_cq_desc) * device->cqs_used, +				  GFP_KERNEL); +	if (device->cq_desc == NULL) +		goto cq_desc_err; +	cq_desc = device->cq_desc; +  	device->pd = ib_alloc_pd(device->ib_device);  	if (IS_ERR(device->pd))  		goto pd_err; -	device->rx_cq = ib_create_cq(device->ib_device, -				  iser_cq_callback, -				  iser_cq_event_callback, -				  (void *)device, -				  ISER_MAX_RX_CQ_LEN, 0); -	if (IS_ERR(device->rx_cq)) -		goto rx_cq_err; +	for (i = 0; i < device->cqs_used; i++) { +		cq_desc[i].device   = device; +		cq_desc[i].cq_index = i; -	device->tx_cq = ib_create_cq(device->ib_device, -				  NULL, iser_cq_event_callback, -				  (void *)device, -				  ISER_MAX_TX_CQ_LEN, 0); +		device->rx_cq[i] = ib_create_cq(device->ib_device, +					  iser_cq_callback, +					  iser_cq_event_callback, +					  (void *)&cq_desc[i], +					  ISER_MAX_RX_CQ_LEN, i); +		if (IS_ERR(device->rx_cq[i])) +			goto cq_err; -	if (IS_ERR(device->tx_cq)) -		goto tx_cq_err; +		device->tx_cq[i] = ib_create_cq(device->ib_device, +					  NULL, iser_cq_event_callback, +					  (void *)&cq_desc[i], +					  ISER_MAX_TX_CQ_LEN, i); -	if (ib_req_notify_cq(device->rx_cq, IB_CQ_NEXT_COMP)) -		goto cq_arm_err; +		if (IS_ERR(device->tx_cq[i])) +			goto cq_err; -	tasklet_init(&device->cq_tasklet, -		     iser_cq_tasklet_fn, -		     (unsigned long)device); +		if (ib_req_notify_cq(device->rx_cq[i], IB_CQ_NEXT_COMP)) +			goto cq_err; + +		tasklet_init(&device->cq_tasklet[i], +			     iser_cq_tasklet_fn, +			(unsigned long)&cq_desc[i]); +	}  	device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE |  				   IB_ACCESS_REMOTE_WRITE | @@ -113,14 +160,19 @@ static int iser_create_device_ib_res(struct iser_device *device)  handler_err:  	ib_dereg_mr(device->mr);  dma_mr_err: -	tasklet_kill(&device->cq_tasklet); -cq_arm_err: -	
ib_destroy_cq(device->tx_cq); -tx_cq_err: -	ib_destroy_cq(device->rx_cq); -rx_cq_err: +	for (j = 0; j < device->cqs_used; j++) +		tasklet_kill(&device->cq_tasklet[j]); +cq_err: +	for (j = 0; j < i; j++) { +		if (device->tx_cq[j]) +			ib_destroy_cq(device->tx_cq[j]); +		if (device->rx_cq[j]) +			ib_destroy_cq(device->rx_cq[j]); +	}  	ib_dealloc_pd(device->pd);  pd_err: +	kfree(device->cq_desc); +cq_desc_err:  	iser_err("failed to allocate an IB resource\n");  	return -1;  } @@ -131,52 +183,45 @@ pd_err:   */  static void iser_free_device_ib_res(struct iser_device *device)  { +	int i;  	BUG_ON(device->mr == NULL); -	tasklet_kill(&device->cq_tasklet); +	for (i = 0; i < device->cqs_used; i++) { +		tasklet_kill(&device->cq_tasklet[i]); +		(void)ib_destroy_cq(device->tx_cq[i]); +		(void)ib_destroy_cq(device->rx_cq[i]); +		device->tx_cq[i] = NULL; +		device->rx_cq[i] = NULL; +	} +  	(void)ib_unregister_event_handler(&device->event_handler);  	(void)ib_dereg_mr(device->mr); -	(void)ib_destroy_cq(device->tx_cq); -	(void)ib_destroy_cq(device->rx_cq);  	(void)ib_dealloc_pd(device->pd); +	kfree(device->cq_desc); +  	device->mr = NULL; -	device->tx_cq = NULL; -	device->rx_cq = NULL;  	device->pd = NULL;  }  /** - * iser_create_ib_conn_res - Creates FMR pool and Queue-Pair (QP) + * iser_create_fmr_pool - Creates FMR pool and page_vector   * - * returns 0 on success, -1 on failure + * returns 0 on success, or errno code on failure   */ -static int iser_create_ib_conn_res(struct iser_conn *ib_conn) +int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max)  { -	struct iser_device	*device; -	struct ib_qp_init_attr	init_attr; -	int			ret = -ENOMEM; +	struct iser_device *device = ib_conn->device;  	struct ib_fmr_pool_param params; +	int ret = -ENOMEM; -	BUG_ON(ib_conn->device == NULL); - -	device = ib_conn->device; - -	ib_conn->login_buf = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL); -	if (!ib_conn->login_buf) -		goto out_err; - -	ib_conn->login_dma = ib_dma_map_single(ib_conn->device->ib_device, -				(void *)ib_conn->login_buf, ISER_RX_LOGIN_SIZE, -				DMA_FROM_DEVICE); +	ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) + +					(sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)), +					GFP_KERNEL); +	if (!ib_conn->fmr.page_vec) +		return ret; -	ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) + -				    (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)), -				    GFP_KERNEL); -	if (!ib_conn->page_vec) -		goto out_err; - -	ib_conn->page_vec->pages = (u64 *) (ib_conn->page_vec + 1); +	ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1);  	params.page_shift        = SHIFT_4K;  	/* when the first/last SG element are not start/end * @@ -184,42 +229,260 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)  	params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1;  	/* make the pool size twice the max number of SCSI commands *  	 * the ML is expected to queue, watermark for unmap at 50%  */ -	params.pool_size	 = ISCSI_DEF_XMIT_CMDS_MAX * 2; -	params.dirty_watermark	 = ISCSI_DEF_XMIT_CMDS_MAX; +	params.pool_size	 = cmds_max * 2; +	params.dirty_watermark	 = cmds_max;  	params.cache		 = 0;  	params.flush_function	 = NULL;  	params.access		 = (IB_ACCESS_LOCAL_WRITE  |  				    IB_ACCESS_REMOTE_WRITE |  				    IB_ACCESS_REMOTE_READ); -	ib_conn->fmr_pool = ib_create_fmr_pool(device->pd, &params); -	if (IS_ERR(ib_conn->fmr_pool)) { -		ret = PTR_ERR(ib_conn->fmr_pool); -		ib_conn->fmr_pool = NULL; -		goto out_err; +	ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, &params); +	if 
(!IS_ERR(ib_conn->fmr.pool)) +		return 0; + +	/* no FMR => no need for page_vec */ +	kfree(ib_conn->fmr.page_vec); +	ib_conn->fmr.page_vec = NULL; + +	ret = PTR_ERR(ib_conn->fmr.pool); +	ib_conn->fmr.pool = NULL; +	if (ret != -ENOSYS) { +		iser_err("FMR allocation failed, err %d\n", ret); +		return ret; +	} else { +		iser_warn("FMRs are not supported, using unaligned mode\n"); +		return 0; +	} +} + +/** + * iser_free_fmr_pool - releases the FMR pool and page vec + */ +void iser_free_fmr_pool(struct iser_conn *ib_conn) +{ +	iser_info("freeing conn %p fmr pool %p\n", +		  ib_conn, ib_conn->fmr.pool); + +	if (ib_conn->fmr.pool != NULL) +		ib_destroy_fmr_pool(ib_conn->fmr.pool); + +	ib_conn->fmr.pool = NULL; + +	kfree(ib_conn->fmr.page_vec); +	ib_conn->fmr.page_vec = NULL; +} + +static int +iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, +			 bool pi_enable, struct fast_reg_descriptor *desc) +{ +	int ret; + +	desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, +						      ISCSI_ISER_SG_TABLESIZE + 1); +	if (IS_ERR(desc->data_frpl)) { +		ret = PTR_ERR(desc->data_frpl); +		iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", +			 ret); +		return PTR_ERR(desc->data_frpl); +	} + +	desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1); +	if (IS_ERR(desc->data_mr)) { +		ret = PTR_ERR(desc->data_mr); +		iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); +		goto fast_reg_mr_failure; +	} +	desc->reg_indicators |= ISER_DATA_KEY_VALID; + +	if (pi_enable) { +		struct ib_mr_init_attr mr_init_attr = {0}; +		struct iser_pi_context *pi_ctx = NULL; + +		desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); +		if (!desc->pi_ctx) { +			iser_err("Failed to allocate pi context\n"); +			ret = -ENOMEM; +			goto pi_ctx_alloc_failure; +		} +		pi_ctx = desc->pi_ctx; + +		pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, +						    ISCSI_ISER_SG_TABLESIZE); +		if (IS_ERR(pi_ctx->prot_frpl)) { +			ret = PTR_ERR(pi_ctx->prot_frpl); +			iser_err("Failed to allocate prot frpl ret=%d\n", +				 ret); +			goto prot_frpl_failure; +		} + +		pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, +						ISCSI_ISER_SG_TABLESIZE + 1); +		if (IS_ERR(pi_ctx->prot_mr)) { +			ret = PTR_ERR(pi_ctx->prot_mr); +			iser_err("Failed to allocate prot frmr ret=%d\n", +				 ret); +			goto prot_mr_failure; +		} +		desc->reg_indicators |= ISER_PROT_KEY_VALID; + +		mr_init_attr.max_reg_descriptors = 2; +		mr_init_attr.flags |= IB_MR_SIGNATURE_EN; +		pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); +		if (IS_ERR(pi_ctx->sig_mr)) { +			ret = PTR_ERR(pi_ctx->sig_mr); +			iser_err("Failed to allocate signature enabled mr err=%d\n", +				 ret); +			goto sig_mr_failure; +		} +		desc->reg_indicators |= ISER_SIG_KEY_VALID; +	} +	desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + +	iser_dbg("Create fr_desc %p page_list %p\n", +		 desc, desc->data_frpl->page_list); + +	return 0; +sig_mr_failure: +	ib_dereg_mr(desc->pi_ctx->prot_mr); +prot_mr_failure: +	ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); +prot_frpl_failure: +	kfree(desc->pi_ctx); +pi_ctx_alloc_failure: +	ib_dereg_mr(desc->data_mr); +fast_reg_mr_failure: +	ib_free_fast_reg_page_list(desc->data_frpl); + +	return ret; +} + +/** + * iser_create_fastreg_pool - Creates pool of fast_reg descriptors + * for fast registration work requests. 
+ * returns 0 on success, or errno code on failure + */ +int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max) +{ +	struct iser_device	*device = ib_conn->device; +	struct fast_reg_descriptor	*desc; +	int i, ret; + +	INIT_LIST_HEAD(&ib_conn->fastreg.pool); +	ib_conn->fastreg.pool_size = 0; +	for (i = 0; i < cmds_max; i++) { +		desc = kzalloc(sizeof(*desc), GFP_KERNEL); +		if (!desc) { +			iser_err("Failed to allocate a new fast_reg descriptor\n"); +			ret = -ENOMEM; +			goto err; +		} + +		ret = iser_create_fastreg_desc(device->ib_device, device->pd, +					       ib_conn->pi_support, desc); +		if (ret) { +			iser_err("Failed to create fastreg descriptor err=%d\n", +				 ret); +			kfree(desc); +			goto err; +		} + +		list_add_tail(&desc->list, &ib_conn->fastreg.pool); +		ib_conn->fastreg.pool_size++;  	} +	return 0; + +err: +	iser_free_fastreg_pool(ib_conn); +	return ret; +} + +/** + * iser_free_fastreg_pool - releases the pool of fast_reg descriptors + */ +void iser_free_fastreg_pool(struct iser_conn *ib_conn) +{ +	struct fast_reg_descriptor *desc, *tmp; +	int i = 0; + +	if (list_empty(&ib_conn->fastreg.pool)) +		return; + +	iser_info("freeing conn %p fr pool\n", ib_conn); + +	list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) { +		list_del(&desc->list); +		ib_free_fast_reg_page_list(desc->data_frpl); +		ib_dereg_mr(desc->data_mr); +		if (desc->pi_ctx) { +			ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); +			ib_dereg_mr(desc->pi_ctx->prot_mr); +			ib_destroy_mr(desc->pi_ctx->sig_mr); +			kfree(desc->pi_ctx); +		} +		kfree(desc); +		++i; +	} + +	if (i < ib_conn->fastreg.pool_size) +		iser_warn("pool still has %d regions registered\n", +			  ib_conn->fastreg.pool_size - i); +} + +/** + * iser_create_ib_conn_res - Queue-Pair (QP) + * + * returns 0 on success, -1 on failure + */ +static int iser_create_ib_conn_res(struct iser_conn *ib_conn) +{ +	struct iser_device	*device; +	struct ib_qp_init_attr	init_attr; +	int			ret = -ENOMEM; +	int index, min_index = 0; + +	BUG_ON(ib_conn->device == NULL); + +	device = ib_conn->device; +  	memset(&init_attr, 0, sizeof init_attr); +	mutex_lock(&ig.connlist_mutex); +	/* select the CQ with the minimal number of usages */ +	for (index = 0; index < device->cqs_used; index++) +		if (device->cq_active_qps[index] < +		    device->cq_active_qps[min_index]) +			min_index = index; +	device->cq_active_qps[min_index]++; +	mutex_unlock(&ig.connlist_mutex); +	iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn); +  	init_attr.event_handler = iser_qp_event_callback;  	init_attr.qp_context	= (void *)ib_conn; -	init_attr.send_cq	= device->tx_cq; -	init_attr.recv_cq	= device->rx_cq; -	init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS; +	init_attr.send_cq	= device->tx_cq[min_index]; +	init_attr.recv_cq	= device->rx_cq[min_index];  	init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;  	init_attr.cap.max_send_sge = 2;  	init_attr.cap.max_recv_sge = 1;  	init_attr.sq_sig_type	= IB_SIGNAL_REQ_WR;  	init_attr.qp_type	= IB_QPT_RC; +	if (ib_conn->pi_support) { +		init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS; +		init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; +	} else { +		init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS; +	}  	ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);  	if (ret)  		goto out_err;  	ib_conn->qp = ib_conn->cma_id->qp; -	iser_err("setting conn %p cma_id %p: fmr_pool %p qp %p\n", -		 ib_conn, ib_conn->cma_id, -		 ib_conn->fmr_pool, ib_conn->cma_id->qp); +	iser_info("setting conn %p 
cma_id %p qp %p\n", +		  ib_conn, ib_conn->cma_id, +		  ib_conn->cma_id->qp);  	return ret;  out_err: @@ -228,32 +491,28 @@ out_err:  }  /** - * releases the FMR pool, QP and CMA ID objects, returns 0 on success, + * releases the QP objects, returns 0 on success,   * -1 on failure   */ -static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id) +static int iser_free_ib_conn_res(struct iser_conn *ib_conn)  { +	int cq_index;  	BUG_ON(ib_conn == NULL); -	iser_err("freeing conn %p cma_id %p fmr pool %p qp %p\n", -		 ib_conn, ib_conn->cma_id, -		 ib_conn->fmr_pool, ib_conn->qp); +	iser_info("freeing conn %p cma_id %p qp %p\n", +		  ib_conn, ib_conn->cma_id, +		  ib_conn->qp);  	/* qp is created only once both addr & route are resolved */ -	if (ib_conn->fmr_pool != NULL) -		ib_destroy_fmr_pool(ib_conn->fmr_pool); -	if (ib_conn->qp != NULL) -		rdma_destroy_qp(ib_conn->cma_id); +	if (ib_conn->qp != NULL) { +		cq_index = ((struct iser_cq_desc *)ib_conn->qp->recv_cq->cq_context)->cq_index; +		ib_conn->device->cq_active_qps[cq_index]--; -	/* if cma handler context, the caller acts s.t the cma destroy the id */ -	if (ib_conn->cma_id != NULL && can_destroy_id) -		rdma_destroy_id(ib_conn->cma_id); +		rdma_destroy_qp(ib_conn->cma_id); +	} -	ib_conn->fmr_pool = NULL;  	ib_conn->qp	  = NULL; -	ib_conn->cma_id   = NULL; -	kfree(ib_conn->page_vec);  	return 0;  } @@ -300,7 +559,7 @@ static void iser_device_try_release(struct iser_device *device)  {  	mutex_lock(&ig.device_list_mutex);  	device->refcount--; -	iser_err("device %p refcount %d\n",device,device->refcount); +	iser_info("device %p refcount %d\n", device, device->refcount);  	if (!device->refcount) {  		iser_free_device_ib_res(device);  		list_del(&device->ig_list); @@ -322,39 +581,46 @@ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn,  	return ret;  } +void iser_release_work(struct work_struct *work) +{ +	struct iser_conn *ib_conn; + +	ib_conn = container_of(work, struct iser_conn, release_work); + +	/* wait for .conn_stop callback */ +	wait_for_completion(&ib_conn->stop_completion); + +	/* wait for the qp`s post send and post receive buffers to empty */ +	wait_event_interruptible(ib_conn->wait, +				 ib_conn->state == ISER_CONN_DOWN); + +	iser_conn_release(ib_conn); +} +  /**   * Frees all conn objects and deallocs conn descriptor   */ -static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id) +void iser_conn_release(struct iser_conn *ib_conn)  {  	struct iser_device  *device = ib_conn->device; -	BUG_ON(ib_conn->state != ISER_CONN_DOWN); +	BUG_ON(ib_conn->state == ISER_CONN_UP);  	mutex_lock(&ig.connlist_mutex);  	list_del(&ib_conn->conn_list);  	mutex_unlock(&ig.connlist_mutex);  	iser_free_rx_descriptors(ib_conn); -	iser_free_ib_conn_res(ib_conn, can_destroy_id); +	iser_free_ib_conn_res(ib_conn);  	ib_conn->device = NULL;  	/* on EVENT_ADDR_ERROR there's no device yet for this conn */  	if (device != NULL)  		iser_device_try_release(device); -	iscsi_destroy_endpoint(ib_conn->ep); -} - -void iser_conn_get(struct iser_conn *ib_conn) -{ -	atomic_inc(&ib_conn->refcount); -} - -int iser_conn_put(struct iser_conn *ib_conn, int can_destroy_id) -{ -	if (atomic_dec_and_test(&ib_conn->refcount)) { -		iser_conn_release(ib_conn, can_destroy_id); -		return 1; +	/* if cma handler context, the caller actually destroy the id */ +	if (ib_conn->cma_id != NULL) { +		rdma_destroy_id(ib_conn->cma_id); +		ib_conn->cma_id = NULL;  	} -	return 0; +	iscsi_destroy_endpoint(ib_conn->ep);  }  /** @@ -374,24 +640,19 @@ 
void iser_conn_terminate(struct iser_conn *ib_conn)  	if (err)  		iser_err("Failed to disconnect, conn: 0x%p err %d\n",  			 ib_conn,err); - -	wait_event_interruptible(ib_conn->wait, -				 ib_conn->state == ISER_CONN_DOWN); - -	iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */  } -static int iser_connect_error(struct rdma_cm_id *cma_id) +static void iser_connect_error(struct rdma_cm_id *cma_id)  {  	struct iser_conn *ib_conn; +  	ib_conn = (struct iser_conn *)cma_id->context;  	ib_conn->state = ISER_CONN_DOWN;  	wake_up_interruptible(&ib_conn->wait); -	return iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */  } -static int iser_addr_handler(struct rdma_cm_id *cma_id) +static void iser_addr_handler(struct rdma_cm_id *cma_id)  {  	struct iser_device *device;  	struct iser_conn   *ib_conn; @@ -400,25 +661,39 @@ static int iser_addr_handler(struct rdma_cm_id *cma_id)  	device = iser_device_find_by_ib_device(cma_id);  	if (!device) {  		iser_err("device lookup/creation failed\n"); -		return iser_connect_error(cma_id); +		iser_connect_error(cma_id); +		return;  	}  	ib_conn = (struct iser_conn *)cma_id->context;  	ib_conn->device = device; +	/* connection T10-PI support */ +	if (iser_pi_enable) { +		if (!(device->dev_attr.device_cap_flags & +		      IB_DEVICE_SIGNATURE_HANDOVER)) { +			iser_warn("T10-PI requested but not supported on %s, " +				  "continue without T10-PI\n", +				  ib_conn->device->ib_device->name); +			ib_conn->pi_support = false; +		} else { +			ib_conn->pi_support = true; +		} +	} +  	ret = rdma_resolve_route(cma_id, 1000);  	if (ret) {  		iser_err("resolve route failed: %d\n", ret); -		return iser_connect_error(cma_id); +		iser_connect_error(cma_id); +		return;  	} - -	return 0;  } -static int iser_route_handler(struct rdma_cm_id *cma_id) +static void iser_route_handler(struct rdma_cm_id *cma_id)  {  	struct rdma_conn_param conn_param;  	int    ret; +	struct iser_cm_hdr req_hdr;  	ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context);  	if (ret) @@ -430,39 +705,52 @@ static int iser_route_handler(struct rdma_cm_id *cma_id)  	conn_param.retry_count	       = 7;  	conn_param.rnr_retry_count     = 6; +	memset(&req_hdr, 0, sizeof(req_hdr)); +	req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED | +			ISER_SEND_W_INV_NOT_SUPPORTED); +	conn_param.private_data		= (void *)&req_hdr; +	conn_param.private_data_len	= sizeof(struct iser_cm_hdr); +  	ret = rdma_connect(cma_id, &conn_param);  	if (ret) {  		iser_err("failure connecting: %d\n", ret);  		goto failure;  	} -	return 0; +	return;  failure: -	return iser_connect_error(cma_id); +	iser_connect_error(cma_id);  }  static void iser_connected_handler(struct rdma_cm_id *cma_id)  {  	struct iser_conn *ib_conn; +	struct ib_qp_attr attr; +	struct ib_qp_init_attr init_attr; + +	(void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); +	iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num);  	ib_conn = (struct iser_conn *)cma_id->context; -	ib_conn->state = ISER_CONN_UP; -	wake_up_interruptible(&ib_conn->wait); +	if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_PENDING, ISER_CONN_UP)) +		wake_up_interruptible(&ib_conn->wait);  } -static int iser_disconnected_handler(struct rdma_cm_id *cma_id) +static void iser_disconnected_handler(struct rdma_cm_id *cma_id)  {  	struct iser_conn *ib_conn; -	int ret;  	ib_conn = (struct iser_conn *)cma_id->context;  	/* getting here when the state is UP means that the conn is being *  	 * terminated asynchronously from the iSCSI layer's perspective.  
*/  	if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, -				      ISER_CONN_TERMINATING)) -		iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn, -				   ISCSI_ERR_CONN_FAILED); +					ISER_CONN_TERMINATING)){ +		if (ib_conn->iscsi_conn) +			iscsi_conn_failure(ib_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); +		else +			iser_err("iscsi_iser connection isn't bound\n"); +	}  	/* Complete the termination process if no posts are pending */  	if (ib_conn->post_recv_buf_count == 0 && @@ -470,24 +758,19 @@ static int iser_disconnected_handler(struct rdma_cm_id *cma_id)  		ib_conn->state = ISER_CONN_DOWN;  		wake_up_interruptible(&ib_conn->wait);  	} - -	ret = iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */ -	return ret;  }  static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)  { -	int ret = 0; - -	iser_err("event %d status %d conn %p id %p\n", -		event->event, event->status, cma_id->context, cma_id); +	iser_info("event %d status %d conn %p id %p\n", +		  event->event, event->status, cma_id->context, cma_id);  	switch (event->event) {  	case RDMA_CM_EVENT_ADDR_RESOLVED: -		ret = iser_addr_handler(cma_id); +		iser_addr_handler(cma_id);  		break;  	case RDMA_CM_EVENT_ROUTE_RESOLVED: -		ret = iser_route_handler(cma_id); +		iser_route_handler(cma_id);  		break;  	case RDMA_CM_EVENT_ESTABLISHED:  		iser_connected_handler(cma_id); @@ -497,18 +780,18 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve  	case RDMA_CM_EVENT_CONNECT_ERROR:  	case RDMA_CM_EVENT_UNREACHABLE:  	case RDMA_CM_EVENT_REJECTED: -		ret = iser_connect_error(cma_id); +		iser_connect_error(cma_id);  		break;  	case RDMA_CM_EVENT_DISCONNECTED:  	case RDMA_CM_EVENT_DEVICE_REMOVAL:  	case RDMA_CM_EVENT_ADDR_CHANGE: -		ret = iser_disconnected_handler(cma_id); +		iser_disconnected_handler(cma_id);  		break;  	default:  		iser_err("Unexpected RDMA CM event (%d)\n", event->event);  		break;  	} -	return ret; +	return 0;  }  void iser_conn_init(struct iser_conn *ib_conn) @@ -517,7 +800,7 @@ void iser_conn_init(struct iser_conn *ib_conn)  	init_waitqueue_head(&ib_conn->wait);  	ib_conn->post_recv_buf_count = 0;  	atomic_set(&ib_conn->post_send_buf_count, 0); -	atomic_set(&ib_conn->refcount, 1); /* ref ib conn allocation */ +	init_completion(&ib_conn->stop_completion);  	INIT_LIST_HEAD(&ib_conn->conn_list);  	spin_lock_init(&ib_conn->lock);  } @@ -540,15 +823,14 @@ int iser_connect(struct iser_conn   *ib_conn,  	/* the device is known only --after-- address resolution */  	ib_conn->device = NULL; -	iser_err("connecting to: %pI4, port 0x%x\n", -		 &dst_addr->sin_addr, dst_addr->sin_port); +	iser_info("connecting to: %pI4, port 0x%x\n", +		  &dst_addr->sin_addr, dst_addr->sin_port);  	ib_conn->state = ISER_CONN_PENDING; -	iser_conn_get(ib_conn); /* ref ib conn's cma id */  	ib_conn->cma_id = rdma_create_id(iser_cma_handler,  					     (void *)ib_conn, -					     RDMA_PS_TCP); +					     RDMA_PS_TCP, IB_QPT_RC);  	if (IS_ERR(ib_conn->cma_id)) {  		err = PTR_ERR(ib_conn->cma_id);  		iser_err("rdma_create_id failed: %d\n", err); @@ -583,7 +865,7 @@ id_failure:  addr_failure:  	ib_conn->state = ISER_CONN_DOWN;  connect_failure: -	iser_conn_release(ib_conn, 1); +	iser_conn_release(ib_conn);  	return err;  } @@ -604,7 +886,7 @@ int iser_reg_page_vec(struct iser_conn     *ib_conn,  	page_list = page_vec->pages;  	io_addr	  = page_list[0]; -	mem  = ib_fmr_pool_map_phys(ib_conn->fmr_pool, +	mem  = ib_fmr_pool_map_phys(ib_conn->fmr.pool,  				    page_list,  				    page_vec->length,  				   
 io_addr); @@ -619,7 +901,7 @@ int iser_reg_page_vec(struct iser_conn     *ib_conn,  	mem_reg->rkey  = mem->fmr->rkey;  	mem_reg->len   = page_vec->length * SIZE_4K;  	mem_reg->va    = io_addr; -	mem_reg->is_fmr = 1; +	mem_reg->is_mr = 1;  	mem_reg->mem_h = (void *)mem;  	mem_reg->va   += page_vec->offset; @@ -637,12 +919,18 @@ int iser_reg_page_vec(struct iser_conn     *ib_conn,  }  /** - * Unregister (previosuly registered) memory. + * Unregister (previosuly registered using FMR) memory. + * If memory is non-FMR does nothing.   */ -void iser_unreg_mem(struct iser_mem_reg *reg) +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, +			enum iser_data_dir cmd_dir)  { +	struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;  	int ret; +	if (!reg->is_mr) +		return; +  	iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h);  	ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); @@ -652,17 +940,34 @@ void iser_unreg_mem(struct iser_mem_reg *reg)  	reg->mem_h = NULL;  } +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, +			    enum iser_data_dir cmd_dir) +{ +	struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; +	struct iser_conn *ib_conn = iser_task->ib_conn; +	struct fast_reg_descriptor *desc = reg->mem_h; + +	if (!reg->is_mr) +		return; + +	reg->mem_h = NULL; +	reg->is_mr = 0; +	spin_lock_bh(&ib_conn->lock); +	list_add_tail(&desc->list, &ib_conn->fastreg.pool); +	spin_unlock_bh(&ib_conn->lock); +} +  int iser_post_recvl(struct iser_conn *ib_conn)  {  	struct ib_recv_wr rx_wr, *rx_wr_failed;  	struct ib_sge	  sge;  	int ib_ret; -	sge.addr   = ib_conn->login_dma; +	sge.addr   = ib_conn->login_resp_dma;  	sge.length = ISER_RX_LOGIN_SIZE;  	sge.lkey   = ib_conn->device->mr->lkey; -	rx_wr.wr_id   = (unsigned long)ib_conn->login_buf; +	rx_wr.wr_id   = (unsigned long)ib_conn->login_resp_buf;  	rx_wr.sg_list = &sge;  	rx_wr.num_sge = 1;  	rx_wr.next    = NULL; @@ -689,7 +994,7 @@ int iser_post_recvm(struct iser_conn *ib_conn, int count)  		rx_wr->sg_list	= &rx_desc->rx_sg;  		rx_wr->num_sge	= 1;  		rx_wr->next	= rx_wr + 1; -		my_rx_head = (my_rx_head + 1) & (ISER_QP_MAX_RECV_DTOS - 1); +		my_rx_head = (my_rx_head + 1) & ib_conn->qp_max_recv_dtos_mask;  	}  	rx_wr--; @@ -749,7 +1054,7 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc,  		 * perspective.                                             
*/  		if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP,  		    ISER_CONN_TERMINATING)) -			iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn, +			iscsi_conn_failure(ib_conn->iscsi_conn,  					   ISCSI_ERR_CONN_FAILED);  		/* no more non completed posts to the QP, complete the @@ -759,9 +1064,9 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc,  	}  } -static int iser_drain_tx_cq(struct iser_device  *device) +static int iser_drain_tx_cq(struct iser_device  *device, int cq_index)  { -	struct ib_cq  *cq = device->tx_cq; +	struct ib_cq  *cq = device->tx_cq[cq_index];  	struct ib_wc  wc;  	struct iser_tx_desc *tx_desc;  	struct iser_conn *ib_conn; @@ -778,9 +1083,11 @@ static int iser_drain_tx_cq(struct iser_device  *device)  					IB_WC_SEND, wc.opcode);  		} else {  			iser_err("tx id %llx status %d vend_err %x\n", -				wc.wr_id, wc.status, wc.vendor_err); -			atomic_dec(&ib_conn->post_send_buf_count); -			iser_handle_comp_error(tx_desc, ib_conn); +				 wc.wr_id, wc.status, wc.vendor_err); +			if (wc.wr_id != ISER_FASTREG_LI_WRID) { +				atomic_dec(&ib_conn->post_send_buf_count); +				iser_handle_comp_error(tx_desc, ib_conn); +			}  		}  		completed_tx++;  	} @@ -790,14 +1097,20 @@ static int iser_drain_tx_cq(struct iser_device  *device)  static void iser_cq_tasklet_fn(unsigned long data)  { -	 struct iser_device  *device = (struct iser_device *)data; -	 struct ib_cq	     *cq = device->rx_cq; +	struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)data; +	struct iser_device  *device = cq_desc->device; +	int cq_index = cq_desc->cq_index; +	struct ib_cq	     *cq = device->rx_cq[cq_index];  	 struct ib_wc	     wc;  	 struct iser_rx_desc *desc;  	 unsigned long	     xfer_len;  	struct iser_conn *ib_conn; -	int completed_tx, completed_rx; -	completed_tx = completed_rx = 0; +	int completed_tx, completed_rx = 0; + +	/* First do tx drain, so in a case where we have rx flushes and a successful +	 * tx completion we will still go through completion error handling. 
+	 */ +	completed_tx = iser_drain_tx_cq(device, cq_index);  	while (ib_poll_cq(cq, 1, &wc) == 1) {  		desc	 = (struct iser_rx_desc *) (unsigned long) wc.wr_id; @@ -819,19 +1132,68 @@ static void iser_cq_tasklet_fn(unsigned long data)  		}  		completed_rx++;  		if (!(completed_rx & 63)) -			completed_tx += iser_drain_tx_cq(device); +			completed_tx += iser_drain_tx_cq(device, cq_index);  	}  	/* #warning "it is assumed here that arming CQ only once its empty" *  	 * " would not cause interrupts to be missed"                       */  	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); -	completed_tx += iser_drain_tx_cq(device);  	iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx);  }  static void iser_cq_callback(struct ib_cq *cq, void *cq_context)  { -	struct iser_device  *device = (struct iser_device *)cq_context; +	struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)cq_context; +	struct iser_device  *device = cq_desc->device; +	int cq_index = cq_desc->cq_index; -	tasklet_schedule(&device->cq_tasklet); +	tasklet_schedule(&device->cq_tasklet[cq_index]); +} + +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, +			     enum iser_data_dir cmd_dir, sector_t *sector) +{ +	struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; +	struct fast_reg_descriptor *desc = reg->mem_h; +	unsigned long sector_size = iser_task->sc->device->sector_size; +	struct ib_mr_status mr_status; +	int ret; + +	if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) { +		desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; +		ret = ib_check_mr_status(desc->pi_ctx->sig_mr, +					 IB_MR_CHECK_SIG_STATUS, &mr_status); +		if (ret) { +			pr_err("ib_check_mr_status failed, ret %d\n", ret); +			goto err; +		} + +		if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { +			sector_t sector_off = mr_status.sig_err.sig_err_offset; + +			do_div(sector_off, sector_size + 8); +			*sector = scsi_get_lba(iser_task->sc) + sector_off; + +			pr_err("PI error found type %d at sector %llx " +			       "expected %x vs actual %x\n", +			       mr_status.sig_err.err_type, +			       (unsigned long long)*sector, +			       mr_status.sig_err.expected, +			       mr_status.sig_err.actual); + +			switch (mr_status.sig_err.err_type) { +			case IB_SIG_BAD_GUARD: +				return 0x1; +			case IB_SIG_BAD_REFTAG: +				return 0x3; +			case IB_SIG_BAD_APPTAG: +				return 0x2; +			} +		} +	} + +	return 0; +err: +	/* Not alot we can do here, return ambiguous guard error */ +	return 0x1;  } diff --git a/drivers/infiniband/ulp/isert/Kconfig b/drivers/infiniband/ulp/isert/Kconfig new file mode 100644 index 00000000000..02f9759ebb1 --- /dev/null +++ b/drivers/infiniband/ulp/isert/Kconfig @@ -0,0 +1,5 @@ +config INFINIBAND_ISERT +	tristate "iSCSI Extensions for RDMA (iSER) target support" +	depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET +	---help--- +	Support for iSCSI Extensions for RDMA (iSER) Target on Infiniband fabrics. 
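The ib_iser changes above replace the single per-device TX/RX CQ pair with one pair per completion vector, and each new connection is bound to the least-loaded CQ inside iser_create_ib_conn_res(). The fragment below is a standalone sketch of that selection heuristic, not kernel code: the array size, the counter name and the absence of locking are illustrative assumptions (the driver scans under ig.connlist_mutex and decrements the counter again when the QP is torn down in iser_free_ib_conn_res()). The isert target code that follows applies the same idea in isert_conn_setup_qp().

#include <stdio.h>

#define MAX_CQ 4

static int cq_active_qps[MAX_CQ];	/* QPs currently bound to each CQ */

/* Pick the completion queue with the fewest active QPs and charge it. */
static int pick_min_loaded_cq(int cqs_used)
{
	int index, min_index = 0;

	for (index = 0; index < cqs_used; index++)
		if (cq_active_qps[index] < cq_active_qps[min_index])
			min_index = index;

	cq_active_qps[min_index]++;	/* account for the new QP */
	return min_index;
}

int main(void)
{
	/* Bind six connections across four CQs; the load stays balanced. */
	for (int conn = 0; conn < 6; conn++)
		printf("conn %d -> cq %d\n", conn, pick_min_loaded_cq(MAX_CQ));
	return 0;
}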
diff --git a/drivers/infiniband/ulp/isert/Makefile b/drivers/infiniband/ulp/isert/Makefile new file mode 100644 index 00000000000..c8bf2421f5b --- /dev/null +++ b/drivers/infiniband/ulp/isert/Makefile @@ -0,0 +1,2 @@ +ccflags-y		:= -Idrivers/target -Idrivers/target/iscsi +obj-$(CONFIG_INFINIBAND_ISERT)	+= ib_isert.o diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c new file mode 100644 index 00000000000..d4c7928a0f3 --- /dev/null +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -0,0 +1,3308 @@ +/******************************************************************************* + * This file contains iSCSI extentions for RDMA (iSER) Verbs + * + * (c) Copyright 2013 Datera, Inc. + * + * Nicholas A. Bellinger <nab@linux-iscsi.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + ****************************************************************************/ + +#include <linux/string.h> +#include <linux/module.h> +#include <linux/scatterlist.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/llist.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <target/target_core_base.h> +#include <target/target_core_fabric.h> +#include <target/iscsi/iscsi_transport.h> +#include <linux/semaphore.h> + +#include "isert_proto.h" +#include "ib_isert.h" + +#define	ISERT_MAX_CONN		8 +#define ISER_MAX_RX_CQ_LEN	(ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN) +#define ISER_MAX_TX_CQ_LEN	(ISERT_QP_MAX_REQ_DTOS  * ISERT_MAX_CONN) + +static DEFINE_MUTEX(device_list_mutex); +static LIST_HEAD(device_list); +static struct workqueue_struct *isert_rx_wq; +static struct workqueue_struct *isert_comp_wq; + +static void +isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); +static int +isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, +	       struct isert_rdma_wr *wr); +static void +isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); +static int +isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, +	       struct isert_rdma_wr *wr); +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd); + +static void +isert_qp_event_callback(struct ib_event *e, void *context) +{ +	struct isert_conn *isert_conn = (struct isert_conn *)context; + +	pr_err("isert_qp_event_callback event: %d\n", e->event); +	switch (e->event) { +	case IB_EVENT_COMM_EST: +		rdma_notify(isert_conn->conn_cm_id, IB_EVENT_COMM_EST); +		break; +	case IB_EVENT_QP_LAST_WQE_REACHED: +		pr_warn("Reached TX IB_EVENT_QP_LAST_WQE_REACHED:\n"); +		break; +	default: +		break; +	} +} + +static int +isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr) +{ +	int ret; + +	ret = ib_query_device(ib_dev, devattr); +	if (ret) { +		pr_err("ib_query_device() failed: %d\n", ret); +		return ret; +	} +	pr_debug("devattr->max_sge: %d\n", devattr->max_sge); +	pr_debug("devattr->max_sge_rd: %d\n", devattr->max_sge_rd); + +	return 0; +} + +static int +isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id 
*cma_id, +		    u8 protection) +{ +	struct isert_device *device = isert_conn->conn_device; +	struct ib_qp_init_attr attr; +	int ret, index, min_index = 0; + +	mutex_lock(&device_list_mutex); +	for (index = 0; index < device->cqs_used; index++) +		if (device->cq_active_qps[index] < +		    device->cq_active_qps[min_index]) +			min_index = index; +	device->cq_active_qps[min_index]++; +	pr_debug("isert_conn_setup_qp: Using min_index: %d\n", min_index); +	mutex_unlock(&device_list_mutex); + +	memset(&attr, 0, sizeof(struct ib_qp_init_attr)); +	attr.event_handler = isert_qp_event_callback; +	attr.qp_context = isert_conn; +	attr.send_cq = device->dev_tx_cq[min_index]; +	attr.recv_cq = device->dev_rx_cq[min_index]; +	attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS; +	attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS; +	/* +	 * FIXME: Use devattr.max_sge - 2 for max_send_sge as +	 * work-around for RDMA_READ.. +	 */ +	attr.cap.max_send_sge = device->dev_attr.max_sge - 2; +	isert_conn->max_sge = attr.cap.max_send_sge; + +	attr.cap.max_recv_sge = 1; +	attr.sq_sig_type = IB_SIGNAL_REQ_WR; +	attr.qp_type = IB_QPT_RC; +	if (protection) +		attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + +	pr_debug("isert_conn_setup_qp cma_id->device: %p\n", +		 cma_id->device); +	pr_debug("isert_conn_setup_qp conn_pd->device: %p\n", +		 isert_conn->conn_pd->device); + +	ret = rdma_create_qp(cma_id, isert_conn->conn_pd, &attr); +	if (ret) { +		pr_err("rdma_create_qp failed for cma_id %d\n", ret); +		return ret; +	} +	isert_conn->conn_qp = cma_id->qp; +	pr_debug("rdma_create_qp() returned success >>>>>>>>>>>>>>>>>>>>>>>>>.\n"); + +	return 0; +} + +static void +isert_cq_event_callback(struct ib_event *e, void *context) +{ +	pr_debug("isert_cq_event_callback event: %d\n", e->event); +} + +static int +isert_alloc_rx_descriptors(struct isert_conn *isert_conn) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct iser_rx_desc *rx_desc; +	struct ib_sge *rx_sg; +	u64 dma_addr; +	int i, j; + +	isert_conn->conn_rx_descs = kzalloc(ISERT_QP_MAX_RECV_DTOS * +				sizeof(struct iser_rx_desc), GFP_KERNEL); +	if (!isert_conn->conn_rx_descs) +		goto fail; + +	rx_desc = isert_conn->conn_rx_descs; + +	for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++)  { +		dma_addr = ib_dma_map_single(ib_dev, (void *)rx_desc, +					ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); +		if (ib_dma_mapping_error(ib_dev, dma_addr)) +			goto dma_map_fail; + +		rx_desc->dma_addr = dma_addr; + +		rx_sg = &rx_desc->rx_sg; +		rx_sg->addr = rx_desc->dma_addr; +		rx_sg->length = ISER_RX_PAYLOAD_SIZE; +		rx_sg->lkey = isert_conn->conn_mr->lkey; +	} + +	isert_conn->conn_rx_desc_head = 0; +	return 0; + +dma_map_fail: +	rx_desc = isert_conn->conn_rx_descs; +	for (j = 0; j < i; j++, rx_desc++) { +		ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, +				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); +	} +	kfree(isert_conn->conn_rx_descs); +	isert_conn->conn_rx_descs = NULL; +fail: +	return -ENOMEM; +} + +static void +isert_free_rx_descriptors(struct isert_conn *isert_conn) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct iser_rx_desc *rx_desc; +	int i; + +	if (!isert_conn->conn_rx_descs) +		return; + +	rx_desc = isert_conn->conn_rx_descs; +	for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++)  { +		ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, +				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); +	} + +	kfree(isert_conn->conn_rx_descs); +	isert_conn->conn_rx_descs = NULL; +} + +static void isert_cq_tx_work(struct work_struct *); +static void 
isert_cq_tx_callback(struct ib_cq *, void *); +static void isert_cq_rx_work(struct work_struct *); +static void isert_cq_rx_callback(struct ib_cq *, void *); + +static int +isert_create_device_ib_res(struct isert_device *device) +{ +	struct ib_device *ib_dev = device->ib_device; +	struct isert_cq_desc *cq_desc; +	struct ib_device_attr *dev_attr; +	int ret = 0, i, j; + +	dev_attr = &device->dev_attr; +	ret = isert_query_device(ib_dev, dev_attr); +	if (ret) +		return ret; + +	/* asign function handlers */ +	if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && +	    dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { +		device->use_fastreg = 1; +		device->reg_rdma_mem = isert_reg_rdma; +		device->unreg_rdma_mem = isert_unreg_rdma; +	} else { +		device->use_fastreg = 0; +		device->reg_rdma_mem = isert_map_rdma; +		device->unreg_rdma_mem = isert_unmap_cmd; +	} + +	/* Check signature cap */ +	device->pi_capable = dev_attr->device_cap_flags & +			     IB_DEVICE_SIGNATURE_HANDOVER ? true : false; + +	device->cqs_used = min_t(int, num_online_cpus(), +				 device->ib_device->num_comp_vectors); +	device->cqs_used = min(ISERT_MAX_CQ, device->cqs_used); +	pr_debug("Using %d CQs, device %s supports %d vectors support " +		 "Fast registration %d pi_capable %d\n", +		 device->cqs_used, device->ib_device->name, +		 device->ib_device->num_comp_vectors, device->use_fastreg, +		 device->pi_capable); +	device->cq_desc = kzalloc(sizeof(struct isert_cq_desc) * +				device->cqs_used, GFP_KERNEL); +	if (!device->cq_desc) { +		pr_err("Unable to allocate device->cq_desc\n"); +		return -ENOMEM; +	} +	cq_desc = device->cq_desc; + +	for (i = 0; i < device->cqs_used; i++) { +		cq_desc[i].device = device; +		cq_desc[i].cq_index = i; + +		INIT_WORK(&cq_desc[i].cq_rx_work, isert_cq_rx_work); +		device->dev_rx_cq[i] = ib_create_cq(device->ib_device, +						isert_cq_rx_callback, +						isert_cq_event_callback, +						(void *)&cq_desc[i], +						ISER_MAX_RX_CQ_LEN, i); +		if (IS_ERR(device->dev_rx_cq[i])) { +			ret = PTR_ERR(device->dev_rx_cq[i]); +			device->dev_rx_cq[i] = NULL; +			goto out_cq; +		} + +		INIT_WORK(&cq_desc[i].cq_tx_work, isert_cq_tx_work); +		device->dev_tx_cq[i] = ib_create_cq(device->ib_device, +						isert_cq_tx_callback, +						isert_cq_event_callback, +						(void *)&cq_desc[i], +						ISER_MAX_TX_CQ_LEN, i); +		if (IS_ERR(device->dev_tx_cq[i])) { +			ret = PTR_ERR(device->dev_tx_cq[i]); +			device->dev_tx_cq[i] = NULL; +			goto out_cq; +		} + +		ret = ib_req_notify_cq(device->dev_rx_cq[i], IB_CQ_NEXT_COMP); +		if (ret) +			goto out_cq; + +		ret = ib_req_notify_cq(device->dev_tx_cq[i], IB_CQ_NEXT_COMP); +		if (ret) +			goto out_cq; +	} + +	return 0; + +out_cq: +	for (j = 0; j < i; j++) { +		cq_desc = &device->cq_desc[j]; + +		if (device->dev_rx_cq[j]) { +			cancel_work_sync(&cq_desc->cq_rx_work); +			ib_destroy_cq(device->dev_rx_cq[j]); +		} +		if (device->dev_tx_cq[j]) { +			cancel_work_sync(&cq_desc->cq_tx_work); +			ib_destroy_cq(device->dev_tx_cq[j]); +		} +	} +	kfree(device->cq_desc); + +	return ret; +} + +static void +isert_free_device_ib_res(struct isert_device *device) +{ +	struct isert_cq_desc *cq_desc; +	int i; + +	for (i = 0; i < device->cqs_used; i++) { +		cq_desc = &device->cq_desc[i]; + +		cancel_work_sync(&cq_desc->cq_rx_work); +		cancel_work_sync(&cq_desc->cq_tx_work); +		ib_destroy_cq(device->dev_rx_cq[i]); +		ib_destroy_cq(device->dev_tx_cq[i]); +		device->dev_rx_cq[i] = NULL; +		device->dev_tx_cq[i] = NULL; +	} + +	kfree(device->cq_desc); +} + +static void 
+isert_device_try_release(struct isert_device *device) +{ +	mutex_lock(&device_list_mutex); +	device->refcount--; +	if (!device->refcount) { +		isert_free_device_ib_res(device); +		list_del(&device->dev_node); +		kfree(device); +	} +	mutex_unlock(&device_list_mutex); +} + +static struct isert_device * +isert_device_find_by_ib_dev(struct rdma_cm_id *cma_id) +{ +	struct isert_device *device; +	int ret; + +	mutex_lock(&device_list_mutex); +	list_for_each_entry(device, &device_list, dev_node) { +		if (device->ib_device->node_guid == cma_id->device->node_guid) { +			device->refcount++; +			mutex_unlock(&device_list_mutex); +			return device; +		} +	} + +	device = kzalloc(sizeof(struct isert_device), GFP_KERNEL); +	if (!device) { +		mutex_unlock(&device_list_mutex); +		return ERR_PTR(-ENOMEM); +	} + +	INIT_LIST_HEAD(&device->dev_node); + +	device->ib_device = cma_id->device; +	ret = isert_create_device_ib_res(device); +	if (ret) { +		kfree(device); +		mutex_unlock(&device_list_mutex); +		return ERR_PTR(ret); +	} + +	device->refcount++; +	list_add_tail(&device->dev_node, &device_list); +	mutex_unlock(&device_list_mutex); + +	return device; +} + +static void +isert_conn_free_fastreg_pool(struct isert_conn *isert_conn) +{ +	struct fast_reg_descriptor *fr_desc, *tmp; +	int i = 0; + +	if (list_empty(&isert_conn->conn_fr_pool)) +		return; + +	pr_debug("Freeing conn %p fastreg pool", isert_conn); + +	list_for_each_entry_safe(fr_desc, tmp, +				 &isert_conn->conn_fr_pool, list) { +		list_del(&fr_desc->list); +		ib_free_fast_reg_page_list(fr_desc->data_frpl); +		ib_dereg_mr(fr_desc->data_mr); +		if (fr_desc->pi_ctx) { +			ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); +			ib_dereg_mr(fr_desc->pi_ctx->prot_mr); +			ib_destroy_mr(fr_desc->pi_ctx->sig_mr); +			kfree(fr_desc->pi_ctx); +		} +		kfree(fr_desc); +		++i; +	} + +	if (i < isert_conn->conn_fr_pool_size) +		pr_warn("Pool still has %d regions registered\n", +			isert_conn->conn_fr_pool_size - i); +} + +static int +isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd, +		     struct fast_reg_descriptor *fr_desc, u8 protection) +{ +	int ret; + +	fr_desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, +							 ISCSI_ISER_SG_TABLESIZE); +	if (IS_ERR(fr_desc->data_frpl)) { +		pr_err("Failed to allocate data frpl err=%ld\n", +		       PTR_ERR(fr_desc->data_frpl)); +		return PTR_ERR(fr_desc->data_frpl); +	} + +	fr_desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); +	if (IS_ERR(fr_desc->data_mr)) { +		pr_err("Failed to allocate data frmr err=%ld\n", +		       PTR_ERR(fr_desc->data_mr)); +		ret = PTR_ERR(fr_desc->data_mr); +		goto err_data_frpl; +	} +	pr_debug("Create fr_desc %p page_list %p\n", +		 fr_desc, fr_desc->data_frpl->page_list); +	fr_desc->ind |= ISERT_DATA_KEY_VALID; + +	if (protection) { +		struct ib_mr_init_attr mr_init_attr = {0}; +		struct pi_context *pi_ctx; + +		fr_desc->pi_ctx = kzalloc(sizeof(*fr_desc->pi_ctx), GFP_KERNEL); +		if (!fr_desc->pi_ctx) { +			pr_err("Failed to allocate pi context\n"); +			ret = -ENOMEM; +			goto err_data_mr; +		} +		pi_ctx = fr_desc->pi_ctx; + +		pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, +						    ISCSI_ISER_SG_TABLESIZE); +		if (IS_ERR(pi_ctx->prot_frpl)) { +			pr_err("Failed to allocate prot frpl err=%ld\n", +			       PTR_ERR(pi_ctx->prot_frpl)); +			ret = PTR_ERR(pi_ctx->prot_frpl); +			goto err_pi_ctx; +		} + +		pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); +		if (IS_ERR(pi_ctx->prot_mr)) { +			pr_err("Failed to allocate 
prot frmr err=%ld\n", +			       PTR_ERR(pi_ctx->prot_mr)); +			ret = PTR_ERR(pi_ctx->prot_mr); +			goto err_prot_frpl; +		} +		fr_desc->ind |= ISERT_PROT_KEY_VALID; + +		mr_init_attr.max_reg_descriptors = 2; +		mr_init_attr.flags |= IB_MR_SIGNATURE_EN; +		pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); +		if (IS_ERR(pi_ctx->sig_mr)) { +			pr_err("Failed to allocate signature enabled mr err=%ld\n", +			       PTR_ERR(pi_ctx->sig_mr)); +			ret = PTR_ERR(pi_ctx->sig_mr); +			goto err_prot_mr; +		} +		fr_desc->ind |= ISERT_SIG_KEY_VALID; +	} +	fr_desc->ind &= ~ISERT_PROTECTED; + +	return 0; +err_prot_mr: +	ib_dereg_mr(fr_desc->pi_ctx->prot_mr); +err_prot_frpl: +	ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); +err_pi_ctx: +	kfree(fr_desc->pi_ctx); +err_data_mr: +	ib_dereg_mr(fr_desc->data_mr); +err_data_frpl: +	ib_free_fast_reg_page_list(fr_desc->data_frpl); + +	return ret; +} + +static int +isert_conn_create_fastreg_pool(struct isert_conn *isert_conn, u8 pi_support) +{ +	struct fast_reg_descriptor *fr_desc; +	struct isert_device *device = isert_conn->conn_device; +	struct se_session *se_sess = isert_conn->conn->sess->se_sess; +	struct se_node_acl *se_nacl = se_sess->se_node_acl; +	int i, ret, tag_num; +	/* +	 * Setup the number of FRMRs based upon the number of tags +	 * available to session in iscsi_target_locate_portal(). +	 */ +	tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth); +	tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS; + +	isert_conn->conn_fr_pool_size = 0; +	for (i = 0; i < tag_num; i++) { +		fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL); +		if (!fr_desc) { +			pr_err("Failed to allocate fast_reg descriptor\n"); +			ret = -ENOMEM; +			goto err; +		} + +		ret = isert_create_fr_desc(device->ib_device, +					   isert_conn->conn_pd, fr_desc, +					   pi_support); +		if (ret) { +			pr_err("Failed to create fastreg descriptor err=%d\n", +			       ret); +			kfree(fr_desc); +			goto err; +		} + +		list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); +		isert_conn->conn_fr_pool_size++; +	} + +	pr_debug("Creating conn %p fastreg pool size=%d", +		 isert_conn, isert_conn->conn_fr_pool_size); + +	return 0; + +err: +	isert_conn_free_fastreg_pool(isert_conn); +	return ret; +} + +static int +isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ +	struct iscsi_np *np = cma_id->context; +	struct isert_np *isert_np = np->np_context; +	struct isert_conn *isert_conn; +	struct isert_device *device; +	struct ib_device *ib_dev = cma_id->device; +	int ret = 0; +	u8 pi_support; + +	spin_lock_bh(&np->np_thread_lock); +	if (!np->enabled) { +		spin_unlock_bh(&np->np_thread_lock); +		pr_debug("iscsi_np is not enabled, reject connect request\n"); +		return rdma_reject(cma_id, NULL, 0); +	} +	spin_unlock_bh(&np->np_thread_lock); + +	pr_debug("Entering isert_connect_request cma_id: %p, context: %p\n", +		 cma_id, cma_id->context); + +	isert_conn = kzalloc(sizeof(struct isert_conn), GFP_KERNEL); +	if (!isert_conn) { +		pr_err("Unable to allocate isert_conn\n"); +		return -ENOMEM; +	} +	isert_conn->state = ISER_CONN_INIT; +	INIT_LIST_HEAD(&isert_conn->conn_accept_node); +	init_completion(&isert_conn->conn_login_comp); +	init_completion(&isert_conn->conn_wait); +	init_completion(&isert_conn->conn_wait_comp_err); +	kref_init(&isert_conn->conn_kref); +	kref_get(&isert_conn->conn_kref); +	mutex_init(&isert_conn->conn_mutex); +	spin_lock_init(&isert_conn->conn_lock); +	INIT_LIST_HEAD(&isert_conn->conn_fr_pool); + +	cma_id->context = isert_conn; +	
isert_conn->conn_cm_id = cma_id; +	isert_conn->responder_resources = event->param.conn.responder_resources; +	isert_conn->initiator_depth = event->param.conn.initiator_depth; +	pr_debug("Using responder_resources: %u initiator_depth: %u\n", +		 isert_conn->responder_resources, isert_conn->initiator_depth); + +	isert_conn->login_buf = kzalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + +					ISER_RX_LOGIN_SIZE, GFP_KERNEL); +	if (!isert_conn->login_buf) { +		pr_err("Unable to allocate isert_conn->login_buf\n"); +		ret = -ENOMEM; +		goto out; +	} + +	isert_conn->login_req_buf = isert_conn->login_buf; +	isert_conn->login_rsp_buf = isert_conn->login_buf + +				    ISCSI_DEF_MAX_RECV_SEG_LEN; +	pr_debug("Set login_buf: %p login_req_buf: %p login_rsp_buf: %p\n", +		 isert_conn->login_buf, isert_conn->login_req_buf, +		 isert_conn->login_rsp_buf); + +	isert_conn->login_req_dma = ib_dma_map_single(ib_dev, +				(void *)isert_conn->login_req_buf, +				ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE); + +	ret = ib_dma_mapping_error(ib_dev, isert_conn->login_req_dma); +	if (ret) { +		pr_err("ib_dma_mapping_error failed for login_req_dma: %d\n", +		       ret); +		isert_conn->login_req_dma = 0; +		goto out_login_buf; +	} + +	isert_conn->login_rsp_dma = ib_dma_map_single(ib_dev, +					(void *)isert_conn->login_rsp_buf, +					ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); + +	ret = ib_dma_mapping_error(ib_dev, isert_conn->login_rsp_dma); +	if (ret) { +		pr_err("ib_dma_mapping_error failed for login_rsp_dma: %d\n", +		       ret); +		isert_conn->login_rsp_dma = 0; +		goto out_req_dma_map; +	} + +	device = isert_device_find_by_ib_dev(cma_id); +	if (IS_ERR(device)) { +		ret = PTR_ERR(device); +		goto out_rsp_dma_map; +	} + +	isert_conn->conn_device = device; +	isert_conn->conn_pd = ib_alloc_pd(isert_conn->conn_device->ib_device); +	if (IS_ERR(isert_conn->conn_pd)) { +		ret = PTR_ERR(isert_conn->conn_pd); +		pr_err("ib_alloc_pd failed for conn %p: ret=%d\n", +		       isert_conn, ret); +		goto out_pd; +	} + +	isert_conn->conn_mr = ib_get_dma_mr(isert_conn->conn_pd, +					   IB_ACCESS_LOCAL_WRITE); +	if (IS_ERR(isert_conn->conn_mr)) { +		ret = PTR_ERR(isert_conn->conn_mr); +		pr_err("ib_get_dma_mr failed for conn %p: ret=%d\n", +		       isert_conn, ret); +		goto out_mr; +	} + +	pi_support = np->tpg_np->tpg->tpg_attrib.t10_pi; +	if (pi_support && !device->pi_capable) { +		pr_err("Protection information requested but not supported, " +		       "rejecting connect request\n"); +		ret = rdma_reject(cma_id, NULL, 0); +		goto out_mr; +	} + +	ret = isert_conn_setup_qp(isert_conn, cma_id, pi_support); +	if (ret) +		goto out_conn_dev; + +	mutex_lock(&isert_np->np_accept_mutex); +	list_add_tail(&isert_conn->conn_accept_node, &isert_np->np_accept_list); +	mutex_unlock(&isert_np->np_accept_mutex); + +	pr_debug("isert_connect_request() up np_sem np: %p\n", np); +	up(&isert_np->np_sem); +	return 0; + +out_conn_dev: +	ib_dereg_mr(isert_conn->conn_mr); +out_mr: +	ib_dealloc_pd(isert_conn->conn_pd); +out_pd: +	isert_device_try_release(device); +out_rsp_dma_map: +	ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, +			    ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); +out_req_dma_map: +	ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, +			    ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE); +out_login_buf: +	kfree(isert_conn->login_buf); +out: +	kfree(isert_conn); +	return ret; +} + +static void +isert_connect_release(struct isert_conn *isert_conn) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct isert_device *device = 
isert_conn->conn_device; +	int cq_index; + +	pr_debug("Entering isert_connect_release(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); + +	if (device && device->use_fastreg) +		isert_conn_free_fastreg_pool(isert_conn); + +	if (isert_conn->conn_qp) { +		cq_index = ((struct isert_cq_desc *) +			isert_conn->conn_qp->recv_cq->cq_context)->cq_index; +		pr_debug("isert_connect_release: cq_index: %d\n", cq_index); +		isert_conn->conn_device->cq_active_qps[cq_index]--; + +		rdma_destroy_qp(isert_conn->conn_cm_id); +	} + +	isert_free_rx_descriptors(isert_conn); +	rdma_destroy_id(isert_conn->conn_cm_id); + +	ib_dereg_mr(isert_conn->conn_mr); +	ib_dealloc_pd(isert_conn->conn_pd); + +	if (isert_conn->login_buf) { +		ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, +				    ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); +		ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, +				    ISCSI_DEF_MAX_RECV_SEG_LEN, +				    DMA_FROM_DEVICE); +		kfree(isert_conn->login_buf); +	} +	kfree(isert_conn); + +	if (device) +		isert_device_try_release(device); + +	pr_debug("Leaving isert_connect_release >>>>>>>>>>>>\n"); +} + +static void +isert_connected_handler(struct rdma_cm_id *cma_id) +{ +	return; +} + +static void +isert_release_conn_kref(struct kref *kref) +{ +	struct isert_conn *isert_conn = container_of(kref, +				struct isert_conn, conn_kref); + +	pr_debug("Calling isert_connect_release for final kref %s/%d\n", +		 current->comm, current->pid); + +	isert_connect_release(isert_conn); +} + +static void +isert_put_conn(struct isert_conn *isert_conn) +{ +	kref_put(&isert_conn->conn_kref, isert_release_conn_kref); +} + +static void +isert_disconnect_work(struct work_struct *work) +{ +	struct isert_conn *isert_conn = container_of(work, +				struct isert_conn, conn_logout_work); + +	pr_debug("isert_disconnect_work(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); +	mutex_lock(&isert_conn->conn_mutex); +	if (isert_conn->state == ISER_CONN_UP) +		isert_conn->state = ISER_CONN_TERMINATING; + +	if (isert_conn->post_recv_buf_count == 0 && +	    atomic_read(&isert_conn->post_send_buf_count) == 0) { +		mutex_unlock(&isert_conn->conn_mutex); +		goto wake_up; +	} +	if (!isert_conn->conn_cm_id) { +		mutex_unlock(&isert_conn->conn_mutex); +		isert_put_conn(isert_conn); +		return; +	} + +	if (isert_conn->disconnect) { +		/* Send DREQ/DREP towards our initiator */ +		rdma_disconnect(isert_conn->conn_cm_id); +	} + +	mutex_unlock(&isert_conn->conn_mutex); + +wake_up: +	complete(&isert_conn->conn_wait); +	isert_put_conn(isert_conn); +} + +static void +isert_disconnected_handler(struct rdma_cm_id *cma_id, bool disconnect) +{ +	struct isert_conn *isert_conn = (struct isert_conn *)cma_id->context; + +	isert_conn->disconnect = disconnect; +	INIT_WORK(&isert_conn->conn_logout_work, isert_disconnect_work); +	schedule_work(&isert_conn->conn_logout_work); +} + +static int +isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ +	int ret = 0; +	bool disconnect = false; + +	pr_debug("isert_cma_handler: event %d status %d conn %p id %p\n", +		 event->event, event->status, cma_id->context, cma_id); + +	switch (event->event) { +	case RDMA_CM_EVENT_CONNECT_REQUEST: +		ret = isert_connect_request(cma_id, event); +		break; +	case RDMA_CM_EVENT_ESTABLISHED: +		isert_connected_handler(cma_id); +		break; +	case RDMA_CM_EVENT_ADDR_CHANGE:    /* FALLTHRU */ +	case RDMA_CM_EVENT_DISCONNECTED:   /* FALLTHRU */ +	case RDMA_CM_EVENT_DEVICE_REMOVAL: /* FALLTHRU */ +		disconnect = true; +	case RDMA_CM_EVENT_TIMEWAIT_EXIT:  /* FALLTHRU */ +		
isert_disconnected_handler(cma_id, disconnect); +		break; +	case RDMA_CM_EVENT_CONNECT_ERROR: +	default: +		pr_err("Unhandled RDMA CMA event: %d\n", event->event); +		break; +	} + +	if (ret != 0) { +		pr_err("isert_cma_handler failed RDMA_CM_EVENT: 0x%08x %d\n", +		       event->event, ret); +		dump_stack(); +	} + +	return ret; +} + +static int +isert_post_recv(struct isert_conn *isert_conn, u32 count) +{ +	struct ib_recv_wr *rx_wr, *rx_wr_failed; +	int i, ret; +	unsigned int rx_head = isert_conn->conn_rx_desc_head; +	struct iser_rx_desc *rx_desc; + +	for (rx_wr = isert_conn->conn_rx_wr, i = 0; i < count; i++, rx_wr++) { +		rx_desc		= &isert_conn->conn_rx_descs[rx_head]; +		rx_wr->wr_id	= (unsigned long)rx_desc; +		rx_wr->sg_list	= &rx_desc->rx_sg; +		rx_wr->num_sge	= 1; +		rx_wr->next	= rx_wr + 1; +		rx_head = (rx_head + 1) & (ISERT_QP_MAX_RECV_DTOS - 1); +	} + +	rx_wr--; +	rx_wr->next = NULL; /* mark end of work requests list */ + +	isert_conn->post_recv_buf_count += count; +	ret = ib_post_recv(isert_conn->conn_qp, isert_conn->conn_rx_wr, +				&rx_wr_failed); +	if (ret) { +		pr_err("ib_post_recv() failed with ret: %d\n", ret); +		isert_conn->post_recv_buf_count -= count; +	} else { +		pr_debug("isert_post_recv(): Posted %d RX buffers\n", count); +		isert_conn->conn_rx_desc_head = rx_head; +	} +	return ret; +} + +static int +isert_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct ib_send_wr send_wr, *send_wr_failed; +	int ret; + +	ib_dma_sync_single_for_device(ib_dev, tx_desc->dma_addr, +				      ISER_HEADERS_LEN, DMA_TO_DEVICE); + +	send_wr.next	= NULL; +	send_wr.wr_id	= (unsigned long)tx_desc; +	send_wr.sg_list	= tx_desc->tx_sg; +	send_wr.num_sge	= tx_desc->num_sge; +	send_wr.opcode	= IB_WR_SEND; +	send_wr.send_flags = IB_SEND_SIGNALED; + +	atomic_inc(&isert_conn->post_send_buf_count); + +	ret = ib_post_send(isert_conn->conn_qp, &send_wr, &send_wr_failed); +	if (ret) { +		pr_err("ib_post_send() failed, ret: %d\n", ret); +		atomic_dec(&isert_conn->post_send_buf_count); +	} + +	return ret; +} + +static void +isert_create_send_desc(struct isert_conn *isert_conn, +		       struct isert_cmd *isert_cmd, +		       struct iser_tx_desc *tx_desc) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + +	ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr, +				   ISER_HEADERS_LEN, DMA_TO_DEVICE); + +	memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); +	tx_desc->iser_header.flags = ISER_VER; + +	tx_desc->num_sge = 1; +	tx_desc->isert_cmd = isert_cmd; + +	if (tx_desc->tx_sg[0].lkey != isert_conn->conn_mr->lkey) { +		tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey; +		pr_debug("tx_desc %p lkey mismatch, fixing\n", tx_desc); +	} +} + +static int +isert_init_tx_hdrs(struct isert_conn *isert_conn, +		   struct iser_tx_desc *tx_desc) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	u64 dma_addr; + +	dma_addr = ib_dma_map_single(ib_dev, (void *)tx_desc, +			ISER_HEADERS_LEN, DMA_TO_DEVICE); +	if (ib_dma_mapping_error(ib_dev, dma_addr)) { +		pr_err("ib_dma_mapping_error() failed\n"); +		return -ENOMEM; +	} + +	tx_desc->dma_addr = dma_addr; +	tx_desc->tx_sg[0].addr	= tx_desc->dma_addr; +	tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; +	tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey; + +	pr_debug("isert_init_tx_hdrs: Setup tx_sg[0].addr: 0x%llx length: %u" +		 " lkey: 0x%08x\n", tx_desc->tx_sg[0].addr, +		 tx_desc->tx_sg[0].length, tx_desc->tx_sg[0].lkey); + +	return 0; +} 
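isert_post_recv() above chains count receive work requests into a single ib_post_recv() call and advances a ring head with rx_head = (rx_head + 1) & (ISERT_QP_MAX_RECV_DTOS - 1); the ib_iser initiator side does the same but now masks with ib_conn->qp_max_recv_dtos_mask, since its queue depth became per-connection. The sketch below only demonstrates that power-of-two index masking in isolation, not driver code; the queue depth and batch sizes are illustrative assumptions, and the mask is equivalent to a modulo only when the depth is a power of two.

#include <assert.h>
#include <stdio.h>

#define RECV_DTOS 8U	/* ring depth; must be a power of two */

int main(void)
{
	unsigned int head = 0;

	/* The masking trick silently breaks for non-power-of-two depths. */
	assert((RECV_DTOS & (RECV_DTOS - 1)) == 0);

	/* Post three batches of five buffers and watch the head wrap. */
	for (int batch = 0; batch < 3; batch++) {
		for (int i = 0; i < 5; i++)
			head = (head + 1) & (RECV_DTOS - 1);
		printf("after batch %d: head = %u\n", batch, head);
	}
	return 0;
}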
+ +static void +isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, +		   struct ib_send_wr *send_wr, bool coalesce) +{ +	struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc; + +	isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND; +	send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; +	send_wr->opcode = IB_WR_SEND; +	send_wr->sg_list = &tx_desc->tx_sg[0]; +	send_wr->num_sge = isert_cmd->tx_desc.num_sge; +	/* +	 * Coalesce send completion interrupts by only setting IB_SEND_SIGNALED +	 * bit for every ISERT_COMP_BATCH_COUNT number of ib_post_send() calls. +	 */ +	mutex_lock(&isert_conn->conn_mutex); +	if (coalesce && isert_conn->state == ISER_CONN_UP && +	    ++isert_conn->conn_comp_batch < ISERT_COMP_BATCH_COUNT) { +		tx_desc->llnode_active = true; +		llist_add(&tx_desc->comp_llnode, &isert_conn->conn_comp_llist); +		mutex_unlock(&isert_conn->conn_mutex); +		return; +	} +	isert_conn->conn_comp_batch = 0; +	tx_desc->comp_llnode_batch = llist_del_all(&isert_conn->conn_comp_llist); +	mutex_unlock(&isert_conn->conn_mutex); + +	send_wr->send_flags = IB_SEND_SIGNALED; +} + +static int +isert_rdma_post_recvl(struct isert_conn *isert_conn) +{ +	struct ib_recv_wr rx_wr, *rx_wr_fail; +	struct ib_sge sge; +	int ret; + +	memset(&sge, 0, sizeof(struct ib_sge)); +	sge.addr = isert_conn->login_req_dma; +	sge.length = ISER_RX_LOGIN_SIZE; +	sge.lkey = isert_conn->conn_mr->lkey; + +	pr_debug("Setup sge: addr: %llx length: %d 0x%08x\n", +		sge.addr, sge.length, sge.lkey); + +	memset(&rx_wr, 0, sizeof(struct ib_recv_wr)); +	rx_wr.wr_id = (unsigned long)isert_conn->login_req_buf; +	rx_wr.sg_list = &sge; +	rx_wr.num_sge = 1; + +	isert_conn->post_recv_buf_count++; +	ret = ib_post_recv(isert_conn->conn_qp, &rx_wr, &rx_wr_fail); +	if (ret) { +		pr_err("ib_post_recv() failed: %d\n", ret); +		isert_conn->post_recv_buf_count--; +	} + +	pr_debug("ib_post_recv(): returned success >>>>>>>>>>>>>>>>>>>>>>>>\n"); +	return ret; +} + +static int +isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, +		   u32 length) +{ +	struct isert_conn *isert_conn = conn->context; +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct iser_tx_desc *tx_desc = &isert_conn->conn_login_tx_desc; +	int ret; + +	isert_create_send_desc(isert_conn, NULL, tx_desc); + +	memcpy(&tx_desc->iscsi_header, &login->rsp[0], +	       sizeof(struct iscsi_hdr)); + +	isert_init_tx_hdrs(isert_conn, tx_desc); + +	if (length > 0) { +		struct ib_sge *tx_dsg = &tx_desc->tx_sg[1]; + +		ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_rsp_dma, +					   length, DMA_TO_DEVICE); + +		memcpy(isert_conn->login_rsp_buf, login->rsp_buf, length); + +		ib_dma_sync_single_for_device(ib_dev, isert_conn->login_rsp_dma, +					      length, DMA_TO_DEVICE); + +		tx_dsg->addr	= isert_conn->login_rsp_dma; +		tx_dsg->length	= length; +		tx_dsg->lkey	= isert_conn->conn_mr->lkey; +		tx_desc->num_sge = 2; +	} +	if (!login->login_failed) { +		if (login->login_complete) { +			if (!conn->sess->sess_ops->SessionType && +			    isert_conn->conn_device->use_fastreg) { +				/* Normal Session and fastreg is used */ +				u8 pi_support = login->np->tpg_np->tpg->tpg_attrib.t10_pi; + +				ret = isert_conn_create_fastreg_pool(isert_conn, +								     pi_support); +				if (ret) { +					pr_err("Conn: %p failed to create" +					       " fastreg pool\n", isert_conn); +					return ret; +				} +			} + +			ret = isert_alloc_rx_descriptors(isert_conn); +			if (ret) +				return ret; + +			ret = isert_post_recv(isert_conn, ISERT_MIN_POSTED_RX); +			if 
(ret) +				return ret; + +			isert_conn->state = ISER_CONN_UP; +			goto post_send; +		} + +		ret = isert_rdma_post_recvl(isert_conn); +		if (ret) +			return ret; +	} +post_send: +	ret = isert_post_send(isert_conn, tx_desc); +	if (ret) +		return ret; + +	return 0; +} + +static void +isert_rx_login_req(struct iser_rx_desc *rx_desc, int rx_buflen, +		   struct isert_conn *isert_conn) +{ +	struct iscsi_conn *conn = isert_conn->conn; +	struct iscsi_login *login = conn->conn_login; +	int size; + +	if (!login) { +		pr_err("conn->conn_login is NULL\n"); +		dump_stack(); +		return; +	} + +	if (login->first_request) { +		struct iscsi_login_req *login_req = +			(struct iscsi_login_req *)&rx_desc->iscsi_header; +		/* +		 * Setup the initial iscsi_login values from the leading +		 * login request PDU. +		 */ +		login->leading_connection = (!login_req->tsih) ? 1 : 0; +		login->current_stage = +			(login_req->flags & ISCSI_FLAG_LOGIN_CURRENT_STAGE_MASK) +			 >> 2; +		login->version_min	= login_req->min_version; +		login->version_max	= login_req->max_version; +		memcpy(login->isid, login_req->isid, 6); +		login->cmd_sn		= be32_to_cpu(login_req->cmdsn); +		login->init_task_tag	= login_req->itt; +		login->initial_exp_statsn = be32_to_cpu(login_req->exp_statsn); +		login->cid		= be16_to_cpu(login_req->cid); +		login->tsih		= be16_to_cpu(login_req->tsih); +	} + +	memcpy(&login->req[0], (void *)&rx_desc->iscsi_header, ISCSI_HDR_LEN); + +	size = min(rx_buflen, MAX_KEY_VALUE_PAIRS); +	pr_debug("Using login payload size: %d, rx_buflen: %d MAX_KEY_VALUE_PAIRS: %d\n", +		 size, rx_buflen, MAX_KEY_VALUE_PAIRS); +	memcpy(login->req_buf, &rx_desc->data[0], size); + +	if (login->first_request) { +		complete(&isert_conn->conn_login_comp); +		return; +	} +	schedule_delayed_work(&conn->login_work, 0); +} + +static struct iscsi_cmd +*isert_allocate_cmd(struct iscsi_conn *conn) +{ +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct isert_cmd *isert_cmd; +	struct iscsi_cmd *cmd; + +	cmd = iscsit_allocate_cmd(conn, TASK_INTERRUPTIBLE); +	if (!cmd) { +		pr_err("Unable to allocate iscsi_cmd + isert_cmd\n"); +		return NULL; +	} +	isert_cmd = iscsit_priv_cmd(cmd); +	isert_cmd->conn = isert_conn; +	isert_cmd->iscsi_cmd = cmd; + +	return cmd; +} + +static int +isert_handle_scsi_cmd(struct isert_conn *isert_conn, +		      struct isert_cmd *isert_cmd, struct iscsi_cmd *cmd, +		      struct iser_rx_desc *rx_desc, unsigned char *buf) +{ +	struct iscsi_conn *conn = isert_conn->conn; +	struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)buf; +	struct scatterlist *sg; +	int imm_data, imm_data_len, unsol_data, sg_nents, rc; +	bool dump_payload = false; + +	rc = iscsit_setup_scsi_cmd(conn, cmd, buf); +	if (rc < 0) +		return rc; + +	imm_data = cmd->immediate_data; +	imm_data_len = cmd->first_burst_len; +	unsol_data = cmd->unsolicited_data; + +	rc = iscsit_process_scsi_cmd(conn, cmd, hdr); +	if (rc < 0) { +		return 0; +	} else if (rc > 0) { +		dump_payload = true; +		goto sequence_cmd; +	} + +	if (!imm_data) +		return 0; + +	sg = &cmd->se_cmd.t_data_sg[0]; +	sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE)); + +	pr_debug("Copying Immediate SG: %p sg_nents: %u from %p imm_data_len: %d\n", +		 sg, sg_nents, &rx_desc->data[0], imm_data_len); + +	sg_copy_from_buffer(sg, sg_nents, &rx_desc->data[0], imm_data_len); + +	cmd->write_data_done += imm_data_len; + +	if (cmd->write_data_done == cmd->se_cmd.data_length) { +		spin_lock_bh(&cmd->istate_lock); +		cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; +		cmd->i_state = 
ISTATE_RECEIVED_LAST_DATAOUT; +		spin_unlock_bh(&cmd->istate_lock); +	} + +sequence_cmd: +	rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn); + +	if (!rc && dump_payload == false && unsol_data) +		iscsit_set_unsoliticed_dataout(cmd); +	else if (dump_payload && imm_data) +		target_put_sess_cmd(conn->sess->se_sess, &cmd->se_cmd); + +	return 0; +} + +static int +isert_handle_iscsi_dataout(struct isert_conn *isert_conn, +			   struct iser_rx_desc *rx_desc, unsigned char *buf) +{ +	struct scatterlist *sg_start; +	struct iscsi_conn *conn = isert_conn->conn; +	struct iscsi_cmd *cmd = NULL; +	struct iscsi_data *hdr = (struct iscsi_data *)buf; +	u32 unsol_data_len = ntoh24(hdr->dlength); +	int rc, sg_nents, sg_off, page_off; + +	rc = iscsit_check_dataout_hdr(conn, buf, &cmd); +	if (rc < 0) +		return rc; +	else if (!cmd) +		return 0; +	/* +	 * FIXME: Unexpected unsolicited_data out +	 */ +	if (!cmd->unsolicited_data) { +		pr_err("Received unexpected solicited data payload\n"); +		dump_stack(); +		return -1; +	} + +	pr_debug("Unsolicited DataOut unsol_data_len: %u, write_data_done: %u, data_length: %u\n", +		 unsol_data_len, cmd->write_data_done, cmd->se_cmd.data_length); + +	sg_off = cmd->write_data_done / PAGE_SIZE; +	sg_start = &cmd->se_cmd.t_data_sg[sg_off]; +	sg_nents = max(1UL, DIV_ROUND_UP(unsol_data_len, PAGE_SIZE)); +	page_off = cmd->write_data_done % PAGE_SIZE; +	/* +	 * FIXME: Non page-aligned unsolicited_data out +	 */ +	if (page_off) { +		pr_err("Received unexpected non-page aligned data payload\n"); +		dump_stack(); +		return -1; +	} +	pr_debug("Copying DataOut: sg_start: %p, sg_off: %u sg_nents: %u from %p %u\n", +		 sg_start, sg_off, sg_nents, &rx_desc->data[0], unsol_data_len); + +	sg_copy_from_buffer(sg_start, sg_nents, &rx_desc->data[0], +			    unsol_data_len); + +	rc = iscsit_check_dataout_payload(cmd, hdr, false); +	if (rc < 0) +		return rc; + +	return 0; +} + +static int +isert_handle_nop_out(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, +		     struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, +		     unsigned char *buf) +{ +	struct iscsi_conn *conn = isert_conn->conn; +	struct iscsi_nopout *hdr = (struct iscsi_nopout *)buf; +	int rc; + +	rc = iscsit_setup_nop_out(conn, cmd, hdr); +	if (rc < 0) +		return rc; +	/* +	 * FIXME: Add support for NOPOUT payload using unsolicited RDMA payload +	 */ + +	return iscsit_process_nop_out(conn, cmd, hdr); +} + +static int +isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, +		      struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, +		      struct iscsi_text *hdr) +{ +	struct iscsi_conn *conn = isert_conn->conn; +	u32 payload_length = ntoh24(hdr->dlength); +	int rc; +	unsigned char *text_in; + +	rc = iscsit_setup_text_cmd(conn, cmd, hdr); +	if (rc < 0) +		return rc; + +	text_in = kzalloc(payload_length, GFP_KERNEL); +	if (!text_in) { +		pr_err("Unable to allocate text_in of payload_length: %u\n", +		       payload_length); +		return -ENOMEM; +	} +	cmd->text_in_ptr = text_in; + +	memcpy(cmd->text_in_ptr, &rx_desc->data[0], payload_length); + +	return iscsit_process_text_cmd(conn, cmd, hdr); +} + +static int +isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, +		uint32_t read_stag, uint64_t read_va, +		uint32_t write_stag, uint64_t write_va) +{ +	struct iscsi_hdr *hdr = &rx_desc->iscsi_header; +	struct iscsi_conn *conn = isert_conn->conn; +	struct iscsi_session *sess = conn->sess; +	struct iscsi_cmd *cmd; +	struct isert_cmd *isert_cmd; +	int ret = -EINVAL; +	u8 
opcode = (hdr->opcode & ISCSI_OPCODE_MASK); + +	if (sess->sess_ops->SessionType && +	   (!(opcode & ISCSI_OP_TEXT) || !(opcode & ISCSI_OP_LOGOUT))) { +		pr_err("Got illegal opcode: 0x%02x in SessionType=Discovery," +		       " ignoring\n", opcode); +		return 0; +	} + +	switch (opcode) { +	case ISCSI_OP_SCSI_CMD: +		cmd = isert_allocate_cmd(conn); +		if (!cmd) +			break; + +		isert_cmd = iscsit_priv_cmd(cmd); +		isert_cmd->read_stag = read_stag; +		isert_cmd->read_va = read_va; +		isert_cmd->write_stag = write_stag; +		isert_cmd->write_va = write_va; + +		ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd, +					rx_desc, (unsigned char *)hdr); +		break; +	case ISCSI_OP_NOOP_OUT: +		cmd = isert_allocate_cmd(conn); +		if (!cmd) +			break; + +		isert_cmd = iscsit_priv_cmd(cmd); +		ret = isert_handle_nop_out(isert_conn, isert_cmd, cmd, +					   rx_desc, (unsigned char *)hdr); +		break; +	case ISCSI_OP_SCSI_DATA_OUT: +		ret = isert_handle_iscsi_dataout(isert_conn, rx_desc, +						(unsigned char *)hdr); +		break; +	case ISCSI_OP_SCSI_TMFUNC: +		cmd = isert_allocate_cmd(conn); +		if (!cmd) +			break; + +		ret = iscsit_handle_task_mgt_cmd(conn, cmd, +						(unsigned char *)hdr); +		break; +	case ISCSI_OP_LOGOUT: +		cmd = isert_allocate_cmd(conn); +		if (!cmd) +			break; + +		ret = iscsit_handle_logout_cmd(conn, cmd, (unsigned char *)hdr); +		if (ret > 0) +			wait_for_completion_timeout(&conn->conn_logout_comp, +						    SECONDS_FOR_LOGOUT_COMP * +						    HZ); +		break; +	case ISCSI_OP_TEXT: +		cmd = isert_allocate_cmd(conn); +		if (!cmd) +			break; + +		isert_cmd = iscsit_priv_cmd(cmd); +		ret = isert_handle_text_cmd(isert_conn, isert_cmd, cmd, +					    rx_desc, (struct iscsi_text *)hdr); +		break; +	default: +		pr_err("Got unknown iSCSI OpCode: 0x%02x\n", opcode); +		dump_stack(); +		break; +	} + +	return ret; +} + +static void +isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn) +{ +	struct iser_hdr *iser_hdr = &rx_desc->iser_header; +	uint64_t read_va = 0, write_va = 0; +	uint32_t read_stag = 0, write_stag = 0; +	int rc; + +	switch (iser_hdr->flags & 0xF0) { +	case ISCSI_CTRL: +		if (iser_hdr->flags & ISER_RSV) { +			read_stag = be32_to_cpu(iser_hdr->read_stag); +			read_va = be64_to_cpu(iser_hdr->read_va); +			pr_debug("ISER_RSV: read_stag: 0x%08x read_va: 0x%16llx\n", +				 read_stag, (unsigned long long)read_va); +		} +		if (iser_hdr->flags & ISER_WSV) { +			write_stag = be32_to_cpu(iser_hdr->write_stag); +			write_va = be64_to_cpu(iser_hdr->write_va); +			pr_debug("ISER_WSV: write__stag: 0x%08x write_va: 0x%16llx\n", +				 write_stag, (unsigned long long)write_va); +		} + +		pr_debug("ISER ISCSI_CTRL PDU\n"); +		break; +	case ISER_HELLO: +		pr_err("iSER Hello message\n"); +		break; +	default: +		pr_warn("Unknown iSER hdr flags: 0x%02x\n", iser_hdr->flags); +		break; +	} + +	rc = isert_rx_opcode(isert_conn, rx_desc, +			     read_stag, read_va, write_stag, write_va); +} + +static void +isert_rx_completion(struct iser_rx_desc *desc, struct isert_conn *isert_conn, +		    unsigned long xfer_len) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct iscsi_hdr *hdr; +	u64 rx_dma; +	int rx_buflen, outstanding; + +	if ((char *)desc == isert_conn->login_req_buf) { +		rx_dma = isert_conn->login_req_dma; +		rx_buflen = ISER_RX_LOGIN_SIZE; +		pr_debug("ISER login_buf: Using rx_dma: 0x%llx, rx_buflen: %d\n", +			 rx_dma, rx_buflen); +	} else { +		rx_dma = desc->dma_addr; +		rx_buflen = ISER_RX_PAYLOAD_SIZE; +		pr_debug("ISER req_buf: Using rx_dma: 
0x%llx, rx_buflen: %d\n", +			 rx_dma, rx_buflen); +	} + +	ib_dma_sync_single_for_cpu(ib_dev, rx_dma, rx_buflen, DMA_FROM_DEVICE); + +	hdr = &desc->iscsi_header; +	pr_debug("iSCSI opcode: 0x%02x, ITT: 0x%08x, flags: 0x%02x dlen: %d\n", +		 hdr->opcode, hdr->itt, hdr->flags, +		 (int)(xfer_len - ISER_HEADERS_LEN)); + +	if ((char *)desc == isert_conn->login_req_buf) +		isert_rx_login_req(desc, xfer_len - ISER_HEADERS_LEN, +				   isert_conn); +	else +		isert_rx_do_work(desc, isert_conn); + +	ib_dma_sync_single_for_device(ib_dev, rx_dma, rx_buflen, +				      DMA_FROM_DEVICE); + +	isert_conn->post_recv_buf_count--; +	pr_debug("iSERT: Decremented post_recv_buf_count: %d\n", +		 isert_conn->post_recv_buf_count); + +	if ((char *)desc == isert_conn->login_req_buf) +		return; + +	outstanding = isert_conn->post_recv_buf_count; +	if (outstanding + ISERT_MIN_POSTED_RX <= ISERT_QP_MAX_RECV_DTOS) { +		int err, count = min(ISERT_QP_MAX_RECV_DTOS - outstanding, +				ISERT_MIN_POSTED_RX); +		err = isert_post_recv(isert_conn, count); +		if (err) { +			pr_err("isert_post_recv() count: %d failed, %d\n", +			       count, err); +		} +	} +} + +static int +isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, +		   struct scatterlist *sg, u32 nents, u32 length, u32 offset, +		   enum iser_ib_op_code op, struct isert_data_buf *data) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + +	data->dma_dir = op == ISER_IB_RDMA_WRITE ? +			      DMA_TO_DEVICE : DMA_FROM_DEVICE; + +	data->len = length - offset; +	data->offset = offset; +	data->sg_off = data->offset / PAGE_SIZE; + +	data->sg = &sg[data->sg_off]; +	data->nents = min_t(unsigned int, nents - data->sg_off, +					  ISCSI_ISER_SG_TABLESIZE); +	data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE * +					PAGE_SIZE); + +	data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents, +					data->dma_dir); +	if (unlikely(!data->dma_nents)) { +		pr_err("Cmd: unable to dma map SGs %p\n", sg); +		return -EINVAL; +	} + +	pr_debug("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n", +		 isert_cmd, data->dma_nents, data->sg, data->nents, data->len); + +	return 0; +} + +static void +isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + +	ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir); +	memset(data, 0, sizeof(*data)); +} + + + +static void +isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) +{ +	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + +	pr_debug("isert_unmap_cmd: %p\n", isert_cmd); + +	if (wr->data.sg) { +		pr_debug("isert_unmap_cmd: %p unmap_sg op\n", isert_cmd); +		isert_unmap_data_buf(isert_conn, &wr->data); +	} + +	if (wr->send_wr) { +		pr_debug("isert_unmap_cmd: %p free send_wr\n", isert_cmd); +		kfree(wr->send_wr); +		wr->send_wr = NULL; +	} + +	if (wr->ib_sge) { +		pr_debug("isert_unmap_cmd: %p free ib_sge\n", isert_cmd); +		kfree(wr->ib_sge); +		wr->ib_sge = NULL; +	} +} + +static void +isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) +{ +	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; +	LIST_HEAD(unmap_list); + +	pr_debug("unreg_fastreg_cmd: %p\n", isert_cmd); + +	if (wr->fr_desc) { +		pr_debug("unreg_fastreg_cmd: %p free fr_desc %p\n", +			 isert_cmd, wr->fr_desc); +		if (wr->fr_desc->ind & ISERT_PROTECTED) { +			isert_unmap_data_buf(isert_conn, &wr->prot); +			wr->fr_desc->ind &= ~ISERT_PROTECTED; +		} +		
spin_lock_bh(&isert_conn->conn_lock); +		list_add_tail(&wr->fr_desc->list, &isert_conn->conn_fr_pool); +		spin_unlock_bh(&isert_conn->conn_lock); +		wr->fr_desc = NULL; +	} + +	if (wr->data.sg) { +		pr_debug("unreg_fastreg_cmd: %p unmap_sg op\n", isert_cmd); +		isert_unmap_data_buf(isert_conn, &wr->data); +	} + +	wr->ib_sge = NULL; +	wr->send_wr = NULL; +} + +static void +isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) +{ +	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; +	struct isert_conn *isert_conn = isert_cmd->conn; +	struct iscsi_conn *conn = isert_conn->conn; +	struct isert_device *device = isert_conn->conn_device; + +	pr_debug("Entering isert_put_cmd: %p\n", isert_cmd); + +	switch (cmd->iscsi_opcode) { +	case ISCSI_OP_SCSI_CMD: +		spin_lock_bh(&conn->cmd_lock); +		if (!list_empty(&cmd->i_conn_node)) +			list_del_init(&cmd->i_conn_node); +		spin_unlock_bh(&conn->cmd_lock); + +		if (cmd->data_direction == DMA_TO_DEVICE) { +			iscsit_stop_dataout_timer(cmd); +			/* +			 * Check for special case during comp_err where +			 * WRITE_PENDING has been handed off from core, +			 * but requires an extra target_put_sess_cmd() +			 * before transport_generic_free_cmd() below. +			 */ +			if (comp_err && +			    cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) { +				struct se_cmd *se_cmd = &cmd->se_cmd; + +				target_put_sess_cmd(se_cmd->se_sess, se_cmd); +			} +		} + +		device->unreg_rdma_mem(isert_cmd, isert_conn); +		transport_generic_free_cmd(&cmd->se_cmd, 0); +		break; +	case ISCSI_OP_SCSI_TMFUNC: +		spin_lock_bh(&conn->cmd_lock); +		if (!list_empty(&cmd->i_conn_node)) +			list_del_init(&cmd->i_conn_node); +		spin_unlock_bh(&conn->cmd_lock); + +		transport_generic_free_cmd(&cmd->se_cmd, 0); +		break; +	case ISCSI_OP_REJECT: +	case ISCSI_OP_NOOP_OUT: +	case ISCSI_OP_TEXT: +		spin_lock_bh(&conn->cmd_lock); +		if (!list_empty(&cmd->i_conn_node)) +			list_del_init(&cmd->i_conn_node); +		spin_unlock_bh(&conn->cmd_lock); + +		/* +		 * Handle special case for REJECT when iscsi_add_reject*() has +		 * overwritten the original iscsi_opcode assignment, and the +		 * associated cmd->se_cmd needs to be released. 
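+		 * In that case se_cmd.se_tfo is non-NULL, so the command is
+		 * released via transport_generic_free_cmd() below instead of
+		 * falling through to iscsit_release_cmd().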
+		 */ +		if (cmd->se_cmd.se_tfo != NULL) { +			pr_debug("Calling transport_generic_free_cmd from" +				 " isert_put_cmd for 0x%02x\n", +				 cmd->iscsi_opcode); +			transport_generic_free_cmd(&cmd->se_cmd, 0); +			break; +		} +		/* +		 * Fall-through +		 */ +	default: +		iscsit_release_cmd(cmd); +		break; +	} +} + +static void +isert_unmap_tx_desc(struct iser_tx_desc *tx_desc, struct ib_device *ib_dev) +{ +	if (tx_desc->dma_addr != 0) { +		pr_debug("Calling ib_dma_unmap_single for tx_desc->dma_addr\n"); +		ib_dma_unmap_single(ib_dev, tx_desc->dma_addr, +				    ISER_HEADERS_LEN, DMA_TO_DEVICE); +		tx_desc->dma_addr = 0; +	} +} + +static void +isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd, +		     struct ib_device *ib_dev, bool comp_err) +{ +	if (isert_cmd->pdu_buf_dma != 0) { +		pr_debug("Calling ib_dma_unmap_single for isert_cmd->pdu_buf_dma\n"); +		ib_dma_unmap_single(ib_dev, isert_cmd->pdu_buf_dma, +				    isert_cmd->pdu_buf_len, DMA_TO_DEVICE); +		isert_cmd->pdu_buf_dma = 0; +	} + +	isert_unmap_tx_desc(tx_desc, ib_dev); +	isert_put_cmd(isert_cmd, comp_err); +} + +static int +isert_check_pi_status(struct se_cmd *se_cmd, struct ib_mr *sig_mr) +{ +	struct ib_mr_status mr_status; +	int ret; + +	ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); +	if (ret) { +		pr_err("ib_check_mr_status failed, ret %d\n", ret); +		goto fail_mr_status; +	} + +	if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { +		u64 sec_offset_err; +		u32 block_size = se_cmd->se_dev->dev_attrib.block_size + 8; + +		switch (mr_status.sig_err.err_type) { +		case IB_SIG_BAD_GUARD: +			se_cmd->pi_err = TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED; +			break; +		case IB_SIG_BAD_REFTAG: +			se_cmd->pi_err = TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; +			break; +		case IB_SIG_BAD_APPTAG: +			se_cmd->pi_err = TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; +			break; +		} +		sec_offset_err = mr_status.sig_err.sig_err_offset; +		do_div(sec_offset_err, block_size); +		se_cmd->bad_sector = sec_offset_err + se_cmd->t_task_lba; + +		pr_err("isert: PI error found type %d at sector 0x%llx " +		       "expected 0x%x vs actual 0x%x\n", +		       mr_status.sig_err.err_type, +		       (unsigned long long)se_cmd->bad_sector, +		       mr_status.sig_err.expected, +		       mr_status.sig_err.actual); +		ret = 1; +	} + +fail_mr_status: +	return ret; +} + +static void +isert_completion_rdma_write(struct iser_tx_desc *tx_desc, +			    struct isert_cmd *isert_cmd) +{ +	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; +	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; +	struct se_cmd *se_cmd = &cmd->se_cmd; +	struct isert_conn *isert_conn = isert_cmd->conn; +	struct isert_device *device = isert_conn->conn_device; +	int ret = 0; + +	if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) { +		ret = isert_check_pi_status(se_cmd, +					    wr->fr_desc->pi_ctx->sig_mr); +		wr->fr_desc->ind &= ~ISERT_PROTECTED; +	} + +	device->unreg_rdma_mem(isert_cmd, isert_conn); +	wr->send_wr_num = 0; +	if (ret) +		transport_send_check_condition_and_sense(se_cmd, +							 se_cmd->pi_err, 0); +	else +		isert_put_response(isert_conn->conn, cmd); +} + +static void +isert_completion_rdma_read(struct iser_tx_desc *tx_desc, +			   struct isert_cmd *isert_cmd) +{ +	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; +	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; +	struct se_cmd *se_cmd = &cmd->se_cmd; +	struct isert_conn *isert_conn = isert_cmd->conn; +	struct isert_device *device = isert_conn->conn_device; +	int ret = 0; + +	if (wr->fr_desc && 
wr->fr_desc->ind & ISERT_PROTECTED) { +		ret = isert_check_pi_status(se_cmd, +					    wr->fr_desc->pi_ctx->sig_mr); +		wr->fr_desc->ind &= ~ISERT_PROTECTED; +	} + +	iscsit_stop_dataout_timer(cmd); +	device->unreg_rdma_mem(isert_cmd, isert_conn); +	cmd->write_data_done = wr->data.len; +	wr->send_wr_num = 0; + +	pr_debug("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd); +	spin_lock_bh(&cmd->istate_lock); +	cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; +	cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; +	spin_unlock_bh(&cmd->istate_lock); + +	if (ret) +		transport_send_check_condition_and_sense(se_cmd, +							 se_cmd->pi_err, 0); +	else +		target_execute_cmd(se_cmd); +} + +static void +isert_do_control_comp(struct work_struct *work) +{ +	struct isert_cmd *isert_cmd = container_of(work, +			struct isert_cmd, comp_work); +	struct isert_conn *isert_conn = isert_cmd->conn; +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + +	switch (cmd->i_state) { +	case ISTATE_SEND_TASKMGTRSP: +		pr_debug("Calling iscsit_tmr_post_handler >>>>>>>>>>>>>>>>>\n"); + +		atomic_dec(&isert_conn->post_send_buf_count); +		iscsit_tmr_post_handler(cmd, cmd->conn); + +		cmd->i_state = ISTATE_SENT_STATUS; +		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); +		break; +	case ISTATE_SEND_REJECT: +		pr_debug("Got isert_do_control_comp ISTATE_SEND_REJECT: >>>\n"); +		atomic_dec(&isert_conn->post_send_buf_count); + +		cmd->i_state = ISTATE_SENT_STATUS; +		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); +		break; +	case ISTATE_SEND_LOGOUTRSP: +		pr_debug("Calling iscsit_logout_post_handler >>>>>>>>>>>>>>\n"); + +		atomic_dec(&isert_conn->post_send_buf_count); +		iscsit_logout_post_handler(cmd, cmd->conn); +		break; +	case ISTATE_SEND_TEXTRSP: +		atomic_dec(&isert_conn->post_send_buf_count); +		cmd->i_state = ISTATE_SENT_STATUS; +		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); +		break; +	default: +		pr_err("Unknown do_control_comp i_state %d\n", cmd->i_state); +		dump_stack(); +		break; +	} +} + +static void +isert_response_completion(struct iser_tx_desc *tx_desc, +			  struct isert_cmd *isert_cmd, +			  struct isert_conn *isert_conn, +			  struct ib_device *ib_dev) +{ +	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; +	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + +	if (cmd->i_state == ISTATE_SEND_TASKMGTRSP || +	    cmd->i_state == ISTATE_SEND_LOGOUTRSP || +	    cmd->i_state == ISTATE_SEND_REJECT || +	    cmd->i_state == ISTATE_SEND_TEXTRSP) { +		isert_unmap_tx_desc(tx_desc, ib_dev); + +		INIT_WORK(&isert_cmd->comp_work, isert_do_control_comp); +		queue_work(isert_comp_wq, &isert_cmd->comp_work); +		return; +	} + +	/** +	 * If send_wr_num is 0 this means that we got +	 * RDMA completion and we cleared it and we should +	 * simply decrement the response post. else the +	 * response is incorporated in send_wr_num, just +	 * sub it. 
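+	 * (isert_completion_rdma_write()/_read() reset send_wr_num to 0
+	 * once the RDMA portion has been accounted for.)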
+	 **/ +	if (wr->send_wr_num) +		atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); +	else +		atomic_dec(&isert_conn->post_send_buf_count); + +	cmd->i_state = ISTATE_SENT_STATUS; +	isert_completion_put(tx_desc, isert_cmd, ib_dev, false); +} + +static void +__isert_send_completion(struct iser_tx_desc *tx_desc, +		        struct isert_conn *isert_conn) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct isert_cmd *isert_cmd = tx_desc->isert_cmd; +	struct isert_rdma_wr *wr; + +	if (!isert_cmd) { +		atomic_dec(&isert_conn->post_send_buf_count); +		isert_unmap_tx_desc(tx_desc, ib_dev); +		return; +	} +	wr = &isert_cmd->rdma_wr; + +	switch (wr->iser_ib_op) { +	case ISER_IB_RECV: +		pr_err("isert_send_completion: Got ISER_IB_RECV\n"); +		dump_stack(); +		break; +	case ISER_IB_SEND: +		pr_debug("isert_send_completion: Got ISER_IB_SEND\n"); +		isert_response_completion(tx_desc, isert_cmd, +					  isert_conn, ib_dev); +		break; +	case ISER_IB_RDMA_WRITE: +		pr_debug("isert_send_completion: Got ISER_IB_RDMA_WRITE\n"); +		atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); +		isert_completion_rdma_write(tx_desc, isert_cmd); +		break; +	case ISER_IB_RDMA_READ: +		pr_debug("isert_send_completion: Got ISER_IB_RDMA_READ:\n"); + +		atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); +		isert_completion_rdma_read(tx_desc, isert_cmd); +		break; +	default: +		pr_err("Unknown wr->iser_ib_op: 0x%02x\n", wr->iser_ib_op); +		dump_stack(); +		break; +	} +} + +static void +isert_send_completion(struct iser_tx_desc *tx_desc, +		      struct isert_conn *isert_conn) +{ +	struct llist_node *llnode = tx_desc->comp_llnode_batch; +	struct iser_tx_desc *t; +	/* +	 * Drain coalesced completion llist starting from comp_llnode_batch +	 * setup in isert_init_send_wr(), and then complete trailing tx_desc. +	 */ +	while (llnode) { +		t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); +		llnode = llist_next(llnode); +		__isert_send_completion(t, isert_conn); +	} +	__isert_send_completion(tx_desc, isert_conn); +} + +static void +isert_cq_drain_comp_llist(struct isert_conn *isert_conn, struct ib_device *ib_dev) +{ +	struct llist_node *llnode; +	struct isert_rdma_wr *wr; +	struct iser_tx_desc *t; + +	mutex_lock(&isert_conn->conn_mutex); +	llnode = llist_del_all(&isert_conn->conn_comp_llist); +	isert_conn->conn_comp_batch = 0; +	mutex_unlock(&isert_conn->conn_mutex); + +	while (llnode) { +		t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); +		llnode = llist_next(llnode); +		wr = &t->isert_cmd->rdma_wr; + +		/** +		 * If send_wr_num is 0 this means that we got +		 * RDMA completion and we cleared it and we should +		 * simply decrement the response post. else the +		 * response is incorporated in send_wr_num, just +		 * sub it. 
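+		 * These tx descriptors were left unsignaled by the completion
+		 * coalescing in isert_init_send_wr(), so no TX completion will
+		 * arrive for them; release them here with comp_err set.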
+		 **/ +		if (wr->send_wr_num) +			atomic_sub(wr->send_wr_num, +				   &isert_conn->post_send_buf_count); +		else +			atomic_dec(&isert_conn->post_send_buf_count); + +		isert_completion_put(t, t->isert_cmd, ib_dev, true); +	} +} + +static void +isert_cq_tx_comp_err(struct iser_tx_desc *tx_desc, struct isert_conn *isert_conn) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct isert_cmd *isert_cmd = tx_desc->isert_cmd; +	struct llist_node *llnode = tx_desc->comp_llnode_batch; +	struct isert_rdma_wr *wr; +	struct iser_tx_desc *t; + +	while (llnode) { +		t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); +		llnode = llist_next(llnode); +		wr = &t->isert_cmd->rdma_wr; + +		/** +		 * If send_wr_num is 0 this means that we got +		 * RDMA completion and we cleared it and we should +		 * simply decrement the response post. else the +		 * response is incorporated in send_wr_num, just +		 * sub it. +		 **/ +		if (wr->send_wr_num) +			atomic_sub(wr->send_wr_num, +				   &isert_conn->post_send_buf_count); +		else +			atomic_dec(&isert_conn->post_send_buf_count); + +		isert_completion_put(t, t->isert_cmd, ib_dev, true); +	} +	tx_desc->comp_llnode_batch = NULL; + +	if (!isert_cmd) +		isert_unmap_tx_desc(tx_desc, ib_dev); +	else +		isert_completion_put(tx_desc, isert_cmd, ib_dev, true); +} + +static void +isert_cq_rx_comp_err(struct isert_conn *isert_conn) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct iscsi_conn *conn = isert_conn->conn; + +	if (isert_conn->post_recv_buf_count) +		return; + +	isert_cq_drain_comp_llist(isert_conn, ib_dev); + +	if (conn->sess) { +		target_sess_cmd_list_set_waiting(conn->sess->se_sess); +		target_wait_for_sess_cmds(conn->sess->se_sess); +	} + +	while (atomic_read(&isert_conn->post_send_buf_count)) +		msleep(3000); + +	mutex_lock(&isert_conn->conn_mutex); +	isert_conn->state = ISER_CONN_DOWN; +	mutex_unlock(&isert_conn->conn_mutex); + +	iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + +	complete(&isert_conn->conn_wait_comp_err); +} + +static void +isert_cq_tx_work(struct work_struct *work) +{ +	struct isert_cq_desc *cq_desc = container_of(work, +				struct isert_cq_desc, cq_tx_work); +	struct isert_device *device = cq_desc->device; +	int cq_index = cq_desc->cq_index; +	struct ib_cq *tx_cq = device->dev_tx_cq[cq_index]; +	struct isert_conn *isert_conn; +	struct iser_tx_desc *tx_desc; +	struct ib_wc wc; + +	while (ib_poll_cq(tx_cq, 1, &wc) == 1) { +		tx_desc = (struct iser_tx_desc *)(unsigned long)wc.wr_id; +		isert_conn = wc.qp->qp_context; + +		if (wc.status == IB_WC_SUCCESS) { +			isert_send_completion(tx_desc, isert_conn); +		} else { +			pr_debug("TX wc.status != IB_WC_SUCCESS >>>>>>>>>>>>>>\n"); +			pr_debug("TX wc.status: 0x%08x\n", wc.status); +			pr_debug("TX wc.vendor_err: 0x%08x\n", wc.vendor_err); + +			if (wc.wr_id != ISER_FASTREG_LI_WRID) { +				if (tx_desc->llnode_active) +					continue; + +				atomic_dec(&isert_conn->post_send_buf_count); +				isert_cq_tx_comp_err(tx_desc, isert_conn); +			} +		} +	} + +	ib_req_notify_cq(tx_cq, IB_CQ_NEXT_COMP); +} + +static void +isert_cq_tx_callback(struct ib_cq *cq, void *context) +{ +	struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; + +	queue_work(isert_comp_wq, &cq_desc->cq_tx_work); +} + +static void +isert_cq_rx_work(struct work_struct *work) +{ +	struct isert_cq_desc *cq_desc = container_of(work, +			struct isert_cq_desc, cq_rx_work); +	struct isert_device *device = cq_desc->device; +	int cq_index = cq_desc->cq_index; +	struct ib_cq 
*rx_cq = device->dev_rx_cq[cq_index]; +	struct isert_conn *isert_conn; +	struct iser_rx_desc *rx_desc; +	struct ib_wc wc; +	unsigned long xfer_len; + +	while (ib_poll_cq(rx_cq, 1, &wc) == 1) { +		rx_desc = (struct iser_rx_desc *)(unsigned long)wc.wr_id; +		isert_conn = wc.qp->qp_context; + +		if (wc.status == IB_WC_SUCCESS) { +			xfer_len = (unsigned long)wc.byte_len; +			isert_rx_completion(rx_desc, isert_conn, xfer_len); +		} else { +			pr_debug("RX wc.status != IB_WC_SUCCESS >>>>>>>>>>>>>>\n"); +			if (wc.status != IB_WC_WR_FLUSH_ERR) { +				pr_debug("RX wc.status: 0x%08x\n", wc.status); +				pr_debug("RX wc.vendor_err: 0x%08x\n", +					 wc.vendor_err); +			} +			isert_conn->post_recv_buf_count--; +			isert_cq_rx_comp_err(isert_conn); +		} +	} + +	ib_req_notify_cq(rx_cq, IB_CQ_NEXT_COMP); +} + +static void +isert_cq_rx_callback(struct ib_cq *cq, void *context) +{ +	struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; + +	queue_work(isert_rx_wq, &cq_desc->cq_rx_work); +} + +static int +isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd) +{ +	struct ib_send_wr *wr_failed; +	int ret; + +	atomic_inc(&isert_conn->post_send_buf_count); + +	ret = ib_post_send(isert_conn->conn_qp, &isert_cmd->tx_desc.send_wr, +			   &wr_failed); +	if (ret) { +		pr_err("ib_post_send failed with %d\n", ret); +		atomic_dec(&isert_conn->post_send_buf_count); +		return ret; +	} +	return ret; +} + +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ +	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; +	struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *) +				&isert_cmd->tx_desc.iscsi_header; + +	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); +	iscsit_build_rsp_pdu(cmd, conn, true, hdr); +	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); +	/* +	 * Attach SENSE DATA payload to iSCSI Response PDU +	 */ +	if (cmd->se_cmd.sense_buffer && +	    ((cmd->se_cmd.se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || +	    (cmd->se_cmd.se_cmd_flags & SCF_EMULATED_TASK_SENSE))) { +		struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +		struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; +		u32 padding, pdu_len; + +		put_unaligned_be16(cmd->se_cmd.scsi_sense_length, +				   cmd->sense_buffer); +		cmd->se_cmd.scsi_sense_length += sizeof(__be16); + +		padding = -(cmd->se_cmd.scsi_sense_length) & 3; +		hton24(hdr->dlength, (u32)cmd->se_cmd.scsi_sense_length); +		pdu_len = cmd->se_cmd.scsi_sense_length + padding; + +		isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, +				(void *)cmd->sense_buffer, pdu_len, +				DMA_TO_DEVICE); + +		isert_cmd->pdu_buf_len = pdu_len; +		tx_dsg->addr	= isert_cmd->pdu_buf_dma; +		tx_dsg->length	= pdu_len; +		tx_dsg->lkey	= isert_conn->conn_mr->lkey; +		isert_cmd->tx_desc.num_sge = 2; +	} + +	isert_init_send_wr(isert_conn, isert_cmd, send_wr, true); + +	pr_debug("Posting SCSI Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + +	return isert_post_response(isert_conn, isert_cmd); +} + +static void +isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ +	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct isert_device *device = isert_conn->conn_device; + +	spin_lock_bh(&conn->cmd_lock); +	if (!list_empty(&cmd->i_conn_node)) +		list_del_init(&cmd->i_conn_node); +	
spin_unlock_bh(&conn->cmd_lock); + +	if (cmd->data_direction == DMA_TO_DEVICE) +		iscsit_stop_dataout_timer(cmd); + +	device->unreg_rdma_mem(isert_cmd, isert_conn); +} + +static enum target_prot_op +isert_get_sup_prot_ops(struct iscsi_conn *conn) +{ +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct isert_device *device = isert_conn->conn_device; + +	if (device->pi_capable) +		return TARGET_PROT_ALL; + +	return TARGET_PROT_NORMAL; +} + +static int +isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn, +		bool nopout_response) +{ +	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + +	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); +	iscsit_build_nopin_rsp(cmd, conn, (struct iscsi_nopin *) +			       &isert_cmd->tx_desc.iscsi_header, +			       nopout_response); +	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); +	isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + +	pr_debug("Posting NOPIN Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + +	return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ +	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + +	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); +	iscsit_build_logout_rsp(cmd, conn, (struct iscsi_logout_rsp *) +				&isert_cmd->tx_desc.iscsi_header); +	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); +	isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + +	pr_debug("Posting Logout Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + +	return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ +	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + +	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); +	iscsit_build_task_mgt_rsp(cmd, conn, (struct iscsi_tm_rsp *) +				  &isert_cmd->tx_desc.iscsi_header); +	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); +	isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + +	pr_debug("Posting Task Management Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + +	return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ +	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; +	struct iscsi_reject *hdr = +		(struct iscsi_reject *)&isert_cmd->tx_desc.iscsi_header; + +	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); +	iscsit_build_reject(cmd, conn, hdr); +	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + +	hton24(hdr->dlength, ISCSI_HDR_LEN); +	isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, +			(void *)cmd->buf_ptr, ISCSI_HDR_LEN, +			DMA_TO_DEVICE); +	isert_cmd->pdu_buf_len = ISCSI_HDR_LEN; +	tx_dsg->addr	= isert_cmd->pdu_buf_dma; +	tx_dsg->length	= ISCSI_HDR_LEN; +	tx_dsg->lkey	= 
isert_conn->conn_mr->lkey; +	isert_cmd->tx_desc.num_sge = 2; + +	isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + +	pr_debug("Posting Reject IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + +	return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ +	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; +	struct iscsi_text_rsp *hdr = +		(struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header; +	u32 txt_rsp_len; +	int rc; + +	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); +	rc = iscsit_build_text_rsp(cmd, conn, hdr, ISCSI_INFINIBAND); +	if (rc < 0) +		return rc; + +	txt_rsp_len = rc; +	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + +	if (txt_rsp_len) { +		struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +		struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; +		void *txt_rsp_buf = cmd->buf_ptr; + +		isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, +				txt_rsp_buf, txt_rsp_len, DMA_TO_DEVICE); + +		isert_cmd->pdu_buf_len = txt_rsp_len; +		tx_dsg->addr	= isert_cmd->pdu_buf_dma; +		tx_dsg->length	= txt_rsp_len; +		tx_dsg->lkey	= isert_conn->conn_mr->lkey; +		isert_cmd->tx_desc.num_sge = 2; +	} +	isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + +	pr_debug("Posting Text Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + +	return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, +		    struct ib_sge *ib_sge, struct ib_send_wr *send_wr, +		    u32 data_left, u32 offset) +{ +	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; +	struct scatterlist *sg_start, *tmp_sg; +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	u32 sg_off, page_off; +	int i = 0, sg_nents; + +	sg_off = offset / PAGE_SIZE; +	sg_start = &cmd->se_cmd.t_data_sg[sg_off]; +	sg_nents = min(cmd->se_cmd.t_data_nents - sg_off, isert_conn->max_sge); +	page_off = offset % PAGE_SIZE; + +	send_wr->sg_list = ib_sge; +	send_wr->num_sge = sg_nents; +	send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; +	/* +	 * Perform mapping of TCM scatterlist memory ib_sge dma_addr. 
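+	 * Each ib_sge length is clamped to the remaining data_left so the
+	 * last entry in the chain does not overrun the transfer length.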
+	 */ +	for_each_sg(sg_start, tmp_sg, sg_nents, i) { +		pr_debug("ISER RDMA from SGL dma_addr: 0x%16llx dma_len: %u, page_off: %u\n", +			 (unsigned long long)tmp_sg->dma_address, +			 tmp_sg->length, page_off); + +		ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off; +		ib_sge->length = min_t(u32, data_left, +				ib_sg_dma_len(ib_dev, tmp_sg) - page_off); +		ib_sge->lkey = isert_conn->conn_mr->lkey; + +		pr_debug("RDMA ib_sge: addr: 0x%16llx  length: %u lkey: %08x\n", +			 ib_sge->addr, ib_sge->length, ib_sge->lkey); +		page_off = 0; +		data_left -= ib_sge->length; +		ib_sge++; +		pr_debug("Incrementing ib_sge pointer to %p\n", ib_sge); +	} + +	pr_debug("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n", +		 send_wr->sg_list, send_wr->num_sge); + +	return sg_nents; +} + +static int +isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, +	       struct isert_rdma_wr *wr) +{ +	struct se_cmd *se_cmd = &cmd->se_cmd; +	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct isert_data_buf *data = &wr->data; +	struct ib_send_wr *send_wr; +	struct ib_sge *ib_sge; +	u32 offset, data_len, data_left, rdma_write_max, va_offset = 0; +	int ret = 0, i, ib_sge_cnt; + +	isert_cmd->tx_desc.isert_cmd = isert_cmd; + +	offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0; +	ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, +				 se_cmd->t_data_nents, se_cmd->data_length, +				 offset, wr->iser_ib_op, &wr->data); +	if (ret) +		return ret; + +	data_left = data->len; +	offset = data->offset; + +	ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL); +	if (!ib_sge) { +		pr_warn("Unable to allocate ib_sge\n"); +		ret = -ENOMEM; +		goto unmap_cmd; +	} +	wr->ib_sge = ib_sge; + +	wr->send_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge); +	wr->send_wr = kzalloc(sizeof(struct ib_send_wr) * wr->send_wr_num, +				GFP_KERNEL); +	if (!wr->send_wr) { +		pr_debug("Unable to allocate wr->send_wr\n"); +		ret = -ENOMEM; +		goto unmap_cmd; +	} + +	wr->isert_cmd = isert_cmd; +	rdma_write_max = isert_conn->max_sge * PAGE_SIZE; + +	for (i = 0; i < wr->send_wr_num; i++) { +		send_wr = &isert_cmd->rdma_wr.send_wr[i]; +		data_len = min(data_left, rdma_write_max); + +		send_wr->send_flags = 0; +		if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { +			send_wr->opcode = IB_WR_RDMA_WRITE; +			send_wr->wr.rdma.remote_addr = isert_cmd->read_va + offset; +			send_wr->wr.rdma.rkey = isert_cmd->read_stag; +			if (i + 1 == wr->send_wr_num) +				send_wr->next = &isert_cmd->tx_desc.send_wr; +			else +				send_wr->next = &wr->send_wr[i + 1]; +		} else { +			send_wr->opcode = IB_WR_RDMA_READ; +			send_wr->wr.rdma.remote_addr = isert_cmd->write_va + va_offset; +			send_wr->wr.rdma.rkey = isert_cmd->write_stag; +			if (i + 1 == wr->send_wr_num) +				send_wr->send_flags = IB_SEND_SIGNALED; +			else +				send_wr->next = &wr->send_wr[i + 1]; +		} + +		ib_sge_cnt = isert_build_rdma_wr(isert_conn, isert_cmd, ib_sge, +					send_wr, data_len, offset); +		ib_sge += ib_sge_cnt; + +		offset += data_len; +		va_offset += data_len; +		data_left -= data_len; +	} + +	return 0; +unmap_cmd: +	isert_unmap_data_buf(isert_conn, data); + +	return ret; +} + +static int +isert_map_fr_pagelist(struct ib_device *ib_dev, +		      struct scatterlist *sg_start, int sg_nents, u64 *fr_pl) +{ +	u64 start_addr, end_addr, page, chunk_start = 0; +	struct scatterlist *tmp_sg; +	int i = 0, new_chunk, last_ent, n_pages; + +	n_pages = 0; +	
new_chunk = 1; +	last_ent = sg_nents - 1; +	for_each_sg(sg_start, tmp_sg, sg_nents, i) { +		start_addr = ib_sg_dma_address(ib_dev, tmp_sg); +		if (new_chunk) +			chunk_start = start_addr; +		end_addr = start_addr + ib_sg_dma_len(ib_dev, tmp_sg); + +		pr_debug("SGL[%d] dma_addr: 0x%16llx len: %u\n", +			 i, (unsigned long long)tmp_sg->dma_address, +			 tmp_sg->length); + +		if ((end_addr & ~PAGE_MASK) && i < last_ent) { +			new_chunk = 0; +			continue; +		} +		new_chunk = 1; + +		page = chunk_start & PAGE_MASK; +		do { +			fr_pl[n_pages++] = page; +			pr_debug("Mapped page_list[%d] page_addr: 0x%16llx\n", +				 n_pages - 1, page); +			page += PAGE_SIZE; +		} while (page < end_addr); +	} + +	return n_pages; +} + +static int +isert_fast_reg_mr(struct isert_conn *isert_conn, +		  struct fast_reg_descriptor *fr_desc, +		  struct isert_data_buf *mem, +		  enum isert_indicator ind, +		  struct ib_sge *sge) +{ +	struct ib_device *ib_dev = isert_conn->conn_cm_id->device; +	struct ib_mr *mr; +	struct ib_fast_reg_page_list *frpl; +	struct ib_send_wr fr_wr, inv_wr; +	struct ib_send_wr *bad_wr, *wr = NULL; +	int ret, pagelist_len; +	u32 page_off; +	u8 key; + +	if (mem->dma_nents == 1) { +		sge->lkey = isert_conn->conn_mr->lkey; +		sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]); +		sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]); +		pr_debug("%s:%d sge: addr: 0x%llx  length: %u lkey: %x\n", +			 __func__, __LINE__, sge->addr, sge->length, +			 sge->lkey); +		return 0; +	} + +	if (ind == ISERT_DATA_KEY_VALID) { +		/* Registering data buffer */ +		mr = fr_desc->data_mr; +		frpl = fr_desc->data_frpl; +	} else { +		/* Registering protection buffer */ +		mr = fr_desc->pi_ctx->prot_mr; +		frpl = fr_desc->pi_ctx->prot_frpl; +	} + +	page_off = mem->offset % PAGE_SIZE; + +	pr_debug("Use fr_desc %p sg_nents %d offset %u\n", +		 fr_desc, mem->nents, mem->offset); + +	pagelist_len = isert_map_fr_pagelist(ib_dev, mem->sg, mem->nents, +					     &frpl->page_list[0]); + +	if (!(fr_desc->ind & ISERT_DATA_KEY_VALID)) { +		memset(&inv_wr, 0, sizeof(inv_wr)); +		inv_wr.wr_id = ISER_FASTREG_LI_WRID; +		inv_wr.opcode = IB_WR_LOCAL_INV; +		inv_wr.ex.invalidate_rkey = mr->rkey; +		wr = &inv_wr; +		/* Bump the key */ +		key = (u8)(mr->rkey & 0x000000FF); +		ib_update_fast_reg_key(mr, ++key); +	} + +	/* Prepare FASTREG WR */ +	memset(&fr_wr, 0, sizeof(fr_wr)); +	fr_wr.wr_id = ISER_FASTREG_LI_WRID; +	fr_wr.opcode = IB_WR_FAST_REG_MR; +	fr_wr.wr.fast_reg.iova_start = frpl->page_list[0] + page_off; +	fr_wr.wr.fast_reg.page_list = frpl; +	fr_wr.wr.fast_reg.page_list_len = pagelist_len; +	fr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; +	fr_wr.wr.fast_reg.length = mem->len; +	fr_wr.wr.fast_reg.rkey = mr->rkey; +	fr_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE; + +	if (!wr) +		wr = &fr_wr; +	else +		wr->next = &fr_wr; + +	ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr); +	if (ret) { +		pr_err("fast registration failed, ret:%d\n", ret); +		return ret; +	} +	fr_desc->ind &= ~ind; + +	sge->lkey = mr->lkey; +	sge->addr = frpl->page_list[0] + page_off; +	sge->length = mem->len; + +	pr_debug("%s:%d sge: addr: 0x%llx  length: %u lkey: %x\n", +		 __func__, __LINE__, sge->addr, sge->length, +		 sge->lkey); + +	return ret; +} + +static inline enum ib_t10_dif_type +se2ib_prot_type(enum target_prot_type prot_type) +{ +	switch (prot_type) { +	case TARGET_DIF_TYPE0_PROT: +		return IB_T10DIF_NONE; +	case TARGET_DIF_TYPE1_PROT: +		return IB_T10DIF_TYPE1; +	case TARGET_DIF_TYPE2_PROT: +		return IB_T10DIF_TYPE2; +	case 
TARGET_DIF_TYPE3_PROT: +		return IB_T10DIF_TYPE3; +	default: +		return IB_T10DIF_NONE; +	} +} + +static int +isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) +{ +	enum ib_t10_dif_type ib_prot_type = se2ib_prot_type(se_cmd->prot_type); + +	sig_attrs->mem.sig_type = IB_SIG_TYPE_T10_DIF; +	sig_attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF; +	sig_attrs->mem.sig.dif.pi_interval = +				se_cmd->se_dev->dev_attrib.block_size; +	sig_attrs->wire.sig.dif.pi_interval = +				se_cmd->se_dev->dev_attrib.block_size; + +	switch (se_cmd->prot_op) { +	case TARGET_PROT_DIN_INSERT: +	case TARGET_PROT_DOUT_STRIP: +		sig_attrs->mem.sig.dif.type = IB_T10DIF_NONE; +		sig_attrs->wire.sig.dif.type = ib_prot_type; +		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; +		sig_attrs->wire.sig.dif.ref_tag = se_cmd->reftag_seed; +		break; +	case TARGET_PROT_DOUT_INSERT: +	case TARGET_PROT_DIN_STRIP: +		sig_attrs->mem.sig.dif.type = ib_prot_type; +		sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; +		sig_attrs->mem.sig.dif.ref_tag = se_cmd->reftag_seed; +		sig_attrs->wire.sig.dif.type = IB_T10DIF_NONE; +		break; +	case TARGET_PROT_DIN_PASS: +	case TARGET_PROT_DOUT_PASS: +		sig_attrs->mem.sig.dif.type = ib_prot_type; +		sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; +		sig_attrs->mem.sig.dif.ref_tag = se_cmd->reftag_seed; +		sig_attrs->wire.sig.dif.type = ib_prot_type; +		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; +		sig_attrs->wire.sig.dif.ref_tag = se_cmd->reftag_seed; +		break; +	default: +		pr_err("Unsupported PI operation %d\n", se_cmd->prot_op); +		return -EINVAL; +	} + +	return 0; +} + +static inline u8 +isert_set_prot_checks(u8 prot_checks) +{ +	return (prot_checks & TARGET_DIF_CHECK_GUARD  ? 0xc0 : 0) | +	       (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | +	       (prot_checks & TARGET_DIF_CHECK_REFTAG ? 
0x0f : 0); +} + +static int +isert_reg_sig_mr(struct isert_conn *isert_conn, struct se_cmd *se_cmd, +		 struct fast_reg_descriptor *fr_desc, +		 struct ib_sge *data_sge, struct ib_sge *prot_sge, +		 struct ib_sge *sig_sge) +{ +	struct ib_send_wr sig_wr, inv_wr; +	struct ib_send_wr *bad_wr, *wr = NULL; +	struct pi_context *pi_ctx = fr_desc->pi_ctx; +	struct ib_sig_attrs sig_attrs; +	int ret; +	u32 key; + +	memset(&sig_attrs, 0, sizeof(sig_attrs)); +	ret = isert_set_sig_attrs(se_cmd, &sig_attrs); +	if (ret) +		goto err; + +	sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks); + +	if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) { +		memset(&inv_wr, 0, sizeof(inv_wr)); +		inv_wr.opcode = IB_WR_LOCAL_INV; +		inv_wr.wr_id = ISER_FASTREG_LI_WRID; +		inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; +		wr = &inv_wr; +		/* Bump the key */ +		key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); +		ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); +	} + +	memset(&sig_wr, 0, sizeof(sig_wr)); +	sig_wr.opcode = IB_WR_REG_SIG_MR; +	sig_wr.wr_id = ISER_FASTREG_LI_WRID; +	sig_wr.sg_list = data_sge; +	sig_wr.num_sge = 1; +	sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE; +	sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; +	sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; +	if (se_cmd->t_prot_sg) +		sig_wr.wr.sig_handover.prot = prot_sge; + +	if (!wr) +		wr = &sig_wr; +	else +		wr->next = &sig_wr; + +	ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr); +	if (ret) { +		pr_err("fast registration failed, ret:%d\n", ret); +		goto err; +	} +	fr_desc->ind &= ~ISERT_SIG_KEY_VALID; + +	sig_sge->lkey = pi_ctx->sig_mr->lkey; +	sig_sge->addr = 0; +	sig_sge->length = se_cmd->data_length; +	if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP && +	    se_cmd->prot_op != TARGET_PROT_DOUT_INSERT) +		/* +		 * We have protection guards on the wire +		 * so we need to set a larget transfer +		 */ +		sig_sge->length += se_cmd->prot_length; + +	pr_debug("sig_sge: addr: 0x%llx  length: %u lkey: %x\n", +		 sig_sge->addr, sig_sge->length, +		 sig_sge->lkey); +err: +	return ret; +} + +static int +isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, +	       struct isert_rdma_wr *wr) +{ +	struct se_cmd *se_cmd = &cmd->se_cmd; +	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); +	struct isert_conn *isert_conn = conn->context; +	struct ib_sge data_sge; +	struct ib_send_wr *send_wr; +	struct fast_reg_descriptor *fr_desc = NULL; +	u32 offset; +	int ret = 0; +	unsigned long flags; + +	isert_cmd->tx_desc.isert_cmd = isert_cmd; + +	offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? 
cmd->write_data_done : 0; +	ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, +				 se_cmd->t_data_nents, se_cmd->data_length, +				 offset, wr->iser_ib_op, &wr->data); +	if (ret) +		return ret; + +	if (wr->data.dma_nents != 1 || +	    se_cmd->prot_op != TARGET_PROT_NORMAL) { +		spin_lock_irqsave(&isert_conn->conn_lock, flags); +		fr_desc = list_first_entry(&isert_conn->conn_fr_pool, +					   struct fast_reg_descriptor, list); +		list_del(&fr_desc->list); +		spin_unlock_irqrestore(&isert_conn->conn_lock, flags); +		wr->fr_desc = fr_desc; +	} + +	ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->data, +				ISERT_DATA_KEY_VALID, &data_sge); +	if (ret) +		goto unmap_cmd; + +	if (se_cmd->prot_op != TARGET_PROT_NORMAL) { +		struct ib_sge prot_sge, sig_sge; + +		if (se_cmd->t_prot_sg) { +			ret = isert_map_data_buf(isert_conn, isert_cmd, +						 se_cmd->t_prot_sg, +						 se_cmd->t_prot_nents, +						 se_cmd->prot_length, +						 0, wr->iser_ib_op, &wr->prot); +			if (ret) +				goto unmap_cmd; + +			ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->prot, +						ISERT_PROT_KEY_VALID, &prot_sge); +			if (ret) +				goto unmap_prot_cmd; +		} + +		ret = isert_reg_sig_mr(isert_conn, se_cmd, fr_desc, +				       &data_sge, &prot_sge, &sig_sge); +		if (ret) +			goto unmap_prot_cmd; + +		fr_desc->ind |= ISERT_PROTECTED; +		memcpy(&wr->s_ib_sge, &sig_sge, sizeof(sig_sge)); +	} else +		memcpy(&wr->s_ib_sge, &data_sge, sizeof(data_sge)); + +	wr->ib_sge = &wr->s_ib_sge; +	wr->send_wr_num = 1; +	memset(&wr->s_send_wr, 0, sizeof(*send_wr)); +	wr->send_wr = &wr->s_send_wr; +	wr->isert_cmd = isert_cmd; + +	send_wr = &isert_cmd->rdma_wr.s_send_wr; +	send_wr->sg_list = &wr->s_ib_sge; +	send_wr->num_sge = 1; +	send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; +	if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { +		send_wr->opcode = IB_WR_RDMA_WRITE; +		send_wr->wr.rdma.remote_addr = isert_cmd->read_va; +		send_wr->wr.rdma.rkey = isert_cmd->read_stag; +		send_wr->send_flags = se_cmd->prot_op == TARGET_PROT_NORMAL ? 
+				      0 : IB_SEND_SIGNALED; +	} else { +		send_wr->opcode = IB_WR_RDMA_READ; +		send_wr->wr.rdma.remote_addr = isert_cmd->write_va; +		send_wr->wr.rdma.rkey = isert_cmd->write_stag; +		send_wr->send_flags = IB_SEND_SIGNALED; +	} + +	return 0; +unmap_prot_cmd: +	if (se_cmd->t_prot_sg) +		isert_unmap_data_buf(isert_conn, &wr->prot); +unmap_cmd: +	if (fr_desc) { +		spin_lock_irqsave(&isert_conn->conn_lock, flags); +		list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); +		spin_unlock_irqrestore(&isert_conn->conn_lock, flags); +	} +	isert_unmap_data_buf(isert_conn, &wr->data); + +	return ret; +} + +static int +isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ +	struct se_cmd *se_cmd = &cmd->se_cmd; +	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); +	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct isert_device *device = isert_conn->conn_device; +	struct ib_send_wr *wr_failed; +	int rc; + +	pr_debug("Cmd: %p RDMA_WRITE data_length: %u\n", +		 isert_cmd, se_cmd->data_length); +	wr->iser_ib_op = ISER_IB_RDMA_WRITE; +	rc = device->reg_rdma_mem(conn, cmd, wr); +	if (rc) { +		pr_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); +		return rc; +	} + +	if (se_cmd->prot_op == TARGET_PROT_NORMAL) { +		/* +		 * Build isert_conn->tx_desc for iSCSI response PDU and attach +		 */ +		isert_create_send_desc(isert_conn, isert_cmd, +				       &isert_cmd->tx_desc); +		iscsit_build_rsp_pdu(cmd, conn, true, (struct iscsi_scsi_rsp *) +				     &isert_cmd->tx_desc.iscsi_header); +		isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); +		isert_init_send_wr(isert_conn, isert_cmd, +				   &isert_cmd->tx_desc.send_wr, true); +		isert_cmd->rdma_wr.s_send_wr.next = &isert_cmd->tx_desc.send_wr; +		wr->send_wr_num += 1; +	} + +	atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count); + +	rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); +	if (rc) { +		pr_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n"); +		atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); +	} + +	if (se_cmd->prot_op == TARGET_PROT_NORMAL) +		pr_debug("Cmd: %p posted RDMA_WRITE + Response for iSER Data " +			 "READ\n", isert_cmd); +	else +		pr_debug("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", +			 isert_cmd); + +	return 1; +} + +static int +isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) +{ +	struct se_cmd *se_cmd = &cmd->se_cmd; +	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); +	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	struct isert_device *device = isert_conn->conn_device; +	struct ib_send_wr *wr_failed; +	int rc; + +	pr_debug("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n", +		 isert_cmd, se_cmd->data_length, cmd->write_data_done); +	wr->iser_ib_op = ISER_IB_RDMA_READ; +	rc = device->reg_rdma_mem(conn, cmd, wr); +	if (rc) { +		pr_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); +		return rc; +	} + +	atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count); + +	rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); +	if (rc) { +		pr_warn("ib_post_send() failed for IB_WR_RDMA_READ\n"); +		atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); +	} +	pr_debug("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n", +		 isert_cmd); + +	return 0; +} + +static int +isert_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ +	
int ret; + +	switch (state) { +	case ISTATE_SEND_NOPIN_WANT_RESPONSE: +		ret = isert_put_nopin(cmd, conn, false); +		break; +	default: +		pr_err("Unknown immediate state: 0x%02x\n", state); +		ret = -EINVAL; +		break; +	} + +	return ret; +} + +static int +isert_response_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ +	int ret; + +	switch (state) { +	case ISTATE_SEND_LOGOUTRSP: +		ret = isert_put_logout_rsp(cmd, conn); +		if (!ret) { +			pr_debug("Returning iSER Logout -EAGAIN\n"); +			ret = -EAGAIN; +		} +		break; +	case ISTATE_SEND_NOPIN: +		ret = isert_put_nopin(cmd, conn, true); +		break; +	case ISTATE_SEND_TASKMGTRSP: +		ret = isert_put_tm_rsp(cmd, conn); +		break; +	case ISTATE_SEND_REJECT: +		ret = isert_put_reject(cmd, conn); +		break; +	case ISTATE_SEND_TEXTRSP: +		ret = isert_put_text_rsp(cmd, conn); +		break; +	case ISTATE_SEND_STATUS: +		/* +		 * Special case for sending non GOOD SCSI status from TX thread +		 * context during pre se_cmd excecution failure. +		 */ +		ret = isert_put_response(conn, cmd); +		break; +	default: +		pr_err("Unknown response state: 0x%02x\n", state); +		ret = -EINVAL; +		break; +	} + +	return ret; +} + +static int +isert_setup_np(struct iscsi_np *np, +	       struct __kernel_sockaddr_storage *ksockaddr) +{ +	struct isert_np *isert_np; +	struct rdma_cm_id *isert_lid; +	struct sockaddr *sa; +	int ret; + +	isert_np = kzalloc(sizeof(struct isert_np), GFP_KERNEL); +	if (!isert_np) { +		pr_err("Unable to allocate struct isert_np\n"); +		return -ENOMEM; +	} +	sema_init(&isert_np->np_sem, 0); +	mutex_init(&isert_np->np_accept_mutex); +	INIT_LIST_HEAD(&isert_np->np_accept_list); +	init_completion(&isert_np->np_login_comp); + +	sa = (struct sockaddr *)ksockaddr; +	pr_debug("ksockaddr: %p, sa: %p\n", ksockaddr, sa); +	/* +	 * Setup the np->np_sockaddr from the passed sockaddr setup +	 * in iscsi_target_configfs.c code.. 
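+	 * The RDMA listener created below is then bound to the same
+	 * address/port as this iSCSI network portal.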
+	 */ +	memcpy(&np->np_sockaddr, ksockaddr, +	       sizeof(struct __kernel_sockaddr_storage)); + +	isert_lid = rdma_create_id(isert_cma_handler, np, RDMA_PS_TCP, +				IB_QPT_RC); +	if (IS_ERR(isert_lid)) { +		pr_err("rdma_create_id() for isert_listen_handler failed: %ld\n", +		       PTR_ERR(isert_lid)); +		ret = PTR_ERR(isert_lid); +		goto out; +	} + +	ret = rdma_bind_addr(isert_lid, sa); +	if (ret) { +		pr_err("rdma_bind_addr() for isert_lid failed: %d\n", ret); +		goto out_lid; +	} + +	ret = rdma_listen(isert_lid, ISERT_RDMA_LISTEN_BACKLOG); +	if (ret) { +		pr_err("rdma_listen() for isert_lid failed: %d\n", ret); +		goto out_lid; +	} + +	isert_np->np_cm_id = isert_lid; +	np->np_context = isert_np; +	pr_debug("Setup isert_lid->context: %p\n", isert_lid->context); + +	return 0; + +out_lid: +	rdma_destroy_id(isert_lid); +out: +	kfree(isert_np); +	return ret; +} + +static int +isert_rdma_accept(struct isert_conn *isert_conn) +{ +	struct rdma_cm_id *cm_id = isert_conn->conn_cm_id; +	struct rdma_conn_param cp; +	int ret; + +	memset(&cp, 0, sizeof(struct rdma_conn_param)); +	cp.responder_resources = isert_conn->responder_resources; +	cp.initiator_depth = isert_conn->initiator_depth; +	cp.retry_count = 7; +	cp.rnr_retry_count = 7; + +	pr_debug("Before rdma_accept >>>>>>>>>>>>>>>>>>>>.\n"); + +	ret = rdma_accept(cm_id, &cp); +	if (ret) { +		pr_err("rdma_accept() failed with: %d\n", ret); +		return ret; +	} + +	pr_debug("After rdma_accept >>>>>>>>>>>>>>>>>>>>>.\n"); + +	return 0; +} + +static int +isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login) +{ +	struct isert_conn *isert_conn = (struct isert_conn *)conn->context; +	int ret; + +	pr_debug("isert_get_login_rx before conn_login_comp conn: %p\n", conn); +	/* +	 * For login requests after the first PDU, isert_rx_login_req() will +	 * kick schedule_delayed_work(&conn->login_work) as the packet is +	 * received, which turns this callback from iscsi_target_do_login_rx() +	 * into a NOP. 
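+	 * Only the first login request blocks below until
+	 * isert_rx_login_req() completes conn_login_comp.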
+	 */ +	if (!login->first_request) +		return 0; + +	ret = wait_for_completion_interruptible(&isert_conn->conn_login_comp); +	if (ret) +		return ret; + +	pr_debug("isert_get_login_rx processing login->req: %p\n", login->req); +	return 0; +} + +static void +isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn, +		    struct isert_conn *isert_conn) +{ +	struct rdma_cm_id *cm_id = isert_conn->conn_cm_id; +	struct rdma_route *cm_route = &cm_id->route; +	struct sockaddr_in *sock_in; +	struct sockaddr_in6 *sock_in6; + +	conn->login_family = np->np_sockaddr.ss_family; + +	if (np->np_sockaddr.ss_family == AF_INET6) { +		sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.dst_addr; +		snprintf(conn->login_ip, sizeof(conn->login_ip), "%pI6c", +			 &sock_in6->sin6_addr.in6_u); +		conn->login_port = ntohs(sock_in6->sin6_port); + +		sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.src_addr; +		snprintf(conn->local_ip, sizeof(conn->local_ip), "%pI6c", +			 &sock_in6->sin6_addr.in6_u); +		conn->local_port = ntohs(sock_in6->sin6_port); +	} else { +		sock_in = (struct sockaddr_in *)&cm_route->addr.dst_addr; +		sprintf(conn->login_ip, "%pI4", +			&sock_in->sin_addr.s_addr); +		conn->login_port = ntohs(sock_in->sin_port); + +		sock_in = (struct sockaddr_in *)&cm_route->addr.src_addr; +		sprintf(conn->local_ip, "%pI4", +			&sock_in->sin_addr.s_addr); +		conn->local_port = ntohs(sock_in->sin_port); +	} +} + +static int +isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) +{ +	struct isert_np *isert_np = (struct isert_np *)np->np_context; +	struct isert_conn *isert_conn; +	int max_accept = 0, ret; + +accept_wait: +	ret = down_interruptible(&isert_np->np_sem); +	if (max_accept > 5) +		return -ENODEV; + +	spin_lock_bh(&np->np_thread_lock); +	if (np->np_thread_state >= ISCSI_NP_THREAD_RESET) { +		spin_unlock_bh(&np->np_thread_lock); +		pr_debug("np_thread_state %d for isert_accept_np\n", +			 np->np_thread_state); +		/** +		 * No point in stalling here when np_thread +		 * is in state RESET/SHUTDOWN/EXIT - bail +		 **/ +		return -ENODEV; +	} +	spin_unlock_bh(&np->np_thread_lock); + +	mutex_lock(&isert_np->np_accept_mutex); +	if (list_empty(&isert_np->np_accept_list)) { +		mutex_unlock(&isert_np->np_accept_mutex); +		max_accept++; +		goto accept_wait; +	} +	isert_conn = list_first_entry(&isert_np->np_accept_list, +			struct isert_conn, conn_accept_node); +	list_del_init(&isert_conn->conn_accept_node); +	mutex_unlock(&isert_np->np_accept_mutex); + +	conn->context = isert_conn; +	isert_conn->conn = conn; +	max_accept = 0; + +	ret = isert_rdma_post_recvl(isert_conn); +	if (ret) +		return ret; + +	ret = isert_rdma_accept(isert_conn); +	if (ret) +		return ret; + +	isert_set_conn_info(np, conn, isert_conn); + +	pr_debug("Processing isert_accept_np: isert_conn: %p\n", isert_conn); +	return 0; +} + +static void +isert_free_np(struct iscsi_np *np) +{ +	struct isert_np *isert_np = (struct isert_np *)np->np_context; + +	rdma_destroy_id(isert_np->np_cm_id); + +	np->np_context = NULL; +	kfree(isert_np); +} + +static void isert_wait_conn(struct iscsi_conn *conn) +{ +	struct isert_conn *isert_conn = conn->context; + +	pr_debug("isert_wait_conn: Starting \n"); + +	mutex_lock(&isert_conn->conn_mutex); +	if (isert_conn->conn_cm_id) { +		pr_debug("Calling rdma_disconnect from isert_wait_conn\n"); +		rdma_disconnect(isert_conn->conn_cm_id); +	} +	/* +	 * Only wait for conn_wait_comp_err if the isert_conn made it +	 * into full feature phase.. 
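+	 * A connection still in ISER_CONN_INIT never posted any work
+	 * requests, so there is nothing to drain and we can return early.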
+	 */ +	if (isert_conn->state == ISER_CONN_INIT) { +		mutex_unlock(&isert_conn->conn_mutex); +		return; +	} +	if (isert_conn->state == ISER_CONN_UP) +		isert_conn->state = ISER_CONN_TERMINATING; +	mutex_unlock(&isert_conn->conn_mutex); + +	wait_for_completion(&isert_conn->conn_wait_comp_err); + +	wait_for_completion(&isert_conn->conn_wait); +} + +static void isert_free_conn(struct iscsi_conn *conn) +{ +	struct isert_conn *isert_conn = conn->context; + +	isert_put_conn(isert_conn); +} + +static struct iscsit_transport iser_target_transport = { +	.name			= "IB/iSER", +	.transport_type		= ISCSI_INFINIBAND, +	.priv_size		= sizeof(struct isert_cmd), +	.owner			= THIS_MODULE, +	.iscsit_setup_np	= isert_setup_np, +	.iscsit_accept_np	= isert_accept_np, +	.iscsit_free_np		= isert_free_np, +	.iscsit_wait_conn	= isert_wait_conn, +	.iscsit_free_conn	= isert_free_conn, +	.iscsit_get_login_rx	= isert_get_login_rx, +	.iscsit_put_login_tx	= isert_put_login_tx, +	.iscsit_immediate_queue	= isert_immediate_queue, +	.iscsit_response_queue	= isert_response_queue, +	.iscsit_get_dataout	= isert_get_dataout, +	.iscsit_queue_data_in	= isert_put_datain, +	.iscsit_queue_status	= isert_put_response, +	.iscsit_aborted_task	= isert_aborted_task, +	.iscsit_get_sup_prot_ops = isert_get_sup_prot_ops, +}; + +static int __init isert_init(void) +{ +	int ret; + +	isert_rx_wq = alloc_workqueue("isert_rx_wq", 0, 0); +	if (!isert_rx_wq) { +		pr_err("Unable to allocate isert_rx_wq\n"); +		return -ENOMEM; +	} + +	isert_comp_wq = alloc_workqueue("isert_comp_wq", 0, 0); +	if (!isert_comp_wq) { +		pr_err("Unable to allocate isert_comp_wq\n"); +		ret = -ENOMEM; +		goto destroy_rx_wq; +	} + +	iscsit_register_transport(&iser_target_transport); +	pr_debug("iSER_TARGET[0] - Loaded iser_target_transport\n"); +	return 0; + +destroy_rx_wq: +	destroy_workqueue(isert_rx_wq); +	return ret; +} + +static void __exit isert_exit(void) +{ +	flush_scheduled_work(); +	destroy_workqueue(isert_comp_wq); +	destroy_workqueue(isert_rx_wq); +	iscsit_unregister_transport(&iser_target_transport); +	pr_debug("iSER_TARGET[0] - Released iser_target_transport\n"); +} + +MODULE_DESCRIPTION("iSER-Target for mainline target infrastructure"); +MODULE_VERSION("0.1"); +MODULE_AUTHOR("nab@Linux-iSCSI.org"); +MODULE_LICENSE("GPL"); + +module_init(isert_init); +module_exit(isert_exit); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h new file mode 100644 index 00000000000..04f51f7bf61 --- /dev/null +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -0,0 +1,190 @@ +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> + +#define ISERT_RDMA_LISTEN_BACKLOG	10 +#define ISCSI_ISER_SG_TABLESIZE		256 +#define ISER_FASTREG_LI_WRID		0xffffffffffffffffULL + +enum isert_desc_type { +	ISCSI_TX_CONTROL, +	ISCSI_TX_DATAIN +}; + +enum iser_ib_op_code { +	ISER_IB_RECV, +	ISER_IB_SEND, +	ISER_IB_RDMA_WRITE, +	ISER_IB_RDMA_READ, +}; + +enum iser_conn_state { +	ISER_CONN_INIT, +	ISER_CONN_UP, +	ISER_CONN_TERMINATING, +	ISER_CONN_DOWN, +}; + +struct iser_rx_desc { +	struct iser_hdr iser_header; +	struct iscsi_hdr iscsi_header; +	char		data[ISER_RECV_DATA_SEG_LEN]; +	u64		dma_addr; +	struct ib_sge	rx_sg; +	char		pad[ISER_RX_PAD_SIZE]; +} __packed; + +struct iser_tx_desc { +	struct iser_hdr iser_header; +	struct iscsi_hdr iscsi_header; +	enum isert_desc_type type; +	u64		dma_addr; +	struct ib_sge	tx_sg[2]; +	int		num_sge; +	struct isert_cmd *isert_cmd; +	struct llist_node 
*comp_llnode_batch; +	struct llist_node comp_llnode; +	bool		llnode_active; +	struct ib_send_wr send_wr; +} __packed; + +enum isert_indicator { +	ISERT_PROTECTED		= 1 << 0, +	ISERT_DATA_KEY_VALID	= 1 << 1, +	ISERT_PROT_KEY_VALID	= 1 << 2, +	ISERT_SIG_KEY_VALID	= 1 << 3, +}; + +struct pi_context { +	struct ib_mr		       *prot_mr; +	struct ib_fast_reg_page_list   *prot_frpl; +	struct ib_mr		       *sig_mr; +}; + +struct fast_reg_descriptor { +	struct list_head		list; +	struct ib_mr		       *data_mr; +	struct ib_fast_reg_page_list   *data_frpl; +	u8				ind; +	struct pi_context	       *pi_ctx; +}; + +struct isert_data_buf { +	struct scatterlist     *sg; +	int			nents; +	u32			sg_off; +	u32			len; /* cur_rdma_length */ +	u32			offset; +	unsigned int		dma_nents; +	enum dma_data_direction dma_dir; +}; + +struct isert_rdma_wr { +	struct list_head	wr_list; +	struct isert_cmd	*isert_cmd; +	enum iser_ib_op_code	iser_ib_op; +	struct ib_sge		*ib_sge; +	struct ib_sge		s_ib_sge; +	int			send_wr_num; +	struct ib_send_wr	*send_wr; +	struct ib_send_wr	s_send_wr; +	struct isert_data_buf	data; +	struct isert_data_buf	prot; +	struct fast_reg_descriptor *fr_desc; +}; + +struct isert_cmd { +	uint32_t		read_stag; +	uint32_t		write_stag; +	uint64_t		read_va; +	uint64_t		write_va; +	u64			pdu_buf_dma; +	u32			pdu_buf_len; +	u32			read_va_off; +	u32			write_va_off; +	u32			rdma_wr_num; +	struct isert_conn	*conn; +	struct iscsi_cmd	*iscsi_cmd; +	struct iser_tx_desc	tx_desc; +	struct isert_rdma_wr	rdma_wr; +	struct work_struct	comp_work; +}; + +struct isert_device; + +struct isert_conn { +	enum iser_conn_state	state; +	int			post_recv_buf_count; +	atomic_t		post_send_buf_count; +	u32			responder_resources; +	u32			initiator_depth; +	u32			max_sge; +	char			*login_buf; +	char			*login_req_buf; +	char			*login_rsp_buf; +	u64			login_req_dma; +	u64			login_rsp_dma; +	unsigned int		conn_rx_desc_head; +	struct iser_rx_desc	*conn_rx_descs; +	struct ib_recv_wr	conn_rx_wr[ISERT_MIN_POSTED_RX]; +	struct iscsi_conn	*conn; +	struct list_head	conn_accept_node; +	struct completion	conn_login_comp; +	struct iser_tx_desc	conn_login_tx_desc; +	struct rdma_cm_id	*conn_cm_id; +	struct ib_pd		*conn_pd; +	struct ib_mr		*conn_mr; +	struct ib_qp		*conn_qp; +	struct isert_device	*conn_device; +	struct work_struct	conn_logout_work; +	struct mutex		conn_mutex; +	struct completion	conn_wait; +	struct completion	conn_wait_comp_err; +	struct kref		conn_kref; +	struct list_head	conn_fr_pool; +	int			conn_fr_pool_size; +	/* lock to protect fastreg pool */ +	spinlock_t		conn_lock; +#define ISERT_COMP_BATCH_COUNT	8 +	int			conn_comp_batch; +	struct llist_head	conn_comp_llist; +	bool                    disconnect; +}; + +#define ISERT_MAX_CQ 64 + +struct isert_cq_desc { +	struct isert_device	*device; +	int			cq_index; +	struct work_struct	cq_rx_work; +	struct work_struct	cq_tx_work; +}; + +struct isert_device { +	int			use_fastreg; +	bool			pi_capable; +	int			cqs_used; +	int			refcount; +	int			cq_active_qps[ISERT_MAX_CQ]; +	struct ib_device	*ib_device; +	struct ib_cq		*dev_rx_cq[ISERT_MAX_CQ]; +	struct ib_cq		*dev_tx_cq[ISERT_MAX_CQ]; +	struct isert_cq_desc	*cq_desc; +	struct list_head	dev_node; +	struct ib_device_attr	dev_attr; +	int			(*reg_rdma_mem)(struct iscsi_conn *conn, +						    struct iscsi_cmd *cmd, +						    struct isert_rdma_wr *wr); +	void			(*unreg_rdma_mem)(struct isert_cmd *isert_cmd, +						  struct isert_conn *isert_conn); +}; + +struct isert_np { +	struct semaphore	np_sem; +	struct rdma_cm_id	*np_cm_id; +	struct mutex		
np_accept_mutex; +	struct list_head	np_accept_list; +	struct completion	np_login_comp; +}; diff --git a/drivers/infiniband/ulp/isert/isert_proto.h b/drivers/infiniband/ulp/isert/isert_proto.h new file mode 100644 index 00000000000..4dccd313b77 --- /dev/null +++ b/drivers/infiniband/ulp/isert/isert_proto.h @@ -0,0 +1,47 @@ +/* From iscsi_iser.h */ + +struct iser_hdr { +	u8	flags; +	u8	rsvd[3]; +	__be32	write_stag; /* write rkey */ +	__be64	write_va; +	__be32	read_stag;  /* read rkey */ +	__be64	read_va; +} __packed; + +/*Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) + +#define ISER_RECV_DATA_SEG_LEN  8192 +#define ISER_RX_PAYLOAD_SIZE    (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) +#define ISER_RX_LOGIN_SIZE      (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISERT_MAX_TX_MISC_PDUS	4 /* NOOP_IN(2) , ASYNC_EVENT(2)   */ + +#define ISERT_MAX_RX_MISC_PDUS	6 /* NOOP_OUT(2), TEXT(1),         * +				   * SCSI_TMFUNC(2), LOGOUT(1) */ + +#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */ + +#define ISERT_QP_MAX_RECV_DTOS	(ISCSI_DEF_XMIT_CMDS_MAX) + +#define ISERT_MIN_POSTED_RX	(ISCSI_DEF_XMIT_CMDS_MAX >> 2) + +#define ISERT_INFLIGHT_DATAOUTS	8 + +#define ISERT_QP_MAX_REQ_DTOS	(ISCSI_DEF_XMIT_CMDS_MAX *    \ +				(1 + ISERT_INFLIGHT_DATAOUTS) + \ +				ISERT_MAX_TX_MISC_PDUS	+ \ +				ISERT_MAX_RX_MISC_PDUS) + +#define ISER_RX_PAD_SIZE	(ISER_RECV_DATA_SEG_LEN + 4096 - \ +		(ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge))) + +#define ISER_VER	0x10 +#define ISER_WSV	0x08 +#define ISER_RSV	0x04 +#define ISCSI_CTRL	0x10 +#define ISER_HELLO	0x20 +#define ISER_HELLORPLY	0x30 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index cfc1d65c457..e3c2c5b4297 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -30,6 +30,8 @@   * SOFTWARE.   
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/module.h>  #include <linux/init.h>  #include <linux/slab.h> @@ -39,11 +41,12 @@  #include <linux/random.h>  #include <linux/jiffies.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  #include <scsi/scsi.h>  #include <scsi/scsi_device.h>  #include <scsi/scsi_dbg.h> +#include <scsi/scsi_tcq.h>  #include <scsi/srp.h>  #include <scsi/scsi_transport_srp.h> @@ -51,32 +54,74 @@  #define DRV_NAME	"ib_srp"  #define PFX		DRV_NAME ": " -#define DRV_VERSION	"0.2" -#define DRV_RELDATE	"November 1, 2005" +#define DRV_VERSION	"1.0" +#define DRV_RELDATE	"July 1, 2013"  MODULE_AUTHOR("Roland Dreier");  MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator "  		   "v" DRV_VERSION " (" DRV_RELDATE ")");  MODULE_LICENSE("Dual BSD/GPL"); -static int srp_sg_tablesize = SRP_DEF_SG_TABLESIZE; -static int srp_max_iu_len; +static unsigned int srp_sg_tablesize; +static unsigned int cmd_sg_entries; +static unsigned int indirect_sg_entries; +static bool allow_ext_sg; +static bool prefer_fr; +static bool register_always; +static int topspin_workarounds = 1; -module_param(srp_sg_tablesize, int, 0444); -MODULE_PARM_DESC(srp_sg_tablesize, -		 "Max number of gather/scatter entries per I/O (default is 12, max 255)"); +module_param(srp_sg_tablesize, uint, 0444); +MODULE_PARM_DESC(srp_sg_tablesize, "Deprecated name for cmd_sg_entries"); -static int topspin_workarounds = 1; +module_param(cmd_sg_entries, uint, 0444); +MODULE_PARM_DESC(cmd_sg_entries, +		 "Default number of gather/scatter entries in the SRP command (default is 12, max 255)"); + +module_param(indirect_sg_entries, uint, 0444); +MODULE_PARM_DESC(indirect_sg_entries, +		 "Default max number of gather/scatter entries (default is 12, max is " __stringify(SCSI_MAX_SG_CHAIN_SEGMENTS) ")"); + +module_param(allow_ext_sg, bool, 0444); +MODULE_PARM_DESC(allow_ext_sg, +		  "Default behavior when there are more than cmd_sg_entries S/G entries after mapping; fails the request when false (default false)");  module_param(topspin_workarounds, int, 0444);  MODULE_PARM_DESC(topspin_workarounds,  		 "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); -static int mellanox_workarounds = 1; - -module_param(mellanox_workarounds, int, 0444); -MODULE_PARM_DESC(mellanox_workarounds, -		 "Enable workarounds for Mellanox SRP target bugs if != 0"); +module_param(prefer_fr, bool, 0444); +MODULE_PARM_DESC(prefer_fr, +"Whether to use fast registration if both FMR and fast registration are supported"); + +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, +		 "Use memory registration even for contiguous memory regions"); + +static struct kernel_param_ops srp_tmo_ops; + +static int srp_reconnect_delay = 10; +module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay, +		S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts"); + +static int srp_fast_io_fail_tmo = 15; +module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo, +		S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(fast_io_fail_tmo, +		 "Number of seconds between the observation of a transport" +		 " layer error and failing all I/O. \"off\" means that this" +		 " functionality is disabled."); + +static int srp_dev_loss_tmo = 600; +module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo, +		S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dev_loss_tmo, +		 "Maximum number of seconds that the SRP transport should" +		 " insulate transport layer errors. 
After this time has been" +		 " exceeded the SCSI host is removed. Should be" +		 " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT) +		 " if fast_io_fail_tmo has not been set. \"off\" means that" +		 " this functionality is disabled.");  static void srp_add_one(struct ib_device *device);  static void srp_remove_one(struct ib_device *device); @@ -94,6 +139,48 @@ static struct ib_client srp_client = {  static struct ib_sa_client srp_sa_client; +static int srp_tmo_get(char *buffer, const struct kernel_param *kp) +{ +	int tmo = *(int *)kp->arg; + +	if (tmo >= 0) +		return sprintf(buffer, "%d", tmo); +	else +		return sprintf(buffer, "off"); +} + +static int srp_tmo_set(const char *val, const struct kernel_param *kp) +{ +	int tmo, res; + +	if (strncmp(val, "off", 3) != 0) { +		res = kstrtoint(val, 0, &tmo); +		if (res) +			goto out; +	} else { +		tmo = -1; +	} +	if (kp->arg == &srp_reconnect_delay) +		res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo, +				    srp_dev_loss_tmo); +	else if (kp->arg == &srp_fast_io_fail_tmo) +		res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo); +	else +		res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo, +				    tmo); +	if (res) +		goto out; +	*(int *)kp->arg = tmo; + +out: +	return res; +} + +static struct kernel_param_ops srp_tmo_ops = { +	.get = srp_tmo_get, +	.set = srp_tmo_set, +}; +  static inline struct srp_target_port *host_to_target(struct Scsi_Host *host)  {  	return (struct srp_target_port *) host->hostdata; @@ -114,14 +201,6 @@ static int srp_target_is_topspin(struct srp_target_port *target)  		 !memcmp(&target->ioc_guid, cisco_oui, sizeof cisco_oui));  } -static int srp_target_is_mellanox(struct srp_target_port *target) -{ -	static const u8 mellanox_oui[3] = { 0x00, 0x02, 0xc9 }; - -	return mellanox_workarounds && -		!memcmp(&target->ioc_guid, mellanox_oui, sizeof mellanox_oui); -} -  static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size,  				   gfp_t gfp_mask,  				   enum dma_data_direction direction) @@ -167,7 +246,7 @@ static void srp_free_iu(struct srp_host *host, struct srp_iu *iu)  static void srp_qp_event(struct ib_event *event, void *context)  { -	printk(KERN_ERR PFX "QP event %d\n", event->event); +	pr_debug("QP event %d\n", event->event);  }  static int srp_init_qp(struct srp_target_port *target, @@ -219,80 +298,288 @@ static int srp_new_cm_id(struct srp_target_port *target)  	return 0;  } +static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) +{ +	struct srp_device *dev = target->srp_host->srp_dev; +	struct ib_fmr_pool_param fmr_param; + +	memset(&fmr_param, 0, sizeof(fmr_param)); +	fmr_param.pool_size	    = target->scsi_host->can_queue; +	fmr_param.dirty_watermark   = fmr_param.pool_size / 4; +	fmr_param.cache		    = 1; +	fmr_param.max_pages_per_fmr = dev->max_pages_per_mr; +	fmr_param.page_shift	    = ilog2(dev->mr_page_size); +	fmr_param.access	    = (IB_ACCESS_LOCAL_WRITE | +				       IB_ACCESS_REMOTE_WRITE | +				       IB_ACCESS_REMOTE_READ); + +	return ib_create_fmr_pool(dev->pd, &fmr_param); +} + +/** + * srp_destroy_fr_pool() - free the resources owned by a pool + * @pool: Fast registration pool to be destroyed. 
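
/*
 * Illustrative sketch, not part of this patch: srp_tmo_get()/srp_tmo_set()
 * above implement module parameters that accept either an integer or the
 * string "off", with cross-parameter validation delegated to srp_tmo_valid().
 * A stripped-down version of the same module_param_cb() pattern, using a
 * hypothetical my_tmo parameter and no cross-checking:
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/string.h>

static int my_tmo = 30;		/* seconds; negative means "off" */

static int my_tmo_get(char *buffer, const struct kernel_param *kp)
{
	int tmo = *(int *)kp->arg;

	if (tmo >= 0)
		return sprintf(buffer, "%d", tmo);
	return sprintf(buffer, "off");
}

static int my_tmo_set(const char *val, const struct kernel_param *kp)
{
	int tmo, res;

	if (strncmp(val, "off", 3) == 0) {
		tmo = -1;
	} else {
		res = kstrtoint(val, 0, &tmo);
		if (res)
			return res;
	}
	*(int *)kp->arg = tmo;
	return 0;
}

static struct kernel_param_ops my_tmo_ops = {
	.get = my_tmo_get,
	.set = my_tmo_set,
};

module_param_cb(my_tmo, &my_tmo_ops, &my_tmo, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(my_tmo, "Timeout in seconds; \"off\" disables it");
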
+ */ +static void srp_destroy_fr_pool(struct srp_fr_pool *pool) +{ +	int i; +	struct srp_fr_desc *d; + +	if (!pool) +		return; + +	for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { +		if (d->frpl) +			ib_free_fast_reg_page_list(d->frpl); +		if (d->mr) +			ib_dereg_mr(d->mr); +	} +	kfree(pool); +} + +/** + * srp_create_fr_pool() - allocate and initialize a pool for fast registration + * @device:            IB device to allocate fast registration descriptors for. + * @pd:                Protection domain associated with the FR descriptors. + * @pool_size:         Number of descriptors to allocate. + * @max_page_list_len: Maximum fast registration work request page list length. + */ +static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, +					      struct ib_pd *pd, int pool_size, +					      int max_page_list_len) +{ +	struct srp_fr_pool *pool; +	struct srp_fr_desc *d; +	struct ib_mr *mr; +	struct ib_fast_reg_page_list *frpl; +	int i, ret = -EINVAL; + +	if (pool_size <= 0) +		goto err; +	ret = -ENOMEM; +	pool = kzalloc(sizeof(struct srp_fr_pool) + +		       pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL); +	if (!pool) +		goto err; +	pool->size = pool_size; +	pool->max_page_list_len = max_page_list_len; +	spin_lock_init(&pool->lock); +	INIT_LIST_HEAD(&pool->free_list); + +	for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { +		mr = ib_alloc_fast_reg_mr(pd, max_page_list_len); +		if (IS_ERR(mr)) { +			ret = PTR_ERR(mr); +			goto destroy_pool; +		} +		d->mr = mr; +		frpl = ib_alloc_fast_reg_page_list(device, max_page_list_len); +		if (IS_ERR(frpl)) { +			ret = PTR_ERR(frpl); +			goto destroy_pool; +		} +		d->frpl = frpl; +		list_add_tail(&d->entry, &pool->free_list); +	} + +out: +	return pool; + +destroy_pool: +	srp_destroy_fr_pool(pool); + +err: +	pool = ERR_PTR(ret); +	goto out; +} + +/** + * srp_fr_pool_get() - obtain a descriptor suitable for fast registration + * @pool: Pool to obtain descriptor from. + */ +static struct srp_fr_desc *srp_fr_pool_get(struct srp_fr_pool *pool) +{ +	struct srp_fr_desc *d = NULL; +	unsigned long flags; + +	spin_lock_irqsave(&pool->lock, flags); +	if (!list_empty(&pool->free_list)) { +		d = list_first_entry(&pool->free_list, typeof(*d), entry); +		list_del(&d->entry); +	} +	spin_unlock_irqrestore(&pool->lock, flags); + +	return d; +} + +/** + * srp_fr_pool_put() - put an FR descriptor back in the free list + * @pool: Pool the descriptor was allocated from. + * @desc: Pointer to an array of fast registration descriptor pointers. + * @n:    Number of descriptors to put back. + * + * Note: The caller must already have queued an invalidation request for + * desc->mr->rkey before calling this function. 
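
/*
 * Illustrative sketch, not part of this patch: srp_create_fr_pool() above
 * sizes a single allocation for the pool header plus its descriptor array,
 * which presumably relies on struct srp_fr_pool ending in a trailing array
 * member (declared in ib_srp.h, not shown in this hunk). The idiom, with
 * hypothetical types:
 */
#include <linux/slab.h>

struct my_desc {
	int id;
};

struct my_pool {
	int size;
	struct my_desc desc[];	/* trailing flexible array member */
};

static struct my_pool *my_pool_alloc(int n)
{
	struct my_pool *pool;

	/* One allocation covers the header and all n descriptors. */
	pool = kzalloc(sizeof(*pool) + n * sizeof(pool->desc[0]), GFP_KERNEL);
	if (!pool)
		return NULL;
	pool->size = n;
	return pool;
}
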
+ */ +static void srp_fr_pool_put(struct srp_fr_pool *pool, struct srp_fr_desc **desc, +			    int n) +{ +	unsigned long flags; +	int i; + +	spin_lock_irqsave(&pool->lock, flags); +	for (i = 0; i < n; i++) +		list_add(&desc[i]->entry, &pool->free_list); +	spin_unlock_irqrestore(&pool->lock, flags); +} + +static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) +{ +	struct srp_device *dev = target->srp_host->srp_dev; + +	return srp_create_fr_pool(dev->dev, dev->pd, +				  target->scsi_host->can_queue, +				  dev->max_pages_per_mr); +} +  static int srp_create_target_ib(struct srp_target_port *target)  { +	struct srp_device *dev = target->srp_host->srp_dev;  	struct ib_qp_init_attr *init_attr; +	struct ib_cq *recv_cq, *send_cq; +	struct ib_qp *qp; +	struct ib_fmr_pool *fmr_pool = NULL; +	struct srp_fr_pool *fr_pool = NULL; +	const int m = 1 + dev->use_fast_reg;  	int ret;  	init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);  	if (!init_attr)  		return -ENOMEM; -	target->recv_cq = ib_create_cq(target->srp_host->srp_dev->dev, -				       srp_recv_completion, NULL, target, SRP_RQ_SIZE, 0); -	if (IS_ERR(target->recv_cq)) { -		ret = PTR_ERR(target->recv_cq); +	recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, target, +			       target->queue_size, target->comp_vector); +	if (IS_ERR(recv_cq)) { +		ret = PTR_ERR(recv_cq);  		goto err;  	} -	target->send_cq = ib_create_cq(target->srp_host->srp_dev->dev, -				       srp_send_completion, NULL, target, SRP_SQ_SIZE, 0); -	if (IS_ERR(target->send_cq)) { -		ret = PTR_ERR(target->send_cq); +	send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, target, +			       m * target->queue_size, target->comp_vector); +	if (IS_ERR(send_cq)) { +		ret = PTR_ERR(send_cq);  		goto err_recv_cq;  	} -	ib_req_notify_cq(target->recv_cq, IB_CQ_NEXT_COMP); +	ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP);  	init_attr->event_handler       = srp_qp_event; -	init_attr->cap.max_send_wr     = SRP_SQ_SIZE; -	init_attr->cap.max_recv_wr     = SRP_RQ_SIZE; +	init_attr->cap.max_send_wr     = m * target->queue_size; +	init_attr->cap.max_recv_wr     = target->queue_size;  	init_attr->cap.max_recv_sge    = 1;  	init_attr->cap.max_send_sge    = 1; -	init_attr->sq_sig_type         = IB_SIGNAL_ALL_WR; +	init_attr->sq_sig_type         = IB_SIGNAL_REQ_WR;  	init_attr->qp_type             = IB_QPT_RC; -	init_attr->send_cq             = target->send_cq; -	init_attr->recv_cq             = target->recv_cq; +	init_attr->send_cq             = send_cq; +	init_attr->recv_cq             = recv_cq; -	target->qp = ib_create_qp(target->srp_host->srp_dev->pd, init_attr); -	if (IS_ERR(target->qp)) { -		ret = PTR_ERR(target->qp); +	qp = ib_create_qp(dev->pd, init_attr); +	if (IS_ERR(qp)) { +		ret = PTR_ERR(qp);  		goto err_send_cq;  	} -	ret = srp_init_qp(target, target->qp); +	ret = srp_init_qp(target, qp);  	if (ret)  		goto err_qp; +	if (dev->use_fast_reg && dev->has_fr) { +		fr_pool = srp_alloc_fr_pool(target); +		if (IS_ERR(fr_pool)) { +			ret = PTR_ERR(fr_pool); +			shost_printk(KERN_WARNING, target->scsi_host, PFX +				     "FR pool allocation failed (%d)\n", ret); +			goto err_qp; +		} +		if (target->fr_pool) +			srp_destroy_fr_pool(target->fr_pool); +		target->fr_pool = fr_pool; +	} else if (!dev->use_fast_reg && dev->has_fmr) { +		fmr_pool = srp_alloc_fmr_pool(target); +		if (IS_ERR(fmr_pool)) { +			ret = PTR_ERR(fmr_pool); +			shost_printk(KERN_WARNING, target->scsi_host, PFX +				     "FMR pool allocation failed (%d)\n", ret); +			goto err_qp; +		} +		if 
(target->fmr_pool) +			ib_destroy_fmr_pool(target->fmr_pool); +		target->fmr_pool = fmr_pool; +	} + +	if (target->qp) +		ib_destroy_qp(target->qp); +	if (target->recv_cq) +		ib_destroy_cq(target->recv_cq); +	if (target->send_cq) +		ib_destroy_cq(target->send_cq); + +	target->qp = qp; +	target->recv_cq = recv_cq; +	target->send_cq = send_cq; +  	kfree(init_attr);  	return 0;  err_qp: -	ib_destroy_qp(target->qp); +	ib_destroy_qp(qp);  err_send_cq: -	ib_destroy_cq(target->send_cq); +	ib_destroy_cq(send_cq);  err_recv_cq: -	ib_destroy_cq(target->recv_cq); +	ib_destroy_cq(recv_cq);  err:  	kfree(init_attr);  	return ret;  } +/* + * Note: this function may be called without srp_alloc_iu_bufs() having been + * invoked. Hence the target->[rt]x_ring checks. + */  static void srp_free_target_ib(struct srp_target_port *target)  { +	struct srp_device *dev = target->srp_host->srp_dev;  	int i; +	if (dev->use_fast_reg) { +		if (target->fr_pool) +			srp_destroy_fr_pool(target->fr_pool); +	} else { +		if (target->fmr_pool) +			ib_destroy_fmr_pool(target->fmr_pool); +	}  	ib_destroy_qp(target->qp);  	ib_destroy_cq(target->send_cq);  	ib_destroy_cq(target->recv_cq); -	for (i = 0; i < SRP_RQ_SIZE; ++i) -		srp_free_iu(target->srp_host, target->rx_ring[i]); -	for (i = 0; i < SRP_SQ_SIZE; ++i) -		srp_free_iu(target->srp_host, target->tx_ring[i]); +	target->qp = NULL; +	target->send_cq = target->recv_cq = NULL; + +	if (target->rx_ring) { +		for (i = 0; i < target->queue_size; ++i) +			srp_free_iu(target->srp_host, target->rx_ring[i]); +		kfree(target->rx_ring); +		target->rx_ring = NULL; +	} +	if (target->tx_ring) { +		for (i = 0; i < target->queue_size; ++i) +			srp_free_iu(target->srp_host, target->tx_ring[i]); +		kfree(target->tx_ring); +		target->tx_ring = NULL; +	}  }  static void srp_path_rec_completion(int status, @@ -312,6 +599,8 @@ static void srp_path_rec_completion(int status,  static int srp_lookup_path(struct srp_target_port *target)  { +	int ret; +  	target->path.numb_path = 1;  	init_completion(&target->done); @@ -332,7 +621,9 @@ static int srp_lookup_path(struct srp_target_port *target)  	if (target->path_query_id < 0)  		return target->path_query_id; -	wait_for_completion(&target->done); +	ret = wait_for_completion_interruptible(&target->done); +	if (ret < 0) +		return ret;  	if (target->status < 0)  		shost_printk(KERN_WARNING, target->scsi_host, @@ -372,13 +663,13 @@ static int srp_send_req(struct srp_target_port *target)  	req->param.responder_resources	      = 4;  	req->param.remote_cm_response_timeout = 20;  	req->param.local_cm_response_timeout  = 20; -	req->param.retry_count 		      = 7; +	req->param.retry_count                = target->tl_retry_count;  	req->param.rnr_retry_count 	      = 7;  	req->param.max_cm_retries 	      = 15;  	req->priv.opcode     	= SRP_LOGIN_REQ;  	req->priv.tag        	= 0; -	req->priv.req_it_iu_len = cpu_to_be32(srp_max_iu_len); +	req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len);  	req->priv.req_buf_fmt 	= cpu_to_be16(SRP_BUF_FORMAT_DIRECT |  					      SRP_BUF_FORMAT_INDIRECT);  	/* @@ -428,48 +719,193 @@ static int srp_send_req(struct srp_target_port *target)  	return status;  } +static bool srp_queue_remove_work(struct srp_target_port *target) +{ +	bool changed = false; + +	spin_lock_irq(&target->lock); +	if (target->state != SRP_TARGET_REMOVED) { +		target->state = SRP_TARGET_REMOVED; +		changed = true; +	} +	spin_unlock_irq(&target->lock); + +	if (changed) +		queue_work(system_long_wq, &target->remove_work); + +	return changed; +} + +static bool 
srp_change_conn_state(struct srp_target_port *target, +				  bool connected) +{ +	bool changed = false; + +	spin_lock_irq(&target->lock); +	if (target->connected != connected) { +		target->connected = connected; +		changed = true; +	} +	spin_unlock_irq(&target->lock); + +	return changed; +} +  static void srp_disconnect_target(struct srp_target_port *target)  { -	/* XXX should send SRP_I_LOGOUT request */ +	if (srp_change_conn_state(target, false)) { +		/* XXX should send SRP_I_LOGOUT request */ -	init_completion(&target->done); -	if (ib_send_cm_dreq(target->cm_id, NULL, 0)) { -		shost_printk(KERN_DEBUG, target->scsi_host, -			     PFX "Sending CM DREQ failed\n"); -		return; +		if (ib_send_cm_dreq(target->cm_id, NULL, 0)) { +			shost_printk(KERN_DEBUG, target->scsi_host, +				     PFX "Sending CM DREQ failed\n"); +		}  	} -	wait_for_completion(&target->done);  } -static void srp_remove_work(struct work_struct *work) +static void srp_free_req_data(struct srp_target_port *target)  { -	struct srp_target_port *target = -		container_of(work, struct srp_target_port, work); +	struct srp_device *dev = target->srp_host->srp_dev; +	struct ib_device *ibdev = dev->dev; +	struct srp_request *req; +	int i; -	spin_lock_irq(target->scsi_host->host_lock); -	if (target->state != SRP_TARGET_DEAD) { -		spin_unlock_irq(target->scsi_host->host_lock); +	if (!target->req_ring)  		return; + +	for (i = 0; i < target->req_ring_size; ++i) { +		req = &target->req_ring[i]; +		if (dev->use_fast_reg) +			kfree(req->fr_list); +		else +			kfree(req->fmr_list); +		kfree(req->map_page); +		if (req->indirect_dma_addr) { +			ib_dma_unmap_single(ibdev, req->indirect_dma_addr, +					    target->indirect_size, +					    DMA_TO_DEVICE); +		} +		kfree(req->indirect_desc);  	} -	target->state = SRP_TARGET_REMOVED; -	spin_unlock_irq(target->scsi_host->host_lock); -	spin_lock(&target->srp_host->target_lock); -	list_del(&target->list); -	spin_unlock(&target->srp_host->target_lock); +	kfree(target->req_ring); +	target->req_ring = NULL; +} + +static int srp_alloc_req_data(struct srp_target_port *target) +{ +	struct srp_device *srp_dev = target->srp_host->srp_dev; +	struct ib_device *ibdev = srp_dev->dev; +	struct srp_request *req; +	void *mr_list; +	dma_addr_t dma_addr; +	int i, ret = -ENOMEM; + +	INIT_LIST_HEAD(&target->free_reqs); +	target->req_ring = kzalloc(target->req_ring_size * +				   sizeof(*target->req_ring), GFP_KERNEL); +	if (!target->req_ring) +		goto out; + +	for (i = 0; i < target->req_ring_size; ++i) { +		req = &target->req_ring[i]; +		mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), +				  GFP_KERNEL); +		if (!mr_list) +			goto out; +		if (srp_dev->use_fast_reg) +			req->fr_list = mr_list; +		else +			req->fmr_list = mr_list; +		req->map_page = kmalloc(srp_dev->max_pages_per_mr * +					sizeof(void *), GFP_KERNEL); +		if (!req->map_page) +			goto out; +		req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); +		if (!req->indirect_desc) +			goto out; + +		dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, +					     target->indirect_size, +					     DMA_TO_DEVICE); +		if (ib_dma_mapping_error(ibdev, dma_addr)) +			goto out; + +		req->indirect_dma_addr = dma_addr; +		req->index = i; +		list_add_tail(&req->list, &target->free_reqs); +	} +	ret = 0; + +out: +	return ret; +} + +/** + * srp_del_scsi_host_attr() - Remove attributes defined in the host template. + * @shost: SCSI host whose attributes to remove from sysfs. 
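
/*
 * Illustrative sketch, not part of this patch: srp_alloc_req_data() above maps
 * each indirect descriptor buffer once at ring setup and checks the mapping
 * before keeping it, while srp_free_req_data() only unmaps addresses that were
 * successfully mapped. A minimal version of that map/check/unmap pattern with
 * hypothetical helpers:
 */
#include <rdma/ib_verbs.h>

/* Returns 0 on mapping failure; the caller treats 0 as "not mapped". */
static u64 my_map_desc(struct ib_device *ibdev, void *buf, size_t len)
{
	u64 dma = ib_dma_map_single(ibdev, buf, len, DMA_TO_DEVICE);

	if (ib_dma_mapping_error(ibdev, dma))
		return 0;
	return dma;
}

static void my_unmap_desc(struct ib_device *ibdev, u64 dma, size_t len)
{
	if (dma)
		ib_dma_unmap_single(ibdev, dma, len, DMA_TO_DEVICE);
}
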
+ * + * Note: Any attributes defined in the host template and that did not exist + * before invocation of this function will be ignored. + */ +static void srp_del_scsi_host_attr(struct Scsi_Host *shost) +{ +	struct device_attribute **attr; + +	for (attr = shost->hostt->shost_attrs; attr && *attr; ++attr) +		device_remove_file(&shost->shost_dev, *attr); +} + +static void srp_remove_target(struct srp_target_port *target) +{ +	WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + +	srp_del_scsi_host_attr(target->scsi_host); +	srp_rport_get(target->rport);  	srp_remove_host(target->scsi_host);  	scsi_remove_host(target->scsi_host); +	srp_stop_rport_timers(target->rport); +	srp_disconnect_target(target);  	ib_destroy_cm_id(target->cm_id);  	srp_free_target_ib(target); +	cancel_work_sync(&target->tl_err_work); +	srp_rport_put(target->rport); +	srp_free_req_data(target); + +	spin_lock(&target->srp_host->target_lock); +	list_del(&target->list); +	spin_unlock(&target->srp_host->target_lock); +  	scsi_host_put(target->scsi_host);  } +static void srp_remove_work(struct work_struct *work) +{ +	struct srp_target_port *target = +		container_of(work, struct srp_target_port, remove_work); + +	WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + +	srp_remove_target(target); +} + +static void srp_rport_delete(struct srp_rport *rport) +{ +	struct srp_target_port *target = rport->lld_data; + +	srp_queue_remove_work(target); +} +  static int srp_connect_target(struct srp_target_port *target)  {  	int retries = 3;  	int ret; +	WARN_ON_ONCE(target->connected); + +	target->qp_in_error = false; +  	ret = srp_lookup_path(target);  	if (ret)  		return ret; @@ -479,7 +915,9 @@ static int srp_connect_target(struct srp_target_port *target)  		ret = srp_send_req(target);  		if (ret)  			return ret; -		wait_for_completion(&target->done); +		ret = wait_for_completion_interruptible(&target->done); +		if (ret < 0) +			return ret;  		/*  		 * The CM event handling code will set status to @@ -489,6 +927,7 @@ static int srp_connect_target(struct srp_target_port *target)  		 */  		switch (target->status) {  		case 0: +			srp_change_conn_state(target, true);  			return 0;  		case SRP_PORT_REDIRECT: @@ -521,199 +960,436 @@ static int srp_connect_target(struct srp_target_port *target)  	}  } +static int srp_inv_rkey(struct srp_target_port *target, u32 rkey) +{ +	struct ib_send_wr *bad_wr; +	struct ib_send_wr wr = { +		.opcode		    = IB_WR_LOCAL_INV, +		.wr_id		    = LOCAL_INV_WR_ID_MASK, +		.next		    = NULL, +		.num_sge	    = 0, +		.send_flags	    = 0, +		.ex.invalidate_rkey = rkey, +	}; + +	return ib_post_send(target->qp, &wr, &bad_wr); +} +  static void srp_unmap_data(struct scsi_cmnd *scmnd,  			   struct srp_target_port *target,  			   struct srp_request *req)  { +	struct srp_device *dev = target->srp_host->srp_dev; +	struct ib_device *ibdev = dev->dev; +	int i, res; +  	if (!scsi_sglist(scmnd) ||  	    (scmnd->sc_data_direction != DMA_TO_DEVICE &&  	     scmnd->sc_data_direction != DMA_FROM_DEVICE))  		return; -	if (req->fmr) { -		ib_fmr_pool_unmap(req->fmr); -		req->fmr = NULL; +	if (dev->use_fast_reg) { +		struct srp_fr_desc **pfr; + +		for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) { +			res = srp_inv_rkey(target, (*pfr)->mr->rkey); +			if (res < 0) { +				shost_printk(KERN_ERR, target->scsi_host, PFX +				  "Queueing INV WR for rkey %#x failed (%d)\n", +				  (*pfr)->mr->rkey, res); +				queue_work(system_long_wq, +					   &target->tl_err_work); +			} +		} +		if (req->nmdesc) +			
srp_fr_pool_put(target->fr_pool, req->fr_list, +					req->nmdesc); +	} else { +		struct ib_pool_fmr **pfmr; + +		for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++) +			ib_fmr_pool_unmap(*pfmr);  	} -	ib_dma_unmap_sg(target->srp_host->srp_dev->dev, scsi_sglist(scmnd), -			scsi_sg_count(scmnd), scmnd->sc_data_direction); +	ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), +			scmnd->sc_data_direction);  } -static void srp_remove_req(struct srp_target_port *target, struct srp_request *req) +/** + * srp_claim_req - Take ownership of the scmnd associated with a request. + * @target: SRP target port. + * @req: SRP request. + * @sdev: If not NULL, only take ownership for this SCSI device. + * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take + *         ownership of @req->scmnd if it equals @scmnd. + * + * Return value: + * Either NULL or a pointer to the SCSI command the caller became owner of. + */ +static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target, +				       struct srp_request *req, +				       struct scsi_device *sdev, +				       struct scsi_cmnd *scmnd)  { -	srp_unmap_data(req->scmnd, target, req); -	list_move_tail(&req->list, &target->free_reqs); +	unsigned long flags; + +	spin_lock_irqsave(&target->lock, flags); +	if (req->scmnd && +	    (!sdev || req->scmnd->device == sdev) && +	    (!scmnd || req->scmnd == scmnd)) { +		scmnd = req->scmnd; +		req->scmnd = NULL; +	} else { +		scmnd = NULL; +	} +	spin_unlock_irqrestore(&target->lock, flags); + +	return scmnd;  } -static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) +/** + * srp_free_req() - Unmap data and add request to the free request list. + * @target: SRP target port. + * @req:    Request to be freed. + * @scmnd:  SCSI command associated with @req. + * @req_lim_delta: Amount to be added to @target->req_lim. + */ +static void srp_free_req(struct srp_target_port *target, +			 struct srp_request *req, struct scsi_cmnd *scmnd, +			 s32 req_lim_delta)  { -	req->scmnd->result = DID_RESET << 16; -	req->scmnd->scsi_done(req->scmnd); -	srp_remove_req(target, req); +	unsigned long flags; + +	srp_unmap_data(scmnd, target, req); + +	spin_lock_irqsave(&target->lock, flags); +	target->req_lim += req_lim_delta; +	list_add_tail(&req->list, &target->free_reqs); +	spin_unlock_irqrestore(&target->lock, flags);  } -static int srp_reconnect_target(struct srp_target_port *target) +static void srp_finish_req(struct srp_target_port *target, +			   struct srp_request *req, struct scsi_device *sdev, +			   int result)  { -	struct ib_qp_attr qp_attr; -	struct srp_request *req, *tmp; -	struct ib_wc wc; -	int ret; +	struct scsi_cmnd *scmnd = srp_claim_req(target, req, sdev, NULL); -	spin_lock_irq(target->scsi_host->host_lock); -	if (target->state != SRP_TARGET_LIVE) { -		spin_unlock_irq(target->scsi_host->host_lock); -		return -EAGAIN; +	if (scmnd) { +		srp_free_req(target, req, scmnd, 0); +		scmnd->result = result; +		scmnd->scsi_done(scmnd);  	} -	target->state = SRP_TARGET_CONNECTING; -	spin_unlock_irq(target->scsi_host->host_lock); +} + +static void srp_terminate_io(struct srp_rport *rport) +{ +	struct srp_target_port *target = rport->lld_data; +	struct Scsi_Host *shost = target->scsi_host; +	struct scsi_device *sdev; +	int i; + +	/* +	 * Invoking srp_terminate_io() while srp_queuecommand() is running +	 * is not safe. Hence the warning statement below. 
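
/*
 * Illustrative sketch, not part of this patch: srp_claim_req() above resolves
 * the race between the normal completion path and the error/abort paths by
 * letting exactly one caller clear req->scmnd under target->lock; only that
 * caller may finish the command. The core of the idea, with hypothetical
 * types:
 */
#include <linux/spinlock.h>

struct my_req {
	void *owner;		/* e.g. the command being completed */
};

/* Whoever sees a non-NULL owner and clears it under the lock wins. */
static void *my_claim(spinlock_t *lock, struct my_req *req)
{
	unsigned long flags;
	void *owner;

	spin_lock_irqsave(lock, flags);
	owner = req->owner;
	req->owner = NULL;
	spin_unlock_irqrestore(lock, flags);

	return owner;
}
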
+	 */ +	shost_for_each_device(sdev, shost) +		WARN_ON_ONCE(sdev->request_queue->request_fn_active); + +	for (i = 0; i < target->req_ring_size; ++i) { +		struct srp_request *req = &target->req_ring[i]; +		srp_finish_req(target, req, NULL, DID_TRANSPORT_FAILFAST << 16); +	} +} + +/* + * It is up to the caller to ensure that srp_rport_reconnect() calls are + * serialized and that no concurrent srp_queuecommand(), srp_abort(), + * srp_reset_device() or srp_reset_host() calls will occur while this function + * is in progress. One way to realize that is not to call this function + * directly but to call srp_reconnect_rport() instead since that last function + * serializes calls of this function via rport->mutex and also blocks + * srp_queuecommand() calls before invoking this function. + */ +static int srp_rport_reconnect(struct srp_rport *rport) +{ +	struct srp_target_port *target = rport->lld_data; +	int i, ret;  	srp_disconnect_target(target);  	/* -	 * Now get a new local CM ID so that we avoid confusing the -	 * target in case things are really fouled up. +	 * Now get a new local CM ID so that we avoid confusing the target in +	 * case things are really fouled up. Doing so also ensures that all CM +	 * callbacks will have finished before a new QP is allocated.  	 */  	ret = srp_new_cm_id(target); -	if (ret) -		goto err; -	qp_attr.qp_state = IB_QPS_RESET; -	ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); -	if (ret) -		goto err; +	for (i = 0; i < target->req_ring_size; ++i) { +		struct srp_request *req = &target->req_ring[i]; +		srp_finish_req(target, req, NULL, DID_RESET << 16); +	} -	ret = srp_init_qp(target, target->qp); -	if (ret) -		goto err; +	/* +	 * Whether or not creating a new CM ID succeeded, create a new +	 * QP. This guarantees that all callback functions for the old QP have +	 * finished before any send requests are posted on the new QP. +	 */ +	ret += srp_create_target_ib(target); -	while (ib_poll_cq(target->recv_cq, 1, &wc) > 0) -		; /* nothing */ -	while (ib_poll_cq(target->send_cq, 1, &wc) > 0) -		; /* nothing */ +	INIT_LIST_HEAD(&target->free_tx); +	for (i = 0; i < target->queue_size; ++i) +		list_add(&target->tx_ring[i]->list, &target->free_tx); -	spin_lock_irq(target->scsi_host->host_lock); -	list_for_each_entry_safe(req, tmp, &target->req_queue, list) -		srp_reset_req(target, req); -	spin_unlock_irq(target->scsi_host->host_lock); +	if (ret == 0) +		ret = srp_connect_target(target); -	target->rx_head	 = 0; -	target->tx_head	 = 0; -	target->tx_tail  = 0; +	if (ret == 0) +		shost_printk(KERN_INFO, target->scsi_host, +			     PFX "reconnect succeeded\n"); -	target->qp_in_error = 0; -	ret = srp_connect_target(target); -	if (ret) -		goto err; +	return ret; +} -	spin_lock_irq(target->scsi_host->host_lock); -	if (target->state == SRP_TARGET_CONNECTING) { -		ret = 0; -		target->state = SRP_TARGET_LIVE; -	} else -		ret = -EAGAIN; -	spin_unlock_irq(target->scsi_host->host_lock); +static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, +			 unsigned int dma_len, u32 rkey) +{ +	struct srp_direct_buf *desc = state->desc; -	return ret; +	desc->va = cpu_to_be64(dma_addr); +	desc->key = cpu_to_be32(rkey); +	desc->len = cpu_to_be32(dma_len); -err: -	shost_printk(KERN_ERR, target->scsi_host, -		     PFX "reconnect failed (%d), removing target port.\n", ret); +	state->total_len += dma_len; +	state->desc++; +	state->ndesc++; +} -	/* -	 * We couldn't reconnect, so kill our target port off. 
-	 * However, we have to defer the real removal because we might -	 * be in the context of the SCSI error handler now, which -	 * would deadlock if we call scsi_remove_host(). -	 */ -	spin_lock_irq(target->scsi_host->host_lock); -	if (target->state == SRP_TARGET_CONNECTING) { -		target->state = SRP_TARGET_DEAD; -		INIT_WORK(&target->work, srp_remove_work); -		schedule_work(&target->work); +static int srp_map_finish_fmr(struct srp_map_state *state, +			      struct srp_target_port *target) +{ +	struct ib_pool_fmr *fmr; +	u64 io_addr = 0; + +	fmr = ib_fmr_pool_map_phys(target->fmr_pool, state->pages, +				   state->npages, io_addr); +	if (IS_ERR(fmr)) +		return PTR_ERR(fmr); + +	*state->next_fmr++ = fmr; +	state->nmdesc++; + +	srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey); + +	return 0; +} + +static int srp_map_finish_fr(struct srp_map_state *state, +			     struct srp_target_port *target) +{ +	struct srp_device *dev = target->srp_host->srp_dev; +	struct ib_send_wr *bad_wr; +	struct ib_send_wr wr; +	struct srp_fr_desc *desc; +	u32 rkey; + +	desc = srp_fr_pool_get(target->fr_pool); +	if (!desc) +		return -ENOMEM; + +	rkey = ib_inc_rkey(desc->mr->rkey); +	ib_update_fast_reg_key(desc->mr, rkey); + +	memcpy(desc->frpl->page_list, state->pages, +	       sizeof(state->pages[0]) * state->npages); + +	memset(&wr, 0, sizeof(wr)); +	wr.opcode = IB_WR_FAST_REG_MR; +	wr.wr_id = FAST_REG_WR_ID_MASK; +	wr.wr.fast_reg.iova_start = state->base_dma_addr; +	wr.wr.fast_reg.page_list = desc->frpl; +	wr.wr.fast_reg.page_list_len = state->npages; +	wr.wr.fast_reg.page_shift = ilog2(dev->mr_page_size); +	wr.wr.fast_reg.length = state->dma_len; +	wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | +				       IB_ACCESS_REMOTE_READ | +				       IB_ACCESS_REMOTE_WRITE); +	wr.wr.fast_reg.rkey = desc->mr->lkey; + +	*state->next_fr++ = desc; +	state->nmdesc++; + +	srp_map_desc(state, state->base_dma_addr, state->dma_len, +		     desc->mr->rkey); + +	return ib_post_send(target->qp, &wr, &bad_wr); +} + +static int srp_finish_mapping(struct srp_map_state *state, +			      struct srp_target_port *target) +{ +	int ret = 0; + +	if (state->npages == 0) +		return 0; + +	if (state->npages == 1 && !register_always) +		srp_map_desc(state, state->base_dma_addr, state->dma_len, +			     target->rkey); +	else +		ret = target->srp_host->srp_dev->use_fast_reg ? 
+			srp_map_finish_fr(state, target) : +			srp_map_finish_fmr(state, target); + +	if (ret == 0) { +		state->npages = 0; +		state->dma_len = 0;  	} -	spin_unlock_irq(target->scsi_host->host_lock);  	return ret;  } -static int srp_map_fmr(struct srp_target_port *target, struct scatterlist *scat, -		       int sg_cnt, struct srp_request *req, -		       struct srp_direct_buf *buf) +static void srp_map_update_start(struct srp_map_state *state, +				 struct scatterlist *sg, int sg_index, +				 dma_addr_t dma_addr) +{ +	state->unmapped_sg = sg; +	state->unmapped_index = sg_index; +	state->unmapped_addr = dma_addr; +} + +static int srp_map_sg_entry(struct srp_map_state *state, +			    struct srp_target_port *target, +			    struct scatterlist *sg, int sg_index, +			    bool use_mr)  { -	u64 io_addr = 0; -	u64 *dma_pages; -	u32 len; -	int page_cnt; -	int i, j; -	int ret;  	struct srp_device *dev = target->srp_host->srp_dev;  	struct ib_device *ibdev = dev->dev; -	struct scatterlist *sg; - -	if (!dev->fmr_pool) -		return -ENODEV; +	dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg); +	unsigned int dma_len = ib_sg_dma_len(ibdev, sg); +	unsigned int len; +	int ret; -	if (srp_target_is_mellanox(target) && -	    (ib_sg_dma_address(ibdev, &scat[0]) & ~dev->fmr_page_mask)) -		return -EINVAL; +	if (!dma_len) +		return 0; -	len = page_cnt = 0; -	scsi_for_each_sg(req->scmnd, sg, sg_cnt, i) { -		unsigned int dma_len = ib_sg_dma_len(ibdev, sg); +	if (!use_mr) { +		/* +		 * Once we're in direct map mode for a request, we don't +		 * go back to FMR or FR mode, so no need to update anything +		 * other than the descriptor. +		 */ +		srp_map_desc(state, dma_addr, dma_len, target->rkey); +		return 0; +	} -		if (ib_sg_dma_address(ibdev, sg) & ~dev->fmr_page_mask) { -			if (i > 0) -				return -EINVAL; -			else -				++page_cnt; -		} -		if ((ib_sg_dma_address(ibdev, sg) + dma_len) & -		    ~dev->fmr_page_mask) { -			if (i < sg_cnt - 1) -				return -EINVAL; -			else -				++page_cnt; -		} +	/* +	 * Since not all RDMA HW drivers support non-zero page offsets for +	 * FMR, if we start at an offset into a page, don't merge into the +	 * current FMR mapping. Finish it out, and use the kernel's MR for +	 * this sg entry. +	 */ +	if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) || +	    dma_len > dev->mr_max_size) { +		ret = srp_finish_mapping(state, target); +		if (ret) +			return ret; -		len += dma_len; +		srp_map_desc(state, dma_addr, dma_len, target->rkey); +		srp_map_update_start(state, NULL, 0, 0); +		return 0;  	} -	page_cnt += len >> dev->fmr_page_shift; -	if (page_cnt > SRP_FMR_SIZE) -		return -ENOMEM; +	/* +	 * If this is the first sg that will be mapped via FMR or via FR, save +	 * our position. We need to know the first unmapped entry, its index, +	 * and the first unmapped address within that entry to be able to +	 * restart mapping after an error. 
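
/*
 * Illustrative sketch, not part of this patch: the loop that follows chops
 * each S/G entry into mr_page_size aligned chunks, recording one page address
 * per chunk, where only the very first chunk may start at a non-zero offset.
 * The same arithmetic as a standalone helper (hypothetical; page_size must be
 * a power of two):
 */
#include <linux/kernel.h>
#include <linux/types.h>

static unsigned int my_fill_pages(u64 *pages, unsigned int max_pages,
				  u64 dma_addr, unsigned int dma_len,
				  unsigned int page_size)
{
	u64 page_mask = ~((u64)page_size - 1);
	unsigned int n = 0;

	while (dma_len && n < max_pages) {
		unsigned int offset = dma_addr & ~page_mask;
		unsigned int len = min(dma_len, page_size - offset);

		pages[n++] = dma_addr & page_mask;	/* page-aligned address */
		dma_addr += len;
		dma_len -= len;
	}

	return n;	/* number of page entries produced */
}
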
+	 */ +	if (!state->unmapped_sg) +		srp_map_update_start(state, sg, sg_index, dma_addr); -	dma_pages = kmalloc(sizeof (u64) * page_cnt, GFP_ATOMIC); -	if (!dma_pages) -		return -ENOMEM; +	while (dma_len) { +		unsigned offset = dma_addr & ~dev->mr_page_mask; +		if (state->npages == dev->max_pages_per_mr || offset != 0) { +			ret = srp_finish_mapping(state, target); +			if (ret) +				return ret; + +			srp_map_update_start(state, sg, sg_index, dma_addr); +		} -	page_cnt = 0; -	scsi_for_each_sg(req->scmnd, sg, sg_cnt, i) { -		unsigned int dma_len = ib_sg_dma_len(ibdev, sg); +		len = min_t(unsigned int, dma_len, dev->mr_page_size - offset); -		for (j = 0; j < dma_len; j += dev->fmr_page_size) -			dma_pages[page_cnt++] = -				(ib_sg_dma_address(ibdev, sg) & -				 dev->fmr_page_mask) + j; +		if (!state->npages) +			state->base_dma_addr = dma_addr; +		state->pages[state->npages++] = dma_addr & dev->mr_page_mask; +		state->dma_len += len; +		dma_addr += len; +		dma_len -= len;  	} -	req->fmr = ib_fmr_pool_map_phys(dev->fmr_pool, -					dma_pages, page_cnt, io_addr); -	if (IS_ERR(req->fmr)) { -		ret = PTR_ERR(req->fmr); -		req->fmr = NULL; -		goto out; +	/* +	 * If the last entry of the MR wasn't a full page, then we need to +	 * close it out and start a new one -- we can only merge at page +	 * boundries. +	 */ +	ret = 0; +	if (len != dev->mr_page_size) { +		ret = srp_finish_mapping(state, target); +		if (!ret) +			srp_map_update_start(state, NULL, 0, 0);  	} +	return ret; +} -	buf->va  = cpu_to_be64(ib_sg_dma_address(ibdev, &scat[0]) & -			       ~dev->fmr_page_mask); -	buf->key = cpu_to_be32(req->fmr->fmr->rkey); -	buf->len = cpu_to_be32(len); +static int srp_map_sg(struct srp_map_state *state, +		      struct srp_target_port *target, struct srp_request *req, +		      struct scatterlist *scat, int count) +{ +	struct srp_device *dev = target->srp_host->srp_dev; +	struct ib_device *ibdev = dev->dev; +	struct scatterlist *sg; +	int i; +	bool use_mr; -	ret = 0; +	state->desc	= req->indirect_desc; +	state->pages	= req->map_page; +	if (dev->use_fast_reg) { +		state->next_fr = req->fr_list; +		use_mr = !!target->fr_pool; +	} else { +		state->next_fmr = req->fmr_list; +		use_mr = !!target->fmr_pool; +	} -out: -	kfree(dma_pages); +	for_each_sg(scat, sg, count, i) { +		if (srp_map_sg_entry(state, target, sg, i, use_mr)) { +			/* +			 * Memory registration failed, so backtrack to the +			 * first unmapped entry and continue on without using +			 * memory registration. 
+			 */ +			dma_addr_t dma_addr; +			unsigned int dma_len; + +backtrack: +			sg = state->unmapped_sg; +			i = state->unmapped_index; + +			dma_addr = ib_sg_dma_address(ibdev, sg); +			dma_len = ib_sg_dma_len(ibdev, sg); +			dma_len -= (state->unmapped_addr - dma_addr); +			dma_addr = state->unmapped_addr; +			use_mr = false; +			srp_map_desc(state, dma_addr, dma_len, target->rkey); +		} +	} -	return ret; +	if (use_mr && srp_finish_mapping(state, target)) +		goto backtrack; + +	req->nmdesc = state->nmdesc; + +	return 0;  }  static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, @@ -722,9 +1398,12 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target,  	struct scatterlist *scat;  	struct srp_cmd *cmd = req->cmd->buf;  	int len, nents, count; -	u8 fmt = SRP_DATA_DESC_DIRECT;  	struct srp_device *dev;  	struct ib_device *ibdev; +	struct srp_map_state state; +	struct srp_indirect_buf *indirect_hdr; +	u32 table_len; +	u8 fmt;  	if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE)  		return sizeof (struct srp_cmd); @@ -744,11 +1423,13 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target,  	ibdev = dev->dev;  	count = ib_dma_map_sg(ibdev, scat, nents, scmnd->sc_data_direction); +	if (unlikely(count == 0)) +		return -EIO;  	fmt = SRP_DATA_DESC_DIRECT;  	len = sizeof (struct srp_cmd) +	sizeof (struct srp_direct_buf); -	if (count == 1) { +	if (count == 1 && !register_always) {  		/*  		 * The midlayer only generated a single gather/scatter  		 * entry, or DMA mapping coalesced everything to a @@ -758,51 +1439,73 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target,  		struct srp_direct_buf *buf = (void *) cmd->add_data;  		buf->va  = cpu_to_be64(ib_sg_dma_address(ibdev, scat)); -		buf->key = cpu_to_be32(dev->mr->rkey); +		buf->key = cpu_to_be32(target->rkey);  		buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); -	} else if (srp_map_fmr(target, scat, count, req, -			       (void *) cmd->add_data)) { + +		req->nmdesc = 0; +		goto map_complete; +	} + +	/* +	 * We have more than one scatter/gather entry, so build our indirect +	 * descriptor table, trying to merge as many entries as we can. +	 */ +	indirect_hdr = (void *) cmd->add_data; + +	ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr, +				   target->indirect_size, DMA_TO_DEVICE); + +	memset(&state, 0, sizeof(state)); +	srp_map_sg(&state, target, req, scat, count); + +	/* We've mapped the request, now pull as much of the indirect +	 * descriptor table as we can into the command buffer. If this +	 * target is not using an external indirect table, we are +	 * guaranteed to fit into the command, as the SCSI layer won't +	 * give us more S/G entries than we allow. +	 */ +	if (state.ndesc == 1) {  		/* -		 * FMR mapping failed, and the scatterlist has more -		 * than one entry.  Generate an indirect memory -		 * descriptor. +		 * Memory registration collapsed the sg-list into one entry, +		 * so use a direct descriptor.  		 
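
/*
 * Illustrative sketch, not part of this patch: when more than one descriptor
 * remains after memory registration, the code below sizes the SRP_CMD IU as
 * the fixed command header, one indirect buffer header, plus the descriptors
 * pulled into the command itself. The same arithmetic as a helper, using the
 * struct definitions from <scsi/srp.h>:
 */
#include <scsi/srp.h>

/* IU length for a SRP_CMD carrying an in-command indirect table. */
static size_t my_srp_cmd_len(unsigned int n_desc)
{
	return sizeof(struct srp_cmd) + sizeof(struct srp_indirect_buf) +
	       n_desc * sizeof(struct srp_direct_buf);
}
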
*/ -		struct srp_indirect_buf *buf = (void *) cmd->add_data; -		struct scatterlist *sg; -		u32 datalen = 0; -		int i; - -		fmt = SRP_DATA_DESC_INDIRECT; -		len = sizeof (struct srp_cmd) + -			sizeof (struct srp_indirect_buf) + -			count * sizeof (struct srp_direct_buf); - -		scsi_for_each_sg(scmnd, sg, count, i) { -			unsigned int dma_len = ib_sg_dma_len(ibdev, sg); - -			buf->desc_list[i].va  = -				cpu_to_be64(ib_sg_dma_address(ibdev, sg)); -			buf->desc_list[i].key = -				cpu_to_be32(dev->mr->rkey); -			buf->desc_list[i].len = cpu_to_be32(dma_len); -			datalen += dma_len; -		} - -		if (scmnd->sc_data_direction == DMA_TO_DEVICE) -			cmd->data_out_desc_cnt = count; -		else -			cmd->data_in_desc_cnt = count; +		struct srp_direct_buf *buf = (void *) cmd->add_data; -		buf->table_desc.va  = -			cpu_to_be64(req->cmd->dma + sizeof *cmd + sizeof *buf); -		buf->table_desc.key = -			cpu_to_be32(target->srp_host->srp_dev->mr->rkey); -		buf->table_desc.len = -			cpu_to_be32(count * sizeof (struct srp_direct_buf)); +		*buf = req->indirect_desc[0]; +		goto map_complete; +	} -		buf->len = cpu_to_be32(datalen); +	if (unlikely(target->cmd_sg_cnt < state.ndesc && +						!target->allow_ext_sg)) { +		shost_printk(KERN_ERR, target->scsi_host, +			     "Could not fit S/G list into SRP_CMD\n"); +		return -EIO;  	} +	count = min(state.ndesc, target->cmd_sg_cnt); +	table_len = state.ndesc * sizeof (struct srp_direct_buf); + +	fmt = SRP_DATA_DESC_INDIRECT; +	len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf); +	len += count * sizeof (struct srp_direct_buf); + +	memcpy(indirect_hdr->desc_list, req->indirect_desc, +	       count * sizeof (struct srp_direct_buf)); + +	indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr); +	indirect_hdr->table_desc.key = cpu_to_be32(target->rkey); +	indirect_hdr->table_desc.len = cpu_to_be32(table_len); +	indirect_hdr->len = cpu_to_be32(state.total_len); + +	if (scmnd->sc_data_direction == DMA_TO_DEVICE) +		cmd->data_out_desc_cnt = count; +	else +		cmd->data_in_desc_cnt = count; + +	ib_dma_sync_single_for_device(ibdev, req->indirect_dma_addr, table_len, +				      DMA_TO_DEVICE); + +map_complete:  	if (scmnd->sc_data_direction == DMA_TO_DEVICE)  		cmd->buf_fmt = fmt << 4;  	else @@ -812,9 +1515,23 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target,  }  /* - * Must be called with target->scsi_host->host_lock held to protect - * req_lim and tx_head.  Lock cannot be dropped between call here and - * call to __srp_post_send(). + * Return an IU and possible credit to the free pool + */ +static void srp_put_tx_iu(struct srp_target_port *target, struct srp_iu *iu, +			  enum srp_iu_type iu_type) +{ +	unsigned long flags; + +	spin_lock_irqsave(&target->lock, flags); +	list_add(&iu->list, &target->free_tx); +	if (iu_type != SRP_IU_RSP) +		++target->req_lim; +	spin_unlock_irqrestore(&target->lock, flags); +} + +/* + * Must be called with target->lock held to protect req_lim and free_tx. + * If IU is not sent, it must be returned using srp_put_tx_iu().   
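
/*
 * Illustrative sketch, not part of this patch: the comment above describes the
 * send-side discipline - a TX IU comes off a free list, consumes one remote
 * request-limit credit unless it is an initiator response, and must be handed
 * back together with its credit if it is never posted. A simplified version
 * (hypothetical types; the caller is assumed to hold the lock for both
 * helpers, and the extra credits reserved for task management are omitted):
 */
#include <linux/list.h>
#include <linux/types.h>

struct my_iu {
	struct list_head list;
};

struct my_txq {
	struct list_head free;	/* protected by the caller's lock */
	int credits;		/* remote request-limit credits */
};

/* Caller holds the lock. Responses do not consume a credit. */
static struct my_iu *my_get_tx_iu(struct my_txq *q, bool is_response)
{
	struct my_iu *iu;

	if (list_empty(&q->free))
		return NULL;
	if (!is_response) {
		if (q->credits <= 0)
			return NULL;
		q->credits--;
	}
	iu = list_first_entry(&q->free, struct my_iu, list);
	list_del(&iu->list);

	return iu;
}

/* Caller holds the lock. Undo my_get_tx_iu() if the IU was never posted. */
static void my_put_tx_iu(struct my_txq *q, struct my_iu *iu, bool is_response)
{
	list_add(&iu->list, &q->free);
	if (!is_response)
		q->credits++;
}
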
*   * Note:   * An upper limit for the number of allocated information units for each @@ -833,83 +1550,59 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target,  	srp_send_completion(target->send_cq, target); -	if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE) +	if (list_empty(&target->free_tx))  		return NULL;  	/* Initiator responses to target requests do not consume credits */ -	if (target->req_lim <= rsv && iu_type != SRP_IU_RSP) { -		++target->zero_req_lim; -		return NULL; +	if (iu_type != SRP_IU_RSP) { +		if (target->req_lim <= rsv) { +			++target->zero_req_lim; +			return NULL; +		} + +		--target->req_lim;  	} -	iu = target->tx_ring[target->tx_head & SRP_SQ_MASK]; -	iu->type = iu_type; +	iu = list_first_entry(&target->free_tx, struct srp_iu, list); +	list_del(&iu->list);  	return iu;  } -/* - * Must be called with target->scsi_host->host_lock held to protect - * req_lim and tx_head. - */ -static int __srp_post_send(struct srp_target_port *target, -			   struct srp_iu *iu, int len) +static int srp_post_send(struct srp_target_port *target, +			 struct srp_iu *iu, int len)  {  	struct ib_sge list;  	struct ib_send_wr wr, *bad_wr; -	int ret = 0;  	list.addr   = iu->dma;  	list.length = len; -	list.lkey   = target->srp_host->srp_dev->mr->lkey; +	list.lkey   = target->lkey;  	wr.next       = NULL; -	wr.wr_id      = target->tx_head & SRP_SQ_MASK; +	wr.wr_id      = (uintptr_t) iu;  	wr.sg_list    = &list;  	wr.num_sge    = 1;  	wr.opcode     = IB_WR_SEND;  	wr.send_flags = IB_SEND_SIGNALED; -	ret = ib_post_send(target->qp, &wr, &bad_wr); - -	if (!ret) { -		++target->tx_head; -		if (iu->type != SRP_IU_RSP) -			--target->req_lim; -	} - -	return ret; +	return ib_post_send(target->qp, &wr, &bad_wr);  } -static int srp_post_recv(struct srp_target_port *target) +static int srp_post_recv(struct srp_target_port *target, struct srp_iu *iu)  { -	unsigned long flags; -	struct srp_iu *iu; -	struct ib_sge list;  	struct ib_recv_wr wr, *bad_wr; -	unsigned int next; -	int ret; - -	spin_lock_irqsave(target->scsi_host->host_lock, flags); - -	next	 = target->rx_head & SRP_RQ_MASK; -	wr.wr_id = next; -	iu	 = target->rx_ring[next]; +	struct ib_sge list;  	list.addr   = iu->dma;  	list.length = iu->size; -	list.lkey   = target->srp_host->srp_dev->mr->lkey; +	list.lkey   = target->lkey;  	wr.next     = NULL; +	wr.wr_id    = (uintptr_t) iu;  	wr.sg_list  = &list;  	wr.num_sge  = 1; -	ret = ib_post_recv(target->qp, &wr, &bad_wr); -	if (!ret) -		++target->rx_head; - -	spin_unlock_irqrestore(target->scsi_host->host_lock, flags); - -	return ret; +	return ib_post_recv(target->qp, &wr, &bad_wr);  }  static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) @@ -917,28 +1610,30 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp)  	struct srp_request *req;  	struct scsi_cmnd *scmnd;  	unsigned long flags; -	s32 delta; - -	delta = (s32) be32_to_cpu(rsp->req_lim_delta); - -	spin_lock_irqsave(target->scsi_host->host_lock, flags); - -	target->req_lim += delta; - -	req = &target->req_ring[rsp->tag & ~SRP_TAG_TSK_MGMT];  	if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { -		if (be32_to_cpu(rsp->resp_data_len) < 4) -			req->tsk_status = -1; -		else -			req->tsk_status = rsp->data[3]; -		complete(&req->done); +		spin_lock_irqsave(&target->lock, flags); +		target->req_lim += be32_to_cpu(rsp->req_lim_delta); +		spin_unlock_irqrestore(&target->lock, flags); + +		target->tsk_mgmt_status = -1; +		if (be32_to_cpu(rsp->resp_data_len) >= 4) +			
target->tsk_mgmt_status = rsp->data[3]; +		complete(&target->tsk_mgmt_done);  	} else { -		scmnd = req->scmnd; -		if (!scmnd) +		req = &target->req_ring[rsp->tag]; +		scmnd = srp_claim_req(target, req, NULL, NULL); +		if (!scmnd) {  			shost_printk(KERN_ERR, target->scsi_host,  				     "Null scmnd for RSP w/tag %016llx\n",  				     (unsigned long long) rsp->tag); + +			spin_lock_irqsave(&target->lock, flags); +			target->req_lim += be32_to_cpu(rsp->req_lim_delta); +			spin_unlock_irqrestore(&target->lock, flags); + +			return; +		}  		scmnd->result = rsp->status;  		if (rsp->flags & SRP_RSP_FLAG_SNSVALID) { @@ -953,49 +1648,44 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp)  		else if (rsp->flags & (SRP_RSP_FLAG_DIOVER | SRP_RSP_FLAG_DIUNDER))  			scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); -		if (!req->tsk_mgmt) { -			scmnd->host_scribble = (void *) -1L; -			scmnd->scsi_done(scmnd); +		srp_free_req(target, req, scmnd, +			     be32_to_cpu(rsp->req_lim_delta)); -			srp_remove_req(target, req); -		} else -			req->cmd_done = 1; +		scmnd->host_scribble = NULL; +		scmnd->scsi_done(scmnd);  	} - -	spin_unlock_irqrestore(target->scsi_host->host_lock, flags);  }  static int srp_response_common(struct srp_target_port *target, s32 req_delta,  			       void *rsp, int len)  { -	struct ib_device *dev; +	struct ib_device *dev = target->srp_host->srp_dev->dev;  	unsigned long flags;  	struct srp_iu *iu; -	int err = 1; +	int err; -	dev = target->srp_host->srp_dev->dev; - -	spin_lock_irqsave(target->scsi_host->host_lock, flags); +	spin_lock_irqsave(&target->lock, flags);  	target->req_lim += req_delta; -  	iu = __srp_get_tx_iu(target, SRP_IU_RSP); +	spin_unlock_irqrestore(&target->lock, flags); +  	if (!iu) {  		shost_printk(KERN_ERR, target->scsi_host, PFX  			     "no IU available to send response\n"); -		goto out; +		return 1;  	}  	ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE);  	memcpy(iu->buf, rsp, len);  	ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE); -	err = __srp_post_send(target, iu, len); -	if (err) +	err = srp_post_send(target, iu, len); +	if (err) {  		shost_printk(KERN_ERR, target->scsi_host, PFX  			     "unable to post response: %d\n", err); +		srp_put_tx_iu(target, iu, SRP_IU_RSP); +	} -out: -	spin_unlock_irqrestore(target->scsi_host->host_lock, flags);  	return err;  } @@ -1032,14 +1722,11 @@ static void srp_process_aer_req(struct srp_target_port *target,  static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)  { -	struct ib_device *dev; -	struct srp_iu *iu; +	struct ib_device *dev = target->srp_host->srp_dev->dev; +	struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id;  	int res;  	u8 opcode; -	iu = target->rx_ring[wc->wr_id]; - -	dev = target->srp_host->srp_dev->dev;  	ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_ti_iu_len,  				   DMA_FROM_DEVICE); @@ -1080,12 +1767,51 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)  	ib_dma_sync_single_for_device(dev, iu->dma, target->max_ti_iu_len,  				      DMA_FROM_DEVICE); -	res = srp_post_recv(target); +	res = srp_post_recv(target, iu);  	if (res != 0)  		shost_printk(KERN_ERR, target->scsi_host,  			     PFX "Recv failed with error code %d\n", res);  } +/** + * srp_tl_err_work() - handle a transport layer error + * @work: Work structure embedded in an SRP target port. + * + * Note: This function may get invoked before the rport has been created, + * hence the target->rport test. 
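As an aside, the request-limit bookkeeping spread across __srp_get_tx_iu(), srp_put_tx_iu() and srp_process_rsp() boils down to a small credit protocol: every SRP_IU_CMD or SRP_IU_TSK_MGMT send consumes one credit, initiator responses (SRP_IU_RSP) are free, and each SRP_RSP from the target returns credits through req_lim_delta. The userspace model below is only an illustration; the function and variable names are invented, and the reserved-credit value stands in for the driver's task-management reservation.

#include <stdbool.h>
#include <stdio.h>

static int req_lim;	/* credits granted by the target */

/* Mirrors the credit check in __srp_get_tx_iu(): responses are free,
 * commands may not dip into the credits reserved for task management. */
static bool model_consume_credit(bool is_rsp, int reserved)
{
	if (is_rsp)
		return true;
	if (req_lim <= reserved)
		return false;
	--req_lim;
	return true;
}

/* Mirrors srp_put_tx_iu()/srp_process_rsp(): credits flow back when an
 * unsent IU is returned or when the target sends a req_lim_delta. */
static void model_return_credits(int delta)
{
	req_lim += delta;
}

int main(void)
{
	req_lim = 62;				/* example SRP_LOGIN_RSP grant */
	printf("cmd sent: %d\n", model_consume_credit(false, 1));
	printf("rsp sent: %d\n", model_consume_credit(true, 1));
	model_return_credits(1);		/* req_lim_delta from an SRP_RSP */
	printf("req_lim: %d\n", req_lim);
	return 0;
}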
+ */ +static void srp_tl_err_work(struct work_struct *work) +{ +	struct srp_target_port *target; + +	target = container_of(work, struct srp_target_port, tl_err_work); +	if (target->rport) +		srp_start_tl_fail_timers(target->rport); +} + +static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status, +			      bool send_err, struct srp_target_port *target) +{ +	if (target->connected && !target->qp_in_error) { +		if (wr_id & LOCAL_INV_WR_ID_MASK) { +			shost_printk(KERN_ERR, target->scsi_host, PFX +				     "LOCAL_INV failed with status %d\n", +				     wc_status); +		} else if (wr_id & FAST_REG_WR_ID_MASK) { +			shost_printk(KERN_ERR, target->scsi_host, PFX +				     "FAST_REG_MR failed status %d\n", +				     wc_status); +		} else { +			shost_printk(KERN_ERR, target->scsi_host, +				     PFX "failed %s status %d for iu %p\n", +				     send_err ? "send" : "receive", +				     wc_status, (void *)(uintptr_t)wr_id); +		} +		queue_work(system_long_wq, &target->tl_err_work); +	} +	target->qp_in_error = true; +} +  static void srp_recv_completion(struct ib_cq *cq, void *target_ptr)  {  	struct srp_target_port *target = target_ptr; @@ -1093,15 +1819,11 @@ static void srp_recv_completion(struct ib_cq *cq, void *target_ptr)  	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);  	while (ib_poll_cq(cq, 1, &wc) > 0) { -		if (wc.status) { -			shost_printk(KERN_ERR, target->scsi_host, -				     PFX "failed receive status %d\n", -				     wc.status); -			target->qp_in_error = 1; -			break; +		if (likely(wc.status == IB_WC_SUCCESS)) { +			srp_handle_recv(target, &wc); +		} else { +			srp_handle_qp_err(wc.wr_id, wc.status, false, target);  		} - -		srp_handle_recv(target, &wc);  	}  } @@ -1109,53 +1831,57 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr)  {  	struct srp_target_port *target = target_ptr;  	struct ib_wc wc; +	struct srp_iu *iu;  	while (ib_poll_cq(cq, 1, &wc) > 0) { -		if (wc.status) { -			shost_printk(KERN_ERR, target->scsi_host, -				     PFX "failed send status %d\n", -				     wc.status); -			target->qp_in_error = 1; -			break; +		if (likely(wc.status == IB_WC_SUCCESS)) { +			iu = (struct srp_iu *) (uintptr_t) wc.wr_id; +			list_add(&iu->list, &target->free_tx); +		} else { +			srp_handle_qp_err(wc.wr_id, wc.status, true, target);  		} - -		++target->tx_tail;  	}  } -static int srp_queuecommand(struct scsi_cmnd *scmnd, -			    void (*done)(struct scsi_cmnd *)) +static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)  { -	struct srp_target_port *target = host_to_target(scmnd->device->host); +	struct srp_target_port *target = host_to_target(shost); +	struct srp_rport *rport = target->rport;  	struct srp_request *req;  	struct srp_iu *iu;  	struct srp_cmd *cmd;  	struct ib_device *dev; -	int len; +	unsigned long flags; +	int len, ret; +	const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler; -	if (target->state == SRP_TARGET_CONNECTING) -		goto err; +	/* +	 * The SCSI EH thread is the only context from which srp_queuecommand() +	 * can get invoked for blocked devices (SDEV_BLOCK / +	 * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by +	 * locking the rport mutex if invoked from inside the SCSI EH. 
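A note on the wr_id convention srp_handle_qp_err() relies on, sketched below under the assumption that information units are heap-allocated and therefore at least pointer-aligned: a wr_id that carries an IU pointer always has its two low bits clear, so LOCAL_INV_WR_ID_MASK (1) and FAST_REG_WR_ID_MASK (2) can double as flag bits marking memory-registration work requests. The snippet is a standalone illustration, not driver code.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define LOCAL_INV_WR_ID_MASK	1
#define FAST_REG_WR_ID_MASK	2

struct iu { char buf[64]; };		/* stand-in for struct srp_iu */

int main(void)
{
	struct iu *iu = malloc(sizeof(*iu));
	uint64_t wr_id = (uintptr_t)iu;

	/* Low bits of an aligned pointer are zero, so flag bits are unambiguous. */
	assert(!(wr_id & (LOCAL_INV_WR_ID_MASK | FAST_REG_WR_ID_MASK)));
	assert((LOCAL_INV_WR_ID_MASK & FAST_REG_WR_ID_MASK) == 0);

	free(iu);
	return 0;
}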
+	 */ +	if (in_scsi_eh) +		mutex_lock(&rport->mutex); -	if (target->state == SRP_TARGET_DEAD || -	    target->state == SRP_TARGET_REMOVED) { -		scmnd->result = DID_BAD_TARGET << 16; -		done(scmnd); -		return 0; -	} +	scmnd->result = srp_chkready(target->rport); +	if (unlikely(scmnd->result)) +		goto err; +	spin_lock_irqsave(&target->lock, flags);  	iu = __srp_get_tx_iu(target, SRP_IU_CMD);  	if (!iu) -		goto err; +		goto err_unlock; + +	req = list_first_entry(&target->free_reqs, struct srp_request, list); +	list_del(&req->list); +	spin_unlock_irqrestore(&target->lock, flags);  	dev = target->srp_host->srp_dev->dev; -	ib_dma_sync_single_for_cpu(dev, iu->dma, srp_max_iu_len, +	ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len,  				   DMA_TO_DEVICE); -	req = list_first_entry(&target->free_reqs, struct srp_request, list); - -	scmnd->scsi_done     = done; -	scmnd->result        = 0; -	scmnd->host_scribble = (void *) (long) req->index; +	scmnd->host_scribble = (void *) req;  	cmd = iu->buf;  	memset(cmd, 0, sizeof *cmd); @@ -1167,40 +1893,85 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd,  	req->scmnd    = scmnd;  	req->cmd      = iu; -	req->cmd_done = 0; -	req->tsk_mgmt = NULL;  	len = srp_map_data(scmnd, target, req);  	if (len < 0) {  		shost_printk(KERN_ERR, target->scsi_host, -			     PFX "Failed to map data\n"); -		goto err; +			     PFX "Failed to map data (%d)\n", len); +		/* +		 * If we ran out of memory descriptors (-ENOMEM) because an +		 * application is queuing many requests with more than +		 * max_pages_per_mr sg-list elements, tell the SCSI mid-layer +		 * to reduce queue depth temporarily. +		 */ +		scmnd->result = len == -ENOMEM ? +			DID_OK << 16 | QUEUE_FULL << 1 : DID_ERROR << 16; +		goto err_iu;  	} -	ib_dma_sync_single_for_device(dev, iu->dma, srp_max_iu_len, +	ib_dma_sync_single_for_device(dev, iu->dma, target->max_iu_len,  				      DMA_TO_DEVICE); -	if (__srp_post_send(target, iu, len)) { +	if (srp_post_send(target, iu, len)) {  		shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n");  		goto err_unmap;  	} -	list_move_tail(&req->list, &target->req_queue); +	ret = 0; + +unlock_rport: +	if (in_scsi_eh) +		mutex_unlock(&rport->mutex); -	return 0; +	return ret;  err_unmap:  	srp_unmap_data(scmnd, target, req); +err_iu: +	srp_put_tx_iu(target, iu, SRP_IU_CMD); + +	/* +	 * Avoid that the loops that iterate over the request ring can +	 * encounter a dangling SCSI command pointer. +	 */ +	req->scmnd = NULL; + +	spin_lock_irqsave(&target->lock, flags); +	list_add(&req->list, &target->free_reqs); + +err_unlock: +	spin_unlock_irqrestore(&target->lock, flags); +  err: -	return SCSI_MLQUEUE_HOST_BUSY; +	if (scmnd->result) { +		scmnd->scsi_done(scmnd); +		ret = 0; +	} else { +		ret = SCSI_MLQUEUE_HOST_BUSY; +	} + +	goto unlock_rport;  } +/* + * Note: the resources allocated in this function are freed in + * srp_free_target_ib(). 
+ */  static int srp_alloc_iu_bufs(struct srp_target_port *target)  {  	int i; -	for (i = 0; i < SRP_RQ_SIZE; ++i) { +	target->rx_ring = kzalloc(target->queue_size * sizeof(*target->rx_ring), +				  GFP_KERNEL); +	if (!target->rx_ring) +		goto err_no_ring; +	target->tx_ring = kzalloc(target->queue_size * sizeof(*target->tx_ring), +				  GFP_KERNEL); +	if (!target->tx_ring) +		goto err_no_ring; + +	for (i = 0; i < target->queue_size; ++i) {  		target->rx_ring[i] = srp_alloc_iu(target->srp_host,  						  target->max_ti_iu_len,  						  GFP_KERNEL, DMA_FROM_DEVICE); @@ -1208,30 +1979,138 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target)  			goto err;  	} -	for (i = 0; i < SRP_SQ_SIZE; ++i) { +	for (i = 0; i < target->queue_size; ++i) {  		target->tx_ring[i] = srp_alloc_iu(target->srp_host, -						  srp_max_iu_len, +						  target->max_iu_len,  						  GFP_KERNEL, DMA_TO_DEVICE);  		if (!target->tx_ring[i])  			goto err; + +		list_add(&target->tx_ring[i]->list, &target->free_tx);  	}  	return 0;  err: -	for (i = 0; i < SRP_RQ_SIZE; ++i) { +	for (i = 0; i < target->queue_size; ++i) {  		srp_free_iu(target->srp_host, target->rx_ring[i]); -		target->rx_ring[i] = NULL; -	} - -	for (i = 0; i < SRP_SQ_SIZE; ++i) {  		srp_free_iu(target->srp_host, target->tx_ring[i]); -		target->tx_ring[i] = NULL;  	} + +err_no_ring: +	kfree(target->tx_ring); +	target->tx_ring = NULL; +	kfree(target->rx_ring); +	target->rx_ring = NULL; +  	return -ENOMEM;  } +static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask) +{ +	uint64_t T_tr_ns, max_compl_time_ms; +	uint32_t rq_tmo_jiffies; + +	/* +	 * According to section 11.2.4.2 in the IBTA spec (Modify Queue Pair, +	 * table 91), both the QP timeout and the retry count have to be set +	 * for RC QP's during the RTR to RTS transition. +	 */ +	WARN_ON_ONCE((attr_mask & (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)) != +		     (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)); + +	/* +	 * Set target->rq_tmo_jiffies to one second more than the largest time +	 * it can take before an error completion is generated. See also +	 * C9-140..142 in the IBTA spec for more information about how to +	 * convert the QP Local ACK Timeout value to nanoseconds. +	 */ +	T_tr_ns = 4096 * (1ULL << qp_attr->timeout); +	max_compl_time_ms = qp_attr->retry_cnt * 4 * T_tr_ns; +	do_div(max_compl_time_ms, NSEC_PER_MSEC); +	rq_tmo_jiffies = msecs_to_jiffies(max_compl_time_ms + 1000); + +	return rq_tmo_jiffies; +} + +static void srp_cm_rep_handler(struct ib_cm_id *cm_id, +			       struct srp_login_rsp *lrsp, +			       struct srp_target_port *target) +{ +	struct ib_qp_attr *qp_attr = NULL; +	int attr_mask = 0; +	int ret; +	int i; + +	if (lrsp->opcode == SRP_LOGIN_RSP) { +		target->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len); +		target->req_lim       = be32_to_cpu(lrsp->req_lim_delta); + +		/* +		 * Reserve credits for task management so we don't +		 * bounce requests back to the SCSI mid-layer. 
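The timeout conversion in srp_compute_rq_tmo() is easier to follow with concrete numbers. Assuming, purely as an example, a QP Local ACK Timeout of 19 and a retry count of 7: T_tr is 4096 * 2^19 ns, roughly 2.15 s, the worst-case completion time is 7 * 4 * T_tr, about 60 s, and the resulting request-queue timeout is that plus one second. The standalone calculation below reproduces the same arithmetic in userspace.

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	unsigned int timeout = 19, retry_cnt = 7;	/* example values only */
	uint64_t T_tr_ns = 4096ULL * (1ULL << timeout);
	uint64_t max_compl_time_ms = retry_cnt * 4 * T_tr_ns / 1000000ULL;

	/* Prints 61129 ms, i.e. roughly 61 seconds. */
	printf("rq timeout = %" PRIu64 " ms\n", max_compl_time_ms + 1000);
	return 0;
}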
+		 */ +		target->scsi_host->can_queue +			= min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, +			      target->scsi_host->can_queue); +		target->scsi_host->cmd_per_lun +			= min_t(int, target->scsi_host->can_queue, +				target->scsi_host->cmd_per_lun); +	} else { +		shost_printk(KERN_WARNING, target->scsi_host, +			     PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); +		ret = -ECONNRESET; +		goto error; +	} + +	if (!target->rx_ring) { +		ret = srp_alloc_iu_bufs(target); +		if (ret) +			goto error; +	} + +	ret = -ENOMEM; +	qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); +	if (!qp_attr) +		goto error; + +	qp_attr->qp_state = IB_QPS_RTR; +	ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); +	if (ret) +		goto error_free; + +	ret = ib_modify_qp(target->qp, qp_attr, attr_mask); +	if (ret) +		goto error_free; + +	for (i = 0; i < target->queue_size; i++) { +		struct srp_iu *iu = target->rx_ring[i]; +		ret = srp_post_recv(target, iu); +		if (ret) +			goto error_free; +	} + +	qp_attr->qp_state = IB_QPS_RTS; +	ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); +	if (ret) +		goto error_free; + +	target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); + +	ret = ib_modify_qp(target->qp, qp_attr, attr_mask); +	if (ret) +		goto error_free; + +	ret = ib_send_cm_rtu(cm_id, NULL, 0); + +error_free: +	kfree(qp_attr); + +error: +	target->status = ret; +} +  static void srp_cm_rej_handler(struct ib_cm_id *cm_id,  			       struct ib_cm_event *event,  			       struct srp_target_port *target) @@ -1291,8 +2170,10 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id,  				shost_printk(KERN_WARNING, shost,  					     PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n");  			else -				shost_printk(KERN_WARNING, shost, -					    PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason); +				shost_printk(KERN_WARNING, shost, PFX +					     "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n", +					     target->path.sgid.raw, +					     target->orig_dgid, reason);  		} else  			shost_printk(KERN_WARNING, shost,  				     "  REJ reason: IB_CM_REJ_CONSUMER_DEFINED," @@ -1315,11 +2196,7 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id,  static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)  {  	struct srp_target_port *target = cm_id->context; -	struct ib_qp_attr *qp_attr = NULL; -	int attr_mask = 0;  	int comp = 0; -	int opcode = 0; -	int i;  	switch (event->event) {  	case IB_CM_REQ_ERROR: @@ -1331,70 +2208,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)  	case IB_CM_REP_RECEIVED:  		comp = 1; -		opcode = *(u8 *) event->private_data; - -		if (opcode == SRP_LOGIN_RSP) { -			struct srp_login_rsp *rsp = event->private_data; - -			target->max_ti_iu_len = be32_to_cpu(rsp->max_ti_iu_len); -			target->req_lim       = be32_to_cpu(rsp->req_lim_delta); - -			/* -			 * Reserve credits for task management so we don't -			 * bounce requests back to the SCSI mid-layer. 
-			 */ -			target->scsi_host->can_queue -				= min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, -				      target->scsi_host->can_queue); -		} else { -			shost_printk(KERN_WARNING, target->scsi_host, -				    PFX "Unhandled RSP opcode %#x\n", opcode); -			target->status = -ECONNRESET; -			break; -		} - -		if (!target->rx_ring[0]) { -			target->status = srp_alloc_iu_bufs(target); -			if (target->status) -				break; -		} - -		qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); -		if (!qp_attr) { -			target->status = -ENOMEM; -			break; -		} - -		qp_attr->qp_state = IB_QPS_RTR; -		target->status = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); -		if (target->status) -			break; - -		target->status = ib_modify_qp(target->qp, qp_attr, attr_mask); -		if (target->status) -			break; - -		for (i = 0; i < SRP_RQ_SIZE; i++) { -			target->status = srp_post_recv(target); -			if (target->status) -				break; -		} -		if (target->status) -			break; - -		qp_attr->qp_state = IB_QPS_RTS; -		target->status = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); -		if (target->status) -			break; - -		target->status = ib_modify_qp(target->qp, qp_attr, attr_mask); -		if (target->status) -			break; - -		target->status = ib_send_cm_rtu(cm_id, NULL, 0); -		if (target->status) -			break; - +		srp_cm_rep_handler(cm_id, event->private_data, target);  		break;  	case IB_CM_REJ_RECEIVED: @@ -1407,16 +2221,18 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)  	case IB_CM_DREQ_RECEIVED:  		shost_printk(KERN_WARNING, target->scsi_host,  			     PFX "DREQ received - connection closed\n"); +		srp_change_conn_state(target, false);  		if (ib_send_cm_drep(cm_id, NULL, 0))  			shost_printk(KERN_ERR, target->scsi_host,  				     PFX "Sending CM DREP failed\n"); +		queue_work(system_long_wq, &target->tl_err_work);  		break;  	case IB_CM_TIMEWAIT_EXIT:  		shost_printk(KERN_ERR, target->scsi_host,  			     PFX "connection closed\n"); -  		comp = 1; +  		target->status = 0;  		break; @@ -1434,31 +2250,87 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)  	if (comp)  		complete(&target->done); -	kfree(qp_attr); -  	return 0;  } +/** + * srp_change_queue_type - changing device queue tag type + * @sdev: scsi device struct + * @tag_type: requested tag type + * + * Returns queue tag type. + */ +static int +srp_change_queue_type(struct scsi_device *sdev, int tag_type) +{ +	if (sdev->tagged_supported) { +		scsi_set_tag_type(sdev, tag_type); +		if (tag_type) +			scsi_activate_tcq(sdev, sdev->queue_depth); +		else +			scsi_deactivate_tcq(sdev, sdev->queue_depth); +	} else +		tag_type = 0; + +	return tag_type; +} + +/** + * srp_change_queue_depth - setting device queue depth + * @sdev: scsi device struct + * @qdepth: requested queue depth + * @reason: SCSI_QDEPTH_DEFAULT/SCSI_QDEPTH_QFULL/SCSI_QDEPTH_RAMP_UP + * (see include/scsi/scsi_host.h for definition) + * + * Returns queue depth. 
+ */ +static int +srp_change_queue_depth(struct scsi_device *sdev, int qdepth, int reason) +{ +	struct Scsi_Host *shost = sdev->host; +	int max_depth; +	if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) { +		max_depth = shost->can_queue; +		if (!sdev->tagged_supported) +			max_depth = 1; +		if (qdepth > max_depth) +			qdepth = max_depth; +		scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth); +	} else if (reason == SCSI_QDEPTH_QFULL) +		scsi_track_queue_full(sdev, qdepth); +	else +		return -EOPNOTSUPP; + +	return sdev->queue_depth; +} +  static int srp_send_tsk_mgmt(struct srp_target_port *target, -			     struct srp_request *req, u8 func) +			     u64 req_tag, unsigned int lun, u8 func)  { +	struct srp_rport *rport = target->rport;  	struct ib_device *dev = target->srp_host->srp_dev->dev;  	struct srp_iu *iu;  	struct srp_tsk_mgmt *tsk_mgmt; -	spin_lock_irq(target->scsi_host->host_lock); - -	if (target->state == SRP_TARGET_DEAD || -	    target->state == SRP_TARGET_REMOVED) { -		req->scmnd->result = DID_BAD_TARGET << 16; -		goto out; -	} +	if (!target->connected || target->qp_in_error) +		return -1; -	init_completion(&req->done); +	init_completion(&target->tsk_mgmt_done); +	/* +	 * Lock the rport mutex to avoid that srp_create_target_ib() is +	 * invoked while a task management function is being sent. +	 */ +	mutex_lock(&rport->mutex); +	spin_lock_irq(&target->lock);  	iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); -	if (!iu) -		goto out; +	spin_unlock_irq(&target->lock); + +	if (!iu) { +		mutex_unlock(&rport->mutex); + +		return -1; +	}  	ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt,  				   DMA_TO_DEVICE); @@ -1466,70 +2338,48 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target,  	memset(tsk_mgmt, 0, sizeof *tsk_mgmt);  	tsk_mgmt->opcode 	= SRP_TSK_MGMT; -	tsk_mgmt->lun 		= cpu_to_be64((u64) req->scmnd->device->lun << 48); -	tsk_mgmt->tag 		= req->index | SRP_TAG_TSK_MGMT; +	tsk_mgmt->lun		= cpu_to_be64((u64) lun << 48); +	tsk_mgmt->tag		= req_tag | SRP_TAG_TSK_MGMT;  	tsk_mgmt->tsk_mgmt_func = func; -	tsk_mgmt->task_tag 	= req->index; +	tsk_mgmt->task_tag	= req_tag;  	ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt,  				      DMA_TO_DEVICE); -	if (__srp_post_send(target, iu, sizeof *tsk_mgmt)) -		goto out; - -	req->tsk_mgmt = iu; - -	spin_unlock_irq(target->scsi_host->host_lock); +	if (srp_post_send(target, iu, sizeof *tsk_mgmt)) { +		srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT); +		mutex_unlock(&rport->mutex); -	if (!wait_for_completion_timeout(&req->done, -					 msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)))  		return -1; +	} +	mutex_unlock(&rport->mutex); -	return 0; - -out: -	spin_unlock_irq(target->scsi_host->host_lock); -	return -1; -} - -static int srp_find_req(struct srp_target_port *target, -			struct scsi_cmnd *scmnd, -			struct srp_request **req) -{ -	if (scmnd->host_scribble == (void *) -1L) +	if (!wait_for_completion_timeout(&target->tsk_mgmt_done, +					 msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)))  		return -1; -	*req = &target->req_ring[(long) scmnd->host_scribble]; -  	return 0;  }  static int srp_abort(struct scsi_cmnd *scmnd)  {  	struct srp_target_port *target = host_to_target(scmnd->device->host); -	struct srp_request *req; -	int ret = SUCCESS; +	struct srp_request *req = (struct srp_request *) scmnd->host_scribble; +	int ret;  	shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); -	if (target->qp_in_error) -		return FAILED; -	if (srp_find_req(target, scmnd, &req)) -		return FAILED; -	if 
(srp_send_tsk_mgmt(target, req, SRP_TSK_ABORT_TASK)) -		return FAILED; - -	spin_lock_irq(target->scsi_host->host_lock); - -	if (req->cmd_done) { -		srp_remove_req(target, req); -		scmnd->scsi_done(scmnd); -	} else if (!req->tsk_status) { -		srp_remove_req(target, req); -		scmnd->result = DID_ABORT << 16; -	} else +	if (!req || !srp_claim_req(target, req, NULL, scmnd)) +		return SUCCESS; +	if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, +			      SRP_TSK_ABORT_TASK) == 0) +		ret = SUCCESS; +	else if (target->rport->state == SRP_RPORT_LOST) +		ret = FAST_IO_FAIL; +	else  		ret = FAILED; - -	spin_unlock_irq(target->scsi_host->host_lock); +	srp_free_req(target, req, scmnd, 0); +	scmnd->result = DID_ABORT << 16; +	scmnd->scsi_done(scmnd);  	return ret;  } @@ -1537,26 +2387,20 @@ static int srp_abort(struct scsi_cmnd *scmnd)  static int srp_reset_device(struct scsi_cmnd *scmnd)  {  	struct srp_target_port *target = host_to_target(scmnd->device->host); -	struct srp_request *req, *tmp; +	int i;  	shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); -	if (target->qp_in_error) +	if (srp_send_tsk_mgmt(target, SRP_TAG_NO_REQ, scmnd->device->lun, +			      SRP_TSK_LUN_RESET))  		return FAILED; -	if (srp_find_req(target, scmnd, &req)) +	if (target->tsk_mgmt_status)  		return FAILED; -	if (srp_send_tsk_mgmt(target, req, SRP_TSK_LUN_RESET)) -		return FAILED; -	if (req->tsk_status) -		return FAILED; - -	spin_lock_irq(target->scsi_host->host_lock); -	list_for_each_entry_safe(req, tmp, &target->req_queue, list) -		if (req->scmnd->device == scmnd->device) -			srp_reset_req(target, req); - -	spin_unlock_irq(target->scsi_host->host_lock); +	for (i = 0; i < target->req_ring_size; ++i) { +		struct srp_request *req = &target->req_ring[i]; +		srp_finish_req(target, req, scmnd->device, DID_RESET << 16); +	}  	return SUCCESS;  } @@ -1564,14 +2408,25 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)  static int srp_reset_host(struct scsi_cmnd *scmnd)  {  	struct srp_target_port *target = host_to_target(scmnd->device->host); -	int ret = FAILED;  	shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); -	if (!srp_reconnect_target(target)) -		ret = SUCCESS; +	return srp_reconnect_rport(target->rport) == 0 ? 
SUCCESS : FAILED; +} -	return ret; +static int srp_slave_configure(struct scsi_device *sdev) +{ +	struct Scsi_Host *shost = sdev->host; +	struct srp_target_port *target = host_to_target(shost); +	struct request_queue *q = sdev->request_queue; +	unsigned long timeout; + +	if (sdev->type == TYPE_DISK) { +		timeout = max_t(unsigned, 30 * HZ, target->rq_tmo_jiffies); +		blk_queue_rq_timeout(q, timeout); +	} + +	return 0;  }  static ssize_t show_id_ext(struct device *dev, struct device_attribute *attr, @@ -1579,10 +2434,6 @@ static ssize_t show_id_ext(struct device *dev, struct device_attribute *attr,  {  	struct srp_target_port *target = host_to_target(class_to_shost(dev)); -	if (target->state == SRP_TARGET_DEAD || -	    target->state == SRP_TARGET_REMOVED) -		return -ENODEV; -  	return sprintf(buf, "0x%016llx\n",  		       (unsigned long long) be64_to_cpu(target->id_ext));  } @@ -1592,10 +2443,6 @@ static ssize_t show_ioc_guid(struct device *dev, struct device_attribute *attr,  {  	struct srp_target_port *target = host_to_target(class_to_shost(dev)); -	if (target->state == SRP_TARGET_DEAD || -	    target->state == SRP_TARGET_REMOVED) -		return -ENODEV; -  	return sprintf(buf, "0x%016llx\n",  		       (unsigned long long) be64_to_cpu(target->ioc_guid));  } @@ -1605,10 +2452,6 @@ static ssize_t show_service_id(struct device *dev,  {  	struct srp_target_port *target = host_to_target(class_to_shost(dev)); -	if (target->state == SRP_TARGET_DEAD || -	    target->state == SRP_TARGET_REMOVED) -		return -ENODEV; -  	return sprintf(buf, "0x%016llx\n",  		       (unsigned long long) be64_to_cpu(target->service_id));  } @@ -1618,21 +2461,21 @@ static ssize_t show_pkey(struct device *dev, struct device_attribute *attr,  {  	struct srp_target_port *target = host_to_target(class_to_shost(dev)); -	if (target->state == SRP_TARGET_DEAD || -	    target->state == SRP_TARGET_REMOVED) -		return -ENODEV; -  	return sprintf(buf, "0x%04x\n", be16_to_cpu(target->path.pkey));  } -static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, +static ssize_t show_sgid(struct device *dev, struct device_attribute *attr,  			 char *buf)  {  	struct srp_target_port *target = host_to_target(class_to_shost(dev)); -	if (target->state == SRP_TARGET_DEAD || -	    target->state == SRP_TARGET_REMOVED) -		return -ENODEV; +	return sprintf(buf, "%pI6\n", target->path.sgid.raw); +} + +static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, +			 char *buf) +{ +	struct srp_target_port *target = host_to_target(class_to_shost(dev));  	return sprintf(buf, "%pI6\n", target->path.dgid.raw);  } @@ -1642,10 +2485,6 @@ static ssize_t show_orig_dgid(struct device *dev,  {  	struct srp_target_port *target = host_to_target(class_to_shost(dev)); -	if (target->state == SRP_TARGET_DEAD || -	    target->state == SRP_TARGET_REMOVED) -		return -ENODEV; -  	return sprintf(buf, "%pI6\n", target->orig_dgid);  } @@ -1654,10 +2493,6 @@ static ssize_t show_req_lim(struct device *dev,  {  	struct srp_target_port *target = host_to_target(class_to_shost(dev)); -	if (target->state == SRP_TARGET_DEAD || -	    target->state == SRP_TARGET_REMOVED) -		return -ENODEV; -  	return sprintf(buf, "%d\n", target->req_lim);  } @@ -1666,10 +2501,6 @@ static ssize_t show_zero_req_lim(struct device *dev,  {  	struct srp_target_port *target = host_to_target(class_to_shost(dev)); -	if (target->state == SRP_TARGET_DEAD || -	    target->state == SRP_TARGET_REMOVED) -		return -ENODEV; -  	return sprintf(buf, "%d\n", target->zero_req_lim);  } @@ 
-1689,28 +2520,70 @@ static ssize_t show_local_ib_device(struct device *dev,  	return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name);  } +static ssize_t show_comp_vector(struct device *dev, +				struct device_attribute *attr, char *buf) +{ +	struct srp_target_port *target = host_to_target(class_to_shost(dev)); + +	return sprintf(buf, "%d\n", target->comp_vector); +} + +static ssize_t show_tl_retry_count(struct device *dev, +				   struct device_attribute *attr, char *buf) +{ +	struct srp_target_port *target = host_to_target(class_to_shost(dev)); + +	return sprintf(buf, "%d\n", target->tl_retry_count); +} + +static ssize_t show_cmd_sg_entries(struct device *dev, +				   struct device_attribute *attr, char *buf) +{ +	struct srp_target_port *target = host_to_target(class_to_shost(dev)); + +	return sprintf(buf, "%u\n", target->cmd_sg_cnt); +} + +static ssize_t show_allow_ext_sg(struct device *dev, +				 struct device_attribute *attr, char *buf) +{ +	struct srp_target_port *target = host_to_target(class_to_shost(dev)); + +	return sprintf(buf, "%s\n", target->allow_ext_sg ? "true" : "false"); +} +  static DEVICE_ATTR(id_ext,	    S_IRUGO, show_id_ext,	   NULL);  static DEVICE_ATTR(ioc_guid,	    S_IRUGO, show_ioc_guid,	   NULL);  static DEVICE_ATTR(service_id,	    S_IRUGO, show_service_id,	   NULL);  static DEVICE_ATTR(pkey,	    S_IRUGO, show_pkey,		   NULL); +static DEVICE_ATTR(sgid,	    S_IRUGO, show_sgid,		   NULL);  static DEVICE_ATTR(dgid,	    S_IRUGO, show_dgid,		   NULL);  static DEVICE_ATTR(orig_dgid,	    S_IRUGO, show_orig_dgid,	   NULL);  static DEVICE_ATTR(req_lim,         S_IRUGO, show_req_lim,         NULL);  static DEVICE_ATTR(zero_req_lim,    S_IRUGO, show_zero_req_lim,	   NULL);  static DEVICE_ATTR(local_ib_port,   S_IRUGO, show_local_ib_port,   NULL);  static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); +static DEVICE_ATTR(comp_vector,     S_IRUGO, show_comp_vector,     NULL); +static DEVICE_ATTR(tl_retry_count,  S_IRUGO, show_tl_retry_count,  NULL); +static DEVICE_ATTR(cmd_sg_entries,  S_IRUGO, show_cmd_sg_entries,  NULL); +static DEVICE_ATTR(allow_ext_sg,    S_IRUGO, show_allow_ext_sg,    NULL);  static struct device_attribute *srp_host_attrs[] = {  	&dev_attr_id_ext,  	&dev_attr_ioc_guid,  	&dev_attr_service_id,  	&dev_attr_pkey, +	&dev_attr_sgid,  	&dev_attr_dgid,  	&dev_attr_orig_dgid,  	&dev_attr_req_lim,  	&dev_attr_zero_req_lim,  	&dev_attr_local_ib_port,  	&dev_attr_local_ib_device, +	&dev_attr_comp_vector, +	&dev_attr_tl_retry_count, +	&dev_attr_cmd_sg_entries, +	&dev_attr_allow_ext_sg,  	NULL  }; @@ -1718,14 +2591,19 @@ static struct scsi_host_template srp_template = {  	.module				= THIS_MODULE,  	.name				= "InfiniBand SRP initiator",  	.proc_name			= DRV_NAME, +	.slave_configure		= srp_slave_configure,  	.info				= srp_target_info,  	.queuecommand			= srp_queuecommand, +	.change_queue_depth             = srp_change_queue_depth, +	.change_queue_type              = srp_change_queue_type,  	.eh_abort_handler		= srp_abort,  	.eh_device_reset_handler	= srp_reset_device,  	.eh_host_reset_handler		= srp_reset_host, -	.can_queue			= SRP_CMD_SQ_SIZE, +	.skip_settle_delay		= true, +	.sg_tablesize			= SRP_DEF_SG_TABLESIZE, +	.can_queue			= SRP_DEFAULT_CMD_SQ_SIZE,  	.this_id			= -1, -	.cmd_per_lun			= SRP_CMD_SQ_SIZE, +	.cmd_per_lun			= SRP_DEFAULT_CMD_SQ_SIZE,  	.use_clustering			= ENABLE_CLUSTERING,  	.shost_attrs			= srp_host_attrs  }; @@ -1750,6 +2628,9 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target)  		
return PTR_ERR(rport);  	} +	rport->lld_data = target; +	target->rport = rport; +  	spin_lock(&host->target_lock);  	list_add_tail(&target->list, &host->target_list);  	spin_unlock(&host->target_lock); @@ -1775,6 +2656,38 @@ static struct class srp_class = {  	.dev_release = srp_release_dev  }; +/** + * srp_conn_unique() - check whether the connection to a target is unique + * @host:   SRP host. + * @target: SRP target port. + */ +static bool srp_conn_unique(struct srp_host *host, +			    struct srp_target_port *target) +{ +	struct srp_target_port *t; +	bool ret = false; + +	if (target->state == SRP_TARGET_REMOVED) +		goto out; + +	ret = true; + +	spin_lock(&host->target_lock); +	list_for_each_entry(t, &host->target_list, list) { +		if (t != target && +		    target->id_ext == t->id_ext && +		    target->ioc_guid == t->ioc_guid && +		    target->initiator_ext == t->initiator_ext) { +			ret = false; +			break; +		} +	} +	spin_unlock(&host->target_lock); + +out: +	return ret; +} +  /*   * Target ports are added by writing   * @@ -1794,6 +2707,12 @@ enum {  	SRP_OPT_MAX_CMD_PER_LUN	= 1 << 6,  	SRP_OPT_IO_CLASS	= 1 << 7,  	SRP_OPT_INITIATOR_EXT	= 1 << 8, +	SRP_OPT_CMD_SG_ENTRIES	= 1 << 9, +	SRP_OPT_ALLOW_EXT_SG	= 1 << 10, +	SRP_OPT_SG_TABLESIZE	= 1 << 11, +	SRP_OPT_COMP_VECTOR	= 1 << 12, +	SRP_OPT_TL_RETRY_COUNT	= 1 << 13, +	SRP_OPT_QUEUE_SIZE	= 1 << 14,  	SRP_OPT_ALL		= (SRP_OPT_ID_EXT	|  				   SRP_OPT_IOC_GUID	|  				   SRP_OPT_DGID		| @@ -1811,6 +2730,12 @@ static const match_table_t srp_opt_tokens = {  	{ SRP_OPT_MAX_CMD_PER_LUN,	"max_cmd_per_lun=%d" 	},  	{ SRP_OPT_IO_CLASS,		"io_class=%x"		},  	{ SRP_OPT_INITIATOR_EXT,	"initiator_ext=%s"	}, +	{ SRP_OPT_CMD_SG_ENTRIES,	"cmd_sg_entries=%u"	}, +	{ SRP_OPT_ALLOW_EXT_SG,		"allow_ext_sg=%u"	}, +	{ SRP_OPT_SG_TABLESIZE,		"sg_tablesize=%u"	}, +	{ SRP_OPT_COMP_VECTOR,		"comp_vector=%u"	}, +	{ SRP_OPT_TL_RETRY_COUNT,	"tl_retry_count=%u"	}, +	{ SRP_OPT_QUEUE_SIZE,		"queue_size=%d"		},  	{ SRP_OPT_ERR,			NULL 			}  }; @@ -1865,7 +2790,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)  				goto out;  			}  			if (strlen(p) != 32) { -				printk(KERN_WARNING PFX "bad dest GID parameter '%s'\n", p); +				pr_warn("bad dest GID parameter '%s'\n", p);  				kfree(p);  				goto out;  			} @@ -1880,7 +2805,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)  		case SRP_OPT_PKEY:  			if (match_hex(args, &token)) { -				printk(KERN_WARNING PFX "bad P_Key parameter '%s'\n", p); +				pr_warn("bad P_Key parameter '%s'\n", p);  				goto out;  			}  			target->path.pkey = cpu_to_be16(token); @@ -1899,30 +2824,43 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)  		case SRP_OPT_MAX_SECT:  			if (match_int(args, &token)) { -				printk(KERN_WARNING PFX "bad max sect parameter '%s'\n", p); +				pr_warn("bad max sect parameter '%s'\n", p);  				goto out;  			}  			target->scsi_host->max_sectors = token;  			break; +		case SRP_OPT_QUEUE_SIZE: +			if (match_int(args, &token) || token < 1) { +				pr_warn("bad queue_size parameter '%s'\n", p); +				goto out; +			} +			target->scsi_host->can_queue = token; +			target->queue_size = token + SRP_RSP_SQ_SIZE + +					     SRP_TSK_MGMT_SQ_SIZE; +			if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) +				target->scsi_host->cmd_per_lun = token; +			break; +  		case SRP_OPT_MAX_CMD_PER_LUN: -			if (match_int(args, &token)) { -				printk(KERN_WARNING PFX "bad max cmd_per_lun parameter '%s'\n", p); +			if (match_int(args, &token) || token < 1) { +				
pr_warn("bad max cmd_per_lun parameter '%s'\n", +					p);  				goto out;  			} -			target->scsi_host->cmd_per_lun = min(token, SRP_CMD_SQ_SIZE); +			target->scsi_host->cmd_per_lun = token;  			break;  		case SRP_OPT_IO_CLASS:  			if (match_hex(args, &token)) { -				printk(KERN_WARNING PFX "bad  IO class parameter '%s' \n", p); +				pr_warn("bad IO class parameter '%s'\n", p);  				goto out;  			}  			if (token != SRP_REV10_IB_IO_CLASS &&  			    token != SRP_REV16A_IB_IO_CLASS) { -				printk(KERN_WARNING PFX "unknown IO class parameter value" -				       " %x specified (use %x or %x).\n", -				       token, SRP_REV10_IB_IO_CLASS, SRP_REV16A_IB_IO_CLASS); +				pr_warn("unknown IO class parameter value %x specified (use %x or %x).\n", +					token, SRP_REV10_IB_IO_CLASS, +					SRP_REV16A_IB_IO_CLASS);  				goto out;  			}  			target->io_class = token; @@ -1938,9 +2876,53 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)  			kfree(p);  			break; +		case SRP_OPT_CMD_SG_ENTRIES: +			if (match_int(args, &token) || token < 1 || token > 255) { +				pr_warn("bad max cmd_sg_entries parameter '%s'\n", +					p); +				goto out; +			} +			target->cmd_sg_cnt = token; +			break; + +		case SRP_OPT_ALLOW_EXT_SG: +			if (match_int(args, &token)) { +				pr_warn("bad allow_ext_sg parameter '%s'\n", p); +				goto out; +			} +			target->allow_ext_sg = !!token; +			break; + +		case SRP_OPT_SG_TABLESIZE: +			if (match_int(args, &token) || token < 1 || +					token > SCSI_MAX_SG_CHAIN_SEGMENTS) { +				pr_warn("bad max sg_tablesize parameter '%s'\n", +					p); +				goto out; +			} +			target->sg_tablesize = token; +			break; + +		case SRP_OPT_COMP_VECTOR: +			if (match_int(args, &token) || token < 0) { +				pr_warn("bad comp_vector parameter '%s'\n", p); +				goto out; +			} +			target->comp_vector = token; +			break; + +		case SRP_OPT_TL_RETRY_COUNT: +			if (match_int(args, &token) || token < 2 || token > 7) { +				pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n", +					p); +				goto out; +			} +			target->tl_retry_count = token; +			break; +  		default: -			printk(KERN_WARNING PFX "unknown parameter or missing value " -			       "'%s' in target creation request\n", p); +			pr_warn("unknown parameter or missing value '%s' in target creation request\n", +				p);  			goto out;  		}  	} @@ -1951,9 +2933,14 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)  		for (i = 0; i < ARRAY_SIZE(srp_opt_tokens); ++i)  			if ((srp_opt_tokens[i].token & SRP_OPT_ALL) &&  			    !(srp_opt_tokens[i].token & opt_mask)) -				printk(KERN_WARNING PFX "target creation request is " -				       "missing parameter '%s'\n", -				       srp_opt_tokens[i].pattern); +				pr_warn("target creation request is missing parameter '%s'\n", +					srp_opt_tokens[i].pattern); + +	if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue +	    && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) +		pr_warn("cmd_per_lun = %d > queue_size = %d\n", +			target->scsi_host->cmd_per_lun, +			target->scsi_host->can_queue);  out:  	kfree(options); @@ -1968,55 +2955,85 @@ static ssize_t srp_create_target(struct device *dev,  		container_of(dev, struct srp_host, dev);  	struct Scsi_Host *target_host;  	struct srp_target_port *target; +	struct srp_device *srp_dev = host->srp_dev; +	struct ib_device *ibdev = srp_dev->dev;  	int ret; -	int i;  	target_host = scsi_host_alloc(&srp_template,  				      sizeof (struct srp_target_port));  	if (!target_host)  		return -ENOMEM; -	
target_host->transportt = ib_srp_transport_template; +	target_host->transportt  = ib_srp_transport_template; +	target_host->max_channel = 0; +	target_host->max_id      = 1;  	target_host->max_lun     = SRP_MAX_LUN;  	target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb;  	target = host_to_target(target_host); -	target->io_class   = SRP_REV16A_IB_IO_CLASS; -	target->scsi_host  = target_host; -	target->srp_host   = host; +	target->io_class	= SRP_REV16A_IB_IO_CLASS; +	target->scsi_host	= target_host; +	target->srp_host	= host; +	target->lkey		= host->srp_dev->mr->lkey; +	target->rkey		= host->srp_dev->mr->rkey; +	target->cmd_sg_cnt	= cmd_sg_entries; +	target->sg_tablesize	= indirect_sg_entries ? : cmd_sg_entries; +	target->allow_ext_sg	= allow_ext_sg; +	target->tl_retry_count	= 7; +	target->queue_size	= SRP_DEFAULT_QUEUE_SIZE; -	INIT_LIST_HEAD(&target->free_reqs); -	INIT_LIST_HEAD(&target->req_queue); -	for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { -		target->req_ring[i].index = i; -		list_add_tail(&target->req_ring[i].list, &target->free_reqs); -	} +	mutex_lock(&host->add_target_mutex);  	ret = srp_parse_options(buf, target);  	if (ret)  		goto err; -	ib_query_gid(host->srp_dev->dev, host->port, 0, &target->path.sgid); +	target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE; -	shost_printk(KERN_DEBUG, target->scsi_host, PFX -		     "new target: id_ext %016llx ioc_guid %016llx pkey %04x " -		     "service_id %016llx dgid %pI6\n", -	       (unsigned long long) be64_to_cpu(target->id_ext), -	       (unsigned long long) be64_to_cpu(target->ioc_guid), -	       be16_to_cpu(target->path.pkey), -	       (unsigned long long) be64_to_cpu(target->service_id), -	       target->path.dgid.raw); +	if (!srp_conn_unique(target->srp_host, target)) { +		shost_printk(KERN_INFO, target->scsi_host, +			     PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", +			     be64_to_cpu(target->id_ext), +			     be64_to_cpu(target->ioc_guid), +			     be64_to_cpu(target->initiator_ext)); +		ret = -EEXIST; +		goto err; +	} + +	if (!srp_dev->has_fmr && !srp_dev->has_fr && !target->allow_ext_sg && +	    target->cmd_sg_cnt < target->sg_tablesize) { +		pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); +		target->sg_tablesize = target->cmd_sg_cnt; +	} + +	target_host->sg_tablesize = target->sg_tablesize; +	target->indirect_size = target->sg_tablesize * +				sizeof (struct srp_direct_buf); +	target->max_iu_len = sizeof (struct srp_cmd) + +			     sizeof (struct srp_indirect_buf) + +			     target->cmd_sg_cnt * sizeof (struct srp_direct_buf); + +	INIT_WORK(&target->tl_err_work, srp_tl_err_work); +	INIT_WORK(&target->remove_work, srp_remove_work); +	spin_lock_init(&target->lock); +	INIT_LIST_HEAD(&target->free_tx); +	ret = srp_alloc_req_data(target); +	if (ret) +		goto err_free_mem; + +	ret = ib_query_gid(ibdev, host->port, 0, &target->path.sgid); +	if (ret) +		goto err_free_mem;  	ret = srp_create_target_ib(target);  	if (ret) -		goto err; +		goto err_free_mem;  	ret = srp_new_cm_id(target);  	if (ret) -		goto err_free; +		goto err_free_ib; -	target->qp_in_error = 0;  	ret = srp_connect_target(target);  	if (ret) {  		shost_printk(KERN_ERR, target->scsi_host, @@ -2028,7 +3045,19 @@ static ssize_t srp_create_target(struct device *dev,  	if (ret)  		goto err_disconnect; -	return count; +	shost_printk(KERN_DEBUG, target->scsi_host, PFX +		     "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id 
%016llx sgid %pI6 dgid %pI6\n", +		     be64_to_cpu(target->id_ext), +		     be64_to_cpu(target->ioc_guid), +		     be16_to_cpu(target->path.pkey), +		     be64_to_cpu(target->service_id), +		     target->path.sgid.raw, target->path.dgid.raw); + +	ret = count; + +out: +	mutex_unlock(&host->add_target_mutex); +	return ret;  err_disconnect:  	srp_disconnect_target(target); @@ -2036,13 +3065,15 @@ err_disconnect:  err_cm_id:  	ib_destroy_cm_id(target->cm_id); -err_free: +err_free_ib:  	srp_free_target_ib(target); +err_free_mem: +	srp_free_req_data(target); +  err:  	scsi_host_put(target_host); - -	return ret; +	goto out;  }  static DEVICE_ATTR(add_target, S_IWUSR, NULL, srp_create_target); @@ -2078,6 +3109,7 @@ static struct srp_host *srp_add_port(struct srp_device *device, u8 port)  	INIT_LIST_HEAD(&host->target_list);  	spin_lock_init(&host->target_lock);  	init_completion(&host->released); +	mutex_init(&host->add_target_mutex);  	host->srp_dev = device;  	host->port = port; @@ -2109,17 +3141,16 @@ static void srp_add_one(struct ib_device *device)  {  	struct srp_device *srp_dev;  	struct ib_device_attr *dev_attr; -	struct ib_fmr_pool_param fmr_param;  	struct srp_host *host; -	int s, e, p; +	int mr_page_shift, s, e, p; +	u64 max_pages_per_mr;  	dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);  	if (!dev_attr)  		return;  	if (ib_query_device(device, dev_attr)) { -		printk(KERN_WARNING PFX "Query device failed for %s\n", -		       device->name); +		pr_warn("Query device failed for %s\n", device->name);  		goto free_attr;  	} @@ -2127,14 +3158,39 @@ static void srp_add_one(struct ib_device *device)  	if (!srp_dev)  		goto free_attr; +	srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && +			    device->map_phys_fmr && device->unmap_fmr); +	srp_dev->has_fr = (dev_attr->device_cap_flags & +			   IB_DEVICE_MEM_MGT_EXTENSIONS); +	if (!srp_dev->has_fmr && !srp_dev->has_fr) +		dev_warn(&device->dev, "neither FMR nor FR is supported\n"); + +	srp_dev->use_fast_reg = (srp_dev->has_fr && +				 (!srp_dev->has_fmr || prefer_fr)); +  	/*  	 * Use the smallest page size supported by the HCA, down to a -	 * minimum of 512 bytes (which is the smallest sector that a -	 * SCSI command will ever carry). +	 * minimum of 4096 bytes. We're unlikely to build large sglists +	 * out of smaller entries.  	 
*/ -	srp_dev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); -	srp_dev->fmr_page_size  = 1 << srp_dev->fmr_page_shift; -	srp_dev->fmr_page_mask  = ~((u64) srp_dev->fmr_page_size - 1); +	mr_page_shift		= max(12, ffs(dev_attr->page_size_cap) - 1); +	srp_dev->mr_page_size	= 1 << mr_page_shift; +	srp_dev->mr_page_mask	= ~((u64) srp_dev->mr_page_size - 1); +	max_pages_per_mr	= dev_attr->max_mr_size; +	do_div(max_pages_per_mr, srp_dev->mr_page_size); +	srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, +					  max_pages_per_mr); +	if (srp_dev->use_fast_reg) { +		srp_dev->max_pages_per_mr = +			min_t(u32, srp_dev->max_pages_per_mr, +			      dev_attr->max_fast_reg_page_list_len); +	} +	srp_dev->mr_max_size	= srp_dev->mr_page_size * +				   srp_dev->max_pages_per_mr; +	pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", +		 device->name, mr_page_shift, dev_attr->max_mr_size, +		 dev_attr->max_fast_reg_page_list_len, +		 srp_dev->max_pages_per_mr, srp_dev->mr_max_size);  	INIT_LIST_HEAD(&srp_dev->dev_list); @@ -2150,20 +3206,6 @@ static void srp_add_one(struct ib_device *device)  	if (IS_ERR(srp_dev->mr))  		goto err_pd; -	memset(&fmr_param, 0, sizeof fmr_param); -	fmr_param.pool_size	    = SRP_FMR_POOL_SIZE; -	fmr_param.dirty_watermark   = SRP_FMR_DIRTY_SIZE; -	fmr_param.cache		    = 1; -	fmr_param.max_pages_per_fmr = SRP_FMR_SIZE; -	fmr_param.page_shift	    = srp_dev->fmr_page_shift; -	fmr_param.access	    = (IB_ACCESS_LOCAL_WRITE | -				       IB_ACCESS_REMOTE_WRITE | -				       IB_ACCESS_REMOTE_READ); - -	srp_dev->fmr_pool = ib_create_fmr_pool(srp_dev->pd, &fmr_param); -	if (IS_ERR(srp_dev->fmr_pool)) -		srp_dev->fmr_pool = NULL; -  	if (device->node_type == RDMA_NODE_IB_SWITCH) {  		s = 0;  		e = 0; @@ -2196,10 +3238,11 @@ static void srp_remove_one(struct ib_device *device)  {  	struct srp_device *srp_dev;  	struct srp_host *host, *tmp_host; -	LIST_HEAD(target_list); -	struct srp_target_port *target, *tmp_target; +	struct srp_target_port *target;  	srp_dev = ib_get_client_data(device, &srp_client); +	if (!srp_dev) +		return;  	list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) {  		device_unregister(&host->dev); @@ -2210,39 +3253,21 @@ static void srp_remove_one(struct ib_device *device)  		wait_for_completion(&host->released);  		/* -		 * Mark all target ports as removed, so we stop queueing -		 * commands and don't try to reconnect. +		 * Remove all target ports.  		 */  		spin_lock(&host->target_lock); -		list_for_each_entry(target, &host->target_list, list) { -			spin_lock_irq(target->scsi_host->host_lock); -			target->state = SRP_TARGET_REMOVED; -			spin_unlock_irq(target->scsi_host->host_lock); -		} +		list_for_each_entry(target, &host->target_list, list) +			srp_queue_remove_work(target);  		spin_unlock(&host->target_lock);  		/* -		 * Wait for any reconnection tasks that may have -		 * started before we marked our target ports as -		 * removed, and any target port removal tasks. +		 * Wait for target port removal tasks.  		 
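To make the registration sizing in srp_add_one() concrete, the sketch below runs the same computation with made-up device attributes: a page_size_cap whose smallest supported page is 4 KiB yields mr_page_shift = 12, and max_pages_per_mr is the device limit capped at SRP_MAX_PAGES_PER_MR (512), so a single registration covers at most 2 MiB in this example.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t page_size_cap = 0xfffff000;	/* example: 4 KiB pages and larger */
	uint64_t max_mr_size = 1ULL << 31;	/* example: 2 GiB per MR */
	int mr_page_shift = 0;

	while (!(page_size_cap & (1ULL << mr_page_shift)))
		mr_page_shift++;
	if (mr_page_shift < 12)			/* never go below 4096 bytes */
		mr_page_shift = 12;

	uint64_t mr_page_size = 1ULL << mr_page_shift;
	uint64_t max_pages_per_mr = max_mr_size / mr_page_size;

	if (max_pages_per_mr > 512)		/* SRP_MAX_PAGES_PER_MR */
		max_pages_per_mr = 512;

	printf("page size %llu, pages per MR %llu, MR max size %llu\n",
	       (unsigned long long)mr_page_size,
	       (unsigned long long)max_pages_per_mr,
	       (unsigned long long)(max_pages_per_mr * mr_page_size));
	return 0;
}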
*/ -		flush_scheduled_work(); - -		list_for_each_entry_safe(target, tmp_target, -					 &host->target_list, list) { -			srp_remove_host(target->scsi_host); -			scsi_remove_host(target->scsi_host); -			srp_disconnect_target(target); -			ib_destroy_cm_id(target->cm_id); -			srp_free_target_ib(target); -			scsi_host_put(target->scsi_host); -		} +		flush_workqueue(system_long_wq);  		kfree(host);  	} -	if (srp_dev->fmr_pool) -		ib_destroy_fmr_pool(srp_dev->fmr_pool);  	ib_dereg_mr(srp_dev->mr);  	ib_dealloc_pd(srp_dev->pd); @@ -2250,18 +3275,42 @@ static void srp_remove_one(struct ib_device *device)  }  static struct srp_function_template ib_srp_transport_functions = { +	.has_rport_state	 = true, +	.reset_timer_if_blocked	 = true, +	.reconnect_delay	 = &srp_reconnect_delay, +	.fast_io_fail_tmo	 = &srp_fast_io_fail_tmo, +	.dev_loss_tmo		 = &srp_dev_loss_tmo, +	.reconnect		 = srp_rport_reconnect, +	.rport_delete		 = srp_rport_delete, +	.terminate_rport_io	 = srp_terminate_io,  };  static int __init srp_init_module(void)  {  	int ret; -	BUILD_BUG_ON_NOT_POWER_OF_2(SRP_SQ_SIZE); -	BUILD_BUG_ON_NOT_POWER_OF_2(SRP_RQ_SIZE); +	BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *)); -	if (srp_sg_tablesize > 255) { -		printk(KERN_WARNING PFX "Clamping srp_sg_tablesize to 255\n"); -		srp_sg_tablesize = 255; +	if (srp_sg_tablesize) { +		pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); +		if (!cmd_sg_entries) +			cmd_sg_entries = srp_sg_tablesize; +	} + +	if (!cmd_sg_entries) +		cmd_sg_entries = SRP_DEF_SG_TABLESIZE; + +	if (cmd_sg_entries > 255) { +		pr_warn("Clamping cmd_sg_entries to 255\n"); +		cmd_sg_entries = 255; +	} + +	if (!indirect_sg_entries) +		indirect_sg_entries = cmd_sg_entries; +	else if (indirect_sg_entries < cmd_sg_entries) { +		pr_warn("Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n", +			cmd_sg_entries); +		indirect_sg_entries = cmd_sg_entries;  	}  	ib_srp_transport_template = @@ -2269,14 +3318,9 @@ static int __init srp_init_module(void)  	if (!ib_srp_transport_template)  		return -ENOMEM; -	srp_template.sg_tablesize = srp_sg_tablesize; -	srp_max_iu_len = (sizeof (struct srp_cmd) + -			  sizeof (struct srp_indirect_buf) + -			  srp_sg_tablesize * 16); -  	ret = class_register(&srp_class);  	if (ret) { -		printk(KERN_ERR PFX "couldn't register class infiniband_srp\n"); +		pr_err("couldn't register class infiniband_srp\n");  		srp_release_transport(ib_srp_transport_template);  		return ret;  	} @@ -2285,7 +3329,7 @@ static int __init srp_init_module(void)  	ret = ib_register_client(&srp_client);  	if (ret) { -		printk(KERN_ERR PFX "couldn't register IB client\n"); +		pr_err("couldn't register IB client\n");  		srp_release_transport(ib_srp_transport_template);  		ib_sa_unregister_client(&srp_sa_client);  		class_unregister(&srp_class); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index ed0dce9e479..e46ecb15aa0 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -57,29 +57,24 @@ enum {  	SRP_MAX_LUN		= 512,  	SRP_DEF_SG_TABLESIZE	= 12, -	SRP_RQ_SHIFT    	= 6, -	SRP_RQ_SIZE		= 1 << SRP_RQ_SHIFT, -	SRP_RQ_MASK		= SRP_RQ_SIZE - 1, - -	SRP_SQ_SIZE		= SRP_RQ_SIZE, -	SRP_SQ_MASK		= SRP_SQ_SIZE - 1, +	SRP_DEFAULT_QUEUE_SIZE	= 1 << 6,  	SRP_RSP_SQ_SIZE		= 1, -	SRP_REQ_SQ_SIZE		= SRP_SQ_SIZE - SRP_RSP_SQ_SIZE,  	SRP_TSK_MGMT_SQ_SIZE	= 1, -	SRP_CMD_SQ_SIZE		= SRP_REQ_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE, +	SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE - 
+				  SRP_TSK_MGMT_SQ_SIZE, + +	SRP_TAG_NO_REQ		= ~0U, +	SRP_TAG_TSK_MGMT	= 1U << 31, -	SRP_TAG_TSK_MGMT	= 1 << (SRP_RQ_SHIFT + 1), +	SRP_MAX_PAGES_PER_MR	= 512, -	SRP_FMR_SIZE		= 256, -	SRP_FMR_POOL_SIZE	= 1024, -	SRP_FMR_DIRTY_SIZE	= SRP_FMR_POOL_SIZE / 4 +	LOCAL_INV_WR_ID_MASK	= 1, +	FAST_REG_WR_ID_MASK	= 2,  };  enum srp_target_state {  	SRP_TARGET_LIVE, -	SRP_TARGET_CONNECTING, -	SRP_TARGET_DEAD, -	SRP_TARGET_REMOVED +	SRP_TARGET_REMOVED,  };  enum srp_iu_type { @@ -88,15 +83,24 @@ enum srp_iu_type {  	SRP_IU_RSP,  }; +/* + * @mr_page_mask: HCA memory registration page mask. + * @mr_page_size: HCA memory registration page size. + * @mr_max_size: Maximum size in bytes of a single FMR / FR registration + *   request. + */  struct srp_device {  	struct list_head	dev_list;  	struct ib_device       *dev;  	struct ib_pd	       *pd;  	struct ib_mr	       *mr; -	struct ib_fmr_pool     *fmr_pool; -	int			fmr_page_shift; -	int			fmr_page_size; -	u64			fmr_page_mask; +	u64			mr_page_mask; +	int			mr_page_size; +	int			mr_max_size; +	int			max_pages_per_mr; +	bool			has_fmr; +	bool			has_fr; +	bool			use_fast_reg;  };  struct srp_host { @@ -107,21 +111,51 @@ struct srp_host {  	spinlock_t		target_lock;  	struct completion	released;  	struct list_head	list; +	struct mutex		add_target_mutex;  };  struct srp_request {  	struct list_head	list;  	struct scsi_cmnd       *scmnd;  	struct srp_iu	       *cmd; -	struct srp_iu	       *tsk_mgmt; -	struct ib_pool_fmr     *fmr; -	struct completion	done; +	union { +		struct ib_pool_fmr **fmr_list; +		struct srp_fr_desc **fr_list; +	}; +	u64		       *map_page; +	struct srp_direct_buf  *indirect_desc; +	dma_addr_t		indirect_dma_addr; +	short			nmdesc;  	short			index; -	u8			cmd_done; -	u8			tsk_status;  };  struct srp_target_port { +	/* These are RW in the hot path, and commonly used together */ +	struct list_head	free_tx; +	struct list_head	free_reqs; +	spinlock_t		lock; +	s32			req_lim; + +	/* These are read-only in the hot path */ +	struct ib_cq	       *send_cq ____cacheline_aligned_in_smp; +	struct ib_cq	       *recv_cq; +	struct ib_qp	       *qp; +	union { +		struct ib_fmr_pool     *fmr_pool; +		struct srp_fr_pool     *fr_pool; +	}; +	u32			lkey; +	u32			rkey; +	enum srp_target_state	state; +	unsigned int		max_iu_len; +	unsigned int		cmd_sg_cnt; +	unsigned int		indirect_size; +	bool			allow_ext_sg; + +	/* Everything above this point is used in the hot path of +	 * command processing. Try to keep them packed into cachelines. 
+	 */ +  	__be64			id_ext;  	__be64			ioc_guid;  	__be64			service_id; @@ -129,50 +163,117 @@ struct srp_target_port {  	u16			io_class;  	struct srp_host	       *srp_host;  	struct Scsi_Host       *scsi_host; +	struct srp_rport       *rport;  	char			target_name[32];  	unsigned int		scsi_id; +	unsigned int		sg_tablesize; +	int			queue_size; +	int			req_ring_size; +	int			comp_vector; +	int			tl_retry_count;  	struct ib_sa_path_rec	path;  	__be16			orig_dgid[8];  	struct ib_sa_query     *path_query;  	int			path_query_id; +	u32			rq_tmo_jiffies; +	bool			connected; +  	struct ib_cm_id	       *cm_id; -	struct ib_cq	       *recv_cq; -	struct ib_cq	       *send_cq; -	struct ib_qp	       *qp;  	int			max_ti_iu_len; -	s32			req_lim;  	int			zero_req_lim; -	unsigned		rx_head; -	struct srp_iu	       *rx_ring[SRP_RQ_SIZE]; +	struct srp_iu	       **tx_ring; +	struct srp_iu	       **rx_ring; +	struct srp_request	*req_ring; -	unsigned		tx_head; -	unsigned		tx_tail; -	struct srp_iu	       *tx_ring[SRP_SQ_SIZE]; - -	struct list_head	free_reqs; -	struct list_head	req_queue; -	struct srp_request	req_ring[SRP_CMD_SQ_SIZE]; - -	struct work_struct	work; +	struct work_struct	tl_err_work; +	struct work_struct	remove_work;  	struct list_head	list;  	struct completion	done;  	int			status; -	enum srp_target_state	state; -	int			qp_in_error; +	bool			qp_in_error; + +	struct completion	tsk_mgmt_done; +	u8			tsk_mgmt_status;  };  struct srp_iu { +	struct list_head	list;  	u64			dma;  	void		       *buf;  	size_t			size;  	enum dma_data_direction	direction; -	enum srp_iu_type	type; +}; + +/** + * struct srp_fr_desc - fast registration work request arguments + * @entry: Entry in srp_fr_pool.free_list. + * @mr:    Memory region. + * @frpl:  Fast registration page list. + */ +struct srp_fr_desc { +	struct list_head		entry; +	struct ib_mr			*mr; +	struct ib_fast_reg_page_list	*frpl; +}; + +/** + * struct srp_fr_pool - pool of fast registration descriptors + * + * An entry is available for allocation if and only if it occurs in @free_list. + * + * @size:      Number of descriptors in this pool. + * @max_page_list_len: Maximum fast registration work request page list length. + * @lock:      Protects free_list. + * @free_list: List of free descriptors. + * @desc:      Fast registration descriptor pool. + */ +struct srp_fr_pool { +	int			size; +	int			max_page_list_len; +	spinlock_t		lock; +	struct list_head	free_list; +	struct srp_fr_desc	desc[0]; +}; + +/** + * struct srp_map_state - per-request DMA memory mapping state + * @desc:	    Pointer to the element of the SRP buffer descriptor array + *		    that is being filled in. + * @pages:	    Array with DMA addresses of pages being considered for + *		    memory registration. + * @base_dma_addr:  DMA address of the first page that has not yet been mapped. + * @dma_len:	    Number of bytes that will be registered with the next + *		    FMR or FR memory registration call. + * @total_len:	    Total number of bytes in the sg-list being mapped. + * @npages:	    Number of page addresses in the pages[] array. + * @nmdesc:	    Number of FMR or FR memory descriptors used for mapping. + * @ndesc:	    Number of SRP buffer descriptors that have been filled in. + * @unmapped_sg:    First element of the sg-list that is mapped via FMR or FR. + * @unmapped_index: Index of the first element mapped via FMR or FR. + * @unmapped_addr:  DMA address of the first element mapped via FMR or FR. 
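The srp_fr_pool comment above describes a classic fixed-size descriptor pool: a descriptor is available exactly when it sits on a lock-protected free list. The userspace model below illustrates that pattern only; the driver itself uses a spinlock and struct list_head rather than the mutex and singly linked list shown here.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define POOL_SIZE 8

struct desc {
	struct desc *next;		/* free-list linkage */
	int id;
};

struct pool {
	pthread_mutex_t lock;		/* protects free_list */
	struct desc *free_list;
	struct desc desc[POOL_SIZE];	/* fixed descriptor array */
};

static void pool_init(struct pool *p)
{
	pthread_mutex_init(&p->lock, NULL);
	p->free_list = NULL;
	for (int i = 0; i < POOL_SIZE; i++) {	/* every descriptor starts free */
		p->desc[i].id = i;
		p->desc[i].next = p->free_list;
		p->free_list = &p->desc[i];
	}
}

static struct desc *pool_get(struct pool *p)
{
	pthread_mutex_lock(&p->lock);
	struct desc *d = p->free_list;
	if (d)
		p->free_list = d->next;
	pthread_mutex_unlock(&p->lock);
	return d;			/* NULL when the pool is exhausted */
}

static void pool_put(struct pool *p, struct desc *d)
{
	pthread_mutex_lock(&p->lock);
	d->next = p->free_list;
	p->free_list = d;
	pthread_mutex_unlock(&p->lock);
}

int main(void)
{
	struct pool p;

	pool_init(&p);
	struct desc *d = pool_get(&p);
	printf("got descriptor %d\n", d ? d->id : -1);
	pool_put(&p, d);
	return 0;
}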
+ */ +struct srp_map_state { +	union { +		struct ib_pool_fmr **next_fmr; +		struct srp_fr_desc **next_fr; +	}; +	struct srp_direct_buf  *desc; +	u64		       *pages; +	dma_addr_t		base_dma_addr; +	u32			dma_len; +	u32			total_len; +	unsigned int		npages; +	unsigned int		nmdesc; +	unsigned int		ndesc; +	struct scatterlist     *unmapped_sg; +	int			unmapped_index; +	dma_addr_t		unmapped_addr;  };  #endif /* IB_SRP_H */ diff --git a/drivers/infiniband/ulp/srpt/Kconfig b/drivers/infiniband/ulp/srpt/Kconfig new file mode 100644 index 00000000000..31ee83d528d --- /dev/null +++ b/drivers/infiniband/ulp/srpt/Kconfig @@ -0,0 +1,12 @@ +config INFINIBAND_SRPT +	tristate "InfiniBand SCSI RDMA Protocol target support" +	depends on INFINIBAND && TARGET_CORE +	---help--- + +	  Support for the SCSI RDMA Protocol (SRP) Target driver. The +	  SRP protocol is a protocol that allows an initiator to access +	  a block storage device on another host (target) over a network +	  that supports the RDMA protocol. Currently the RDMA protocol is +	  supported by InfiniBand and by iWarp network hardware. More +	  information about the SRP protocol can be found on the website +	  of the INCITS T10 technical committee (http://www.t10.org/). diff --git a/drivers/infiniband/ulp/srpt/Makefile b/drivers/infiniband/ulp/srpt/Makefile new file mode 100644 index 00000000000..e3ee4bdfffa --- /dev/null +++ b/drivers/infiniband/ulp/srpt/Makefile @@ -0,0 +1,2 @@ +ccflags-y			:= -Idrivers/target +obj-$(CONFIG_INFINIBAND_SRPT)	+= ib_srpt.o diff --git a/drivers/infiniband/ulp/srpt/ib_dm_mad.h b/drivers/infiniband/ulp/srpt/ib_dm_mad.h new file mode 100644 index 00000000000..fb1de1f6f29 --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_dm_mad.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc.  All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses.  You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + *     Redistribution and use in source and binary forms, with or + *     without modification, are permitted provided that the following + *     conditions are met: + * + *      - Redistributions of source code must retain the above + *        copyright notice, this list of conditions and the following + *        disclaimer. + * + *      - Redistributions in binary form must reproduce the above + *        copyright notice, this list of conditions and the following + *        disclaimer in the documentation and/or other materials + *        provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef IB_DM_MAD_H +#define IB_DM_MAD_H + +#include <linux/types.h> + +#include <rdma/ib_mad.h> + +enum { +	/* +	 * See also section 13.4.7 Status Field, table 115 MAD Common Status +	 * Field Bit Values and also section 16.3.1.1 Status Field in the +	 * InfiniBand Architecture Specification. 
+	 */ +	DM_MAD_STATUS_UNSUP_METHOD = 0x0008, +	DM_MAD_STATUS_UNSUP_METHOD_ATTR = 0x000c, +	DM_MAD_STATUS_INVALID_FIELD = 0x001c, +	DM_MAD_STATUS_NO_IOC = 0x0100, + +	/* +	 * See also the Device Management chapter, section 16.3.3 Attributes, +	 * table 279 Device Management Attributes in the InfiniBand +	 * Architecture Specification. +	 */ +	DM_ATTR_CLASS_PORT_INFO = 0x01, +	DM_ATTR_IOU_INFO = 0x10, +	DM_ATTR_IOC_PROFILE = 0x11, +	DM_ATTR_SVC_ENTRIES = 0x12 +}; + +struct ib_dm_hdr { +	u8 reserved[28]; +}; + +/* + * Structure of management datagram sent by the SRP target implementation. + * Contains a management datagram header, reliable multi-packet transaction + * protocol (RMPP) header and ib_dm_hdr. Notes: + * - The SRP target implementation does not use RMPP or ib_dm_hdr when sending + *   management datagrams. + * - The header size must be exactly 64 bytes (IB_MGMT_DEVICE_HDR), since this + *   is the header size that is passed to ib_create_send_mad() in ib_srpt.c. + * - The maximum supported size for a management datagram when not using RMPP + *   is 256 bytes -- 64 bytes header and 192 (IB_MGMT_DEVICE_DATA) bytes data. + */ +struct ib_dm_mad { +	struct ib_mad_hdr mad_hdr; +	struct ib_rmpp_hdr rmpp_hdr; +	struct ib_dm_hdr dm_hdr; +	u8 data[IB_MGMT_DEVICE_DATA]; +}; + +/* + * IOUnitInfo as defined in section 16.3.3.3 IOUnitInfo of the InfiniBand + * Architecture Specification. + */ +struct ib_dm_iou_info { +	__be16 change_id; +	u8 max_controllers; +	u8 op_rom; +	u8 controller_list[128]; +}; + +/* + * IOControllerprofile as defined in section 16.3.3.4 IOControllerProfile of + * the InfiniBand Architecture Specification. + */ +struct ib_dm_ioc_profile { +	__be64 guid; +	__be32 vendor_id; +	__be32 device_id; +	__be16 device_version; +	__be16 reserved1; +	__be32 subsys_vendor_id; +	__be32 subsys_device_id; +	__be16 io_class; +	__be16 io_subclass; +	__be16 protocol; +	__be16 protocol_version; +	__be16 service_conn; +	__be16 initiators_supported; +	__be16 send_queue_depth; +	u8 reserved2; +	u8 rdma_read_depth; +	__be32 send_size; +	__be32 rdma_size; +	u8 op_cap_mask; +	u8 svc_cap_mask; +	u8 num_svc_entries; +	u8 reserved3[9]; +	u8 id_string[64]; +}; + +struct ib_dm_svc_entry { +	u8 name[40]; +	__be64 id; +}; + +/* + * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture + * Specification. See also section B.7, table B.8 in the T10 SRP r16a document. + */ +struct ib_dm_svc_entries { +	struct ib_dm_svc_entry service_entries[4]; +}; + +#endif diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c new file mode 100644 index 00000000000..fe09f2788b1 --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -0,0 +1,4042 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc.  All rights reserved. + * Copyright (C) 2008 - 2011 Bart Van Assche <bvanassche@acm.org>. + * + * This software is available to you under a choice of one of two + * licenses.  You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + *     Redistribution and use in source and binary forms, with or + *     without modification, are permitted provided that the following + *     conditions are met: + * + *      - Redistributions of source code must retain the above + *        copyright notice, this list of conditions and the following + *        disclaimer. 
+ * + *      - Redistributions in binary form must reproduce the above + *        copyright notice, this list of conditions and the following + *        disclaimer in the documentation and/or other materials + *        provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/ctype.h> +#include <linux/kthread.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/atomic.h> +#include <scsi/scsi_tcq.h> +#include <target/configfs_macros.h> +#include <target/target_core_base.h> +#include <target/target_core_fabric_configfs.h> +#include <target/target_core_fabric.h> +#include <target/target_core_configfs.h> +#include "ib_srpt.h" + +/* Name of this kernel module. */ +#define DRV_NAME		"ib_srpt" +#define DRV_VERSION		"2.0.0" +#define DRV_RELDATE		"2011-02-14" + +#define SRPT_ID_STRING	"Linux SRP target" + +#undef pr_fmt +#define pr_fmt(fmt) DRV_NAME " " fmt + +MODULE_AUTHOR("Vu Pham and Bart Van Assche"); +MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol target " +		   "v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* + * Global Variables + */ + +static u64 srpt_service_guid; +static DEFINE_SPINLOCK(srpt_dev_lock);	/* Protects srpt_dev_list. */ +static LIST_HEAD(srpt_dev_list);	/* List of srpt_device structures. */ + +static unsigned srp_max_req_size = DEFAULT_MAX_REQ_SIZE; +module_param(srp_max_req_size, int, 0444); +MODULE_PARM_DESC(srp_max_req_size, +		 "Maximum size of SRP request messages in bytes."); + +static int srpt_srq_size = DEFAULT_SRPT_SRQ_SIZE; +module_param(srpt_srq_size, int, 0444); +MODULE_PARM_DESC(srpt_srq_size, +		 "Shared receive queue (SRQ) size."); + +static int srpt_get_u64_x(char *buffer, struct kernel_param *kp) +{ +	return sprintf(buffer, "0x%016llx", *(u64 *)kp->arg); +} +module_param_call(srpt_service_guid, NULL, srpt_get_u64_x, &srpt_service_guid, +		  0444); +MODULE_PARM_DESC(srpt_service_guid, +		 "Using this value for ioc_guid, id_ext, and cm_listen_id" +		 " instead of using the node_guid of the first HCA."); + +static struct ib_client srpt_client; +static struct target_fabric_configfs *srpt_target; +static void srpt_release_channel(struct srpt_rdma_ch *ch); +static int srpt_queue_status(struct se_cmd *cmd); + +/** + * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE. + */ +static inline +enum dma_data_direction opposite_dma_dir(enum dma_data_direction dir) +{ +	switch (dir) { +	case DMA_TO_DEVICE:	return DMA_FROM_DEVICE; +	case DMA_FROM_DEVICE:	return DMA_TO_DEVICE; +	default:		return dir; +	} +} + +/** + * srpt_sdev_name() - Return the name associated with the HCA. + * + * Examples are ib0, ib1, ... 
+ */ +static inline const char *srpt_sdev_name(struct srpt_device *sdev) +{ +	return sdev->device->name; +} + +static enum rdma_ch_state srpt_get_ch_state(struct srpt_rdma_ch *ch) +{ +	unsigned long flags; +	enum rdma_ch_state state; + +	spin_lock_irqsave(&ch->spinlock, flags); +	state = ch->state; +	spin_unlock_irqrestore(&ch->spinlock, flags); +	return state; +} + +static enum rdma_ch_state +srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new_state) +{ +	unsigned long flags; +	enum rdma_ch_state prev; + +	spin_lock_irqsave(&ch->spinlock, flags); +	prev = ch->state; +	ch->state = new_state; +	spin_unlock_irqrestore(&ch->spinlock, flags); +	return prev; +} + +/** + * srpt_test_and_set_ch_state() - Test and set the channel state. + * + * Returns true if and only if the channel state has been set to the new state. + */ +static bool +srpt_test_and_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state old, +			   enum rdma_ch_state new) +{ +	unsigned long flags; +	enum rdma_ch_state prev; + +	spin_lock_irqsave(&ch->spinlock, flags); +	prev = ch->state; +	if (prev == old) +		ch->state = new; +	spin_unlock_irqrestore(&ch->spinlock, flags); +	return prev == old; +} + +/** + * srpt_event_handler() - Asynchronous IB event callback function. + * + * Callback function called by the InfiniBand core when an asynchronous IB + * event occurs. This callback may occur in interrupt context. See also + * section 11.5.2, Set Asynchronous Event Handler in the InfiniBand + * Architecture Specification. + */ +static void srpt_event_handler(struct ib_event_handler *handler, +			       struct ib_event *event) +{ +	struct srpt_device *sdev; +	struct srpt_port *sport; + +	sdev = ib_get_client_data(event->device, &srpt_client); +	if (!sdev || sdev->device != event->device) +		return; + +	pr_debug("ASYNC event= %d on device= %s\n", event->event, +		 srpt_sdev_name(sdev)); + +	switch (event->event) { +	case IB_EVENT_PORT_ERR: +		if (event->element.port_num <= sdev->device->phys_port_cnt) { +			sport = &sdev->port[event->element.port_num - 1]; +			sport->lid = 0; +			sport->sm_lid = 0; +		} +		break; +	case IB_EVENT_PORT_ACTIVE: +	case IB_EVENT_LID_CHANGE: +	case IB_EVENT_PKEY_CHANGE: +	case IB_EVENT_SM_CHANGE: +	case IB_EVENT_CLIENT_REREGISTER: +		/* Refresh port data asynchronously. */ +		if (event->element.port_num <= sdev->device->phys_port_cnt) { +			sport = &sdev->port[event->element.port_num - 1]; +			if (!sport->lid && !sport->sm_lid) +				schedule_work(&sport->work); +		} +		break; +	default: +		printk(KERN_ERR "received unrecognized IB event %d\n", +		       event->event); +		break; +	} +} + +/** + * srpt_srq_event() - SRQ event callback function. + */ +static void srpt_srq_event(struct ib_event *event, void *ctx) +{ +	printk(KERN_INFO "SRQ event %d\n", event->event); +} + +/** + * srpt_qp_event() - QP event callback function. 
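+ *
+ * The channel state is managed with the compare-and-set helpers above; the
+ * pattern used below for the LAST_WQE event is roughly (sketch):
+ *
+ *	if (srpt_test_and_set_ch_state(ch, CH_DRAINING, CH_RELEASING))
+ *		srpt_release_channel(ch);
+ *
+ * which guarantees that the channel is released at most once even if the
+ * event fires more than once.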
+ */ +static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) +{ +	pr_debug("QP event %d on cm_id=%p sess_name=%s state=%d\n", +		 event->event, ch->cm_id, ch->sess_name, srpt_get_ch_state(ch)); + +	switch (event->event) { +	case IB_EVENT_COMM_EST: +		ib_cm_notify(ch->cm_id, event->event); +		break; +	case IB_EVENT_QP_LAST_WQE_REACHED: +		if (srpt_test_and_set_ch_state(ch, CH_DRAINING, +					       CH_RELEASING)) +			srpt_release_channel(ch); +		else +			pr_debug("%s: state %d - ignored LAST_WQE.\n", +				 ch->sess_name, srpt_get_ch_state(ch)); +		break; +	default: +		printk(KERN_ERR "received unrecognized IB QP event %d\n", +		       event->event); +		break; +	} +} + +/** + * srpt_set_ioc() - Helper function for initializing an IOUnitInfo structure. + * + * @slot: one-based slot number. + * @value: four-bit value. + * + * Copies the lowest four bits of value in element slot of the array of four + * bit elements called c_list (controller list). The index slot is one-based. + */ +static void srpt_set_ioc(u8 *c_list, u32 slot, u8 value) +{ +	u16 id; +	u8 tmp; + +	id = (slot - 1) / 2; +	if (slot & 0x1) { +		tmp = c_list[id] & 0xf; +		c_list[id] = (value << 4) | tmp; +	} else { +		tmp = c_list[id] & 0xf0; +		c_list[id] = (value & 0xf) | tmp; +	} +} + +/** + * srpt_get_class_port_info() - Copy ClassPortInfo to a management datagram. + * + * See also section 16.3.3.1 ClassPortInfo in the InfiniBand Architecture + * Specification. + */ +static void srpt_get_class_port_info(struct ib_dm_mad *mad) +{ +	struct ib_class_port_info *cif; + +	cif = (struct ib_class_port_info *)mad->data; +	memset(cif, 0, sizeof *cif); +	cif->base_version = 1; +	cif->class_version = 1; +	cif->resp_time_value = 20; + +	mad->mad_hdr.status = 0; +} + +/** + * srpt_get_iou() - Write IOUnitInfo to a management datagram. + * + * See also section 16.3.3.3 IOUnitInfo in the InfiniBand Architecture + * Specification. See also section B.7, table B.6 in the SRP r16a document. + */ +static void srpt_get_iou(struct ib_dm_mad *mad) +{ +	struct ib_dm_iou_info *ioui; +	u8 slot; +	int i; + +	ioui = (struct ib_dm_iou_info *)mad->data; +	ioui->change_id = __constant_cpu_to_be16(1); +	ioui->max_controllers = 16; + +	/* set present for slot 1 and empty for the rest */ +	srpt_set_ioc(ioui->controller_list, 1, 1); +	for (i = 1, slot = 2; i < 16; i++, slot++) +		srpt_set_ioc(ioui->controller_list, slot, 0); + +	mad->mad_hdr.status = 0; +} + +/** + * srpt_get_ioc() - Write IOControllerprofile to a management datagram. + * + * See also section 16.3.3.4 IOControllerProfile in the InfiniBand + * Architecture Specification. See also section B.7, table B.7 in the SRP + * r16a document. 
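+ *
+ * (Worked example for the controller list filled in via srpt_set_ioc()
+ * above: slot 1 occupies the high nibble of controller_list[0], slot 2 the
+ * low nibble of controller_list[0], slot 3 the high nibble of
+ * controller_list[1], and so on; srpt_get_iou() marks only slot 1 as
+ * present.)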
+ */ +static void srpt_get_ioc(struct srpt_port *sport, u32 slot, +			 struct ib_dm_mad *mad) +{ +	struct srpt_device *sdev = sport->sdev; +	struct ib_dm_ioc_profile *iocp; + +	iocp = (struct ib_dm_ioc_profile *)mad->data; + +	if (!slot || slot > 16) { +		mad->mad_hdr.status +			= __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); +		return; +	} + +	if (slot > 2) { +		mad->mad_hdr.status +			= __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC); +		return; +	} + +	memset(iocp, 0, sizeof *iocp); +	strcpy(iocp->id_string, SRPT_ID_STRING); +	iocp->guid = cpu_to_be64(srpt_service_guid); +	iocp->vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id); +	iocp->device_id = cpu_to_be32(sdev->dev_attr.vendor_part_id); +	iocp->device_version = cpu_to_be16(sdev->dev_attr.hw_ver); +	iocp->subsys_vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id); +	iocp->subsys_device_id = 0x0; +	iocp->io_class = __constant_cpu_to_be16(SRP_REV16A_IB_IO_CLASS); +	iocp->io_subclass = __constant_cpu_to_be16(SRP_IO_SUBCLASS); +	iocp->protocol = __constant_cpu_to_be16(SRP_PROTOCOL); +	iocp->protocol_version = __constant_cpu_to_be16(SRP_PROTOCOL_VERSION); +	iocp->send_queue_depth = cpu_to_be16(sdev->srq_size); +	iocp->rdma_read_depth = 4; +	iocp->send_size = cpu_to_be32(srp_max_req_size); +	iocp->rdma_size = cpu_to_be32(min(sport->port_attrib.srp_max_rdma_size, +					  1U << 24)); +	iocp->num_svc_entries = 1; +	iocp->op_cap_mask = SRP_SEND_TO_IOC | SRP_SEND_FROM_IOC | +		SRP_RDMA_READ_FROM_IOC | SRP_RDMA_WRITE_FROM_IOC; + +	mad->mad_hdr.status = 0; +} + +/** + * srpt_get_svc_entries() - Write ServiceEntries to a management datagram. + * + * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture + * Specification. See also section B.7, table B.8 in the SRP r16a document. + */ +static void srpt_get_svc_entries(u64 ioc_guid, +				 u16 slot, u8 hi, u8 lo, struct ib_dm_mad *mad) +{ +	struct ib_dm_svc_entries *svc_entries; + +	WARN_ON(!ioc_guid); + +	if (!slot || slot > 16) { +		mad->mad_hdr.status +			= __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); +		return; +	} + +	if (slot > 2 || lo > hi || hi > 1) { +		mad->mad_hdr.status +			= __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC); +		return; +	} + +	svc_entries = (struct ib_dm_svc_entries *)mad->data; +	memset(svc_entries, 0, sizeof *svc_entries); +	svc_entries->service_entries[0].id = cpu_to_be64(ioc_guid); +	snprintf(svc_entries->service_entries[0].name, +		 sizeof(svc_entries->service_entries[0].name), +		 "%s%016llx", +		 SRP_SERVICE_NAME_PREFIX, +		 ioc_guid); + +	mad->mad_hdr.status = 0; +} + +/** + * srpt_mgmt_method_get() - Process a received management datagram. + * @sp:      source port through which the MAD has been received. + * @rq_mad:  received MAD. + * @rsp_mad: response MAD. 
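+ *
+ * (For DM_ATTR_SVC_ENTRIES the 32-bit attr_mod is unpacked below as
+ * slot = bits 31..16, hi = bits 15..8 and lo = bits 7..0; an attr_mod of
+ * 0x00010000, for example, requests service entries 0..0 of slot 1.)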
+ */ +static void srpt_mgmt_method_get(struct srpt_port *sp, struct ib_mad *rq_mad, +				 struct ib_dm_mad *rsp_mad) +{ +	u16 attr_id; +	u32 slot; +	u8 hi, lo; + +	attr_id = be16_to_cpu(rq_mad->mad_hdr.attr_id); +	switch (attr_id) { +	case DM_ATTR_CLASS_PORT_INFO: +		srpt_get_class_port_info(rsp_mad); +		break; +	case DM_ATTR_IOU_INFO: +		srpt_get_iou(rsp_mad); +		break; +	case DM_ATTR_IOC_PROFILE: +		slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod); +		srpt_get_ioc(sp, slot, rsp_mad); +		break; +	case DM_ATTR_SVC_ENTRIES: +		slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod); +		hi = (u8) ((slot >> 8) & 0xff); +		lo = (u8) (slot & 0xff); +		slot = (u16) ((slot >> 16) & 0xffff); +		srpt_get_svc_entries(srpt_service_guid, +				     slot, hi, lo, rsp_mad); +		break; +	default: +		rsp_mad->mad_hdr.status = +		    __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); +		break; +	} +} + +/** + * srpt_mad_send_handler() - Post MAD-send callback function. + */ +static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent, +				  struct ib_mad_send_wc *mad_wc) +{ +	ib_destroy_ah(mad_wc->send_buf->ah); +	ib_free_send_mad(mad_wc->send_buf); +} + +/** + * srpt_mad_recv_handler() - MAD reception callback function. + */ +static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, +				  struct ib_mad_recv_wc *mad_wc) +{ +	struct srpt_port *sport = (struct srpt_port *)mad_agent->context; +	struct ib_ah *ah; +	struct ib_mad_send_buf *rsp; +	struct ib_dm_mad *dm_mad; + +	if (!mad_wc || !mad_wc->recv_buf.mad) +		return; + +	ah = ib_create_ah_from_wc(mad_agent->qp->pd, mad_wc->wc, +				  mad_wc->recv_buf.grh, mad_agent->port_num); +	if (IS_ERR(ah)) +		goto err; + +	BUILD_BUG_ON(offsetof(struct ib_dm_mad, data) != IB_MGMT_DEVICE_HDR); + +	rsp = ib_create_send_mad(mad_agent, mad_wc->wc->src_qp, +				 mad_wc->wc->pkey_index, 0, +				 IB_MGMT_DEVICE_HDR, IB_MGMT_DEVICE_DATA, +				 GFP_KERNEL); +	if (IS_ERR(rsp)) +		goto err_rsp; + +	rsp->ah = ah; + +	dm_mad = rsp->mad; +	memcpy(dm_mad, mad_wc->recv_buf.mad, sizeof *dm_mad); +	dm_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; +	dm_mad->mad_hdr.status = 0; + +	switch (mad_wc->recv_buf.mad->mad_hdr.method) { +	case IB_MGMT_METHOD_GET: +		srpt_mgmt_method_get(sport, mad_wc->recv_buf.mad, dm_mad); +		break; +	case IB_MGMT_METHOD_SET: +		dm_mad->mad_hdr.status = +		    __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); +		break; +	default: +		dm_mad->mad_hdr.status = +		    __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD); +		break; +	} + +	if (!ib_post_send_mad(rsp, NULL)) { +		ib_free_recv_mad(mad_wc); +		/* will destroy_ah & free_send_mad in send completion */ +		return; +	} + +	ib_free_send_mad(rsp); + +err_rsp: +	ib_destroy_ah(ah); +err: +	ib_free_recv_mad(mad_wc); +} + +/** + * srpt_refresh_port() - Configure a HCA port. + * + * Enable InfiniBand management datagram processing, update the cached sm_lid, + * lid and gid values, and register a callback function for processing MADs + * on the specified port. + * + * Note: It is safe to call this function more than once for the same port. 
+ */ +static int srpt_refresh_port(struct srpt_port *sport) +{ +	struct ib_mad_reg_req reg_req; +	struct ib_port_modify port_modify; +	struct ib_port_attr port_attr; +	int ret; + +	memset(&port_modify, 0, sizeof port_modify); +	port_modify.set_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; +	port_modify.clr_port_cap_mask = 0; + +	ret = ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify); +	if (ret) +		goto err_mod_port; + +	ret = ib_query_port(sport->sdev->device, sport->port, &port_attr); +	if (ret) +		goto err_query_port; + +	sport->sm_lid = port_attr.sm_lid; +	sport->lid = port_attr.lid; + +	ret = ib_query_gid(sport->sdev->device, sport->port, 0, &sport->gid); +	if (ret) +		goto err_query_port; + +	if (!sport->mad_agent) { +		memset(®_req, 0, sizeof reg_req); +		reg_req.mgmt_class = IB_MGMT_CLASS_DEVICE_MGMT; +		reg_req.mgmt_class_version = IB_MGMT_BASE_VERSION; +		set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask); +		set_bit(IB_MGMT_METHOD_SET, reg_req.method_mask); + +		sport->mad_agent = ib_register_mad_agent(sport->sdev->device, +							 sport->port, +							 IB_QPT_GSI, +							 ®_req, 0, +							 srpt_mad_send_handler, +							 srpt_mad_recv_handler, +							 sport); +		if (IS_ERR(sport->mad_agent)) { +			ret = PTR_ERR(sport->mad_agent); +			sport->mad_agent = NULL; +			goto err_query_port; +		} +	} + +	return 0; + +err_query_port: + +	port_modify.set_port_cap_mask = 0; +	port_modify.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; +	ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify); + +err_mod_port: + +	return ret; +} + +/** + * srpt_unregister_mad_agent() - Unregister MAD callback functions. + * + * Note: It is safe to call this function more than once for the same device. + */ +static void srpt_unregister_mad_agent(struct srpt_device *sdev) +{ +	struct ib_port_modify port_modify = { +		.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP, +	}; +	struct srpt_port *sport; +	int i; + +	for (i = 1; i <= sdev->device->phys_port_cnt; i++) { +		sport = &sdev->port[i - 1]; +		WARN_ON(sport->port != i); +		if (ib_modify_port(sdev->device, i, 0, &port_modify) < 0) +			printk(KERN_ERR "disabling MAD processing failed.\n"); +		if (sport->mad_agent) { +			ib_unregister_mad_agent(sport->mad_agent); +			sport->mad_agent = NULL; +		} +	} +} + +/** + * srpt_alloc_ioctx() - Allocate an SRPT I/O context structure. + */ +static struct srpt_ioctx *srpt_alloc_ioctx(struct srpt_device *sdev, +					   int ioctx_size, int dma_size, +					   enum dma_data_direction dir) +{ +	struct srpt_ioctx *ioctx; + +	ioctx = kmalloc(ioctx_size, GFP_KERNEL); +	if (!ioctx) +		goto err; + +	ioctx->buf = kmalloc(dma_size, GFP_KERNEL); +	if (!ioctx->buf) +		goto err_free_ioctx; + +	ioctx->dma = ib_dma_map_single(sdev->device, ioctx->buf, dma_size, dir); +	if (ib_dma_mapping_error(sdev->device, ioctx->dma)) +		goto err_free_buf; + +	return ioctx; + +err_free_buf: +	kfree(ioctx->buf); +err_free_ioctx: +	kfree(ioctx); +err: +	return NULL; +} + +/** + * srpt_free_ioctx() - Free an SRPT I/O context structure. + */ +static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx, +			    int dma_size, enum dma_data_direction dir) +{ +	if (!ioctx) +		return; + +	ib_dma_unmap_single(sdev->device, ioctx->dma, dma_size, dir); +	kfree(ioctx->buf); +	kfree(ioctx); +} + +/** + * srpt_alloc_ioctx_ring() - Allocate a ring of SRPT I/O context structures. + * @sdev:       Device to allocate the I/O context ring for. + * @ring_size:  Number of elements in the I/O context ring. + * @ioctx_size: I/O context size. 
+ * @dma_size:   DMA buffer size. + * @dir:        DMA data direction. + */ +static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct srpt_device *sdev, +				int ring_size, int ioctx_size, +				int dma_size, enum dma_data_direction dir) +{ +	struct srpt_ioctx **ring; +	int i; + +	WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx) +		&& ioctx_size != sizeof(struct srpt_send_ioctx)); + +	ring = kmalloc(ring_size * sizeof(ring[0]), GFP_KERNEL); +	if (!ring) +		goto out; +	for (i = 0; i < ring_size; ++i) { +		ring[i] = srpt_alloc_ioctx(sdev, ioctx_size, dma_size, dir); +		if (!ring[i]) +			goto err; +		ring[i]->index = i; +	} +	goto out; + +err: +	while (--i >= 0) +		srpt_free_ioctx(sdev, ring[i], dma_size, dir); +	kfree(ring); +	ring = NULL; +out: +	return ring; +} + +/** + * srpt_free_ioctx_ring() - Free the ring of SRPT I/O context structures. + */ +static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring, +				 struct srpt_device *sdev, int ring_size, +				 int dma_size, enum dma_data_direction dir) +{ +	int i; + +	for (i = 0; i < ring_size; ++i) +		srpt_free_ioctx(sdev, ioctx_ring[i], dma_size, dir); +	kfree(ioctx_ring); +} + +/** + * srpt_get_cmd_state() - Get the state of a SCSI command. + */ +static enum srpt_command_state srpt_get_cmd_state(struct srpt_send_ioctx *ioctx) +{ +	enum srpt_command_state state; +	unsigned long flags; + +	BUG_ON(!ioctx); + +	spin_lock_irqsave(&ioctx->spinlock, flags); +	state = ioctx->state; +	spin_unlock_irqrestore(&ioctx->spinlock, flags); +	return state; +} + +/** + * srpt_set_cmd_state() - Set the state of a SCSI command. + * + * Does not modify the state of aborted commands. Returns the previous command + * state. + */ +static enum srpt_command_state srpt_set_cmd_state(struct srpt_send_ioctx *ioctx, +						  enum srpt_command_state new) +{ +	enum srpt_command_state previous; +	unsigned long flags; + +	BUG_ON(!ioctx); + +	spin_lock_irqsave(&ioctx->spinlock, flags); +	previous = ioctx->state; +	if (previous != SRPT_STATE_DONE) +		ioctx->state = new; +	spin_unlock_irqrestore(&ioctx->spinlock, flags); + +	return previous; +} + +/** + * srpt_test_and_set_cmd_state() - Test and set the state of a command. + * + * Returns true if and only if the previous command state was equal to 'old'. + */ +static bool srpt_test_and_set_cmd_state(struct srpt_send_ioctx *ioctx, +					enum srpt_command_state old, +					enum srpt_command_state new) +{ +	enum srpt_command_state previous; +	unsigned long flags; + +	WARN_ON(!ioctx); +	WARN_ON(old == SRPT_STATE_DONE); +	WARN_ON(new == SRPT_STATE_NEW); + +	spin_lock_irqsave(&ioctx->spinlock, flags); +	previous = ioctx->state; +	if (previous == old) +		ioctx->state = new; +	spin_unlock_irqrestore(&ioctx->spinlock, flags); +	return previous == old; +} + +/** + * srpt_post_recv() - Post an IB receive request. + */ +static int srpt_post_recv(struct srpt_device *sdev, +			  struct srpt_recv_ioctx *ioctx) +{ +	struct ib_sge list; +	struct ib_recv_wr wr, *bad_wr; + +	BUG_ON(!sdev); +	wr.wr_id = encode_wr_id(SRPT_RECV, ioctx->ioctx.index); + +	list.addr = ioctx->ioctx.dma; +	list.length = srp_max_req_size; +	list.lkey = sdev->mr->lkey; + +	wr.next = NULL; +	wr.sg_list = &list; +	wr.num_sge = 1; + +	return ib_post_srq_recv(sdev->srq, &wr, &bad_wr); +} + +/** + * srpt_post_send() - Post an IB send request. + * + * Returns zero upon success and a non-zero value upon failure. 
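+ *
+ * (Below, ch->req_lim is incremented optimistically and a send queue slot
+ * is reserved by decrementing ch->sq_wr_avail; if either the reservation
+ * or ib_post_send() fails, the out: label undoes both counters.)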
+ */ +static int srpt_post_send(struct srpt_rdma_ch *ch, +			  struct srpt_send_ioctx *ioctx, int len) +{ +	struct ib_sge list; +	struct ib_send_wr wr, *bad_wr; +	struct srpt_device *sdev = ch->sport->sdev; +	int ret; + +	atomic_inc(&ch->req_lim); + +	ret = -ENOMEM; +	if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) { +		printk(KERN_WARNING "IB send queue full (needed 1)\n"); +		goto out; +	} + +	ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, len, +				      DMA_TO_DEVICE); + +	list.addr = ioctx->ioctx.dma; +	list.length = len; +	list.lkey = sdev->mr->lkey; + +	wr.next = NULL; +	wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index); +	wr.sg_list = &list; +	wr.num_sge = 1; +	wr.opcode = IB_WR_SEND; +	wr.send_flags = IB_SEND_SIGNALED; + +	ret = ib_post_send(ch->qp, &wr, &bad_wr); + +out: +	if (ret < 0) { +		atomic_inc(&ch->sq_wr_avail); +		atomic_dec(&ch->req_lim); +	} +	return ret; +} + +/** + * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request. + * @ioctx: Pointer to the I/O context associated with the request. + * @srp_cmd: Pointer to the SRP_CMD request data. + * @dir: Pointer to the variable to which the transfer direction will be + *   written. + * @data_len: Pointer to the variable to which the total data length of all + *   descriptors in the SRP_CMD request will be written. + * + * This function initializes ioctx->nrbuf and ioctx->r_bufs. + * + * Returns -EINVAL when the SRP_CMD request contains inconsistent descriptors; + * -ENOMEM when memory allocation fails and zero upon success. + */ +static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, +			     struct srp_cmd *srp_cmd, +			     enum dma_data_direction *dir, u64 *data_len) +{ +	struct srp_indirect_buf *idb; +	struct srp_direct_buf *db; +	unsigned add_cdb_offset; +	int ret; + +	/* +	 * The pointer computations below will only be compiled correctly +	 * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check +	 * whether srp_cmd::add_data has been declared as a byte pointer. +	 */ +	BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) +		     && !__same_type(srp_cmd->add_data[0], (u8)0)); + +	BUG_ON(!dir); +	BUG_ON(!data_len); + +	ret = 0; +	*data_len = 0; + +	/* +	 * The lower four bits of the buffer format field contain the DATA-IN +	 * buffer descriptor format, and the highest four bits contain the +	 * DATA-OUT buffer descriptor format. +	 */ +	*dir = DMA_NONE; +	if (srp_cmd->buf_fmt & 0xf) +		/* DATA-IN: transfer data from target to initiator (read). */ +		*dir = DMA_FROM_DEVICE; +	else if (srp_cmd->buf_fmt >> 4) +		/* DATA-OUT: transfer data from initiator to target (write). */ +		*dir = DMA_TO_DEVICE; + +	/* +	 * According to the SRP spec, the lower two bits of the 'ADDITIONAL +	 * CDB LENGTH' field are reserved and the size in bytes of this field +	 * is four times the value specified in bits 3..7. Hence the "& ~3". 
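+	 *
+	 * Worked example, derived from the statements below: with
+	 * add_cdb_len == 0x10 the mask yields add_cdb_offset == 16, so the
+	 * descriptor(s) start 16 bytes into add_data[]; a buf_fmt whose low
+	 * nibble is SRP_DATA_DESC_DIRECT and whose high nibble is zero then
+	 * describes a single DATA-IN (read) direct descriptor at that offset.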
+	 */ +	add_cdb_offset = srp_cmd->add_cdb_len & ~3; +	if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) || +	    ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) { +		ioctx->n_rbuf = 1; +		ioctx->rbufs = &ioctx->single_rbuf; + +		db = (struct srp_direct_buf *)(srp_cmd->add_data +					       + add_cdb_offset); +		memcpy(ioctx->rbufs, db, sizeof *db); +		*data_len = be32_to_cpu(db->len); +	} else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) || +		   ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) { +		idb = (struct srp_indirect_buf *)(srp_cmd->add_data +						  + add_cdb_offset); + +		ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof *db; + +		if (ioctx->n_rbuf > +		    (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) { +			printk(KERN_ERR "received unsupported SRP_CMD request" +			       " type (%u out + %u in != %u / %zu)\n", +			       srp_cmd->data_out_desc_cnt, +			       srp_cmd->data_in_desc_cnt, +			       be32_to_cpu(idb->table_desc.len), +			       sizeof(*db)); +			ioctx->n_rbuf = 0; +			ret = -EINVAL; +			goto out; +		} + +		if (ioctx->n_rbuf == 1) +			ioctx->rbufs = &ioctx->single_rbuf; +		else { +			ioctx->rbufs = +				kmalloc(ioctx->n_rbuf * sizeof *db, GFP_ATOMIC); +			if (!ioctx->rbufs) { +				ioctx->n_rbuf = 0; +				ret = -ENOMEM; +				goto out; +			} +		} + +		db = idb->desc_list; +		memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof *db); +		*data_len = be32_to_cpu(idb->len); +	} +out: +	return ret; +} + +/** + * srpt_init_ch_qp() - Initialize queue pair attributes. + * + * Initialized the attributes of queue pair 'qp' by allowing local write, + * remote read and remote write. Also transitions 'qp' to state IB_QPS_INIT. + */ +static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ +	struct ib_qp_attr *attr; +	int ret; + +	attr = kzalloc(sizeof *attr, GFP_KERNEL); +	if (!attr) +		return -ENOMEM; + +	attr->qp_state = IB_QPS_INIT; +	attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | +	    IB_ACCESS_REMOTE_WRITE; +	attr->port_num = ch->sport->port; +	attr->pkey_index = 0; + +	ret = ib_modify_qp(qp, attr, +			   IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PORT | +			   IB_QP_PKEY_INDEX); + +	kfree(attr); +	return ret; +} + +/** + * srpt_ch_qp_rtr() - Change the state of a channel to 'ready to receive' (RTR). + * @ch: channel of the queue pair. + * @qp: queue pair to change the state of. + * + * Returns zero upon success and a negative value upon failure. + * + * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system. + * If this structure ever becomes larger, it might be necessary to allocate + * it dynamically instead of on the stack. + */ +static int srpt_ch_qp_rtr(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ +	struct ib_qp_attr qp_attr; +	int attr_mask; +	int ret; + +	qp_attr.qp_state = IB_QPS_RTR; +	ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask); +	if (ret) +		goto out; + +	qp_attr.max_dest_rd_atomic = 4; + +	ret = ib_modify_qp(qp, &qp_attr, attr_mask); + +out: +	return ret; +} + +/** + * srpt_ch_qp_rts() - Change the state of a channel to 'ready to send' (RTS). + * @ch: channel of the queue pair. + * @qp: queue pair to change the state of. + * + * Returns zero upon success and a negative value upon failure. + * + * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system. + * If this structure ever becomes larger, it might be necessary to allocate + * it dynamically instead of on the stack. 
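+ *
+ * Sketch of the intended ordering of the three QP helpers during connection
+ * establishment (the CM callbacks that call them appear later in this file):
+ *
+ *	srpt_init_ch_qp(ch, ch->qp);	INIT, when the channel is created
+ *	srpt_ch_qp_rtr(ch, ch->qp);	RTR, while handling the CM REQ
+ *	srpt_ch_qp_rts(ch, ch->qp);	RTS, while handling the CM RTU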
+ */ +static int srpt_ch_qp_rts(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ +	struct ib_qp_attr qp_attr; +	int attr_mask; +	int ret; + +	qp_attr.qp_state = IB_QPS_RTS; +	ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask); +	if (ret) +		goto out; + +	qp_attr.max_rd_atomic = 4; + +	ret = ib_modify_qp(qp, &qp_attr, attr_mask); + +out: +	return ret; +} + +/** + * srpt_ch_qp_err() - Set the channel queue pair state to 'error'. + */ +static int srpt_ch_qp_err(struct srpt_rdma_ch *ch) +{ +	struct ib_qp_attr qp_attr; + +	qp_attr.qp_state = IB_QPS_ERR; +	return ib_modify_qp(ch->qp, &qp_attr, IB_QP_STATE); +} + +/** + * srpt_unmap_sg_to_ib_sge() - Unmap an IB SGE list. + */ +static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch, +				    struct srpt_send_ioctx *ioctx) +{ +	struct scatterlist *sg; +	enum dma_data_direction dir; + +	BUG_ON(!ch); +	BUG_ON(!ioctx); +	BUG_ON(ioctx->n_rdma && !ioctx->rdma_ius); + +	while (ioctx->n_rdma) +		kfree(ioctx->rdma_ius[--ioctx->n_rdma].sge); + +	kfree(ioctx->rdma_ius); +	ioctx->rdma_ius = NULL; + +	if (ioctx->mapped_sg_count) { +		sg = ioctx->sg; +		WARN_ON(!sg); +		dir = ioctx->cmd.data_direction; +		BUG_ON(dir == DMA_NONE); +		ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt, +				opposite_dma_dir(dir)); +		ioctx->mapped_sg_count = 0; +	} +} + +/** + * srpt_map_sg_to_ib_sge() - Map an SG list to an IB SGE list. + */ +static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, +				 struct srpt_send_ioctx *ioctx) +{ +	struct ib_device *dev = ch->sport->sdev->device; +	struct se_cmd *cmd; +	struct scatterlist *sg, *sg_orig; +	int sg_cnt; +	enum dma_data_direction dir; +	struct rdma_iu *riu; +	struct srp_direct_buf *db; +	dma_addr_t dma_addr; +	struct ib_sge *sge; +	u64 raddr; +	u32 rsize; +	u32 tsize; +	u32 dma_len; +	int count, nrdma; +	int i, j, k; + +	BUG_ON(!ch); +	BUG_ON(!ioctx); +	cmd = &ioctx->cmd; +	dir = cmd->data_direction; +	BUG_ON(dir == DMA_NONE); + +	ioctx->sg = sg = sg_orig = cmd->t_data_sg; +	ioctx->sg_cnt = sg_cnt = cmd->t_data_nents; + +	count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt, +			      opposite_dma_dir(dir)); +	if (unlikely(!count)) +		return -EAGAIN; + +	ioctx->mapped_sg_count = count; + +	if (ioctx->rdma_ius && ioctx->n_rdma_ius) +		nrdma = ioctx->n_rdma_ius; +	else { +		nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE +			+ ioctx->n_rbuf; + +		ioctx->rdma_ius = kzalloc(nrdma * sizeof *riu, GFP_KERNEL); +		if (!ioctx->rdma_ius) +			goto free_mem; + +		ioctx->n_rdma_ius = nrdma; +	} + +	db = ioctx->rbufs; +	tsize = cmd->data_length; +	dma_len = ib_sg_dma_len(dev, &sg[0]); +	riu = ioctx->rdma_ius; + +	/* +	 * For each remote desc - calculate the #ib_sge. 
+	 * If #ib_sge < SRPT_DEF_SG_PER_WQE per rdma operation then +	 *      each remote desc rdma_iu is required a rdma wr; +	 * else +	 *      we need to allocate extra rdma_iu to carry extra #ib_sge in +	 *      another rdma wr +	 */ +	for (i = 0, j = 0; +	     j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { +		rsize = be32_to_cpu(db->len); +		raddr = be64_to_cpu(db->va); +		riu->raddr = raddr; +		riu->rkey = be32_to_cpu(db->key); +		riu->sge_cnt = 0; + +		/* calculate how many sge required for this remote_buf */ +		while (rsize > 0 && tsize > 0) { + +			if (rsize >= dma_len) { +				tsize -= dma_len; +				rsize -= dma_len; +				raddr += dma_len; + +				if (tsize > 0) { +					++j; +					if (j < count) { +						sg = sg_next(sg); +						dma_len = ib_sg_dma_len( +								dev, sg); +					} +				} +			} else { +				tsize -= rsize; +				dma_len -= rsize; +				rsize = 0; +			} + +			++riu->sge_cnt; + +			if (rsize > 0 && riu->sge_cnt == SRPT_DEF_SG_PER_WQE) { +				++ioctx->n_rdma; +				riu->sge = +				    kmalloc(riu->sge_cnt * sizeof *riu->sge, +					    GFP_KERNEL); +				if (!riu->sge) +					goto free_mem; + +				++riu; +				riu->sge_cnt = 0; +				riu->raddr = raddr; +				riu->rkey = be32_to_cpu(db->key); +			} +		} + +		++ioctx->n_rdma; +		riu->sge = kmalloc(riu->sge_cnt * sizeof *riu->sge, +				   GFP_KERNEL); +		if (!riu->sge) +			goto free_mem; +	} + +	db = ioctx->rbufs; +	tsize = cmd->data_length; +	riu = ioctx->rdma_ius; +	sg = sg_orig; +	dma_len = ib_sg_dma_len(dev, &sg[0]); +	dma_addr = ib_sg_dma_address(dev, &sg[0]); + +	/* this second loop is really mapped sg_addres to rdma_iu->ib_sge */ +	for (i = 0, j = 0; +	     j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { +		rsize = be32_to_cpu(db->len); +		sge = riu->sge; +		k = 0; + +		while (rsize > 0 && tsize > 0) { +			sge->addr = dma_addr; +			sge->lkey = ch->sport->sdev->mr->lkey; + +			if (rsize >= dma_len) { +				sge->length = +					(tsize < dma_len) ? tsize : dma_len; +				tsize -= dma_len; +				rsize -= dma_len; + +				if (tsize > 0) { +					++j; +					if (j < count) { +						sg = sg_next(sg); +						dma_len = ib_sg_dma_len( +								dev, sg); +						dma_addr = ib_sg_dma_address( +								dev, sg); +					} +				} +			} else { +				sge->length = (tsize < rsize) ? tsize : rsize; +				tsize -= rsize; +				dma_len -= rsize; +				dma_addr += rsize; +				rsize = 0; +			} + +			++k; +			if (k == riu->sge_cnt && rsize > 0 && tsize > 0) { +				++riu; +				sge = riu->sge; +				k = 0; +			} else if (rsize > 0 && tsize > 0) +				++sge; +		} +	} + +	return 0; + +free_mem: +	srpt_unmap_sg_to_ib_sge(ch, ioctx); + +	return -ENOMEM; +} + +/** + * srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator. 
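+ *
+ * (Note on srpt_map_sg_to_ib_sge() above: its first loop only counts the
+ * required ib_sge entries and allocates the per-descriptor sge arrays; the
+ * second loop walks the same rbufs and sg lists again and fills in the DMA
+ * addresses and lengths.)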
+ */ +static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) +{ +	struct srpt_send_ioctx *ioctx; +	unsigned long flags; + +	BUG_ON(!ch); + +	ioctx = NULL; +	spin_lock_irqsave(&ch->spinlock, flags); +	if (!list_empty(&ch->free_list)) { +		ioctx = list_first_entry(&ch->free_list, +					 struct srpt_send_ioctx, free_list); +		list_del(&ioctx->free_list); +	} +	spin_unlock_irqrestore(&ch->spinlock, flags); + +	if (!ioctx) +		return ioctx; + +	BUG_ON(ioctx->ch != ch); +	spin_lock_init(&ioctx->spinlock); +	ioctx->state = SRPT_STATE_NEW; +	ioctx->n_rbuf = 0; +	ioctx->rbufs = NULL; +	ioctx->n_rdma = 0; +	ioctx->n_rdma_ius = 0; +	ioctx->rdma_ius = NULL; +	ioctx->mapped_sg_count = 0; +	init_completion(&ioctx->tx_done); +	ioctx->queue_status_only = false; +	/* +	 * transport_init_se_cmd() does not initialize all fields, so do it +	 * here. +	 */ +	memset(&ioctx->cmd, 0, sizeof(ioctx->cmd)); +	memset(&ioctx->sense_data, 0, sizeof(ioctx->sense_data)); + +	return ioctx; +} + +/** + * srpt_abort_cmd() - Abort a SCSI command. + * @ioctx:   I/O context associated with the SCSI command. + * @context: Preferred execution context. + */ +static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) +{ +	enum srpt_command_state state; +	unsigned long flags; + +	BUG_ON(!ioctx); + +	/* +	 * If the command is in a state where the target core is waiting for +	 * the ib_srpt driver, change the state to the next state. Changing +	 * the state of the command from SRPT_STATE_NEED_DATA to +	 * SRPT_STATE_DATA_IN ensures that srpt_xmit_response() will call this +	 * function a second time. +	 */ + +	spin_lock_irqsave(&ioctx->spinlock, flags); +	state = ioctx->state; +	switch (state) { +	case SRPT_STATE_NEED_DATA: +		ioctx->state = SRPT_STATE_DATA_IN; +		break; +	case SRPT_STATE_DATA_IN: +	case SRPT_STATE_CMD_RSP_SENT: +	case SRPT_STATE_MGMT_RSP_SENT: +		ioctx->state = SRPT_STATE_DONE; +		break; +	default: +		break; +	} +	spin_unlock_irqrestore(&ioctx->spinlock, flags); + +	if (state == SRPT_STATE_DONE) { +		struct srpt_rdma_ch *ch = ioctx->ch; + +		BUG_ON(ch->sess == NULL); + +		target_put_sess_cmd(ch->sess, &ioctx->cmd); +		goto out; +	} + +	pr_debug("Aborting cmd with state %d and tag %lld\n", state, +		 ioctx->tag); + +	switch (state) { +	case SRPT_STATE_NEW: +	case SRPT_STATE_DATA_IN: +	case SRPT_STATE_MGMT: +		/* +		 * Do nothing - defer abort processing until +		 * srpt_queue_response() is invoked. +		 */ +		WARN_ON(!transport_check_aborted_status(&ioctx->cmd, false)); +		break; +	case SRPT_STATE_NEED_DATA: +		/* DMA_TO_DEVICE (write) - RDMA read error. */ + +		/* XXX(hch): this is a horrible layering violation.. */ +		spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); +		ioctx->cmd.transport_state &= ~CMD_T_ACTIVE; +		spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); +		break; +	case SRPT_STATE_CMD_RSP_SENT: +		/* +		 * SRP_RSP sending failed or the SRP_RSP send completion has +		 * not been received in time. +		 */ +		srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); +		target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); +		break; +	case SRPT_STATE_MGMT_RSP_SENT: +		srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); +		target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); +		break; +	default: +		WARN(1, "Unexpected command state (%d)", state); +		break; +	} + +out: +	return state; +} + +/** + * srpt_handle_send_err_comp() - Process an IB_WC_SEND error completion. 
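+ *
+ * (srpt_abort_cmd() above, which this function ends up calling, first moves
+ * SRPT_STATE_NEED_DATA to SRPT_STATE_DATA_IN, and SRPT_STATE_DATA_IN,
+ * SRPT_STATE_CMD_RSP_SENT and SRPT_STATE_MGMT_RSP_SENT to SRPT_STATE_DONE;
+ * every other state is left unchanged.)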
+ */ +static void srpt_handle_send_err_comp(struct srpt_rdma_ch *ch, u64 wr_id) +{ +	struct srpt_send_ioctx *ioctx; +	enum srpt_command_state state; +	struct se_cmd *cmd; +	u32 index; + +	atomic_inc(&ch->sq_wr_avail); + +	index = idx_from_wr_id(wr_id); +	ioctx = ch->ioctx_ring[index]; +	state = srpt_get_cmd_state(ioctx); +	cmd = &ioctx->cmd; + +	WARN_ON(state != SRPT_STATE_CMD_RSP_SENT +		&& state != SRPT_STATE_MGMT_RSP_SENT +		&& state != SRPT_STATE_NEED_DATA +		&& state != SRPT_STATE_DONE); + +	/* If SRP_RSP sending failed, undo the ch->req_lim change. */ +	if (state == SRPT_STATE_CMD_RSP_SENT +	    || state == SRPT_STATE_MGMT_RSP_SENT) +		atomic_dec(&ch->req_lim); + +	srpt_abort_cmd(ioctx); +} + +/** + * srpt_handle_send_comp() - Process an IB send completion notification. + */ +static void srpt_handle_send_comp(struct srpt_rdma_ch *ch, +				  struct srpt_send_ioctx *ioctx) +{ +	enum srpt_command_state state; + +	atomic_inc(&ch->sq_wr_avail); + +	state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + +	if (WARN_ON(state != SRPT_STATE_CMD_RSP_SENT +		    && state != SRPT_STATE_MGMT_RSP_SENT +		    && state != SRPT_STATE_DONE)) +		pr_debug("state = %d\n", state); + +	if (state != SRPT_STATE_DONE) { +		srpt_unmap_sg_to_ib_sge(ch, ioctx); +		transport_generic_free_cmd(&ioctx->cmd, 0); +	} else { +		printk(KERN_ERR "IB completion has been received too late for" +		       " wr_id = %u.\n", ioctx->ioctx.index); +	} +} + +/** + * srpt_handle_rdma_comp() - Process an IB RDMA completion notification. + * + * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping + * the data that has been transferred via IB RDMA had to be postponed until the + * check_stop_free() callback.  None of this is necessary anymore and needs to + * be cleaned up. + */ +static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch, +				  struct srpt_send_ioctx *ioctx, +				  enum srpt_opcode opcode) +{ +	WARN_ON(ioctx->n_rdma <= 0); +	atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); + +	if (opcode == SRPT_RDMA_READ_LAST) { +		if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA, +						SRPT_STATE_DATA_IN)) +			target_execute_cmd(&ioctx->cmd); +		else +			printk(KERN_ERR "%s[%d]: wrong state = %d\n", __func__, +			       __LINE__, srpt_get_cmd_state(ioctx)); +	} else if (opcode == SRPT_RDMA_ABORT) { +		ioctx->rdma_aborted = true; +	} else { +		WARN(true, "unexpected opcode %d\n", opcode); +	} +} + +/** + * srpt_handle_rdma_err_comp() - Process an IB RDMA error completion. + */ +static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, +				      struct srpt_send_ioctx *ioctx, +				      enum srpt_opcode opcode) +{ +	struct se_cmd *cmd; +	enum srpt_command_state state; + +	cmd = &ioctx->cmd; +	state = srpt_get_cmd_state(ioctx); +	switch (opcode) { +	case SRPT_RDMA_READ_LAST: +		if (ioctx->n_rdma <= 0) { +			printk(KERN_ERR "Received invalid RDMA read" +			       " error completion with idx %d\n", +			       ioctx->ioctx.index); +			break; +		} +		atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); +		if (state == SRPT_STATE_NEED_DATA) +			srpt_abort_cmd(ioctx); +		else +			printk(KERN_ERR "%s[%d]: wrong state = %d\n", +			       __func__, __LINE__, state); +		break; +	case SRPT_RDMA_WRITE_LAST: +		break; +	default: +		printk(KERN_ERR "%s[%d]: opcode = %u\n", __func__, +		       __LINE__, opcode); +		break; +	} +} + +/** + * srpt_build_cmd_rsp() - Build an SRP_RSP response. + * @ch: RDMA channel through which the request has been received. + * @ioctx: I/O context associated with the SRP_CMD request. 
The response will + *   be built in the buffer ioctx->buf points at and hence this function will + *   overwrite the request data. + * @tag: tag of the request for which this response is being generated. + * @status: value for the STATUS field of the SRP_RSP information unit. + * + * Returns the size in bytes of the SRP_RSP response. + * + * An SRP_RSP response contains a SCSI status or service response. See also + * section 6.9 in the SRP r16a document for the format of an SRP_RSP + * response. See also SPC-2 for more information about sense data. + */ +static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch, +			      struct srpt_send_ioctx *ioctx, u64 tag, +			      int status) +{ +	struct srp_rsp *srp_rsp; +	const u8 *sense_data; +	int sense_data_len, max_sense_len; + +	/* +	 * The lowest bit of all SAM-3 status codes is zero (see also +	 * paragraph 5.3 in SAM-3). +	 */ +	WARN_ON(status & 1); + +	srp_rsp = ioctx->ioctx.buf; +	BUG_ON(!srp_rsp); + +	sense_data = ioctx->sense_data; +	sense_data_len = ioctx->cmd.scsi_sense_length; +	WARN_ON(sense_data_len > sizeof(ioctx->sense_data)); + +	memset(srp_rsp, 0, sizeof *srp_rsp); +	srp_rsp->opcode = SRP_RSP; +	srp_rsp->req_lim_delta = +		__constant_cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0)); +	srp_rsp->tag = tag; +	srp_rsp->status = status; + +	if (sense_data_len) { +		BUILD_BUG_ON(MIN_MAX_RSP_SIZE <= sizeof(*srp_rsp)); +		max_sense_len = ch->max_ti_iu_len - sizeof(*srp_rsp); +		if (sense_data_len > max_sense_len) { +			printk(KERN_WARNING "truncated sense data from %d to %d" +			       " bytes\n", sense_data_len, max_sense_len); +			sense_data_len = max_sense_len; +		} + +		srp_rsp->flags |= SRP_RSP_FLAG_SNSVALID; +		srp_rsp->sense_data_len = cpu_to_be32(sense_data_len); +		memcpy(srp_rsp + 1, sense_data, sense_data_len); +	} + +	return sizeof(*srp_rsp) + sense_data_len; +} + +/** + * srpt_build_tskmgmt_rsp() - Build a task management response. + * @ch:       RDMA channel through which the request has been received. + * @ioctx:    I/O context in which the SRP_RSP response will be built. + * @rsp_code: RSP_CODE that will be stored in the response. + * @tag:      Tag of the request for which this response is being generated. + * + * Returns the size in bytes of the SRP_RSP response. + * + * An SRP_RSP response contains a SCSI status or service response. See also + * section 6.9 in the SRP r16a document for the format of an SRP_RSP + * response. + */ +static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, +				  struct srpt_send_ioctx *ioctx, +				  u8 rsp_code, u64 tag) +{ +	struct srp_rsp *srp_rsp; +	int resp_data_len; +	int resp_len; + +	resp_data_len = 4; +	resp_len = sizeof(*srp_rsp) + resp_data_len; + +	srp_rsp = ioctx->ioctx.buf; +	BUG_ON(!srp_rsp); +	memset(srp_rsp, 0, sizeof *srp_rsp); + +	srp_rsp->opcode = SRP_RSP; +	srp_rsp->req_lim_delta = __constant_cpu_to_be32(1 +				    + atomic_xchg(&ch->req_lim_delta, 0)); +	srp_rsp->tag = tag; + +	srp_rsp->flags |= SRP_RSP_FLAG_RSPVALID; +	srp_rsp->resp_data_len = cpu_to_be32(resp_data_len); +	srp_rsp->data[3] = rsp_code; + +	return resp_len; +} + +#define NO_SUCH_LUN ((uint64_t)-1LL) + +/* + * SCSI LUN addressing method. See also SAM-2 and the section about + * eight byte LUNs. + */ +enum scsi_lun_addr_method { +	SCSI_LUN_ADDR_METHOD_PERIPHERAL   = 0, +	SCSI_LUN_ADDR_METHOD_FLAT         = 1, +	SCSI_LUN_ADDR_METHOD_LUN          = 2, +	SCSI_LUN_ADDR_METHOD_EXTENDED_LUN = 3, +}; + +/* + * srpt_unpack_lun() - Convert from network LUN to linear LUN. 
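+ *
+ * Two worked examples, derived from the code below: the eight-byte LUN
+ * 00 01 00 00 00 00 00 00 uses the peripheral addressing method and unpacks
+ * to LUN 1, while 40 02 00 00 00 00 00 00 uses flat addressing and unpacks
+ * to LUN 2.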
+ * + * Convert an 2-byte, 4-byte, 6-byte or 8-byte LUN structure in network byte + * order (big endian) to a linear LUN. Supports three LUN addressing methods: + * peripheral, flat and logical unit. See also SAM-2, section 4.9.4 (page 40). + */ +static uint64_t srpt_unpack_lun(const uint8_t *lun, int len) +{ +	uint64_t res = NO_SUCH_LUN; +	int addressing_method; + +	if (unlikely(len < 2)) { +		printk(KERN_ERR "Illegal LUN length %d, expected 2 bytes or " +		       "more", len); +		goto out; +	} + +	switch (len) { +	case 8: +		if ((*((__be64 *)lun) & +		     __constant_cpu_to_be64(0x0000FFFFFFFFFFFFLL)) != 0) +			goto out_err; +		break; +	case 4: +		if (*((__be16 *)&lun[2]) != 0) +			goto out_err; +		break; +	case 6: +		if (*((__be32 *)&lun[2]) != 0) +			goto out_err; +		break; +	case 2: +		break; +	default: +		goto out_err; +	} + +	addressing_method = (*lun) >> 6; /* highest two bits of byte 0 */ +	switch (addressing_method) { +	case SCSI_LUN_ADDR_METHOD_PERIPHERAL: +	case SCSI_LUN_ADDR_METHOD_FLAT: +	case SCSI_LUN_ADDR_METHOD_LUN: +		res = *(lun + 1) | (((*lun) & 0x3f) << 8); +		break; + +	case SCSI_LUN_ADDR_METHOD_EXTENDED_LUN: +	default: +		printk(KERN_ERR "Unimplemented LUN addressing method %u", +		       addressing_method); +		break; +	} + +out: +	return res; + +out_err: +	printk(KERN_ERR "Support for multi-level LUNs has not yet been" +	       " implemented"); +	goto out; +} + +static int srpt_check_stop_free(struct se_cmd *cmd) +{ +	struct srpt_send_ioctx *ioctx = container_of(cmd, +				struct srpt_send_ioctx, cmd); + +	return target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); +} + +/** + * srpt_handle_cmd() - Process SRP_CMD. + */ +static int srpt_handle_cmd(struct srpt_rdma_ch *ch, +			   struct srpt_recv_ioctx *recv_ioctx, +			   struct srpt_send_ioctx *send_ioctx) +{ +	struct se_cmd *cmd; +	struct srp_cmd *srp_cmd; +	uint64_t unpacked_lun; +	u64 data_len; +	enum dma_data_direction dir; +	sense_reason_t ret; +	int rc; + +	BUG_ON(!send_ioctx); + +	srp_cmd = recv_ioctx->ioctx.buf; +	cmd = &send_ioctx->cmd; +	send_ioctx->tag = srp_cmd->tag; + +	switch (srp_cmd->task_attr) { +	case SRP_CMD_SIMPLE_Q: +		cmd->sam_task_attr = MSG_SIMPLE_TAG; +		break; +	case SRP_CMD_ORDERED_Q: +	default: +		cmd->sam_task_attr = MSG_ORDERED_TAG; +		break; +	case SRP_CMD_HEAD_OF_Q: +		cmd->sam_task_attr = MSG_HEAD_TAG; +		break; +	case SRP_CMD_ACA: +		cmd->sam_task_attr = MSG_ACA_TAG; +		break; +	} + +	if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) { +		printk(KERN_ERR "0x%llx: parsing SRP descriptor table failed.\n", +		       srp_cmd->tag); +		ret = TCM_INVALID_CDB_FIELD; +		goto send_sense; +	} + +	unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_cmd->lun, +				       sizeof(srp_cmd->lun)); +	rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb, +			&send_ioctx->sense_data[0], unpacked_lun, data_len, +			MSG_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF); +	if (rc != 0) { +		ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; +		goto send_sense; +	} +	return 0; + +send_sense: +	transport_send_check_condition_and_sense(cmd, ret, 0); +	return -1; +} + +/** + * srpt_rx_mgmt_fn_tag() - Process a task management function by tag. + * @ch: RDMA channel of the task management request. + * @fn: Task management function to perform. + * @req_tag: Tag of the SRP task management request. + * @mgmt_ioctx: I/O context of the task management request. + * + * Returns zero if the target core will process the task management + * request asynchronously. 
+ * + * Note: It is assumed that the initiator serializes tag-based task management + * requests. + */ +static int srpt_rx_mgmt_fn_tag(struct srpt_send_ioctx *ioctx, u64 tag) +{ +	struct srpt_device *sdev; +	struct srpt_rdma_ch *ch; +	struct srpt_send_ioctx *target; +	int ret, i; + +	ret = -EINVAL; +	ch = ioctx->ch; +	BUG_ON(!ch); +	BUG_ON(!ch->sport); +	sdev = ch->sport->sdev; +	BUG_ON(!sdev); +	spin_lock_irq(&sdev->spinlock); +	for (i = 0; i < ch->rq_size; ++i) { +		target = ch->ioctx_ring[i]; +		if (target->cmd.se_lun == ioctx->cmd.se_lun && +		    target->tag == tag && +		    srpt_get_cmd_state(target) != SRPT_STATE_DONE) { +			ret = 0; +			/* now let the target core abort &target->cmd; */ +			break; +		} +	} +	spin_unlock_irq(&sdev->spinlock); +	return ret; +} + +static int srp_tmr_to_tcm(int fn) +{ +	switch (fn) { +	case SRP_TSK_ABORT_TASK: +		return TMR_ABORT_TASK; +	case SRP_TSK_ABORT_TASK_SET: +		return TMR_ABORT_TASK_SET; +	case SRP_TSK_CLEAR_TASK_SET: +		return TMR_CLEAR_TASK_SET; +	case SRP_TSK_LUN_RESET: +		return TMR_LUN_RESET; +	case SRP_TSK_CLEAR_ACA: +		return TMR_CLEAR_ACA; +	default: +		return -1; +	} +} + +/** + * srpt_handle_tsk_mgmt() - Process an SRP_TSK_MGMT information unit. + * + * Returns 0 if and only if the request will be processed by the target core. + * + * For more information about SRP_TSK_MGMT information units, see also section + * 6.7 in the SRP r16a document. + */ +static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, +				 struct srpt_recv_ioctx *recv_ioctx, +				 struct srpt_send_ioctx *send_ioctx) +{ +	struct srp_tsk_mgmt *srp_tsk; +	struct se_cmd *cmd; +	struct se_session *sess = ch->sess; +	uint64_t unpacked_lun; +	uint32_t tag = 0; +	int tcm_tmr; +	int rc; + +	BUG_ON(!send_ioctx); + +	srp_tsk = recv_ioctx->ioctx.buf; +	cmd = &send_ioctx->cmd; + +	pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld" +		 " cm_id %p sess %p\n", srp_tsk->tsk_mgmt_func, +		 srp_tsk->task_tag, srp_tsk->tag, ch->cm_id, ch->sess); + +	srpt_set_cmd_state(send_ioctx, SRPT_STATE_MGMT); +	send_ioctx->tag = srp_tsk->tag; +	tcm_tmr = srp_tmr_to_tcm(srp_tsk->tsk_mgmt_func); +	if (tcm_tmr < 0) { +		send_ioctx->cmd.se_tmr_req->response = +			TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED; +		goto fail; +	} +	unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_tsk->lun, +				       sizeof(srp_tsk->lun)); + +	if (srp_tsk->tsk_mgmt_func == SRP_TSK_ABORT_TASK) { +		rc = srpt_rx_mgmt_fn_tag(send_ioctx, srp_tsk->task_tag); +		if (rc < 0) { +			send_ioctx->cmd.se_tmr_req->response = +					TMR_TASK_DOES_NOT_EXIST; +			goto fail; +		} +		tag = srp_tsk->task_tag; +	} +	rc = target_submit_tmr(&send_ioctx->cmd, sess, NULL, unpacked_lun, +				srp_tsk, tcm_tmr, GFP_KERNEL, tag, +				TARGET_SCF_ACK_KREF); +	if (rc != 0) { +		send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED; +		goto fail; +	} +	return; +fail: +	transport_send_check_condition_and_sense(cmd, 0, 0); // XXX: +} + +/** + * srpt_handle_new_iu() - Process a newly received information unit. + * @ch:    RDMA channel through which the information unit has been received. + * @ioctx: SRPT I/O context associated with the information unit. 
+ */ +static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, +			       struct srpt_recv_ioctx *recv_ioctx, +			       struct srpt_send_ioctx *send_ioctx) +{ +	struct srp_cmd *srp_cmd; +	enum rdma_ch_state ch_state; + +	BUG_ON(!ch); +	BUG_ON(!recv_ioctx); + +	ib_dma_sync_single_for_cpu(ch->sport->sdev->device, +				   recv_ioctx->ioctx.dma, srp_max_req_size, +				   DMA_FROM_DEVICE); + +	ch_state = srpt_get_ch_state(ch); +	if (unlikely(ch_state == CH_CONNECTING)) { +		list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); +		goto out; +	} + +	if (unlikely(ch_state != CH_LIVE)) +		goto out; + +	srp_cmd = recv_ioctx->ioctx.buf; +	if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) { +		if (!send_ioctx) +			send_ioctx = srpt_get_send_ioctx(ch); +		if (unlikely(!send_ioctx)) { +			list_add_tail(&recv_ioctx->wait_list, +				      &ch->cmd_wait_list); +			goto out; +		} +	} + +	switch (srp_cmd->opcode) { +	case SRP_CMD: +		srpt_handle_cmd(ch, recv_ioctx, send_ioctx); +		break; +	case SRP_TSK_MGMT: +		srpt_handle_tsk_mgmt(ch, recv_ioctx, send_ioctx); +		break; +	case SRP_I_LOGOUT: +		printk(KERN_ERR "Not yet implemented: SRP_I_LOGOUT\n"); +		break; +	case SRP_CRED_RSP: +		pr_debug("received SRP_CRED_RSP\n"); +		break; +	case SRP_AER_RSP: +		pr_debug("received SRP_AER_RSP\n"); +		break; +	case SRP_RSP: +		printk(KERN_ERR "Received SRP_RSP\n"); +		break; +	default: +		printk(KERN_ERR "received IU with unknown opcode 0x%x\n", +		       srp_cmd->opcode); +		break; +	} + +	srpt_post_recv(ch->sport->sdev, recv_ioctx); +out: +	return; +} + +static void srpt_process_rcv_completion(struct ib_cq *cq, +					struct srpt_rdma_ch *ch, +					struct ib_wc *wc) +{ +	struct srpt_device *sdev = ch->sport->sdev; +	struct srpt_recv_ioctx *ioctx; +	u32 index; + +	index = idx_from_wr_id(wc->wr_id); +	if (wc->status == IB_WC_SUCCESS) { +		int req_lim; + +		req_lim = atomic_dec_return(&ch->req_lim); +		if (unlikely(req_lim < 0)) +			printk(KERN_ERR "req_lim = %d < 0\n", req_lim); +		ioctx = sdev->ioctx_ring[index]; +		srpt_handle_new_iu(ch, ioctx, NULL); +	} else { +		printk(KERN_INFO "receiving failed for idx %u with status %d\n", +		       index, wc->status); +	} +} + +/** + * srpt_process_send_completion() - Process an IB send completion. + * + * Note: Although this has not yet been observed during tests, at least in + * theory it is possible that the srpt_get_send_ioctx() call invoked by + * srpt_handle_new_iu() fails. This is possible because the req_lim_delta + * value in each response is set to one, and it is possible that this response + * makes the initiator send a new request before the send completion for that + * response has been processed. This could e.g. happen if the call to + * srpt_put_send_iotcx() is delayed because of a higher priority interrupt or + * if IB retransmission causes generation of the send completion to be + * delayed. Incoming information units for which srpt_get_send_ioctx() fails + * are queued on cmd_wait_list. The code below processes these delayed + * requests one at a time. 
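
Both completion paths demultiplex work requests through the 64-bit wr_id, which packs an srpt_opcode in the upper 32 bits and the ioctx ring index in the lower 32 bits (see encode_wr_id(), opcode_from_wr_id() and idx_from_wr_id() in ib_srpt.h below). A stand-alone sketch of that round trip, with made-up enum values:

#include <stdint.h>
#include <stdio.h>

enum example_opcode { EX_RECV, EX_SEND, EX_RDMA_READ_LAST };	/* illustrative only */

static uint64_t encode(uint8_t opcode, uint32_t idx)
{
	return ((uint64_t)opcode << 32) | idx;
}

int main(void)
{
	uint64_t wr_id = encode(EX_SEND, 17);

	printf("opcode = %u, ioctx index = %u\n",
	       (unsigned int)(wr_id >> 32), (unsigned int)(uint32_t)wr_id);
	return 0;
}
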
+ */ +static void srpt_process_send_completion(struct ib_cq *cq, +					 struct srpt_rdma_ch *ch, +					 struct ib_wc *wc) +{ +	struct srpt_send_ioctx *send_ioctx; +	uint32_t index; +	enum srpt_opcode opcode; + +	index = idx_from_wr_id(wc->wr_id); +	opcode = opcode_from_wr_id(wc->wr_id); +	send_ioctx = ch->ioctx_ring[index]; +	if (wc->status == IB_WC_SUCCESS) { +		if (opcode == SRPT_SEND) +			srpt_handle_send_comp(ch, send_ioctx); +		else { +			WARN_ON(opcode != SRPT_RDMA_ABORT && +				wc->opcode != IB_WC_RDMA_READ); +			srpt_handle_rdma_comp(ch, send_ioctx, opcode); +		} +	} else { +		if (opcode == SRPT_SEND) { +			printk(KERN_INFO "sending response for idx %u failed" +			       " with status %d\n", index, wc->status); +			srpt_handle_send_err_comp(ch, wc->wr_id); +		} else if (opcode != SRPT_RDMA_MID) { +			printk(KERN_INFO "RDMA t %d for idx %u failed with" +				" status %d", opcode, index, wc->status); +			srpt_handle_rdma_err_comp(ch, send_ioctx, opcode); +		} +	} + +	while (unlikely(opcode == SRPT_SEND +			&& !list_empty(&ch->cmd_wait_list) +			&& srpt_get_ch_state(ch) == CH_LIVE +			&& (send_ioctx = srpt_get_send_ioctx(ch)) != NULL)) { +		struct srpt_recv_ioctx *recv_ioctx; + +		recv_ioctx = list_first_entry(&ch->cmd_wait_list, +					      struct srpt_recv_ioctx, +					      wait_list); +		list_del(&recv_ioctx->wait_list); +		srpt_handle_new_iu(ch, recv_ioctx, send_ioctx); +	} +} + +static void srpt_process_completion(struct ib_cq *cq, struct srpt_rdma_ch *ch) +{ +	struct ib_wc *const wc = ch->wc; +	int i, n; + +	WARN_ON(cq != ch->cq); + +	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +	while ((n = ib_poll_cq(cq, ARRAY_SIZE(ch->wc), wc)) > 0) { +		for (i = 0; i < n; i++) { +			if (opcode_from_wr_id(wc[i].wr_id) == SRPT_RECV) +				srpt_process_rcv_completion(cq, ch, &wc[i]); +			else +				srpt_process_send_completion(cq, ch, &wc[i]); +		} +	} +} + +/** + * srpt_completion() - IB completion queue callback function. + * + * Notes: + * - It is guaranteed that a completion handler will never be invoked + *   concurrently on two different CPUs for the same completion queue. See also + *   Documentation/infiniband/core_locking.txt and the implementation of + *   handle_edge_irq() in kernel/irq/chip.c. + * - When threaded IRQs are enabled, completion handlers are invoked in thread + *   context instead of interrupt context. + */ +static void srpt_completion(struct ib_cq *cq, void *ctx) +{ +	struct srpt_rdma_ch *ch = ctx; + +	wake_up_interruptible(&ch->wait_queue); +} + +static int srpt_compl_thread(void *arg) +{ +	struct srpt_rdma_ch *ch; + +	/* Hibernation / freezing of the SRPT kernel thread is not supported. */ +	current->flags |= PF_NOFREEZE; + +	ch = arg; +	BUG_ON(!ch); +	printk(KERN_INFO "Session %s: kernel thread %s (PID %d) started\n", +	       ch->sess_name, ch->thread->comm, current->pid); +	while (!kthread_should_stop()) { +		wait_event_interruptible(ch->wait_queue, +			(srpt_process_completion(ch->cq, ch), +			 kthread_should_stop())); +	} +	printk(KERN_INFO "Session %s: kernel thread %s (PID %d) stopped\n", +	       ch->sess_name, ch->thread->comm, current->pid); +	return 0; +} + +/** + * srpt_create_ch_ib() - Create receive and send completion queues. 
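
The wait loop in srpt_compl_thread() above leans on the C comma operator: srpt_process_completion() runs for its side effect on every wakeup, while only the value of kthread_should_stop() decides whether the wait finishes. A tiny stand-alone illustration of that evaluation order, with drain_queue() and should_stop() as made-up stand-ins:

#include <stdbool.h>
#include <stdio.h>

static int polls;

static void drain_queue(void) { polls++; }		/* stand-in for CQ polling */
static bool should_stop(void) { return polls >= 3; }	/* stand-in for kthread_should_stop() */

int main(void)
{
	/* same shape as the wait_event_interruptible() condition above */
	while (!(drain_queue(), should_stop()))
		;
	printf("drained the queue %d times before stopping\n", polls);
	return 0;
}
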
+ */ +static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) +{ +	struct ib_qp_init_attr *qp_init; +	struct srpt_port *sport = ch->sport; +	struct srpt_device *sdev = sport->sdev; +	u32 srp_sq_size = sport->port_attrib.srp_sq_size; +	int ret; + +	WARN_ON(ch->rq_size < 1); + +	ret = -ENOMEM; +	qp_init = kzalloc(sizeof *qp_init, GFP_KERNEL); +	if (!qp_init) +		goto out; + +	ch->cq = ib_create_cq(sdev->device, srpt_completion, NULL, ch, +			      ch->rq_size + srp_sq_size, 0); +	if (IS_ERR(ch->cq)) { +		ret = PTR_ERR(ch->cq); +		printk(KERN_ERR "failed to create CQ cqe= %d ret= %d\n", +		       ch->rq_size + srp_sq_size, ret); +		goto out; +	} + +	qp_init->qp_context = (void *)ch; +	qp_init->event_handler +		= (void(*)(struct ib_event *, void*))srpt_qp_event; +	qp_init->send_cq = ch->cq; +	qp_init->recv_cq = ch->cq; +	qp_init->srq = sdev->srq; +	qp_init->sq_sig_type = IB_SIGNAL_REQ_WR; +	qp_init->qp_type = IB_QPT_RC; +	qp_init->cap.max_send_wr = srp_sq_size; +	qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE; + +	ch->qp = ib_create_qp(sdev->pd, qp_init); +	if (IS_ERR(ch->qp)) { +		ret = PTR_ERR(ch->qp); +		printk(KERN_ERR "failed to create_qp ret= %d\n", ret); +		goto err_destroy_cq; +	} + +	atomic_set(&ch->sq_wr_avail, qp_init->cap.max_send_wr); + +	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", +		 __func__, ch->cq->cqe, qp_init->cap.max_send_sge, +		 qp_init->cap.max_send_wr, ch->cm_id); + +	ret = srpt_init_ch_qp(ch, ch->qp); +	if (ret) +		goto err_destroy_qp; + +	init_waitqueue_head(&ch->wait_queue); + +	pr_debug("creating thread for session %s\n", ch->sess_name); + +	ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl"); +	if (IS_ERR(ch->thread)) { +		printk(KERN_ERR "failed to create kernel thread %ld\n", +		       PTR_ERR(ch->thread)); +		ch->thread = NULL; +		goto err_destroy_qp; +	} + +out: +	kfree(qp_init); +	return ret; + +err_destroy_qp: +	ib_destroy_qp(ch->qp); +err_destroy_cq: +	ib_destroy_cq(ch->cq); +	goto out; +} + +static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch) +{ +	if (ch->thread) +		kthread_stop(ch->thread); + +	ib_destroy_qp(ch->qp); +	ib_destroy_cq(ch->cq); +} + +/** + * __srpt_close_ch() - Close an RDMA channel by setting the QP error state. + * + * Reset the QP and make sure all resources associated with the channel will + * be deallocated at an appropriate time. + * + * Note: The caller must hold ch->sport->sdev->spinlock. + */ +static void __srpt_close_ch(struct srpt_rdma_ch *ch) +{ +	struct srpt_device *sdev; +	enum rdma_ch_state prev_state; +	unsigned long flags; + +	sdev = ch->sport->sdev; + +	spin_lock_irqsave(&ch->spinlock, flags); +	prev_state = ch->state; +	switch (prev_state) { +	case CH_CONNECTING: +	case CH_LIVE: +		ch->state = CH_DISCONNECTING; +		break; +	default: +		break; +	} +	spin_unlock_irqrestore(&ch->spinlock, flags); + +	switch (prev_state) { +	case CH_CONNECTING: +		ib_send_cm_rej(ch->cm_id, IB_CM_REJ_NO_RESOURCES, NULL, 0, +			       NULL, 0); +		/* fall through */ +	case CH_LIVE: +		if (ib_send_cm_dreq(ch->cm_id, NULL, 0) < 0) +			printk(KERN_ERR "sending CM DREQ failed.\n"); +		break; +	case CH_DISCONNECTING: +		break; +	case CH_DRAINING: +	case CH_RELEASING: +		break; +	} +} + +/** + * srpt_close_ch() - Close an RDMA channel. 
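
__srpt_close_ch() above walks the channel through the CONNECTING / LIVE -> DISCONNECTING -> DRAINING -> RELEASING progression, and srpt_drain_channel() further down does the same through the srpt_test_and_set_ch_state() helper. A minimal sketch of that compare-and-set idiom (locking omitted; the in-kernel helper changes the state under ch->spinlock):

#include <stdbool.h>
#include <stdio.h>

enum ch_state { ST_CONNECTING, ST_LIVE, ST_DISCONNECTING, ST_DRAINING, ST_RELEASING };

struct channel { enum ch_state state; };

/* move to new_state only if the channel is currently in old_state */
static bool test_and_set_state(struct channel *ch, enum ch_state old_state,
			       enum ch_state new_state)
{
	if (ch->state != old_state)
		return false;
	ch->state = new_state;
	return true;
}

int main(void)
{
	struct channel ch = { .state = ST_LIVE };

	if (test_and_set_state(&ch, ST_LIVE, ST_DISCONNECTING))
		printf("was live, now disconnecting (%d)\n", ch.state);
	if (!test_and_set_state(&ch, ST_CONNECTING, ST_DRAINING))
		printf("not connecting, state left unchanged (%d)\n", ch.state);
	return 0;
}
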
+ */ +static void srpt_close_ch(struct srpt_rdma_ch *ch) +{ +	struct srpt_device *sdev; + +	sdev = ch->sport->sdev; +	spin_lock_irq(&sdev->spinlock); +	__srpt_close_ch(ch); +	spin_unlock_irq(&sdev->spinlock); +} + +/** + * srpt_shutdown_session() - Whether or not a session may be shut down. + */ +static int srpt_shutdown_session(struct se_session *se_sess) +{ +	struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; +	unsigned long flags; + +	spin_lock_irqsave(&ch->spinlock, flags); +	if (ch->in_shutdown) { +		spin_unlock_irqrestore(&ch->spinlock, flags); +		return true; +	} + +	ch->in_shutdown = true; +	target_sess_cmd_list_set_waiting(se_sess); +	spin_unlock_irqrestore(&ch->spinlock, flags); + +	return true; +} + +/** + * srpt_drain_channel() - Drain a channel by resetting the IB queue pair. + * @cm_id: Pointer to the CM ID of the channel to be drained. + * + * Note: Must be called from inside srpt_cm_handler to avoid a race between + * accessing sdev->spinlock and the call to kfree(sdev) in srpt_remove_one() + * (the caller of srpt_cm_handler holds the cm_id spinlock; srpt_remove_one() + * waits until all target sessions for the associated IB device have been + * unregistered and target session registration involves a call to + * ib_destroy_cm_id(), which locks the cm_id spinlock and hence waits until + * this function has finished). + */ +static void srpt_drain_channel(struct ib_cm_id *cm_id) +{ +	struct srpt_device *sdev; +	struct srpt_rdma_ch *ch; +	int ret; +	bool do_reset = false; + +	WARN_ON_ONCE(irqs_disabled()); + +	sdev = cm_id->context; +	BUG_ON(!sdev); +	spin_lock_irq(&sdev->spinlock); +	list_for_each_entry(ch, &sdev->rch_list, list) { +		if (ch->cm_id == cm_id) { +			do_reset = srpt_test_and_set_ch_state(ch, +					CH_CONNECTING, CH_DRAINING) || +				   srpt_test_and_set_ch_state(ch, +					CH_LIVE, CH_DRAINING) || +				   srpt_test_and_set_ch_state(ch, +					CH_DISCONNECTING, CH_DRAINING); +			break; +		} +	} +	spin_unlock_irq(&sdev->spinlock); + +	if (do_reset) { +		if (ch->sess) +			srpt_shutdown_session(ch->sess); + +		ret = srpt_ch_qp_err(ch); +		if (ret < 0) +			printk(KERN_ERR "Setting queue pair in error state" +			       " failed: %d\n", ret); +	} +} + +/** + * srpt_find_channel() - Look up an RDMA channel. + * @cm_id: Pointer to the CM ID of the channel to be looked up. + * + * Return NULL if no matching RDMA channel has been found. + */ +static struct srpt_rdma_ch *srpt_find_channel(struct srpt_device *sdev, +					      struct ib_cm_id *cm_id) +{ +	struct srpt_rdma_ch *ch; +	bool found; + +	WARN_ON_ONCE(irqs_disabled()); +	BUG_ON(!sdev); + +	found = false; +	spin_lock_irq(&sdev->spinlock); +	list_for_each_entry(ch, &sdev->rch_list, list) { +		if (ch->cm_id == cm_id) { +			found = true; +			break; +		} +	} +	spin_unlock_irq(&sdev->spinlock); + +	return found ? ch : NULL; +} + +/** + * srpt_release_channel() - Release channel resources. + * + * Schedules the actual release because: + * - Calling the ib_destroy_cm_id() call from inside an IB CM callback would + *   trigger a deadlock. + * - It is not safe to call TCM transport_* functions from interrupt context. 
+ */
+static void srpt_release_channel(struct srpt_rdma_ch *ch)
+{
+	schedule_work(&ch->release_work);
+}
+
+static void srpt_release_channel_work(struct work_struct *w)
+{
+	struct srpt_rdma_ch *ch;
+	struct srpt_device *sdev;
+	struct se_session *se_sess;
+
+	ch = container_of(w, struct srpt_rdma_ch, release_work);
+	pr_debug("ch = %p; ch->sess = %p; release_done = %p\n", ch, ch->sess,
+		 ch->release_done);
+
+	sdev = ch->sport->sdev;
+	BUG_ON(!sdev);
+
+	se_sess = ch->sess;
+	BUG_ON(!se_sess);
+
+	target_wait_for_sess_cmds(se_sess);
+
+	transport_deregister_session_configfs(se_sess);
+	transport_deregister_session(se_sess);
+	ch->sess = NULL;
+
+	ib_destroy_cm_id(ch->cm_id);
+
+	srpt_destroy_ch_ib(ch);
+
+	srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring,
+			     ch->sport->sdev, ch->rq_size,
+			     ch->rsp_size, DMA_TO_DEVICE);
+
+	spin_lock_irq(&sdev->spinlock);
+	list_del(&ch->list);
+	spin_unlock_irq(&sdev->spinlock);
+
+	if (ch->release_done)
+		complete(ch->release_done);
+
+	wake_up(&sdev->ch_releaseQ);
+
+	kfree(ch);
+}
+
+static struct srpt_node_acl *__srpt_lookup_acl(struct srpt_port *sport,
+					       u8 i_port_id[16])
+{
+	struct srpt_node_acl *nacl;
+
+	list_for_each_entry(nacl, &sport->port_acl_list, list)
+		if (memcmp(nacl->i_port_id, i_port_id,
+			   sizeof(nacl->i_port_id)) == 0)
+			return nacl;
+
+	return NULL;
+}
+
+static struct srpt_node_acl *srpt_lookup_acl(struct srpt_port *sport,
+					     u8 i_port_id[16])
+{
+	struct srpt_node_acl *nacl;
+
+	spin_lock_irq(&sport->port_acl_lock);
+	nacl = __srpt_lookup_acl(sport, i_port_id);
+	spin_unlock_irq(&sport->port_acl_lock);
+
+	return nacl;
+}
+
+/**
+ * srpt_cm_req_recv() - Process the event IB_CM_REQ_RECEIVED.
+ *
+ * Ownership of the cm_id is transferred to the target session if this
+ * function returns zero. Otherwise the caller remains the owner of cm_id.
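
srpt_cm_req_recv() below turns the 16-byte initiator port ID from the SRP_LOGIN_REQ into the session name "0x<high 64 bits><low 64 bits>", the same textual form an administrator uses for the matching ACL directory. A user-space sketch of that formatting (the sample port ID is made up):

#include <stdint.h>
#include <stdio.h>

static uint64_t load_be64(const uint8_t *p)
{
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8; i++)
		v = (v << 8) | p[i];
	return v;
}

int main(void)
{
	const uint8_t i_port_id[16] = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	};
	char sess_name[2 + 32 + 1];

	snprintf(sess_name, sizeof(sess_name), "0x%016llx%016llx",
		 (unsigned long long)load_be64(i_port_id),
		 (unsigned long long)load_be64(i_port_id + 8));
	printf("%s\n", sess_name);	/* 0x000102030405060708090a0b0c0d0e0f */
	return 0;
}
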
+ */ +static int srpt_cm_req_recv(struct ib_cm_id *cm_id, +			    struct ib_cm_req_event_param *param, +			    void *private_data) +{ +	struct srpt_device *sdev = cm_id->context; +	struct srpt_port *sport = &sdev->port[param->port - 1]; +	struct srp_login_req *req; +	struct srp_login_rsp *rsp; +	struct srp_login_rej *rej; +	struct ib_cm_rep_param *rep_param; +	struct srpt_rdma_ch *ch, *tmp_ch; +	struct srpt_node_acl *nacl; +	u32 it_iu_len; +	int i; +	int ret = 0; + +	WARN_ON_ONCE(irqs_disabled()); + +	if (WARN_ON(!sdev || !private_data)) +		return -EINVAL; + +	req = (struct srp_login_req *)private_data; + +	it_iu_len = be32_to_cpu(req->req_it_iu_len); + +	printk(KERN_INFO "Received SRP_LOGIN_REQ with i_port_id 0x%llx:0x%llx," +	       " t_port_id 0x%llx:0x%llx and it_iu_len %d on port %d" +	       " (guid=0x%llx:0x%llx)\n", +	       be64_to_cpu(*(__be64 *)&req->initiator_port_id[0]), +	       be64_to_cpu(*(__be64 *)&req->initiator_port_id[8]), +	       be64_to_cpu(*(__be64 *)&req->target_port_id[0]), +	       be64_to_cpu(*(__be64 *)&req->target_port_id[8]), +	       it_iu_len, +	       param->port, +	       be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]), +	       be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8])); + +	rsp = kzalloc(sizeof *rsp, GFP_KERNEL); +	rej = kzalloc(sizeof *rej, GFP_KERNEL); +	rep_param = kzalloc(sizeof *rep_param, GFP_KERNEL); + +	if (!rsp || !rej || !rep_param) { +		ret = -ENOMEM; +		goto out; +	} + +	if (it_iu_len > srp_max_req_size || it_iu_len < 64) { +		rej->reason = __constant_cpu_to_be32( +				SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); +		ret = -EINVAL; +		printk(KERN_ERR "rejected SRP_LOGIN_REQ because its" +		       " length (%d bytes) is out of range (%d .. %d)\n", +		       it_iu_len, 64, srp_max_req_size); +		goto reject; +	} + +	if (!sport->enabled) { +		rej->reason = __constant_cpu_to_be32( +			     SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); +		ret = -EINVAL; +		printk(KERN_ERR "rejected SRP_LOGIN_REQ because the target port" +		       " has not yet been enabled\n"); +		goto reject; +	} + +	if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { +		rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; + +		spin_lock_irq(&sdev->spinlock); + +		list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) { +			if (!memcmp(ch->i_port_id, req->initiator_port_id, 16) +			    && !memcmp(ch->t_port_id, req->target_port_id, 16) +			    && param->port == ch->sport->port +			    && param->listen_id == ch->sport->sdev->cm_id +			    && ch->cm_id) { +				enum rdma_ch_state ch_state; + +				ch_state = srpt_get_ch_state(ch); +				if (ch_state != CH_CONNECTING +				    && ch_state != CH_LIVE) +					continue; + +				/* found an existing channel */ +				pr_debug("Found existing channel %s" +					 " cm_id= %p state= %d\n", +					 ch->sess_name, ch->cm_id, ch_state); + +				__srpt_close_ch(ch); + +				rsp->rsp_flags = +					SRP_LOGIN_RSP_MULTICHAN_TERMINATED; +			} +		} + +		spin_unlock_irq(&sdev->spinlock); + +	} else +		rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; + +	if (*(__be64 *)req->target_port_id != cpu_to_be64(srpt_service_guid) +	    || *(__be64 *)(req->target_port_id + 8) != +	       cpu_to_be64(srpt_service_guid)) { +		rej->reason = __constant_cpu_to_be32( +				SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); +		ret = -ENOMEM; +		printk(KERN_ERR "rejected SRP_LOGIN_REQ because it" +		       " has an invalid target port identifier.\n"); +		goto reject; +	} + +	ch = kzalloc(sizeof *ch, GFP_KERNEL); +	if (!ch) { +		rej->reason = 
__constant_cpu_to_be32( +					SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); +		printk(KERN_ERR "rejected SRP_LOGIN_REQ because no memory.\n"); +		ret = -ENOMEM; +		goto reject; +	} + +	INIT_WORK(&ch->release_work, srpt_release_channel_work); +	memcpy(ch->i_port_id, req->initiator_port_id, 16); +	memcpy(ch->t_port_id, req->target_port_id, 16); +	ch->sport = &sdev->port[param->port - 1]; +	ch->cm_id = cm_id; +	/* +	 * Avoid QUEUE_FULL conditions by limiting the number of buffers used +	 * for the SRP protocol to the command queue size. +	 */ +	ch->rq_size = SRPT_RQ_SIZE; +	spin_lock_init(&ch->spinlock); +	ch->state = CH_CONNECTING; +	INIT_LIST_HEAD(&ch->cmd_wait_list); +	ch->rsp_size = ch->sport->port_attrib.srp_max_rsp_size; + +	ch->ioctx_ring = (struct srpt_send_ioctx **) +		srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size, +				      sizeof(*ch->ioctx_ring[0]), +				      ch->rsp_size, DMA_TO_DEVICE); +	if (!ch->ioctx_ring) +		goto free_ch; + +	INIT_LIST_HEAD(&ch->free_list); +	for (i = 0; i < ch->rq_size; i++) { +		ch->ioctx_ring[i]->ch = ch; +		list_add_tail(&ch->ioctx_ring[i]->free_list, &ch->free_list); +	} + +	ret = srpt_create_ch_ib(ch); +	if (ret) { +		rej->reason = __constant_cpu_to_be32( +				SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); +		printk(KERN_ERR "rejected SRP_LOGIN_REQ because creating" +		       " a new RDMA channel failed.\n"); +		goto free_ring; +	} + +	ret = srpt_ch_qp_rtr(ch, ch->qp); +	if (ret) { +		rej->reason = __constant_cpu_to_be32( +				SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); +		printk(KERN_ERR "rejected SRP_LOGIN_REQ because enabling" +		       " RTR failed (error code = %d)\n", ret); +		goto destroy_ib; +	} +	/* +	 * Use the initator port identifier as the session name. +	 */ +	snprintf(ch->sess_name, sizeof(ch->sess_name), "0x%016llx%016llx", +			be64_to_cpu(*(__be64 *)ch->i_port_id), +			be64_to_cpu(*(__be64 *)(ch->i_port_id + 8))); + +	pr_debug("registering session %s\n", ch->sess_name); + +	nacl = srpt_lookup_acl(sport, ch->i_port_id); +	if (!nacl) { +		printk(KERN_INFO "Rejected login because no ACL has been" +		       " configured yet for initiator %s.\n", ch->sess_name); +		rej->reason = __constant_cpu_to_be32( +				SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED); +		goto destroy_ib; +	} + +	ch->sess = transport_init_session(TARGET_PROT_NORMAL); +	if (IS_ERR(ch->sess)) { +		rej->reason = __constant_cpu_to_be32( +				SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); +		pr_debug("Failed to create session\n"); +		goto deregister_session; +	} +	ch->sess->se_node_acl = &nacl->nacl; +	transport_register_session(&sport->port_tpg_1, &nacl->nacl, ch->sess, ch); + +	pr_debug("Establish connection sess=%p name=%s cm_id=%p\n", ch->sess, +		 ch->sess_name, ch->cm_id); + +	/* create srp_login_response */ +	rsp->opcode = SRP_LOGIN_RSP; +	rsp->tag = req->tag; +	rsp->max_it_iu_len = req->req_it_iu_len; +	rsp->max_ti_iu_len = req->req_it_iu_len; +	ch->max_ti_iu_len = it_iu_len; +	rsp->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT +					      | SRP_BUF_FORMAT_INDIRECT); +	rsp->req_lim_delta = cpu_to_be32(ch->rq_size); +	atomic_set(&ch->req_lim, ch->rq_size); +	atomic_set(&ch->req_lim_delta, 0); + +	/* create cm reply */ +	rep_param->qp_num = ch->qp->qp_num; +	rep_param->private_data = (void *)rsp; +	rep_param->private_data_len = sizeof *rsp; +	rep_param->rnr_retry_count = 7; +	rep_param->flow_control = 1; +	rep_param->failover_accepted = 0; +	rep_param->srq = 1; +	rep_param->responder_resources = 4; +	rep_param->initiator_depth = 4; + +	ret = ib_send_cm_rep(cm_id, rep_param); +	if (ret) { +		
printk(KERN_ERR "sending SRP_LOGIN_REQ response failed" +		       " (error code = %d)\n", ret); +		goto release_channel; +	} + +	spin_lock_irq(&sdev->spinlock); +	list_add_tail(&ch->list, &sdev->rch_list); +	spin_unlock_irq(&sdev->spinlock); + +	goto out; + +release_channel: +	srpt_set_ch_state(ch, CH_RELEASING); +	transport_deregister_session_configfs(ch->sess); + +deregister_session: +	transport_deregister_session(ch->sess); +	ch->sess = NULL; + +destroy_ib: +	srpt_destroy_ch_ib(ch); + +free_ring: +	srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, +			     ch->sport->sdev, ch->rq_size, +			     ch->rsp_size, DMA_TO_DEVICE); +free_ch: +	kfree(ch); + +reject: +	rej->opcode = SRP_LOGIN_REJ; +	rej->tag = req->tag; +	rej->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT +					      | SRP_BUF_FORMAT_INDIRECT); + +	ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, +			     (void *)rej, sizeof *rej); + +out: +	kfree(rep_param); +	kfree(rsp); +	kfree(rej); + +	return ret; +} + +static void srpt_cm_rej_recv(struct ib_cm_id *cm_id) +{ +	printk(KERN_INFO "Received IB REJ for cm_id %p.\n", cm_id); +	srpt_drain_channel(cm_id); +} + +/** + * srpt_cm_rtu_recv() - Process an IB_CM_RTU_RECEIVED or USER_ESTABLISHED event. + * + * An IB_CM_RTU_RECEIVED message indicates that the connection is established + * and that the recipient may begin transmitting (RTU = ready to use). + */ +static void srpt_cm_rtu_recv(struct ib_cm_id *cm_id) +{ +	struct srpt_rdma_ch *ch; +	int ret; + +	ch = srpt_find_channel(cm_id->context, cm_id); +	BUG_ON(!ch); + +	if (srpt_test_and_set_ch_state(ch, CH_CONNECTING, CH_LIVE)) { +		struct srpt_recv_ioctx *ioctx, *ioctx_tmp; + +		ret = srpt_ch_qp_rts(ch, ch->qp); + +		list_for_each_entry_safe(ioctx, ioctx_tmp, &ch->cmd_wait_list, +					 wait_list) { +			list_del(&ioctx->wait_list); +			srpt_handle_new_iu(ch, ioctx, NULL); +		} +		if (ret) +			srpt_close_ch(ch); +	} +} + +static void srpt_cm_timewait_exit(struct ib_cm_id *cm_id) +{ +	printk(KERN_INFO "Received IB TimeWait exit for cm_id %p.\n", cm_id); +	srpt_drain_channel(cm_id); +} + +static void srpt_cm_rep_error(struct ib_cm_id *cm_id) +{ +	printk(KERN_INFO "Received IB REP error for cm_id %p.\n", cm_id); +	srpt_drain_channel(cm_id); +} + +/** + * srpt_cm_dreq_recv() - Process reception of a DREQ message. + */ +static void srpt_cm_dreq_recv(struct ib_cm_id *cm_id) +{ +	struct srpt_rdma_ch *ch; +	unsigned long flags; +	bool send_drep = false; + +	ch = srpt_find_channel(cm_id->context, cm_id); +	BUG_ON(!ch); + +	pr_debug("cm_id= %p ch->state= %d\n", cm_id, srpt_get_ch_state(ch)); + +	spin_lock_irqsave(&ch->spinlock, flags); +	switch (ch->state) { +	case CH_CONNECTING: +	case CH_LIVE: +		send_drep = true; +		ch->state = CH_DISCONNECTING; +		break; +	case CH_DISCONNECTING: +	case CH_DRAINING: +	case CH_RELEASING: +		WARN(true, "unexpected channel state %d\n", ch->state); +		break; +	} +	spin_unlock_irqrestore(&ch->spinlock, flags); + +	if (send_drep) { +		if (ib_send_cm_drep(ch->cm_id, NULL, 0) < 0) +			printk(KERN_ERR "Sending IB DREP failed.\n"); +		printk(KERN_INFO "Received DREQ and sent DREP for session %s.\n", +		       ch->sess_name); +	} +} + +/** + * srpt_cm_drep_recv() - Process reception of a DREP message. + */ +static void srpt_cm_drep_recv(struct ib_cm_id *cm_id) +{ +	printk(KERN_INFO "Received InfiniBand DREP message for cm_id %p.\n", +	       cm_id); +	srpt_drain_channel(cm_id); +} + +/** + * srpt_cm_handler() - IB connection manager callback function. 
+ * + * A non-zero return value will cause the caller destroy the CM ID. + * + * Note: srpt_cm_handler() must only return a non-zero value when transferring + * ownership of the cm_id to a channel by srpt_cm_req_recv() failed. Returning + * a non-zero value in any other case will trigger a race with the + * ib_destroy_cm_id() call in srpt_release_channel(). + */ +static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ +	int ret; + +	ret = 0; +	switch (event->event) { +	case IB_CM_REQ_RECEIVED: +		ret = srpt_cm_req_recv(cm_id, &event->param.req_rcvd, +				       event->private_data); +		break; +	case IB_CM_REJ_RECEIVED: +		srpt_cm_rej_recv(cm_id); +		break; +	case IB_CM_RTU_RECEIVED: +	case IB_CM_USER_ESTABLISHED: +		srpt_cm_rtu_recv(cm_id); +		break; +	case IB_CM_DREQ_RECEIVED: +		srpt_cm_dreq_recv(cm_id); +		break; +	case IB_CM_DREP_RECEIVED: +		srpt_cm_drep_recv(cm_id); +		break; +	case IB_CM_TIMEWAIT_EXIT: +		srpt_cm_timewait_exit(cm_id); +		break; +	case IB_CM_REP_ERROR: +		srpt_cm_rep_error(cm_id); +		break; +	case IB_CM_DREQ_ERROR: +		printk(KERN_INFO "Received IB DREQ ERROR event.\n"); +		break; +	case IB_CM_MRA_RECEIVED: +		printk(KERN_INFO "Received IB MRA event\n"); +		break; +	default: +		printk(KERN_ERR "received unrecognized IB CM event %d\n", +		       event->event); +		break; +	} + +	return ret; +} + +/** + * srpt_perform_rdmas() - Perform IB RDMA. + * + * Returns zero upon success or a negative number upon failure. + */ +static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, +			      struct srpt_send_ioctx *ioctx) +{ +	struct ib_send_wr wr; +	struct ib_send_wr *bad_wr; +	struct rdma_iu *riu; +	int i; +	int ret; +	int sq_wr_avail; +	enum dma_data_direction dir; +	const int n_rdma = ioctx->n_rdma; + +	dir = ioctx->cmd.data_direction; +	if (dir == DMA_TO_DEVICE) { +		/* write */ +		ret = -ENOMEM; +		sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail); +		if (sq_wr_avail < 0) { +			printk(KERN_WARNING "IB send queue full (needed %d)\n", +			       n_rdma); +			goto out; +		} +	} + +	ioctx->rdma_aborted = false; +	ret = 0; +	riu = ioctx->rdma_ius; +	memset(&wr, 0, sizeof wr); + +	for (i = 0; i < n_rdma; ++i, ++riu) { +		if (dir == DMA_FROM_DEVICE) { +			wr.opcode = IB_WR_RDMA_WRITE; +			wr.wr_id = encode_wr_id(i == n_rdma - 1 ? +						SRPT_RDMA_WRITE_LAST : +						SRPT_RDMA_MID, +						ioctx->ioctx.index); +		} else { +			wr.opcode = IB_WR_RDMA_READ; +			wr.wr_id = encode_wr_id(i == n_rdma - 1 ? 
+						SRPT_RDMA_READ_LAST : +						SRPT_RDMA_MID, +						ioctx->ioctx.index); +		} +		wr.next = NULL; +		wr.wr.rdma.remote_addr = riu->raddr; +		wr.wr.rdma.rkey = riu->rkey; +		wr.num_sge = riu->sge_cnt; +		wr.sg_list = riu->sge; + +		/* only get completion event for the last rdma write */ +		if (i == (n_rdma - 1) && dir == DMA_TO_DEVICE) +			wr.send_flags = IB_SEND_SIGNALED; + +		ret = ib_post_send(ch->qp, &wr, &bad_wr); +		if (ret) +			break; +	} + +	if (ret) +		printk(KERN_ERR "%s[%d]: ib_post_send() returned %d for %d/%d", +				 __func__, __LINE__, ret, i, n_rdma); +	if (ret && i > 0) { +		wr.num_sge = 0; +		wr.wr_id = encode_wr_id(SRPT_RDMA_ABORT, ioctx->ioctx.index); +		wr.send_flags = IB_SEND_SIGNALED; +		while (ch->state == CH_LIVE && +			ib_post_send(ch->qp, &wr, &bad_wr) != 0) { +			printk(KERN_INFO "Trying to abort failed RDMA transfer [%d]", +				ioctx->ioctx.index); +			msleep(1000); +		} +		while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) { +			printk(KERN_INFO "Waiting until RDMA abort finished [%d]", +				ioctx->ioctx.index); +			msleep(1000); +		} +	} +out: +	if (unlikely(dir == DMA_TO_DEVICE && ret < 0)) +		atomic_add(n_rdma, &ch->sq_wr_avail); +	return ret; +} + +/** + * srpt_xfer_data() - Start data transfer from initiator to target. + */ +static int srpt_xfer_data(struct srpt_rdma_ch *ch, +			  struct srpt_send_ioctx *ioctx) +{ +	int ret; + +	ret = srpt_map_sg_to_ib_sge(ch, ioctx); +	if (ret) { +		printk(KERN_ERR "%s[%d] ret=%d\n", __func__, __LINE__, ret); +		goto out; +	} + +	ret = srpt_perform_rdmas(ch, ioctx); +	if (ret) { +		if (ret == -EAGAIN || ret == -ENOMEM) +			printk(KERN_INFO "%s[%d] queue full -- ret=%d\n", +				   __func__, __LINE__, ret); +		else +			printk(KERN_ERR "%s[%d] fatal error -- ret=%d\n", +			       __func__, __LINE__, ret); +		goto out_unmap; +	} + +out: +	return ret; +out_unmap: +	srpt_unmap_sg_to_ib_sge(ch, ioctx); +	goto out; +} + +static int srpt_write_pending_status(struct se_cmd *se_cmd) +{ +	struct srpt_send_ioctx *ioctx; + +	ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); +	return srpt_get_cmd_state(ioctx) == SRPT_STATE_NEED_DATA; +} + +/* + * srpt_write_pending() - Start data transfer from initiator to target (write). + */ +static int srpt_write_pending(struct se_cmd *se_cmd) +{ +	struct srpt_rdma_ch *ch; +	struct srpt_send_ioctx *ioctx; +	enum srpt_command_state new_state; +	enum rdma_ch_state ch_state; +	int ret; + +	ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); + +	new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA); +	WARN_ON(new_state == SRPT_STATE_DONE); + +	ch = ioctx->ch; +	BUG_ON(!ch); + +	ch_state = srpt_get_ch_state(ch); +	switch (ch_state) { +	case CH_CONNECTING: +		WARN(true, "unexpected channel state %d\n", ch_state); +		ret = -EINVAL; +		goto out; +	case CH_LIVE: +		break; +	case CH_DISCONNECTING: +	case CH_DRAINING: +	case CH_RELEASING: +		pr_debug("cmd with tag %lld: channel disconnecting\n", +			 ioctx->tag); +		srpt_set_cmd_state(ioctx, SRPT_STATE_DATA_IN); +		ret = -EINVAL; +		goto out; +	} +	ret = srpt_xfer_data(ch, ioctx); + +out: +	return ret; +} + +static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) +{ +	switch (tcm_mgmt_status) { +	case TMR_FUNCTION_COMPLETE: +		return SRP_TSK_MGMT_SUCCESS; +	case TMR_FUNCTION_REJECTED: +		return SRP_TSK_MGMT_FUNC_NOT_SUPP; +	} +	return SRP_TSK_MGMT_FAILED; +} + +/** + * srpt_queue_response() - Transmits the response to a SCSI command. + * + * Callback function called by the TCM core. 
Must not block since it can be + * invoked on the context of the IB completion handler. + */ +static void srpt_queue_response(struct se_cmd *cmd) +{ +	struct srpt_rdma_ch *ch; +	struct srpt_send_ioctx *ioctx; +	enum srpt_command_state state; +	unsigned long flags; +	int ret; +	enum dma_data_direction dir; +	int resp_len; +	u8 srp_tm_status; + +	ioctx = container_of(cmd, struct srpt_send_ioctx, cmd); +	ch = ioctx->ch; +	BUG_ON(!ch); + +	spin_lock_irqsave(&ioctx->spinlock, flags); +	state = ioctx->state; +	switch (state) { +	case SRPT_STATE_NEW: +	case SRPT_STATE_DATA_IN: +		ioctx->state = SRPT_STATE_CMD_RSP_SENT; +		break; +	case SRPT_STATE_MGMT: +		ioctx->state = SRPT_STATE_MGMT_RSP_SENT; +		break; +	default: +		WARN(true, "ch %p; cmd %d: unexpected command state %d\n", +			ch, ioctx->ioctx.index, ioctx->state); +		break; +	} +	spin_unlock_irqrestore(&ioctx->spinlock, flags); + +	if (unlikely(transport_check_aborted_status(&ioctx->cmd, false) +		     || WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) { +		atomic_inc(&ch->req_lim_delta); +		srpt_abort_cmd(ioctx); +		return; +	} + +	dir = ioctx->cmd.data_direction; + +	/* For read commands, transfer the data to the initiator. */ +	if (dir == DMA_FROM_DEVICE && ioctx->cmd.data_length && +	    !ioctx->queue_status_only) { +		ret = srpt_xfer_data(ch, ioctx); +		if (ret) { +			printk(KERN_ERR "xfer_data failed for tag %llu\n", +			       ioctx->tag); +			return; +		} +	} + +	if (state != SRPT_STATE_MGMT) +		resp_len = srpt_build_cmd_rsp(ch, ioctx, ioctx->tag, +					      cmd->scsi_status); +	else { +		srp_tm_status +			= tcm_to_srp_tsk_mgmt_status(cmd->se_tmr_req->response); +		resp_len = srpt_build_tskmgmt_rsp(ch, ioctx, srp_tm_status, +						 ioctx->tag); +	} +	ret = srpt_post_send(ch, ioctx, resp_len); +	if (ret) { +		printk(KERN_ERR "sending cmd response failed for tag %llu\n", +		       ioctx->tag); +		srpt_unmap_sg_to_ib_sge(ch, ioctx); +		srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); +		target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); +	} +} + +static int srpt_queue_data_in(struct se_cmd *cmd) +{ +	srpt_queue_response(cmd); +	return 0; +} + +static void srpt_queue_tm_rsp(struct se_cmd *cmd) +{ +	srpt_queue_response(cmd); +} + +static void srpt_aborted_task(struct se_cmd *cmd) +{ +	struct srpt_send_ioctx *ioctx = container_of(cmd, +				struct srpt_send_ioctx, cmd); + +	srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); +} + +static int srpt_queue_status(struct se_cmd *cmd) +{ +	struct srpt_send_ioctx *ioctx; + +	ioctx = container_of(cmd, struct srpt_send_ioctx, cmd); +	BUG_ON(ioctx->sense_data != cmd->sense_buffer); +	if (cmd->se_cmd_flags & +	    (SCF_TRANSPORT_TASK_SENSE | SCF_EMULATED_TASK_SENSE)) +		WARN_ON(cmd->scsi_status != SAM_STAT_CHECK_CONDITION); +	ioctx->queue_status_only = true; +	srpt_queue_response(cmd); +	return 0; +} + +static void srpt_refresh_port_work(struct work_struct *work) +{ +	struct srpt_port *sport = container_of(work, struct srpt_port, work); + +	srpt_refresh_port(sport); +} + +static int srpt_ch_list_empty(struct srpt_device *sdev) +{ +	int res; + +	spin_lock_irq(&sdev->spinlock); +	res = list_empty(&sdev->rch_list); +	spin_unlock_irq(&sdev->spinlock); + +	return res; +} + +/** + * srpt_release_sdev() - Free the channel resources associated with a target. 
+ */ +static int srpt_release_sdev(struct srpt_device *sdev) +{ +	struct srpt_rdma_ch *ch, *tmp_ch; +	int res; + +	WARN_ON_ONCE(irqs_disabled()); + +	BUG_ON(!sdev); + +	spin_lock_irq(&sdev->spinlock); +	list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) +		__srpt_close_ch(ch); +	spin_unlock_irq(&sdev->spinlock); + +	res = wait_event_interruptible(sdev->ch_releaseQ, +				       srpt_ch_list_empty(sdev)); +	if (res) +		printk(KERN_ERR "%s: interrupted.\n", __func__); + +	return 0; +} + +static struct srpt_port *__srpt_lookup_port(const char *name) +{ +	struct ib_device *dev; +	struct srpt_device *sdev; +	struct srpt_port *sport; +	int i; + +	list_for_each_entry(sdev, &srpt_dev_list, list) { +		dev = sdev->device; +		if (!dev) +			continue; + +		for (i = 0; i < dev->phys_port_cnt; i++) { +			sport = &sdev->port[i]; + +			if (!strcmp(sport->port_guid, name)) +				return sport; +		} +	} + +	return NULL; +} + +static struct srpt_port *srpt_lookup_port(const char *name) +{ +	struct srpt_port *sport; + +	spin_lock(&srpt_dev_lock); +	sport = __srpt_lookup_port(name); +	spin_unlock(&srpt_dev_lock); + +	return sport; +} + +/** + * srpt_add_one() - Infiniband device addition callback function. + */ +static void srpt_add_one(struct ib_device *device) +{ +	struct srpt_device *sdev; +	struct srpt_port *sport; +	struct ib_srq_init_attr srq_attr; +	int i; + +	pr_debug("device = %p, device->dma_ops = %p\n", device, +		 device->dma_ops); + +	sdev = kzalloc(sizeof *sdev, GFP_KERNEL); +	if (!sdev) +		goto err; + +	sdev->device = device; +	INIT_LIST_HEAD(&sdev->rch_list); +	init_waitqueue_head(&sdev->ch_releaseQ); +	spin_lock_init(&sdev->spinlock); + +	if (ib_query_device(device, &sdev->dev_attr)) +		goto free_dev; + +	sdev->pd = ib_alloc_pd(device); +	if (IS_ERR(sdev->pd)) +		goto free_dev; + +	sdev->mr = ib_get_dma_mr(sdev->pd, IB_ACCESS_LOCAL_WRITE); +	if (IS_ERR(sdev->mr)) +		goto err_pd; + +	sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr); + +	srq_attr.event_handler = srpt_srq_event; +	srq_attr.srq_context = (void *)sdev; +	srq_attr.attr.max_wr = sdev->srq_size; +	srq_attr.attr.max_sge = 1; +	srq_attr.attr.srq_limit = 0; +	srq_attr.srq_type = IB_SRQT_BASIC; + +	sdev->srq = ib_create_srq(sdev->pd, &srq_attr); +	if (IS_ERR(sdev->srq)) +		goto err_mr; + +	pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n", +		 __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr, +		 device->name); + +	if (!srpt_service_guid) +		srpt_service_guid = be64_to_cpu(device->node_guid); + +	sdev->cm_id = ib_create_cm_id(device, srpt_cm_handler, sdev); +	if (IS_ERR(sdev->cm_id)) +		goto err_srq; + +	/* print out target login information */ +	pr_debug("Target login info: id_ext=%016llx,ioc_guid=%016llx," +		 "pkey=ffff,service_id=%016llx\n", srpt_service_guid, +		 srpt_service_guid, srpt_service_guid); + +	/* +	 * We do not have a consistent service_id (ie. also id_ext of target_id) +	 * to identify this target. 
We currently use the guid of the first HCA +	 * in the system as service_id; therefore, the target_id will change +	 * if this HCA is gone bad and replaced by different HCA +	 */ +	if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0, NULL)) +		goto err_cm; + +	INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device, +			      srpt_event_handler); +	if (ib_register_event_handler(&sdev->event_handler)) +		goto err_cm; + +	sdev->ioctx_ring = (struct srpt_recv_ioctx **) +		srpt_alloc_ioctx_ring(sdev, sdev->srq_size, +				      sizeof(*sdev->ioctx_ring[0]), +				      srp_max_req_size, DMA_FROM_DEVICE); +	if (!sdev->ioctx_ring) +		goto err_event; + +	for (i = 0; i < sdev->srq_size; ++i) +		srpt_post_recv(sdev, sdev->ioctx_ring[i]); + +	WARN_ON(sdev->device->phys_port_cnt > ARRAY_SIZE(sdev->port)); + +	for (i = 1; i <= sdev->device->phys_port_cnt; i++) { +		sport = &sdev->port[i - 1]; +		sport->sdev = sdev; +		sport->port = i; +		sport->port_attrib.srp_max_rdma_size = DEFAULT_MAX_RDMA_SIZE; +		sport->port_attrib.srp_max_rsp_size = DEFAULT_MAX_RSP_SIZE; +		sport->port_attrib.srp_sq_size = DEF_SRPT_SQ_SIZE; +		INIT_WORK(&sport->work, srpt_refresh_port_work); +		INIT_LIST_HEAD(&sport->port_acl_list); +		spin_lock_init(&sport->port_acl_lock); + +		if (srpt_refresh_port(sport)) { +			printk(KERN_ERR "MAD registration failed for %s-%d.\n", +			       srpt_sdev_name(sdev), i); +			goto err_ring; +		} +		snprintf(sport->port_guid, sizeof(sport->port_guid), +			"0x%016llx%016llx", +			be64_to_cpu(sport->gid.global.subnet_prefix), +			be64_to_cpu(sport->gid.global.interface_id)); +	} + +	spin_lock(&srpt_dev_lock); +	list_add_tail(&sdev->list, &srpt_dev_list); +	spin_unlock(&srpt_dev_lock); + +out: +	ib_set_client_data(device, &srpt_client, sdev); +	pr_debug("added %s.\n", device->name); +	return; + +err_ring: +	srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev, +			     sdev->srq_size, srp_max_req_size, +			     DMA_FROM_DEVICE); +err_event: +	ib_unregister_event_handler(&sdev->event_handler); +err_cm: +	ib_destroy_cm_id(sdev->cm_id); +err_srq: +	ib_destroy_srq(sdev->srq); +err_mr: +	ib_dereg_mr(sdev->mr); +err_pd: +	ib_dealloc_pd(sdev->pd); +free_dev: +	kfree(sdev); +err: +	sdev = NULL; +	printk(KERN_INFO "%s(%s) failed.\n", __func__, device->name); +	goto out; +} + +/** + * srpt_remove_one() - InfiniBand device removal callback function. + */ +static void srpt_remove_one(struct ib_device *device) +{ +	struct srpt_device *sdev; +	int i; + +	sdev = ib_get_client_data(device, &srpt_client); +	if (!sdev) { +		printk(KERN_INFO "%s(%s): nothing to do.\n", __func__, +		       device->name); +		return; +	} + +	srpt_unregister_mad_agent(sdev); + +	ib_unregister_event_handler(&sdev->event_handler); + +	/* Cancel any work queued by the just unregistered IB event handler. */ +	for (i = 0; i < sdev->device->phys_port_cnt; i++) +		cancel_work_sync(&sdev->port[i].work); + +	ib_destroy_cm_id(sdev->cm_id); + +	/* +	 * Unregistering a target must happen after destroying sdev->cm_id +	 * such that no new SRP_LOGIN_REQ information units can arrive while +	 * destroying the target. 
+	 */ +	spin_lock(&srpt_dev_lock); +	list_del(&sdev->list); +	spin_unlock(&srpt_dev_lock); +	srpt_release_sdev(sdev); + +	ib_destroy_srq(sdev->srq); +	ib_dereg_mr(sdev->mr); +	ib_dealloc_pd(sdev->pd); + +	srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev, +			     sdev->srq_size, srp_max_req_size, DMA_FROM_DEVICE); +	sdev->ioctx_ring = NULL; +	kfree(sdev); +} + +static struct ib_client srpt_client = { +	.name = DRV_NAME, +	.add = srpt_add_one, +	.remove = srpt_remove_one +}; + +static int srpt_check_true(struct se_portal_group *se_tpg) +{ +	return 1; +} + +static int srpt_check_false(struct se_portal_group *se_tpg) +{ +	return 0; +} + +static char *srpt_get_fabric_name(void) +{ +	return "srpt"; +} + +static u8 srpt_get_fabric_proto_ident(struct se_portal_group *se_tpg) +{ +	return SCSI_TRANSPORTID_PROTOCOLID_SRP; +} + +static char *srpt_get_fabric_wwn(struct se_portal_group *tpg) +{ +	struct srpt_port *sport = container_of(tpg, struct srpt_port, port_tpg_1); + +	return sport->port_guid; +} + +static u16 srpt_get_tag(struct se_portal_group *tpg) +{ +	return 1; +} + +static u32 srpt_get_default_depth(struct se_portal_group *se_tpg) +{ +	return 1; +} + +static u32 srpt_get_pr_transport_id(struct se_portal_group *se_tpg, +				    struct se_node_acl *se_nacl, +				    struct t10_pr_registration *pr_reg, +				    int *format_code, unsigned char *buf) +{ +	struct srpt_node_acl *nacl; +	struct spc_rdma_transport_id *tr_id; + +	nacl = container_of(se_nacl, struct srpt_node_acl, nacl); +	tr_id = (void *)buf; +	tr_id->protocol_identifier = SCSI_TRANSPORTID_PROTOCOLID_SRP; +	memcpy(tr_id->i_port_id, nacl->i_port_id, sizeof(tr_id->i_port_id)); +	return sizeof(*tr_id); +} + +static u32 srpt_get_pr_transport_id_len(struct se_portal_group *se_tpg, +					struct se_node_acl *se_nacl, +					struct t10_pr_registration *pr_reg, +					int *format_code) +{ +	*format_code = 0; +	return sizeof(struct spc_rdma_transport_id); +} + +static char *srpt_parse_pr_out_transport_id(struct se_portal_group *se_tpg, +					    const char *buf, u32 *out_tid_len, +					    char **port_nexus_ptr) +{ +	struct spc_rdma_transport_id *tr_id; + +	*port_nexus_ptr = NULL; +	*out_tid_len = sizeof(struct spc_rdma_transport_id); +	tr_id = (void *)buf; +	return (char *)tr_id->i_port_id; +} + +static struct se_node_acl *srpt_alloc_fabric_acl(struct se_portal_group *se_tpg) +{ +	struct srpt_node_acl *nacl; + +	nacl = kzalloc(sizeof(struct srpt_node_acl), GFP_KERNEL); +	if (!nacl) { +		printk(KERN_ERR "Unable to allocate struct srpt_node_acl\n"); +		return NULL; +	} + +	return &nacl->nacl; +} + +static void srpt_release_fabric_acl(struct se_portal_group *se_tpg, +				    struct se_node_acl *se_nacl) +{ +	struct srpt_node_acl *nacl; + +	nacl = container_of(se_nacl, struct srpt_node_acl, nacl); +	kfree(nacl); +} + +static u32 srpt_tpg_get_inst_index(struct se_portal_group *se_tpg) +{ +	return 1; +} + +static void srpt_release_cmd(struct se_cmd *se_cmd) +{ +	struct srpt_send_ioctx *ioctx = container_of(se_cmd, +				struct srpt_send_ioctx, cmd); +	struct srpt_rdma_ch *ch = ioctx->ch; +	unsigned long flags; + +	WARN_ON(ioctx->state != SRPT_STATE_DONE); +	WARN_ON(ioctx->mapped_sg_count != 0); + +	if (ioctx->n_rbuf > 1) { +		kfree(ioctx->rbufs); +		ioctx->rbufs = NULL; +		ioctx->n_rbuf = 0; +	} + +	spin_lock_irqsave(&ch->spinlock, flags); +	list_add(&ioctx->free_list, &ch->free_list); +	spin_unlock_irqrestore(&ch->spinlock, flags); +} + +/** + * srpt_close_session() - Forcibly close a session. 
+ * + * Callback function invoked by the TCM core to clean up sessions associated + * with a node ACL when the user invokes + * rmdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + */ +static void srpt_close_session(struct se_session *se_sess) +{ +	DECLARE_COMPLETION_ONSTACK(release_done); +	struct srpt_rdma_ch *ch; +	struct srpt_device *sdev; +	int res; + +	ch = se_sess->fabric_sess_ptr; +	WARN_ON(ch->sess != se_sess); + +	pr_debug("ch %p state %d\n", ch, srpt_get_ch_state(ch)); + +	sdev = ch->sport->sdev; +	spin_lock_irq(&sdev->spinlock); +	BUG_ON(ch->release_done); +	ch->release_done = &release_done; +	__srpt_close_ch(ch); +	spin_unlock_irq(&sdev->spinlock); + +	res = wait_for_completion_timeout(&release_done, 60 * HZ); +	WARN_ON(res <= 0); +} + +/** + * srpt_sess_get_index() - Return the value of scsiAttIntrPortIndex (SCSI-MIB). + * + * A quote from RFC 4455 (SCSI-MIB) about this MIB object: + * This object represents an arbitrary integer used to uniquely identify a + * particular attached remote initiator port to a particular SCSI target port + * within a particular SCSI target device within a particular SCSI instance. + */ +static u32 srpt_sess_get_index(struct se_session *se_sess) +{ +	return 0; +} + +static void srpt_set_default_node_attrs(struct se_node_acl *nacl) +{ +} + +static u32 srpt_get_task_tag(struct se_cmd *se_cmd) +{ +	struct srpt_send_ioctx *ioctx; + +	ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); +	return ioctx->tag; +} + +/* Note: only used from inside debug printk's by the TCM core. */ +static int srpt_get_tcm_cmd_state(struct se_cmd *se_cmd) +{ +	struct srpt_send_ioctx *ioctx; + +	ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); +	return srpt_get_cmd_state(ioctx); +} + +/** + * srpt_parse_i_port_id() - Parse an initiator port ID. + * @name: ASCII representation of a 128-bit initiator port ID. + * @i_port_id: Binary 128-bit port ID. 
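
srpt_parse_i_port_id() below turns that ASCII form back into 16 binary bytes: an optional "0x" prefix is skipped, the hex digits are converted with hex2bin(), and shorter strings are right-aligned by zero-filling the leading bytes. A rough user-space equivalent (sscanf() stands in for hex2bin(), and over-long input is simply rejected here):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

static int parse_i_port_id(uint8_t id[16], const char *name)
{
	size_t len, count, lead, i;

	if (strncasecmp(name, "0x", 2) == 0)
		name += 2;
	len = strlen(name);
	if (len % 2 || len > 32)
		return -1;
	count = len / 2;
	lead = 16 - count;
	memset(id, 0, 16);
	for (i = 0; i < count; i++) {
		unsigned int byte;

		if (sscanf(name + 2 * i, "%2x", &byte) != 1)
			return -1;
		id[lead + i] = byte;
	}
	return 0;
}

int main(void)
{
	uint8_t id[16];
	int i;

	if (parse_i_port_id(id, "0x12345678") == 0) {
		for (i = 0; i < 16; i++)
			printf("%02x", id[i]);
		printf("\n");	/* 00000000000000000000000012345678 */
	}
	return 0;
}
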
+ */ +static int srpt_parse_i_port_id(u8 i_port_id[16], const char *name) +{ +	const char *p; +	unsigned len, count, leading_zero_bytes; +	int ret, rc; + +	p = name; +	if (strnicmp(p, "0x", 2) == 0) +		p += 2; +	ret = -EINVAL; +	len = strlen(p); +	if (len % 2) +		goto out; +	count = min(len / 2, 16U); +	leading_zero_bytes = 16 - count; +	memset(i_port_id, 0, leading_zero_bytes); +	rc = hex2bin(i_port_id + leading_zero_bytes, p, count); +	if (rc < 0) +		pr_debug("hex2bin failed for srpt_parse_i_port_id: %d\n", rc); +	ret = 0; +out: +	return ret; +} + +/* + * configfs callback function invoked for + * mkdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + */ +static struct se_node_acl *srpt_make_nodeacl(struct se_portal_group *tpg, +					     struct config_group *group, +					     const char *name) +{ +	struct srpt_port *sport = container_of(tpg, struct srpt_port, port_tpg_1); +	struct se_node_acl *se_nacl, *se_nacl_new; +	struct srpt_node_acl *nacl; +	int ret = 0; +	u32 nexus_depth = 1; +	u8 i_port_id[16]; + +	if (srpt_parse_i_port_id(i_port_id, name) < 0) { +		printk(KERN_ERR "invalid initiator port ID %s\n", name); +		ret = -EINVAL; +		goto err; +	} + +	se_nacl_new = srpt_alloc_fabric_acl(tpg); +	if (!se_nacl_new) { +		ret = -ENOMEM; +		goto err; +	} +	/* +	 * nacl_new may be released by core_tpg_add_initiator_node_acl() +	 * when converting a node ACL from demo mode to explict +	 */ +	se_nacl = core_tpg_add_initiator_node_acl(tpg, se_nacl_new, name, +						  nexus_depth); +	if (IS_ERR(se_nacl)) { +		ret = PTR_ERR(se_nacl); +		goto err; +	} +	/* Locate our struct srpt_node_acl and set sdev and i_port_id. */ +	nacl = container_of(se_nacl, struct srpt_node_acl, nacl); +	memcpy(&nacl->i_port_id[0], &i_port_id[0], 16); +	nacl->sport = sport; + +	spin_lock_irq(&sport->port_acl_lock); +	list_add_tail(&nacl->list, &sport->port_acl_list); +	spin_unlock_irq(&sport->port_acl_lock); + +	return se_nacl; +err: +	return ERR_PTR(ret); +} + +/* + * configfs callback function invoked for + * rmdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + */ +static void srpt_drop_nodeacl(struct se_node_acl *se_nacl) +{ +	struct srpt_node_acl *nacl; +	struct srpt_device *sdev; +	struct srpt_port *sport; + +	nacl = container_of(se_nacl, struct srpt_node_acl, nacl); +	sport = nacl->sport; +	sdev = sport->sdev; +	spin_lock_irq(&sport->port_acl_lock); +	list_del(&nacl->list); +	spin_unlock_irq(&sport->port_acl_lock); +	core_tpg_del_initiator_node_acl(&sport->port_tpg_1, se_nacl, 1); +	srpt_release_fabric_acl(NULL, se_nacl); +} + +static ssize_t srpt_tpg_attrib_show_srp_max_rdma_size( +	struct se_portal_group *se_tpg, +	char *page) +{ +	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + +	return sprintf(page, "%u\n", sport->port_attrib.srp_max_rdma_size); +} + +static ssize_t srpt_tpg_attrib_store_srp_max_rdma_size( +	struct se_portal_group *se_tpg, +	const char *page, +	size_t count) +{ +	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); +	unsigned long val; +	int ret; + +	ret = kstrtoul(page, 0, &val); +	if (ret < 0) { +		pr_err("kstrtoul() failed with ret: %d\n", ret); +		return -EINVAL; +	} +	if (val > MAX_SRPT_RDMA_SIZE) { +		pr_err("val: %lu exceeds MAX_SRPT_RDMA_SIZE: %d\n", val, +			MAX_SRPT_RDMA_SIZE); +		return -EINVAL; +	} +	if (val < DEFAULT_MAX_RDMA_SIZE) { +		pr_err("val: %lu smaller than DEFAULT_MAX_RDMA_SIZE: %d\n", +			val, DEFAULT_MAX_RDMA_SIZE); +		return -EINVAL; +	} +	sport->port_attrib.srp_max_rdma_size = val; + +	
return count; +} + +TF_TPG_ATTRIB_ATTR(srpt, srp_max_rdma_size, S_IRUGO | S_IWUSR); + +static ssize_t srpt_tpg_attrib_show_srp_max_rsp_size( +	struct se_portal_group *se_tpg, +	char *page) +{ +	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + +	return sprintf(page, "%u\n", sport->port_attrib.srp_max_rsp_size); +} + +static ssize_t srpt_tpg_attrib_store_srp_max_rsp_size( +	struct se_portal_group *se_tpg, +	const char *page, +	size_t count) +{ +	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); +	unsigned long val; +	int ret; + +	ret = kstrtoul(page, 0, &val); +	if (ret < 0) { +		pr_err("kstrtoul() failed with ret: %d\n", ret); +		return -EINVAL; +	} +	if (val > MAX_SRPT_RSP_SIZE) { +		pr_err("val: %lu exceeds MAX_SRPT_RSP_SIZE: %d\n", val, +			MAX_SRPT_RSP_SIZE); +		return -EINVAL; +	} +	if (val < MIN_MAX_RSP_SIZE) { +		pr_err("val: %lu smaller than MIN_MAX_RSP_SIZE: %d\n", val, +			MIN_MAX_RSP_SIZE); +		return -EINVAL; +	} +	sport->port_attrib.srp_max_rsp_size = val; + +	return count; +} + +TF_TPG_ATTRIB_ATTR(srpt, srp_max_rsp_size, S_IRUGO | S_IWUSR); + +static ssize_t srpt_tpg_attrib_show_srp_sq_size( +	struct se_portal_group *se_tpg, +	char *page) +{ +	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + +	return sprintf(page, "%u\n", sport->port_attrib.srp_sq_size); +} + +static ssize_t srpt_tpg_attrib_store_srp_sq_size( +	struct se_portal_group *se_tpg, +	const char *page, +	size_t count) +{ +	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); +	unsigned long val; +	int ret; + +	ret = kstrtoul(page, 0, &val); +	if (ret < 0) { +		pr_err("kstrtoul() failed with ret: %d\n", ret); +		return -EINVAL; +	} +	if (val > MAX_SRPT_SRQ_SIZE) { +		pr_err("val: %lu exceeds MAX_SRPT_SRQ_SIZE: %d\n", val, +			MAX_SRPT_SRQ_SIZE); +		return -EINVAL; +	} +	if (val < MIN_SRPT_SRQ_SIZE) { +		pr_err("val: %lu smaller than MIN_SRPT_SRQ_SIZE: %d\n", val, +			MIN_SRPT_SRQ_SIZE); +		return -EINVAL; +	} +	sport->port_attrib.srp_sq_size = val; + +	return count; +} + +TF_TPG_ATTRIB_ATTR(srpt, srp_sq_size, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *srpt_tpg_attrib_attrs[] = { +	&srpt_tpg_attrib_srp_max_rdma_size.attr, +	&srpt_tpg_attrib_srp_max_rsp_size.attr, +	&srpt_tpg_attrib_srp_sq_size.attr, +	NULL, +}; + +static ssize_t srpt_tpg_show_enable( +	struct se_portal_group *se_tpg, +	char *page) +{ +	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + +	return snprintf(page, PAGE_SIZE, "%d\n", (sport->enabled) ? 
1: 0); +} + +static ssize_t srpt_tpg_store_enable( +	struct se_portal_group *se_tpg, +	const char *page, +	size_t count) +{ +	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); +	unsigned long tmp; +        int ret; + +	ret = kstrtoul(page, 0, &tmp); +	if (ret < 0) { +		printk(KERN_ERR "Unable to extract srpt_tpg_store_enable\n"); +		return -EINVAL; +	} + +	if ((tmp != 0) && (tmp != 1)) { +		printk(KERN_ERR "Illegal value for srpt_tpg_store_enable: %lu\n", tmp); +		return -EINVAL; +	} +	if (tmp == 1) +		sport->enabled = true; +	else +		sport->enabled = false; + +	return count; +} + +TF_TPG_BASE_ATTR(srpt, enable, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *srpt_tpg_attrs[] = { +	&srpt_tpg_enable.attr, +	NULL, +}; + +/** + * configfs callback invoked for + * mkdir /sys/kernel/config/target/$driver/$port/$tpg + */ +static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn, +					     struct config_group *group, +					     const char *name) +{ +	struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn); +	int res; + +	/* Initialize sport->port_wwn and sport->port_tpg_1 */ +	res = core_tpg_register(&srpt_target->tf_ops, &sport->port_wwn, +			&sport->port_tpg_1, sport, TRANSPORT_TPG_TYPE_NORMAL); +	if (res) +		return ERR_PTR(res); + +	return &sport->port_tpg_1; +} + +/** + * configfs callback invoked for + * rmdir /sys/kernel/config/target/$driver/$port/$tpg + */ +static void srpt_drop_tpg(struct se_portal_group *tpg) +{ +	struct srpt_port *sport = container_of(tpg, +				struct srpt_port, port_tpg_1); + +	sport->enabled = false; +	core_tpg_deregister(&sport->port_tpg_1); +} + +/** + * configfs callback invoked for + * mkdir /sys/kernel/config/target/$driver/$port + */ +static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf, +				      struct config_group *group, +				      const char *name) +{ +	struct srpt_port *sport; +	int ret; + +	sport = srpt_lookup_port(name); +	pr_debug("make_tport(%s)\n", name); +	ret = -EINVAL; +	if (!sport) +		goto err; + +	return &sport->port_wwn; + +err: +	return ERR_PTR(ret); +} + +/** + * configfs callback invoked for + * rmdir /sys/kernel/config/target/$driver/$port + */ +static void srpt_drop_tport(struct se_wwn *wwn) +{ +	struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn); + +	pr_debug("drop_tport(%s\n", config_item_name(&sport->port_wwn.wwn_group.cg_item)); +} + +static ssize_t srpt_wwn_show_attr_version(struct target_fabric_configfs *tf, +					      char *buf) +{ +	return scnprintf(buf, PAGE_SIZE, "%s\n", DRV_VERSION); +} + +TF_WWN_ATTR_RO(srpt, version); + +static struct configfs_attribute *srpt_wwn_attrs[] = { +	&srpt_wwn_version.attr, +	NULL, +}; + +static struct target_core_fabric_ops srpt_template = { +	.get_fabric_name		= srpt_get_fabric_name, +	.get_fabric_proto_ident		= srpt_get_fabric_proto_ident, +	.tpg_get_wwn			= srpt_get_fabric_wwn, +	.tpg_get_tag			= srpt_get_tag, +	.tpg_get_default_depth		= srpt_get_default_depth, +	.tpg_get_pr_transport_id	= srpt_get_pr_transport_id, +	.tpg_get_pr_transport_id_len	= srpt_get_pr_transport_id_len, +	.tpg_parse_pr_out_transport_id	= srpt_parse_pr_out_transport_id, +	.tpg_check_demo_mode		= srpt_check_false, +	.tpg_check_demo_mode_cache	= srpt_check_true, +	.tpg_check_demo_mode_write_protect = srpt_check_true, +	.tpg_check_prod_mode_write_protect = srpt_check_false, +	.tpg_alloc_fabric_acl		= srpt_alloc_fabric_acl, +	.tpg_release_fabric_acl		= srpt_release_fabric_acl, +	.tpg_get_inst_index		= srpt_tpg_get_inst_index, 
+	.release_cmd			= srpt_release_cmd, +	.check_stop_free		= srpt_check_stop_free, +	.shutdown_session		= srpt_shutdown_session, +	.close_session			= srpt_close_session, +	.sess_get_index			= srpt_sess_get_index, +	.sess_get_initiator_sid		= NULL, +	.write_pending			= srpt_write_pending, +	.write_pending_status		= srpt_write_pending_status, +	.set_default_node_attributes	= srpt_set_default_node_attrs, +	.get_task_tag			= srpt_get_task_tag, +	.get_cmd_state			= srpt_get_tcm_cmd_state, +	.queue_data_in			= srpt_queue_data_in, +	.queue_status			= srpt_queue_status, +	.queue_tm_rsp			= srpt_queue_tm_rsp, +	.aborted_task			= srpt_aborted_task, +	/* +	 * Setup function pointers for generic logic in +	 * target_core_fabric_configfs.c +	 */ +	.fabric_make_wwn		= srpt_make_tport, +	.fabric_drop_wwn		= srpt_drop_tport, +	.fabric_make_tpg		= srpt_make_tpg, +	.fabric_drop_tpg		= srpt_drop_tpg, +	.fabric_post_link		= NULL, +	.fabric_pre_unlink		= NULL, +	.fabric_make_np			= NULL, +	.fabric_drop_np			= NULL, +	.fabric_make_nodeacl		= srpt_make_nodeacl, +	.fabric_drop_nodeacl		= srpt_drop_nodeacl, +}; + +/** + * srpt_init_module() - Kernel module initialization. + * + * Note: Since ib_register_client() registers callback functions, and since at + * least one of these callback functions (srpt_add_one()) calls target core + * functions, this driver must be registered with the target core before + * ib_register_client() is called. + */ +static int __init srpt_init_module(void) +{ +	int ret; + +	ret = -EINVAL; +	if (srp_max_req_size < MIN_MAX_REQ_SIZE) { +		printk(KERN_ERR "invalid value %d for kernel module parameter" +		       " srp_max_req_size -- must be at least %d.\n", +		       srp_max_req_size, MIN_MAX_REQ_SIZE); +		goto out; +	} + +	if (srpt_srq_size < MIN_SRPT_SRQ_SIZE +	    || srpt_srq_size > MAX_SRPT_SRQ_SIZE) { +		printk(KERN_ERR "invalid value %d for kernel module parameter" +		       " srpt_srq_size -- must be in the range [%d..%d].\n", +		       srpt_srq_size, MIN_SRPT_SRQ_SIZE, MAX_SRPT_SRQ_SIZE); +		goto out; +	} + +	srpt_target = target_fabric_configfs_init(THIS_MODULE, "srpt"); +	if (IS_ERR(srpt_target)) { +		printk(KERN_ERR "couldn't register\n"); +		ret = PTR_ERR(srpt_target); +		goto out; +	} + +	srpt_target->tf_ops = srpt_template; + +	/* +	 * Set up default attribute lists. 
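
Taken together, the fabric callbacks and attribute tables registered here give the administrator a configfs tree of roughly the following shape (using the same $port, $tpg and $i_port_id placeholders as the comments above; only the nodes this driver sets up are shown):

	/sys/kernel/config/target/srpt/
	    $port/			created by srpt_make_tport()
		$tpg/			created by srpt_make_tpg()
		    enable		srpt_tpg_show_enable() / srpt_tpg_store_enable()
		    attrib/
			srp_max_rdma_size
			srp_max_rsp_size
			srp_sq_size
		    acls/
			$i_port_id/	created by srpt_make_nodeacl()
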
+	 */ +	srpt_target->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = srpt_wwn_attrs; +	srpt_target->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = srpt_tpg_attrs; +	srpt_target->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = srpt_tpg_attrib_attrs; +	srpt_target->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL; +	srpt_target->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; +	srpt_target->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; +	srpt_target->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; +	srpt_target->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; +	srpt_target->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; + +	ret = target_fabric_configfs_register(srpt_target); +	if (ret < 0) { +		printk(KERN_ERR "couldn't register\n"); +		goto out_free_target; +	} + +	ret = ib_register_client(&srpt_client); +	if (ret) { +		printk(KERN_ERR "couldn't register IB client\n"); +		goto out_unregister_target; +	} + +	return 0; + +out_unregister_target: +	target_fabric_configfs_deregister(srpt_target); +	srpt_target = NULL; +out_free_target: +	if (srpt_target) +		target_fabric_configfs_free(srpt_target); +out: +	return ret; +} + +static void __exit srpt_cleanup_module(void) +{ +	ib_unregister_client(&srpt_client); +	target_fabric_configfs_deregister(srpt_target); +	srpt_target = NULL; +} + +module_init(srpt_init_module); +module_exit(srpt_cleanup_module); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h new file mode 100644 index 00000000000..3dae156905d --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc.  All rights reserved. + * Copyright (C) 2009 - 2010 Bart Van Assche <bvanassche@acm.org>. + * + * This software is available to you under a choice of one of two + * licenses.  You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + *     Redistribution and use in source and binary forms, with or + *     without modification, are permitted provided that the following + *     conditions are met: + * + *      - Redistributions of source code must retain the above + *        copyright notice, this list of conditions and the following + *        disclaimer. + * + *      - Redistributions in binary form must reproduce the above + *        copyright notice, this list of conditions and the following + *        disclaimer in the documentation and/or other materials + *        provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef IB_SRPT_H +#define IB_SRPT_H + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/wait.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_sa.h> +#include <rdma/ib_cm.h> + +#include <scsi/srp.h> + +#include "ib_dm_mad.h" + +/* + * The prefix the ServiceName field must start with in the device management + * ServiceEntries attribute pair. See also the SRP specification. 
+ */ +#define SRP_SERVICE_NAME_PREFIX		"SRP.T10:" + +enum { +	/* +	 * SRP IOControllerProfile attributes for SRP target ports that have +	 * not been defined in <scsi/srp.h>. Source: section B.7, table B.7 +	 * in the SRP specification. +	 */ +	SRP_PROTOCOL = 0x0108, +	SRP_PROTOCOL_VERSION = 0x0001, +	SRP_IO_SUBCLASS = 0x609e, +	SRP_SEND_TO_IOC = 0x01, +	SRP_SEND_FROM_IOC = 0x02, +	SRP_RDMA_READ_FROM_IOC = 0x08, +	SRP_RDMA_WRITE_FROM_IOC = 0x20, + +	/* +	 * srp_login_cmd.req_flags bitmasks. See also table 9 in the SRP +	 * specification. +	 */ +	SRP_MTCH_ACTION = 0x03, /* MULTI-CHANNEL ACTION */ +	SRP_LOSOLNT = 0x10, /* logout solicited notification */ +	SRP_CRSOLNT = 0x20, /* credit request solicited notification */ +	SRP_AESOLNT = 0x40, /* asynchronous event solicited notification */ + +	/* +	 * srp_cmd.sol_nt / srp_tsk_mgmt.sol_not bitmasks. See also tables +	 * 18 and 20 in the SRP specification. +	 */ +	SRP_SCSOLNT = 0x02, /* SCSOLNT = successful solicited notification */ +	SRP_UCSOLNT = 0x04, /* UCSOLNT = unsuccessful solicited notification */ + +	/* +	 * srp_rsp.sol_not / srp_t_logout.sol_not bitmasks. See also tables +	 * 16 and 22 in the SRP specification. +	 */ +	SRP_SOLNT = 0x01, /* SOLNT = solicited notification */ + +	/* See also table 24 in the SRP specification. */ +	SRP_TSK_MGMT_SUCCESS = 0x00, +	SRP_TSK_MGMT_FUNC_NOT_SUPP = 0x04, +	SRP_TSK_MGMT_FAILED = 0x05, + +	/* See also table 21 in the SRP specification. */ +	SRP_CMD_SIMPLE_Q = 0x0, +	SRP_CMD_HEAD_OF_Q = 0x1, +	SRP_CMD_ORDERED_Q = 0x2, +	SRP_CMD_ACA = 0x4, + +	SRP_LOGIN_RSP_MULTICHAN_NO_CHAN = 0x0, +	SRP_LOGIN_RSP_MULTICHAN_TERMINATED = 0x1, +	SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2, + +	SRPT_DEF_SG_TABLESIZE = 128, +	SRPT_DEF_SG_PER_WQE = 16, + +	MIN_SRPT_SQ_SIZE = 16, +	DEF_SRPT_SQ_SIZE = 4096, +	SRPT_RQ_SIZE = 128, +	MIN_SRPT_SRQ_SIZE = 4, +	DEFAULT_SRPT_SRQ_SIZE = 4095, +	MAX_SRPT_SRQ_SIZE = 65535, +	MAX_SRPT_RDMA_SIZE = 1U << 24, +	MAX_SRPT_RSP_SIZE = 1024, + +	MIN_MAX_REQ_SIZE = 996, +	DEFAULT_MAX_REQ_SIZE +		= sizeof(struct srp_cmd)/*48*/ +		+ sizeof(struct srp_indirect_buf)/*20*/ +		+ 128 * sizeof(struct srp_direct_buf)/*16*/, + +	MIN_MAX_RSP_SIZE = sizeof(struct srp_rsp)/*36*/ + 4, +	DEFAULT_MAX_RSP_SIZE = 256, /* leaves 220 bytes for sense data */ + +	DEFAULT_MAX_RDMA_SIZE = 65536, +}; + +enum srpt_opcode { +	SRPT_RECV, +	SRPT_SEND, +	SRPT_RDMA_MID, +	SRPT_RDMA_ABORT, +	SRPT_RDMA_READ_LAST, +	SRPT_RDMA_WRITE_LAST, +}; + +static inline u64 encode_wr_id(u8 opcode, u32 idx) +{ +	return ((u64)opcode << 32) | idx; +} +static inline enum srpt_opcode opcode_from_wr_id(u64 wr_id) +{ +	return wr_id >> 32; +} +static inline u32 idx_from_wr_id(u64 wr_id) +{ +	return (u32)wr_id; +} + +struct rdma_iu { +	u64		raddr; +	u32		rkey; +	struct ib_sge	*sge; +	u32		sge_cnt; +	int		mem_id; +}; + +/** + * enum srpt_command_state - SCSI command state managed by SRPT. + * @SRPT_STATE_NEW:           New command arrived and is being processed. + * @SRPT_STATE_NEED_DATA:     Processing a write or bidir command and waiting + *                            for data arrival. + * @SRPT_STATE_DATA_IN:       Data for the write or bidir command arrived and is + *                            being processed. + * @SRPT_STATE_CMD_RSP_SENT:  SRP_RSP for SRP_CMD has been sent. + * @SRPT_STATE_MGMT:          Processing a SCSI task management command. + * @SRPT_STATE_MGMT_RSP_SENT: SRP_RSP for SRP_TSK_MGMT has been sent. 
+ * @SRPT_STATE_DONE:          Command processing finished successfully, command + *                            processing has been aborted or command processing + *                            failed. + */ +enum srpt_command_state { +	SRPT_STATE_NEW		 = 0, +	SRPT_STATE_NEED_DATA	 = 1, +	SRPT_STATE_DATA_IN	 = 2, +	SRPT_STATE_CMD_RSP_SENT	 = 3, +	SRPT_STATE_MGMT		 = 4, +	SRPT_STATE_MGMT_RSP_SENT = 5, +	SRPT_STATE_DONE		 = 6, +}; + +/** + * struct srpt_ioctx - Shared SRPT I/O context information. + * @buf:   Pointer to the buffer. + * @dma:   DMA address of the buffer. + * @index: Index of the I/O context in its ioctx_ring array. + */ +struct srpt_ioctx { +	void			*buf; +	dma_addr_t		dma; +	uint32_t		index; +}; + +/** + * struct srpt_recv_ioctx - SRPT receive I/O context. + * @ioctx:     See above. + * @wait_list: Node for insertion in srpt_rdma_ch.cmd_wait_list. + */ +struct srpt_recv_ioctx { +	struct srpt_ioctx	ioctx; +	struct list_head	wait_list; +}; + +/** + * struct srpt_send_ioctx - SRPT send I/O context. + * @ioctx:       See above. + * @ch:          Channel pointer. + * @free_list:   Node in srpt_rdma_ch.free_list. + * @n_rbuf:      Number of data buffers in the received SRP command. + * @rbufs:       Pointer to SRP data buffer array. + * @single_rbuf: SRP data buffer if the command has only a single buffer. + * @sg:          Pointer to sg-list associated with this I/O context. + * @sg_cnt:      SG-list size. + * @mapped_sg_count: ib_dma_map_sg() return value. + * @n_rdma_ius:  Number of elements in the rdma_ius array. + * @rdma_ius:    Array with information about the RDMA mapping. + * @tag:         Tag of the received SRP information unit. + * @spinlock:    Protects 'state'. + * @state:       I/O context state. + * @rdma_aborted: If initiating a multipart RDMA transfer failed, whether + * 		 the already initiated transfers have finished. + * @cmd:         Target core command data structure. + * @sense_data:  SCSI sense data. + */ +struct srpt_send_ioctx { +	struct srpt_ioctx	ioctx; +	struct srpt_rdma_ch	*ch; +	struct rdma_iu		*rdma_ius; +	struct srp_direct_buf	*rbufs; +	struct srp_direct_buf	single_rbuf; +	struct scatterlist	*sg; +	struct list_head	free_list; +	spinlock_t		spinlock; +	enum srpt_command_state	state; +	bool			rdma_aborted; +	struct se_cmd		cmd; +	struct completion	tx_done; +	u64			tag; +	int			sg_cnt; +	int			mapped_sg_count; +	u16			n_rdma_ius; +	u8			n_rdma; +	u8			n_rbuf; +	bool			queue_status_only; +	u8			sense_data[SCSI_SENSE_BUFFERSIZE]; +}; + +/** + * enum rdma_ch_state - SRP channel state. + * @CH_CONNECTING:	 QP is in RTR state; waiting for RTU. + * @CH_LIVE:		 QP is in RTS state. + * @CH_DISCONNECTING:    DREQ has been sent and waiting for DREP, or DREQ has + *                       been received and waiting for DREP. + * @CH_DRAINING:	 QP is in ERR state; waiting for last WQE event. + * @CH_RELEASING:	 Last WQE event has been received; releasing resources. + */ +enum rdma_ch_state { +	CH_CONNECTING, +	CH_LIVE, +	CH_DISCONNECTING, +	CH_DRAINING, +	CH_RELEASING +}; + +/** + * struct srpt_rdma_ch - RDMA channel. + * @wait_queue:    Allows the kernel thread to wait for more work. + * @thread:        Kernel thread that processes the IB queues associated with + *                 the channel. + * @cm_id:         IB CM ID associated with the channel. + * @qp:            IB queue pair used for communicating over this channel. + * @cq:            IB completion queue for this channel. + * @rq_size:       IB receive queue size. 
+ * @rsp_size:      IB response message size in bytes. + * @sq_wr_avail:   number of work requests available in the send queue. + * @sport:         pointer to the information of the HCA port used by this + *                 channel. + * @i_port_id:     128-bit initiator port identifier copied from SRP_LOGIN_REQ. + * @t_port_id:     128-bit target port identifier copied from SRP_LOGIN_REQ. + * @max_ti_iu_len: maximum target-to-initiator information unit length. + * @req_lim:       request limit: maximum number of requests that may be sent + *                 by the initiator without having received a response. + * @req_lim_delta: Number of credits not yet sent back to the initiator. + * @spinlock:      Protects free_list and state. + * @free_list:     Head of list with free send I/O contexts. + * @state:         channel state. See also enum rdma_ch_state. + * @ioctx_ring:    Send ring. + * @wc:            IB work completion array for srpt_process_completion(). + * @list:          Node for insertion in the srpt_device.rch_list list. + * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This + *                 list contains struct srpt_ioctx elements and is protected + *                 against concurrent modification by the cm_id spinlock. + * @sess:          Session information associated with this SRP channel. + * @sess_name:     Session name. + * @release_work:  Allows scheduling of srpt_release_channel(). + * @release_done:  Enables waiting for srpt_release_channel() completion. + * @in_shutdown:   True once shutdown of this channel has started. + */ +struct srpt_rdma_ch { +	wait_queue_head_t	wait_queue; +	struct task_struct	*thread; +	struct ib_cm_id		*cm_id; +	struct ib_qp		*qp; +	struct ib_cq		*cq; +	int			rq_size; +	u32			rsp_size; +	atomic_t		sq_wr_avail; +	struct srpt_port	*sport; +	u8			i_port_id[16]; +	u8			t_port_id[16]; +	int			max_ti_iu_len; +	atomic_t		req_lim; +	atomic_t		req_lim_delta; +	spinlock_t		spinlock; +	struct list_head	free_list; +	enum rdma_ch_state	state; +	struct srpt_send_ioctx	**ioctx_ring; +	struct ib_wc		wc[16]; +	struct list_head	list; +	struct list_head	cmd_wait_list; +	struct se_session	*sess; +	u8			sess_name[36]; +	struct work_struct	release_work; +	struct completion	*release_done; +	bool			in_shutdown; +}; + +/** + * struct srpt_port_attrib - Attributes for an SRPT port. + * @srp_max_rdma_size: Maximum size of SRP RDMA transfers for new connections. + * @srp_max_rsp_size: Maximum size of SRP response messages in bytes. + * @srp_sq_size: Per-channel send queue (SQ) size. + */ +struct srpt_port_attrib { +	u32			srp_max_rdma_size; +	u32			srp_max_rsp_size; +	u32			srp_sq_size; +}; + +/** + * struct srpt_port - Information associated by SRPT with a single IB port. + * @sdev:      backpointer to the HCA information. + * @mad_agent: per-port management datagram processing information. + * @enabled:   Whether or not this target port is enabled. + * @port_guid: ASCII representation of Port GUID. + * @port:      one-based port number. + * @sm_lid:    cached value of the port's sm_lid. + * @lid:       cached value of the port's lid. + * @gid:       cached value of the port's gid. + * @port_acl_lock: Spinlock protecting port_acl_list. + * @work:      work structure for refreshing the aforementioned cached values. + * @port_tpg_1: Data for target portal group 1. + * @port_wwn:  Target core WWN data. + * @port_acl_list: Head of the list with all node ACLs for this port. + * @port_attrib: Port attributes that can be modified through configfs. 
+ */ +struct srpt_port { +	struct srpt_device	*sdev; +	struct ib_mad_agent	*mad_agent; +	bool			enabled; +	u8			port_guid[64]; +	u8			port; +	u16			sm_lid; +	u16			lid; +	union ib_gid		gid; +	spinlock_t		port_acl_lock; +	struct work_struct	work; +	struct se_portal_group	port_tpg_1; +	struct se_wwn		port_wwn; +	struct list_head	port_acl_list; +	struct srpt_port_attrib port_attrib; +}; + +/** + * struct srpt_device - Information associated by SRPT with a single HCA. + * @device:        Backpointer to the struct ib_device managed by the IB core. + * @pd:            IB protection domain. + * @mr:            L_Key (local key) with write access to all local memory. + * @srq:           Per-HCA SRQ (shared receive queue). + * @cm_id:         Connection identifier. + * @dev_attr:      Attributes of the InfiniBand device as obtained during the + *                 ib_client.add() callback. + * @srq_size:      SRQ size. + * @ioctx_ring:    Ring of receive I/O contexts posted to the per-HCA SRQ. + * @rch_list:      Per-device channel list -- see also srpt_rdma_ch.list. + * @ch_releaseQ:   Enables waiting for removal from rch_list. + * @spinlock:      Protects rch_list and tpg. + * @port:          Information about the ports owned by this HCA. + * @event_handler: Per-HCA asynchronous IB event handler. + * @list:          Node in srpt_dev_list. + */ +struct srpt_device { +	struct ib_device	*device; +	struct ib_pd		*pd; +	struct ib_mr		*mr; +	struct ib_srq		*srq; +	struct ib_cm_id		*cm_id; +	struct ib_device_attr	dev_attr; +	int			srq_size; +	struct srpt_recv_ioctx	**ioctx_ring; +	struct list_head	rch_list; +	wait_queue_head_t	ch_releaseQ; +	spinlock_t		spinlock; +	struct srpt_port	port[2]; +	struct ib_event_handler	event_handler; +	struct list_head	list; +}; + +/** + * struct srpt_node_acl - Per-initiator ACL data (managed via configfs). + * @i_port_id: 128-bit SRP initiator port ID. + * @sport:     port information. + * @nacl:      Target core node ACL information. + * @list:      Node in the per-port ACL list (srpt_port.port_acl_list). + */ +struct srpt_node_acl { +	u8			i_port_id[16]; +	struct srpt_port	*sport; +	struct se_node_acl	nacl; +	struct list_head	list; +}; + +/* + * SRP-related SCSI persistent reservation definitions. + * + * See also SPC4r28, section 7.6.1 (Protocol specific parameters introduction). + * See also SPC4r28, section 7.6.4.5 (TransportID for initiator ports using + * SCSI over an RDMA interface). + */ + +enum { +	SCSI_TRANSPORTID_PROTOCOLID_SRP	= 4, +}; + +struct spc_rdma_transport_id { +	uint8_t protocol_identifier; +	uint8_t reserved[7]; +	uint8_t i_port_id[16]; +}; + +#endif				/* IB_SRPT_H */
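The wr_id helpers and the TransportID layout declared in ib_srpt.h can be exercised outside the kernel. The sketch below is a minimal standalone userspace program, not part of the patch; the byte counts used for the DEFAULT_MAX_REQ_SIZE arithmetic are taken from the inline /*48*/, /*20*/ and /*16*/ comments in the header rather than from <scsi/srp.h>, so treat them as assumptions.

/*
 * Standalone userspace sketch (not part of the patch): mirrors the wr_id
 * packing helpers and the SRP TransportID layout declared in ib_srpt.h so
 * that their basic properties can be checked with an ordinary C compiler.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

enum srpt_opcode {
	SRPT_RECV,
	SRPT_SEND,
	SRPT_RDMA_MID,
	SRPT_RDMA_ABORT,
	SRPT_RDMA_READ_LAST,
	SRPT_RDMA_WRITE_LAST,
};

/* Same packing as ib_srpt.h: opcode in the upper 32 bits of the 64-bit
 * work request ID, ring index in the lower 32 bits. */
static uint64_t encode_wr_id(uint8_t opcode, uint32_t idx)
{
	return ((uint64_t)opcode << 32) | idx;
}

static enum srpt_opcode opcode_from_wr_id(uint64_t wr_id)
{
	return (enum srpt_opcode)(wr_id >> 32);
}

static uint32_t idx_from_wr_id(uint64_t wr_id)
{
	return (uint32_t)wr_id;
}

/* TransportID for SRP initiator ports (SPC-4 7.6.4.5): one protocol
 * identifier byte (4 = SRP), seven reserved bytes, 16-byte port ID. */
struct spc_rdma_transport_id {
	uint8_t protocol_identifier;
	uint8_t reserved[7];
	uint8_t i_port_id[16];
};

int main(void)
{
	uint64_t wr_id = encode_wr_id(SRPT_RDMA_WRITE_LAST, 0x00abcdef);

	/* Opcode and index survive a round trip through the 64-bit wr_id. */
	assert(opcode_from_wr_id(wr_id) == SRPT_RDMA_WRITE_LAST);
	assert(idx_from_wr_id(wr_id) == 0x00abcdef);

	/* The TransportID is 1 + 7 + 16 = 24 bytes, with no padding. */
	assert(sizeof(struct spc_rdma_transport_id) == 24);

	/* DEFAULT_MAX_REQ_SIZE arithmetic, using the byte counts from the
	 * inline comments in the header (srp_cmd 48, srp_indirect_buf 20,
	 * srp_direct_buf 16): 48 + 20 + 128 * 16 = 2116 >= 996. */
	printf("default max request size: %d bytes\n", 48 + 20 + 128 * 16);
	return 0;
}

With those assumed sizes, DEFAULT_MAX_REQ_SIZE works out to 2116 bytes, comfortably above the MIN_MAX_REQ_SIZE of 996 that srpt_init_module() enforces for the srp_max_req_size module parameter.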

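The req_lim and req_lim_delta fields documented for struct srpt_rdma_ch implement SRP's credit-based flow control. The following is a simplified userspace model of that scheme as described by the kernel-doc above, assuming the target consumes one credit per received information unit and returns the accumulated delta with each response; the names model_ch, model_recv_iu and model_send_rsp are illustrative only and do not exist in the driver.

/*
 * Simplified userspace model (illustrative only, not driver code) of the
 * req_lim / req_lim_delta credit accounting described in the srpt_rdma_ch
 * kernel-doc: req_lim is how many more requests the initiator may send,
 * req_lim_delta accumulates credits until a response carries them back.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

struct model_ch {
	atomic_int req_lim;		/* stands in for ch->req_lim */
	atomic_int req_lim_delta;	/* stands in for ch->req_lim_delta */
};

/* Invoked when an information unit arrives from the initiator. */
static int model_recv_iu(struct model_ch *ch)
{
	if (atomic_fetch_sub(&ch->req_lim, 1) <= 0) {
		/* The initiator sent more than its credit limit allows. */
		atomic_fetch_add(&ch->req_lim, 1);
		return -1;
	}
	/* The receive buffer will be reposted, so one credit can go back. */
	atomic_fetch_add(&ch->req_lim_delta, 1);
	return 0;
}

/* Invoked when a response is sent: piggy-back the accumulated credits. */
static int model_send_rsp(struct model_ch *ch)
{
	int delta = atomic_exchange(&ch->req_lim_delta, 0);

	atomic_fetch_add(&ch->req_lim, delta);
	return delta;	/* value that would go into srp_rsp.req_lim_delta */
}

int main(void)
{
	struct model_ch ch;
	int i;

	atomic_init(&ch.req_lim, 4);
	atomic_init(&ch.req_lim_delta, 0);

	for (i = 0; i < 3; i++)
		assert(model_recv_iu(&ch) == 0);
	printf("credits left %d, delta pending %d\n",
	       atomic_load(&ch.req_lim), atomic_load(&ch.req_lim_delta));
	printf("delta returned with response: %d\n", model_send_rsp(&ch));
	printf("credits after response: %d\n", atomic_load(&ch.req_lim));
	return 0;
}

The model uses C11 atomics where the driver uses atomic_t; it is meant only to make the credit bookkeeping described in the kernel-doc concrete, not to reproduce the driver's actual request-handling paths.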