diff options
Diffstat (limited to 'drivers/infiniband/ulp')
33 files changed, 19683 insertions, 1996 deletions
diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile new file mode 100644 index 00000000000..f3c7dcf0309 --- /dev/null +++ b/drivers/infiniband/ulp/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_INFINIBAND_IPOIB) += ipoib/ +obj-$(CONFIG_INFINIBAND_SRP) += srp/ +obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ +obj-$(CONFIG_INFINIBAND_ISER) += iser/ +obj-$(CONFIG_INFINIBAND_ISERT) += isert/ diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig index 8d2e04cac68..cda8eac55ff 100644 --- a/drivers/infiniband/ulp/ipoib/Kconfig +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -1,32 +1,48 @@ config INFINIBAND_IPOIB tristate "IP-over-InfiniBand" - depends on INFINIBAND && NETDEVICES && INET + depends on NETDEVICES && INET && (IPV6 || IPV6=n) ---help--- Support for the IP-over-InfiniBand protocol (IPoIB). This transports IP packets over InfiniBand so you can use your IB device as a fancy NIC. - The IPoIB protocol is defined by the IETF ipoib working - group: <http://www.ietf.org/html.charters/ipoib-charter.html>. + See Documentation/infiniband/ipoib.txt for more information + +config INFINIBAND_IPOIB_CM + bool "IP-over-InfiniBand Connected Mode support" + depends on INFINIBAND_IPOIB + default n + ---help--- + This option enables support for IPoIB connected mode. After + enabling this option, you need to switch to connected mode + through /sys/class/net/ibXXX/mode to actually create + connections, and then increase the interface MTU with + e.g. ifconfig ib0 mtu 65520. + + WARNING: Enabling connected mode will trigger some packet + drops for multicast and UD mode traffic from this interface, + unless you limit mtu for these destinations to 2044. config INFINIBAND_IPOIB_DEBUG - bool "IP-over-InfiniBand debugging" + bool "IP-over-InfiniBand debugging" if EXPERT depends on INFINIBAND_IPOIB + default y ---help--- This option causes debugging code to be compiled into the IPoIB driver. The output can be turned on via the debug_level and mcast_debug_level module parameters (which can also be set after the driver is loaded through sysfs). - This option also creates an "ipoib_debugfs," which can be - mounted to expose debugging information about IB multicast - groups used by the IPoIB driver. + This option also creates a directory tree under ipoib/ in + debugfs, which contains files that expose debugging + information about IB multicast groups used by the IPoIB + driver. config INFINIBAND_IPOIB_DEBUG_DATA bool "IP-over-InfiniBand data path debugging" depends on INFINIBAND_IPOIB_DEBUG ---help--- - This option compiles debugging code into the the data path + This option compiles debugging code into the data path of the IPoIB driver. The output can be turned on via the data_debug_level module parameter; however, even with output turned off, this debugging code will have some performance diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile index 8935e74ae3f..e5430dd5076 100644 --- a/drivers/infiniband/ulp/ipoib/Makefile +++ b/drivers/infiniband/ulp/ipoib/Makefile @@ -4,6 +4,9 @@ ib_ipoib-y := ipoib_main.o \ ipoib_ib.o \ ipoib_multicast.o \ ipoib_verbs.o \ - ipoib_vlan.o + ipoib_vlan.o \ + ipoib_ethtool.o \ + ipoib_netlink.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 0095acc0fbb..c639f90cfda 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -30,8 +30,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib.h 1358 2004-12-17 22:00:11Z roland $ */ #ifndef _IPOIB_H @@ -41,52 +39,87 @@ #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/workqueue.h> -#include <linux/pci.h> -#include <linux/config.h> #include <linux/kref.h> #include <linux/if_infiniband.h> +#include <linux/mutex.h> #include <net/neighbour.h> +#include <net/sch_generic.h> -#include <asm/atomic.h> -#include <asm/semaphore.h> +#include <linux/atomic.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> #include <rdma/ib_sa.h> +#include <linux/sched.h> /* constants */ +enum ipoib_flush_level { + IPOIB_FLUSH_LIGHT, + IPOIB_FLUSH_NORMAL, + IPOIB_FLUSH_HEAVY +}; + enum { - IPOIB_PACKET_SIZE = 2048, - IPOIB_BUF_SIZE = IPOIB_PACKET_SIZE + IB_GRH_BYTES, + IPOIB_ENCAP_LEN = 4, - IPOIB_ENCAP_LEN = 4, + IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN, + IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */ - IPOIB_RX_RING_SIZE = 128, - IPOIB_TX_RING_SIZE = 64, + IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */ + IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, + IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, + IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, + IPOIB_RX_RING_SIZE = 256, + IPOIB_TX_RING_SIZE = 128, + IPOIB_MAX_QUEUE_SIZE = 8192, + IPOIB_MIN_QUEUE_SIZE = 2, + IPOIB_CM_MAX_CONN_QP = 4096, - IPOIB_NUM_WC = 4, + IPOIB_NUM_WC = 4, IPOIB_MAX_PATH_REC_QUEUE = 3, - IPOIB_MAX_MCAST_QUEUE = 3, - - IPOIB_FLAG_OPER_UP = 0, - IPOIB_FLAG_ADMIN_UP = 1, - IPOIB_PKEY_ASSIGNED = 2, - IPOIB_PKEY_STOP = 3, - IPOIB_FLAG_SUBINTERFACE = 4, - IPOIB_MCAST_RUN = 5, - IPOIB_STOP_REAPER = 6, + IPOIB_MAX_MCAST_QUEUE = 3, + + IPOIB_FLAG_OPER_UP = 0, + IPOIB_FLAG_INITIALIZED = 1, + IPOIB_FLAG_ADMIN_UP = 2, + IPOIB_PKEY_ASSIGNED = 3, + IPOIB_PKEY_STOP = 4, + IPOIB_FLAG_SUBINTERFACE = 5, + IPOIB_MCAST_RUN = 6, + IPOIB_STOP_REAPER = 7, + IPOIB_FLAG_ADMIN_CM = 9, + IPOIB_FLAG_UMCAST = 10, + IPOIB_STOP_NEIGH_GC = 11, + IPOIB_NEIGH_TBL_FLUSH = 12, IPOIB_MAX_BACKOFF_SECONDS = 16, - IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ + IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ IPOIB_MCAST_FLAG_SENDONLY = 1, - IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ + IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, + IPOIB_MCAST_JOIN_STARTED = 4, + + MAX_SEND_CQE = 16, + IPOIB_CM_COPYBREAK = 256, + + IPOIB_NON_CHILD = 0, + IPOIB_LEGACY_CHILD = 1, + IPOIB_RTNL_CHILD = 2, }; +#define IPOIB_OP_RECV (1ul << 31) +#ifdef CONFIG_INFINIBAND_IPOIB_CM +#define IPOIB_OP_CM (1ul << 30) +#else +#define IPOIB_OP_CM (0) +#endif + +#define IPOIB_QPN_MASK ((__force u32) cpu_to_be32(0xFFFFFF)) + /* structs */ struct ipoib_header { @@ -94,75 +127,230 @@ struct ipoib_header { u16 reserved; }; -struct ipoib_pseudoheader { - u8 hwaddr[INFINIBAND_ALEN]; +struct ipoib_cb { + struct qdisc_skb_cb qdisc_cb; + u8 hwaddr[INFINIBAND_ALEN]; }; -struct ipoib_mcast; +/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ +struct ipoib_mcast { + struct ib_sa_mcmember_rec mcmember; + struct ib_sa_multicast *mc; + struct ipoib_ah *ah; + + struct rb_node rb_node; + struct list_head list; + + unsigned long created; + unsigned long backoff; + + unsigned long flags; + unsigned char logcount; + + struct list_head neigh_list; + + struct sk_buff_head pkt_queue; + + struct net_device *dev; + struct completion done; +}; struct ipoib_rx_buf { struct sk_buff *skb; - dma_addr_t mapping; + u64 mapping[IPOIB_UD_RX_SG]; }; struct ipoib_tx_buf { struct sk_buff *skb; - DECLARE_PCI_UNMAP_ADDR(mapping) + u64 mapping[MAX_SKB_FRAGS + 1]; +}; + +struct ipoib_cm_tx_buf { + struct sk_buff *skb; + u64 mapping; +}; + +struct ib_cm_id; + +struct ipoib_cm_data { + __be32 qpn; /* High byte MUST be ignored on receive */ + __be32 mtu; }; /* - * Device private locking: tx_lock protects members used in TX fast - * path (and we use LLTX so upper layers don't do extra locking). - * lock protects everything else. lock nests inside of tx_lock (ie - * tx_lock must be acquired first if needed). + * Quoting 10.3.1 Queue Pair and EE Context States: + * + * Note, for QPs that are associated with an SRQ, the Consumer should take the + * QP through the Error State before invoking a Destroy QP or a Modify QP to the + * Reset State. The Consumer may invoke the Destroy QP without first performing + * a Modify QP to the Error State and waiting for the Affiliated Asynchronous + * Last WQE Reached Event. However, if the Consumer does not wait for the + * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment + * leakage may occur. Therefore, it is good programming practice to tear down a + * QP that is associated with an SRQ by using the following process: + * + * - Put the QP in the Error State + * - Wait for the Affiliated Asynchronous Last WQE Reached Event; + * - either: + * drain the CQ by invoking the Poll CQ verb and either wait for CQ + * to be empty or the number of Poll CQ operations has exceeded + * CQ capacity size; + * - or + * post another WR that completes on the same CQ and wait for this + * WR to return as a WC; + * - and then invoke a Destroy QP or Reset QP. + * + * We use the second option and wait for a completion on the + * same CQ before destroying QPs attached to our SRQ. + */ + +enum ipoib_cm_state { + IPOIB_CM_RX_LIVE, + IPOIB_CM_RX_ERROR, /* Ignored by stale task */ + IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */ +}; + +struct ipoib_cm_rx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct ipoib_cm_rx_buf *rx_ring; + struct list_head list; + struct net_device *dev; + unsigned long jiffies; + enum ipoib_cm_state state; + int recv_count; +}; + +struct ipoib_cm_tx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct list_head list; + struct net_device *dev; + struct ipoib_neigh *neigh; + struct ipoib_path *path; + struct ipoib_cm_tx_buf *tx_ring; + unsigned tx_head; + unsigned tx_tail; + unsigned long flags; + u32 mtu; +}; + +struct ipoib_cm_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_CM_RX_SG]; +}; + +struct ipoib_cm_dev_priv { + struct ib_srq *srq; + struct ipoib_cm_rx_buf *srq_ring; + struct ib_cm_id *id; + struct list_head passive_ids; /* state: LIVE */ + struct list_head rx_error_list; /* state: ERROR */ + struct list_head rx_flush_list; /* state: FLUSH, drain not started */ + struct list_head rx_drain_list; /* state: FLUSH, drain started */ + struct list_head rx_reap_list; /* state: FLUSH, drain done */ + struct work_struct start_task; + struct work_struct reap_task; + struct work_struct skb_task; + struct work_struct rx_reap_task; + struct delayed_work stale_task; + struct sk_buff_head skb_queue; + struct list_head start_list; + struct list_head reap_list; + struct ib_wc ibwc[IPOIB_NUM_WC]; + struct ib_sge rx_sge[IPOIB_CM_RX_SG]; + struct ib_recv_wr rx_wr; + int nonsrq_conn_qp; + int max_cm_mtu; + int num_frags; +}; + +struct ipoib_ethtool_st { + u16 coalesce_usecs; + u16 max_coalesced_frames; +}; + +struct ipoib_neigh_table; + +struct ipoib_neigh_hash { + struct ipoib_neigh_table *ntbl; + struct ipoib_neigh __rcu **buckets; + struct rcu_head rcu; + u32 mask; + u32 size; +}; + +struct ipoib_neigh_table { + struct ipoib_neigh_hash __rcu *htbl; + atomic_t entries; + struct completion flushed; + struct completion deleted; +}; + +/* + * Device private locking: network stack tx_lock protects members used + * in TX fast path, lock protects everything else. lock nests inside + * of tx_lock (ie tx_lock must be acquired first if needed). */ struct ipoib_dev_priv { spinlock_t lock; struct net_device *dev; + struct napi_struct napi; + unsigned long flags; - struct semaphore mcast_mutex; - struct semaphore vlan_mutex; + struct rw_semaphore vlan_rwsem; struct rb_root path_tree; struct list_head path_list; + struct ipoib_neigh_table ntbl; + struct ipoib_mcast *broadcast; struct list_head multicast_list; struct rb_root multicast_tree; - struct work_struct pkey_task; - struct work_struct mcast_task; - struct work_struct flush_task; + struct delayed_work pkey_poll_task; + struct delayed_work mcast_task; + struct work_struct carrier_on_task; + struct work_struct flush_light; + struct work_struct flush_normal; + struct work_struct flush_heavy; struct work_struct restart_task; - struct work_struct ah_reap_task; - + struct delayed_work ah_reap_task; + struct delayed_work neigh_reap_task; struct ib_device *ca; - u8 port; - u16 pkey; - struct ib_pd *pd; - struct ib_mr *mr; - struct ib_cq *cq; - struct ib_qp *qp; - u32 qkey; + u8 port; + u16 pkey; + u16 pkey_index; + struct ib_pd *pd; + struct ib_mr *mr; + struct ib_cq *recv_cq; + struct ib_cq *send_cq; + struct ib_qp *qp; + u32 qkey; union ib_gid local_gid; - u16 local_lid; - u8 local_rate; + u16 local_lid; unsigned int admin_mtu; unsigned int mcast_mtu; + unsigned int max_ib_mtu; struct ipoib_rx_buf *rx_ring; - spinlock_t tx_lock; struct ipoib_tx_buf *tx_ring; - unsigned tx_head; - unsigned tx_tail; - struct ib_sge tx_sge; + unsigned tx_head; + unsigned tx_tail; + struct ib_sge tx_sge[MAX_SKB_FRAGS + 1]; struct ib_send_wr tx_wr; + unsigned tx_outstanding; + struct ib_wc send_wc[MAX_SEND_CQE]; + + struct ib_recv_wr rx_wr; + struct ib_sge rx_sge[IPOIB_UD_RX_SG]; struct ib_wc ibwc[IPOIB_NUM_WC]; @@ -170,24 +358,31 @@ struct ipoib_dev_priv { struct ib_event_handler event_handler; - struct net_device_stats stats; - struct net_device *parent; struct list_head child_intfs; struct list_head list; + int child_type; + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_dev_priv cm; +#endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct list_head fs_list; struct dentry *mcg_dentry; + struct dentry *path_dentry; #endif + int hca_caps; + struct ipoib_ethtool_st ethtool; + struct timer_list poll_timer; }; struct ipoib_ah { struct net_device *dev; - struct ib_ah *ah; + struct ib_ah *ah; struct list_head list; - struct kref ref; - unsigned last_send; + struct kref ref; + unsigned last_send; }; struct ipoib_path { @@ -198,34 +393,59 @@ struct ipoib_path { struct list_head neigh_list; - int query_id; + int query_id; struct ib_sa_query *query; struct completion done; - struct rb_node rb_node; + struct rb_node rb_node; struct list_head list; + int valid; }; struct ipoib_neigh { struct ipoib_ah *ah; +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_tx *cm; +#endif + u8 daddr[INFINIBAND_ALEN]; struct sk_buff_head queue; - struct neighbour *neighbour; + struct net_device *dev; struct list_head list; + struct ipoib_neigh __rcu *hnext; + struct rcu_head rcu; + atomic_t refcnt; + unsigned long alive; }; -static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh) +#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) +#define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +static inline int ipoib_ud_need_sg(unsigned int ib_mtu) { - return (struct ipoib_neigh **) (neigh->ha + 24 - - (offsetof(struct neighbour, ha) & 4)); + return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; } +void ipoib_neigh_dtor(struct ipoib_neigh *neigh); +static inline void ipoib_neigh_put(struct ipoib_neigh *neigh) +{ + if (atomic_dec_and_test(&neigh->refcnt)) + ipoib_neigh_dtor(neigh); +} +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr); +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev); +void ipoib_neigh_free(struct ipoib_neigh *neigh); +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid); + extern struct workqueue_struct *ipoib_workqueue; /* functions */ +int ipoib_poll(struct napi_struct *napi, int budget); void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); +void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); struct ipoib_ah *ipoib_create_ah(struct net_device *dev, struct ib_pd *pd, struct ib_ah_attr *attr); @@ -234,34 +454,38 @@ static inline void ipoib_put_ah(struct ipoib_ah *ah) { kref_put(&ah->ref, ipoib_free_ah); } - int ipoib_open(struct net_device *dev); int ipoib_add_pkey_attr(struct net_device *dev); +int ipoib_add_umcast_attr(struct net_device *dev); void ipoib_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_ah *address, u32 qpn); -void ipoib_reap_ah(void *dev_ptr); +void ipoib_reap_ah(struct work_struct *work); +void ipoib_mark_paths_invalid(struct net_device *dev); void ipoib_flush_paths(struct net_device *dev); struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port); -void ipoib_ib_dev_flush(void *dev); +void ipoib_ib_dev_flush_light(struct work_struct *work); +void ipoib_ib_dev_flush_normal(struct work_struct *work); +void ipoib_ib_dev_flush_heavy(struct work_struct *work); +void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); int ipoib_ib_dev_open(struct net_device *dev); int ipoib_ib_dev_up(struct net_device *dev); -int ipoib_ib_dev_down(struct net_device *dev); -int ipoib_ib_dev_stop(struct net_device *dev); +int ipoib_ib_dev_down(struct net_device *dev, int flush); +int ipoib_ib_dev_stop(struct net_device *dev, int flush); int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); void ipoib_dev_cleanup(struct net_device *dev); -void ipoib_mcast_join_task(void *dev_ptr); -void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid, - struct sk_buff *skb); +void ipoib_mcast_join_task(struct work_struct *work); +void ipoib_mcast_carrier_on_task(struct work_struct *work); +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); -void ipoib_mcast_restart_task(void *dev_ptr); +void ipoib_mcast_restart_task(struct work_struct *work); int ipoib_mcast_start_thread(struct net_device *dev); int ipoib_mcast_stop_thread(struct net_device *dev, int flush); @@ -270,7 +494,6 @@ void ipoib_mcast_dev_flush(struct net_device *dev); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev); -void ipoib_mcast_iter_free(struct ipoib_mcast_iter *iter); int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, union ib_gid *gid, @@ -278,12 +501,15 @@ void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, unsigned int *queuelen, unsigned int *complete, unsigned int *send_only); + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev); +int ipoib_path_iter_next(struct ipoib_path_iter *iter); +void ipoib_path_iter_read(struct ipoib_path_iter *iter, + struct ipoib_path *path); #endif int ipoib_mcast_attach(struct net_device *dev, u16 mlid, - union ib_gid *mgid); -int ipoib_mcast_detach(struct net_device *dev, u16 mlid, - union ib_gid *mgid); + union ib_gid *mgid, int set_qkey); int ipoib_init_qp(struct net_device *dev); int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca); @@ -295,38 +521,226 @@ void ipoib_event(struct ib_event_handler *handler, int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey); int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); -void ipoib_pkey_poll(void *dev); +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int child_type); + +int __init ipoib_netlink_init(void); +void __exit ipoib_netlink_fini(void); + +void ipoib_set_umcast(struct net_device *ndev, int umcast_val); +int ipoib_set_mode(struct net_device *dev, const char *buf); + +void ipoib_setup(struct net_device *dev); + +void ipoib_pkey_poll(struct work_struct *work); int ipoib_pkey_dev_delay_open(struct net_device *dev); +void ipoib_drain_cq(struct net_device *dev); + +void ipoib_set_ethtool_ops(struct net_device *dev); +int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); + +#define IPOIB_FLAGS_RC 0x80 +#define IPOIB_FLAGS_UC 0x40 + +/* We don't support UC connections at the moment */ +#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + +extern int ipoib_max_conn_qp; + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(dev->dev_addr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(hwaddr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags); +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return neigh->cm; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ + neigh->cm = tx; +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return !!priv->cm.srq; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return priv->cm.max_cm_mtu; +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx); +int ipoib_cm_dev_open(struct net_device *dev); +void ipoib_cm_dev_stop(struct net_device *dev); +int ipoib_cm_dev_init(struct net_device *dev); +int ipoib_cm_add_mode_attr(struct net_device *dev); +void ipoib_cm_dev_cleanup(struct net_device *dev); +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh); +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); +void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu); +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc); +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc); +#else + +struct ipoib_cm_tx; + +#define ipoib_max_conn_qp 0 + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + return 0; +} +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) + +{ + return 0; +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return 0; +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + return 0; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_dev_open(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_dev_stop(struct net_device *dev) +{ + return; +} + +static inline +int ipoib_cm_dev_init(struct net_device *dev) +{ + return -ENOSYS; +} + +static inline +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + return; +} + +static inline +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return 0; +} + +static inline void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu) +{ + dev_kfree_skb_any(skb); +} + +static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} + +static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} +#endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG -int ipoib_create_debug_file(struct net_device *dev); -void ipoib_delete_debug_file(struct net_device *dev); +void ipoib_create_debug_files(struct net_device *dev); +void ipoib_delete_debug_files(struct net_device *dev); int ipoib_register_debugfs(void); void ipoib_unregister_debugfs(void); #else -static inline int ipoib_create_debug_file(struct net_device *dev) { return 0; } -static inline void ipoib_delete_debug_file(struct net_device *dev) { } +static inline void ipoib_create_debug_files(struct net_device *dev) { } +static inline void ipoib_delete_debug_files(struct net_device *dev) { } static inline int ipoib_register_debugfs(void) { return 0; } static inline void ipoib_unregister_debugfs(void) { } #endif - #define ipoib_printk(level, priv, format, arg...) \ printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg) #define ipoib_warn(priv, format, arg...) \ ipoib_printk(KERN_WARNING, priv, format , ## arg) +extern int ipoib_sendq_size; +extern int ipoib_recvq_size; + +extern struct ib_sa_client ipoib_sa_client; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG extern int ipoib_debug_level; #define ipoib_dbg(priv, format, arg...) \ - do { \ + do { \ if (ipoib_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) #define ipoib_dbg_mcast(priv, format, arg...) \ - do { \ + do { \ if (mcast_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) @@ -339,7 +753,7 @@ extern int ipoib_debug_level; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA #define ipoib_dbg_data(priv, format, arg...) \ - do { \ + do { \ if (data_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) @@ -348,16 +762,8 @@ extern int ipoib_debug_level; do { (void) (priv); } while (0) #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ +#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) -#define IPOIB_GID_FMT "%x:%x:%x:%x:%x:%x:%x:%x" - -#define IPOIB_GID_ARG(gid) be16_to_cpup((__be16 *) ((gid).raw + 0)), \ - be16_to_cpup((__be16 *) ((gid).raw + 2)), \ - be16_to_cpup((__be16 *) ((gid).raw + 4)), \ - be16_to_cpup((__be16 *) ((gid).raw + 6)), \ - be16_to_cpup((__be16 *) ((gid).raw + 8)), \ - be16_to_cpup((__be16 *) ((gid).raw + 10)), \ - be16_to_cpup((__be16 *) ((gid).raw + 12)), \ - be16_to_cpup((__be16 *) ((gid).raw + 14)) +extern const char ipoib_driver_version[]; #endif /* _IPOIB_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c new file mode 100644 index 00000000000..933efcea0d0 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -0,0 +1,1615 @@ +/* + * Copyright (c) 2006 Mellanox Technologies. All rights reserved + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_cm.h> +#include <net/dst.h> +#include <net/icmp.h> +#include <linux/icmpv6.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/moduleparam.h> + +#include "ipoib.h" + +int ipoib_max_conn_qp = 128; + +module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444); +MODULE_PARM_DESC(max_nonsrq_conn_qp, + "Max number of connected-mode QPs per interface " + "(applied only if shared receive queue is not available)"); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param_named(cm_data_debug_level, data_debug_level, int, 0644); +MODULE_PARM_DESC(cm_data_debug_level, + "Enable data path debug tracing for connected mode if > 0"); +#endif + +#define IPOIB_CM_IETF_ID 0x1000000000000000ULL + +#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ) +#define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ) +#define IPOIB_CM_RX_DELAY (3 * 256 * HZ) +#define IPOIB_CM_RX_UPDATE_MASK (0x3) + +static struct ib_qp_attr ipoib_cm_err_attr = { + .qp_state = IB_QPS_ERR +}; + +#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff + +static struct ib_send_wr ipoib_cm_rx_drain_wr = { + .wr_id = IPOIB_CM_RX_DRAIN_WRID, + .opcode = IB_WR_SEND, +}; + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event); + +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, + u64 mapping[IPOIB_CM_RX_SG]) +{ + int i; + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (i = 0; i < frags; ++i) + ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); +} + +static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < priv->cm.num_frags; ++i) + priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; + + ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1, + priv->cm.srq_ring[id].mapping); + dev_kfree_skb_any(priv->cm.srq_ring[id].skb); + priv->cm.srq_ring[id].skb = NULL; + } + + return ret; +} + +static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, + struct ipoib_cm_rx *rx, + struct ib_recv_wr *wr, + struct ib_sge *sge, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < IPOIB_CM_RX_SG; ++i) + sge[i].addr = rx->rx_ring[id].mapping[i]; + + ret = ib_post_recv(rx->qp, wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx->rx_ring[id].mapping); + dev_kfree_skb_any(rx->rx_ring[id].skb); + rx->rx_ring[id].skb = NULL; + } + + return ret; +} + +static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring, + int id, int frags, + u64 mapping[IPOIB_CM_RX_SG], + gfp_t gfp) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + int i; + + skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); + if (unlikely(!skb)) + return NULL; + + /* + * IPoIB adds a 4 byte header. So we need 12 more bytes to align the + * IP header to a multiple of 16. + */ + skb_reserve(skb, 12); + + mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) { + dev_kfree_skb_any(skb); + return NULL; + } + + for (i = 0; i < frags; i++) { + struct page *page = alloc_page(gfp); + + if (!page) + goto partial_error; + skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE); + + mapping[i + 1] = ib_dma_map_page(priv->ca, page, + 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1]))) + goto partial_error; + } + + rx_ring[id].skb = skb; + return skb; + +partial_error: + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (; i > 0; --i) + ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); + + dev_kfree_skb_any(skb); + return NULL; +} + +static void ipoib_cm_free_rx_ring(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) + if (rx_ring[i].skb) { + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx_ring[i].mapping); + dev_kfree_skb_any(rx_ring[i].skb); + } + + vfree(rx_ring); +} + +static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) +{ + struct ib_send_wr *bad_wr; + struct ipoib_cm_rx *p; + + /* We only reserved 1 extra slot in CQ for drain WRs, so + * make sure we have at most 1 outstanding WR. */ + if (list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) + return; + + /* + * QPs on flush list are error state. This way, a "flush + * error" WC will be immediately generated for each WR we post. + */ + p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); + if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) + ipoib_warn(priv, "failed to post drain wr\n"); + + list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); +} + +static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) +{ + struct ipoib_cm_rx *p = ctx; + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + unsigned long flags; + + if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) + return; + + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_flush_list); + p->state = IPOIB_CM_RX_FLUSH; + ipoib_cm_start_rx_drain(priv); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, + struct ipoib_cm_rx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = { + .event_handler = ipoib_cm_rx_event_handler, + .send_cq = priv->recv_cq, /* For drain WR */ + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = 1, /* For drain WR */ + .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */ + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = p, + }; + + if (!ipoib_cm_has_srq(dev)) { + attr.cap.max_recv_wr = ipoib_recvq_size; + attr.cap.max_recv_sge = IPOIB_CM_RX_SG; + } + + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_modify_rx_qp(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret); + return ret; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret); + return ret; + } + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + qp_attr.rq_psn = psn; + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + /* + * Current Mellanox HCA firmware won't generate completions + * with error for drain WRs unless the QP has been moved to + * RTS first. This work-around leaves a window where a QP has + * moved to error asynchronously, but this will eventually get + * fixed in firmware, so let's not error out if modify QP + * fails. + */ + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return 0; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return 0; + } + + return 0; +} + +static void ipoib_cm_init_rx_wr(struct net_device *dev, + struct ib_recv_wr *wr, + struct ib_sge *sge) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < priv->cm.num_frags; ++i) + sge[i].lkey = priv->mr->lkey; + + sge[0].length = IPOIB_CM_HEAD_SIZE; + for (i = 1; i < priv->cm.num_frags; ++i) + sge[i].length = PAGE_SIZE; + + wr->next = NULL; + wr->sg_list = sge; + wr->num_sge = priv->cm.num_frags; +} + +static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id, + struct ipoib_cm_rx *rx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct { + struct ib_recv_wr wr; + struct ib_sge sge[IPOIB_CM_RX_SG]; + } *t; + int ret; + int i; + + rx->rx_ring = vzalloc(ipoib_recvq_size * sizeof *rx->rx_ring); + if (!rx->rx_ring) { + printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n", + priv->ca->name, ipoib_recvq_size); + return -ENOMEM; + } + + t = kmalloc(sizeof *t, GFP_KERNEL); + if (!t) { + ret = -ENOMEM; + goto err_free; + } + + ipoib_cm_init_rx_wr(dev, &t->wr, t->sge); + + spin_lock_irq(&priv->lock); + + if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) { + spin_unlock_irq(&priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0); + ret = -EINVAL; + goto err_free; + } else + ++priv->cm.nonsrq_conn_qp; + + spin_unlock_irq(&priv->lock); + + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, + rx->rx_ring[i].mapping, + GFP_KERNEL)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + ret = -ENOMEM; + goto err_count; + } + ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i); + if (ret) { + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq " + "failed for buf %d\n", i); + ret = -EIO; + goto err_count; + } + } + + rx->recv_count = ipoib_recvq_size; + + kfree(t); + + return 0; + +err_count: + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + +err_free: + kfree(t); + ipoib_cm_free_rx_ring(dev, rx->rx_ring); + + return ret; +} + +static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, + struct ib_qp *qp, struct ib_cm_req_event_param *req, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_rep_param rep = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + rep.private_data = &data; + rep.private_data_len = sizeof data; + rep.flow_control = 0; + rep.rnr_retry_count = req->rnr_retry_count; + rep.srq = ipoib_cm_has_srq(dev); + rep.qp_num = qp->qp_num; + rep.starting_psn = psn; + return ib_send_cm_rep(cm_id, &rep); +} + +static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct net_device *dev = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned psn; + int ret; + + ipoib_dbg(priv, "REQ arrived\n"); + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + p->dev = dev; + p->id = cm_id; + cm_id->context = p; + p->state = IPOIB_CM_RX_LIVE; + p->jiffies = jiffies; + INIT_LIST_HEAD(&p->list); + + p->qp = ipoib_cm_create_rx_qp(dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + goto err_qp; + } + + psn = prandom_u32() & 0xffffff; + ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); + if (ret) + goto err_modify; + + if (!ipoib_cm_has_srq(dev)) { + ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p); + if (ret) + goto err_modify; + } + + spin_lock_irq(&priv->lock); + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + /* Add this entry to passive ids list head, but do not re-add it + * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ + p->jiffies = jiffies; + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irq(&priv->lock); + + ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn); + if (ret) { + ipoib_warn(priv, "failed to send REP: %d\n", ret); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + } + return 0; + +err_modify: + ib_destroy_qp(p->qp); +err_qp: + kfree(p); + return ret; +} + +static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_rx *p; + struct ipoib_dev_priv *priv; + + switch (event->event) { + case IB_CM_REQ_RECEIVED: + return ipoib_cm_req_handler(cm_id, event); + case IB_CM_DREQ_RECEIVED: + p = cm_id->context; + ib_send_cm_drep(cm_id, NULL, 0); + /* Fall through */ + case IB_CM_REJ_RECEIVED: + p = cm_id->context; + priv = netdev_priv(p->dev); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + /* Fall through */ + default: + return 0; + } +} +/* Adjust length of skb with fragments to match received data */ +static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, + unsigned int length, struct sk_buff *toskb) +{ + int i, num_frags; + unsigned int size; + + /* put header into skb */ + size = min(length, hdr_space); + skb->tail += size; + skb->len += size; + length -= size; + + num_frags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < num_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (length == 0) { + /* don't need this page */ + skb_fill_page_desc(toskb, i, skb_frag_page(frag), + 0, PAGE_SIZE); + --skb_shinfo(skb)->nr_frags; + } else { + size = min(length, (unsigned) PAGE_SIZE); + + skb_frag_size_set(frag, size); + skb->data_len += size; + skb->truesize += size; + skb->len += size; + length -= size; + } + } +} + +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx_buf *rx_ring; + unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); + struct sk_buff *skb, *newskb; + struct ipoib_cm_rx *p; + unsigned long flags; + u64 mapping[IPOIB_CM_RX_SG]; + int frags; + int has_srq; + struct sk_buff *small_skb; + + ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) { + spin_lock_irqsave(&priv->lock, flags); + list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); + ipoib_cm_start_rx_drain(priv); + queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + spin_unlock_irqrestore(&priv->lock, flags); + } else + ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + p = wc->qp->qp_context; + + has_srq = ipoib_cm_has_srq(dev); + rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring; + + skb = rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + ipoib_dbg(priv, "cm recv error " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + ++dev->stats.rx_dropped; + if (has_srq) + goto repost; + else { + if (!--p->recv_count) { + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_reap_list); + spin_unlock_irqrestore(&priv->lock, flags); + queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + } + return; + } + } + + if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) { + if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { + spin_lock_irqsave(&priv->lock, flags); + p->jiffies = jiffies; + /* Move this entry to list head, but do not re-add it + * if it has been moved out of list. */ + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irqrestore(&priv->lock, flags); + } + } + + if (wc->byte_len < IPOIB_CM_COPYBREAK) { + int dlen = wc->byte_len; + + small_skb = dev_alloc_skb(dlen + 12); + if (small_skb) { + skb_reserve(small_skb, 12); + ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_copy_from_linear_data(skb, small_skb->data, dlen); + ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_put(small_skb, dlen); + skb = small_skb; + goto copied; + } + } + + frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, + (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; + + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, + mapping, GFP_ATOMIC); + if (unlikely(!newskb)) { + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id); + ++dev->stats.rx_dropped; + goto repost; + } + + ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping); + memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping); + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb); + +copied: + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_reset_mac_header(skb); + skb_pull(skb, IPOIB_ENCAP_LEN); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; + + skb->dev = dev; + /* XXX get correct PACKET_ type here */ + skb->pkt_type = PACKET_HOST; + netif_receive_skb(skb); + +repost: + if (has_srq) { + if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id))) + ipoib_warn(priv, "ipoib_cm_post_receive_srq failed " + "for buf %d\n", wr_id); + } else { + if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, + &priv->cm.rx_wr, + priv->cm.rx_sge, + wr_id))) { + --p->recv_count; + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed " + "for buf %d\n", wr_id); + } + } +} + +static inline int post_send(struct ipoib_dev_priv *priv, + struct ipoib_cm_tx *tx, + unsigned int wr_id, + u64 addr, int len) +{ + struct ib_send_wr *bad_wr; + + priv->tx_sge[0].addr = addr; + priv->tx_sge[0].length = len; + + priv->tx_wr.num_sge = 1; + priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; + + return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx_buf *tx_req; + u64 addr; + int rc; + + if (unlikely(skb->len > tx->mtu)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, tx->mtu); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN); + return; + } + + ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", + tx->tx_head, skb->len, tx->qp->qp_num); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). + */ + tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; + tx_req->skb = skb; + addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, addr))) { + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + + tx_req->mapping = addr; + + skb_orphan(skb); + skb_dst_drop(skb); + + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), + addr, skb->len); + if (unlikely(rc)) { + ipoib_warn(priv, "post_send failed, error %d\n", rc); + ++dev->stats.tx_errors; + ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); + dev_kfree_skb_any(skb); + } else { + dev->trans_start = jiffies; + ++tx->tx_head; + + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", + tx->qp->qp_num); + netif_stop_queue(dev); + rc = ib_req_notify_cq(priv->send_cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc < 0) + ipoib_warn(priv, "request notify on send CQ failed\n"); + else if (rc) + ipoib_send_comp_handler(priv->send_cq, dev); + } + } +} + +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx *tx = wc->qp->qp_context; + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; + struct ipoib_cm_tx_buf *tx_req; + unsigned long flags; + + ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } + + tx_req = &tx->tx_ring[wr_id]; + + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); + + /* FIXME: is this right? Shouldn't we only increment on success? */ + ++dev->stats.tx_packets; + dev->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + netif_tx_lock(dev); + + ++tx->tx_tail; + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(dev); + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_neigh *neigh; + + ipoib_dbg(priv, "failed cm send event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + + spin_lock_irqsave(&priv->lock, flags); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + } + + clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); + + spin_unlock_irqrestore(&priv->lock, flags); + } + + netif_tx_unlock(dev); +} + +int ipoib_cm_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) + return 0; + + priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev); + if (IS_ERR(priv->cm.id)) { + printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); + ret = PTR_ERR(priv->cm.id); + goto err_cm; + } + + ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), + 0, NULL); + if (ret) { + printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, + IPOIB_CM_IETF_ID | priv->qp->qp_num); + goto err_listen; + } + + return 0; + +err_listen: + ib_destroy_cm_id(priv->cm.id); +err_cm: + priv->cm.id = NULL; + return ret; +} + +static void ipoib_cm_free_rx_reap_list(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *rx, *n; + LIST_HEAD(list); + + spin_lock_irq(&priv->lock); + list_splice_init(&priv->cm.rx_reap_list, &list); + spin_unlock_irq(&priv->lock); + + list_for_each_entry_safe(rx, n, &list, list) { + ib_destroy_cm_id(rx->id); + ib_destroy_qp(rx->qp); + if (!ipoib_cm_has_srq(dev)) { + ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring); + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + } + kfree(rx); + } +} + +void ipoib_cm_dev_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned long begin; + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id) + return; + + ib_destroy_cm_id(priv->cm.id); + priv->cm.id = NULL; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + /* Wait for all RX to be drained */ + begin = jiffies; + + while (!list_empty(&priv->cm.rx_error_list) || + !list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "RX drain timing out\n"); + + /* + * assume the HW is wedged and just free up everything. + */ + list_splice_init(&priv->cm.rx_flush_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_error_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_drain_list, + &priv->cm.rx_reap_list); + break; + } + spin_unlock_irq(&priv->lock); + msleep(1); + ipoib_drain_cq(dev); + spin_lock_irq(&priv->lock); + } + + spin_unlock_irq(&priv->lock); + + ipoib_cm_free_rx_reap_list(dev); + + cancel_delayed_work(&priv->cm.stale_task); +} + +static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct ipoib_cm_tx *p = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_cm_data *data = event->private_data; + struct sk_buff_head skqueue; + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + struct sk_buff *skb; + + p->mtu = be32_to_cpu(data->mtu); + + if (p->mtu <= IPOIB_ENCAP_LEN) { + ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n", + p->mtu, IPOIB_ENCAP_LEN); + return -EINVAL; + } + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + + qp_attr.rq_psn = 0 /* FIXME */; + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return ret; + } + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return ret; + } + + skb_queue_head_init(&skqueue); + + spin_lock_irq(&priv->lock); + set_bit(IPOIB_FLAG_OPER_UP, &p->flags); + if (p->neigh) + while ((skb = __skb_dequeue(&p->neigh->queue))) + __skb_queue_tail(&skqueue, skb); + spin_unlock_irq(&priv->lock); + + while ((skb = __skb_dequeue(&skqueue))) { + skb->dev = p->dev; + if (dev_queue_xmit(skb)) + ipoib_warn(priv, "dev_queue_xmit failed " + "to requeue packet\n"); + } + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + if (ret) { + ipoib_warn(priv, "failed to send RTU: %d\n", ret); + return ret; + } + return 0; +} + +static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = { + .send_cq = priv->recv_cq, + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = ipoib_sendq_size, + .cap.max_send_sge = 1, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = tx, + .create_flags = IB_QP_CREATE_USE_GFP_NOIO + }; + + struct ib_qp *tx_qp; + + tx_qp = ib_create_qp(priv->pd, &attr); + if (PTR_ERR(tx_qp) == -EINVAL) { + ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n", + priv->ca->name); + attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; + tx_qp = ib_create_qp(priv->pd, &attr); + } + return tx_qp; +} + +static int ipoib_cm_send_req(struct net_device *dev, + struct ib_cm_id *id, struct ib_qp *qp, + u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_req_param req = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + req.primary_path = pathrec; + req.alternate_path = NULL; + req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn); + req.qp_num = qp->qp_num; + req.qp_type = qp->qp_type; + req.private_data = &data; + req.private_data_len = sizeof data; + req.flow_control = 0; + + req.starting_psn = 0; /* FIXME */ + + /* + * Pick some arbitrary defaults here; we could make these + * module parameters if anyone cared about setting them. + */ + req.responder_resources = 4; + req.remote_cm_response_timeout = 20; + req.local_cm_response_timeout = 20; + req.retry_count = 0; /* RFC draft warns against retries */ + req.rnr_retry_count = 0; /* RFC draft warns against retries */ + req.max_cm_retries = 15; + req.srq = ipoib_cm_has_srq(dev); + return ib_send_cm_req(id, &req); +} + +static int ipoib_cm_modify_tx_init(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index); + if (ret) { + ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE; + qp_attr.port_num = priv->port; + qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret); + return ret; + } + return 0; +} + +static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + int ret; + + p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring, + GFP_NOIO, PAGE_KERNEL); + if (!p->tx_ring) { + ipoib_warn(priv, "failed to allocate tx ring\n"); + ret = -ENOMEM; + goto err_tx; + } + memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); + + p->qp = ipoib_cm_create_tx_qp(p->dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret); + goto err_qp; + } + + p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p); + if (IS_ERR(p->id)) { + ret = PTR_ERR(p->id); + ipoib_warn(priv, "failed to create tx cm id: %d\n", ret); + goto err_id; + } + + ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp); + if (ret) { + ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); + goto err_modify; + } + + ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec); + if (ret) { + ipoib_warn(priv, "failed to send cm req: %d\n", ret); + goto err_send_cm; + } + + ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n", + p->qp->qp_num, pathrec->dgid.raw, qpn); + + return 0; + +err_send_cm: +err_modify: + ib_destroy_cm_id(p->id); +err_id: + p->id = NULL; + ib_destroy_qp(p->qp); +err_qp: + p->qp = NULL; + vfree(p->tx_ring); +err_tx: + return ret; +} + +static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_cm_tx_buf *tx_req; + unsigned long begin; + + ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", + p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail); + + if (p->id) + ib_destroy_cm_id(p->id); + + if (p->tx_ring) { + /* Wait for all sends to complete */ + begin = jiffies; + while ((int) p->tx_tail - (int) p->tx_head < 0) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "timing out; %d sends not completed\n", + p->tx_head - p->tx_tail); + goto timeout; + } + + msleep(1); + } + } + +timeout: + + while ((int) p->tx_tail - (int) p->tx_head < 0) { + tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, + DMA_TO_DEVICE); + dev_kfree_skb_any(tx_req->skb); + ++p->tx_tail; + netif_tx_lock_bh(p->dev); + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(p->dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(p->dev); + netif_tx_unlock_bh(p->dev); + } + + if (p->qp) + ib_destroy_qp(p->qp); + + vfree(p->tx_ring); + kfree(p); +} + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_tx *tx = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + unsigned long flags; + int ret; + + switch (event->event) { + case IB_CM_DREQ_RECEIVED: + ipoib_dbg(priv, "DREQ received.\n"); + ib_send_cm_drep(cm_id, NULL, 0); + break; + case IB_CM_REP_RECEIVED: + ipoib_dbg(priv, "REP received.\n"); + ret = ipoib_cm_rep_handler(cm_id, event); + if (ret) + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + break; + case IB_CM_REQ_ERROR: + case IB_CM_REJ_RECEIVED: + case IB_CM_TIMEWAIT_EXIT: + ipoib_dbg(priv, "CM error %d.\n", event->event); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + break; + default: + break; + } + + return 0; +} + +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx *tx; + + tx = kzalloc(sizeof *tx, GFP_ATOMIC); + if (!tx) + return NULL; + + neigh->cm = tx; + tx->neigh = neigh; + tx->path = path; + tx->dev = dev; + list_add(&tx->list, &priv->cm.start_list); + set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); + queue_work(ipoib_workqueue, &priv->cm.start_task); + return tx; +} + +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + unsigned long flags; + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + spin_lock_irqsave(&priv->lock, flags); + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + ipoib_dbg(priv, "Reap connection for gid %pI6\n", + tx->neigh->daddr + 4); + tx->neigh = NULL; + spin_unlock_irqrestore(&priv->lock, flags); + } +} + +static void ipoib_cm_tx_start(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.start_task); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + struct ipoib_cm_tx *p; + unsigned long flags; + int ret; + + struct ib_sa_path_rec pathrec; + u32 qpn; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while (!list_empty(&priv->cm.start_list)) { + p = list_entry(priv->cm.start_list.next, typeof(*p), list); + list_del_init(&p->list); + neigh = p->neigh; + qpn = IPOIB_QPN(neigh->daddr); + memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + + ret = ipoib_cm_tx_init(p, qpn, &pathrec); + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + if (ret) { + neigh = p->neigh; + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + } + list_del(&p->list); + kfree(p); + } + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void ipoib_cm_tx_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.reap_task); + struct net_device *dev = priv->dev; + struct ipoib_cm_tx *p; + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while (!list_empty(&priv->cm.reap_list)) { + p = list_entry(priv->cm.reap_list.next, typeof(*p), list); + list_del(&p->list); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + ipoib_cm_tx_destroy(p); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void ipoib_cm_skb_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.skb_task); + struct net_device *dev = priv->dev; + struct sk_buff *skb; + unsigned long flags; + unsigned mtu = priv->mcast_mtu; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while ((skb = skb_dequeue(&priv->cm.skb_queue))) { + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + + if (skb->protocol == htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); +#if IS_ENABLED(CONFIG_IPV6) + else if (skb->protocol == htons(ETH_P_IPV6)) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); +#endif + dev_kfree_skb_any(skb); + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int e = skb_queue_empty(&priv->cm.skb_queue); + + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + + skb_queue_tail(&priv->cm.skb_queue, skb); + if (e) + queue_work(ipoib_workqueue, &priv->cm.skb_task); +} + +static void ipoib_cm_rx_reap(struct work_struct *work) +{ + ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv, + cm.rx_reap_task)->dev); +} + +static void ipoib_cm_stale_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.stale_task.work); + struct ipoib_cm_rx *p; + int ret; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + /* List is sorted by LRU, start from tail, + * stop when we see a recently used entry */ + p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list); + if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT)) + break; + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + if (!list_empty(&priv->cm.passive_ids)) + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + spin_unlock_irq(&priv->lock); +} + + +static ssize_t show_mode(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d)); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + return sprintf(buf, "connected\n"); + else + return sprintf(buf, "datagram\n"); +} + +static ssize_t set_mode(struct device *d, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct net_device *dev = to_net_dev(d); + int ret; + + if (!rtnl_trylock()) + return restart_syscall(); + + ret = ipoib_set_mode(dev, buf); + + rtnl_unlock(); + + if (!ret) + return count; + + return ret; +} + +static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode); + +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_mode); +} + +static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_srq_init_attr srq_init_attr = { + .srq_type = IB_SRQT_BASIC, + .attr = { + .max_wr = ipoib_recvq_size, + .max_sge = max_sge + } + }; + + priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); + if (IS_ERR(priv->cm.srq)) { + if (PTR_ERR(priv->cm.srq) != -ENOSYS) + printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n", + priv->ca->name, PTR_ERR(priv->cm.srq)); + priv->cm.srq = NULL; + return; + } + + priv->cm.srq_ring = vzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring); + if (!priv->cm.srq_ring) { + printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n", + priv->ca->name, ipoib_recvq_size); + ib_destroy_srq(priv->cm.srq); + priv->cm.srq = NULL; + return; + } + +} + +int ipoib_cm_dev_init(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i, ret; + struct ib_device_attr attr; + + INIT_LIST_HEAD(&priv->cm.passive_ids); + INIT_LIST_HEAD(&priv->cm.reap_list); + INIT_LIST_HEAD(&priv->cm.start_list); + INIT_LIST_HEAD(&priv->cm.rx_error_list); + INIT_LIST_HEAD(&priv->cm.rx_flush_list); + INIT_LIST_HEAD(&priv->cm.rx_drain_list); + INIT_LIST_HEAD(&priv->cm.rx_reap_list); + INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); + INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); + INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap); + INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap); + INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); + + skb_queue_head_init(&priv->cm.skb_queue); + + ret = ib_query_device(priv->ca, &attr); + if (ret) { + printk(KERN_WARNING "ib_query_device() failed with %d\n", ret); + return ret; + } + + ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge); + + attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge); + ipoib_cm_create_srq(dev, attr.max_srq_sge); + if (ipoib_cm_has_srq(dev)) { + priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10; + priv->cm.num_frags = attr.max_srq_sge; + ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", + priv->cm.max_cm_mtu, priv->cm.num_frags); + } else { + priv->cm.max_cm_mtu = IPOIB_CM_MTU; + priv->cm.num_frags = IPOIB_CM_RX_SG; + } + + ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge); + + if (ipoib_cm_has_srq(dev)) { + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, + priv->cm.num_frags - 1, + priv->cm.srq_ring[i].mapping, + GFP_KERNEL)) { + ipoib_warn(priv, "failed to allocate " + "receive buffer %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -ENOMEM; + } + + if (ipoib_cm_post_receive_srq(dev, i)) { + ipoib_warn(priv, "ipoib_cm_post_receive_srq " + "failed for buf %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -EIO; + } + } + } + + priv->dev->dev_addr[0] = IPOIB_FLAGS_RC; + return 0; +} + +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (!priv->cm.srq) + return; + + ipoib_dbg(priv, "Cleanup ipoib connected mode.\n"); + + ret = ib_destroy_srq(priv->cm.srq); + if (ret) + ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret); + + priv->cm.srq = NULL; + if (!priv->cm.srq_ring) + return; + + ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring); + priv->cm.srq_ring = NULL; +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c new file mode 100644 index 00000000000..078cadd6c79 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/kernel.h> +#include <linux/ethtool.h> +#include <linux/netdevice.h> + +#include "ipoib.h" + +static void ipoib_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + struct ipoib_dev_priv *priv = netdev_priv(netdev); + struct ib_device_attr *attr; + + attr = kmalloc(sizeof(*attr), GFP_KERNEL); + if (attr && !ib_query_device(priv->ca, attr)) + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", (int)(attr->fw_ver >> 32), + (int)(attr->fw_ver >> 16) & 0xffff, + (int)attr->fw_ver & 0xffff); + kfree(attr); + + strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device), + sizeof(drvinfo->bus_info)); + + strlcpy(drvinfo->version, ipoib_driver_version, + sizeof(drvinfo->version)); + + strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver)); +} + +static int ipoib_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; + coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames; + + return 0; +} + +static int ipoib_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + /* + * These values are saved in the private data and returned + * when ipoib_get_coalesce() is called + */ + if (coal->rx_coalesce_usecs > 0xffff || + coal->rx_max_coalesced_frames > 0xffff) + return -EINVAL; + + ret = ib_modify_cq(priv->recv_cq, coal->rx_max_coalesced_frames, + coal->rx_coalesce_usecs); + if (ret && ret != -ENOSYS) { + ipoib_warn(priv, "failed modifying CQ (%d)\n", ret); + return ret; + } + + priv->ethtool.coalesce_usecs = coal->rx_coalesce_usecs; + priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames; + + return 0; +} + +static const struct ethtool_ops ipoib_ethtool_ops = { + .get_drvinfo = ipoib_get_drvinfo, + .get_coalesce = ipoib_get_coalesce, + .set_coalesce = ipoib_set_coalesce, +}; + +void ipoib_set_ethtool_ops(struct net_device *dev) +{ + dev->ethtool_ops = &ipoib_ethtool_ops; +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index 38b150f775e..50061854616 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -28,21 +28,33 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib_fs.c 1389 2004-12-27 22:56:47Z roland $ */ #include <linux/err.h> #include <linux/seq_file.h> +#include <linux/slab.h> struct file_operations; #include <linux/debugfs.h> +#include <linux/export.h> #include "ipoib.h" static struct dentry *ipoib_root; +static void format_gid(union ib_gid *gid, char *buf) +{ + int i, n; + + for (n = 0, i = 0; i < 8; ++i) { + n += sprintf(buf + n, "%x", + be16_to_cpu(((__be16 *) gid->raw)[i])); + if (i < 7) + buf[n++] = ':'; + } +} + static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos) { struct ipoib_mcast_iter *iter; @@ -54,7 +66,7 @@ static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos) while (n--) { if (ipoib_mcast_iter_next(iter)) { - ipoib_mcast_iter_free(iter); + kfree(iter); return NULL; } } @@ -70,7 +82,7 @@ static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr, (*pos)++; if (ipoib_mcast_iter_next(iter)) { - ipoib_mcast_iter_free(iter); + kfree(iter); return NULL; } @@ -87,32 +99,32 @@ static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr) struct ipoib_mcast_iter *iter = iter_ptr; char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; union ib_gid mgid; - int i, n; unsigned long created; unsigned int queuelen, complete, send_only; - if (iter) { - ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen, - &complete, &send_only); + if (!iter) + return 0; - for (n = 0, i = 0; i < sizeof mgid / 2; ++i) { - n += sprintf(gid_buf + n, "%x", - be16_to_cpu(((__be16 *) mgid.raw)[i])); - if (i < sizeof mgid / 2 - 1) - gid_buf[n++] = ':'; - } - } + ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen, + &complete, &send_only); - seq_printf(file, "GID: %*s", -(1 + (int) sizeof gid_buf), gid_buf); + format_gid(&mgid, gid_buf); seq_printf(file, - " created: %10ld queuelen: %4d complete: %d send_only: %d\n", - created, queuelen, complete, send_only); + "GID: %s\n" + " created: %10ld\n" + " queuelen: %9d\n" + " complete: %9s\n" + " send_only: %8s\n" + "\n", + gid_buf, created, queuelen, + complete ? "yes" : "no", + send_only ? "yes" : "no"); return 0; } -static struct seq_operations ipoib_seq_ops = { +static const struct seq_operations ipoib_mcg_seq_ops = { .start = ipoib_mcg_seq_start, .next = ipoib_mcg_seq_next, .stop = ipoib_mcg_seq_stop, @@ -124,17 +136,17 @@ static int ipoib_mcg_open(struct inode *inode, struct file *file) struct seq_file *seq; int ret; - ret = seq_open(file, &ipoib_seq_ops); + ret = seq_open(file, &ipoib_mcg_seq_ops); if (ret) return ret; seq = file->private_data; - seq->private = inode->u.generic_ip; + seq->private = inode->i_private; return 0; } -static struct file_operations ipoib_fops = { +static const struct file_operations ipoib_mcg_fops = { .owner = THIS_MODULE, .open = ipoib_mcg_open, .read = seq_read, @@ -142,25 +154,137 @@ static struct file_operations ipoib_fops = { .release = seq_release }; -int ipoib_create_debug_file(struct net_device *dev) +static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos) +{ + struct ipoib_path_iter *iter; + loff_t n = *pos; + + iter = ipoib_path_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *ipoib_path_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct ipoib_path_iter *iter = iter_ptr; + + (*pos)++; + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void ipoib_path_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct ipoib_path_iter *iter = iter_ptr; + char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + struct ipoib_path path; + int rate; + + if (!iter) + return 0; + + ipoib_path_iter_read(iter, &path); + + format_gid(&path.pathrec.dgid, gid_buf); + + seq_printf(file, + "GID: %s\n" + " complete: %6s\n", + gid_buf, path.pathrec.dlid ? "yes" : "no"); + + if (path.pathrec.dlid) { + rate = ib_rate_to_mbps(path.pathrec.rate); + + seq_printf(file, + " DLID: 0x%04x\n" + " SL: %12d\n" + " rate: %8d.%d Gb/sec\n", + be16_to_cpu(path.pathrec.dlid), + path.pathrec.sl, + rate / 1000, rate % 1000); + } + + seq_putc(file, '\n'); + + return 0; +} + +static const struct seq_operations ipoib_path_seq_ops = { + .start = ipoib_path_seq_start, + .next = ipoib_path_seq_next, + .stop = ipoib_path_seq_stop, + .show = ipoib_path_seq_show, +}; + +static int ipoib_path_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &ipoib_path_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations ipoib_path_fops = { + .owner = THIS_MODULE, + .open = ipoib_path_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +void ipoib_create_debug_files(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - char name[IFNAMSIZ + sizeof "_mcg"]; + char name[IFNAMSIZ + sizeof "_path"]; snprintf(name, sizeof name, "%s_mcg", dev->name); - priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, - ipoib_root, dev, &ipoib_fops); - - return priv->mcg_dentry ? 0 : -ENOMEM; + ipoib_root, dev, &ipoib_mcg_fops); + if (!priv->mcg_dentry) + ipoib_warn(priv, "failed to create mcg debug file\n"); + + snprintf(name, sizeof name, "%s_path", dev->name); + priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, + ipoib_root, dev, &ipoib_path_fops); + if (!priv->path_dentry) + ipoib_warn(priv, "failed to create path debug file\n"); } -void ipoib_delete_debug_file(struct net_device *dev) +void ipoib_delete_debug_files(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); if (priv->mcg_dentry) debugfs_remove(priv->mcg_dentry); + if (priv->path_dentry) + debugfs_remove(priv->path_dentry); } int ipoib_register_debugfs(void) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 54ef2fea530..6a7003ddb0b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -31,14 +31,15 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib_ib.c 1386 2004-12-27 16:23:17Z roland $ */ #include <linux/delay.h> +#include <linux/moduleparam.h> #include <linux/dma-mapping.h> +#include <linux/slab.h> -#include <rdma/ib_cache.h> +#include <linux/ip.h> +#include <linux/tcp.h> #include "ipoib.h" @@ -50,29 +51,30 @@ MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0"); #endif -#define IPOIB_OP_RECV (1ul << 31) - -static DECLARE_MUTEX(pkey_sem); +static DEFINE_MUTEX(pkey_mutex); struct ipoib_ah *ipoib_create_ah(struct net_device *dev, struct ib_pd *pd, struct ib_ah_attr *attr) { struct ipoib_ah *ah; + struct ib_ah *vah; ah = kmalloc(sizeof *ah, GFP_KERNEL); if (!ah) - return NULL; + return ERR_PTR(-ENOMEM); ah->dev = dev; ah->last_send = 0; kref_init(&ah->ref); - ah->ah = ib_create_ah(pd, attr); - if (IS_ERR(ah->ah)) { + vah = ib_create_ah(pd, attr); + if (IS_ERR(vah)) { kfree(ah); - ah = NULL; - } else + ah = (struct ipoib_ah *)vah; + } else { + ah->ah = vah; ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah); + } return ah; } @@ -84,40 +86,64 @@ void ipoib_free_ah(struct kref *kref) unsigned long flags; - if ((int) priv->tx_tail - (int) ah->last_send >= 0) { - ipoib_dbg(priv, "Freeing ah %p\n", ah->ah); - ib_destroy_ah(ah->ah); - kfree(ah); - } else { - spin_lock_irqsave(&priv->lock, flags); - list_add_tail(&ah->list, &priv->dead_ahs); - spin_unlock_irqrestore(&priv->lock, flags); - } + spin_lock_irqsave(&priv->lock, flags); + list_add_tail(&ah->list, &priv->dead_ahs); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, + u64 mapping[IPOIB_UD_RX_SG]) +{ + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE, + DMA_FROM_DEVICE); + ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE, + DMA_FROM_DEVICE); + } else + ib_dma_unmap_single(priv->ca, mapping[0], + IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), + DMA_FROM_DEVICE); +} + +static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv, + struct sk_buff *skb, + unsigned int length) +{ + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[0]; + unsigned int size; + /* + * There is only two buffers needed for max_payload = 4K, + * first buf size is IPOIB_UD_HEAD_SIZE + */ + skb->tail += IPOIB_UD_HEAD_SIZE; + skb->len += length; + + size = length - IPOIB_UD_HEAD_SIZE; + + skb_frag_size_set(frag, size); + skb->data_len += size; + skb->truesize += PAGE_SIZE; + } else + skb_put(skb, length); + } static int ipoib_ib_post_receive(struct net_device *dev, int id) { struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ib_sge list; - struct ib_recv_wr param; struct ib_recv_wr *bad_wr; int ret; - list.addr = priv->rx_ring[id].mapping; - list.length = IPOIB_BUF_SIZE; - list.lkey = priv->mr->lkey; + priv->rx_wr.wr_id = id | IPOIB_OP_RECV; + priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0]; + priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1]; - param.next = NULL; - param.wr_id = id | IPOIB_OP_RECV; - param.sg_list = &list; - param.num_sge = 1; - ret = ib_post_recv(priv->qp, ¶m, &bad_wr); + ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); - dma_unmap_single(priv->ca->dma_device, - priv->rx_ring[id].mapping, - IPOIB_BUF_SIZE, DMA_FROM_DEVICE); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping); dev_kfree_skb_any(priv->rx_ring[id].skb); priv->rx_ring[id].skb = NULL; } @@ -125,15 +151,25 @@ static int ipoib_ib_post_receive(struct net_device *dev, int id) return ret; } -static int ipoib_alloc_rx_skb(struct net_device *dev, int id) +static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; - dma_addr_t addr; + int buf_size; + int tailroom; + u64 *mapping; - skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4); - if (!skb) - return -ENOMEM; + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + buf_size = IPOIB_UD_HEAD_SIZE; + tailroom = 128; /* reserve some tailroom for IP/TCP headers */ + } else { + buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + tailroom = 0; + } + + skb = dev_alloc_skb(buf_size + tailroom + 4); + if (unlikely(!skb)) + return NULL; /* * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte @@ -142,18 +178,32 @@ static int ipoib_alloc_rx_skb(struct net_device *dev, int id) */ skb_reserve(skb, 4); - addr = dma_map_single(priv->ca->dma_device, - skb->data, IPOIB_BUF_SIZE, - DMA_FROM_DEVICE); - if (unlikely(dma_mapping_error(addr))) { - dev_kfree_skb_any(skb); - return -EIO; + mapping = priv->rx_ring[id].mapping; + mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) + goto error; + + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + struct page *page = alloc_page(GFP_ATOMIC); + if (!page) + goto partial_error; + skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE); + mapping[1] = + ib_dma_map_page(priv->ca, page, + 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1]))) + goto partial_error; } - priv->rx_ring[id].skb = skb; - priv->rx_ring[id].mapping = addr; + priv->rx_ring[id].skb = skb; + return skb; - return 0; +partial_error: + ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE); +error: + dev_kfree_skb_any(skb); + return NULL; } static int ipoib_ib_post_receives(struct net_device *dev) @@ -161,8 +211,8 @@ static int ipoib_ib_post_receives(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); int i; - for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) { - if (ipoib_alloc_rx_skb(dev, i)) { + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_alloc_rx_skb(dev, i)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); return -ENOMEM; } @@ -175,145 +225,316 @@ static int ipoib_ib_post_receives(struct net_device *dev) return 0; } -static void ipoib_ib_handle_wc(struct net_device *dev, - struct ib_wc *wc) +static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; + struct sk_buff *skb; + u64 mapping[IPOIB_UD_RX_SG]; + union ib_gid *dgid; + + ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + skb = priv->rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) + ipoib_warn(priv, "failed recv event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping); + dev_kfree_skb_any(skb); + priv->rx_ring[wr_id].skb = NULL; + return; + } + + /* + * Drop packets that this interface sent, ie multicast packets + * that the HCA has replicated. + */ + if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) + goto repost; + + memcpy(mapping, priv->rx_ring[wr_id].mapping, + IPOIB_UD_RX_SG * sizeof *mapping); + + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) { + ++dev->stats.rx_dropped; + goto repost; + } + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + ipoib_ud_dma_unmap_rx(priv, mapping); + ipoib_ud_skb_put_frags(priv, skb, wc->byte_len); + + /* First byte of dgid signals multicast when 0xff */ + dgid = &((struct ib_grh *)skb->data)->dgid; + + if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff) + skb->pkt_type = PACKET_HOST; + else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + + skb_pull(skb, IB_GRH_BYTES); + + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_reset_mac_header(skb); + skb_pull(skb, IPOIB_ENCAP_LEN); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; + + skb->dev = dev; + if ((dev->features & NETIF_F_RXCSUM) && + likely(wc->wc_flags & IB_WC_IP_CSUM_OK)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + napi_gro_receive(&priv->napi, skb); + +repost: + if (unlikely(ipoib_ib_post_receive(dev, wr_id))) + ipoib_warn(priv, "ipoib_ib_post_receive failed " + "for buf %d\n", wr_id); +} + +static int ipoib_dma_map_tx(struct ib_device *ca, + struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb), + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[0]))) + return -EIO; + + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + mapping[i + off] = ib_dma_map_page(ca, + skb_frag_page(frag), + frag->page_offset, skb_frag_size(frag), + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) + goto partial_error; + } + return 0; + +partial_error: + for (; i > 0; --i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; + + ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE); + } + + if (off) + ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); + + return -EIO; +} + +static void ipoib_dma_unmap_tx(struct ib_device *ca, + struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag), + DMA_TO_DEVICE); + } +} + +static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); unsigned int wr_id = wc->wr_id; + struct ipoib_tx_buf *tx_req; - ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n", - wr_id, wc->opcode, wc->status); - - if (wr_id & IPOIB_OP_RECV) { - wr_id &= ~IPOIB_OP_RECV; - - if (wr_id < IPOIB_RX_RING_SIZE) { - struct sk_buff *skb = priv->rx_ring[wr_id].skb; - dma_addr_t addr = priv->rx_ring[wr_id].mapping; - - if (unlikely(wc->status != IB_WC_SUCCESS)) { - if (wc->status != IB_WC_WR_FLUSH_ERR) - ipoib_warn(priv, "failed recv event " - "(status=%d, wrid=%d vend_err %x)\n", - wc->status, wr_id, wc->vendor_err); - dma_unmap_single(priv->ca->dma_device, addr, - IPOIB_BUF_SIZE, DMA_FROM_DEVICE); - dev_kfree_skb_any(skb); - priv->rx_ring[wr_id].skb = NULL; - return; - } + ipoib_dbg_data(priv, "send completion: id %d, status: %d\n", + wr_id, wc->status); - /* - * If we can't allocate a new RX buffer, dump - * this packet and reuse the old buffer. - */ - if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) { - ++priv->stats.rx_dropped; - goto repost; - } + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } - ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", - wc->byte_len, wc->slid); + tx_req = &priv->tx_ring[wr_id]; - dma_unmap_single(priv->ca->dma_device, addr, - IPOIB_BUF_SIZE, DMA_FROM_DEVICE); + ipoib_dma_unmap_tx(priv->ca, tx_req); - skb_put(skb, wc->byte_len); - skb_pull(skb, IB_GRH_BYTES); + ++dev->stats.tx_packets; + dev->stats.tx_bytes += tx_req->skb->len; - if (wc->slid != priv->local_lid || - wc->src_qp != priv->qp->qp_num) { - skb->protocol = ((struct ipoib_header *) skb->data)->proto; - skb->mac.raw = skb->data; - skb_pull(skb, IPOIB_ENCAP_LEN); + dev_kfree_skb_any(tx_req->skb); - dev->last_rx = jiffies; - ++priv->stats.rx_packets; - priv->stats.rx_bytes += skb->len; + ++priv->tx_tail; + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(dev); - skb->dev = dev; - /* XXX get correct PACKET_ type here */ - skb->pkt_type = PACKET_HOST; - netif_rx_ni(skb); - } else { - ipoib_dbg_data(priv, "dropping loopback packet\n"); - dev_kfree_skb_any(skb); - } + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) + ipoib_warn(priv, "failed send event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); +} - repost: - if (unlikely(ipoib_ib_post_receive(dev, wr_id))) - ipoib_warn(priv, "ipoib_ib_post_receive failed " - "for buf %d\n", wr_id); - } else - ipoib_warn(priv, "completion event with wrid %d\n", - wr_id); +static int poll_tx(struct ipoib_dev_priv *priv) +{ + int n, i; - } else { - struct ipoib_tx_buf *tx_req; - unsigned long flags; + n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); + for (i = 0; i < n; ++i) + ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i); - if (wr_id >= IPOIB_TX_RING_SIZE) { - ipoib_warn(priv, "completion event with wrid %d (> %d)\n", - wr_id, IPOIB_TX_RING_SIZE); - return; - } + return n == MAX_SEND_CQE; +} + +int ipoib_poll(struct napi_struct *napi, int budget) +{ + struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi); + struct net_device *dev = priv->dev; + int done; + int t; + int n, i; - ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id); + done = 0; - tx_req = &priv->tx_ring[wr_id]; +poll_more: + while (done < budget) { + int max = (budget - done); - dma_unmap_single(priv->ca->dma_device, - pci_unmap_addr(tx_req, mapping), - tx_req->skb->len, - DMA_TO_DEVICE); + t = min(IPOIB_NUM_WC, max); + n = ib_poll_cq(priv->recv_cq, t, priv->ibwc); - ++priv->stats.tx_packets; - priv->stats.tx_bytes += tx_req->skb->len; + for (i = 0; i < n; i++) { + struct ib_wc *wc = priv->ibwc + i; - dev_kfree_skb_any(tx_req->skb); + if (wc->wr_id & IPOIB_OP_RECV) { + ++done; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, wc); + else + ipoib_ib_handle_rx_wc(dev, wc); + } else + ipoib_cm_handle_tx_wc(priv->dev, wc); + } - spin_lock_irqsave(&priv->tx_lock, flags); - ++priv->tx_tail; - if (netif_queue_stopped(dev) && - priv->tx_head - priv->tx_tail <= IPOIB_TX_RING_SIZE / 2) - netif_wake_queue(dev); - spin_unlock_irqrestore(&priv->tx_lock, flags); + if (n != t) + break; + } - if (wc->status != IB_WC_SUCCESS && - wc->status != IB_WC_WR_FLUSH_ERR) - ipoib_warn(priv, "failed send event " - "(status=%d, wrid=%d vend_err %x)\n", - wc->status, wr_id, wc->vendor_err); + if (done < budget) { + napi_complete(napi); + if (unlikely(ib_req_notify_cq(priv->recv_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS)) && + napi_reschedule(napi)) + goto poll_more; } + + return done; } void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) { - struct net_device *dev = (struct net_device *) dev_ptr; + struct net_device *dev = dev_ptr; struct ipoib_dev_priv *priv = netdev_priv(dev); - int n, i; - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - do { - n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc); - for (i = 0; i < n; ++i) - ipoib_ib_handle_wc(dev, priv->ibwc + i); - } while (n == IPOIB_NUM_WC); + napi_schedule(&priv->napi); +} + +static void drain_tx_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + netif_tx_lock(dev); + while (poll_tx(priv)) + ; /* nothing */ + + if (netif_queue_stopped(dev)) + mod_timer(&priv->poll_timer, jiffies + 1); + + netif_tx_unlock(dev); +} + +void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev_ptr); + + mod_timer(&priv->poll_timer, jiffies); } static inline int post_send(struct ipoib_dev_priv *priv, unsigned int wr_id, struct ib_ah *address, u32 qpn, - dma_addr_t addr, int len) + struct ipoib_tx_buf *tx_req, + void *head, int hlen) { struct ib_send_wr *bad_wr; + int i, off; + struct sk_buff *skb = tx_req->skb; + skb_frag_t *frags = skb_shinfo(skb)->frags; + int nr_frags = skb_shinfo(skb)->nr_frags; + u64 *mapping = tx_req->mapping; + + if (skb_headlen(skb)) { + priv->tx_sge[0].addr = mapping[0]; + priv->tx_sge[0].length = skb_headlen(skb); + off = 1; + } else + off = 0; - priv->tx_sge.addr = addr; - priv->tx_sge.length = len; - - priv->tx_wr.wr_id = wr_id; - priv->tx_wr.wr.ud.remote_qpn = qpn; - priv->tx_wr.wr.ud.ah = address; + for (i = 0; i < nr_frags; ++i) { + priv->tx_sge[i + off].addr = mapping[i + off]; + priv->tx_sge[i + off].length = skb_frag_size(&frags[i]); + } + priv->tx_wr.num_sge = nr_frags + off; + priv->tx_wr.wr_id = wr_id; + priv->tx_wr.wr.ud.remote_qpn = qpn; + priv->tx_wr.wr.ud.ah = address; + + if (head) { + priv->tx_wr.wr.ud.mss = skb_shinfo(skb)->gso_size; + priv->tx_wr.wr.ud.header = head; + priv->tx_wr.wr.ud.hlen = hlen; + priv->tx_wr.opcode = IB_WR_LSO; + } else + priv->tx_wr.opcode = IB_WR_SEND; return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr); } @@ -323,15 +544,30 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_tx_buf *tx_req; - dma_addr_t addr; - - if (skb->len > dev->mtu + INFINIBAND_ALEN) { - ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", - skb->len, dev->mtu + INFINIBAND_ALEN); - ++priv->stats.tx_dropped; - ++priv->stats.tx_errors; - dev_kfree_skb_any(skb); - return; + int hlen, rc; + void *phead; + + if (skb_is_gso(skb)) { + hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); + phead = skb->data; + if (unlikely(!skb_pull(skb, hlen))) { + ipoib_warn(priv, "linear data too small\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + } else { + if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); + return; + } + phead = NULL; + hlen = 0; } ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n", @@ -344,30 +580,49 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, * means we have to make sure everything is properly recorded and * our state is consistent before we call post_send(). */ - tx_req = &priv->tx_ring[priv->tx_head & (IPOIB_TX_RING_SIZE - 1)]; + tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)]; tx_req->skb = skb; - addr = dma_map_single(priv->ca->dma_device, skb->data, skb->len, - DMA_TO_DEVICE); - pci_unmap_addr_set(tx_req, mapping, addr); - - if (unlikely(post_send(priv, priv->tx_head & (IPOIB_TX_RING_SIZE - 1), - address->ah, qpn, addr, skb->len))) { - ipoib_warn(priv, "post_send failed\n"); - ++priv->stats.tx_errors; - dma_unmap_single(priv->ca->dma_device, addr, skb->len, - DMA_TO_DEVICE); + if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + + if (skb->ip_summed == CHECKSUM_PARTIAL) + priv->tx_wr.send_flags |= IB_SEND_IP_CSUM; + else + priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) + ipoib_warn(priv, "request notify on send CQ failed\n"); + netif_stop_queue(dev); + } + + skb_orphan(skb); + skb_dst_drop(skb); + + rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), + address->ah, qpn, tx_req, phead, hlen); + if (unlikely(rc)) { + ipoib_warn(priv, "post_send failed, error %d\n", rc); + ++dev->stats.tx_errors; + --priv->tx_outstanding; + ipoib_dma_unmap_tx(priv->ca, tx_req); dev_kfree_skb_any(skb); + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); } else { dev->trans_start = jiffies; address->last_send = priv->tx_head; ++priv->tx_head; - - if (priv->tx_head - priv->tx_tail == IPOIB_TX_RING_SIZE) { - ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); - netif_stop_queue(dev); - } } + + if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) + while (poll_tx(priv)) + ; /* nothing */ } static void __ipoib_reap_ah(struct net_device *dev) @@ -375,31 +630,38 @@ static void __ipoib_reap_ah(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_ah *ah, *tah; LIST_HEAD(remove_list); + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); - spin_lock_irq(&priv->lock); list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) if ((int) priv->tx_tail - (int) ah->last_send >= 0) { list_del(&ah->list); - list_add_tail(&ah->list, &remove_list); + ib_destroy_ah(ah->ah); + kfree(ah); } - spin_unlock_irq(&priv->lock); - list_for_each_entry_safe(ah, tah, &remove_list, list) { - ipoib_dbg(priv, "Reaping ah %p\n", ah->ah); - ib_destroy_ah(ah->ah); - kfree(ah); - } + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); } -void ipoib_reap_ah(void *dev_ptr) +void ipoib_reap_ah(struct work_struct *work) { - struct net_device *dev = dev_ptr; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, ah_reap_task.work); + struct net_device *dev = priv->dev; __ipoib_reap_ah(dev); if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); + queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + round_jiffies_relative(HZ)); +} + +static void ipoib_ib_tx_timer_func(unsigned long ctx) +{ + drain_tx_cq((struct net_device *)ctx); } int ipoib_ib_dev_open(struct net_device *dev) @@ -407,6 +669,13 @@ int ipoib_ib_dev_open(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) { + ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey); + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + return -1; + } + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ret = ipoib_init_qp(dev); if (ret) { ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); @@ -416,25 +685,58 @@ int ipoib_ib_dev_open(struct net_device *dev) ret = ipoib_ib_post_receives(dev); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); - return -1; + goto dev_stop; + } + + ret = ipoib_cm_dev_open(dev); + if (ret) { + ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); + goto dev_stop; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); + queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + round_jiffies_relative(HZ)); + + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); return 0; +dev_stop: + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); + ipoib_ib_dev_stop(dev, 1); + return -1; +} + +static void ipoib_pkey_dev_check_presence(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + u16 pkey_index = 0; + + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + else + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } int ipoib_ib_dev_up(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_dbg(priv, "PKEY is not assigned.\n"); + return 0; + } + set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); return ipoib_mcast_start_thread(dev); } -int ipoib_ib_dev_down(struct net_device *dev) +int ipoib_ib_dev_down(struct net_device *dev, int flush) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -445,25 +747,15 @@ int ipoib_ib_dev_down(struct net_device *dev) /* Shutdown the P_Key thread if still active */ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { - down(&pkey_sem); + mutex_lock(&pkey_mutex); set_bit(IPOIB_PKEY_STOP, &priv->flags); - cancel_delayed_work(&priv->pkey_task); - up(&pkey_sem); - flush_workqueue(ipoib_workqueue); + cancel_delayed_work_sync(&priv->pkey_poll_task); + mutex_unlock(&pkey_mutex); } - ipoib_mcast_stop_thread(dev, 1); - - /* - * Flush the multicast groups first so we stop any multicast joins. The - * completion thread may have already died and we may deadlock waiting - * for the completion thread to finish some multicast joins. - */ + ipoib_mcast_stop_thread(dev, flush); ipoib_mcast_dev_flush(dev); - /* Delete broadcast and local addresses since they will be recreated */ - ipoib_mcast_dev_down(dev); - ipoib_flush_paths(dev); return 0; @@ -475,14 +767,53 @@ static int recvs_pending(struct net_device *dev) int pending = 0; int i; - for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) + for (i = 0; i < ipoib_recvq_size; ++i) if (priv->rx_ring[i].skb) ++pending; return pending; } -int ipoib_ib_dev_stop(struct net_device *dev) +void ipoib_drain_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i, n; + + /* + * We call completion handling routines that expect to be + * called from the BH-disabled NAPI poll context, so disable + * BHs here too. + */ + local_bh_disable(); + + do { + n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); + for (i = 0; i < n; ++i) { + /* + * Convert any successful completions to flush + * errors to avoid passing packets up the + * stack after bringing the device down. + */ + if (priv->ibwc[i].status == IB_WC_SUCCESS) + priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; + + if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) { + if (priv->ibwc[i].wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); + else + ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); + } else + ipoib_cm_handle_tx_wc(dev, priv->ibwc + i); + } + } while (n == IPOIB_NUM_WC); + + while (poll_tx(priv)) + ; /* nothing */ + + local_bh_enable(); +} + +int ipoib_ib_dev_stop(struct net_device *dev, int flush) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_attr qp_attr; @@ -490,6 +821,11 @@ int ipoib_ib_dev_stop(struct net_device *dev) struct ipoib_tx_buf *tx_req; int i; + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_disable(&priv->napi); + + ipoib_cm_dev_stop(dev); + /* * Move our QP to the error state and then reinitialize in * when all work requests have completed or have been flushed. @@ -512,35 +848,37 @@ int ipoib_ib_dev_stop(struct net_device *dev) */ while ((int) priv->tx_tail - (int) priv->tx_head < 0) { tx_req = &priv->tx_ring[priv->tx_tail & - (IPOIB_TX_RING_SIZE - 1)]; - dma_unmap_single(priv->ca->dma_device, - pci_unmap_addr(tx_req, mapping), - tx_req->skb->len, - DMA_TO_DEVICE); + (ipoib_sendq_size - 1)]; + ipoib_dma_unmap_tx(priv->ca, tx_req); dev_kfree_skb_any(tx_req->skb); ++priv->tx_tail; + --priv->tx_outstanding; } - for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) - if (priv->rx_ring[i].skb) { - dma_unmap_single(priv->ca->dma_device, - pci_unmap_addr(&priv->rx_ring[i], - mapping), - IPOIB_BUF_SIZE, - DMA_FROM_DEVICE); - dev_kfree_skb_any(priv->rx_ring[i].skb); - priv->rx_ring[i].skb = NULL; - } + for (i = 0; i < ipoib_recvq_size; ++i) { + struct ipoib_rx_buf *rx_req; + + rx_req = &priv->rx_ring[i]; + if (!rx_req->skb) + continue; + ipoib_ud_dma_unmap_rx(priv, + priv->rx_ring[i].mapping); + dev_kfree_skb_any(rx_req->skb); + rx_req->skb = NULL; + } goto timeout; } + ipoib_drain_cq(dev); + msleep(1); } ipoib_dbg(priv, "All sends and receives done.\n"); timeout: + del_timer_sync(&priv->poll_timer); qp_attr.qp_state = IB_QPS_RESET; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to RESET state\n"); @@ -548,7 +886,8 @@ timeout: /* Wait for all AHs to be reaped */ set_bit(IPOIB_STOP_REAPER, &priv->flags); cancel_delayed_work(&priv->ah_reap_task); - flush_workqueue(ipoib_workqueue); + if (flush) + flush_workqueue(ipoib_workqueue); begin = jiffies; @@ -563,6 +902,8 @@ timeout: msleep(1); } + ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); + return 0; } @@ -579,6 +920,9 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) return -ENODEV; } + setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func, + (unsigned long) dev); + if (dev->flags & IFF_UP) { if (ipoib_ib_dev_open(dev)) { ipoib_transport_dev_cleanup(dev); @@ -589,28 +933,148 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) return 0; } -void ipoib_ib_dev_flush(void *_dev) +/* + * Takes whatever value which is in pkey index 0 and updates priv->pkey + * returns 0 if the pkey value was changed. + */ +static inline int update_parent_pkey(struct ipoib_dev_priv *priv) +{ + int result; + u16 prev_pkey; + + prev_pkey = priv->pkey; + result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); + if (result) { + ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n", + priv->port, result); + return result; + } + + priv->pkey |= 0x8000; + + if (prev_pkey != priv->pkey) { + ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n", + prev_pkey, priv->pkey); + /* + * Update the pkey in the broadcast address, while making sure to set + * the full membership bit, so that we join the right broadcast group. + */ + priv->dev->broadcast[8] = priv->pkey >> 8; + priv->dev->broadcast[9] = priv->pkey & 0xff; + return 0; + } + + return 1; +} + +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, + enum ipoib_flush_level level) { - struct net_device *dev = (struct net_device *)_dev; - struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv; + struct ipoib_dev_priv *cpriv; + struct net_device *dev = priv->dev; + u16 new_index; + int result; - if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + down_read(&priv->vlan_rwsem); + + /* + * Flush any child interfaces too -- they might be up even if + * the parent is down. + */ + list_for_each_entry(cpriv, &priv->child_intfs, list) + __ipoib_ib_dev_flush(cpriv, level); + + up_read(&priv->vlan_rwsem); + + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { + /* for non-child devices must check/update the pkey value here */ + if (level == IPOIB_FLUSH_HEAVY && + !test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + update_parent_pkey(priv); + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); + return; + } + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); return; + } + + if (level == IPOIB_FLUSH_HEAVY) { + /* child devices chase their origin pkey value, while non-child + * (parent) devices should always takes what present in pkey index 0 + */ + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_stop(dev, 0); + if (ipoib_pkey_dev_delay_open(dev)) + return; + } + /* restart QP only if P_Key index is changed */ + if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && + new_index == priv->pkey_index) { + ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); + return; + } + priv->pkey_index = new_index; + } else { + result = update_parent_pkey(priv); + /* restart QP only if P_Key value changed */ + if (result) { + ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n"); + return; + } + } + } - ipoib_dbg(priv, "flushing\n"); + if (level == IPOIB_FLUSH_LIGHT) { + ipoib_mark_paths_invalid(dev); + ipoib_mcast_dev_flush(dev); + } - ipoib_ib_dev_down(dev); + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_down(dev, 0); + + if (level == IPOIB_FLUSH_HEAVY) { + ipoib_ib_dev_stop(dev, 0); + ipoib_ib_dev_open(dev); + } /* * The device could have been brought down between the start and when * we get here, don't bring it back up if it's not configured up */ - if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - ipoib_ib_dev_up(dev); + if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_up(dev); + ipoib_mcast_restart_task(&priv->restart_task); + } +} - /* Flush any child interfaces too */ - list_for_each_entry(cpriv, &priv->child_intfs, list) - ipoib_ib_dev_flush(&cpriv->dev); +void ipoib_ib_dev_flush_light(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_light); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT); +} + +void ipoib_ib_dev_flush_normal(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_normal); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL); +} + +void ipoib_ib_dev_flush_heavy(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_heavy); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY); } void ipoib_ib_dev_cleanup(struct net_device *dev) @@ -618,11 +1082,14 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_dbg(priv, "cleaning up ib_dev\n"); + /* + * We must make sure there are no more (path) completions + * that may wish to touch priv fields that are no longer valid + */ + ipoib_flush_paths(dev); ipoib_mcast_stop_thread(dev, 1); - - /* Delete the broadcast address and the local address */ - ipoib_mcast_dev_down(dev); + ipoib_mcast_dev_flush(dev); ipoib_transport_dev_cleanup(dev); } @@ -637,33 +1104,23 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) * change async notification is available. */ -static void ipoib_pkey_dev_check_presence(struct net_device *dev) +void ipoib_pkey_poll(struct work_struct *work) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - u16 pkey_index = 0; - - if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) - clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - else - set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); -} - -void ipoib_pkey_poll(void *dev_ptr) -{ - struct net_device *dev = dev_ptr; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, pkey_poll_task.work); + struct net_device *dev = priv->dev; ipoib_pkey_dev_check_presence(dev); if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) ipoib_open(dev); else { - down(&pkey_sem); + mutex_lock(&pkey_mutex); if (!test_bit(IPOIB_PKEY_STOP, &priv->flags)) queue_delayed_work(ipoib_workqueue, - &priv->pkey_task, + &priv->pkey_poll_task, HZ); - up(&pkey_sem); + mutex_unlock(&pkey_mutex); } } @@ -677,12 +1134,12 @@ int ipoib_pkey_dev_delay_open(struct net_device *dev) /* P_Key value not assigned yet - start polling */ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { - down(&pkey_sem); + mutex_lock(&pkey_mutex); clear_bit(IPOIB_PKEY_STOP, &priv->flags); queue_delayed_work(ipoib_workqueue, - &priv->pkey_task, + &priv->pkey_poll_task, HZ); - up(&pkey_sem); + mutex_unlock(&pkey_mutex); return 1; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index ce0296273e7..5786a78ff8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -30,8 +30,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $ */ #include "ipoib.h" @@ -40,6 +38,7 @@ #include <linux/init.h> #include <linux/slab.h> +#include <linux/kernel.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> /* For ARPHRD_xxx */ @@ -47,9 +46,25 @@ #include <linux/ip.h> #include <linux/in.h> +#include <linux/jhash.h> +#include <net/arp.h> + +#define DRV_VERSION "1.0.0" + +const char ipoib_driver_version[] = DRV_VERSION; + MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; +int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; + +module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); +MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); +module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); +MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level; @@ -58,6 +73,11 @@ module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif +struct ipoib_path_iter { + struct net_device *dev; + struct ipoib_path path; +}; + static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, @@ -66,8 +86,11 @@ static const u8 ipv4_bcast_addr[] = { struct workqueue_struct *ipoib_workqueue; +struct ib_sa_client ipoib_sa_client; + static void ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device); +static void ipoib_neigh_reclaim(struct rcu_head *rp); static struct ib_client ipoib_client = { .name = "ipoib", @@ -81,22 +104,24 @@ int ipoib_open(struct net_device *dev) ipoib_dbg(priv, "bringing up interface\n"); + netif_carrier_off(dev); + set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(dev)) return 0; if (ipoib_ib_dev_open(dev)) - return -EINVAL; + goto err_disable; if (ipoib_ib_dev_up(dev)) - return -EINVAL; + goto err_stop; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ - down(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -106,12 +131,20 @@ int ipoib_open(struct net_device *dev) dev_change_flags(cpriv->dev, flags | IFF_UP); } - up(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } netif_start_queue(dev); return 0; + +err_stop: + ipoib_ib_dev_stop(dev, 1); + +err_disable: + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + return -EINVAL; } static int ipoib_stop(struct net_device *dev) @@ -124,14 +157,14 @@ static int ipoib_stop(struct net_device *dev) netif_stop_queue(dev); - ipoib_ib_dev_down(dev); - ipoib_ib_dev_stop(dev); + ipoib_ib_dev_down(dev, 1); + ipoib_ib_dev_stop(dev, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ - down(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -141,17 +174,45 @@ static int ipoib_stop(struct net_device *dev) dev_change_flags(cpriv->dev, flags & ~IFF_UP); } - up(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } return 0; } +static void ipoib_uninit(struct net_device *dev) +{ + ipoib_dev_cleanup(dev); +} + +static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO); + + return features; +} + static int ipoib_change_mtu(struct net_device *dev, int new_mtu) { struct ipoib_dev_priv *priv = netdev_priv(dev); - if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) + /* dev->mtu > 2K ==> connected mode */ + if (ipoib_cm_admin_enabled(dev)) { + if (new_mtu > ipoib_cm_max_mtu(dev)) + return -EINVAL; + + if (new_mtu > priv->mcast_mtu) + ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", + priv->mcast_mtu); + + dev->mtu = new_mtu; + return 0; + } + + if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; @@ -161,8 +222,38 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) return 0; } -static struct ipoib_path *__path_find(struct net_device *dev, - union ib_gid *gid) +int ipoib_set_mode(struct net_device *dev, const char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* flush paths if we switch modes so that connections are restarted */ + if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { + set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + ipoib_warn(priv, "enabling connected mode " + "will cause multicast packet drops\n"); + netdev_update_features(dev); + rtnl_unlock(); + priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + + ipoib_flush_paths(dev); + rtnl_lock(); + return 0; + } + + if (!strcmp(buf, "datagram\n")) { + clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + netdev_update_features(dev); + dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); + rtnl_unlock(); + ipoib_flush_paths(dev); + rtnl_lock(); + return 0; + } + + return -EINVAL; +} + +static struct ipoib_path *__path_find(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct rb_node *n = priv->path_tree.rb_node; @@ -172,7 +263,7 @@ static struct ipoib_path *__path_find(struct net_device *dev, while (n) { path = rb_entry(n, struct ipoib_path, rb_node); - ret = memcmp(gid->raw, path->pathrec.dgid.raw, + ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) @@ -218,31 +309,15 @@ static int __path_add(struct net_device *dev, struct ipoib_path *path) static void path_free(struct net_device *dev, struct ipoib_path *path) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_neigh *neigh, *tn; struct sk_buff *skb; - unsigned long flags; while ((skb = __skb_dequeue(&path->queue))) dev_kfree_skb_irq(skb); - spin_lock_irqsave(&priv->lock, flags); + ipoib_dbg(netdev_priv(dev), "path_free\n"); - list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { - /* - * It's safe to call ipoib_put_ah() inside priv->lock - * here, because we know that path->ah will always - * hold one more reference, so ipoib_put_ah() will - * never do more than decrement the ref count. - */ - if (neigh->ah) - ipoib_put_ah(neigh->ah); - *to_ipoib_neigh(neigh->neighbour) = NULL; - neigh->neighbour->ops->destructor = NULL; - kfree(neigh); - } - - spin_unlock_irqrestore(&priv->lock, flags); + /* remove all neigh connected to this path */ + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); if (path->ah) ipoib_put_ah(path->ah); @@ -250,6 +325,81 @@ static void path_free(struct net_device *dev, struct ipoib_path *path) kfree(path); } +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) +{ + struct ipoib_path_iter *iter; + + iter = kmalloc(sizeof *iter, GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + memset(iter->path.pathrec.dgid.raw, 0, 16); + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int ipoib_path_iter_next(struct ipoib_path_iter *iter) +{ + struct ipoib_dev_priv *priv = netdev_priv(iter->dev); + struct rb_node *n; + struct ipoib_path *path; + int ret = 1; + + spin_lock_irq(&priv->lock); + + n = rb_first(&priv->path_tree); + + while (n) { + path = rb_entry(n, struct ipoib_path, rb_node); + + if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, + sizeof (union ib_gid)) < 0) { + iter->path = *path; + ret = 0; + break; + } + + n = rb_next(n); + } + + spin_unlock_irq(&priv->lock); + + return ret; +} + +void ipoib_path_iter_read(struct ipoib_path_iter *iter, + struct ipoib_path *path) +{ + *path = iter->path; +} + +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ + +void ipoib_mark_paths_invalid(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path, *tp; + + spin_lock_irq(&priv->lock); + + list_for_each_entry_safe(path, tp, &priv->path_list, list) { + ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n", + be16_to_cpu(path->pathrec.dlid), + path->pathrec.dgid.raw); + path->valid = 0; + } + + spin_unlock_irq(&priv->lock); +} + void ipoib_flush_paths(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -257,22 +407,27 @@ void ipoib_flush_paths(struct net_device *dev) LIST_HEAD(remove_list); unsigned long flags; + netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); - list_splice(&priv->path_list, &remove_list); - INIT_LIST_HEAD(&priv->path_list); + list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); - spin_unlock_irqrestore(&priv->lock, flags); - list_for_each_entry_safe(path, tp, &remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); wait_for_completion(&path->done); path_free(dev, path); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); } static void path_rec_completion(int status, @@ -283,65 +438,85 @@ static void path_rec_completion(int status, struct net_device *dev = path->dev; struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_ah *ah = NULL; - struct ipoib_neigh *neigh; + struct ipoib_ah *old_ah = NULL; + struct ipoib_neigh *neigh, *tn; struct sk_buff_head skqueue; struct sk_buff *skb; unsigned long flags; - if (pathrec) - ipoib_dbg(priv, "PathRec LID 0x%04x for GID " IPOIB_GID_FMT "\n", - be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid)); + if (!status) + ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", + be16_to_cpu(pathrec->dlid), pathrec->dgid.raw); else - ipoib_dbg(priv, "PathRec status %d for GID " IPOIB_GID_FMT "\n", - status, IPOIB_GID_ARG(path->pathrec.dgid)); + ipoib_dbg(priv, "PathRec status %d for GID %pI6\n", + status, path->pathrec.dgid.raw); skb_queue_head_init(&skqueue); if (!status) { - struct ib_ah_attr av = { - .dlid = be16_to_cpu(pathrec->dlid), - .sl = pathrec->sl, - .port_num = priv->port - }; - int path_rate = ib_sa_rate_enum_to_int(pathrec->rate); - - if (path_rate > 0 && priv->local_rate > path_rate) - av.static_rate = (priv->local_rate - 1) / path_rate; - - ipoib_dbg(priv, "static_rate %d for local port %dX, path %dX\n", - av.static_rate, priv->local_rate, - ib_sa_rate_enum_to_int(pathrec->rate)); + struct ib_ah_attr av; - ah = ipoib_create_ah(dev, priv->pd, &av); + if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) + ah = ipoib_create_ah(dev, priv->pd, &av); } spin_lock_irqsave(&priv->lock, flags); - path->ah = ah; - - if (ah) { + if (!IS_ERR_OR_NULL(ah)) { path->pathrec = *pathrec; + old_ah = path->ah; + path->ah = ah; + ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be16_to_cpu(pathrec->dlid), pathrec->sl); while ((skb = __skb_dequeue(&path->queue))) __skb_queue_tail(&skqueue, skb); - list_for_each_entry(neigh, &path->neigh_list, list) { + list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { + if (neigh->ah) { + WARN_ON(neigh->ah != old_ah); + /* + * Dropping the ah reference inside + * priv->lock is safe here, because we + * will hold one more reference from + * the original value of path->ah (ie + * old_ah). + */ + ipoib_put_ah(neigh->ah); + } kref_get(&path->ah->ref); neigh->ah = path->ah; + if (ipoib_cm_enabled(dev, neigh->daddr)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, + path, + neigh)); + if (!ipoib_cm_get(neigh)) { + ipoib_neigh_free(neigh); + continue; + } + } + while ((skb = __skb_dequeue(&neigh->queue))) __skb_queue_tail(&skqueue, skb); } - } else - path->query = NULL; + path->valid = 1; + } + path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); + if (IS_ERR_OR_NULL(ah)) + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); + + if (old_ah) + ipoib_put_ah(old_ah); + while ((skb = __skb_dequeue(&skqueue))) { skb->dev = dev; if (dev_queue_xmit(skb)) @@ -350,12 +525,14 @@ static void path_rec_completion(int status, } } -static struct ipoib_path *path_rec_create(struct net_device *dev, - union ib_gid *gid) +static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path; + if (!priv->broadcast) + return NULL; + path = kzalloc(sizeof *path, GFP_ATOMIC); if (!path) return NULL; @@ -365,12 +542,12 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, skb_queue_head_init(&path->queue); INIT_LIST_HEAD(&path->neigh_list); - init_completion(&path->done); - memcpy(path->pathrec.dgid.raw, gid->raw, sizeof (union ib_gid)); - path->pathrec.sgid = priv->local_gid; - path->pathrec.pkey = cpu_to_be16(priv->pkey); - path->pathrec.numb_path = 1; + memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); + path->pathrec.sgid = priv->local_gid; + path->pathrec.pkey = cpu_to_be16(priv->pkey); + path->pathrec.numb_path = 1; + path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; return path; } @@ -380,253 +557,244 @@ static int path_rec_start(struct net_device *dev, { struct ipoib_dev_priv *priv = netdev_priv(dev); - ipoib_dbg(priv, "Start path record lookup for " IPOIB_GID_FMT "\n", - IPOIB_GID_ARG(path->pathrec.dgid)); + ipoib_dbg(priv, "Start path record lookup for %pI6\n", + path->pathrec.dgid.raw); + + init_completion(&path->done); path->query_id = - ib_sa_path_rec_get(priv->ca, priv->port, + ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &path->pathrec, IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { - ipoib_warn(priv, "ib_sa_path_rec_get failed\n"); + ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; + complete(&path->done); return path->query_id; } return 0; } -static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) +static void neigh_add_path(struct sk_buff *skb, u8 *daddr, + struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path; struct ipoib_neigh *neigh; + unsigned long flags; - neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); + spin_lock_irqsave(&priv->lock, flags); + neigh = ipoib_neigh_alloc(daddr, dev); if (!neigh) { - ++priv->stats.tx_dropped; + spin_unlock_irqrestore(&priv->lock, flags); + ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return; } - skb_queue_head_init(&neigh->queue); - neigh->neighbour = skb->dst->neighbour; - *to_ipoib_neigh(skb->dst->neighbour) = neigh; - - /* - * We can only be called from ipoib_start_xmit, so we're - * inside tx_lock -- no need to save/restore flags. - */ - spin_lock(&priv->lock); - - path = __path_find(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4)); + path = __path_find(dev, daddr + 4); if (!path) { - path = path_rec_create(dev, - (union ib_gid *) (skb->dst->neighbour->ha + 4)); + path = path_rec_create(dev, daddr + 4); if (!path) - goto err; + goto err_path; __path_add(dev, path); } list_add_tail(&neigh->list, &path->neigh_list); - if (path->pathrec.dlid) { + if (path->ah) { kref_get(&path->ah->ref); neigh->ah = path->ah; - ipoib_send(dev, skb, path->ah, - be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); - } else { - neigh->ah = NULL; - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - __skb_queue_tail(&neigh->queue, skb); + if (ipoib_cm_enabled(dev, neigh->daddr)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); + if (!ipoib_cm_get(neigh)) { + ipoib_neigh_free(neigh); + goto err_drop; + } + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) + __skb_queue_tail(&neigh->queue, skb); + else { + ipoib_warn(priv, "queue length limit %d. Packet drop.\n", + skb_queue_len(&neigh->queue)); + goto err_drop; + } } else { - ++priv->stats.tx_dropped; - dev_kfree_skb_any(skb); + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr)); + ipoib_neigh_put(neigh); + return; } + } else { + neigh->ah = NULL; if (!path->query && path_rec_start(dev, path)) - goto err; + goto err_path; + + __skb_queue_tail(&neigh->queue, skb); } - spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); return; -err: - *to_ipoib_neigh(skb->dst->neighbour) = NULL; - list_del(&neigh->list); - neigh->neighbour->ops->destructor = NULL; - kfree(neigh); - - ++priv->stats.tx_dropped; +err_path: + ipoib_neigh_free(neigh); +err_drop: + ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); - spin_unlock(&priv->lock); -} - -static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(skb->dev); - - /* Look up path record for unicasts */ - if (skb->dst->neighbour->ha[4] != 0xff) { - neigh_add_path(skb, dev); - return; - } - - /* Add in the P_Key for multicasts */ - skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff; - skb->dst->neighbour->ha[9] = priv->pkey & 0xff; - ipoib_mcast_send(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4), skb); + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, - struct ipoib_pseudoheader *phdr) + struct ipoib_cb *cb) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path; + unsigned long flags; - /* - * We can only be called from ipoib_start_xmit, so we're - * inside tx_lock -- no need to save/restore flags. - */ - spin_lock(&priv->lock); + spin_lock_irqsave(&priv->lock, flags); - path = __path_find(dev, (union ib_gid *) (phdr->hwaddr + 4)); - if (!path) { - path = path_rec_create(dev, - (union ib_gid *) (phdr->hwaddr + 4)); + path = __path_find(dev, cb->hwaddr + 4); + if (!path || !path->valid) { + int new_path = 0; + + if (!path) { + path = path_rec_create(dev, cb->hwaddr + 4); + new_path = 1; + } if (path) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof *phdr); __skb_queue_tail(&path->queue, skb); - if (path_rec_start(dev, path)) { - spin_unlock(&priv->lock); - path_free(dev, path); + if (!path->query && path_rec_start(dev, path)) { + spin_unlock_irqrestore(&priv->lock, flags); + if (new_path) + path_free(dev, path); return; } else __path_add(dev, path); } else { - ++priv->stats.tx_dropped; + ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } - spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->lock, flags); return; } - if (path->pathrec.dlid) { + if (path->ah) { ipoib_dbg(priv, "Send unicast ARP to %04x\n", be16_to_cpu(path->pathrec.dlid)); - ipoib_send(dev, skb, path->ah, - be32_to_cpup((__be32 *) phdr->hwaddr)); + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr)); + return; } else if ((path->query || !path_rec_start(dev, path)) && skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof *phdr); __skb_queue_tail(&path->queue, skb); } else { - ++priv->stats.tx_dropped; + ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } - spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->lock, flags); } static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_neigh *neigh; + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; + struct ipoib_header *header; unsigned long flags; - if (!spin_trylock_irqsave(&priv->tx_lock, flags)) - return NETDEV_TX_LOCKED; - - /* - * Check if our queue is stopped. Since we have the LLTX bit - * set, we can't rely on netif_stop_queue() preventing our - * xmit function from being called with a full queue. - */ - if (unlikely(netif_queue_stopped(dev))) { - spin_unlock_irqrestore(&priv->tx_lock, flags); - return NETDEV_TX_BUSY; + header = (struct ipoib_header *) skb->data; + + if (unlikely(cb->hwaddr[4] == 0xff)) { + /* multicast, arrange "if" according to probability */ + if ((header->proto != htons(ETH_P_IP)) && + (header->proto != htons(ETH_P_IPV6)) && + (header->proto != htons(ETH_P_ARP)) && + (header->proto != htons(ETH_P_RARP)) && + (header->proto != htons(ETH_P_TIPC))) { + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + /* Add in the P_Key for multicast*/ + cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; + cb->hwaddr[9] = priv->pkey & 0xff; + + neigh = ipoib_neigh_get(dev, cb->hwaddr); + if (likely(neigh)) + goto send_using_neigh; + ipoib_mcast_send(dev, cb->hwaddr, skb); + return NETDEV_TX_OK; } - if (skb->dst && skb->dst->neighbour) { - if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) { - ipoib_path_lookup(skb, dev); - goto out; + /* unicast, arrange "switch" according to probability */ + switch (header->proto) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_TIPC): + neigh = ipoib_neigh_get(dev, cb->hwaddr); + if (unlikely(!neigh)) { + neigh_add_path(skb, cb->hwaddr, dev); + return NETDEV_TX_OK; } + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + /* for unicast ARP and RARP should always perform path find */ + unicast_arp_send(skb, dev, cb); + return NETDEV_TX_OK; + default: + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } - neigh = *to_ipoib_neigh(skb->dst->neighbour); - - if (likely(neigh->ah)) { - ipoib_send(dev, skb, neigh->ah, - be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); - goto out; +send_using_neigh: + /* note we now hold a ref to neigh */ + if (ipoib_cm_get(neigh)) { + if (ipoib_cm_up(neigh)) { + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); + goto unref; } + } else if (neigh->ah) { + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr)); + goto unref; + } - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - spin_lock(&priv->lock); - __skb_queue_tail(&neigh->queue, skb); - spin_unlock(&priv->lock); - } else { - ++priv->stats.tx_dropped; - dev_kfree_skb_any(skb); - } + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + spin_lock_irqsave(&priv->lock, flags); + __skb_queue_tail(&neigh->queue, skb); + spin_unlock_irqrestore(&priv->lock, flags); } else { - struct ipoib_pseudoheader *phdr = - (struct ipoib_pseudoheader *) skb->data; - skb_pull(skb, sizeof *phdr); - - if (phdr->hwaddr[4] == 0xff) { - /* Add in the P_Key for multicast*/ - phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; - phdr->hwaddr[9] = priv->pkey & 0xff; - - ipoib_mcast_send(dev, (union ib_gid *) (phdr->hwaddr + 4), skb); - } else { - /* unicast GID -- should be ARP or RARP reply */ - - if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) && - (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) { - ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x " - IPOIB_GID_FMT "\n", - skb->dst ? "neigh" : "dst", - be16_to_cpup((__be16 *) skb->data), - be32_to_cpup((__be32 *) phdr->hwaddr), - IPOIB_GID_ARG(*(union ib_gid *) (phdr->hwaddr + 4))); - dev_kfree_skb_any(skb); - ++priv->stats.tx_dropped; - goto out; - } - - unicast_arp_send(skb, dev, phdr); - } + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); } -out: - spin_unlock_irqrestore(&priv->tx_lock, flags); +unref: + ipoib_neigh_put(neigh); return NETDEV_TX_OK; } -static struct net_device_stats *ipoib_get_stats(struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - - return &priv->stats; -} - static void ipoib_timeout(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -642,9 +810,10 @@ static void ipoib_timeout(struct net_device *dev) static int ipoib_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, - void *daddr, void *saddr, unsigned len) + const void *daddr, const void *saddr, unsigned len) { struct ipoib_header *header; + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; header = (struct ipoib_header *) skb_push(skb, sizeof *header); @@ -652,97 +821,466 @@ static int ipoib_hard_header(struct sk_buff *skb, header->reserved = 0; /* - * If we don't have a neighbour structure, stuff the - * destination address onto the front of the skb so we can - * figure out where to send the packet later. + * we don't rely on dst_entry structure, always stuff the + * destination address into skb->cb so we can figure out where + * to send the packet later. */ - if (!skb->dst || !skb->dst->neighbour) { - struct ipoib_pseudoheader *phdr = - (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr); - memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); - } + memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); - return 0; + return sizeof *header; } static void ipoib_set_mcast_list(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); + return; + } + queue_work(ipoib_workqueue, &priv->restart_task); } -static void ipoib_neigh_destructor(struct neighbour *n) +static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) { - struct ipoib_neigh *neigh; - struct ipoib_dev_priv *priv = netdev_priv(n->dev); + /* + * Use only the address parts that contributes to spreading + * The subnet prefix is not used as one can not connect to + * same remote port (GUID) using the same remote QPN via two + * different subnets. + */ + /* qpn octets[1:4) & port GUID octets[12:20) */ + u32 *d32 = (u32 *) daddr; + u32 hv; + + hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); + return hv & htbl->mask; +} + +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh = NULL; + u32 hash_val; + + rcu_read_lock_bh(); + + htbl = rcu_dereference_bh(ntbl->htbl); + + if (!htbl) + goto out_unlock; + + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); + neigh != NULL; + neigh = rcu_dereference_bh(neigh->hnext)) { + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + goto out_unlock; + } + neigh->alive = jiffies; + goto out_unlock; + } + } + +out_unlock: + rcu_read_unlock_bh(); + return neigh; +} + +static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long neigh_obsolete; + unsigned long dt; unsigned long flags; - struct ipoib_ah *ah = NULL; + int i; - ipoib_dbg(priv, - "neigh_destructor for %06x " IPOIB_GID_FMT "\n", - be32_to_cpup((__be32 *) n->ha), - IPOIB_GID_ARG(*((union ib_gid *) (n->ha + 4)))); + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + return; spin_lock_irqsave(&priv->lock, flags); - neigh = *to_ipoib_neigh(n); - if (neigh) { - if (neigh->ah) - ah = neigh->ah; - list_del(&neigh->list); - *to_ipoib_neigh(n) = NULL; - kfree(neigh); + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + /* neigh is obsolete if it was idle for two GC periods */ + dt = 2 * arp_tbl.gc_interval; + neigh_obsolete = jiffies - dt; + /* handle possible race condition */ + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* was the neigh idle for two GC periods */ + if (time_after(neigh_obsolete, neigh->alive)) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } } +out_unlock: spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_reap_neigh(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); + + __ipoib_reap_neigh(priv); - if (ah) - ipoib_put_ah(ah); + if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + arp_tbl.gc_interval); } -static int ipoib_neigh_setup(struct neighbour *neigh) + +static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, + struct net_device *dev) { - /* - * Is this kosher? I can't find anybody in the kernel that - * sets neigh->destructor, so we should be able to set it here - * without trouble. + struct ipoib_neigh *neigh; + + neigh = kzalloc(sizeof *neigh, GFP_ATOMIC); + if (!neigh) + return NULL; + + neigh->dev = dev; + memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); + skb_queue_head_init(&neigh->queue); + INIT_LIST_HEAD(&neigh->list); + ipoib_cm_set(neigh, NULL); + /* one ref on behalf of the caller */ + atomic_set(&neigh->refcnt, 1); + + return neigh; +} + +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) { + neigh = NULL; + goto out_unlock; + } + + /* need to add a new neigh, but maybe some other thread succeeded? + * recalc hash, maybe hash resize took place so we do a search */ - neigh->ops->destructor = ipoib_neigh_destructor; + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock)); + neigh != NULL; + neigh = rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))) { + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + break; + } + neigh->alive = jiffies; + goto out_unlock; + } + } - return 0; + neigh = ipoib_neigh_ctor(daddr, dev); + if (!neigh) + goto out_unlock; + + /* one ref on behalf of the hash table */ + atomic_inc(&neigh->refcnt); + neigh->alive = jiffies; + /* put in hash */ + rcu_assign_pointer(neigh->hnext, + rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock))); + rcu_assign_pointer(htbl->buckets[hash_val], neigh); + atomic_inc(&ntbl->entries); + +out_unlock: + + return neigh; +} + +void ipoib_neigh_dtor(struct ipoib_neigh *neigh) +{ + /* neigh reference count was dropprd to zero */ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + if (neigh->ah) + ipoib_put_ah(neigh->ah); + while ((skb = __skb_dequeue(&neigh->queue))) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + if (ipoib_cm_get(neigh)) + ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); + ipoib_dbg(netdev_priv(dev), + "neigh free for %06x %pI6\n", + IPOIB_QPN(neigh->daddr), + neigh->daddr + 4); + kfree(neigh); + if (atomic_dec_and_test(&priv->ntbl.entries)) { + if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) + complete(&priv->ntbl.flushed); + } +} + +static void ipoib_neigh_reclaim(struct rcu_head *rp) +{ + /* Called as a result of removal from hash table */ + struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); + /* note TX context may hold another ref */ + ipoib_neigh_put(neigh); +} + +void ipoib_neigh_free(struct ipoib_neigh *neigh) +{ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh __rcu **np; + struct ipoib_neigh *n; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) + return; + + hash_val = ipoib_addr_hash(htbl, neigh->daddr); + np = &htbl->buckets[hash_val]; + for (n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock)); + n != NULL; + n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) { + if (n == neigh) { + /* found */ + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + return; + } else { + np = &n->hnext; + } + } } -static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) +static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) { - parms->neigh_setup = ipoib_neigh_setup; + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh **buckets; + u32 size; + + clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + ntbl->htbl = NULL; + htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); + if (!htbl) + return -ENOMEM; + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + size = roundup_pow_of_two(arp_tbl.gc_thresh3); + buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL); + if (!buckets) { + kfree(htbl); + return -ENOMEM; + } + htbl->size = size; + htbl->mask = (size - 1); + htbl->buckets = buckets; + ntbl->htbl = htbl; + htbl->ntbl = ntbl; + atomic_set(&ntbl->entries, 0); + + /* start garbage collection */ + clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + arp_tbl.gc_interval); return 0; } +static void neigh_hash_free_rcu(struct rcu_head *head) +{ + struct ipoib_neigh_hash *htbl = container_of(head, + struct ipoib_neigh_hash, + rcu); + struct ipoib_neigh __rcu **buckets = htbl->buckets; + struct ipoib_neigh_table *ntbl = htbl->ntbl; + + kfree(buckets); + kfree(htbl); + complete(&ntbl->deleted); +} + +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i; + + /* remove all neigh connected to a given path or mcast */ + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* delete neighs belong to this parent */ + if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i, wait_flushed = 0; + + init_completion(&priv->ntbl.flushed); + + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) + goto out_unlock; + + wait_flushed = atomic_read(&priv->ntbl.entries); + if (!wait_flushed) + goto free_htbl; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } + } + +free_htbl: + rcu_assign_pointer(ntbl->htbl, NULL); + call_rcu(&htbl->rcu, neigh_hash_free_rcu); + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); + if (wait_flushed) + wait_for_completion(&priv->ntbl.flushed); +} + +static void ipoib_neigh_hash_uninit(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int stopped; + + ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); + init_completion(&priv->ntbl.deleted); + set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + + /* Stop GC if called at init fail need to cancel work */ + stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + if (!stopped) + cancel_delayed_work(&priv->neigh_reap_task); + + ipoib_flush_neighs(priv); + + wait_for_completion(&priv->ntbl.deleted); +} + + int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) { struct ipoib_dev_priv *priv = netdev_priv(dev); + if (ipoib_neigh_hash_init(priv) < 0) + goto out; /* Allocate RX/TX "rings" to hold queued skbs */ - - priv->rx_ring = kzalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_rx_buf), + priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", - ca->name, IPOIB_RX_RING_SIZE); - goto out; + ca->name, ipoib_recvq_size); + goto out_neigh_hash_cleanup; } - priv->tx_ring = kzalloc(IPOIB_TX_RING_SIZE * sizeof (struct ipoib_tx_buf), - GFP_KERNEL); + priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", - ca->name, IPOIB_TX_RING_SIZE); + ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } - /* priv->tx_head & tx_tail are already 0 */ + /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ if (ipoib_ib_dev_init(dev, ca, port)) goto out_tx_ring_cleanup; @@ -750,11 +1288,13 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) return 0; out_tx_ring_cleanup: - kfree(priv->tx_ring); + vfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); +out_neigh_hash_cleanup: + ipoib_neigh_hash_uninit(dev); out: return -ENOMEM; } @@ -762,81 +1302,92 @@ out: void ipoib_dev_cleanup(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; + LIST_HEAD(head); + + ASSERT_RTNL(); - ipoib_delete_debug_file(dev); + ipoib_delete_debug_files(dev); /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { - unregister_netdev(cpriv->dev); - ipoib_dev_cleanup(cpriv->dev); - free_netdev(cpriv->dev); + /* Stop GC on child */ + set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags); + cancel_delayed_work(&cpriv->neigh_reap_task); + unregister_netdevice_queue(cpriv->dev, &head); } + unregister_netdevice_many(&head); ipoib_ib_dev_cleanup(dev); kfree(priv->rx_ring); - kfree(priv->tx_ring); + vfree(priv->tx_ring); priv->rx_ring = NULL; priv->tx_ring = NULL; + + ipoib_neigh_hash_uninit(dev); } -static void ipoib_setup(struct net_device *dev) +static const struct header_ops ipoib_header_ops = { + .create = ipoib_hard_header, +}; + +static const struct net_device_ops ipoib_netdev_ops = { + .ndo_uninit = ipoib_uninit, + .ndo_open = ipoib_open, + .ndo_stop = ipoib_stop, + .ndo_change_mtu = ipoib_change_mtu, + .ndo_fix_features = ipoib_fix_features, + .ndo_start_xmit = ipoib_start_xmit, + .ndo_tx_timeout = ipoib_timeout, + .ndo_set_rx_mode = ipoib_set_mcast_list, +}; + +void ipoib_setup(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - dev->open = ipoib_open; - dev->stop = ipoib_stop; - dev->change_mtu = ipoib_change_mtu; - dev->hard_start_xmit = ipoib_start_xmit; - dev->get_stats = ipoib_get_stats; - dev->tx_timeout = ipoib_timeout; - dev->hard_header = ipoib_hard_header; - dev->set_multicast_list = ipoib_set_mcast_list; - dev->neigh_setup = ipoib_neigh_setup_dev; - - dev->watchdog_timeo = HZ; + dev->netdev_ops = &ipoib_netdev_ops; + dev->header_ops = &ipoib_header_ops; - dev->flags |= IFF_BROADCAST | IFF_MULTICAST; + ipoib_set_ethtool_ops(dev); - /* - * We add in INFINIBAND_ALEN to allow for the destination - * address "pseudoheader" for skbs without neighbour struct. - */ - dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN; - dev->addr_len = INFINIBAND_ALEN; - dev->type = ARPHRD_INFINIBAND; - dev->tx_queue_len = IPOIB_TX_RING_SIZE * 2; - dev->features = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX; + netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT); - /* MTU will be reset when mcast join happens */ - dev->mtu = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN; - priv->mcast_mtu = priv->admin_mtu = dev->mtu; + dev->watchdog_timeo = HZ; - memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); + dev->flags |= IFF_BROADCAST | IFF_MULTICAST; - netif_carrier_off(dev); + dev->hard_header_len = IPOIB_ENCAP_LEN; + dev->addr_len = INFINIBAND_ALEN; + dev->type = ARPHRD_INFINIBAND; + dev->tx_queue_len = ipoib_sendq_size * 2; + dev->features = (NETIF_F_VLAN_CHALLENGED | + NETIF_F_HIGHDMA); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; - SET_MODULE_OWNER(dev); + memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); priv->dev = dev; spin_lock_init(&priv->lock); - spin_lock_init(&priv->tx_lock); - init_MUTEX(&priv->mcast_mutex); - init_MUTEX(&priv->vlan_mutex); + init_rwsem(&priv->vlan_rwsem); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); - INIT_WORK(&priv->pkey_task, ipoib_pkey_poll, priv->dev); - INIT_WORK(&priv->mcast_task, ipoib_mcast_join_task, priv->dev); - INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush, priv->dev); - INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task, priv->dev); - INIT_WORK(&priv->ah_reap_task, ipoib_reap_ah, priv->dev); + INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); + INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); + INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); + INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); + INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); + INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); + INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); + INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); + INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); } struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) @@ -851,16 +1402,54 @@ struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) return netdev_priv(dev); } -static ssize_t show_pkey(struct class_device *cdev, char *buf) +static ssize_t show_pkey(struct device *dev, + struct device_attribute *attr, char *buf) { - struct ipoib_dev_priv *priv = - netdev_priv(container_of(cdev, struct net_device, class_dev)); + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); return sprintf(buf, "0x%04x\n", priv->pkey); } -static CLASS_DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); +static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); + +static ssize_t show_umcast(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + + return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); +} + +void ipoib_set_umcast(struct net_device *ndev, int umcast_val) +{ + struct ipoib_dev_priv *priv = netdev_priv(ndev); + + if (umcast_val > 0) { + set_bit(IPOIB_FLAG_UMCAST, &priv->flags); + ipoib_warn(priv, "ignoring multicast groups joined directly " + "by userspace\n"); + } else + clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); +} + +static ssize_t set_umcast(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long umcast_val = simple_strtoul(buf, NULL, 0); -static ssize_t create_child(struct class_device *cdev, + ipoib_set_umcast(to_net_dev(dev), umcast_val); + + return count; +} +static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast); + +int ipoib_add_umcast_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_umcast); +} + +static ssize_t create_child(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { int pkey; @@ -869,7 +1458,7 @@ static ssize_t create_child(struct class_device *cdev, if (sscanf(buf, "%i", &pkey) != 1) return -EINVAL; - if (pkey < 0 || pkey > 0xffff) + if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) return -EINVAL; /* @@ -878,14 +1467,14 @@ static ssize_t create_child(struct class_device *cdev, */ pkey |= 0x8000; - ret = ipoib_vlan_add(container_of(cdev, struct net_device, class_dev), - pkey); + ret = ipoib_vlan_add(to_net_dev(dev), pkey); return ret ? ret : count; } -static CLASS_DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child); +static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child); -static ssize_t delete_child(struct class_device *cdev, +static ssize_t delete_child(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { int pkey; @@ -897,24 +1486,59 @@ static ssize_t delete_child(struct class_device *cdev, if (pkey < 0 || pkey > 0xffff) return -EINVAL; - ret = ipoib_vlan_delete(container_of(cdev, struct net_device, class_dev), - pkey); + ret = ipoib_vlan_delete(to_net_dev(dev), pkey); return ret ? ret : count; } -static CLASS_DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child); +static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child); int ipoib_add_pkey_attr(struct net_device *dev) { - return class_device_create_file(&dev->class_dev, - &class_device_attr_pkey); + return device_create_file(&dev->dev, &dev_attr_pkey); +} + +int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) +{ + struct ib_device_attr *device_attr; + int result = -ENOMEM; + + device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL); + if (!device_attr) { + printk(KERN_WARNING "%s: allocation of %zu bytes failed\n", + hca->name, sizeof *device_attr); + return result; + } + + result = ib_query_device(hca, device_attr); + if (result) { + printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n", + hca->name, result); + kfree(device_attr); + return result; + } + priv->hca_caps = device_attr->device_cap_flags; + + kfree(device_attr); + + if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { + priv->dev->hw_features = NETIF_F_SG | + NETIF_F_IP_CSUM | NETIF_F_RXCSUM; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + priv->dev->hw_features |= NETIF_F_TSO; + + priv->dev->features |= priv->dev->hw_features; + } + + return 0; } static struct net_device *ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { struct ipoib_dev_priv *priv; + struct ib_port_attr attr; int result = -ENOMEM; priv = ipoib_intf_alloc(format); @@ -922,14 +1546,32 @@ static struct net_device *ipoib_add_port(const char *format, goto alloc_mem_failed; SET_NETDEV_DEV(priv->dev, hca->dma_device); + priv->dev->dev_id = port - 1; + + if (!ib_query_port(hca, port, &attr)) + priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); + else { + printk(KERN_WARNING "%s: ib_query_port %d failed\n", + hca->name, port); + goto device_init_failed; + } + + /* MTU will be reset when mcast join happens */ + priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + + priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh); result = ib_query_pkey(hca, port, 0, &priv->pkey); if (result) { printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", hca->name, port, result); - goto alloc_mem_failed; + goto device_init_failed; } + if (ipoib_set_dev_features(priv, hca)) + goto device_init_failed; + /* * Set the full membership bit, so that we join the right * broadcast group, etc. @@ -943,11 +1585,10 @@ static struct net_device *ipoib_add_port(const char *format, if (result) { printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); - goto alloc_mem_failed; + goto device_init_failed; } else memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); - result = ipoib_dev_init(priv->dev, hca, port); if (result < 0) { printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", @@ -972,29 +1613,31 @@ static struct net_device *ipoib_add_port(const char *format, goto register_failed; } - if (ipoib_create_debug_file(priv->dev)) - goto debug_failed; + ipoib_create_debug_files(priv->dev); + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; if (ipoib_add_pkey_attr(priv->dev)) goto sysfs_failed; - if (class_device_create_file(&priv->dev->class_dev, - &class_device_attr_create_child)) + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; + if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) goto sysfs_failed; - if (class_device_create_file(&priv->dev->class_dev, - &class_device_attr_delete_child)) + if (device_create_file(&priv->dev->dev, &dev_attr_delete_child)) goto sysfs_failed; return priv->dev; sysfs_failed: - ipoib_delete_debug_file(priv->dev); - -debug_failed: + ipoib_delete_debug_files(priv->dev); unregister_netdev(priv->dev); register_failed: ib_unregister_event_handler(&priv->event_handler); - flush_scheduled_work(); + /* Stop GC if started before flush */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); + flush_workqueue(ipoib_workqueue); event_failed: ipoib_dev_cleanup(priv->dev); @@ -1013,13 +1656,16 @@ static void ipoib_add_one(struct ib_device *device) struct ipoib_dev_priv *priv; int s, e, p; + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); if (!dev_list) return; INIT_LIST_HEAD(dev_list); - if (device->node_type == IB_NODE_SWITCH) { + if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; } else { @@ -1028,6 +1674,8 @@ static void ipoib_add_one(struct ib_device *device) } for (p = s; p <= e; ++p) { + if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) + continue; dev = ipoib_add_port("ib%d", device, p); if (!IS_ERR(dev)) { priv = netdev_priv(dev); @@ -1043,14 +1691,26 @@ static void ipoib_remove_one(struct ib_device *device) struct ipoib_dev_priv *priv, *tmp; struct list_head *dev_list; + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + dev_list = ib_get_client_data(device, &ipoib_client); + if (!dev_list) + return; list_for_each_entry_safe(priv, tmp, dev_list, list) { ib_unregister_event_handler(&priv->event_handler); - flush_scheduled_work(); + + rtnl_lock(); + dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); + rtnl_unlock(); + + /* Stop GC */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); + flush_workqueue(ipoib_workqueue); unregister_netdev(priv->dev); - ipoib_dev_cleanup(priv->dev); free_netdev(priv->dev); } @@ -1061,6 +1721,23 @@ static int __init ipoib_init_module(void) { int ret; + ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); + ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); + + ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); + ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); +#ifdef CONFIG_INFINIBAND_IPOIB_CM + ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); +#endif + + /* + * When copying small received packets, we only copy from the + * linear data part of the SKB, so we rely on this condition. + */ + BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); + ret = ipoib_register_debugfs(); if (ret) return ret; @@ -1079,13 +1756,23 @@ static int __init ipoib_init_module(void) goto err_fs; } + ib_sa_register_client(&ipoib_sa_client); + ret = ib_register_client(&ipoib_client); if (ret) - goto err_wq; + goto err_sa; + + ret = ipoib_netlink_init(); + if (ret) + goto err_client; return 0; -err_wq: +err_client: + ib_unregister_client(&ipoib_client); + +err_sa: + ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); err_fs: @@ -1096,7 +1783,9 @@ err_fs: static void __exit ipoib_cleanup_module(void) { + ipoib_netlink_fini(); ib_unregister_client(&ipoib_client); + ib_sa_unregister_client(&ipoib_sa_client); ipoib_unregister_debugfs(); destroy_workqueue(ipoib_workqueue); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 3ecf78a9493..d4e005720d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -30,18 +30,20 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib_multicast.c 1362 2004-12-18 15:56:29Z roland $ */ #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <linux/moduleparam.h> #include <linux/ip.h> #include <linux/in.h> #include <linux/igmp.h> #include <linux/inetdevice.h> #include <linux/delay.h> #include <linux/completion.h> +#include <linux/slab.h> + +#include <net/dst.h> #include "ipoib.h" @@ -53,32 +55,7 @@ MODULE_PARM_DESC(mcast_debug_level, "Enable multicast debug tracing if > 0"); #endif -static DECLARE_MUTEX(mcast_mutex); - -/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ -struct ipoib_mcast { - struct ib_sa_mcmember_rec mcmember; - struct ipoib_ah *ah; - - struct rb_node rb_node; - struct list_head list; - struct completion done; - - int query_id; - struct ib_sa_query *query; - - unsigned long created; - unsigned long backoff; - - unsigned long flags; - unsigned char logcount; - - struct list_head neigh_list; - - struct sk_buff_head pkt_queue; - - struct net_device *dev; -}; +static DEFINE_MUTEX(mcast_mutex); struct ipoib_mcast_iter { struct net_device *dev; @@ -92,41 +69,26 @@ struct ipoib_mcast_iter { static void ipoib_mcast_free(struct ipoib_mcast *mcast) { struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_neigh *neigh, *tmp; - unsigned long flags; - LIST_HEAD(ah_list); - struct ipoib_ah *ah, *tah; + int tx_dropped = 0; - ipoib_dbg_mcast(netdev_priv(dev), - "deleting multicast group " IPOIB_GID_FMT "\n", - IPOIB_GID_ARG(mcast->mcmember.mgid)); + ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n", + mcast->mcmember.mgid.raw); - spin_lock_irqsave(&priv->lock, flags); - - list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) { - if (neigh->ah) - list_add_tail(&neigh->ah->list, &ah_list); - *to_ipoib_neigh(neigh->neighbour) = NULL; - neigh->neighbour->ops->destructor = NULL; - kfree(neigh); - } - - spin_unlock_irqrestore(&priv->lock, flags); - - list_for_each_entry_safe(ah, tah, &ah_list, list) - ipoib_put_ah(ah); + /* remove all neigh connected to this mcast */ + ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw); if (mcast->ah) ipoib_put_ah(mcast->ah); while (!skb_queue_empty(&mcast->pkt_queue)) { - struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); - - skb->dev = dev; - dev_kfree_skb_any(skb); + ++tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); } + netif_tx_lock_bh(dev); + dev->stats.tx_dropped += tx_dropped; + netif_tx_unlock_bh(dev); + kfree(mcast); } @@ -139,24 +101,18 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, if (!mcast) return NULL; - init_completion(&mcast->done); - mcast->dev = dev; mcast->created = jiffies; mcast->backoff = 1; - mcast->logcount = 0; INIT_LIST_HEAD(&mcast->list); INIT_LIST_HEAD(&mcast->neigh_list); skb_queue_head_init(&mcast->pkt_queue); - mcast->ah = NULL; - mcast->query = NULL; - return mcast; } -static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, union ib_gid *mgid) +static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct rb_node *n = priv->multicast_tree.rb_node; @@ -167,7 +123,7 @@ static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, union ib_g mcast = rb_entry(n, struct ipoib_mcast, rb_node); - ret = memcmp(mgid->raw, mcast->mcmember.mgid.raw, + ret = memcmp(mgid, mcast->mcmember.mgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; @@ -213,32 +169,48 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, { struct net_device *dev = mcast->dev; struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_ah *ah; int ret; + int set_qkey = 0; mcast->mcmember = *mcmember; - /* Set the cached Q_Key before we attach if it's the broadcast group */ + /* Set the multicast MTU and cached Q_Key before we attach if it's + * the broadcast group. + */ if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid))) { + spin_lock_irq(&priv->lock); + if (!priv->broadcast) { + spin_unlock_irq(&priv->lock); + return -EAGAIN; + } + priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); + spin_unlock_irq(&priv->lock); priv->tx_wr.wr.ud.remote_qkey = priv->qkey; + set_qkey = 1; + + if (!ipoib_cm_admin_enabled(dev)) { + rtnl_lock(); + dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); + rtnl_unlock(); + } } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { - ipoib_warn(priv, "multicast group " IPOIB_GID_FMT - " already attached\n", - IPOIB_GID_ARG(mcast->mcmember.mgid)); + ipoib_warn(priv, "multicast group %pI6 already attached\n", + mcast->mcmember.mgid.raw); return 0; } ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid), - &mcast->mcmember.mgid); + &mcast->mcmember.mgid, set_qkey); if (ret < 0) { - ipoib_warn(priv, "couldn't attach QP to multicast group " - IPOIB_GID_FMT "\n", - IPOIB_GID_ARG(mcast->mcmember.mgid)); + ipoib_warn(priv, "couldn't attach QP to multicast group %pI6\n", + mcast->mcmember.mgid.raw); clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags); return ret; @@ -251,6 +223,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, .port_num = priv->port, .sl = mcast->mcmember.sl, .ah_flags = IB_AH_GRH, + .static_rate = mcast->mcmember.rate, .grh = { .flow_label = be32_to_cpu(mcast->mcmember.flow_label), .hop_limit = mcast->mcmember.hop_limit, @@ -258,24 +231,21 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, .traffic_class = mcast->mcmember.traffic_class } }; - int path_rate = ib_sa_rate_enum_to_int(mcast->mcmember.rate); - av.grh.dgid = mcast->mcmember.mgid; - if (path_rate > 0 && priv->local_rate > path_rate) - av.static_rate = (priv->local_rate - 1) / path_rate; - - ipoib_dbg_mcast(priv, "static_rate %d for local port %dX, mcmember %dX\n", - av.static_rate, priv->local_rate, - ib_sa_rate_enum_to_int(mcast->mcmember.rate)); - - mcast->ah = ipoib_create_ah(dev, priv->pd, &av); - if (!mcast->ah) { - ipoib_warn(priv, "ib_address_create failed\n"); + ah = ipoib_create_ah(dev, priv->pd, &av); + if (IS_ERR(ah)) { + ipoib_warn(priv, "ib_address_create failed %ld\n", + -PTR_ERR(ah)); + /* use original error */ + return PTR_ERR(ah); } else { - ipoib_dbg_mcast(priv, "MGID " IPOIB_GID_FMT - " AV %p, LID 0x%04x, SL %d\n", - IPOIB_GID_ARG(mcast->mcmember.mgid), + spin_lock_irq(&priv->lock); + mcast->ah = ah; + spin_unlock_irq(&priv->lock); + + ipoib_dbg_mcast(priv, "MGID %pI6 AV %p, LID 0x%04x, SL %d\n", + mcast->mcmember.mgid.raw, mcast->ah->ah, be16_to_cpu(mcast->mcmember.mlid), mcast->mcmember.sl); @@ -283,53 +253,55 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, } /* actually send any queued packets */ + netif_tx_lock_bh(dev); while (!skb_queue_empty(&mcast->pkt_queue)) { struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); - skb->dev = dev; - - if (!skb->dst || !skb->dst->neighbour) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof (struct ipoib_pseudoheader)); - } + netif_tx_unlock_bh(dev); + skb->dev = dev; if (dev_queue_xmit(skb)) ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); + + netif_tx_lock_bh(dev); } + netif_tx_unlock_bh(dev); return 0; } -static void +static int ipoib_mcast_sendonly_join_complete(int status, - struct ib_sa_mcmember_rec *mcmember, - void *mcast_ptr) + struct ib_sa_multicast *multicast) { - struct ipoib_mcast *mcast = mcast_ptr; + struct ipoib_mcast *mcast = multicast->context; struct net_device *dev = mcast->dev; + /* We trap for port events ourselves. */ + if (status == -ENETRESET) + return 0; + if (!status) - ipoib_mcast_join_finish(mcast, mcmember); - else { + status = ipoib_mcast_join_finish(mcast, &multicast->rec); + + if (status) { if (mcast->logcount++ < 20) - ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for " - IPOIB_GID_FMT ", status %d\n", - IPOIB_GID_ARG(mcast->mcmember.mgid), status); + ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", + mcast->mcmember.mgid.raw, status); /* Flush out any queued packets */ + netif_tx_lock_bh(dev); while (!skb_queue_empty(&mcast->pkt_queue)) { - struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); - - skb->dev = dev; - - dev_kfree_skb_any(skb); + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); } + netif_tx_unlock_bh(dev); /* Clear the busy flag so we try again */ - clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, + &mcast->flags); } - - complete(&mcast->done); + return status; } static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) @@ -359,66 +331,95 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) rec.port_gid = priv->local_gid; rec.pkey = cpu_to_be16(priv->pkey); - ret = ib_sa_mcmember_rec_set(priv->ca, priv->port, &rec, - IB_SA_MCMEMBER_REC_MGID | - IB_SA_MCMEMBER_REC_PORT_GID | - IB_SA_MCMEMBER_REC_PKEY | - IB_SA_MCMEMBER_REC_JOIN_STATE, - 1000, GFP_ATOMIC, - ipoib_mcast_sendonly_join_complete, - mcast, &mcast->query); - if (ret < 0) { - ipoib_warn(priv, "ib_sa_mcmember_rec_set failed (ret = %d)\n", + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, + priv->port, &rec, + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | + IB_SA_MCMEMBER_REC_JOIN_STATE, + GFP_ATOMIC, + ipoib_mcast_sendonly_join_complete, + mcast); + if (IS_ERR(mcast->mc)) { + ret = PTR_ERR(mcast->mc); + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", ret); } else { - ipoib_dbg_mcast(priv, "no multicast record for " IPOIB_GID_FMT - ", starting join\n", - IPOIB_GID_ARG(mcast->mcmember.mgid)); - - mcast->query_id = ret; + ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", + mcast->mcmember.mgid.raw); } return ret; } -static void ipoib_mcast_join_complete(int status, - struct ib_sa_mcmember_rec *mcmember, - void *mcast_ptr) +void ipoib_mcast_carrier_on_task(struct work_struct *work) { - struct ipoib_mcast *mcast = mcast_ptr; + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + carrier_on_task); + struct ib_port_attr attr; + + /* + * Take rtnl_lock to avoid racing with ipoib_stop() and + * turning the carrier back on while a device is being + * removed. + */ + if (ib_query_port(priv->ca, priv->port, &attr) || + attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); + return; + } + + rtnl_lock(); + netif_carrier_on(priv->dev); + rtnl_unlock(); +} + +static int ipoib_mcast_join_complete(int status, + struct ib_sa_multicast *multicast) +{ + struct ipoib_mcast *mcast = multicast->context; struct net_device *dev = mcast->dev; struct ipoib_dev_priv *priv = netdev_priv(dev); - ipoib_dbg_mcast(priv, "join completion for " IPOIB_GID_FMT - " (status %d)\n", - IPOIB_GID_ARG(mcast->mcmember.mgid), status); + ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", + mcast->mcmember.mgid.raw, status); - if (!status && !ipoib_mcast_join_finish(mcast, mcmember)) { + /* We trap for port events ourselves. */ + if (status == -ENETRESET) { + status = 0; + goto out; + } + + if (!status) + status = ipoib_mcast_join_finish(mcast, &multicast->rec); + + if (!status) { mcast->backoff = 1; - down(&mcast_mutex); + mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_work(ipoib_workqueue, &priv->mcast_task); - up(&mcast_mutex); - complete(&mcast->done); - return; - } + queue_delayed_work(ipoib_workqueue, + &priv->mcast_task, 0); + mutex_unlock(&mcast_mutex); - if (status == -EINTR) { - complete(&mcast->done); - return; + /* + * Defer carrier on work to ipoib_workqueue to avoid a + * deadlock on rtnl_lock here. + */ + if (mcast == priv->broadcast) + queue_work(ipoib_workqueue, &priv->carrier_on_task); + + status = 0; + goto out; } - if (status && mcast->logcount++ < 20) { - if (status == -ETIMEDOUT || status == -EINTR) { - ipoib_dbg_mcast(priv, "multicast join failed for " IPOIB_GID_FMT - ", status %d\n", - IPOIB_GID_ARG(mcast->mcmember.mgid), - status); + if (mcast->logcount++ < 20) { + if (status == -ETIMEDOUT || status == -EAGAIN) { + ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", + mcast->mcmember.mgid.raw, status); } else { - ipoib_warn(priv, "multicast join failed for " - IPOIB_GID_FMT ", status %d\n", - IPOIB_GID_ARG(mcast->mcmember.mgid), - status); + ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", + mcast->mcmember.mgid.raw, status); } } @@ -426,20 +427,19 @@ static void ipoib_mcast_join_complete(int status, if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; - mcast->query = NULL; - - down(&mcast_mutex); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) { - if (status == -ETIMEDOUT) - queue_work(ipoib_workqueue, &priv->mcast_task); - else - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, - mcast->backoff * HZ); - } else - complete(&mcast->done); - up(&mcast_mutex); + /* Clear the busy flag so we try again */ + status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - return; + mutex_lock(&mcast_mutex); + spin_lock_irq(&priv->lock); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_task, + mcast->backoff * HZ); + spin_unlock_irq(&priv->lock); + mutex_unlock(&mcast_mutex); +out: + complete(&mcast->done); + return status; } static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, @@ -452,8 +452,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, ib_sa_comp_mask comp_mask; int ret = 0; - ipoib_dbg_mcast(priv, "joining MGID " IPOIB_GID_FMT "\n", - IPOIB_GID_ARG(mcast->mcmember.mgid)); + ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw); rec.mgid = mcast->mcmember.mgid; rec.port_gid = priv->local_gid; @@ -467,85 +466,113 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, if (create) { comp_mask |= - IB_SA_MCMEMBER_REC_QKEY | - IB_SA_MCMEMBER_REC_SL | - IB_SA_MCMEMBER_REC_FLOW_LABEL | - IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; + IB_SA_MCMEMBER_REC_QKEY | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_HOP_LIMIT; rec.qkey = priv->broadcast->mcmember.qkey; + rec.mtu_selector = IB_SA_EQ; + rec.mtu = priv->broadcast->mcmember.mtu; + rec.traffic_class = priv->broadcast->mcmember.traffic_class; + rec.rate_selector = IB_SA_EQ; + rec.rate = priv->broadcast->mcmember.rate; rec.sl = priv->broadcast->mcmember.sl; rec.flow_label = priv->broadcast->mcmember.flow_label; - rec.traffic_class = priv->broadcast->mcmember.traffic_class; + rec.hop_limit = priv->broadcast->mcmember.hop_limit; } - ret = ib_sa_mcmember_rec_set(priv->ca, priv->port, &rec, comp_mask, - mcast->backoff * 1000, GFP_ATOMIC, - ipoib_mcast_join_complete, - mcast, &mcast->query); + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); - if (ret < 0) { - ipoib_warn(priv, "ib_sa_mcmember_rec_set failed, status %d\n", ret); + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, + &rec, comp_mask, GFP_KERNEL, + ipoib_mcast_join_complete, mcast); + if (IS_ERR(mcast->mc)) { + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + complete(&mcast->done); + ret = PTR_ERR(mcast->mc); + ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); mcast->backoff *= 2; if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; - down(&mcast_mutex); + mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, mcast->backoff * HZ); - up(&mcast_mutex); - } else - mcast->query_id = ret; + mutex_unlock(&mcast_mutex); + } } -void ipoib_mcast_join_task(void *dev_ptr) +void ipoib_mcast_join_task(struct work_struct *work) { - struct net_device *dev = dev_ptr; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, mcast_task.work); + struct net_device *dev = priv->dev; + struct ib_port_attr port_attr; if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; + if (ib_query_port(priv->ca, priv->port, &port_attr) || + port_attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n", + port_attr.state); + return; + } + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) - ipoib_warn(priv, "ib_gid_entry_get() failed\n"); + ipoib_warn(priv, "ib_query_gid() failed\n"); else memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); { struct ib_port_attr attr; - if (!ib_query_port(priv->ca, priv->port, &attr)) { - priv->local_lid = attr.lid; - priv->local_rate = attr.active_speed * - ib_width_enum_to_int(attr.active_width); - } else + if (!ib_query_port(priv->ca, priv->port, &attr)) + priv->local_lid = attr.lid; + else ipoib_warn(priv, "ib_query_port failed\n"); } if (!priv->broadcast) { - priv->broadcast = ipoib_mcast_alloc(dev, 1); - if (!priv->broadcast) { + struct ipoib_mcast *broadcast; + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + return; + + broadcast = ipoib_mcast_alloc(dev, 1); + if (!broadcast) { ipoib_warn(priv, "failed to allocate broadcast group\n"); - down(&mcast_mutex); + mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, HZ); - up(&mcast_mutex); + mutex_unlock(&mcast_mutex); return; } - memcpy(priv->broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + spin_lock_irq(&priv->lock); + memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid)); + priv->broadcast = broadcast; - spin_lock_irq(&priv->lock); __ipoib_mcast_add(dev, priv->broadcast); spin_unlock_irq(&priv->lock); } if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { - ipoib_mcast_join(dev, priv->broadcast, 0); + if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) + ipoib_mcast_join(dev, priv->broadcast, 0); return; } @@ -572,14 +599,9 @@ void ipoib_mcast_join_task(void *dev_ptr) return; } - priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) - - IPOIB_ENCAP_LEN; - dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); - ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); clear_bit(IPOIB_MCAST_RUN, &priv->flags); - netif_carrier_on(dev); } int ipoib_mcast_start_thread(struct net_device *dev) @@ -588,10 +610,10 @@ int ipoib_mcast_start_thread(struct net_device *dev) ipoib_dbg_mcast(priv, "starting multicast thread\n"); - down(&mcast_mutex); + mutex_lock(&mcast_mutex); if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_work(ipoib_workqueue, &priv->mcast_task); - up(&mcast_mutex); + queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); + mutex_unlock(&mcast_mutex); return 0; } @@ -599,108 +621,76 @@ int ipoib_mcast_start_thread(struct net_device *dev) int ipoib_mcast_stop_thread(struct net_device *dev, int flush) { struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_mcast *mcast; ipoib_dbg_mcast(priv, "stopping multicast thread\n"); - down(&mcast_mutex); + mutex_lock(&mcast_mutex); clear_bit(IPOIB_MCAST_RUN, &priv->flags); cancel_delayed_work(&priv->mcast_task); - up(&mcast_mutex); + mutex_unlock(&mcast_mutex); if (flush) flush_workqueue(ipoib_workqueue); - if (priv->broadcast && priv->broadcast->query) { - ib_sa_cancel_query(priv->broadcast->query_id, priv->broadcast->query); - priv->broadcast->query = NULL; - ipoib_dbg_mcast(priv, "waiting for bcast\n"); - wait_for_completion(&priv->broadcast->done); - } - - list_for_each_entry(mcast, &priv->multicast_list, list) { - if (mcast->query) { - ib_sa_cancel_query(mcast->query_id, mcast->query); - mcast->query = NULL; - ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n", - IPOIB_GID_ARG(mcast->mcmember.mgid)); - wait_for_completion(&mcast->done); - } - } - return 0; } static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) { struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ib_sa_mcmember_rec rec = { - .join_state = 1 - }; int ret = 0; - if (!test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) - return 0; - - ipoib_dbg_mcast(priv, "leaving MGID " IPOIB_GID_FMT "\n", - IPOIB_GID_ARG(mcast->mcmember.mgid)); + if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ib_sa_free_multicast(mcast->mc); - rec.mgid = mcast->mcmember.mgid; - rec.port_gid = priv->local_gid; - rec.pkey = cpu_to_be16(priv->pkey); + if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + ipoib_dbg_mcast(priv, "leaving MGID %pI6\n", + mcast->mcmember.mgid.raw); - /* Remove ourselves from the multicast group */ - ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid), - &mcast->mcmember.mgid); - if (ret) - ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret); - - /* - * Just make one shot at leaving and don't wait for a reply; - * if we fail, too bad. - */ - ret = ib_sa_mcmember_rec_delete(priv->ca, priv->port, &rec, - IB_SA_MCMEMBER_REC_MGID | - IB_SA_MCMEMBER_REC_PORT_GID | - IB_SA_MCMEMBER_REC_PKEY | - IB_SA_MCMEMBER_REC_JOIN_STATE, - 0, GFP_ATOMIC, NULL, - mcast, &mcast->query); - if (ret < 0) - ipoib_warn(priv, "ib_sa_mcmember_rec_delete failed " - "for leave (result = %d)\n", ret); + /* Remove ourselves from the multicast group */ + ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid, + be16_to_cpu(mcast->mcmember.mlid)); + if (ret) + ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); + } return 0; } -void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid, - struct sk_buff *skb) +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_mcast *mcast; + unsigned long flags; + void *mgid = daddr + 4; - /* - * We can only be called from ipoib_start_xmit, so we're - * inside tx_lock -- no need to save/restore flags. - */ - spin_lock(&priv->lock); + spin_lock_irqsave(&priv->lock, flags); + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) || + !priv->broadcast || + !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } mcast = __ipoib_mcast_find(dev, mgid); if (!mcast) { /* Let's create a new send only group now */ - ipoib_dbg_mcast(priv, "setting up send only multicast group for " - IPOIB_GID_FMT "\n", IPOIB_GID_ARG(*mgid)); + ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", + mgid); mcast = ipoib_mcast_alloc(dev, 0); if (!mcast) { ipoib_warn(priv, "unable to allocate memory for " "multicast structure\n"); + ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); goto out; } set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); - mcast->mcmember.mgid = *mgid; + memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); __ipoib_mcast_add(dev, mcast); list_add_tail(&mcast->list, &priv->multicast_list); } @@ -708,10 +698,12 @@ void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid, if (!mcast->ah) { if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) skb_queue_tail(&mcast->pkt_queue, skb); - else + else { + ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); + } - if (mcast->query) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) ipoib_dbg_mcast(priv, "no address vector, " "but multicast join already started\n"); else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) @@ -726,114 +718,95 @@ void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid, out: if (mcast && mcast->ah) { - if (skb->dst && - skb->dst->neighbour && - !*to_ipoib_neigh(skb->dst->neighbour)) { - struct ipoib_neigh *neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); + struct ipoib_neigh *neigh; + spin_unlock_irqrestore(&priv->lock, flags); + neigh = ipoib_neigh_get(dev, daddr); + spin_lock_irqsave(&priv->lock, flags); + if (!neigh) { + neigh = ipoib_neigh_alloc(daddr, dev); if (neigh) { kref_get(&mcast->ah->ref); - neigh->ah = mcast->ah; - neigh->neighbour = skb->dst->neighbour; - *to_ipoib_neigh(skb->dst->neighbour) = neigh; + neigh->ah = mcast->ah; list_add_tail(&neigh->list, &mcast->neigh_list); } } - + spin_unlock_irqrestore(&priv->lock, flags); ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); + if (neigh) + ipoib_neigh_put(neigh); + return; } - spin_unlock(&priv->lock); +unlock: + spin_unlock_irqrestore(&priv->lock, flags); } void ipoib_mcast_dev_flush(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); LIST_HEAD(remove_list); - struct ipoib_mcast *mcast, *tmcast, *nmcast; + struct ipoib_mcast *mcast, *tmcast; unsigned long flags; ipoib_dbg_mcast(priv, "flushing multicast list\n"); spin_lock_irqsave(&priv->lock, flags); - list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { - nmcast = ipoib_mcast_alloc(dev, 0); - if (nmcast) { - nmcast->flags = - mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY); - - nmcast->mcmember.mgid = mcast->mcmember.mgid; - - /* Add the new group in before the to-be-destroyed group */ - list_add_tail(&nmcast->list, &mcast->list); - list_del_init(&mcast->list); - rb_replace_node(&mcast->rb_node, &nmcast->rb_node, - &priv->multicast_tree); - - list_add_tail(&mcast->list, &remove_list); - } else { - ipoib_warn(priv, "could not reallocate multicast group " - IPOIB_GID_FMT "\n", - IPOIB_GID_ARG(mcast->mcmember.mgid)); - } + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + list_del(&mcast->list); + rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_add_tail(&mcast->list, &remove_list); } if (priv->broadcast) { - nmcast = ipoib_mcast_alloc(dev, 0); - if (nmcast) { - nmcast->mcmember.mgid = priv->broadcast->mcmember.mgid; - - rb_replace_node(&priv->broadcast->rb_node, - &nmcast->rb_node, - &priv->multicast_tree); - - list_add_tail(&priv->broadcast->list, &remove_list); - } - - priv->broadcast = nmcast; + rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); + list_add_tail(&priv->broadcast->list, &remove_list); + priv->broadcast = NULL; } spin_unlock_irqrestore(&priv->lock, flags); + /* seperate between the wait to the leave*/ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(dev, mcast); ipoib_mcast_free(mcast); } } -void ipoib_mcast_dev_down(struct net_device *dev) +static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - unsigned long flags; - - /* Delete broadcast since it will be recreated */ - if (priv->broadcast) { - ipoib_dbg_mcast(priv, "deleting broadcast group\n"); - - spin_lock_irqsave(&priv->lock, flags); - rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); - spin_unlock_irqrestore(&priv->lock, flags); - ipoib_mcast_leave(dev, priv->broadcast); - ipoib_mcast_free(priv->broadcast); - priv->broadcast = NULL; - } + /* reserved QPN, prefix, scope */ + if (memcmp(addr, broadcast, 6)) + return 0; + /* signature lower, pkey */ + if (memcmp(addr + 7, broadcast + 7, 3)) + return 0; + return 1; } -void ipoib_mcast_restart_task(void *dev_ptr) +void ipoib_mcast_restart_task(struct work_struct *work) { - struct net_device *dev = dev_ptr; - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct dev_mc_list *mclist; + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, restart_task); + struct net_device *dev = priv->dev; + struct netdev_hw_addr *ha; struct ipoib_mcast *mcast, *tmcast; LIST_HEAD(remove_list); unsigned long flags; + struct ib_sa_mcmember_rec rec; ipoib_dbg_mcast(priv, "restarting multicast task\n"); ipoib_mcast_stop_thread(dev, 0); - spin_lock_irqsave(&priv->lock, flags); + local_irq_save(flags); + netif_addr_lock(dev); + spin_lock(&priv->lock); /* * Unfortunately, the networking core only gives us a list of all of @@ -846,22 +819,29 @@ void ipoib_mcast_restart_task(void *dev_ptr) clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); /* Mark all of the entries that are found or don't exist */ - for (mclist = dev->mc_list; mclist; mclist = mclist->next) { + netdev_for_each_mc_addr(ha, dev) { union ib_gid mgid; - memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid); + if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast)) + continue; - /* Add in the P_Key */ - mgid.raw[4] = (priv->pkey >> 8) & 0xff; - mgid.raw[5] = priv->pkey & 0xff; + memcpy(mgid.raw, ha->addr + 4, sizeof mgid); mcast = __ipoib_mcast_find(dev, &mgid); if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { struct ipoib_mcast *nmcast; + /* ignore group which is directly joined by userspace */ + if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && + !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) { + ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %pI6\n", + mgid.raw); + continue; + } + /* Not found or send-only group, let's add a new entry */ - ipoib_dbg_mcast(priv, "adding multicast entry for mgid " - IPOIB_GID_FMT "\n", IPOIB_GID_ARG(mgid)); + ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n", + mgid.raw); nmcast = ipoib_mcast_alloc(dev, 0); if (!nmcast) { @@ -875,8 +855,7 @@ void ipoib_mcast_restart_task(void *dev_ptr) if (mcast) { /* Destroy the send only entry */ - list_del(&mcast->list); - list_add_tail(&mcast->list, &remove_list); + list_move_tail(&mcast->list, &remove_list); rb_replace_node(&mcast->rb_node, &nmcast->rb_node, @@ -895,17 +874,19 @@ void ipoib_mcast_restart_task(void *dev_ptr) list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) && !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { - ipoib_dbg_mcast(priv, "deleting multicast group " IPOIB_GID_FMT "\n", - IPOIB_GID_ARG(mcast->mcmember.mgid)); + ipoib_dbg_mcast(priv, "deleting multicast group %pI6\n", + mcast->mcmember.mgid.raw); rb_erase(&mcast->rb_node, &priv->multicast_tree); /* Move to the remove list */ - list_del(&mcast->list); - list_add_tail(&mcast->list, &remove_list); + list_move_tail(&mcast->list, &remove_list); } } - spin_unlock_irqrestore(&priv->lock, flags); + + spin_unlock(&priv->lock); + netif_addr_unlock(dev); + local_irq_restore(flags); /* We have to cancel outside of the spinlock */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { @@ -928,21 +909,16 @@ struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) return NULL; iter->dev = dev; - memset(iter->mgid.raw, 0, sizeof iter->mgid); + memset(iter->mgid.raw, 0, 16); if (ipoib_mcast_iter_next(iter)) { - ipoib_mcast_iter_free(iter); + kfree(iter); return NULL; } return iter; } -void ipoib_mcast_iter_free(struct ipoib_mcast_iter *iter) -{ - kfree(iter); -} - int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter) { struct ipoib_dev_priv *priv = netdev_priv(iter->dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c new file mode 100644 index 00000000000..cdc7df4fdb8 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. - All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/netdevice.h> +#include <linux/if_arp.h> /* For ARPHRD_xxx */ +#include <linux/module.h> +#include <net/rtnetlink.h> +#include "ipoib.h" + +static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = { + [IFLA_IPOIB_PKEY] = { .type = NLA_U16 }, + [IFLA_IPOIB_MODE] = { .type = NLA_U16 }, + [IFLA_IPOIB_UMCAST] = { .type = NLA_U16 }, +}; + +static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + u16 val; + + if (nla_put_u16(skb, IFLA_IPOIB_PKEY, priv->pkey)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_MODE, val)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_UMCAST, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_UMCAST, val)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int ipoib_changelink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + u16 mode, umcast; + int ret = 0; + + if (data[IFLA_IPOIB_MODE]) { + mode = nla_get_u16(data[IFLA_IPOIB_MODE]); + if (mode == IPOIB_MODE_DATAGRAM) + ret = ipoib_set_mode(dev, "datagram\n"); + else if (mode == IPOIB_MODE_CONNECTED) + ret = ipoib_set_mode(dev, "connected\n"); + else + ret = -EINVAL; + + if (ret < 0) + goto out_err; + } + + if (data[IFLA_IPOIB_UMCAST]) { + umcast = nla_get_u16(data[IFLA_IPOIB_UMCAST]); + ipoib_set_umcast(dev, umcast); + } + +out_err: + return ret; +} + +static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net_device *pdev; + struct ipoib_dev_priv *ppriv; + u16 child_pkey; + int err; + + if (!tb[IFLA_LINK]) + return -EINVAL; + + pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + if (!pdev || pdev->type != ARPHRD_INFINIBAND) + return -ENODEV; + + ppriv = netdev_priv(pdev); + + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &ppriv->flags)) { + ipoib_warn(ppriv, "child creation disallowed for child devices\n"); + return -EINVAL; + } + + if (!data || !data[IFLA_IPOIB_PKEY]) { + ipoib_dbg(ppriv, "no pkey specified, using parent pkey\n"); + child_pkey = ppriv->pkey; + } else + child_pkey = nla_get_u16(data[IFLA_IPOIB_PKEY]); + + if (child_pkey == 0 || child_pkey == 0x8000) + return -EINVAL; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + child_pkey |= 0x8000; + + err = __ipoib_vlan_add(ppriv, netdev_priv(dev), child_pkey, IPOIB_RTNL_CHILD); + + if (!err && data) + err = ipoib_changelink(dev, tb, data); + return err; +} + +static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head) +{ + struct ipoib_dev_priv *priv, *ppriv; + + priv = netdev_priv(dev); + ppriv = netdev_priv(priv->parent); + + down_write(&ppriv->vlan_rwsem); + unregister_netdevice_queue(dev, head); + list_del(&priv->list); + up_write(&ppriv->vlan_rwsem); +} + +static size_t ipoib_get_size(const struct net_device *dev) +{ + return nla_total_size(2) + /* IFLA_IPOIB_PKEY */ + nla_total_size(2) + /* IFLA_IPOIB_MODE */ + nla_total_size(2); /* IFLA_IPOIB_UMCAST */ +} + +static struct rtnl_link_ops ipoib_link_ops __read_mostly = { + .kind = "ipoib", + .maxtype = IFLA_IPOIB_MAX, + .policy = ipoib_policy, + .priv_size = sizeof(struct ipoib_dev_priv), + .setup = ipoib_setup, + .newlink = ipoib_new_child_link, + .changelink = ipoib_changelink, + .dellink = ipoib_unregister_child_dev, + .get_size = ipoib_get_size, + .fill_info = ipoib_fill_info, +}; + +int __init ipoib_netlink_init(void) +{ + return rtnl_link_register(&ipoib_link_ops); +} + +void __exit ipoib_netlink_fini(void) +{ + rtnl_link_unregister(&ipoib_link_ops); +} + +MODULE_ALIAS_RTNL_LINK("ipoib"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index e829e10400e..c56d5d44c53 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -29,45 +29,43 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $ */ -#include <rdma/ib_cache.h> +#include <linux/slab.h> #include "ipoib.h" -int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid) +int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey) { struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ib_qp_attr *qp_attr; + struct ib_qp_attr *qp_attr = NULL; int ret; u16 pkey_index; - ret = -ENOMEM; - qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); - if (!qp_attr) - goto out; - - if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ret = -ENXIO; goto out; } set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - /* set correct QKey for QP */ - qp_attr->qkey = priv->qkey; - ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); - if (ret) { - ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); - goto out; + if (set_qkey) { + ret = -ENOMEM; + qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + if (!qp_attr) + goto out; + + /* set correct QKey for QP */ + qp_attr->qkey = priv->qkey; + ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); + if (ret) { + ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); + goto out; + } } /* attach QP to multicast group */ - down(&priv->mcast_mutex); ret = ib_attach_mcast(priv->qp, mgid, mlid); - up(&priv->mcast_mutex); if (ret) ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret); @@ -76,44 +74,20 @@ out: return ret; } -int ipoib_mcast_detach(struct net_device *dev, u16 mlid, union ib_gid *mgid) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - int ret; - - down(&priv->mcast_mutex); - ret = ib_detach_mcast(priv->qp, mgid, mlid); - up(&priv->mcast_mutex); - if (ret) - ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); - - return ret; -} - int ipoib_init_qp(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; - u16 pkey_index; struct ib_qp_attr qp_attr; int attr_mask; - /* - * Search through the port P_Key table for the requested pkey value. - * The port has to be assigned to the respective IB partition in - * advance. - */ - ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index); - if (ret) { - clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - return ret; - } - set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return -1; qp_attr.qp_state = IB_QPS_INIT; qp_attr.qkey = 0; qp_attr.port_num = priv->port; - qp_attr.pkey_index = pkey_index; + qp_attr.pkey_index = priv->pkey_index; attr_mask = IB_QP_QKEY | IB_QP_PORT | @@ -159,64 +133,112 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_init_attr init_attr = { .cap = { - .max_send_wr = IPOIB_TX_RING_SIZE, - .max_recv_wr = IPOIB_RX_RING_SIZE, + .max_send_wr = ipoib_sendq_size, + .max_recv_wr = ipoib_recvq_size, .max_send_sge = 1, - .max_recv_sge = 1 + .max_recv_sge = IPOIB_UD_RX_SG }, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_UD }; + int ret, size; + int i; + priv->pd = ib_alloc_pd(priv->ca); if (IS_ERR(priv->pd)) { printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); return -ENODEV; } - priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, - IPOIB_TX_RING_SIZE + IPOIB_RX_RING_SIZE + 1); - if (IS_ERR(priv->cq)) { - printk(KERN_WARNING "%s: failed to create CQ\n", ca->name); + priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(priv->mr)) { + printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); goto out_free_pd; } - if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP)) - goto out_free_cq; + size = ipoib_recvq_size + 1; + ret = ipoib_cm_dev_init(dev); + if (!ret) { + size += ipoib_sendq_size; + if (ipoib_cm_has_srq(dev)) + size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */ + else + size += ipoib_recvq_size * ipoib_max_conn_qp; + } - priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(priv->mr)) { - printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); - goto out_free_cq; + priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0); + if (IS_ERR(priv->recv_cq)) { + printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); + goto out_free_mr; } - init_attr.send_cq = priv->cq; - init_attr.recv_cq = priv->cq, + priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, + dev, ipoib_sendq_size, 0); + if (IS_ERR(priv->send_cq)) { + printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name); + goto out_free_recv_cq; + } + + if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP)) + goto out_free_send_cq; + + init_attr.send_cq = priv->send_cq; + init_attr.recv_cq = priv->recv_cq; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + + if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) + init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) + init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; + + if (dev->features & NETIF_F_SG) + init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { printk(KERN_WARNING "%s: failed to create QP\n", ca->name); - goto out_free_mr; + goto out_free_send_cq; } priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff; priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff; - priv->tx_sge.lkey = priv->mr->lkey; - - priv->tx_wr.opcode = IB_WR_SEND; - priv->tx_wr.sg_list = &priv->tx_sge; - priv->tx_wr.num_sge = 1; - priv->tx_wr.send_flags = IB_SEND_SIGNALED; + for (i = 0; i < MAX_SKB_FRAGS + 1; ++i) + priv->tx_sge[i].lkey = priv->mr->lkey; + + priv->tx_wr.opcode = IB_WR_SEND; + priv->tx_wr.sg_list = priv->tx_sge; + priv->tx_wr.send_flags = IB_SEND_SIGNALED; + + priv->rx_sge[0].lkey = priv->mr->lkey; + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE; + priv->rx_sge[1].length = PAGE_SIZE; + priv->rx_sge[1].lkey = priv->mr->lkey; + priv->rx_wr.num_sge = IPOIB_UD_RX_SG; + } else { + priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + priv->rx_wr.num_sge = 1; + } + priv->rx_wr.next = NULL; + priv->rx_wr.sg_list = priv->rx_sge; return 0; +out_free_send_cq: + ib_destroy_cq(priv->send_cq); + +out_free_recv_cq: + ib_destroy_cq(priv->recv_cq); + out_free_mr: ib_dereg_mr(priv->mr); - -out_free_cq: - ib_destroy_cq(priv->cq); + ipoib_cm_dev_cleanup(dev); out_free_pd: ib_dealloc_pd(priv->pd); @@ -235,12 +257,17 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } + if (ib_destroy_cq(priv->send_cq)) + ipoib_warn(priv, "ib_cq_destroy (send) failed\n"); + + if (ib_destroy_cq(priv->recv_cq)) + ipoib_warn(priv, "ib_cq_destroy (recv) failed\n"); + + ipoib_cm_dev_cleanup(dev); + if (ib_dereg_mr(priv->mr)) ipoib_warn(priv, "ib_dereg_mr failed\n"); - if (ib_destroy_cq(priv->cq)) - ipoib_warn(priv, "ib_cq_destroy failed\n"); - if (ib_dealloc_pd(priv->pd)) ipoib_warn(priv, "ib_dealloc_pd failed\n"); } @@ -251,10 +278,20 @@ void ipoib_event(struct ib_event_handler *handler, struct ipoib_dev_priv *priv = container_of(handler, struct ipoib_dev_priv, event_handler); - if (record->event == IB_EVENT_PORT_ACTIVE || - record->event == IB_EVENT_LID_CHANGE || - record->event == IB_EVENT_SM_CHANGE) { - ipoib_dbg(priv, "Port active event\n"); - schedule_work(&priv->flush_task); + if (record->element.port_num != priv->port) + return; + + ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, + record->device->name, record->element.port_num); + + if (record->event == IB_EVENT_SM_CHANGE || + record->event == IB_EVENT_CLIENT_REREGISTER) { + queue_work(ipoib_workqueue, &priv->flush_light); + } else if (record->event == IB_EVENT_PORT_ERR || + record->event == IB_EVENT_PORT_ACTIVE || + record->event == IB_EVENT_LID_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_normal); + } else if (record->event == IB_EVENT_PKEY_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_heavy); } } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 332d730e60c..9fad7b5ac8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -28,68 +28,41 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib_vlan.c 1349 2004-12-16 21:09:43Z roland $ */ #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/seq_file.h> #include <asm/uaccess.h> #include "ipoib.h" -static ssize_t show_parent(struct class_device *class_dev, char *buf) +static ssize_t show_parent(struct device *d, struct device_attribute *attr, + char *buf) { - struct net_device *dev = - container_of(class_dev, struct net_device, class_dev); + struct net_device *dev = to_net_dev(d); struct ipoib_dev_priv *priv = netdev_priv(dev); return sprintf(buf, "%s\n", priv->parent->name); } -static CLASS_DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); +static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); -int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int type) { - struct ipoib_dev_priv *ppriv, *priv; - char intf_name[IFNAMSIZ]; int result; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - ppriv = netdev_priv(pdev); - - down(&ppriv->vlan_mutex); - - /* - * First ensure this isn't a duplicate. We check the parent device and - * then all of the child interfaces to make sure the Pkey doesn't match. - */ - if (ppriv->pkey == pkey) { - result = -ENOTUNIQ; - goto err; - } - - list_for_each_entry(priv, &ppriv->child_intfs, list) { - if (priv->pkey == pkey) { - result = -ENOTUNIQ; - goto err; - } - } + priv->max_ib_mtu = ppriv->max_ib_mtu; + /* MTU will be reset when mcast join happens */ + priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); - snprintf(intf_name, sizeof intf_name, "%s.%04x", - ppriv->dev->name, pkey); - priv = ipoib_intf_alloc(intf_name); - if (!priv) { - result = -ENOMEM; + result = ipoib_set_dev_features(priv, ppriv->ca); + if (result) goto err; - } - - set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); priv->pkey = pkey; @@ -102,10 +75,10 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ipoib_warn(ppriv, "failed to initialize subinterface: " "device %s, port %d", ppriv->ca->name, ppriv->port); - goto device_init_failed; + goto err; } - result = register_netdev(priv->dev); + result = register_netdevice(priv->dev); if (result) { ipoib_warn(priv, "failed to initialize; error %i", result); goto register_failed; @@ -113,64 +86,124 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) priv->parent = ppriv->dev; - if (ipoib_create_debug_file(priv->dev)) - goto debug_failed; + ipoib_create_debug_files(priv->dev); - if (ipoib_add_pkey_attr(priv->dev)) - goto sysfs_failed; + /* RTNL childs don't need proprietary sysfs entries */ + if (type == IPOIB_LEGACY_CHILD) { + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; - if (class_device_create_file(&priv->dev->class_dev, - &class_device_attr_parent)) - goto sysfs_failed; + if (device_create_file(&priv->dev->dev, &dev_attr_parent)) + goto sysfs_failed; + } + priv->child_type = type; + priv->dev->iflink = ppriv->dev->ifindex; list_add_tail(&priv->list, &ppriv->child_intfs); - up(&ppriv->vlan_mutex); - return 0; sysfs_failed: - ipoib_delete_debug_file(priv->dev); - -debug_failed: - unregister_netdev(priv->dev); + result = -ENOMEM; + ipoib_delete_debug_files(priv->dev); + unregister_netdevice(priv->dev); register_failed: ipoib_dev_cleanup(priv->dev); -device_init_failed: - free_netdev(priv->dev); - err: - up(&ppriv->vlan_mutex); + return result; +} + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +{ + struct ipoib_dev_priv *ppriv, *priv; + char intf_name[IFNAMSIZ]; + struct ipoib_dev_priv *tpriv; + int result; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ppriv = netdev_priv(pdev); + + snprintf(intf_name, sizeof intf_name, "%s.%04x", + ppriv->dev->name, pkey); + priv = ipoib_intf_alloc(intf_name); + if (!priv) + return -ENOMEM; + + if (!rtnl_trylock()) + return restart_syscall(); + + down_write(&ppriv->vlan_rwsem); + + /* + * First ensure this isn't a duplicate. We check the parent device and + * then all of the legacy child interfaces to make sure the Pkey + * doesn't match. + */ + if (ppriv->pkey == pkey) { + result = -ENOTUNIQ; + goto out; + } + + list_for_each_entry(tpriv, &ppriv->child_intfs, list) { + if (tpriv->pkey == pkey && + tpriv->child_type == IPOIB_LEGACY_CHILD) { + result = -ENOTUNIQ; + goto out; + } + } + + result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); + +out: + up_write(&ppriv->vlan_rwsem); + + if (result) + free_netdev(priv->dev); + + rtnl_unlock(); + return result; } int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) { struct ipoib_dev_priv *ppriv, *priv, *tpriv; - int ret = -ENOENT; + struct net_device *dev = NULL; if (!capable(CAP_NET_ADMIN)) return -EPERM; ppriv = netdev_priv(pdev); - down(&ppriv->vlan_mutex); - list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { - if (priv->pkey == pkey) { - unregister_netdev(priv->dev); - ipoib_dev_cleanup(priv->dev); + if (!rtnl_trylock()) + return restart_syscall(); + down_write(&ppriv->vlan_rwsem); + list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { + if (priv->pkey == pkey && + priv->child_type == IPOIB_LEGACY_CHILD) { + unregister_netdevice(priv->dev); list_del(&priv->list); - - kfree(priv); - - ret = 0; + dev = priv->dev; break; } } - up(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); + + rtnl_unlock(); + + if (dev) { + free_netdev(dev); + return 0; + } - return ret; + return -ENODEV; } diff --git a/drivers/infiniband/ulp/iser/Kconfig b/drivers/infiniband/ulp/iser/Kconfig new file mode 100644 index 00000000000..d00af71a2cf --- /dev/null +++ b/drivers/infiniband/ulp/iser/Kconfig @@ -0,0 +1,12 @@ +config INFINIBAND_ISER + tristate "iSCSI Extensions for RDMA (iSER)" + depends on SCSI && INET && INFINIBAND_ADDR_TRANS + select SCSI_ISCSI_ATTRS + ---help--- + Support for the iSCSI Extensions for RDMA (iSER) Protocol + over InfiniBand. This allows you to access storage devices + that speak iSCSI over iSER over InfiniBand. + + The iSER protocol is defined by IETF. + See <http://www.ietf.org/rfc/rfc5046.txt> + and <http://members.infinibandta.org/kwspub/spec/Annex_iSER.PDF> diff --git a/drivers/infiniband/ulp/iser/Makefile b/drivers/infiniband/ulp/iser/Makefile new file mode 100644 index 00000000000..fe6cd15f231 --- /dev/null +++ b/drivers/infiniband/ulp/iser/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_INFINIBAND_ISER) += ib_iser.o + +ib_iser-y := iser_verbs.o iser_initiator.o iser_memory.o \ + iscsi_iser.o diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c new file mode 100644 index 00000000000..eb7973957a6 --- /dev/null +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -0,0 +1,845 @@ +/* + * iSCSI Initiator over iSER Data-Path + * + * Copyright (C) 2004 Dmitry Yusupov + * Copyright (C) 2004 Alex Aizman + * Copyright (C) 2005 Mike Christie + * Copyright (c) 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * maintained by openib-general@openib.org + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Credits: + * Christoph Hellwig + * FUJITA Tomonori + * Arne Redlich + * Zhenyu Wang + * Modified by: + * Erez Zilber + */ + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/hardirq.h> +#include <linux/kfifo.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/ioctl.h> +#include <linux/cdev.h> +#include <linux/in.h> +#include <linux/net.h> +#include <linux/scatterlist.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/module.h> + +#include <net/sock.h> + +#include <asm/uaccess.h> + +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_device.h> +#include <scsi/scsi_eh.h> +#include <scsi/scsi_tcq.h> +#include <scsi/scsi_host.h> +#include <scsi/scsi.h> +#include <scsi/scsi_transport_iscsi.h> + +#include "iscsi_iser.h" + +static struct scsi_host_template iscsi_iser_sht; +static struct iscsi_transport iscsi_iser_transport; +static struct scsi_transport_template *iscsi_iser_scsi_transport; + +static unsigned int iscsi_max_lun = 512; +module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO); + +int iser_debug_level = 0; +bool iser_pi_enable = false; +int iser_pi_guard = 0; + +MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); +MODULE_VERSION(DRV_VER); + +module_param_named(debug_level, iser_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)"); + +module_param_named(pi_enable, iser_pi_enable, bool, 0644); +MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); + +module_param_named(pi_guard, iser_pi_guard, int, 0644); +MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:CRC)"); + +static struct workqueue_struct *release_wq; +struct iser_global ig; + +void +iscsi_iser_recv(struct iscsi_conn *conn, + struct iscsi_hdr *hdr, char *rx_data, int rx_data_len) +{ + int rc = 0; + int datalen; + int ahslen; + + /* verify PDU length */ + datalen = ntoh24(hdr->dlength); + if (datalen > rx_data_len || (datalen + 4) < rx_data_len) { + iser_err("wrong datalen %d (hdr), %d (IB)\n", + datalen, rx_data_len); + rc = ISCSI_ERR_DATALEN; + goto error; + } + + if (datalen != rx_data_len) + iser_dbg("aligned datalen (%d) hdr, %d (IB)\n", + datalen, rx_data_len); + + /* read AHS */ + ahslen = hdr->hlength * 4; + + rc = iscsi_complete_pdu(conn, hdr, rx_data, rx_data_len); + if (rc && rc != ISCSI_ERR_NO_SCSI_CMD) + goto error; + + return; +error: + iscsi_conn_failure(conn, rc); +} + +static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + + task->hdr = (struct iscsi_hdr *)&iser_task->desc.iscsi_header; + task->hdr_max = sizeof(iser_task->desc.iscsi_header); + return 0; +} + +int iser_initialize_task_headers(struct iscsi_task *task, + struct iser_tx_desc *tx_desc) +{ + struct iser_conn *ib_conn = task->conn->dd_data; + struct iser_device *device = ib_conn->device; + struct iscsi_iser_task *iser_task = task->dd_data; + u64 dma_addr; + + dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + if (ib_dma_mapping_error(device->ib_device, dma_addr)) + return -ENOMEM; + + tx_desc->dma_addr = dma_addr; + tx_desc->tx_sg[0].addr = tx_desc->dma_addr; + tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; + tx_desc->tx_sg[0].lkey = device->mr->lkey; + + iser_task->ib_conn = ib_conn; + return 0; +} +/** + * iscsi_iser_task_init - Initialize task + * @task: iscsi task + * + * Initialize the task for the scsi command or mgmt command. + */ +static int +iscsi_iser_task_init(struct iscsi_task *task) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + + if (iser_initialize_task_headers(task, &iser_task->desc)) + return -ENOMEM; + + /* mgmt task */ + if (!task->sc) + return 0; + + iser_task->command_sent = 0; + iser_task_rdma_init(iser_task); + iser_task->sc = task->sc; + + return 0; +} + +/** + * iscsi_iser_mtask_xmit - xmit management(immediate) task + * @conn: iscsi connection + * @task: task management task + * + * Notes: + * The function can return -EAGAIN in which case caller must + * call it again later, or recover. '0' return code means successful + * xmit. + * + **/ +static int +iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task) +{ + int error = 0; + + iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt); + + error = iser_send_control(conn, task); + + /* since iser xmits control with zero copy, tasks can not be recycled + * right after sending them. + * The recycling scheme is based on whether a response is expected + * - if yes, the task is recycled at iscsi_complete_pdu + * - if no, the task is recycled at iser_snd_completion + */ + return error; +} + +static int +iscsi_iser_task_xmit_unsol_data(struct iscsi_conn *conn, + struct iscsi_task *task) +{ + struct iscsi_r2t_info *r2t = &task->unsol_r2t; + struct iscsi_data hdr; + int error = 0; + + /* Send data-out PDUs while there's still unsolicited data to send */ + while (iscsi_task_has_unsol_data(task)) { + iscsi_prep_data_out_pdu(task, r2t, &hdr); + iser_dbg("Sending data-out: itt 0x%x, data count %d\n", + hdr.itt, r2t->data_count); + + /* the buffer description has been passed with the command */ + /* Send the command */ + error = iser_send_data_out(conn, task, &hdr); + if (error) { + r2t->datasn--; + goto iscsi_iser_task_xmit_unsol_data_exit; + } + r2t->sent += r2t->data_count; + iser_dbg("Need to send %d more as data-out PDUs\n", + r2t->data_length - r2t->sent); + } + +iscsi_iser_task_xmit_unsol_data_exit: + return error; +} + +static int +iscsi_iser_task_xmit(struct iscsi_task *task) +{ + struct iscsi_conn *conn = task->conn; + struct iscsi_iser_task *iser_task = task->dd_data; + int error = 0; + + if (!task->sc) + return iscsi_iser_mtask_xmit(conn, task); + + if (task->sc->sc_data_direction == DMA_TO_DEVICE) { + BUG_ON(scsi_bufflen(task->sc) == 0); + + iser_dbg("cmd [itt %x total %d imm %d unsol_data %d\n", + task->itt, scsi_bufflen(task->sc), + task->imm_count, task->unsol_r2t.data_length); + } + + iser_dbg("ctask xmit [cid %d itt 0x%x]\n", + conn->id, task->itt); + + /* Send the cmd PDU */ + if (!iser_task->command_sent) { + error = iser_send_command(conn, task); + if (error) + goto iscsi_iser_task_xmit_exit; + iser_task->command_sent = 1; + } + + /* Send unsolicited data-out PDU(s) if necessary */ + if (iscsi_task_has_unsol_data(task)) + error = iscsi_iser_task_xmit_unsol_data(conn, task); + + iscsi_iser_task_xmit_exit: + return error; +} + +static void iscsi_iser_cleanup_task(struct iscsi_task *task) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct iser_conn *ib_conn = task->conn->dd_data; + struct iser_device *device = ib_conn->device; + + ib_dma_unmap_single(device->ib_device, + tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + + /* mgmt tasks do not need special cleanup */ + if (!task->sc) + return; + + if (iser_task->status == ISER_TASK_STATUS_STARTED) { + iser_task->status = ISER_TASK_STATUS_COMPLETED; + iser_task_rdma_finalize(iser_task); + } +} + +static u8 iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + + if (iser_task->dir[ISER_DIR_IN]) + return iser_check_task_pi_status(iser_task, ISER_DIR_IN, + sector); + else + return iser_check_task_pi_status(iser_task, ISER_DIR_OUT, + sector); +} + +static struct iscsi_cls_conn * +iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx) +{ + struct iscsi_conn *conn; + struct iscsi_cls_conn *cls_conn; + + cls_conn = iscsi_conn_setup(cls_session, 0, conn_idx); + if (!cls_conn) + return NULL; + conn = cls_conn->dd_data; + + /* + * due to issues with the login code re iser sematics + * this not set in iscsi_conn_setup - FIXME + */ + conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN; + + return cls_conn; +} + +static int +iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, + struct iscsi_cls_conn *cls_conn, uint64_t transport_eph, + int is_leading) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iscsi_session *session; + struct iser_conn *ib_conn; + struct iscsi_endpoint *ep; + int error; + + error = iscsi_conn_bind(cls_session, cls_conn, is_leading); + if (error) + return error; + + /* the transport ep handle comes from user space so it must be + * verified against the global ib connections list */ + ep = iscsi_lookup_endpoint(transport_eph); + if (!ep) { + iser_err("can't bind eph %llx\n", + (unsigned long long)transport_eph); + return -EINVAL; + } + ib_conn = ep->dd_data; + + session = conn->session; + if (iser_alloc_rx_descriptors(ib_conn, session)) + return -ENOMEM; + + /* binds the iSER connection retrieved from the previously + * connected ep_handle to the iSCSI layer connection. exchanges + * connection pointers */ + iser_info("binding iscsi conn %p to ib_conn %p\n", conn, ib_conn); + + conn->dd_data = ib_conn; + ib_conn->iscsi_conn = conn; + + return 0; +} + +static int +iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *iscsi_conn; + struct iser_conn *ib_conn; + + iscsi_conn = cls_conn->dd_data; + ib_conn = iscsi_conn->dd_data; + reinit_completion(&ib_conn->stop_completion); + + return iscsi_conn_start(cls_conn); +} + +static void +iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iser_conn *ib_conn = conn->dd_data; + + iser_dbg("stopping iscsi_conn: %p, ib_conn: %p\n", conn, ib_conn); + iscsi_conn_stop(cls_conn, flag); + + /* + * Userspace may have goofed up and not bound the connection or + * might have only partially setup the connection. + */ + if (ib_conn) { + conn->dd_data = NULL; + complete(&ib_conn->stop_completion); + } +} + +static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) +{ + struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); + + iscsi_session_teardown(cls_session); + iscsi_host_remove(shost); + iscsi_host_free(shost); +} + +static inline unsigned int +iser_dif_prot_caps(int prot_caps) +{ + return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? SHOST_DIF_TYPE1_PROTECTION | + SHOST_DIX_TYPE1_PROTECTION : 0) | + ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? SHOST_DIF_TYPE2_PROTECTION | + SHOST_DIX_TYPE2_PROTECTION : 0) | + ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? SHOST_DIF_TYPE3_PROTECTION | + SHOST_DIX_TYPE3_PROTECTION : 0); +} + +static struct iscsi_cls_session * +iscsi_iser_session_create(struct iscsi_endpoint *ep, + uint16_t cmds_max, uint16_t qdepth, + uint32_t initial_cmdsn) +{ + struct iscsi_cls_session *cls_session; + struct iscsi_session *session; + struct Scsi_Host *shost; + struct iser_conn *ib_conn = NULL; + + shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); + if (!shost) + return NULL; + shost->transportt = iscsi_iser_scsi_transport; + shost->cmd_per_lun = qdepth; + shost->max_lun = iscsi_max_lun; + shost->max_id = 0; + shost->max_channel = 0; + shost->max_cmd_len = 16; + + /* + * older userspace tools (before 2.0-870) did not pass us + * the leading conn's ep so this will be NULL; + */ + if (ep) { + ib_conn = ep->dd_data; + if (ib_conn->pi_support) { + u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; + + scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); + if (iser_pi_guard) + scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); + else + scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); + } + } + + if (iscsi_host_add(shost, + ep ? ib_conn->device->ib_device->dma_device : NULL)) + goto free_host; + + if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) { + iser_info("cmds_max changed from %u to %u\n", + cmds_max, ISER_DEF_XMIT_CMDS_MAX); + cmds_max = ISER_DEF_XMIT_CMDS_MAX; + } + + cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, + cmds_max, 0, + sizeof(struct iscsi_iser_task), + initial_cmdsn, 0); + if (!cls_session) + goto remove_host; + session = cls_session->dd_data; + + shost->can_queue = session->scsi_cmds_max; + return cls_session; + +remove_host: + iscsi_host_remove(shost); +free_host: + iscsi_host_free(shost); + return NULL; +} + +static int +iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, char *buf, int buflen) +{ + int value; + + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + /* TBD */ + break; + case ISCSI_PARAM_HDRDGST_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("DataDigest wasn't negotiated to None\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_DATADGST_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("DataDigest wasn't negotiated to None\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_IFMARKER_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("IFMarker wasn't negotiated to No\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_OFMARKER_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("OFMarker wasn't negotiated to No\n"); + return -EPROTO; + } + break; + default: + return iscsi_set_param(cls_conn, param, buf, buflen); + } + + return 0; +} + +static void +iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *stats) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + + stats->txdata_octets = conn->txdata_octets; + stats->rxdata_octets = conn->rxdata_octets; + stats->scsicmd_pdus = conn->scsicmd_pdus_cnt; + stats->dataout_pdus = conn->dataout_pdus_cnt; + stats->scsirsp_pdus = conn->scsirsp_pdus_cnt; + stats->datain_pdus = conn->datain_pdus_cnt; /* always 0 */ + stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */ + stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; + stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; + stats->custom_length = 4; + strcpy(stats->custom[0].desc, "qp_tx_queue_full"); + stats->custom[0].value = 0; /* TB iser_conn->qp_tx_queue_full; */ + strcpy(stats->custom[1].desc, "fmr_map_not_avail"); + stats->custom[1].value = 0; /* TB iser_conn->fmr_map_not_avail */; + strcpy(stats->custom[2].desc, "eh_abort_cnt"); + stats->custom[2].value = conn->eh_abort_cnt; + strcpy(stats->custom[3].desc, "fmr_unalign_cnt"); + stats->custom[3].value = conn->fmr_unalign_cnt; +} + +static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep, + enum iscsi_param param, char *buf) +{ + struct iser_conn *ib_conn = ep->dd_data; + int len; + + switch (param) { + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_CONN_ADDRESS: + if (!ib_conn || !ib_conn->cma_id) + return -ENOTCONN; + + return iscsi_conn_get_addr_param((struct sockaddr_storage *) + &ib_conn->cma_id->route.addr.dst_addr, + param, buf); + break; + default: + return -ENOSYS; + } + + return len; +} + +static struct iscsi_endpoint * +iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, + int non_blocking) +{ + int err; + struct iser_conn *ib_conn; + struct iscsi_endpoint *ep; + + ep = iscsi_create_endpoint(sizeof(*ib_conn)); + if (!ep) + return ERR_PTR(-ENOMEM); + + ib_conn = ep->dd_data; + ib_conn->ep = ep; + iser_conn_init(ib_conn); + + err = iser_connect(ib_conn, NULL, (struct sockaddr_in *)dst_addr, + non_blocking); + if (err) + return ERR_PTR(err); + + return ep; +} + +static int +iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) +{ + struct iser_conn *ib_conn; + int rc; + + ib_conn = ep->dd_data; + rc = wait_event_interruptible_timeout(ib_conn->wait, + ib_conn->state == ISER_CONN_UP, + msecs_to_jiffies(timeout_ms)); + + /* if conn establishment failed, return error code to iscsi */ + if (!rc && + (ib_conn->state == ISER_CONN_TERMINATING || + ib_conn->state == ISER_CONN_DOWN)) + rc = -1; + + iser_info("ib conn %p rc = %d\n", ib_conn, rc); + + if (rc > 0) + return 1; /* success, this is the equivalent of POLLOUT */ + else if (!rc) + return 0; /* timeout */ + else + return rc; /* signal */ +} + +static void +iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) +{ + struct iser_conn *ib_conn; + + ib_conn = ep->dd_data; + iser_info("ep %p ib conn %p state %d\n", ep, ib_conn, ib_conn->state); + iser_conn_terminate(ib_conn); + + /* + * if iser_conn and iscsi_conn are bound, we must wait iscsi_conn_stop + * call and ISER_CONN_DOWN state before freeing the iser resources. + * otherwise we are safe to free resources immediately. + */ + if (ib_conn->iscsi_conn) { + INIT_WORK(&ib_conn->release_work, iser_release_work); + queue_work(release_wq, &ib_conn->release_work); + } else { + iser_conn_release(ib_conn); + } +} + +static umode_t iser_attr_is_visible(int param_type, int param) +{ + switch (param_type) { + case ISCSI_HOST_PARAM: + switch (param) { + case ISCSI_HOST_PARAM_NETDEV_NAME: + case ISCSI_HOST_PARAM_HWADDRESS: + case ISCSI_HOST_PARAM_INITIATOR_NAME: + return S_IRUGO; + default: + return 0; + } + case ISCSI_PARAM: + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + case ISCSI_PARAM_HDRDGST_EN: + case ISCSI_PARAM_DATADGST_EN: + case ISCSI_PARAM_CONN_ADDRESS: + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_EXP_STATSN: + case ISCSI_PARAM_PERSISTENT_ADDRESS: + case ISCSI_PARAM_PERSISTENT_PORT: + case ISCSI_PARAM_PING_TMO: + case ISCSI_PARAM_RECV_TMO: + case ISCSI_PARAM_INITIAL_R2T_EN: + case ISCSI_PARAM_MAX_R2T: + case ISCSI_PARAM_IMM_DATA_EN: + case ISCSI_PARAM_FIRST_BURST: + case ISCSI_PARAM_MAX_BURST: + case ISCSI_PARAM_PDU_INORDER_EN: + case ISCSI_PARAM_DATASEQ_INORDER_EN: + case ISCSI_PARAM_TARGET_NAME: + case ISCSI_PARAM_TPGT: + case ISCSI_PARAM_USERNAME: + case ISCSI_PARAM_PASSWORD: + case ISCSI_PARAM_USERNAME_IN: + case ISCSI_PARAM_PASSWORD_IN: + case ISCSI_PARAM_FAST_ABORT: + case ISCSI_PARAM_ABORT_TMO: + case ISCSI_PARAM_LU_RESET_TMO: + case ISCSI_PARAM_TGT_RESET_TMO: + case ISCSI_PARAM_IFACE_NAME: + case ISCSI_PARAM_INITIATOR_NAME: + case ISCSI_PARAM_DISCOVERY_SESS: + return S_IRUGO; + default: + return 0; + } + } + + return 0; +} + +static struct scsi_host_template iscsi_iser_sht = { + .module = THIS_MODULE, + .name = "iSCSI Initiator over iSER", + .queuecommand = iscsi_queuecommand, + .change_queue_depth = iscsi_change_queue_depth, + .sg_tablesize = ISCSI_ISER_SG_TABLESIZE, + .max_sectors = 1024, + .cmd_per_lun = ISER_DEF_CMD_PER_LUN, + .eh_abort_handler = iscsi_eh_abort, + .eh_device_reset_handler= iscsi_eh_device_reset, + .eh_target_reset_handler = iscsi_eh_recover_target, + .target_alloc = iscsi_target_alloc, + .use_clustering = DISABLE_CLUSTERING, + .proc_name = "iscsi_iser", + .this_id = -1, +}; + +static struct iscsi_transport iscsi_iser_transport = { + .owner = THIS_MODULE, + .name = "iser", + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO, + /* session management */ + .create_session = iscsi_iser_session_create, + .destroy_session = iscsi_iser_session_destroy, + /* connection management */ + .create_conn = iscsi_iser_conn_create, + .bind_conn = iscsi_iser_conn_bind, + .destroy_conn = iscsi_conn_teardown, + .attr_is_visible = iser_attr_is_visible, + .set_param = iscsi_iser_set_param, + .get_conn_param = iscsi_conn_get_param, + .get_ep_param = iscsi_iser_get_ep_param, + .get_session_param = iscsi_session_get_param, + .start_conn = iscsi_iser_conn_start, + .stop_conn = iscsi_iser_conn_stop, + /* iscsi host params */ + .get_host_param = iscsi_host_get_param, + .set_host_param = iscsi_host_set_param, + /* IO */ + .send_pdu = iscsi_conn_send_pdu, + .get_stats = iscsi_iser_conn_get_stats, + .init_task = iscsi_iser_task_init, + .xmit_task = iscsi_iser_task_xmit, + .cleanup_task = iscsi_iser_cleanup_task, + .alloc_pdu = iscsi_iser_pdu_alloc, + .check_protection = iscsi_iser_check_protection, + /* recovery */ + .session_recovery_timedout = iscsi_session_recovery_timedout, + + .ep_connect = iscsi_iser_ep_connect, + .ep_poll = iscsi_iser_ep_poll, + .ep_disconnect = iscsi_iser_ep_disconnect +}; + +static int __init iser_init(void) +{ + int err; + + iser_dbg("Starting iSER datamover...\n"); + + if (iscsi_max_lun < 1) { + iser_err("Invalid max_lun value of %u\n", iscsi_max_lun); + return -EINVAL; + } + + memset(&ig, 0, sizeof(struct iser_global)); + + ig.desc_cache = kmem_cache_create("iser_descriptors", + sizeof(struct iser_tx_desc), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (ig.desc_cache == NULL) + return -ENOMEM; + + /* device init is called only after the first addr resolution */ + mutex_init(&ig.device_list_mutex); + INIT_LIST_HEAD(&ig.device_list); + mutex_init(&ig.connlist_mutex); + INIT_LIST_HEAD(&ig.connlist); + + release_wq = alloc_workqueue("release workqueue", 0, 0); + if (!release_wq) { + iser_err("failed to allocate release workqueue\n"); + return -ENOMEM; + } + + iscsi_iser_scsi_transport = iscsi_register_transport( + &iscsi_iser_transport); + if (!iscsi_iser_scsi_transport) { + iser_err("iscsi_register_transport failed\n"); + err = -EINVAL; + goto register_transport_failure; + } + + return 0; + +register_transport_failure: + kmem_cache_destroy(ig.desc_cache); + + return err; +} + +static void __exit iser_exit(void) +{ + struct iser_conn *ib_conn, *n; + int connlist_empty; + + iser_dbg("Removing iSER datamover...\n"); + destroy_workqueue(release_wq); + + mutex_lock(&ig.connlist_mutex); + connlist_empty = list_empty(&ig.connlist); + mutex_unlock(&ig.connlist_mutex); + + if (!connlist_empty) { + iser_err("Error cleanup stage completed but we still have iser " + "connections, destroying them anyway.\n"); + list_for_each_entry_safe(ib_conn, n, &ig.connlist, conn_list) { + iser_conn_release(ib_conn); + } + } + + iscsi_unregister_transport(&iscsi_iser_transport); + kmem_cache_destroy(ig.desc_cache); +} + +module_init(iser_init); +module_exit(iser_exit); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h new file mode 100644 index 00000000000..97cd385bf7f --- /dev/null +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -0,0 +1,484 @@ +/* + * iSER transport for the Open iSCSI Initiator & iSER transport internals + * + * Copyright (C) 2004 Dmitry Yusupov + * Copyright (C) 2004 Alex Aizman + * Copyright (C) 2005 Mike Christie + * based on code maintained by open-iscsi@googlegroups.com + * + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ISCSI_ISER_H__ +#define __ISCSI_ISER_H__ + +#include <linux/types.h> +#include <linux/net.h> +#include <linux/printk.h> +#include <scsi/libiscsi.h> +#include <scsi/scsi_transport_iscsi.h> +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_device.h> + +#include <linux/interrupt.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/dma-mapping.h> +#include <linux/mutex.h> +#include <linux/mempool.h> +#include <linux/uio.h> + +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/in6.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_fmr_pool.h> +#include <rdma/rdma_cm.h> + +#define DRV_NAME "iser" +#define PFX DRV_NAME ": " +#define DRV_VER "1.4" + +#define iser_dbg(fmt, arg...) \ + do { \ + if (iser_debug_level > 2) \ + printk(KERN_DEBUG PFX "%s:" fmt,\ + __func__ , ## arg); \ + } while (0) + +#define iser_warn(fmt, arg...) \ + do { \ + if (iser_debug_level > 0) \ + pr_warn(PFX "%s:" fmt, \ + __func__ , ## arg); \ + } while (0) + +#define iser_info(fmt, arg...) \ + do { \ + if (iser_debug_level > 1) \ + pr_info(PFX "%s:" fmt, \ + __func__ , ## arg); \ + } while (0) + +#define iser_err(fmt, arg...) \ + do { \ + printk(KERN_ERR PFX "%s:" fmt, \ + __func__ , ## arg); \ + } while (0) + +#define SHIFT_4K 12 +#define SIZE_4K (1ULL << SHIFT_4K) +#define MASK_4K (~(SIZE_4K-1)) + + /* support up to 512KB in one RDMA */ +#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K) +#define ISER_DEF_XMIT_CMDS_DEFAULT 512 +#if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT + #define ISER_DEF_XMIT_CMDS_MAX ISCSI_DEF_XMIT_CMDS_MAX +#else + #define ISER_DEF_XMIT_CMDS_MAX ISER_DEF_XMIT_CMDS_DEFAULT +#endif +#define ISER_DEF_CMD_PER_LUN ISER_DEF_XMIT_CMDS_MAX + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * + * SCSI_TMFUNC(2), LOGOUT(1) */ + +#define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX) + +#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2) + +/* the max TX (send) WR supported by the iSER QP is defined by * + * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * + * to have at max for SCSI command. The tx posting & completion handling code * + * supports -EAGAIN scheme where tx is suspended till the QP has room for more * + * send WR. D=8 comes from 64K/8K */ + +#define ISER_INFLIGHT_DATAOUTS 8 + +#define ISER_QP_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ + (1 + ISER_INFLIGHT_DATAOUTS) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS) + +/* Max registration work requests per command */ +#define ISER_MAX_REG_WR_PER_CMD 5 + +/* For Signature we don't support DATAOUTs so no need to make room for them */ +#define ISER_QP_SIG_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ + (1 + ISER_MAX_REG_WR_PER_CMD) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS) + +#define ISER_VER 0x10 +#define ISER_WSV 0x08 +#define ISER_RSV 0x04 + +#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL + +struct iser_hdr { + u8 flags; + u8 rsvd[3]; + __be32 write_stag; /* write rkey */ + __be64 write_va; + __be32 read_stag; /* read rkey */ + __be64 read_va; +} __attribute__((packed)); + + +#define ISER_ZBVA_NOT_SUPPORTED 0x80 +#define ISER_SEND_W_INV_NOT_SUPPORTED 0x40 + +struct iser_cm_hdr { + u8 flags; + u8 rsvd[3]; +} __packed; + +/* Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) + +#define ISER_RECV_DATA_SEG_LEN 128 +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) +#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* Length of an object name string */ +#define ISER_OBJECT_NAME_SIZE 64 + +enum iser_ib_conn_state { + ISER_CONN_INIT, /* descriptor allocd, no conn */ + ISER_CONN_PENDING, /* in the process of being established */ + ISER_CONN_UP, /* up and running */ + ISER_CONN_TERMINATING, /* in the process of being terminated */ + ISER_CONN_DOWN, /* shut down */ + ISER_CONN_STATES_NUM +}; + +enum iser_task_status { + ISER_TASK_STATUS_INIT = 0, + ISER_TASK_STATUS_STARTED, + ISER_TASK_STATUS_COMPLETED +}; + +enum iser_data_dir { + ISER_DIR_IN = 0, /* to initiator */ + ISER_DIR_OUT, /* from initiator */ + ISER_DIRS_NUM +}; + +struct iser_data_buf { + void *buf; /* pointer to the sg list */ + unsigned int size; /* num entries of this sg */ + unsigned long data_len; /* total data len */ + unsigned int dma_nents; /* returned by dma_map_sg */ + char *copy_buf; /* allocated copy buf for SGs unaligned * + * for rdma which are copied */ + struct scatterlist sg_single; /* SG-ified clone of a non SG SC or * + * unaligned SG */ + }; + +/* fwd declarations */ +struct iser_device; +struct iser_cq_desc; +struct iscsi_iser_task; +struct iscsi_endpoint; + +struct iser_mem_reg { + u32 lkey; + u32 rkey; + u64 va; + u64 len; + void *mem_h; + int is_mr; +}; + +struct iser_regd_buf { + struct iser_mem_reg reg; /* memory registration info */ + void *virt_addr; + struct iser_device *device; /* device->device for dma_unmap */ + enum dma_data_direction direction; /* direction for dma_unmap */ + unsigned int data_size; +}; + +enum iser_desc_type { + ISCSI_TX_CONTROL , + ISCSI_TX_SCSI_COMMAND, + ISCSI_TX_DATAOUT +}; + +struct iser_tx_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + enum iser_desc_type type; + u64 dma_addr; + /* sg[0] points to iser/iscsi headers, sg[1] optionally points to either + of immediate data, unsolicited data-out or control (login,text) */ + struct ib_sge tx_sg[2]; + int num_sge; +}; + +#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ + sizeof(u64) + sizeof(struct ib_sge))) +struct iser_rx_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + char data[ISER_RECV_DATA_SEG_LEN]; + u64 dma_addr; + struct ib_sge rx_sg; + char pad[ISER_RX_PAD_SIZE]; +} __attribute__((packed)); + +#define ISER_MAX_CQ 4 + +struct iser_conn; +struct iscsi_iser_task; + +struct iser_device { + struct ib_device *ib_device; + struct ib_pd *pd; + struct ib_device_attr dev_attr; + struct ib_cq *rx_cq[ISER_MAX_CQ]; + struct ib_cq *tx_cq[ISER_MAX_CQ]; + struct ib_mr *mr; + struct tasklet_struct cq_tasklet[ISER_MAX_CQ]; + struct ib_event_handler event_handler; + struct list_head ig_list; /* entry in ig devices list */ + int refcount; + int cq_active_qps[ISER_MAX_CQ]; + int cqs_used; + struct iser_cq_desc *cq_desc; + int (*iser_alloc_rdma_reg_res)(struct iser_conn *ib_conn, + unsigned cmds_max); + void (*iser_free_rdma_reg_res)(struct iser_conn *ib_conn); + int (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); + void (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); +}; + +#define ISER_CHECK_GUARD 0xc0 +#define ISER_CHECK_REFTAG 0x0f +#define ISER_CHECK_APPTAG 0x30 + +enum iser_reg_indicator { + ISER_DATA_KEY_VALID = 1 << 0, + ISER_PROT_KEY_VALID = 1 << 1, + ISER_SIG_KEY_VALID = 1 << 2, + ISER_FASTREG_PROTECTED = 1 << 3, +}; + +struct iser_pi_context { + struct ib_mr *prot_mr; + struct ib_fast_reg_page_list *prot_frpl; + struct ib_mr *sig_mr; +}; + +struct fast_reg_descriptor { + struct list_head list; + /* For fast registration - FRWR */ + struct ib_mr *data_mr; + struct ib_fast_reg_page_list *data_frpl; + struct iser_pi_context *pi_ctx; + /* registration indicators container */ + u8 reg_indicators; +}; + +struct iser_conn { + struct iscsi_conn *iscsi_conn; + struct iscsi_endpoint *ep; + enum iser_ib_conn_state state; /* rdma connection state */ + atomic_t refcount; + spinlock_t lock; /* used for state changes */ + struct iser_device *device; /* device context */ + struct rdma_cm_id *cma_id; /* CMA ID */ + struct ib_qp *qp; /* QP */ + wait_queue_head_t wait; /* waitq for conn/disconn */ + unsigned qp_max_recv_dtos; /* num of rx buffers */ + unsigned qp_max_recv_dtos_mask; /* above minus 1 */ + unsigned min_posted_rx; /* qp_max_recv_dtos >> 2 */ + int post_recv_buf_count; /* posted rx count */ + atomic_t post_send_buf_count; /* posted tx count */ + char name[ISER_OBJECT_NAME_SIZE]; + struct work_struct release_work; + struct completion stop_completion; + struct list_head conn_list; /* entry in ig conn list */ + + char *login_buf; + char *login_req_buf, *login_resp_buf; + u64 login_req_dma, login_resp_dma; + unsigned int rx_desc_head; + struct iser_rx_desc *rx_descs; + struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; + bool pi_support; + + /* Connection memory registration pool */ + union { + struct { + struct ib_fmr_pool *pool; /* pool of IB FMRs */ + struct iser_page_vec *page_vec; /* represents SG to fmr maps* + * maps serialized as tx is*/ + } fmr; + struct { + struct list_head pool; + int pool_size; + } fastreg; + }; +}; + +struct iscsi_iser_task { + struct iser_tx_desc desc; + struct iser_conn *ib_conn; + enum iser_task_status status; + struct scsi_cmnd *sc; + int command_sent; /* set if command sent */ + int dir[ISER_DIRS_NUM]; /* set if dir use*/ + struct iser_regd_buf rdma_regd[ISER_DIRS_NUM];/* regd rdma buf */ + struct iser_data_buf data[ISER_DIRS_NUM]; /* orig. data des*/ + struct iser_data_buf data_copy[ISER_DIRS_NUM];/* contig. copy */ + struct iser_data_buf prot[ISER_DIRS_NUM]; /* prot desc */ + struct iser_data_buf prot_copy[ISER_DIRS_NUM];/* prot copy */ +}; + +struct iser_page_vec { + u64 *pages; + int length; + int offset; + int data_size; +}; + +struct iser_cq_desc { + struct iser_device *device; + int cq_index; +}; + +struct iser_global { + struct mutex device_list_mutex;/* */ + struct list_head device_list; /* all iSER devices */ + struct mutex connlist_mutex; + struct list_head connlist; /* all iSER IB connections */ + + struct kmem_cache *desc_cache; +}; + +extern struct iser_global ig; +extern int iser_debug_level; +extern bool iser_pi_enable; +extern int iser_pi_guard; + +/* allocate connection resources needed for rdma functionality */ +int iser_conn_set_full_featured_mode(struct iscsi_conn *conn); + +int iser_send_control(struct iscsi_conn *conn, + struct iscsi_task *task); + +int iser_send_command(struct iscsi_conn *conn, + struct iscsi_task *task); + +int iser_send_data_out(struct iscsi_conn *conn, + struct iscsi_task *task, + struct iscsi_data *hdr); + +void iscsi_iser_recv(struct iscsi_conn *conn, + struct iscsi_hdr *hdr, + char *rx_data, + int rx_data_len); + +void iser_conn_init(struct iser_conn *ib_conn); + +void iser_conn_release(struct iser_conn *ib_conn); + +void iser_conn_terminate(struct iser_conn *ib_conn); + +void iser_release_work(struct work_struct *work); + +void iser_rcv_completion(struct iser_rx_desc *desc, + unsigned long dto_xfer_len, + struct iser_conn *ib_conn); + +void iser_snd_completion(struct iser_tx_desc *desc, struct iser_conn *ib_conn); + +void iser_task_rdma_init(struct iscsi_iser_task *task); + +void iser_task_rdma_finalize(struct iscsi_iser_task *task); + +void iser_free_rx_descriptors(struct iser_conn *ib_conn); + +void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_data_buf *mem_copy, + enum iser_data_dir cmd_dir); + +int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, + enum iser_data_dir cmd_dir); +int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task, + enum iser_data_dir cmd_dir); + +int iser_connect(struct iser_conn *ib_conn, + struct sockaddr_in *src_addr, + struct sockaddr_in *dst_addr, + int non_blocking); + +int iser_reg_page_vec(struct iser_conn *ib_conn, + struct iser_page_vec *page_vec, + struct iser_mem_reg *mem_reg); + +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); + +int iser_post_recvl(struct iser_conn *ib_conn); +int iser_post_recvm(struct iser_conn *ib_conn, int count); +int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc); + +int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir); + +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data); +int iser_initialize_task_headers(struct iscsi_task *task, + struct iser_tx_desc *tx_desc); +int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session); +int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max); +void iser_free_fmr_pool(struct iser_conn *ib_conn); +int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max); +void iser_free_fastreg_pool(struct iser_conn *ib_conn); +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector); +#endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c new file mode 100644 index 00000000000..8d44a406063 --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -0,0 +1,720 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/scatterlist.h> +#include <linux/kfifo.h> +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_host.h> + +#include "iscsi_iser.h" + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. Data size is stored in + * task->data[ISER_DIR_IN].data_len, Protection size + * os stored in task->prot[ISER_DIR_IN].data_len + */ +static int iser_prepare_read_cmd(struct iscsi_task *task) + +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_device *device = iser_task->ib_conn->device; + struct iser_regd_buf *regd_buf; + int err; + struct iser_hdr *hdr = &iser_task->desc.iser_header; + struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN]; + + err = iser_dma_map_task_data(iser_task, + buf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return err; + + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_in = &iser_task->prot[ISER_DIR_IN]; + + err = iser_dma_map_task_data(iser_task, + pbuf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return err; + } + + err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN); + if (err) { + iser_err("Failed to set up Data-IN RDMA\n"); + return err; + } + regd_buf = &iser_task->rdma_regd[ISER_DIR_IN]; + + hdr->flags |= ISER_RSV; + hdr->read_stag = cpu_to_be32(regd_buf->reg.rkey); + hdr->read_va = cpu_to_be64(regd_buf->reg.va); + + iser_dbg("Cmd itt:%d READ tags RKEY:%#.4X VA:%#llX\n", + task->itt, regd_buf->reg.rkey, + (unsigned long long)regd_buf->reg.va); + + return 0; +} + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. Data size is stored in + * task->data[ISER_DIR_OUT].data_len, Protection size + * is stored at task->prot[ISER_DIR_OUT].data_len + */ +static int +iser_prepare_write_cmd(struct iscsi_task *task, + unsigned int imm_sz, + unsigned int unsol_sz, + unsigned int edtl) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_device *device = iser_task->ib_conn->device; + struct iser_regd_buf *regd_buf; + int err; + struct iser_hdr *hdr = &iser_task->desc.iser_header; + struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT]; + struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1]; + + err = iser_dma_map_task_data(iser_task, + buf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return err; + + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_out = &iser_task->prot[ISER_DIR_OUT]; + + err = iser_dma_map_task_data(iser_task, + pbuf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return err; + } + + err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT); + if (err != 0) { + iser_err("Failed to register write cmd RDMA mem\n"); + return err; + } + + regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT]; + + if (unsol_sz < edtl) { + hdr->flags |= ISER_WSV; + hdr->write_stag = cpu_to_be32(regd_buf->reg.rkey); + hdr->write_va = cpu_to_be64(regd_buf->reg.va + unsol_sz); + + iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X " + "VA:%#llX + unsol:%d\n", + task->itt, regd_buf->reg.rkey, + (unsigned long long)regd_buf->reg.va, unsol_sz); + } + + if (imm_sz > 0) { + iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n", + task->itt, imm_sz); + tx_dsg->addr = regd_buf->reg.va; + tx_dsg->length = imm_sz; + tx_dsg->lkey = regd_buf->reg.lkey; + iser_task->desc.num_sge = 2; + } + + return 0; +} + +/* creates a new tx descriptor and adds header regd buffer */ +static void iser_create_send_desc(struct iser_conn *ib_conn, + struct iser_tx_desc *tx_desc) +{ + struct iser_device *device = ib_conn->device; + + ib_dma_sync_single_for_cpu(device->ib_device, + tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + + memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); + tx_desc->iser_header.flags = ISER_VER; + + tx_desc->num_sge = 1; + + if (tx_desc->tx_sg[0].lkey != device->mr->lkey) { + tx_desc->tx_sg[0].lkey = device->mr->lkey; + iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc); + } +} + +static void iser_free_login_buf(struct iser_conn *ib_conn) +{ + if (!ib_conn->login_buf) + return; + + if (ib_conn->login_req_dma) + ib_dma_unmap_single(ib_conn->device->ib_device, + ib_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + + if (ib_conn->login_resp_dma) + ib_dma_unmap_single(ib_conn->device->ib_device, + ib_conn->login_resp_dma, + ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + + kfree(ib_conn->login_buf); + + /* make sure we never redo any unmapping */ + ib_conn->login_req_dma = 0; + ib_conn->login_resp_dma = 0; + ib_conn->login_buf = NULL; +} + +static int iser_alloc_login_buf(struct iser_conn *ib_conn) +{ + struct iser_device *device; + int req_err, resp_err; + + BUG_ON(ib_conn->device == NULL); + + device = ib_conn->device; + + ib_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + + ISER_RX_LOGIN_SIZE, GFP_KERNEL); + if (!ib_conn->login_buf) + goto out_err; + + ib_conn->login_req_buf = ib_conn->login_buf; + ib_conn->login_resp_buf = ib_conn->login_buf + + ISCSI_DEF_MAX_RECV_SEG_LEN; + + ib_conn->login_req_dma = ib_dma_map_single(ib_conn->device->ib_device, + (void *)ib_conn->login_req_buf, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + + ib_conn->login_resp_dma = ib_dma_map_single(ib_conn->device->ib_device, + (void *)ib_conn->login_resp_buf, + ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + + req_err = ib_dma_mapping_error(device->ib_device, + ib_conn->login_req_dma); + resp_err = ib_dma_mapping_error(device->ib_device, + ib_conn->login_resp_dma); + + if (req_err || resp_err) { + if (req_err) + ib_conn->login_req_dma = 0; + if (resp_err) + ib_conn->login_resp_dma = 0; + goto free_login_buf; + } + return 0; + +free_login_buf: + iser_free_login_buf(ib_conn); + +out_err: + iser_err("unable to alloc or map login buf\n"); + return -ENOMEM; +} + +int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session) +{ + int i, j; + u64 dma_addr; + struct iser_rx_desc *rx_desc; + struct ib_sge *rx_sg; + struct iser_device *device = ib_conn->device; + + ib_conn->qp_max_recv_dtos = session->cmds_max; + ib_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ + ib_conn->min_posted_rx = ib_conn->qp_max_recv_dtos >> 2; + + if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max)) + goto create_rdma_reg_res_failed; + + if (iser_alloc_login_buf(ib_conn)) + goto alloc_login_buf_fail; + + ib_conn->rx_descs = kmalloc(session->cmds_max * + sizeof(struct iser_rx_desc), GFP_KERNEL); + if (!ib_conn->rx_descs) + goto rx_desc_alloc_fail; + + rx_desc = ib_conn->rx_descs; + + for (i = 0; i < ib_conn->qp_max_recv_dtos; i++, rx_desc++) { + dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device->ib_device, dma_addr)) + goto rx_desc_dma_map_failed; + + rx_desc->dma_addr = dma_addr; + + rx_sg = &rx_desc->rx_sg; + rx_sg->addr = rx_desc->dma_addr; + rx_sg->length = ISER_RX_PAYLOAD_SIZE; + rx_sg->lkey = device->mr->lkey; + } + + ib_conn->rx_desc_head = 0; + return 0; + +rx_desc_dma_map_failed: + rx_desc = ib_conn->rx_descs; + for (j = 0; j < i; j++, rx_desc++) + ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + kfree(ib_conn->rx_descs); + ib_conn->rx_descs = NULL; +rx_desc_alloc_fail: + iser_free_login_buf(ib_conn); +alloc_login_buf_fail: + device->iser_free_rdma_reg_res(ib_conn); +create_rdma_reg_res_failed: + iser_err("failed allocating rx descriptors / data buffers\n"); + return -ENOMEM; +} + +void iser_free_rx_descriptors(struct iser_conn *ib_conn) +{ + int i; + struct iser_rx_desc *rx_desc; + struct iser_device *device = ib_conn->device; + + if (!ib_conn->rx_descs) + goto free_login_buf; + + if (device->iser_free_rdma_reg_res) + device->iser_free_rdma_reg_res(ib_conn); + + rx_desc = ib_conn->rx_descs; + for (i = 0; i < ib_conn->qp_max_recv_dtos; i++, rx_desc++) + ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + kfree(ib_conn->rx_descs); + /* make sure we never redo any unmapping */ + ib_conn->rx_descs = NULL; + +free_login_buf: + iser_free_login_buf(ib_conn); +} + +static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) +{ + struct iser_conn *ib_conn = conn->dd_data; + struct iscsi_session *session = conn->session; + + iser_dbg("req op %x flags %x\n", req->opcode, req->flags); + /* check if this is the last login - going to full feature phase */ + if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE) + return 0; + + /* + * Check that there is one posted recv buffer (for the last login + * response) and no posted send buffers left - they must have been + * consumed during previous login phases. + */ + WARN_ON(ib_conn->post_recv_buf_count != 1); + WARN_ON(atomic_read(&ib_conn->post_send_buf_count) != 0); + + if (session->discovery_sess) { + iser_info("Discovery session, re-using login RX buffer\n"); + return 0; + } else + iser_info("Normal session, posting batch of RX %d buffers\n", + ib_conn->min_posted_rx); + + /* Initial post receive buffers */ + if (iser_post_recvm(ib_conn, ib_conn->min_posted_rx)) + return -ENOMEM; + + return 0; +} + +/** + * iser_send_command - send command PDU + */ +int iser_send_command(struct iscsi_conn *conn, + struct iscsi_task *task) +{ + struct iser_conn *ib_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; + unsigned long edtl; + int err; + struct iser_data_buf *data_buf, *prot_buf; + struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; + struct scsi_cmnd *sc = task->sc; + struct iser_tx_desc *tx_desc = &iser_task->desc; + + edtl = ntohl(hdr->data_length); + + /* build the tx desc regd header and add it to the tx desc dto */ + tx_desc->type = ISCSI_TX_SCSI_COMMAND; + iser_create_send_desc(ib_conn, tx_desc); + + if (hdr->flags & ISCSI_FLAG_CMD_READ) { + data_buf = &iser_task->data[ISER_DIR_IN]; + prot_buf = &iser_task->prot[ISER_DIR_IN]; + } else { + data_buf = &iser_task->data[ISER_DIR_OUT]; + prot_buf = &iser_task->prot[ISER_DIR_OUT]; + } + + if (scsi_sg_count(sc)) { /* using a scatter list */ + data_buf->buf = scsi_sglist(sc); + data_buf->size = scsi_sg_count(sc); + } + data_buf->data_len = scsi_bufflen(sc); + + if (scsi_prot_sg_count(sc)) { + prot_buf->buf = scsi_prot_sglist(sc); + prot_buf->size = scsi_prot_sg_count(sc); + prot_buf->data_len = data_buf->data_len >> + ilog2(sc->device->sector_size) * 8; + } + + if (hdr->flags & ISCSI_FLAG_CMD_READ) { + err = iser_prepare_read_cmd(task); + if (err) + goto send_command_error; + } + if (hdr->flags & ISCSI_FLAG_CMD_WRITE) { + err = iser_prepare_write_cmd(task, + task->imm_count, + task->imm_count + + task->unsol_r2t.data_length, + edtl); + if (err) + goto send_command_error; + } + + iser_task->status = ISER_TASK_STATUS_STARTED; + + err = iser_post_send(ib_conn, tx_desc); + if (!err) + return 0; + +send_command_error: + iser_err("conn %p failed task->itt %d err %d\n",conn, task->itt, err); + return err; +} + +/** + * iser_send_data_out - send data out PDU + */ +int iser_send_data_out(struct iscsi_conn *conn, + struct iscsi_task *task, + struct iscsi_data *hdr) +{ + struct iser_conn *ib_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_tx_desc *tx_desc = NULL; + struct iser_regd_buf *regd_buf; + unsigned long buf_offset; + unsigned long data_seg_len; + uint32_t itt; + int err = 0; + struct ib_sge *tx_dsg; + + itt = (__force uint32_t)hdr->itt; + data_seg_len = ntoh24(hdr->dlength); + buf_offset = ntohl(hdr->offset); + + iser_dbg("%s itt %d dseg_len %d offset %d\n", + __func__,(int)itt,(int)data_seg_len,(int)buf_offset); + + tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC); + if (tx_desc == NULL) { + iser_err("Failed to alloc desc for post dataout\n"); + return -ENOMEM; + } + + tx_desc->type = ISCSI_TX_DATAOUT; + tx_desc->iser_header.flags = ISER_VER; + memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr)); + + /* build the tx desc */ + iser_initialize_task_headers(task, tx_desc); + + regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT]; + tx_dsg = &tx_desc->tx_sg[1]; + tx_dsg->addr = regd_buf->reg.va + buf_offset; + tx_dsg->length = data_seg_len; + tx_dsg->lkey = regd_buf->reg.lkey; + tx_desc->num_sge = 2; + + if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) { + iser_err("Offset:%ld & DSL:%ld in Data-Out " + "inconsistent with total len:%ld, itt:%d\n", + buf_offset, data_seg_len, + iser_task->data[ISER_DIR_OUT].data_len, itt); + err = -EINVAL; + goto send_data_out_error; + } + iser_dbg("data-out itt: %d, offset: %ld, sz: %ld\n", + itt, buf_offset, data_seg_len); + + + err = iser_post_send(ib_conn, tx_desc); + if (!err) + return 0; + +send_data_out_error: + kmem_cache_free(ig.desc_cache, tx_desc); + iser_err("conn %p failed err %d\n",conn, err); + return err; +} + +int iser_send_control(struct iscsi_conn *conn, + struct iscsi_task *task) +{ + struct iser_conn *ib_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_tx_desc *mdesc = &iser_task->desc; + unsigned long data_seg_len; + int err = 0; + struct iser_device *device; + + /* build the tx desc regd header and add it to the tx desc dto */ + mdesc->type = ISCSI_TX_CONTROL; + iser_create_send_desc(ib_conn, mdesc); + + device = ib_conn->device; + + data_seg_len = ntoh24(task->hdr->dlength); + + if (data_seg_len > 0) { + struct ib_sge *tx_dsg = &mdesc->tx_sg[1]; + if (task != conn->login_task) { + iser_err("data present on non login task!!!\n"); + goto send_control_error; + } + + ib_dma_sync_single_for_cpu(device->ib_device, + ib_conn->login_req_dma, task->data_count, + DMA_TO_DEVICE); + + memcpy(ib_conn->login_req_buf, task->data, task->data_count); + + ib_dma_sync_single_for_device(device->ib_device, + ib_conn->login_req_dma, task->data_count, + DMA_TO_DEVICE); + + tx_dsg->addr = ib_conn->login_req_dma; + tx_dsg->length = task->data_count; + tx_dsg->lkey = device->mr->lkey; + mdesc->num_sge = 2; + } + + if (task == conn->login_task) { + iser_dbg("op %x dsl %lx, posting login rx buffer\n", + task->hdr->opcode, data_seg_len); + err = iser_post_recvl(ib_conn); + if (err) + goto send_control_error; + err = iser_post_rx_bufs(conn, task->hdr); + if (err) + goto send_control_error; + } + + err = iser_post_send(ib_conn, mdesc); + if (!err) + return 0; + +send_control_error: + iser_err("conn %p failed err %d\n",conn, err); + return err; +} + +/** + * iser_rcv_dto_completion - recv DTO completion + */ +void iser_rcv_completion(struct iser_rx_desc *rx_desc, + unsigned long rx_xfer_len, + struct iser_conn *ib_conn) +{ + struct iscsi_hdr *hdr; + u64 rx_dma; + int rx_buflen, outstanding, count, err; + + /* differentiate between login to all other PDUs */ + if ((char *)rx_desc == ib_conn->login_resp_buf) { + rx_dma = ib_conn->login_resp_dma; + rx_buflen = ISER_RX_LOGIN_SIZE; + } else { + rx_dma = rx_desc->dma_addr; + rx_buflen = ISER_RX_PAYLOAD_SIZE; + } + + ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma, + rx_buflen, DMA_FROM_DEVICE); + + hdr = &rx_desc->iscsi_header; + + iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, + hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN)); + + iscsi_iser_recv(ib_conn->iscsi_conn, hdr, rx_desc->data, + rx_xfer_len - ISER_HEADERS_LEN); + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma, + rx_buflen, DMA_FROM_DEVICE); + + /* decrementing conn->post_recv_buf_count only --after-- freeing the * + * task eliminates the need to worry on tasks which are completed in * + * parallel to the execution of iser_conn_term. So the code that waits * + * for the posted rx bufs refcount to become zero handles everything */ + ib_conn->post_recv_buf_count--; + + if (rx_dma == ib_conn->login_resp_dma) + return; + + outstanding = ib_conn->post_recv_buf_count; + if (outstanding + ib_conn->min_posted_rx <= ib_conn->qp_max_recv_dtos) { + count = min(ib_conn->qp_max_recv_dtos - outstanding, + ib_conn->min_posted_rx); + err = iser_post_recvm(ib_conn, count); + if (err) + iser_err("posting %d rx bufs err %d\n", count, err); + } +} + +void iser_snd_completion(struct iser_tx_desc *tx_desc, + struct iser_conn *ib_conn) +{ + struct iscsi_task *task; + struct iser_device *device = ib_conn->device; + + if (tx_desc->type == ISCSI_TX_DATAOUT) { + ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + kmem_cache_free(ig.desc_cache, tx_desc); + tx_desc = NULL; + } + + atomic_dec(&ib_conn->post_send_buf_count); + + if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) { + /* this arithmetic is legal by libiscsi dd_data allocation */ + task = (void *) ((long)(void *)tx_desc - + sizeof(struct iscsi_task)); + if (task->hdr->itt == RESERVED_ITT) + iscsi_put_task(task); + } +} + +void iser_task_rdma_init(struct iscsi_iser_task *iser_task) + +{ + iser_task->status = ISER_TASK_STATUS_INIT; + + iser_task->dir[ISER_DIR_IN] = 0; + iser_task->dir[ISER_DIR_OUT] = 0; + + iser_task->data[ISER_DIR_IN].data_len = 0; + iser_task->data[ISER_DIR_OUT].data_len = 0; + + iser_task->prot[ISER_DIR_IN].data_len = 0; + iser_task->prot[ISER_DIR_OUT].data_len = 0; + + memset(&iser_task->rdma_regd[ISER_DIR_IN], 0, + sizeof(struct iser_regd_buf)); + memset(&iser_task->rdma_regd[ISER_DIR_OUT], 0, + sizeof(struct iser_regd_buf)); +} + +void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) +{ + struct iser_device *device = iser_task->ib_conn->device; + int is_rdma_data_aligned = 1; + int is_rdma_prot_aligned = 1; + int prot_count = scsi_prot_sg_count(iser_task->sc); + + /* if we were reading, copy back to unaligned sglist, + * anyway dma_unmap and free the copy + */ + if (iser_task->data_copy[ISER_DIR_IN].copy_buf != NULL) { + is_rdma_data_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->data[ISER_DIR_IN], + &iser_task->data_copy[ISER_DIR_IN], + ISER_DIR_IN); + } + + if (iser_task->data_copy[ISER_DIR_OUT].copy_buf != NULL) { + is_rdma_data_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->data[ISER_DIR_OUT], + &iser_task->data_copy[ISER_DIR_OUT], + ISER_DIR_OUT); + } + + if (iser_task->prot_copy[ISER_DIR_IN].copy_buf != NULL) { + is_rdma_prot_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->prot[ISER_DIR_IN], + &iser_task->prot_copy[ISER_DIR_IN], + ISER_DIR_IN); + } + + if (iser_task->prot_copy[ISER_DIR_OUT].copy_buf != NULL) { + is_rdma_prot_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->prot[ISER_DIR_OUT], + &iser_task->prot_copy[ISER_DIR_OUT], + ISER_DIR_OUT); + } + + if (iser_task->dir[ISER_DIR_IN]) { + device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN); + if (is_rdma_data_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_IN]); + if (prot_count && is_rdma_prot_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_IN]); + } + + if (iser_task->dir[ISER_DIR_OUT]) { + device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT); + if (is_rdma_data_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_OUT]); + if (prot_count && is_rdma_prot_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_OUT]); + } +} diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c new file mode 100644 index 00000000000..47acd3ad3a1 --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -0,0 +1,800 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/scatterlist.h> + +#include "iscsi_iser.h" + +#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */ + +/** + * iser_start_rdma_unaligned_sg + */ +static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + struct iser_data_buf *data_copy, + enum iser_data_dir cmd_dir) +{ + struct ib_device *dev = iser_task->ib_conn->device->ib_device; + struct scatterlist *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg; + char *mem = NULL; + unsigned long cmd_data_len = 0; + int dma_nents, i; + + for_each_sg(sgl, sg, data->size, i) + cmd_data_len += ib_sg_dma_len(dev, sg); + + if (cmd_data_len > ISER_KMALLOC_THRESHOLD) + mem = (void *)__get_free_pages(GFP_ATOMIC, + ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); + else + mem = kmalloc(cmd_data_len, GFP_ATOMIC); + + if (mem == NULL) { + iser_err("Failed to allocate mem size %d %d for copying sglist\n", + data->size, (int)cmd_data_len); + return -ENOMEM; + } + + if (cmd_dir == ISER_DIR_OUT) { + /* copy the unaligned sg the buffer which is used for RDMA */ + int i; + char *p, *from; + + sgl = (struct scatterlist *)data->buf; + p = mem; + for_each_sg(sgl, sg, data->size, i) { + from = kmap_atomic(sg_page(sg)); + memcpy(p, + from + sg->offset, + sg->length); + kunmap_atomic(from); + p += sg->length; + } + } + + sg_init_one(&data_copy->sg_single, mem, cmd_data_len); + data_copy->buf = &data_copy->sg_single; + data_copy->size = 1; + data_copy->copy_buf = mem; + + dma_nents = ib_dma_map_sg(dev, &data_copy->sg_single, 1, + (cmd_dir == ISER_DIR_OUT) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + BUG_ON(dma_nents == 0); + + data_copy->dma_nents = dma_nents; + data_copy->data_len = cmd_data_len; + + return 0; +} + +/** + * iser_finalize_rdma_unaligned_sg + */ + +void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + struct iser_data_buf *data_copy, + enum iser_data_dir cmd_dir) +{ + struct ib_device *dev; + unsigned long cmd_data_len; + + dev = iser_task->ib_conn->device->ib_device; + + ib_dma_unmap_sg(dev, &data_copy->sg_single, 1, + (cmd_dir == ISER_DIR_OUT) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + + if (cmd_dir == ISER_DIR_IN) { + char *mem; + struct scatterlist *sgl, *sg; + unsigned char *p, *to; + unsigned int sg_size; + int i; + + /* copy back read RDMA to unaligned sg */ + mem = data_copy->copy_buf; + + sgl = (struct scatterlist *)data->buf; + sg_size = data->size; + + p = mem; + for_each_sg(sgl, sg, sg_size, i) { + to = kmap_atomic(sg_page(sg)); + memcpy(to + sg->offset, + p, + sg->length); + kunmap_atomic(to); + p += sg->length; + } + } + + cmd_data_len = data->data_len; + + if (cmd_data_len > ISER_KMALLOC_THRESHOLD) + free_pages((unsigned long)data_copy->copy_buf, + ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); + else + kfree(data_copy->copy_buf); + + data_copy->copy_buf = NULL; +} + +#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) + +/** + * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses + * and returns the length of resulting physical address array (may be less than + * the original due to possible compaction). + * + * we build a "page vec" under the assumption that the SG meets the RDMA + * alignment requirements. Other then the first and last SG elements, all + * the "internal" elements can be compacted into a list whose elements are + * dma addresses of physical pages. The code supports also the weird case + * where --few fragments of the same page-- are present in the SG as + * consecutive elements. Also, it handles one entry SG. + */ + +static int iser_sg_to_page_vec(struct iser_data_buf *data, + struct ib_device *ibdev, u64 *pages, + int *offset, int *data_size) +{ + struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf; + u64 start_addr, end_addr, page, chunk_start = 0; + unsigned long total_sz = 0; + unsigned int dma_len; + int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; + + /* compute the offset of first element */ + *offset = (u64) sgl[0].offset & ~MASK_4K; + + new_chunk = 1; + cur_page = 0; + for_each_sg(sgl, sg, data->dma_nents, i) { + start_addr = ib_sg_dma_address(ibdev, sg); + if (new_chunk) + chunk_start = start_addr; + dma_len = ib_sg_dma_len(ibdev, sg); + end_addr = start_addr + dma_len; + total_sz += dma_len; + + /* collect page fragments until aligned or end of SG list */ + if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { + new_chunk = 0; + continue; + } + new_chunk = 1; + + /* address of the first page in the contiguous chunk; + masking relevant for the very first SG entry, + which might be unaligned */ + page = chunk_start & MASK_4K; + do { + pages[cur_page++] = page; + page += SIZE_4K; + } while (page < end_addr); + } + + *data_size = total_sz; + iser_dbg("page_vec->data_size:%d cur_page %d\n", + *data_size, cur_page); + return cur_page; +} + + +/** + * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned + * for RDMA sub-list of a scatter-gather list of memory buffers, and returns + * the number of entries which are aligned correctly. Supports the case where + * consecutive SG elements are actually fragments of the same physcial page. + */ +static int iser_data_buf_aligned_len(struct iser_data_buf *data, + struct ib_device *ibdev) +{ + struct scatterlist *sgl, *sg, *next_sg = NULL; + u64 start_addr, end_addr; + int i, ret_len, start_check = 0; + + if (data->dma_nents == 1) + return 1; + + sgl = (struct scatterlist *)data->buf; + start_addr = ib_sg_dma_address(ibdev, sgl); + + for_each_sg(sgl, sg, data->dma_nents, i) { + if (start_check && !IS_4K_ALIGNED(start_addr)) + break; + + next_sg = sg_next(sg); + if (!next_sg) + break; + + end_addr = start_addr + ib_sg_dma_len(ibdev, sg); + start_addr = ib_sg_dma_address(ibdev, next_sg); + + if (end_addr == start_addr) { + start_check = 0; + continue; + } else + start_check = 1; + + if (!IS_4K_ALIGNED(end_addr)) + break; + } + ret_len = (next_sg) ? i : i+1; + iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", + ret_len, data->dma_nents, data); + return ret_len; +} + +static void iser_data_buf_dump(struct iser_data_buf *data, + struct ib_device *ibdev) +{ + struct scatterlist *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, data->dma_nents, i) + iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p " + "off:0x%x sz:0x%x dma_len:0x%x\n", + i, (unsigned long)ib_sg_dma_address(ibdev, sg), + sg_page(sg), sg->offset, + sg->length, ib_sg_dma_len(ibdev, sg)); +} + +static void iser_dump_page_vec(struct iser_page_vec *page_vec) +{ + int i; + + iser_err("page vec length %d data size %d\n", + page_vec->length, page_vec->data_size); + for (i = 0; i < page_vec->length; i++) + iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]); +} + +static void iser_page_vec_build(struct iser_data_buf *data, + struct iser_page_vec *page_vec, + struct ib_device *ibdev) +{ + int page_vec_len = 0; + + page_vec->length = 0; + page_vec->offset = 0; + + iser_dbg("Translating sg sz: %d\n", data->dma_nents); + page_vec_len = iser_sg_to_page_vec(data, ibdev, page_vec->pages, + &page_vec->offset, + &page_vec->data_size); + iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len); + + page_vec->length = page_vec_len; + + if (page_vec_len * SIZE_4K < page_vec->data_size) { + iser_err("page_vec too short to hold this SG\n"); + iser_data_buf_dump(data, ibdev); + iser_dump_page_vec(page_vec); + BUG(); + } +} + +int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir) +{ + struct ib_device *dev; + + iser_task->dir[iser_dir] = 1; + dev = iser_task->ib_conn->device->ib_device; + + data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir); + if (data->dma_nents == 0) { + iser_err("dma_map_sg failed!!!\n"); + return -EINVAL; + } + return 0; +} + +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data) +{ + struct ib_device *dev; + + dev = iser_task->ib_conn->device->ib_device; + ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE); +} + +static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, + struct ib_device *ibdev, + struct iser_data_buf *mem, + struct iser_data_buf *mem_copy, + enum iser_data_dir cmd_dir, + int aligned_len) +{ + struct iscsi_conn *iscsi_conn = iser_task->ib_conn->iscsi_conn; + + iscsi_conn->fmr_unalign_cnt++; + iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n", + aligned_len, mem->size); + + if (iser_debug_level > 0) + iser_data_buf_dump(mem, ibdev); + + /* unmap the command data before accessing it */ + iser_dma_unmap_task_data(iser_task, mem); + + /* allocate copy buf, if we are writing, copy the */ + /* unaligned scatterlist, dma map the copy */ + if (iser_start_rdma_unaligned_sg(iser_task, mem, mem_copy, cmd_dir) != 0) + return -ENOMEM; + + return 0; +} + +/** + * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA, + * using FMR (if possible) obtaining rkey and va + * + * returns 0 on success, errno code on failure + */ +int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_conn *ib_conn = iser_task->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_device *ibdev = device->ib_device; + struct iser_data_buf *mem = &iser_task->data[cmd_dir]; + struct iser_regd_buf *regd_buf; + int aligned_len; + int err; + int i; + struct scatterlist *sg; + + regd_buf = &iser_task->rdma_regd[cmd_dir]; + + aligned_len = iser_data_buf_aligned_len(mem, ibdev); + if (aligned_len != mem->dma_nents) { + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->data_copy[cmd_dir], + cmd_dir, aligned_len); + if (err) { + iser_err("failed to allocate bounce buffer\n"); + return err; + } + mem = &iser_task->data_copy[cmd_dir]; + } + + /* if there a single dma entry, FMR is not needed */ + if (mem->dma_nents == 1) { + sg = (struct scatterlist *)mem->buf; + + regd_buf->reg.lkey = device->mr->lkey; + regd_buf->reg.rkey = device->mr->rkey; + regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); + regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); + regd_buf->reg.is_mr = 0; + + iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " + "va: 0x%08lX sz: %ld]\n", + (unsigned int)regd_buf->reg.lkey, + (unsigned int)regd_buf->reg.rkey, + (unsigned long)regd_buf->reg.va, + (unsigned long)regd_buf->reg.len); + } else { /* use FMR for multiple dma entries */ + iser_page_vec_build(mem, ib_conn->fmr.page_vec, ibdev); + err = iser_reg_page_vec(ib_conn, ib_conn->fmr.page_vec, + ®d_buf->reg); + if (err && err != -EAGAIN) { + iser_data_buf_dump(mem, ibdev); + iser_err("mem->dma_nents = %d (dlength = 0x%x)\n", + mem->dma_nents, + ntoh24(iser_task->desc.iscsi_header.dlength)); + iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n", + ib_conn->fmr.page_vec->data_size, + ib_conn->fmr.page_vec->length, + ib_conn->fmr.page_vec->offset); + for (i = 0; i < ib_conn->fmr.page_vec->length; i++) + iser_err("page_vec[%d] = 0x%llx\n", i, + (unsigned long long) ib_conn->fmr.page_vec->pages[i]); + } + if (err) + return err; + } + return 0; +} + +static inline enum ib_t10_dif_type +scsi2ib_prot_type(unsigned char prot_type) +{ + switch (prot_type) { + case SCSI_PROT_DIF_TYPE0: + return IB_T10DIF_NONE; + case SCSI_PROT_DIF_TYPE1: + return IB_T10DIF_TYPE1; + case SCSI_PROT_DIF_TYPE2: + return IB_T10DIF_TYPE2; + case SCSI_PROT_DIF_TYPE3: + return IB_T10DIF_TYPE3; + default: + return IB_T10DIF_NONE; + } +} + + +static int +iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) +{ + unsigned char scsi_ptype = scsi_get_prot_type(sc); + + sig_attrs->mem.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->mem.sig.dif.pi_interval = sc->device->sector_size; + sig_attrs->wire.sig.dif.pi_interval = sc->device->sector_size; + + switch (scsi_get_prot_op(sc)) { + case SCSI_PROT_WRITE_INSERT: + case SCSI_PROT_READ_STRIP: + sig_attrs->mem.sig.dif.type = IB_T10DIF_NONE; + sig_attrs->wire.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + break; + case SCSI_PROT_READ_INSERT: + case SCSI_PROT_WRITE_STRIP: + sig_attrs->mem.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + sig_attrs->wire.sig.dif.type = IB_T10DIF_NONE; + break; + case SCSI_PROT_READ_PASS: + case SCSI_PROT_WRITE_PASS: + sig_attrs->mem.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + sig_attrs->wire.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = scsi_get_lba(sc) & + 0xffffffff; + break; + default: + iser_err("Unsupported PI operation %d\n", + scsi_get_prot_op(sc)); + return -EINVAL; + } + return 0; +} + + +static int +iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) +{ + switch (scsi_get_prot_type(sc)) { + case SCSI_PROT_DIF_TYPE0: + *mask = 0x0; + break; + case SCSI_PROT_DIF_TYPE1: + case SCSI_PROT_DIF_TYPE2: + *mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG; + break; + case SCSI_PROT_DIF_TYPE3: + *mask = ISER_CHECK_GUARD; + break; + default: + iser_err("Unsupported protection type %d\n", + scsi_get_prot_type(sc)); + return -EINVAL; + } + + return 0; +} + +static int +iser_reg_sig_mr(struct iscsi_iser_task *iser_task, + struct fast_reg_descriptor *desc, struct ib_sge *data_sge, + struct ib_sge *prot_sge, struct ib_sge *sig_sge) +{ + struct iser_conn *ib_conn = iser_task->ib_conn; + struct iser_pi_context *pi_ctx = desc->pi_ctx; + struct ib_send_wr sig_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + struct ib_sig_attrs sig_attrs; + int ret; + u32 key; + + memset(&sig_attrs, 0, sizeof(sig_attrs)); + ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); + if (ret) + goto err; + + ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); + if (ret) + goto err; + + if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); + } + + memset(&sig_wr, 0, sizeof(sig_wr)); + sig_wr.opcode = IB_WR_REG_SIG_MR; + sig_wr.wr_id = ISER_FASTREG_LI_WRID; + sig_wr.sg_list = data_sge; + sig_wr.num_sge = 1; + sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; + sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; + if (scsi_prot_sg_count(iser_task->sc)) + sig_wr.wr.sig_handover.prot = prot_sge; + sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + + if (!wr) + wr = &sig_wr; + else + wr->next = &sig_wr; + + ret = ib_post_send(ib_conn->qp, wr, &bad_wr); + if (ret) { + iser_err("reg_sig_mr failed, ret:%d\n", ret); + goto err; + } + desc->reg_indicators &= ~ISER_SIG_KEY_VALID; + + sig_sge->lkey = pi_ctx->sig_mr->lkey; + sig_sge->addr = 0; + sig_sge->length = data_sge->length + prot_sge->length; + if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT || + scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) { + sig_sge->length += (data_sge->length / + iser_task->sc->device->sector_size) * 8; + } + + iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n", + sig_sge->addr, sig_sge->length, + sig_sge->lkey); +err: + return ret; +} + +static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, + struct iser_regd_buf *regd_buf, + struct iser_data_buf *mem, + enum iser_reg_indicator ind, + struct ib_sge *sge) +{ + struct fast_reg_descriptor *desc = regd_buf->reg.mem_h; + struct iser_conn *ib_conn = iser_task->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_device *ibdev = device->ib_device; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + struct ib_send_wr fastreg_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + u8 key; + int ret, offset, size, plen; + + /* if there a single dma entry, dma mr suffices */ + if (mem->dma_nents == 1) { + struct scatterlist *sg = (struct scatterlist *)mem->buf; + + sge->lkey = device->mr->lkey; + sge->addr = ib_sg_dma_address(ibdev, &sg[0]); + sge->length = ib_sg_dma_len(ibdev, &sg[0]); + + iser_dbg("Single DMA entry: lkey=0x%x, addr=0x%llx, length=0x%x\n", + sge->lkey, sge->addr, sge->length); + return 0; + } + + if (ind == ISER_DATA_KEY_VALID) { + mr = desc->data_mr; + frpl = desc->data_frpl; + } else { + mr = desc->pi_ctx->prot_mr; + frpl = desc->pi_ctx->prot_frpl; + } + + plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list, + &offset, &size); + if (plen * SIZE_4K < size) { + iser_err("fast reg page_list too short to hold this SG\n"); + return -EINVAL; + } + + if (!(desc->reg_indicators & ind)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.ex.invalidate_rkey = mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + } + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = ISER_FASTREG_LI_WRID; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset; + fastreg_wr.wr.fast_reg.page_list = frpl; + fastreg_wr.wr.fast_reg.page_list_len = plen; + fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K; + fastreg_wr.wr.fast_reg.length = size; + fastreg_wr.wr.fast_reg.rkey = mr->rkey; + fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + if (!wr) + wr = &fastreg_wr; + else + wr->next = &fastreg_wr; + + ret = ib_post_send(ib_conn->qp, wr, &bad_wr); + if (ret) { + iser_err("fast registration failed, ret:%d\n", ret); + return ret; + } + desc->reg_indicators &= ~ind; + + sge->lkey = mr->lkey; + sge->addr = frpl->page_list[0] + offset; + sge->length = size; + + return ret; +} + +/** + * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA, + * using Fast Registration WR (if possible) obtaining rkey and va + * + * returns 0 on success, errno code on failure + */ +int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_conn *ib_conn = iser_task->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_device *ibdev = device->ib_device; + struct iser_data_buf *mem = &iser_task->data[cmd_dir]; + struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir]; + struct fast_reg_descriptor *desc = NULL; + struct ib_sge data_sge; + int err, aligned_len; + unsigned long flags; + + aligned_len = iser_data_buf_aligned_len(mem, ibdev); + if (aligned_len != mem->dma_nents) { + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->data_copy[cmd_dir], + cmd_dir, aligned_len); + if (err) { + iser_err("failed to allocate bounce buffer\n"); + return err; + } + mem = &iser_task->data_copy[cmd_dir]; + } + + if (mem->dma_nents != 1 || + scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { + spin_lock_irqsave(&ib_conn->lock, flags); + desc = list_first_entry(&ib_conn->fastreg.pool, + struct fast_reg_descriptor, list); + list_del(&desc->list); + spin_unlock_irqrestore(&ib_conn->lock, flags); + regd_buf->reg.mem_h = desc; + } + + err = iser_fast_reg_mr(iser_task, regd_buf, mem, + ISER_DATA_KEY_VALID, &data_sge); + if (err) + goto err_reg; + + if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { + struct ib_sge prot_sge, sig_sge; + + memset(&prot_sge, 0, sizeof(prot_sge)); + if (scsi_prot_sg_count(iser_task->sc)) { + mem = &iser_task->prot[cmd_dir]; + aligned_len = iser_data_buf_aligned_len(mem, ibdev); + if (aligned_len != mem->dma_nents) { + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->prot_copy[cmd_dir], + cmd_dir, aligned_len); + if (err) { + iser_err("failed to allocate bounce buffer\n"); + return err; + } + mem = &iser_task->prot_copy[cmd_dir]; + } + + err = iser_fast_reg_mr(iser_task, regd_buf, mem, + ISER_PROT_KEY_VALID, &prot_sge); + if (err) + goto err_reg; + } + + err = iser_reg_sig_mr(iser_task, desc, &data_sge, + &prot_sge, &sig_sge); + if (err) { + iser_err("Failed to register signature mr\n"); + return err; + } + desc->reg_indicators |= ISER_FASTREG_PROTECTED; + + regd_buf->reg.lkey = sig_sge.lkey; + regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; + regd_buf->reg.va = sig_sge.addr; + regd_buf->reg.len = sig_sge.length; + regd_buf->reg.is_mr = 1; + } else { + if (desc) { + regd_buf->reg.rkey = desc->data_mr->rkey; + regd_buf->reg.is_mr = 1; + } else { + regd_buf->reg.rkey = device->mr->rkey; + regd_buf->reg.is_mr = 0; + } + + regd_buf->reg.lkey = data_sge.lkey; + regd_buf->reg.va = data_sge.addr; + regd_buf->reg.len = data_sge.length; + } + + return 0; +err_reg: + if (desc) { + spin_lock_irqsave(&ib_conn->lock, flags); + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + spin_unlock_irqrestore(&ib_conn->lock, flags); + } + + return err; +} diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c new file mode 100644 index 00000000000..ea01075f9f9 --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -0,0 +1,1199 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/delay.h> + +#include "iscsi_iser.h" + +#define ISCSI_ISER_MAX_CONN 8 +#define ISER_MAX_RX_CQ_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_TX_CQ_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) + +static void iser_cq_tasklet_fn(unsigned long data); +static void iser_cq_callback(struct ib_cq *cq, void *cq_context); + +static void iser_cq_event_callback(struct ib_event *cause, void *context) +{ + iser_err("got cq event %d \n", cause->event); +} + +static void iser_qp_event_callback(struct ib_event *cause, void *context) +{ + iser_err("got qp event %d\n",cause->event); +} + +static void iser_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + iser_err("async event %d on device %s port %d\n", event->event, + event->device->name, event->element.port_num); +} + +/** + * iser_create_device_ib_res - creates Protection Domain (PD), Completion + * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with + * the adapator. + * + * returns 0 on success, -1 on failure + */ +static int iser_create_device_ib_res(struct iser_device *device) +{ + struct iser_cq_desc *cq_desc; + struct ib_device_attr *dev_attr = &device->dev_attr; + int ret, i, j; + + ret = ib_query_device(device->ib_device, dev_attr); + if (ret) { + pr_warn("Query device failed for %s\n", device->ib_device->name); + return ret; + } + + /* Assign function handles - based on FMR support */ + if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr && + device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) { + iser_info("FMR supported, using FMR for registration\n"); + device->iser_alloc_rdma_reg_res = iser_create_fmr_pool; + device->iser_free_rdma_reg_res = iser_free_fmr_pool; + device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr; + device->iser_unreg_rdma_mem = iser_unreg_mem_fmr; + } else + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + iser_info("FastReg supported, using FastReg for registration\n"); + device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool; + device->iser_free_rdma_reg_res = iser_free_fastreg_pool; + device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg; + device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg; + } else { + iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); + return -1; + } + + device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors); + iser_info("using %d CQs, device %s supports %d vectors\n", + device->cqs_used, device->ib_device->name, + device->ib_device->num_comp_vectors); + + device->cq_desc = kmalloc(sizeof(struct iser_cq_desc) * device->cqs_used, + GFP_KERNEL); + if (device->cq_desc == NULL) + goto cq_desc_err; + cq_desc = device->cq_desc; + + device->pd = ib_alloc_pd(device->ib_device); + if (IS_ERR(device->pd)) + goto pd_err; + + for (i = 0; i < device->cqs_used; i++) { + cq_desc[i].device = device; + cq_desc[i].cq_index = i; + + device->rx_cq[i] = ib_create_cq(device->ib_device, + iser_cq_callback, + iser_cq_event_callback, + (void *)&cq_desc[i], + ISER_MAX_RX_CQ_LEN, i); + if (IS_ERR(device->rx_cq[i])) + goto cq_err; + + device->tx_cq[i] = ib_create_cq(device->ib_device, + NULL, iser_cq_event_callback, + (void *)&cq_desc[i], + ISER_MAX_TX_CQ_LEN, i); + + if (IS_ERR(device->tx_cq[i])) + goto cq_err; + + if (ib_req_notify_cq(device->rx_cq[i], IB_CQ_NEXT_COMP)) + goto cq_err; + + tasklet_init(&device->cq_tasklet[i], + iser_cq_tasklet_fn, + (unsigned long)&cq_desc[i]); + } + + device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + if (IS_ERR(device->mr)) + goto dma_mr_err; + + INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device, + iser_event_handler); + if (ib_register_event_handler(&device->event_handler)) + goto handler_err; + + return 0; + +handler_err: + ib_dereg_mr(device->mr); +dma_mr_err: + for (j = 0; j < device->cqs_used; j++) + tasklet_kill(&device->cq_tasklet[j]); +cq_err: + for (j = 0; j < i; j++) { + if (device->tx_cq[j]) + ib_destroy_cq(device->tx_cq[j]); + if (device->rx_cq[j]) + ib_destroy_cq(device->rx_cq[j]); + } + ib_dealloc_pd(device->pd); +pd_err: + kfree(device->cq_desc); +cq_desc_err: + iser_err("failed to allocate an IB resource\n"); + return -1; +} + +/** + * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR, + * CQ and PD created with the device associated with the adapator. + */ +static void iser_free_device_ib_res(struct iser_device *device) +{ + int i; + BUG_ON(device->mr == NULL); + + for (i = 0; i < device->cqs_used; i++) { + tasklet_kill(&device->cq_tasklet[i]); + (void)ib_destroy_cq(device->tx_cq[i]); + (void)ib_destroy_cq(device->rx_cq[i]); + device->tx_cq[i] = NULL; + device->rx_cq[i] = NULL; + } + + (void)ib_unregister_event_handler(&device->event_handler); + (void)ib_dereg_mr(device->mr); + (void)ib_dealloc_pd(device->pd); + + kfree(device->cq_desc); + + device->mr = NULL; + device->pd = NULL; +} + +/** + * iser_create_fmr_pool - Creates FMR pool and page_vector + * + * returns 0 on success, or errno code on failure + */ +int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max) +{ + struct iser_device *device = ib_conn->device; + struct ib_fmr_pool_param params; + int ret = -ENOMEM; + + ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) + + (sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)), + GFP_KERNEL); + if (!ib_conn->fmr.page_vec) + return ret; + + ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1); + + params.page_shift = SHIFT_4K; + /* when the first/last SG element are not start/end * + * page aligned, the map whould be of N+1 pages */ + params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1; + /* make the pool size twice the max number of SCSI commands * + * the ML is expected to queue, watermark for unmap at 50% */ + params.pool_size = cmds_max * 2; + params.dirty_watermark = cmds_max; + params.cache = 0; + params.flush_function = NULL; + params.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, ¶ms); + if (!IS_ERR(ib_conn->fmr.pool)) + return 0; + + /* no FMR => no need for page_vec */ + kfree(ib_conn->fmr.page_vec); + ib_conn->fmr.page_vec = NULL; + + ret = PTR_ERR(ib_conn->fmr.pool); + ib_conn->fmr.pool = NULL; + if (ret != -ENOSYS) { + iser_err("FMR allocation failed, err %d\n", ret); + return ret; + } else { + iser_warn("FMRs are not supported, using unaligned mode\n"); + return 0; + } +} + +/** + * iser_free_fmr_pool - releases the FMR pool and page vec + */ +void iser_free_fmr_pool(struct iser_conn *ib_conn) +{ + iser_info("freeing conn %p fmr pool %p\n", + ib_conn, ib_conn->fmr.pool); + + if (ib_conn->fmr.pool != NULL) + ib_destroy_fmr_pool(ib_conn->fmr.pool); + + ib_conn->fmr.pool = NULL; + + kfree(ib_conn->fmr.page_vec); + ib_conn->fmr.page_vec = NULL; +} + +static int +iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, + bool pi_enable, struct fast_reg_descriptor *desc) +{ + int ret; + + desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(desc->data_frpl)) { + ret = PTR_ERR(desc->data_frpl); + iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", + ret); + return PTR_ERR(desc->data_frpl); + } + + desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(desc->data_mr)) { + ret = PTR_ERR(desc->data_mr); + iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); + goto fast_reg_mr_failure; + } + desc->reg_indicators |= ISER_DATA_KEY_VALID; + + if (pi_enable) { + struct ib_mr_init_attr mr_init_attr = {0}; + struct iser_pi_context *pi_ctx = NULL; + + desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); + if (!desc->pi_ctx) { + iser_err("Failed to allocate pi context\n"); + ret = -ENOMEM; + goto pi_ctx_alloc_failure; + } + pi_ctx = desc->pi_ctx; + + pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_frpl)) { + ret = PTR_ERR(pi_ctx->prot_frpl); + iser_err("Failed to allocate prot frpl ret=%d\n", + ret); + goto prot_frpl_failure; + } + + pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, + ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(pi_ctx->prot_mr)) { + ret = PTR_ERR(pi_ctx->prot_mr); + iser_err("Failed to allocate prot frmr ret=%d\n", + ret); + goto prot_mr_failure; + } + desc->reg_indicators |= ISER_PROT_KEY_VALID; + + mr_init_attr.max_reg_descriptors = 2; + mr_init_attr.flags |= IB_MR_SIGNATURE_EN; + pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + if (IS_ERR(pi_ctx->sig_mr)) { + ret = PTR_ERR(pi_ctx->sig_mr); + iser_err("Failed to allocate signature enabled mr err=%d\n", + ret); + goto sig_mr_failure; + } + desc->reg_indicators |= ISER_SIG_KEY_VALID; + } + desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + + iser_dbg("Create fr_desc %p page_list %p\n", + desc, desc->data_frpl->page_list); + + return 0; +sig_mr_failure: + ib_dereg_mr(desc->pi_ctx->prot_mr); +prot_mr_failure: + ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); +prot_frpl_failure: + kfree(desc->pi_ctx); +pi_ctx_alloc_failure: + ib_dereg_mr(desc->data_mr); +fast_reg_mr_failure: + ib_free_fast_reg_page_list(desc->data_frpl); + + return ret; +} + +/** + * iser_create_fastreg_pool - Creates pool of fast_reg descriptors + * for fast registration work requests. + * returns 0 on success, or errno code on failure + */ +int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max) +{ + struct iser_device *device = ib_conn->device; + struct fast_reg_descriptor *desc; + int i, ret; + + INIT_LIST_HEAD(&ib_conn->fastreg.pool); + ib_conn->fastreg.pool_size = 0; + for (i = 0; i < cmds_max; i++) { + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) { + iser_err("Failed to allocate a new fast_reg descriptor\n"); + ret = -ENOMEM; + goto err; + } + + ret = iser_create_fastreg_desc(device->ib_device, device->pd, + ib_conn->pi_support, desc); + if (ret) { + iser_err("Failed to create fastreg descriptor err=%d\n", + ret); + kfree(desc); + goto err; + } + + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + ib_conn->fastreg.pool_size++; + } + + return 0; + +err: + iser_free_fastreg_pool(ib_conn); + return ret; +} + +/** + * iser_free_fastreg_pool - releases the pool of fast_reg descriptors + */ +void iser_free_fastreg_pool(struct iser_conn *ib_conn) +{ + struct fast_reg_descriptor *desc, *tmp; + int i = 0; + + if (list_empty(&ib_conn->fastreg.pool)) + return; + + iser_info("freeing conn %p fr pool\n", ib_conn); + + list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) { + list_del(&desc->list); + ib_free_fast_reg_page_list(desc->data_frpl); + ib_dereg_mr(desc->data_mr); + if (desc->pi_ctx) { + ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); + ib_dereg_mr(desc->pi_ctx->prot_mr); + ib_destroy_mr(desc->pi_ctx->sig_mr); + kfree(desc->pi_ctx); + } + kfree(desc); + ++i; + } + + if (i < ib_conn->fastreg.pool_size) + iser_warn("pool still has %d regions registered\n", + ib_conn->fastreg.pool_size - i); +} + +/** + * iser_create_ib_conn_res - Queue-Pair (QP) + * + * returns 0 on success, -1 on failure + */ +static int iser_create_ib_conn_res(struct iser_conn *ib_conn) +{ + struct iser_device *device; + struct ib_qp_init_attr init_attr; + int ret = -ENOMEM; + int index, min_index = 0; + + BUG_ON(ib_conn->device == NULL); + + device = ib_conn->device; + + memset(&init_attr, 0, sizeof init_attr); + + mutex_lock(&ig.connlist_mutex); + /* select the CQ with the minimal number of usages */ + for (index = 0; index < device->cqs_used; index++) + if (device->cq_active_qps[index] < + device->cq_active_qps[min_index]) + min_index = index; + device->cq_active_qps[min_index]++; + mutex_unlock(&ig.connlist_mutex); + iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn); + + init_attr.event_handler = iser_qp_event_callback; + init_attr.qp_context = (void *)ib_conn; + init_attr.send_cq = device->tx_cq[min_index]; + init_attr.recv_cq = device->rx_cq[min_index]; + init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; + init_attr.cap.max_send_sge = 2; + init_attr.cap.max_recv_sge = 1; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_RC; + if (ib_conn->pi_support) { + init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS; + init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + } else { + init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; + } + + ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); + if (ret) + goto out_err; + + ib_conn->qp = ib_conn->cma_id->qp; + iser_info("setting conn %p cma_id %p qp %p\n", + ib_conn, ib_conn->cma_id, + ib_conn->cma_id->qp); + return ret; + +out_err: + iser_err("unable to alloc mem or create resource, err %d\n", ret); + return ret; +} + +/** + * releases the QP objects, returns 0 on success, + * -1 on failure + */ +static int iser_free_ib_conn_res(struct iser_conn *ib_conn) +{ + int cq_index; + BUG_ON(ib_conn == NULL); + + iser_info("freeing conn %p cma_id %p qp %p\n", + ib_conn, ib_conn->cma_id, + ib_conn->qp); + + /* qp is created only once both addr & route are resolved */ + + if (ib_conn->qp != NULL) { + cq_index = ((struct iser_cq_desc *)ib_conn->qp->recv_cq->cq_context)->cq_index; + ib_conn->device->cq_active_qps[cq_index]--; + + rdma_destroy_qp(ib_conn->cma_id); + } + + ib_conn->qp = NULL; + + return 0; +} + +/** + * based on the resolved device node GUID see if there already allocated + * device for this device. If there's no such, create one. + */ +static +struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id) +{ + struct iser_device *device; + + mutex_lock(&ig.device_list_mutex); + + list_for_each_entry(device, &ig.device_list, ig_list) + /* find if there's a match using the node GUID */ + if (device->ib_device->node_guid == cma_id->device->node_guid) + goto inc_refcnt; + + device = kzalloc(sizeof *device, GFP_KERNEL); + if (device == NULL) + goto out; + + /* assign this device to the device */ + device->ib_device = cma_id->device; + /* init the device and link it into ig device list */ + if (iser_create_device_ib_res(device)) { + kfree(device); + device = NULL; + goto out; + } + list_add(&device->ig_list, &ig.device_list); + +inc_refcnt: + device->refcount++; +out: + mutex_unlock(&ig.device_list_mutex); + return device; +} + +/* if there's no demand for this device, release it */ +static void iser_device_try_release(struct iser_device *device) +{ + mutex_lock(&ig.device_list_mutex); + device->refcount--; + iser_info("device %p refcount %d\n", device, device->refcount); + if (!device->refcount) { + iser_free_device_ib_res(device); + list_del(&device->ig_list); + kfree(device); + } + mutex_unlock(&ig.device_list_mutex); +} + +static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, + enum iser_ib_conn_state comp, + enum iser_ib_conn_state exch) +{ + int ret; + + spin_lock_bh(&ib_conn->lock); + if ((ret = (ib_conn->state == comp))) + ib_conn->state = exch; + spin_unlock_bh(&ib_conn->lock); + return ret; +} + +void iser_release_work(struct work_struct *work) +{ + struct iser_conn *ib_conn; + + ib_conn = container_of(work, struct iser_conn, release_work); + + /* wait for .conn_stop callback */ + wait_for_completion(&ib_conn->stop_completion); + + /* wait for the qp`s post send and post receive buffers to empty */ + wait_event_interruptible(ib_conn->wait, + ib_conn->state == ISER_CONN_DOWN); + + iser_conn_release(ib_conn); +} + +/** + * Frees all conn objects and deallocs conn descriptor + */ +void iser_conn_release(struct iser_conn *ib_conn) +{ + struct iser_device *device = ib_conn->device; + + BUG_ON(ib_conn->state == ISER_CONN_UP); + + mutex_lock(&ig.connlist_mutex); + list_del(&ib_conn->conn_list); + mutex_unlock(&ig.connlist_mutex); + iser_free_rx_descriptors(ib_conn); + iser_free_ib_conn_res(ib_conn); + ib_conn->device = NULL; + /* on EVENT_ADDR_ERROR there's no device yet for this conn */ + if (device != NULL) + iser_device_try_release(device); + /* if cma handler context, the caller actually destroy the id */ + if (ib_conn->cma_id != NULL) { + rdma_destroy_id(ib_conn->cma_id); + ib_conn->cma_id = NULL; + } + iscsi_destroy_endpoint(ib_conn->ep); +} + +/** + * triggers start of the disconnect procedures and wait for them to be done + */ +void iser_conn_terminate(struct iser_conn *ib_conn) +{ + int err = 0; + + /* change the ib conn state only if the conn is UP, however always call + * rdma_disconnect since this is the only way to cause the CMA to change + * the QP state to ERROR + */ + + iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, ISER_CONN_TERMINATING); + err = rdma_disconnect(ib_conn->cma_id); + if (err) + iser_err("Failed to disconnect, conn: 0x%p err %d\n", + ib_conn,err); +} + +static void iser_connect_error(struct rdma_cm_id *cma_id) +{ + struct iser_conn *ib_conn; + + ib_conn = (struct iser_conn *)cma_id->context; + + ib_conn->state = ISER_CONN_DOWN; + wake_up_interruptible(&ib_conn->wait); +} + +static void iser_addr_handler(struct rdma_cm_id *cma_id) +{ + struct iser_device *device; + struct iser_conn *ib_conn; + int ret; + + device = iser_device_find_by_ib_device(cma_id); + if (!device) { + iser_err("device lookup/creation failed\n"); + iser_connect_error(cma_id); + return; + } + + ib_conn = (struct iser_conn *)cma_id->context; + ib_conn->device = device; + + /* connection T10-PI support */ + if (iser_pi_enable) { + if (!(device->dev_attr.device_cap_flags & + IB_DEVICE_SIGNATURE_HANDOVER)) { + iser_warn("T10-PI requested but not supported on %s, " + "continue without T10-PI\n", + ib_conn->device->ib_device->name); + ib_conn->pi_support = false; + } else { + ib_conn->pi_support = true; + } + } + + ret = rdma_resolve_route(cma_id, 1000); + if (ret) { + iser_err("resolve route failed: %d\n", ret); + iser_connect_error(cma_id); + return; + } +} + +static void iser_route_handler(struct rdma_cm_id *cma_id) +{ + struct rdma_conn_param conn_param; + int ret; + struct iser_cm_hdr req_hdr; + + ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context); + if (ret) + goto failure; + + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = 4; + conn_param.initiator_depth = 1; + conn_param.retry_count = 7; + conn_param.rnr_retry_count = 6; + + memset(&req_hdr, 0, sizeof(req_hdr)); + req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED | + ISER_SEND_W_INV_NOT_SUPPORTED); + conn_param.private_data = (void *)&req_hdr; + conn_param.private_data_len = sizeof(struct iser_cm_hdr); + + ret = rdma_connect(cma_id, &conn_param); + if (ret) { + iser_err("failure connecting: %d\n", ret); + goto failure; + } + + return; +failure: + iser_connect_error(cma_id); +} + +static void iser_connected_handler(struct rdma_cm_id *cma_id) +{ + struct iser_conn *ib_conn; + struct ib_qp_attr attr; + struct ib_qp_init_attr init_attr; + + (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); + iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); + + ib_conn = (struct iser_conn *)cma_id->context; + if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_PENDING, ISER_CONN_UP)) + wake_up_interruptible(&ib_conn->wait); +} + +static void iser_disconnected_handler(struct rdma_cm_id *cma_id) +{ + struct iser_conn *ib_conn; + + ib_conn = (struct iser_conn *)cma_id->context; + + /* getting here when the state is UP means that the conn is being * + * terminated asynchronously from the iSCSI layer's perspective. */ + if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, + ISER_CONN_TERMINATING)){ + if (ib_conn->iscsi_conn) + iscsi_conn_failure(ib_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); + else + iser_err("iscsi_iser connection isn't bound\n"); + } + + /* Complete the termination process if no posts are pending */ + if (ib_conn->post_recv_buf_count == 0 && + (atomic_read(&ib_conn->post_send_buf_count) == 0)) { + ib_conn->state = ISER_CONN_DOWN; + wake_up_interruptible(&ib_conn->wait); + } +} + +static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + iser_info("event %d status %d conn %p id %p\n", + event->event, event->status, cma_id->context, cma_id); + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + iser_addr_handler(cma_id); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + iser_route_handler(cma_id); + break; + case RDMA_CM_EVENT_ESTABLISHED: + iser_connected_handler(cma_id); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + iser_connect_error(cma_id); + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_ADDR_CHANGE: + iser_disconnected_handler(cma_id); + break; + default: + iser_err("Unexpected RDMA CM event (%d)\n", event->event); + break; + } + return 0; +} + +void iser_conn_init(struct iser_conn *ib_conn) +{ + ib_conn->state = ISER_CONN_INIT; + init_waitqueue_head(&ib_conn->wait); + ib_conn->post_recv_buf_count = 0; + atomic_set(&ib_conn->post_send_buf_count, 0); + init_completion(&ib_conn->stop_completion); + INIT_LIST_HEAD(&ib_conn->conn_list); + spin_lock_init(&ib_conn->lock); +} + + /** + * starts the process of connecting to the target + * sleeps until the connection is established or rejected + */ +int iser_connect(struct iser_conn *ib_conn, + struct sockaddr_in *src_addr, + struct sockaddr_in *dst_addr, + int non_blocking) +{ + struct sockaddr *src, *dst; + int err = 0; + + sprintf(ib_conn->name, "%pI4:%d", + &dst_addr->sin_addr.s_addr, dst_addr->sin_port); + + /* the device is known only --after-- address resolution */ + ib_conn->device = NULL; + + iser_info("connecting to: %pI4, port 0x%x\n", + &dst_addr->sin_addr, dst_addr->sin_port); + + ib_conn->state = ISER_CONN_PENDING; + + ib_conn->cma_id = rdma_create_id(iser_cma_handler, + (void *)ib_conn, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(ib_conn->cma_id)) { + err = PTR_ERR(ib_conn->cma_id); + iser_err("rdma_create_id failed: %d\n", err); + goto id_failure; + } + + src = (struct sockaddr *)src_addr; + dst = (struct sockaddr *)dst_addr; + err = rdma_resolve_addr(ib_conn->cma_id, src, dst, 1000); + if (err) { + iser_err("rdma_resolve_addr failed: %d\n", err); + goto addr_failure; + } + + if (!non_blocking) { + wait_event_interruptible(ib_conn->wait, + (ib_conn->state != ISER_CONN_PENDING)); + + if (ib_conn->state != ISER_CONN_UP) { + err = -EIO; + goto connect_failure; + } + } + + mutex_lock(&ig.connlist_mutex); + list_add(&ib_conn->conn_list, &ig.connlist); + mutex_unlock(&ig.connlist_mutex); + return 0; + +id_failure: + ib_conn->cma_id = NULL; +addr_failure: + ib_conn->state = ISER_CONN_DOWN; +connect_failure: + iser_conn_release(ib_conn); + return err; +} + +/** + * iser_reg_page_vec - Register physical memory + * + * returns: 0 on success, errno code on failure + */ +int iser_reg_page_vec(struct iser_conn *ib_conn, + struct iser_page_vec *page_vec, + struct iser_mem_reg *mem_reg) +{ + struct ib_pool_fmr *mem; + u64 io_addr; + u64 *page_list; + int status; + + page_list = page_vec->pages; + io_addr = page_list[0]; + + mem = ib_fmr_pool_map_phys(ib_conn->fmr.pool, + page_list, + page_vec->length, + io_addr); + + if (IS_ERR(mem)) { + status = (int)PTR_ERR(mem); + iser_err("ib_fmr_pool_map_phys failed: %d\n", status); + return status; + } + + mem_reg->lkey = mem->fmr->lkey; + mem_reg->rkey = mem->fmr->rkey; + mem_reg->len = page_vec->length * SIZE_4K; + mem_reg->va = io_addr; + mem_reg->is_mr = 1; + mem_reg->mem_h = (void *)mem; + + mem_reg->va += page_vec->offset; + mem_reg->len = page_vec->data_size; + + iser_dbg("PHYSICAL Mem.register, [PHYS p_array: 0x%p, sz: %d, " + "entry[0]: (0x%08lx,%ld)] -> " + "[lkey: 0x%08X mem_h: 0x%p va: 0x%08lX sz: %ld]\n", + page_vec, page_vec->length, + (unsigned long)page_vec->pages[0], + (unsigned long)page_vec->data_size, + (unsigned int)mem_reg->lkey, mem_reg->mem_h, + (unsigned long)mem_reg->va, (unsigned long)mem_reg->len); + return 0; +} + +/** + * Unregister (previosuly registered using FMR) memory. + * If memory is non-FMR does nothing. + */ +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; + int ret; + + if (!reg->is_mr) + return; + + iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); + + ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); + if (ret) + iser_err("ib_fmr_pool_unmap failed %d\n", ret); + + reg->mem_h = NULL; +} + +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; + struct iser_conn *ib_conn = iser_task->ib_conn; + struct fast_reg_descriptor *desc = reg->mem_h; + + if (!reg->is_mr) + return; + + reg->mem_h = NULL; + reg->is_mr = 0; + spin_lock_bh(&ib_conn->lock); + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + spin_unlock_bh(&ib_conn->lock); +} + +int iser_post_recvl(struct iser_conn *ib_conn) +{ + struct ib_recv_wr rx_wr, *rx_wr_failed; + struct ib_sge sge; + int ib_ret; + + sge.addr = ib_conn->login_resp_dma; + sge.length = ISER_RX_LOGIN_SIZE; + sge.lkey = ib_conn->device->mr->lkey; + + rx_wr.wr_id = (unsigned long)ib_conn->login_resp_buf; + rx_wr.sg_list = &sge; + rx_wr.num_sge = 1; + rx_wr.next = NULL; + + ib_conn->post_recv_buf_count++; + ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed); + if (ib_ret) { + iser_err("ib_post_recv failed ret=%d\n", ib_ret); + ib_conn->post_recv_buf_count--; + } + return ib_ret; +} + +int iser_post_recvm(struct iser_conn *ib_conn, int count) +{ + struct ib_recv_wr *rx_wr, *rx_wr_failed; + int i, ib_ret; + unsigned int my_rx_head = ib_conn->rx_desc_head; + struct iser_rx_desc *rx_desc; + + for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { + rx_desc = &ib_conn->rx_descs[my_rx_head]; + rx_wr->wr_id = (unsigned long)rx_desc; + rx_wr->sg_list = &rx_desc->rx_sg; + rx_wr->num_sge = 1; + rx_wr->next = rx_wr + 1; + my_rx_head = (my_rx_head + 1) & ib_conn->qp_max_recv_dtos_mask; + } + + rx_wr--; + rx_wr->next = NULL; /* mark end of work requests list */ + + ib_conn->post_recv_buf_count += count; + ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed); + if (ib_ret) { + iser_err("ib_post_recv failed ret=%d\n", ib_ret); + ib_conn->post_recv_buf_count -= count; + } else + ib_conn->rx_desc_head = my_rx_head; + return ib_ret; +} + + +/** + * iser_start_send - Initiate a Send DTO operation + * + * returns 0 on success, -1 on failure + */ +int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc) +{ + int ib_ret; + struct ib_send_wr send_wr, *send_wr_failed; + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + + send_wr.next = NULL; + send_wr.wr_id = (unsigned long)tx_desc; + send_wr.sg_list = tx_desc->tx_sg; + send_wr.num_sge = tx_desc->num_sge; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + atomic_inc(&ib_conn->post_send_buf_count); + + ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed); + if (ib_ret) { + iser_err("ib_post_send failed, ret:%d\n", ib_ret); + atomic_dec(&ib_conn->post_send_buf_count); + } + return ib_ret; +} + +static void iser_handle_comp_error(struct iser_tx_desc *desc, + struct iser_conn *ib_conn) +{ + if (desc && desc->type == ISCSI_TX_DATAOUT) + kmem_cache_free(ig.desc_cache, desc); + + if (ib_conn->post_recv_buf_count == 0 && + atomic_read(&ib_conn->post_send_buf_count) == 0) { + /* getting here when the state is UP means that the conn is * + * being terminated asynchronously from the iSCSI layer's * + * perspective. */ + if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, + ISER_CONN_TERMINATING)) + iscsi_conn_failure(ib_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + + /* no more non completed posts to the QP, complete the + * termination process w.o worrying on disconnect event */ + ib_conn->state = ISER_CONN_DOWN; + wake_up_interruptible(&ib_conn->wait); + } +} + +static int iser_drain_tx_cq(struct iser_device *device, int cq_index) +{ + struct ib_cq *cq = device->tx_cq[cq_index]; + struct ib_wc wc; + struct iser_tx_desc *tx_desc; + struct iser_conn *ib_conn; + int completed_tx = 0; + + while (ib_poll_cq(cq, 1, &wc) == 1) { + tx_desc = (struct iser_tx_desc *) (unsigned long) wc.wr_id; + ib_conn = wc.qp->qp_context; + if (wc.status == IB_WC_SUCCESS) { + if (wc.opcode == IB_WC_SEND) + iser_snd_completion(tx_desc, ib_conn); + else + iser_err("expected opcode %d got %d\n", + IB_WC_SEND, wc.opcode); + } else { + iser_err("tx id %llx status %d vend_err %x\n", + wc.wr_id, wc.status, wc.vendor_err); + if (wc.wr_id != ISER_FASTREG_LI_WRID) { + atomic_dec(&ib_conn->post_send_buf_count); + iser_handle_comp_error(tx_desc, ib_conn); + } + } + completed_tx++; + } + return completed_tx; +} + + +static void iser_cq_tasklet_fn(unsigned long data) +{ + struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)data; + struct iser_device *device = cq_desc->device; + int cq_index = cq_desc->cq_index; + struct ib_cq *cq = device->rx_cq[cq_index]; + struct ib_wc wc; + struct iser_rx_desc *desc; + unsigned long xfer_len; + struct iser_conn *ib_conn; + int completed_tx, completed_rx = 0; + + /* First do tx drain, so in a case where we have rx flushes and a successful + * tx completion we will still go through completion error handling. + */ + completed_tx = iser_drain_tx_cq(device, cq_index); + + while (ib_poll_cq(cq, 1, &wc) == 1) { + desc = (struct iser_rx_desc *) (unsigned long) wc.wr_id; + BUG_ON(desc == NULL); + ib_conn = wc.qp->qp_context; + if (wc.status == IB_WC_SUCCESS) { + if (wc.opcode == IB_WC_RECV) { + xfer_len = (unsigned long)wc.byte_len; + iser_rcv_completion(desc, xfer_len, ib_conn); + } else + iser_err("expected opcode %d got %d\n", + IB_WC_RECV, wc.opcode); + } else { + if (wc.status != IB_WC_WR_FLUSH_ERR) + iser_err("rx id %llx status %d vend_err %x\n", + wc.wr_id, wc.status, wc.vendor_err); + ib_conn->post_recv_buf_count--; + iser_handle_comp_error(NULL, ib_conn); + } + completed_rx++; + if (!(completed_rx & 63)) + completed_tx += iser_drain_tx_cq(device, cq_index); + } + /* #warning "it is assumed here that arming CQ only once its empty" * + * " would not cause interrupts to be missed" */ + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + + iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx); +} + +static void iser_cq_callback(struct ib_cq *cq, void *cq_context) +{ + struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)cq_context; + struct iser_device *device = cq_desc->device; + int cq_index = cq_desc->cq_index; + + tasklet_schedule(&device->cq_tasklet[cq_index]); +} + +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector) +{ + struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; + struct fast_reg_descriptor *desc = reg->mem_h; + unsigned long sector_size = iser_task->sc->device->sector_size; + struct ib_mr_status mr_status; + int ret; + + if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) { + desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + ret = ib_check_mr_status(desc->pi_ctx->sig_mr, + IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + goto err; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + sector_t sector_off = mr_status.sig_err.sig_err_offset; + + do_div(sector_off, sector_size + 8); + *sector = scsi_get_lba(iser_task->sc) + sector_off; + + pr_err("PI error found type %d at sector %llx " + "expected %x vs actual %x\n", + mr_status.sig_err.err_type, + (unsigned long long)*sector, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + return 0x1; + case IB_SIG_BAD_REFTAG: + return 0x3; + case IB_SIG_BAD_APPTAG: + return 0x2; + } + } + } + + return 0; +err: + /* Not alot we can do here, return ambiguous guard error */ + return 0x1; +} diff --git a/drivers/infiniband/ulp/isert/Kconfig b/drivers/infiniband/ulp/isert/Kconfig new file mode 100644 index 00000000000..02f9759ebb1 --- /dev/null +++ b/drivers/infiniband/ulp/isert/Kconfig @@ -0,0 +1,5 @@ +config INFINIBAND_ISERT + tristate "iSCSI Extensions for RDMA (iSER) target support" + depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET + ---help--- + Support for iSCSI Extensions for RDMA (iSER) Target on Infiniband fabrics. diff --git a/drivers/infiniband/ulp/isert/Makefile b/drivers/infiniband/ulp/isert/Makefile new file mode 100644 index 00000000000..c8bf2421f5b --- /dev/null +++ b/drivers/infiniband/ulp/isert/Makefile @@ -0,0 +1,2 @@ +ccflags-y := -Idrivers/target -Idrivers/target/iscsi +obj-$(CONFIG_INFINIBAND_ISERT) += ib_isert.o diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c new file mode 100644 index 00000000000..d4c7928a0f3 --- /dev/null +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -0,0 +1,3308 @@ +/******************************************************************************* + * This file contains iSCSI extentions for RDMA (iSER) Verbs + * + * (c) Copyright 2013 Datera, Inc. + * + * Nicholas A. Bellinger <nab@linux-iscsi.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + ****************************************************************************/ + +#include <linux/string.h> +#include <linux/module.h> +#include <linux/scatterlist.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/llist.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <target/target_core_base.h> +#include <target/target_core_fabric.h> +#include <target/iscsi/iscsi_transport.h> +#include <linux/semaphore.h> + +#include "isert_proto.h" +#include "ib_isert.h" + +#define ISERT_MAX_CONN 8 +#define ISER_MAX_RX_CQ_LEN (ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN) +#define ISER_MAX_TX_CQ_LEN (ISERT_QP_MAX_REQ_DTOS * ISERT_MAX_CONN) + +static DEFINE_MUTEX(device_list_mutex); +static LIST_HEAD(device_list); +static struct workqueue_struct *isert_rx_wq; +static struct workqueue_struct *isert_comp_wq; + +static void +isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); +static int +isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr); +static void +isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); +static int +isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr); +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd); + +static void +isert_qp_event_callback(struct ib_event *e, void *context) +{ + struct isert_conn *isert_conn = (struct isert_conn *)context; + + pr_err("isert_qp_event_callback event: %d\n", e->event); + switch (e->event) { + case IB_EVENT_COMM_EST: + rdma_notify(isert_conn->conn_cm_id, IB_EVENT_COMM_EST); + break; + case IB_EVENT_QP_LAST_WQE_REACHED: + pr_warn("Reached TX IB_EVENT_QP_LAST_WQE_REACHED:\n"); + break; + default: + break; + } +} + +static int +isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr) +{ + int ret; + + ret = ib_query_device(ib_dev, devattr); + if (ret) { + pr_err("ib_query_device() failed: %d\n", ret); + return ret; + } + pr_debug("devattr->max_sge: %d\n", devattr->max_sge); + pr_debug("devattr->max_sge_rd: %d\n", devattr->max_sge_rd); + + return 0; +} + +static int +isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id, + u8 protection) +{ + struct isert_device *device = isert_conn->conn_device; + struct ib_qp_init_attr attr; + int ret, index, min_index = 0; + + mutex_lock(&device_list_mutex); + for (index = 0; index < device->cqs_used; index++) + if (device->cq_active_qps[index] < + device->cq_active_qps[min_index]) + min_index = index; + device->cq_active_qps[min_index]++; + pr_debug("isert_conn_setup_qp: Using min_index: %d\n", min_index); + mutex_unlock(&device_list_mutex); + + memset(&attr, 0, sizeof(struct ib_qp_init_attr)); + attr.event_handler = isert_qp_event_callback; + attr.qp_context = isert_conn; + attr.send_cq = device->dev_tx_cq[min_index]; + attr.recv_cq = device->dev_rx_cq[min_index]; + attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS; + attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS; + /* + * FIXME: Use devattr.max_sge - 2 for max_send_sge as + * work-around for RDMA_READ.. + */ + attr.cap.max_send_sge = device->dev_attr.max_sge - 2; + isert_conn->max_sge = attr.cap.max_send_sge; + + attr.cap.max_recv_sge = 1; + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + attr.qp_type = IB_QPT_RC; + if (protection) + attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + + pr_debug("isert_conn_setup_qp cma_id->device: %p\n", + cma_id->device); + pr_debug("isert_conn_setup_qp conn_pd->device: %p\n", + isert_conn->conn_pd->device); + + ret = rdma_create_qp(cma_id, isert_conn->conn_pd, &attr); + if (ret) { + pr_err("rdma_create_qp failed for cma_id %d\n", ret); + return ret; + } + isert_conn->conn_qp = cma_id->qp; + pr_debug("rdma_create_qp() returned success >>>>>>>>>>>>>>>>>>>>>>>>>.\n"); + + return 0; +} + +static void +isert_cq_event_callback(struct ib_event *e, void *context) +{ + pr_debug("isert_cq_event_callback event: %d\n", e->event); +} + +static int +isert_alloc_rx_descriptors(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iser_rx_desc *rx_desc; + struct ib_sge *rx_sg; + u64 dma_addr; + int i, j; + + isert_conn->conn_rx_descs = kzalloc(ISERT_QP_MAX_RECV_DTOS * + sizeof(struct iser_rx_desc), GFP_KERNEL); + if (!isert_conn->conn_rx_descs) + goto fail; + + rx_desc = isert_conn->conn_rx_descs; + + for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { + dma_addr = ib_dma_map_single(ib_dev, (void *)rx_desc, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ib_dev, dma_addr)) + goto dma_map_fail; + + rx_desc->dma_addr = dma_addr; + + rx_sg = &rx_desc->rx_sg; + rx_sg->addr = rx_desc->dma_addr; + rx_sg->length = ISER_RX_PAYLOAD_SIZE; + rx_sg->lkey = isert_conn->conn_mr->lkey; + } + + isert_conn->conn_rx_desc_head = 0; + return 0; + +dma_map_fail: + rx_desc = isert_conn->conn_rx_descs; + for (j = 0; j < i; j++, rx_desc++) { + ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + } + kfree(isert_conn->conn_rx_descs); + isert_conn->conn_rx_descs = NULL; +fail: + return -ENOMEM; +} + +static void +isert_free_rx_descriptors(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iser_rx_desc *rx_desc; + int i; + + if (!isert_conn->conn_rx_descs) + return; + + rx_desc = isert_conn->conn_rx_descs; + for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { + ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + } + + kfree(isert_conn->conn_rx_descs); + isert_conn->conn_rx_descs = NULL; +} + +static void isert_cq_tx_work(struct work_struct *); +static void isert_cq_tx_callback(struct ib_cq *, void *); +static void isert_cq_rx_work(struct work_struct *); +static void isert_cq_rx_callback(struct ib_cq *, void *); + +static int +isert_create_device_ib_res(struct isert_device *device) +{ + struct ib_device *ib_dev = device->ib_device; + struct isert_cq_desc *cq_desc; + struct ib_device_attr *dev_attr; + int ret = 0, i, j; + + dev_attr = &device->dev_attr; + ret = isert_query_device(ib_dev, dev_attr); + if (ret) + return ret; + + /* asign function handlers */ + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && + dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { + device->use_fastreg = 1; + device->reg_rdma_mem = isert_reg_rdma; + device->unreg_rdma_mem = isert_unreg_rdma; + } else { + device->use_fastreg = 0; + device->reg_rdma_mem = isert_map_rdma; + device->unreg_rdma_mem = isert_unmap_cmd; + } + + /* Check signature cap */ + device->pi_capable = dev_attr->device_cap_flags & + IB_DEVICE_SIGNATURE_HANDOVER ? true : false; + + device->cqs_used = min_t(int, num_online_cpus(), + device->ib_device->num_comp_vectors); + device->cqs_used = min(ISERT_MAX_CQ, device->cqs_used); + pr_debug("Using %d CQs, device %s supports %d vectors support " + "Fast registration %d pi_capable %d\n", + device->cqs_used, device->ib_device->name, + device->ib_device->num_comp_vectors, device->use_fastreg, + device->pi_capable); + device->cq_desc = kzalloc(sizeof(struct isert_cq_desc) * + device->cqs_used, GFP_KERNEL); + if (!device->cq_desc) { + pr_err("Unable to allocate device->cq_desc\n"); + return -ENOMEM; + } + cq_desc = device->cq_desc; + + for (i = 0; i < device->cqs_used; i++) { + cq_desc[i].device = device; + cq_desc[i].cq_index = i; + + INIT_WORK(&cq_desc[i].cq_rx_work, isert_cq_rx_work); + device->dev_rx_cq[i] = ib_create_cq(device->ib_device, + isert_cq_rx_callback, + isert_cq_event_callback, + (void *)&cq_desc[i], + ISER_MAX_RX_CQ_LEN, i); + if (IS_ERR(device->dev_rx_cq[i])) { + ret = PTR_ERR(device->dev_rx_cq[i]); + device->dev_rx_cq[i] = NULL; + goto out_cq; + } + + INIT_WORK(&cq_desc[i].cq_tx_work, isert_cq_tx_work); + device->dev_tx_cq[i] = ib_create_cq(device->ib_device, + isert_cq_tx_callback, + isert_cq_event_callback, + (void *)&cq_desc[i], + ISER_MAX_TX_CQ_LEN, i); + if (IS_ERR(device->dev_tx_cq[i])) { + ret = PTR_ERR(device->dev_tx_cq[i]); + device->dev_tx_cq[i] = NULL; + goto out_cq; + } + + ret = ib_req_notify_cq(device->dev_rx_cq[i], IB_CQ_NEXT_COMP); + if (ret) + goto out_cq; + + ret = ib_req_notify_cq(device->dev_tx_cq[i], IB_CQ_NEXT_COMP); + if (ret) + goto out_cq; + } + + return 0; + +out_cq: + for (j = 0; j < i; j++) { + cq_desc = &device->cq_desc[j]; + + if (device->dev_rx_cq[j]) { + cancel_work_sync(&cq_desc->cq_rx_work); + ib_destroy_cq(device->dev_rx_cq[j]); + } + if (device->dev_tx_cq[j]) { + cancel_work_sync(&cq_desc->cq_tx_work); + ib_destroy_cq(device->dev_tx_cq[j]); + } + } + kfree(device->cq_desc); + + return ret; +} + +static void +isert_free_device_ib_res(struct isert_device *device) +{ + struct isert_cq_desc *cq_desc; + int i; + + for (i = 0; i < device->cqs_used; i++) { + cq_desc = &device->cq_desc[i]; + + cancel_work_sync(&cq_desc->cq_rx_work); + cancel_work_sync(&cq_desc->cq_tx_work); + ib_destroy_cq(device->dev_rx_cq[i]); + ib_destroy_cq(device->dev_tx_cq[i]); + device->dev_rx_cq[i] = NULL; + device->dev_tx_cq[i] = NULL; + } + + kfree(device->cq_desc); +} + +static void +isert_device_try_release(struct isert_device *device) +{ + mutex_lock(&device_list_mutex); + device->refcount--; + if (!device->refcount) { + isert_free_device_ib_res(device); + list_del(&device->dev_node); + kfree(device); + } + mutex_unlock(&device_list_mutex); +} + +static struct isert_device * +isert_device_find_by_ib_dev(struct rdma_cm_id *cma_id) +{ + struct isert_device *device; + int ret; + + mutex_lock(&device_list_mutex); + list_for_each_entry(device, &device_list, dev_node) { + if (device->ib_device->node_guid == cma_id->device->node_guid) { + device->refcount++; + mutex_unlock(&device_list_mutex); + return device; + } + } + + device = kzalloc(sizeof(struct isert_device), GFP_KERNEL); + if (!device) { + mutex_unlock(&device_list_mutex); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&device->dev_node); + + device->ib_device = cma_id->device; + ret = isert_create_device_ib_res(device); + if (ret) { + kfree(device); + mutex_unlock(&device_list_mutex); + return ERR_PTR(ret); + } + + device->refcount++; + list_add_tail(&device->dev_node, &device_list); + mutex_unlock(&device_list_mutex); + + return device; +} + +static void +isert_conn_free_fastreg_pool(struct isert_conn *isert_conn) +{ + struct fast_reg_descriptor *fr_desc, *tmp; + int i = 0; + + if (list_empty(&isert_conn->conn_fr_pool)) + return; + + pr_debug("Freeing conn %p fastreg pool", isert_conn); + + list_for_each_entry_safe(fr_desc, tmp, + &isert_conn->conn_fr_pool, list) { + list_del(&fr_desc->list); + ib_free_fast_reg_page_list(fr_desc->data_frpl); + ib_dereg_mr(fr_desc->data_mr); + if (fr_desc->pi_ctx) { + ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); + ib_dereg_mr(fr_desc->pi_ctx->prot_mr); + ib_destroy_mr(fr_desc->pi_ctx->sig_mr); + kfree(fr_desc->pi_ctx); + } + kfree(fr_desc); + ++i; + } + + if (i < isert_conn->conn_fr_pool_size) + pr_warn("Pool still has %d regions registered\n", + isert_conn->conn_fr_pool_size - i); +} + +static int +isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd, + struct fast_reg_descriptor *fr_desc, u8 protection) +{ + int ret; + + fr_desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(fr_desc->data_frpl)) { + pr_err("Failed to allocate data frpl err=%ld\n", + PTR_ERR(fr_desc->data_frpl)); + return PTR_ERR(fr_desc->data_frpl); + } + + fr_desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(fr_desc->data_mr)) { + pr_err("Failed to allocate data frmr err=%ld\n", + PTR_ERR(fr_desc->data_mr)); + ret = PTR_ERR(fr_desc->data_mr); + goto err_data_frpl; + } + pr_debug("Create fr_desc %p page_list %p\n", + fr_desc, fr_desc->data_frpl->page_list); + fr_desc->ind |= ISERT_DATA_KEY_VALID; + + if (protection) { + struct ib_mr_init_attr mr_init_attr = {0}; + struct pi_context *pi_ctx; + + fr_desc->pi_ctx = kzalloc(sizeof(*fr_desc->pi_ctx), GFP_KERNEL); + if (!fr_desc->pi_ctx) { + pr_err("Failed to allocate pi context\n"); + ret = -ENOMEM; + goto err_data_mr; + } + pi_ctx = fr_desc->pi_ctx; + + pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_frpl)) { + pr_err("Failed to allocate prot frpl err=%ld\n", + PTR_ERR(pi_ctx->prot_frpl)); + ret = PTR_ERR(pi_ctx->prot_frpl); + goto err_pi_ctx; + } + + pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_mr)) { + pr_err("Failed to allocate prot frmr err=%ld\n", + PTR_ERR(pi_ctx->prot_mr)); + ret = PTR_ERR(pi_ctx->prot_mr); + goto err_prot_frpl; + } + fr_desc->ind |= ISERT_PROT_KEY_VALID; + + mr_init_attr.max_reg_descriptors = 2; + mr_init_attr.flags |= IB_MR_SIGNATURE_EN; + pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + if (IS_ERR(pi_ctx->sig_mr)) { + pr_err("Failed to allocate signature enabled mr err=%ld\n", + PTR_ERR(pi_ctx->sig_mr)); + ret = PTR_ERR(pi_ctx->sig_mr); + goto err_prot_mr; + } + fr_desc->ind |= ISERT_SIG_KEY_VALID; + } + fr_desc->ind &= ~ISERT_PROTECTED; + + return 0; +err_prot_mr: + ib_dereg_mr(fr_desc->pi_ctx->prot_mr); +err_prot_frpl: + ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); +err_pi_ctx: + kfree(fr_desc->pi_ctx); +err_data_mr: + ib_dereg_mr(fr_desc->data_mr); +err_data_frpl: + ib_free_fast_reg_page_list(fr_desc->data_frpl); + + return ret; +} + +static int +isert_conn_create_fastreg_pool(struct isert_conn *isert_conn, u8 pi_support) +{ + struct fast_reg_descriptor *fr_desc; + struct isert_device *device = isert_conn->conn_device; + struct se_session *se_sess = isert_conn->conn->sess->se_sess; + struct se_node_acl *se_nacl = se_sess->se_node_acl; + int i, ret, tag_num; + /* + * Setup the number of FRMRs based upon the number of tags + * available to session in iscsi_target_locate_portal(). + */ + tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth); + tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS; + + isert_conn->conn_fr_pool_size = 0; + for (i = 0; i < tag_num; i++) { + fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL); + if (!fr_desc) { + pr_err("Failed to allocate fast_reg descriptor\n"); + ret = -ENOMEM; + goto err; + } + + ret = isert_create_fr_desc(device->ib_device, + isert_conn->conn_pd, fr_desc, + pi_support); + if (ret) { + pr_err("Failed to create fastreg descriptor err=%d\n", + ret); + kfree(fr_desc); + goto err; + } + + list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); + isert_conn->conn_fr_pool_size++; + } + + pr_debug("Creating conn %p fastreg pool size=%d", + isert_conn, isert_conn->conn_fr_pool_size); + + return 0; + +err: + isert_conn_free_fastreg_pool(isert_conn); + return ret; +} + +static int +isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + struct iscsi_np *np = cma_id->context; + struct isert_np *isert_np = np->np_context; + struct isert_conn *isert_conn; + struct isert_device *device; + struct ib_device *ib_dev = cma_id->device; + int ret = 0; + u8 pi_support; + + spin_lock_bh(&np->np_thread_lock); + if (!np->enabled) { + spin_unlock_bh(&np->np_thread_lock); + pr_debug("iscsi_np is not enabled, reject connect request\n"); + return rdma_reject(cma_id, NULL, 0); + } + spin_unlock_bh(&np->np_thread_lock); + + pr_debug("Entering isert_connect_request cma_id: %p, context: %p\n", + cma_id, cma_id->context); + + isert_conn = kzalloc(sizeof(struct isert_conn), GFP_KERNEL); + if (!isert_conn) { + pr_err("Unable to allocate isert_conn\n"); + return -ENOMEM; + } + isert_conn->state = ISER_CONN_INIT; + INIT_LIST_HEAD(&isert_conn->conn_accept_node); + init_completion(&isert_conn->conn_login_comp); + init_completion(&isert_conn->conn_wait); + init_completion(&isert_conn->conn_wait_comp_err); + kref_init(&isert_conn->conn_kref); + kref_get(&isert_conn->conn_kref); + mutex_init(&isert_conn->conn_mutex); + spin_lock_init(&isert_conn->conn_lock); + INIT_LIST_HEAD(&isert_conn->conn_fr_pool); + + cma_id->context = isert_conn; + isert_conn->conn_cm_id = cma_id; + isert_conn->responder_resources = event->param.conn.responder_resources; + isert_conn->initiator_depth = event->param.conn.initiator_depth; + pr_debug("Using responder_resources: %u initiator_depth: %u\n", + isert_conn->responder_resources, isert_conn->initiator_depth); + + isert_conn->login_buf = kzalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + + ISER_RX_LOGIN_SIZE, GFP_KERNEL); + if (!isert_conn->login_buf) { + pr_err("Unable to allocate isert_conn->login_buf\n"); + ret = -ENOMEM; + goto out; + } + + isert_conn->login_req_buf = isert_conn->login_buf; + isert_conn->login_rsp_buf = isert_conn->login_buf + + ISCSI_DEF_MAX_RECV_SEG_LEN; + pr_debug("Set login_buf: %p login_req_buf: %p login_rsp_buf: %p\n", + isert_conn->login_buf, isert_conn->login_req_buf, + isert_conn->login_rsp_buf); + + isert_conn->login_req_dma = ib_dma_map_single(ib_dev, + (void *)isert_conn->login_req_buf, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE); + + ret = ib_dma_mapping_error(ib_dev, isert_conn->login_req_dma); + if (ret) { + pr_err("ib_dma_mapping_error failed for login_req_dma: %d\n", + ret); + isert_conn->login_req_dma = 0; + goto out_login_buf; + } + + isert_conn->login_rsp_dma = ib_dma_map_single(ib_dev, + (void *)isert_conn->login_rsp_buf, + ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); + + ret = ib_dma_mapping_error(ib_dev, isert_conn->login_rsp_dma); + if (ret) { + pr_err("ib_dma_mapping_error failed for login_rsp_dma: %d\n", + ret); + isert_conn->login_rsp_dma = 0; + goto out_req_dma_map; + } + + device = isert_device_find_by_ib_dev(cma_id); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto out_rsp_dma_map; + } + + isert_conn->conn_device = device; + isert_conn->conn_pd = ib_alloc_pd(isert_conn->conn_device->ib_device); + if (IS_ERR(isert_conn->conn_pd)) { + ret = PTR_ERR(isert_conn->conn_pd); + pr_err("ib_alloc_pd failed for conn %p: ret=%d\n", + isert_conn, ret); + goto out_pd; + } + + isert_conn->conn_mr = ib_get_dma_mr(isert_conn->conn_pd, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(isert_conn->conn_mr)) { + ret = PTR_ERR(isert_conn->conn_mr); + pr_err("ib_get_dma_mr failed for conn %p: ret=%d\n", + isert_conn, ret); + goto out_mr; + } + + pi_support = np->tpg_np->tpg->tpg_attrib.t10_pi; + if (pi_support && !device->pi_capable) { + pr_err("Protection information requested but not supported, " + "rejecting connect request\n"); + ret = rdma_reject(cma_id, NULL, 0); + goto out_mr; + } + + ret = isert_conn_setup_qp(isert_conn, cma_id, pi_support); + if (ret) + goto out_conn_dev; + + mutex_lock(&isert_np->np_accept_mutex); + list_add_tail(&isert_conn->conn_accept_node, &isert_np->np_accept_list); + mutex_unlock(&isert_np->np_accept_mutex); + + pr_debug("isert_connect_request() up np_sem np: %p\n", np); + up(&isert_np->np_sem); + return 0; + +out_conn_dev: + ib_dereg_mr(isert_conn->conn_mr); +out_mr: + ib_dealloc_pd(isert_conn->conn_pd); +out_pd: + isert_device_try_release(device); +out_rsp_dma_map: + ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, + ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); +out_req_dma_map: + ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE); +out_login_buf: + kfree(isert_conn->login_buf); +out: + kfree(isert_conn); + return ret; +} + +static void +isert_connect_release(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_device *device = isert_conn->conn_device; + int cq_index; + + pr_debug("Entering isert_connect_release(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); + + if (device && device->use_fastreg) + isert_conn_free_fastreg_pool(isert_conn); + + if (isert_conn->conn_qp) { + cq_index = ((struct isert_cq_desc *) + isert_conn->conn_qp->recv_cq->cq_context)->cq_index; + pr_debug("isert_connect_release: cq_index: %d\n", cq_index); + isert_conn->conn_device->cq_active_qps[cq_index]--; + + rdma_destroy_qp(isert_conn->conn_cm_id); + } + + isert_free_rx_descriptors(isert_conn); + rdma_destroy_id(isert_conn->conn_cm_id); + + ib_dereg_mr(isert_conn->conn_mr); + ib_dealloc_pd(isert_conn->conn_pd); + + if (isert_conn->login_buf) { + ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, + ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); + ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_FROM_DEVICE); + kfree(isert_conn->login_buf); + } + kfree(isert_conn); + + if (device) + isert_device_try_release(device); + + pr_debug("Leaving isert_connect_release >>>>>>>>>>>>\n"); +} + +static void +isert_connected_handler(struct rdma_cm_id *cma_id) +{ + return; +} + +static void +isert_release_conn_kref(struct kref *kref) +{ + struct isert_conn *isert_conn = container_of(kref, + struct isert_conn, conn_kref); + + pr_debug("Calling isert_connect_release for final kref %s/%d\n", + current->comm, current->pid); + + isert_connect_release(isert_conn); +} + +static void +isert_put_conn(struct isert_conn *isert_conn) +{ + kref_put(&isert_conn->conn_kref, isert_release_conn_kref); +} + +static void +isert_disconnect_work(struct work_struct *work) +{ + struct isert_conn *isert_conn = container_of(work, + struct isert_conn, conn_logout_work); + + pr_debug("isert_disconnect_work(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); + mutex_lock(&isert_conn->conn_mutex); + if (isert_conn->state == ISER_CONN_UP) + isert_conn->state = ISER_CONN_TERMINATING; + + if (isert_conn->post_recv_buf_count == 0 && + atomic_read(&isert_conn->post_send_buf_count) == 0) { + mutex_unlock(&isert_conn->conn_mutex); + goto wake_up; + } + if (!isert_conn->conn_cm_id) { + mutex_unlock(&isert_conn->conn_mutex); + isert_put_conn(isert_conn); + return; + } + + if (isert_conn->disconnect) { + /* Send DREQ/DREP towards our initiator */ + rdma_disconnect(isert_conn->conn_cm_id); + } + + mutex_unlock(&isert_conn->conn_mutex); + +wake_up: + complete(&isert_conn->conn_wait); + isert_put_conn(isert_conn); +} + +static void +isert_disconnected_handler(struct rdma_cm_id *cma_id, bool disconnect) +{ + struct isert_conn *isert_conn = (struct isert_conn *)cma_id->context; + + isert_conn->disconnect = disconnect; + INIT_WORK(&isert_conn->conn_logout_work, isert_disconnect_work); + schedule_work(&isert_conn->conn_logout_work); +} + +static int +isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + int ret = 0; + bool disconnect = false; + + pr_debug("isert_cma_handler: event %d status %d conn %p id %p\n", + event->event, event->status, cma_id->context, cma_id); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = isert_connect_request(cma_id, event); + break; + case RDMA_CM_EVENT_ESTABLISHED: + isert_connected_handler(cma_id); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: /* FALLTHRU */ + case RDMA_CM_EVENT_DISCONNECTED: /* FALLTHRU */ + case RDMA_CM_EVENT_DEVICE_REMOVAL: /* FALLTHRU */ + disconnect = true; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: /* FALLTHRU */ + isert_disconnected_handler(cma_id, disconnect); + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + default: + pr_err("Unhandled RDMA CMA event: %d\n", event->event); + break; + } + + if (ret != 0) { + pr_err("isert_cma_handler failed RDMA_CM_EVENT: 0x%08x %d\n", + event->event, ret); + dump_stack(); + } + + return ret; +} + +static int +isert_post_recv(struct isert_conn *isert_conn, u32 count) +{ + struct ib_recv_wr *rx_wr, *rx_wr_failed; + int i, ret; + unsigned int rx_head = isert_conn->conn_rx_desc_head; + struct iser_rx_desc *rx_desc; + + for (rx_wr = isert_conn->conn_rx_wr, i = 0; i < count; i++, rx_wr++) { + rx_desc = &isert_conn->conn_rx_descs[rx_head]; + rx_wr->wr_id = (unsigned long)rx_desc; + rx_wr->sg_list = &rx_desc->rx_sg; + rx_wr->num_sge = 1; + rx_wr->next = rx_wr + 1; + rx_head = (rx_head + 1) & (ISERT_QP_MAX_RECV_DTOS - 1); + } + + rx_wr--; + rx_wr->next = NULL; /* mark end of work requests list */ + + isert_conn->post_recv_buf_count += count; + ret = ib_post_recv(isert_conn->conn_qp, isert_conn->conn_rx_wr, + &rx_wr_failed); + if (ret) { + pr_err("ib_post_recv() failed with ret: %d\n", ret); + isert_conn->post_recv_buf_count -= count; + } else { + pr_debug("isert_post_recv(): Posted %d RX buffers\n", count); + isert_conn->conn_rx_desc_head = rx_head; + } + return ret; +} + +static int +isert_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_send_wr send_wr, *send_wr_failed; + int ret; + + ib_dma_sync_single_for_device(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + + send_wr.next = NULL; + send_wr.wr_id = (unsigned long)tx_desc; + send_wr.sg_list = tx_desc->tx_sg; + send_wr.num_sge = tx_desc->num_sge; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + atomic_inc(&isert_conn->post_send_buf_count); + + ret = ib_post_send(isert_conn->conn_qp, &send_wr, &send_wr_failed); + if (ret) { + pr_err("ib_post_send() failed, ret: %d\n", ret); + atomic_dec(&isert_conn->post_send_buf_count); + } + + return ret; +} + +static void +isert_create_send_desc(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, + struct iser_tx_desc *tx_desc) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + + ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + + memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); + tx_desc->iser_header.flags = ISER_VER; + + tx_desc->num_sge = 1; + tx_desc->isert_cmd = isert_cmd; + + if (tx_desc->tx_sg[0].lkey != isert_conn->conn_mr->lkey) { + tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey; + pr_debug("tx_desc %p lkey mismatch, fixing\n", tx_desc); + } +} + +static int +isert_init_tx_hdrs(struct isert_conn *isert_conn, + struct iser_tx_desc *tx_desc) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + u64 dma_addr; + + dma_addr = ib_dma_map_single(ib_dev, (void *)tx_desc, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + if (ib_dma_mapping_error(ib_dev, dma_addr)) { + pr_err("ib_dma_mapping_error() failed\n"); + return -ENOMEM; + } + + tx_desc->dma_addr = dma_addr; + tx_desc->tx_sg[0].addr = tx_desc->dma_addr; + tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; + tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey; + + pr_debug("isert_init_tx_hdrs: Setup tx_sg[0].addr: 0x%llx length: %u" + " lkey: 0x%08x\n", tx_desc->tx_sg[0].addr, + tx_desc->tx_sg[0].length, tx_desc->tx_sg[0].lkey); + + return 0; +} + +static void +isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct ib_send_wr *send_wr, bool coalesce) +{ + struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc; + + isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND; + send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; + send_wr->opcode = IB_WR_SEND; + send_wr->sg_list = &tx_desc->tx_sg[0]; + send_wr->num_sge = isert_cmd->tx_desc.num_sge; + /* + * Coalesce send completion interrupts by only setting IB_SEND_SIGNALED + * bit for every ISERT_COMP_BATCH_COUNT number of ib_post_send() calls. + */ + mutex_lock(&isert_conn->conn_mutex); + if (coalesce && isert_conn->state == ISER_CONN_UP && + ++isert_conn->conn_comp_batch < ISERT_COMP_BATCH_COUNT) { + tx_desc->llnode_active = true; + llist_add(&tx_desc->comp_llnode, &isert_conn->conn_comp_llist); + mutex_unlock(&isert_conn->conn_mutex); + return; + } + isert_conn->conn_comp_batch = 0; + tx_desc->comp_llnode_batch = llist_del_all(&isert_conn->conn_comp_llist); + mutex_unlock(&isert_conn->conn_mutex); + + send_wr->send_flags = IB_SEND_SIGNALED; +} + +static int +isert_rdma_post_recvl(struct isert_conn *isert_conn) +{ + struct ib_recv_wr rx_wr, *rx_wr_fail; + struct ib_sge sge; + int ret; + + memset(&sge, 0, sizeof(struct ib_sge)); + sge.addr = isert_conn->login_req_dma; + sge.length = ISER_RX_LOGIN_SIZE; + sge.lkey = isert_conn->conn_mr->lkey; + + pr_debug("Setup sge: addr: %llx length: %d 0x%08x\n", + sge.addr, sge.length, sge.lkey); + + memset(&rx_wr, 0, sizeof(struct ib_recv_wr)); + rx_wr.wr_id = (unsigned long)isert_conn->login_req_buf; + rx_wr.sg_list = &sge; + rx_wr.num_sge = 1; + + isert_conn->post_recv_buf_count++; + ret = ib_post_recv(isert_conn->conn_qp, &rx_wr, &rx_wr_fail); + if (ret) { + pr_err("ib_post_recv() failed: %d\n", ret); + isert_conn->post_recv_buf_count--; + } + + pr_debug("ib_post_recv(): returned success >>>>>>>>>>>>>>>>>>>>>>>>\n"); + return ret; +} + +static int +isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, + u32 length) +{ + struct isert_conn *isert_conn = conn->context; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iser_tx_desc *tx_desc = &isert_conn->conn_login_tx_desc; + int ret; + + isert_create_send_desc(isert_conn, NULL, tx_desc); + + memcpy(&tx_desc->iscsi_header, &login->rsp[0], + sizeof(struct iscsi_hdr)); + + isert_init_tx_hdrs(isert_conn, tx_desc); + + if (length > 0) { + struct ib_sge *tx_dsg = &tx_desc->tx_sg[1]; + + ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_rsp_dma, + length, DMA_TO_DEVICE); + + memcpy(isert_conn->login_rsp_buf, login->rsp_buf, length); + + ib_dma_sync_single_for_device(ib_dev, isert_conn->login_rsp_dma, + length, DMA_TO_DEVICE); + + tx_dsg->addr = isert_conn->login_rsp_dma; + tx_dsg->length = length; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + tx_desc->num_sge = 2; + } + if (!login->login_failed) { + if (login->login_complete) { + if (!conn->sess->sess_ops->SessionType && + isert_conn->conn_device->use_fastreg) { + /* Normal Session and fastreg is used */ + u8 pi_support = login->np->tpg_np->tpg->tpg_attrib.t10_pi; + + ret = isert_conn_create_fastreg_pool(isert_conn, + pi_support); + if (ret) { + pr_err("Conn: %p failed to create" + " fastreg pool\n", isert_conn); + return ret; + } + } + + ret = isert_alloc_rx_descriptors(isert_conn); + if (ret) + return ret; + + ret = isert_post_recv(isert_conn, ISERT_MIN_POSTED_RX); + if (ret) + return ret; + + isert_conn->state = ISER_CONN_UP; + goto post_send; + } + + ret = isert_rdma_post_recvl(isert_conn); + if (ret) + return ret; + } +post_send: + ret = isert_post_send(isert_conn, tx_desc); + if (ret) + return ret; + + return 0; +} + +static void +isert_rx_login_req(struct iser_rx_desc *rx_desc, int rx_buflen, + struct isert_conn *isert_conn) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_login *login = conn->conn_login; + int size; + + if (!login) { + pr_err("conn->conn_login is NULL\n"); + dump_stack(); + return; + } + + if (login->first_request) { + struct iscsi_login_req *login_req = + (struct iscsi_login_req *)&rx_desc->iscsi_header; + /* + * Setup the initial iscsi_login values from the leading + * login request PDU. + */ + login->leading_connection = (!login_req->tsih) ? 1 : 0; + login->current_stage = + (login_req->flags & ISCSI_FLAG_LOGIN_CURRENT_STAGE_MASK) + >> 2; + login->version_min = login_req->min_version; + login->version_max = login_req->max_version; + memcpy(login->isid, login_req->isid, 6); + login->cmd_sn = be32_to_cpu(login_req->cmdsn); + login->init_task_tag = login_req->itt; + login->initial_exp_statsn = be32_to_cpu(login_req->exp_statsn); + login->cid = be16_to_cpu(login_req->cid); + login->tsih = be16_to_cpu(login_req->tsih); + } + + memcpy(&login->req[0], (void *)&rx_desc->iscsi_header, ISCSI_HDR_LEN); + + size = min(rx_buflen, MAX_KEY_VALUE_PAIRS); + pr_debug("Using login payload size: %d, rx_buflen: %d MAX_KEY_VALUE_PAIRS: %d\n", + size, rx_buflen, MAX_KEY_VALUE_PAIRS); + memcpy(login->req_buf, &rx_desc->data[0], size); + + if (login->first_request) { + complete(&isert_conn->conn_login_comp); + return; + } + schedule_delayed_work(&conn->login_work, 0); +} + +static struct iscsi_cmd +*isert_allocate_cmd(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_cmd *isert_cmd; + struct iscsi_cmd *cmd; + + cmd = iscsit_allocate_cmd(conn, TASK_INTERRUPTIBLE); + if (!cmd) { + pr_err("Unable to allocate iscsi_cmd + isert_cmd\n"); + return NULL; + } + isert_cmd = iscsit_priv_cmd(cmd); + isert_cmd->conn = isert_conn; + isert_cmd->iscsi_cmd = cmd; + + return cmd; +} + +static int +isert_handle_scsi_cmd(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, struct iscsi_cmd *cmd, + struct iser_rx_desc *rx_desc, unsigned char *buf) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)buf; + struct scatterlist *sg; + int imm_data, imm_data_len, unsol_data, sg_nents, rc; + bool dump_payload = false; + + rc = iscsit_setup_scsi_cmd(conn, cmd, buf); + if (rc < 0) + return rc; + + imm_data = cmd->immediate_data; + imm_data_len = cmd->first_burst_len; + unsol_data = cmd->unsolicited_data; + + rc = iscsit_process_scsi_cmd(conn, cmd, hdr); + if (rc < 0) { + return 0; + } else if (rc > 0) { + dump_payload = true; + goto sequence_cmd; + } + + if (!imm_data) + return 0; + + sg = &cmd->se_cmd.t_data_sg[0]; + sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE)); + + pr_debug("Copying Immediate SG: %p sg_nents: %u from %p imm_data_len: %d\n", + sg, sg_nents, &rx_desc->data[0], imm_data_len); + + sg_copy_from_buffer(sg, sg_nents, &rx_desc->data[0], imm_data_len); + + cmd->write_data_done += imm_data_len; + + if (cmd->write_data_done == cmd->se_cmd.data_length) { + spin_lock_bh(&cmd->istate_lock); + cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; + cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; + spin_unlock_bh(&cmd->istate_lock); + } + +sequence_cmd: + rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn); + + if (!rc && dump_payload == false && unsol_data) + iscsit_set_unsoliticed_dataout(cmd); + else if (dump_payload && imm_data) + target_put_sess_cmd(conn->sess->se_sess, &cmd->se_cmd); + + return 0; +} + +static int +isert_handle_iscsi_dataout(struct isert_conn *isert_conn, + struct iser_rx_desc *rx_desc, unsigned char *buf) +{ + struct scatterlist *sg_start; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_cmd *cmd = NULL; + struct iscsi_data *hdr = (struct iscsi_data *)buf; + u32 unsol_data_len = ntoh24(hdr->dlength); + int rc, sg_nents, sg_off, page_off; + + rc = iscsit_check_dataout_hdr(conn, buf, &cmd); + if (rc < 0) + return rc; + else if (!cmd) + return 0; + /* + * FIXME: Unexpected unsolicited_data out + */ + if (!cmd->unsolicited_data) { + pr_err("Received unexpected solicited data payload\n"); + dump_stack(); + return -1; + } + + pr_debug("Unsolicited DataOut unsol_data_len: %u, write_data_done: %u, data_length: %u\n", + unsol_data_len, cmd->write_data_done, cmd->se_cmd.data_length); + + sg_off = cmd->write_data_done / PAGE_SIZE; + sg_start = &cmd->se_cmd.t_data_sg[sg_off]; + sg_nents = max(1UL, DIV_ROUND_UP(unsol_data_len, PAGE_SIZE)); + page_off = cmd->write_data_done % PAGE_SIZE; + /* + * FIXME: Non page-aligned unsolicited_data out + */ + if (page_off) { + pr_err("Received unexpected non-page aligned data payload\n"); + dump_stack(); + return -1; + } + pr_debug("Copying DataOut: sg_start: %p, sg_off: %u sg_nents: %u from %p %u\n", + sg_start, sg_off, sg_nents, &rx_desc->data[0], unsol_data_len); + + sg_copy_from_buffer(sg_start, sg_nents, &rx_desc->data[0], + unsol_data_len); + + rc = iscsit_check_dataout_payload(cmd, hdr, false); + if (rc < 0) + return rc; + + return 0; +} + +static int +isert_handle_nop_out(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, + unsigned char *buf) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_nopout *hdr = (struct iscsi_nopout *)buf; + int rc; + + rc = iscsit_setup_nop_out(conn, cmd, hdr); + if (rc < 0) + return rc; + /* + * FIXME: Add support for NOPOUT payload using unsolicited RDMA payload + */ + + return iscsit_process_nop_out(conn, cmd, hdr); +} + +static int +isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, + struct iscsi_text *hdr) +{ + struct iscsi_conn *conn = isert_conn->conn; + u32 payload_length = ntoh24(hdr->dlength); + int rc; + unsigned char *text_in; + + rc = iscsit_setup_text_cmd(conn, cmd, hdr); + if (rc < 0) + return rc; + + text_in = kzalloc(payload_length, GFP_KERNEL); + if (!text_in) { + pr_err("Unable to allocate text_in of payload_length: %u\n", + payload_length); + return -ENOMEM; + } + cmd->text_in_ptr = text_in; + + memcpy(cmd->text_in_ptr, &rx_desc->data[0], payload_length); + + return iscsit_process_text_cmd(conn, cmd, hdr); +} + +static int +isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, + uint32_t read_stag, uint64_t read_va, + uint32_t write_stag, uint64_t write_va) +{ + struct iscsi_hdr *hdr = &rx_desc->iscsi_header; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_session *sess = conn->sess; + struct iscsi_cmd *cmd; + struct isert_cmd *isert_cmd; + int ret = -EINVAL; + u8 opcode = (hdr->opcode & ISCSI_OPCODE_MASK); + + if (sess->sess_ops->SessionType && + (!(opcode & ISCSI_OP_TEXT) || !(opcode & ISCSI_OP_LOGOUT))) { + pr_err("Got illegal opcode: 0x%02x in SessionType=Discovery," + " ignoring\n", opcode); + return 0; + } + + switch (opcode) { + case ISCSI_OP_SCSI_CMD: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + isert_cmd->read_stag = read_stag; + isert_cmd->read_va = read_va; + isert_cmd->write_stag = write_stag; + isert_cmd->write_va = write_va; + + ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd, + rx_desc, (unsigned char *)hdr); + break; + case ISCSI_OP_NOOP_OUT: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + ret = isert_handle_nop_out(isert_conn, isert_cmd, cmd, + rx_desc, (unsigned char *)hdr); + break; + case ISCSI_OP_SCSI_DATA_OUT: + ret = isert_handle_iscsi_dataout(isert_conn, rx_desc, + (unsigned char *)hdr); + break; + case ISCSI_OP_SCSI_TMFUNC: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + ret = iscsit_handle_task_mgt_cmd(conn, cmd, + (unsigned char *)hdr); + break; + case ISCSI_OP_LOGOUT: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + ret = iscsit_handle_logout_cmd(conn, cmd, (unsigned char *)hdr); + if (ret > 0) + wait_for_completion_timeout(&conn->conn_logout_comp, + SECONDS_FOR_LOGOUT_COMP * + HZ); + break; + case ISCSI_OP_TEXT: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + ret = isert_handle_text_cmd(isert_conn, isert_cmd, cmd, + rx_desc, (struct iscsi_text *)hdr); + break; + default: + pr_err("Got unknown iSCSI OpCode: 0x%02x\n", opcode); + dump_stack(); + break; + } + + return ret; +} + +static void +isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn) +{ + struct iser_hdr *iser_hdr = &rx_desc->iser_header; + uint64_t read_va = 0, write_va = 0; + uint32_t read_stag = 0, write_stag = 0; + int rc; + + switch (iser_hdr->flags & 0xF0) { + case ISCSI_CTRL: + if (iser_hdr->flags & ISER_RSV) { + read_stag = be32_to_cpu(iser_hdr->read_stag); + read_va = be64_to_cpu(iser_hdr->read_va); + pr_debug("ISER_RSV: read_stag: 0x%08x read_va: 0x%16llx\n", + read_stag, (unsigned long long)read_va); + } + if (iser_hdr->flags & ISER_WSV) { + write_stag = be32_to_cpu(iser_hdr->write_stag); + write_va = be64_to_cpu(iser_hdr->write_va); + pr_debug("ISER_WSV: write__stag: 0x%08x write_va: 0x%16llx\n", + write_stag, (unsigned long long)write_va); + } + + pr_debug("ISER ISCSI_CTRL PDU\n"); + break; + case ISER_HELLO: + pr_err("iSER Hello message\n"); + break; + default: + pr_warn("Unknown iSER hdr flags: 0x%02x\n", iser_hdr->flags); + break; + } + + rc = isert_rx_opcode(isert_conn, rx_desc, + read_stag, read_va, write_stag, write_va); +} + +static void +isert_rx_completion(struct iser_rx_desc *desc, struct isert_conn *isert_conn, + unsigned long xfer_len) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iscsi_hdr *hdr; + u64 rx_dma; + int rx_buflen, outstanding; + + if ((char *)desc == isert_conn->login_req_buf) { + rx_dma = isert_conn->login_req_dma; + rx_buflen = ISER_RX_LOGIN_SIZE; + pr_debug("ISER login_buf: Using rx_dma: 0x%llx, rx_buflen: %d\n", + rx_dma, rx_buflen); + } else { + rx_dma = desc->dma_addr; + rx_buflen = ISER_RX_PAYLOAD_SIZE; + pr_debug("ISER req_buf: Using rx_dma: 0x%llx, rx_buflen: %d\n", + rx_dma, rx_buflen); + } + + ib_dma_sync_single_for_cpu(ib_dev, rx_dma, rx_buflen, DMA_FROM_DEVICE); + + hdr = &desc->iscsi_header; + pr_debug("iSCSI opcode: 0x%02x, ITT: 0x%08x, flags: 0x%02x dlen: %d\n", + hdr->opcode, hdr->itt, hdr->flags, + (int)(xfer_len - ISER_HEADERS_LEN)); + + if ((char *)desc == isert_conn->login_req_buf) + isert_rx_login_req(desc, xfer_len - ISER_HEADERS_LEN, + isert_conn); + else + isert_rx_do_work(desc, isert_conn); + + ib_dma_sync_single_for_device(ib_dev, rx_dma, rx_buflen, + DMA_FROM_DEVICE); + + isert_conn->post_recv_buf_count--; + pr_debug("iSERT: Decremented post_recv_buf_count: %d\n", + isert_conn->post_recv_buf_count); + + if ((char *)desc == isert_conn->login_req_buf) + return; + + outstanding = isert_conn->post_recv_buf_count; + if (outstanding + ISERT_MIN_POSTED_RX <= ISERT_QP_MAX_RECV_DTOS) { + int err, count = min(ISERT_QP_MAX_RECV_DTOS - outstanding, + ISERT_MIN_POSTED_RX); + err = isert_post_recv(isert_conn, count); + if (err) { + pr_err("isert_post_recv() count: %d failed, %d\n", + count, err); + } + } +} + +static int +isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct scatterlist *sg, u32 nents, u32 length, u32 offset, + enum iser_ib_op_code op, struct isert_data_buf *data) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + + data->dma_dir = op == ISER_IB_RDMA_WRITE ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + + data->len = length - offset; + data->offset = offset; + data->sg_off = data->offset / PAGE_SIZE; + + data->sg = &sg[data->sg_off]; + data->nents = min_t(unsigned int, nents - data->sg_off, + ISCSI_ISER_SG_TABLESIZE); + data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE * + PAGE_SIZE); + + data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents, + data->dma_dir); + if (unlikely(!data->dma_nents)) { + pr_err("Cmd: unable to dma map SGs %p\n", sg); + return -EINVAL; + } + + pr_debug("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n", + isert_cmd, data->dma_nents, data->sg, data->nents, data->len); + + return 0; +} + +static void +isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + + ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir); + memset(data, 0, sizeof(*data)); +} + + + +static void +isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + + pr_debug("isert_unmap_cmd: %p\n", isert_cmd); + + if (wr->data.sg) { + pr_debug("isert_unmap_cmd: %p unmap_sg op\n", isert_cmd); + isert_unmap_data_buf(isert_conn, &wr->data); + } + + if (wr->send_wr) { + pr_debug("isert_unmap_cmd: %p free send_wr\n", isert_cmd); + kfree(wr->send_wr); + wr->send_wr = NULL; + } + + if (wr->ib_sge) { + pr_debug("isert_unmap_cmd: %p free ib_sge\n", isert_cmd); + kfree(wr->ib_sge); + wr->ib_sge = NULL; + } +} + +static void +isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + LIST_HEAD(unmap_list); + + pr_debug("unreg_fastreg_cmd: %p\n", isert_cmd); + + if (wr->fr_desc) { + pr_debug("unreg_fastreg_cmd: %p free fr_desc %p\n", + isert_cmd, wr->fr_desc); + if (wr->fr_desc->ind & ISERT_PROTECTED) { + isert_unmap_data_buf(isert_conn, &wr->prot); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } + spin_lock_bh(&isert_conn->conn_lock); + list_add_tail(&wr->fr_desc->list, &isert_conn->conn_fr_pool); + spin_unlock_bh(&isert_conn->conn_lock); + wr->fr_desc = NULL; + } + + if (wr->data.sg) { + pr_debug("unreg_fastreg_cmd: %p unmap_sg op\n", isert_cmd); + isert_unmap_data_buf(isert_conn, &wr->data); + } + + wr->ib_sge = NULL; + wr->send_wr = NULL; +} + +static void +isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) +{ + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct iscsi_conn *conn = isert_conn->conn; + struct isert_device *device = isert_conn->conn_device; + + pr_debug("Entering isert_put_cmd: %p\n", isert_cmd); + + switch (cmd->iscsi_opcode) { + case ISCSI_OP_SCSI_CMD: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + if (cmd->data_direction == DMA_TO_DEVICE) { + iscsit_stop_dataout_timer(cmd); + /* + * Check for special case during comp_err where + * WRITE_PENDING has been handed off from core, + * but requires an extra target_put_sess_cmd() + * before transport_generic_free_cmd() below. + */ + if (comp_err && + cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) { + struct se_cmd *se_cmd = &cmd->se_cmd; + + target_put_sess_cmd(se_cmd->se_sess, se_cmd); + } + } + + device->unreg_rdma_mem(isert_cmd, isert_conn); + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + case ISCSI_OP_SCSI_TMFUNC: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + case ISCSI_OP_REJECT: + case ISCSI_OP_NOOP_OUT: + case ISCSI_OP_TEXT: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + /* + * Handle special case for REJECT when iscsi_add_reject*() has + * overwritten the original iscsi_opcode assignment, and the + * associated cmd->se_cmd needs to be released. + */ + if (cmd->se_cmd.se_tfo != NULL) { + pr_debug("Calling transport_generic_free_cmd from" + " isert_put_cmd for 0x%02x\n", + cmd->iscsi_opcode); + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + } + /* + * Fall-through + */ + default: + iscsit_release_cmd(cmd); + break; + } +} + +static void +isert_unmap_tx_desc(struct iser_tx_desc *tx_desc, struct ib_device *ib_dev) +{ + if (tx_desc->dma_addr != 0) { + pr_debug("Calling ib_dma_unmap_single for tx_desc->dma_addr\n"); + ib_dma_unmap_single(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + tx_desc->dma_addr = 0; + } +} + +static void +isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd, + struct ib_device *ib_dev, bool comp_err) +{ + if (isert_cmd->pdu_buf_dma != 0) { + pr_debug("Calling ib_dma_unmap_single for isert_cmd->pdu_buf_dma\n"); + ib_dma_unmap_single(ib_dev, isert_cmd->pdu_buf_dma, + isert_cmd->pdu_buf_len, DMA_TO_DEVICE); + isert_cmd->pdu_buf_dma = 0; + } + + isert_unmap_tx_desc(tx_desc, ib_dev); + isert_put_cmd(isert_cmd, comp_err); +} + +static int +isert_check_pi_status(struct se_cmd *se_cmd, struct ib_mr *sig_mr) +{ + struct ib_mr_status mr_status; + int ret; + + ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + goto fail_mr_status; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + u64 sec_offset_err; + u32 block_size = se_cmd->se_dev->dev_attrib.block_size + 8; + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + break; + case IB_SIG_BAD_REFTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + break; + case IB_SIG_BAD_APPTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + break; + } + sec_offset_err = mr_status.sig_err.sig_err_offset; + do_div(sec_offset_err, block_size); + se_cmd->bad_sector = sec_offset_err + se_cmd->t_task_lba; + + pr_err("isert: PI error found type %d at sector 0x%llx " + "expected 0x%x vs actual 0x%x\n", + mr_status.sig_err.err_type, + (unsigned long long)se_cmd->bad_sector, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + ret = 1; + } + +fail_mr_status: + return ret; +} + +static void +isert_completion_rdma_write(struct iser_tx_desc *tx_desc, + struct isert_cmd *isert_cmd) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct isert_device *device = isert_conn->conn_device; + int ret = 0; + + if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) { + ret = isert_check_pi_status(se_cmd, + wr->fr_desc->pi_ctx->sig_mr); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } + + device->unreg_rdma_mem(isert_cmd, isert_conn); + wr->send_wr_num = 0; + if (ret) + transport_send_check_condition_and_sense(se_cmd, + se_cmd->pi_err, 0); + else + isert_put_response(isert_conn->conn, cmd); +} + +static void +isert_completion_rdma_read(struct iser_tx_desc *tx_desc, + struct isert_cmd *isert_cmd) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct isert_device *device = isert_conn->conn_device; + int ret = 0; + + if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) { + ret = isert_check_pi_status(se_cmd, + wr->fr_desc->pi_ctx->sig_mr); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } + + iscsit_stop_dataout_timer(cmd); + device->unreg_rdma_mem(isert_cmd, isert_conn); + cmd->write_data_done = wr->data.len; + wr->send_wr_num = 0; + + pr_debug("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd); + spin_lock_bh(&cmd->istate_lock); + cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; + cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; + spin_unlock_bh(&cmd->istate_lock); + + if (ret) + transport_send_check_condition_and_sense(se_cmd, + se_cmd->pi_err, 0); + else + target_execute_cmd(se_cmd); +} + +static void +isert_do_control_comp(struct work_struct *work) +{ + struct isert_cmd *isert_cmd = container_of(work, + struct isert_cmd, comp_work); + struct isert_conn *isert_conn = isert_cmd->conn; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + + switch (cmd->i_state) { + case ISTATE_SEND_TASKMGTRSP: + pr_debug("Calling iscsit_tmr_post_handler >>>>>>>>>>>>>>>>>\n"); + + atomic_dec(&isert_conn->post_send_buf_count); + iscsit_tmr_post_handler(cmd, cmd->conn); + + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); + break; + case ISTATE_SEND_REJECT: + pr_debug("Got isert_do_control_comp ISTATE_SEND_REJECT: >>>\n"); + atomic_dec(&isert_conn->post_send_buf_count); + + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); + break; + case ISTATE_SEND_LOGOUTRSP: + pr_debug("Calling iscsit_logout_post_handler >>>>>>>>>>>>>>\n"); + + atomic_dec(&isert_conn->post_send_buf_count); + iscsit_logout_post_handler(cmd, cmd->conn); + break; + case ISTATE_SEND_TEXTRSP: + atomic_dec(&isert_conn->post_send_buf_count); + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); + break; + default: + pr_err("Unknown do_control_comp i_state %d\n", cmd->i_state); + dump_stack(); + break; + } +} + +static void +isert_response_completion(struct iser_tx_desc *tx_desc, + struct isert_cmd *isert_cmd, + struct isert_conn *isert_conn, + struct ib_device *ib_dev) +{ + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + + if (cmd->i_state == ISTATE_SEND_TASKMGTRSP || + cmd->i_state == ISTATE_SEND_LOGOUTRSP || + cmd->i_state == ISTATE_SEND_REJECT || + cmd->i_state == ISTATE_SEND_TEXTRSP) { + isert_unmap_tx_desc(tx_desc, ib_dev); + + INIT_WORK(&isert_cmd->comp_work, isert_do_control_comp); + queue_work(isert_comp_wq, &isert_cmd->comp_work); + return; + } + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + else + atomic_dec(&isert_conn->post_send_buf_count); + + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(tx_desc, isert_cmd, ib_dev, false); +} + +static void +__isert_send_completion(struct iser_tx_desc *tx_desc, + struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_cmd *isert_cmd = tx_desc->isert_cmd; + struct isert_rdma_wr *wr; + + if (!isert_cmd) { + atomic_dec(&isert_conn->post_send_buf_count); + isert_unmap_tx_desc(tx_desc, ib_dev); + return; + } + wr = &isert_cmd->rdma_wr; + + switch (wr->iser_ib_op) { + case ISER_IB_RECV: + pr_err("isert_send_completion: Got ISER_IB_RECV\n"); + dump_stack(); + break; + case ISER_IB_SEND: + pr_debug("isert_send_completion: Got ISER_IB_SEND\n"); + isert_response_completion(tx_desc, isert_cmd, + isert_conn, ib_dev); + break; + case ISER_IB_RDMA_WRITE: + pr_debug("isert_send_completion: Got ISER_IB_RDMA_WRITE\n"); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + isert_completion_rdma_write(tx_desc, isert_cmd); + break; + case ISER_IB_RDMA_READ: + pr_debug("isert_send_completion: Got ISER_IB_RDMA_READ:\n"); + + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + isert_completion_rdma_read(tx_desc, isert_cmd); + break; + default: + pr_err("Unknown wr->iser_ib_op: 0x%02x\n", wr->iser_ib_op); + dump_stack(); + break; + } +} + +static void +isert_send_completion(struct iser_tx_desc *tx_desc, + struct isert_conn *isert_conn) +{ + struct llist_node *llnode = tx_desc->comp_llnode_batch; + struct iser_tx_desc *t; + /* + * Drain coalesced completion llist starting from comp_llnode_batch + * setup in isert_init_send_wr(), and then complete trailing tx_desc. + */ + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + __isert_send_completion(t, isert_conn); + } + __isert_send_completion(tx_desc, isert_conn); +} + +static void +isert_cq_drain_comp_llist(struct isert_conn *isert_conn, struct ib_device *ib_dev) +{ + struct llist_node *llnode; + struct isert_rdma_wr *wr; + struct iser_tx_desc *t; + + mutex_lock(&isert_conn->conn_mutex); + llnode = llist_del_all(&isert_conn->conn_comp_llist); + isert_conn->conn_comp_batch = 0; + mutex_unlock(&isert_conn->conn_mutex); + + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + wr = &t->isert_cmd->rdma_wr; + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, + &isert_conn->post_send_buf_count); + else + atomic_dec(&isert_conn->post_send_buf_count); + + isert_completion_put(t, t->isert_cmd, ib_dev, true); + } +} + +static void +isert_cq_tx_comp_err(struct iser_tx_desc *tx_desc, struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_cmd *isert_cmd = tx_desc->isert_cmd; + struct llist_node *llnode = tx_desc->comp_llnode_batch; + struct isert_rdma_wr *wr; + struct iser_tx_desc *t; + + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + wr = &t->isert_cmd->rdma_wr; + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, + &isert_conn->post_send_buf_count); + else + atomic_dec(&isert_conn->post_send_buf_count); + + isert_completion_put(t, t->isert_cmd, ib_dev, true); + } + tx_desc->comp_llnode_batch = NULL; + + if (!isert_cmd) + isert_unmap_tx_desc(tx_desc, ib_dev); + else + isert_completion_put(tx_desc, isert_cmd, ib_dev, true); +} + +static void +isert_cq_rx_comp_err(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iscsi_conn *conn = isert_conn->conn; + + if (isert_conn->post_recv_buf_count) + return; + + isert_cq_drain_comp_llist(isert_conn, ib_dev); + + if (conn->sess) { + target_sess_cmd_list_set_waiting(conn->sess->se_sess); + target_wait_for_sess_cmds(conn->sess->se_sess); + } + + while (atomic_read(&isert_conn->post_send_buf_count)) + msleep(3000); + + mutex_lock(&isert_conn->conn_mutex); + isert_conn->state = ISER_CONN_DOWN; + mutex_unlock(&isert_conn->conn_mutex); + + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + + complete(&isert_conn->conn_wait_comp_err); +} + +static void +isert_cq_tx_work(struct work_struct *work) +{ + struct isert_cq_desc *cq_desc = container_of(work, + struct isert_cq_desc, cq_tx_work); + struct isert_device *device = cq_desc->device; + int cq_index = cq_desc->cq_index; + struct ib_cq *tx_cq = device->dev_tx_cq[cq_index]; + struct isert_conn *isert_conn; + struct iser_tx_desc *tx_desc; + struct ib_wc wc; + + while (ib_poll_cq(tx_cq, 1, &wc) == 1) { + tx_desc = (struct iser_tx_desc *)(unsigned long)wc.wr_id; + isert_conn = wc.qp->qp_context; + + if (wc.status == IB_WC_SUCCESS) { + isert_send_completion(tx_desc, isert_conn); + } else { + pr_debug("TX wc.status != IB_WC_SUCCESS >>>>>>>>>>>>>>\n"); + pr_debug("TX wc.status: 0x%08x\n", wc.status); + pr_debug("TX wc.vendor_err: 0x%08x\n", wc.vendor_err); + + if (wc.wr_id != ISER_FASTREG_LI_WRID) { + if (tx_desc->llnode_active) + continue; + + atomic_dec(&isert_conn->post_send_buf_count); + isert_cq_tx_comp_err(tx_desc, isert_conn); + } + } + } + + ib_req_notify_cq(tx_cq, IB_CQ_NEXT_COMP); +} + +static void +isert_cq_tx_callback(struct ib_cq *cq, void *context) +{ + struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; + + queue_work(isert_comp_wq, &cq_desc->cq_tx_work); +} + +static void +isert_cq_rx_work(struct work_struct *work) +{ + struct isert_cq_desc *cq_desc = container_of(work, + struct isert_cq_desc, cq_rx_work); + struct isert_device *device = cq_desc->device; + int cq_index = cq_desc->cq_index; + struct ib_cq *rx_cq = device->dev_rx_cq[cq_index]; + struct isert_conn *isert_conn; + struct iser_rx_desc *rx_desc; + struct ib_wc wc; + unsigned long xfer_len; + + while (ib_poll_cq(rx_cq, 1, &wc) == 1) { + rx_desc = (struct iser_rx_desc *)(unsigned long)wc.wr_id; + isert_conn = wc.qp->qp_context; + + if (wc.status == IB_WC_SUCCESS) { + xfer_len = (unsigned long)wc.byte_len; + isert_rx_completion(rx_desc, isert_conn, xfer_len); + } else { + pr_debug("RX wc.status != IB_WC_SUCCESS >>>>>>>>>>>>>>\n"); + if (wc.status != IB_WC_WR_FLUSH_ERR) { + pr_debug("RX wc.status: 0x%08x\n", wc.status); + pr_debug("RX wc.vendor_err: 0x%08x\n", + wc.vendor_err); + } + isert_conn->post_recv_buf_count--; + isert_cq_rx_comp_err(isert_conn); + } + } + + ib_req_notify_cq(rx_cq, IB_CQ_NEXT_COMP); +} + +static void +isert_cq_rx_callback(struct ib_cq *cq, void *context) +{ + struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; + + queue_work(isert_rx_wq, &cq_desc->cq_rx_work); +} + +static int +isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd) +{ + struct ib_send_wr *wr_failed; + int ret; + + atomic_inc(&isert_conn->post_send_buf_count); + + ret = ib_post_send(isert_conn->conn_qp, &isert_cmd->tx_desc.send_wr, + &wr_failed); + if (ret) { + pr_err("ib_post_send failed with %d\n", ret); + atomic_dec(&isert_conn->post_send_buf_count); + return ret; + } + return ret; +} + +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *) + &isert_cmd->tx_desc.iscsi_header; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_rsp_pdu(cmd, conn, true, hdr); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + /* + * Attach SENSE DATA payload to iSCSI Response PDU + */ + if (cmd->se_cmd.sense_buffer && + ((cmd->se_cmd.se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || + (cmd->se_cmd.se_cmd_flags & SCF_EMULATED_TASK_SENSE))) { + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + u32 padding, pdu_len; + + put_unaligned_be16(cmd->se_cmd.scsi_sense_length, + cmd->sense_buffer); + cmd->se_cmd.scsi_sense_length += sizeof(__be16); + + padding = -(cmd->se_cmd.scsi_sense_length) & 3; + hton24(hdr->dlength, (u32)cmd->se_cmd.scsi_sense_length); + pdu_len = cmd->se_cmd.scsi_sense_length + padding; + + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + (void *)cmd->sense_buffer, pdu_len, + DMA_TO_DEVICE); + + isert_cmd->pdu_buf_len = pdu_len; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = pdu_len; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + isert_cmd->tx_desc.num_sge = 2; + } + + isert_init_send_wr(isert_conn, isert_cmd, send_wr, true); + + pr_debug("Posting SCSI Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static void +isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + if (cmd->data_direction == DMA_TO_DEVICE) + iscsit_stop_dataout_timer(cmd); + + device->unreg_rdma_mem(isert_cmd, isert_conn); +} + +static enum target_prot_op +isert_get_sup_prot_ops(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + + if (device->pi_capable) + return TARGET_PROT_ALL; + + return TARGET_PROT_NORMAL; +} + +static int +isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn, + bool nopout_response) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_nopin_rsp(cmd, conn, (struct iscsi_nopin *) + &isert_cmd->tx_desc.iscsi_header, + nopout_response); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting NOPIN Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_logout_rsp(cmd, conn, (struct iscsi_logout_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Logout Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_task_mgt_rsp(cmd, conn, (struct iscsi_tm_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Task Management Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + struct iscsi_reject *hdr = + (struct iscsi_reject *)&isert_cmd->tx_desc.iscsi_header; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_reject(cmd, conn, hdr); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + + hton24(hdr->dlength, ISCSI_HDR_LEN); + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + (void *)cmd->buf_ptr, ISCSI_HDR_LEN, + DMA_TO_DEVICE); + isert_cmd->pdu_buf_len = ISCSI_HDR_LEN; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = ISCSI_HDR_LEN; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + isert_cmd->tx_desc.num_sge = 2; + + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Reject IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct iscsi_text_rsp *hdr = + (struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header; + u32 txt_rsp_len; + int rc; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + rc = iscsit_build_text_rsp(cmd, conn, hdr, ISCSI_INFINIBAND); + if (rc < 0) + return rc; + + txt_rsp_len = rc; + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + + if (txt_rsp_len) { + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + void *txt_rsp_buf = cmd->buf_ptr; + + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + txt_rsp_buf, txt_rsp_len, DMA_TO_DEVICE); + + isert_cmd->pdu_buf_len = txt_rsp_len; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = txt_rsp_len; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + isert_cmd->tx_desc.num_sge = 2; + } + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Text Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct ib_sge *ib_sge, struct ib_send_wr *send_wr, + u32 data_left, u32 offset) +{ + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct scatterlist *sg_start, *tmp_sg; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + u32 sg_off, page_off; + int i = 0, sg_nents; + + sg_off = offset / PAGE_SIZE; + sg_start = &cmd->se_cmd.t_data_sg[sg_off]; + sg_nents = min(cmd->se_cmd.t_data_nents - sg_off, isert_conn->max_sge); + page_off = offset % PAGE_SIZE; + + send_wr->sg_list = ib_sge; + send_wr->num_sge = sg_nents; + send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; + /* + * Perform mapping of TCM scatterlist memory ib_sge dma_addr. + */ + for_each_sg(sg_start, tmp_sg, sg_nents, i) { + pr_debug("ISER RDMA from SGL dma_addr: 0x%16llx dma_len: %u, page_off: %u\n", + (unsigned long long)tmp_sg->dma_address, + tmp_sg->length, page_off); + + ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off; + ib_sge->length = min_t(u32, data_left, + ib_sg_dma_len(ib_dev, tmp_sg) - page_off); + ib_sge->lkey = isert_conn->conn_mr->lkey; + + pr_debug("RDMA ib_sge: addr: 0x%16llx length: %u lkey: %08x\n", + ib_sge->addr, ib_sge->length, ib_sge->lkey); + page_off = 0; + data_left -= ib_sge->length; + ib_sge++; + pr_debug("Incrementing ib_sge pointer to %p\n", ib_sge); + } + + pr_debug("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n", + send_wr->sg_list, send_wr->num_sge); + + return sg_nents; +} + +static int +isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_data_buf *data = &wr->data; + struct ib_send_wr *send_wr; + struct ib_sge *ib_sge; + u32 offset, data_len, data_left, rdma_write_max, va_offset = 0; + int ret = 0, i, ib_sge_cnt; + + isert_cmd->tx_desc.isert_cmd = isert_cmd; + + offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0; + ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->data_length, + offset, wr->iser_ib_op, &wr->data); + if (ret) + return ret; + + data_left = data->len; + offset = data->offset; + + ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL); + if (!ib_sge) { + pr_warn("Unable to allocate ib_sge\n"); + ret = -ENOMEM; + goto unmap_cmd; + } + wr->ib_sge = ib_sge; + + wr->send_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge); + wr->send_wr = kzalloc(sizeof(struct ib_send_wr) * wr->send_wr_num, + GFP_KERNEL); + if (!wr->send_wr) { + pr_debug("Unable to allocate wr->send_wr\n"); + ret = -ENOMEM; + goto unmap_cmd; + } + + wr->isert_cmd = isert_cmd; + rdma_write_max = isert_conn->max_sge * PAGE_SIZE; + + for (i = 0; i < wr->send_wr_num; i++) { + send_wr = &isert_cmd->rdma_wr.send_wr[i]; + data_len = min(data_left, rdma_write_max); + + send_wr->send_flags = 0; + if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { + send_wr->opcode = IB_WR_RDMA_WRITE; + send_wr->wr.rdma.remote_addr = isert_cmd->read_va + offset; + send_wr->wr.rdma.rkey = isert_cmd->read_stag; + if (i + 1 == wr->send_wr_num) + send_wr->next = &isert_cmd->tx_desc.send_wr; + else + send_wr->next = &wr->send_wr[i + 1]; + } else { + send_wr->opcode = IB_WR_RDMA_READ; + send_wr->wr.rdma.remote_addr = isert_cmd->write_va + va_offset; + send_wr->wr.rdma.rkey = isert_cmd->write_stag; + if (i + 1 == wr->send_wr_num) + send_wr->send_flags = IB_SEND_SIGNALED; + else + send_wr->next = &wr->send_wr[i + 1]; + } + + ib_sge_cnt = isert_build_rdma_wr(isert_conn, isert_cmd, ib_sge, + send_wr, data_len, offset); + ib_sge += ib_sge_cnt; + + offset += data_len; + va_offset += data_len; + data_left -= data_len; + } + + return 0; +unmap_cmd: + isert_unmap_data_buf(isert_conn, data); + + return ret; +} + +static int +isert_map_fr_pagelist(struct ib_device *ib_dev, + struct scatterlist *sg_start, int sg_nents, u64 *fr_pl) +{ + u64 start_addr, end_addr, page, chunk_start = 0; + struct scatterlist *tmp_sg; + int i = 0, new_chunk, last_ent, n_pages; + + n_pages = 0; + new_chunk = 1; + last_ent = sg_nents - 1; + for_each_sg(sg_start, tmp_sg, sg_nents, i) { + start_addr = ib_sg_dma_address(ib_dev, tmp_sg); + if (new_chunk) + chunk_start = start_addr; + end_addr = start_addr + ib_sg_dma_len(ib_dev, tmp_sg); + + pr_debug("SGL[%d] dma_addr: 0x%16llx len: %u\n", + i, (unsigned long long)tmp_sg->dma_address, + tmp_sg->length); + + if ((end_addr & ~PAGE_MASK) && i < last_ent) { + new_chunk = 0; + continue; + } + new_chunk = 1; + + page = chunk_start & PAGE_MASK; + do { + fr_pl[n_pages++] = page; + pr_debug("Mapped page_list[%d] page_addr: 0x%16llx\n", + n_pages - 1, page); + page += PAGE_SIZE; + } while (page < end_addr); + } + + return n_pages; +} + +static int +isert_fast_reg_mr(struct isert_conn *isert_conn, + struct fast_reg_descriptor *fr_desc, + struct isert_data_buf *mem, + enum isert_indicator ind, + struct ib_sge *sge) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + struct ib_send_wr fr_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + int ret, pagelist_len; + u32 page_off; + u8 key; + + if (mem->dma_nents == 1) { + sge->lkey = isert_conn->conn_mr->lkey; + sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]); + sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]); + pr_debug("%s:%d sge: addr: 0x%llx length: %u lkey: %x\n", + __func__, __LINE__, sge->addr, sge->length, + sge->lkey); + return 0; + } + + if (ind == ISERT_DATA_KEY_VALID) { + /* Registering data buffer */ + mr = fr_desc->data_mr; + frpl = fr_desc->data_frpl; + } else { + /* Registering protection buffer */ + mr = fr_desc->pi_ctx->prot_mr; + frpl = fr_desc->pi_ctx->prot_frpl; + } + + page_off = mem->offset % PAGE_SIZE; + + pr_debug("Use fr_desc %p sg_nents %d offset %u\n", + fr_desc, mem->nents, mem->offset); + + pagelist_len = isert_map_fr_pagelist(ib_dev, mem->sg, mem->nents, + &frpl->page_list[0]); + + if (!(fr_desc->ind & ISERT_DATA_KEY_VALID)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.ex.invalidate_rkey = mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + } + + /* Prepare FASTREG WR */ + memset(&fr_wr, 0, sizeof(fr_wr)); + fr_wr.wr_id = ISER_FASTREG_LI_WRID; + fr_wr.opcode = IB_WR_FAST_REG_MR; + fr_wr.wr.fast_reg.iova_start = frpl->page_list[0] + page_off; + fr_wr.wr.fast_reg.page_list = frpl; + fr_wr.wr.fast_reg.page_list_len = pagelist_len; + fr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fr_wr.wr.fast_reg.length = mem->len; + fr_wr.wr.fast_reg.rkey = mr->rkey; + fr_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE; + + if (!wr) + wr = &fr_wr; + else + wr->next = &fr_wr; + + ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr); + if (ret) { + pr_err("fast registration failed, ret:%d\n", ret); + return ret; + } + fr_desc->ind &= ~ind; + + sge->lkey = mr->lkey; + sge->addr = frpl->page_list[0] + page_off; + sge->length = mem->len; + + pr_debug("%s:%d sge: addr: 0x%llx length: %u lkey: %x\n", + __func__, __LINE__, sge->addr, sge->length, + sge->lkey); + + return ret; +} + +static inline enum ib_t10_dif_type +se2ib_prot_type(enum target_prot_type prot_type) +{ + switch (prot_type) { + case TARGET_DIF_TYPE0_PROT: + return IB_T10DIF_NONE; + case TARGET_DIF_TYPE1_PROT: + return IB_T10DIF_TYPE1; + case TARGET_DIF_TYPE2_PROT: + return IB_T10DIF_TYPE2; + case TARGET_DIF_TYPE3_PROT: + return IB_T10DIF_TYPE3; + default: + return IB_T10DIF_NONE; + } +} + +static int +isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) +{ + enum ib_t10_dif_type ib_prot_type = se2ib_prot_type(se_cmd->prot_type); + + sig_attrs->mem.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF; + sig_attrs->mem.sig.dif.pi_interval = + se_cmd->se_dev->dev_attrib.block_size; + sig_attrs->wire.sig.dif.pi_interval = + se_cmd->se_dev->dev_attrib.block_size; + + switch (se_cmd->prot_op) { + case TARGET_PROT_DIN_INSERT: + case TARGET_PROT_DOUT_STRIP: + sig_attrs->mem.sig.dif.type = IB_T10DIF_NONE; + sig_attrs->wire.sig.dif.type = ib_prot_type; + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = se_cmd->reftag_seed; + break; + case TARGET_PROT_DOUT_INSERT: + case TARGET_PROT_DIN_STRIP: + sig_attrs->mem.sig.dif.type = ib_prot_type; + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = se_cmd->reftag_seed; + sig_attrs->wire.sig.dif.type = IB_T10DIF_NONE; + break; + case TARGET_PROT_DIN_PASS: + case TARGET_PROT_DOUT_PASS: + sig_attrs->mem.sig.dif.type = ib_prot_type; + sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->mem.sig.dif.ref_tag = se_cmd->reftag_seed; + sig_attrs->wire.sig.dif.type = ib_prot_type; + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + sig_attrs->wire.sig.dif.ref_tag = se_cmd->reftag_seed; + break; + default: + pr_err("Unsupported PI operation %d\n", se_cmd->prot_op); + return -EINVAL; + } + + return 0; +} + +static inline u8 +isert_set_prot_checks(u8 prot_checks) +{ + return (prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | + (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | + (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0); +} + +static int +isert_reg_sig_mr(struct isert_conn *isert_conn, struct se_cmd *se_cmd, + struct fast_reg_descriptor *fr_desc, + struct ib_sge *data_sge, struct ib_sge *prot_sge, + struct ib_sge *sig_sge) +{ + struct ib_send_wr sig_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + struct pi_context *pi_ctx = fr_desc->pi_ctx; + struct ib_sig_attrs sig_attrs; + int ret; + u32 key; + + memset(&sig_attrs, 0, sizeof(sig_attrs)); + ret = isert_set_sig_attrs(se_cmd, &sig_attrs); + if (ret) + goto err; + + sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks); + + if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); + } + + memset(&sig_wr, 0, sizeof(sig_wr)); + sig_wr.opcode = IB_WR_REG_SIG_MR; + sig_wr.wr_id = ISER_FASTREG_LI_WRID; + sig_wr.sg_list = data_sge; + sig_wr.num_sge = 1; + sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE; + sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; + sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; + if (se_cmd->t_prot_sg) + sig_wr.wr.sig_handover.prot = prot_sge; + + if (!wr) + wr = &sig_wr; + else + wr->next = &sig_wr; + + ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr); + if (ret) { + pr_err("fast registration failed, ret:%d\n", ret); + goto err; + } + fr_desc->ind &= ~ISERT_SIG_KEY_VALID; + + sig_sge->lkey = pi_ctx->sig_mr->lkey; + sig_sge->addr = 0; + sig_sge->length = se_cmd->data_length; + if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP && + se_cmd->prot_op != TARGET_PROT_DOUT_INSERT) + /* + * We have protection guards on the wire + * so we need to set a larget transfer + */ + sig_sge->length += se_cmd->prot_length; + + pr_debug("sig_sge: addr: 0x%llx length: %u lkey: %x\n", + sig_sge->addr, sig_sge->length, + sig_sge->lkey); +err: + return ret; +} + +static int +isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_sge data_sge; + struct ib_send_wr *send_wr; + struct fast_reg_descriptor *fr_desc = NULL; + u32 offset; + int ret = 0; + unsigned long flags; + + isert_cmd->tx_desc.isert_cmd = isert_cmd; + + offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0; + ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->data_length, + offset, wr->iser_ib_op, &wr->data); + if (ret) + return ret; + + if (wr->data.dma_nents != 1 || + se_cmd->prot_op != TARGET_PROT_NORMAL) { + spin_lock_irqsave(&isert_conn->conn_lock, flags); + fr_desc = list_first_entry(&isert_conn->conn_fr_pool, + struct fast_reg_descriptor, list); + list_del(&fr_desc->list); + spin_unlock_irqrestore(&isert_conn->conn_lock, flags); + wr->fr_desc = fr_desc; + } + + ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->data, + ISERT_DATA_KEY_VALID, &data_sge); + if (ret) + goto unmap_cmd; + + if (se_cmd->prot_op != TARGET_PROT_NORMAL) { + struct ib_sge prot_sge, sig_sge; + + if (se_cmd->t_prot_sg) { + ret = isert_map_data_buf(isert_conn, isert_cmd, + se_cmd->t_prot_sg, + se_cmd->t_prot_nents, + se_cmd->prot_length, + 0, wr->iser_ib_op, &wr->prot); + if (ret) + goto unmap_cmd; + + ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->prot, + ISERT_PROT_KEY_VALID, &prot_sge); + if (ret) + goto unmap_prot_cmd; + } + + ret = isert_reg_sig_mr(isert_conn, se_cmd, fr_desc, + &data_sge, &prot_sge, &sig_sge); + if (ret) + goto unmap_prot_cmd; + + fr_desc->ind |= ISERT_PROTECTED; + memcpy(&wr->s_ib_sge, &sig_sge, sizeof(sig_sge)); + } else + memcpy(&wr->s_ib_sge, &data_sge, sizeof(data_sge)); + + wr->ib_sge = &wr->s_ib_sge; + wr->send_wr_num = 1; + memset(&wr->s_send_wr, 0, sizeof(*send_wr)); + wr->send_wr = &wr->s_send_wr; + wr->isert_cmd = isert_cmd; + + send_wr = &isert_cmd->rdma_wr.s_send_wr; + send_wr->sg_list = &wr->s_ib_sge; + send_wr->num_sge = 1; + send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; + if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { + send_wr->opcode = IB_WR_RDMA_WRITE; + send_wr->wr.rdma.remote_addr = isert_cmd->read_va; + send_wr->wr.rdma.rkey = isert_cmd->read_stag; + send_wr->send_flags = se_cmd->prot_op == TARGET_PROT_NORMAL ? + 0 : IB_SEND_SIGNALED; + } else { + send_wr->opcode = IB_WR_RDMA_READ; + send_wr->wr.rdma.remote_addr = isert_cmd->write_va; + send_wr->wr.rdma.rkey = isert_cmd->write_stag; + send_wr->send_flags = IB_SEND_SIGNALED; + } + + return 0; +unmap_prot_cmd: + if (se_cmd->t_prot_sg) + isert_unmap_data_buf(isert_conn, &wr->prot); +unmap_cmd: + if (fr_desc) { + spin_lock_irqsave(&isert_conn->conn_lock, flags); + list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); + spin_unlock_irqrestore(&isert_conn->conn_lock, flags); + } + isert_unmap_data_buf(isert_conn, &wr->data); + + return ret; +} + +static int +isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + struct ib_send_wr *wr_failed; + int rc; + + pr_debug("Cmd: %p RDMA_WRITE data_length: %u\n", + isert_cmd, se_cmd->data_length); + wr->iser_ib_op = ISER_IB_RDMA_WRITE; + rc = device->reg_rdma_mem(conn, cmd, wr); + if (rc) { + pr_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); + return rc; + } + + if (se_cmd->prot_op == TARGET_PROT_NORMAL) { + /* + * Build isert_conn->tx_desc for iSCSI response PDU and attach + */ + isert_create_send_desc(isert_conn, isert_cmd, + &isert_cmd->tx_desc); + iscsit_build_rsp_pdu(cmd, conn, true, (struct iscsi_scsi_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, + &isert_cmd->tx_desc.send_wr, true); + isert_cmd->rdma_wr.s_send_wr.next = &isert_cmd->tx_desc.send_wr; + wr->send_wr_num += 1; + } + + atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count); + + rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); + if (rc) { + pr_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n"); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + } + + if (se_cmd->prot_op == TARGET_PROT_NORMAL) + pr_debug("Cmd: %p posted RDMA_WRITE + Response for iSER Data " + "READ\n", isert_cmd); + else + pr_debug("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", + isert_cmd); + + return 1; +} + +static int +isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + struct ib_send_wr *wr_failed; + int rc; + + pr_debug("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n", + isert_cmd, se_cmd->data_length, cmd->write_data_done); + wr->iser_ib_op = ISER_IB_RDMA_READ; + rc = device->reg_rdma_mem(conn, cmd, wr); + if (rc) { + pr_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); + return rc; + } + + atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count); + + rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); + if (rc) { + pr_warn("ib_post_send() failed for IB_WR_RDMA_READ\n"); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + } + pr_debug("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n", + isert_cmd); + + return 0; +} + +static int +isert_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ + int ret; + + switch (state) { + case ISTATE_SEND_NOPIN_WANT_RESPONSE: + ret = isert_put_nopin(cmd, conn, false); + break; + default: + pr_err("Unknown immediate state: 0x%02x\n", state); + ret = -EINVAL; + break; + } + + return ret; +} + +static int +isert_response_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ + int ret; + + switch (state) { + case ISTATE_SEND_LOGOUTRSP: + ret = isert_put_logout_rsp(cmd, conn); + if (!ret) { + pr_debug("Returning iSER Logout -EAGAIN\n"); + ret = -EAGAIN; + } + break; + case ISTATE_SEND_NOPIN: + ret = isert_put_nopin(cmd, conn, true); + break; + case ISTATE_SEND_TASKMGTRSP: + ret = isert_put_tm_rsp(cmd, conn); + break; + case ISTATE_SEND_REJECT: + ret = isert_put_reject(cmd, conn); + break; + case ISTATE_SEND_TEXTRSP: + ret = isert_put_text_rsp(cmd, conn); + break; + case ISTATE_SEND_STATUS: + /* + * Special case for sending non GOOD SCSI status from TX thread + * context during pre se_cmd excecution failure. + */ + ret = isert_put_response(conn, cmd); + break; + default: + pr_err("Unknown response state: 0x%02x\n", state); + ret = -EINVAL; + break; + } + + return ret; +} + +static int +isert_setup_np(struct iscsi_np *np, + struct __kernel_sockaddr_storage *ksockaddr) +{ + struct isert_np *isert_np; + struct rdma_cm_id *isert_lid; + struct sockaddr *sa; + int ret; + + isert_np = kzalloc(sizeof(struct isert_np), GFP_KERNEL); + if (!isert_np) { + pr_err("Unable to allocate struct isert_np\n"); + return -ENOMEM; + } + sema_init(&isert_np->np_sem, 0); + mutex_init(&isert_np->np_accept_mutex); + INIT_LIST_HEAD(&isert_np->np_accept_list); + init_completion(&isert_np->np_login_comp); + + sa = (struct sockaddr *)ksockaddr; + pr_debug("ksockaddr: %p, sa: %p\n", ksockaddr, sa); + /* + * Setup the np->np_sockaddr from the passed sockaddr setup + * in iscsi_target_configfs.c code.. + */ + memcpy(&np->np_sockaddr, ksockaddr, + sizeof(struct __kernel_sockaddr_storage)); + + isert_lid = rdma_create_id(isert_cma_handler, np, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(isert_lid)) { + pr_err("rdma_create_id() for isert_listen_handler failed: %ld\n", + PTR_ERR(isert_lid)); + ret = PTR_ERR(isert_lid); + goto out; + } + + ret = rdma_bind_addr(isert_lid, sa); + if (ret) { + pr_err("rdma_bind_addr() for isert_lid failed: %d\n", ret); + goto out_lid; + } + + ret = rdma_listen(isert_lid, ISERT_RDMA_LISTEN_BACKLOG); + if (ret) { + pr_err("rdma_listen() for isert_lid failed: %d\n", ret); + goto out_lid; + } + + isert_np->np_cm_id = isert_lid; + np->np_context = isert_np; + pr_debug("Setup isert_lid->context: %p\n", isert_lid->context); + + return 0; + +out_lid: + rdma_destroy_id(isert_lid); +out: + kfree(isert_np); + return ret; +} + +static int +isert_rdma_accept(struct isert_conn *isert_conn) +{ + struct rdma_cm_id *cm_id = isert_conn->conn_cm_id; + struct rdma_conn_param cp; + int ret; + + memset(&cp, 0, sizeof(struct rdma_conn_param)); + cp.responder_resources = isert_conn->responder_resources; + cp.initiator_depth = isert_conn->initiator_depth; + cp.retry_count = 7; + cp.rnr_retry_count = 7; + + pr_debug("Before rdma_accept >>>>>>>>>>>>>>>>>>>>.\n"); + + ret = rdma_accept(cm_id, &cp); + if (ret) { + pr_err("rdma_accept() failed with: %d\n", ret); + return ret; + } + + pr_debug("After rdma_accept >>>>>>>>>>>>>>>>>>>>>.\n"); + + return 0; +} + +static int +isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login) +{ + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + int ret; + + pr_debug("isert_get_login_rx before conn_login_comp conn: %p\n", conn); + /* + * For login requests after the first PDU, isert_rx_login_req() will + * kick schedule_delayed_work(&conn->login_work) as the packet is + * received, which turns this callback from iscsi_target_do_login_rx() + * into a NOP. + */ + if (!login->first_request) + return 0; + + ret = wait_for_completion_interruptible(&isert_conn->conn_login_comp); + if (ret) + return ret; + + pr_debug("isert_get_login_rx processing login->req: %p\n", login->req); + return 0; +} + +static void +isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn, + struct isert_conn *isert_conn) +{ + struct rdma_cm_id *cm_id = isert_conn->conn_cm_id; + struct rdma_route *cm_route = &cm_id->route; + struct sockaddr_in *sock_in; + struct sockaddr_in6 *sock_in6; + + conn->login_family = np->np_sockaddr.ss_family; + + if (np->np_sockaddr.ss_family == AF_INET6) { + sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.dst_addr; + snprintf(conn->login_ip, sizeof(conn->login_ip), "%pI6c", + &sock_in6->sin6_addr.in6_u); + conn->login_port = ntohs(sock_in6->sin6_port); + + sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.src_addr; + snprintf(conn->local_ip, sizeof(conn->local_ip), "%pI6c", + &sock_in6->sin6_addr.in6_u); + conn->local_port = ntohs(sock_in6->sin6_port); + } else { + sock_in = (struct sockaddr_in *)&cm_route->addr.dst_addr; + sprintf(conn->login_ip, "%pI4", + &sock_in->sin_addr.s_addr); + conn->login_port = ntohs(sock_in->sin_port); + + sock_in = (struct sockaddr_in *)&cm_route->addr.src_addr; + sprintf(conn->local_ip, "%pI4", + &sock_in->sin_addr.s_addr); + conn->local_port = ntohs(sock_in->sin_port); + } +} + +static int +isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) +{ + struct isert_np *isert_np = (struct isert_np *)np->np_context; + struct isert_conn *isert_conn; + int max_accept = 0, ret; + +accept_wait: + ret = down_interruptible(&isert_np->np_sem); + if (max_accept > 5) + return -ENODEV; + + spin_lock_bh(&np->np_thread_lock); + if (np->np_thread_state >= ISCSI_NP_THREAD_RESET) { + spin_unlock_bh(&np->np_thread_lock); + pr_debug("np_thread_state %d for isert_accept_np\n", + np->np_thread_state); + /** + * No point in stalling here when np_thread + * is in state RESET/SHUTDOWN/EXIT - bail + **/ + return -ENODEV; + } + spin_unlock_bh(&np->np_thread_lock); + + mutex_lock(&isert_np->np_accept_mutex); + if (list_empty(&isert_np->np_accept_list)) { + mutex_unlock(&isert_np->np_accept_mutex); + max_accept++; + goto accept_wait; + } + isert_conn = list_first_entry(&isert_np->np_accept_list, + struct isert_conn, conn_accept_node); + list_del_init(&isert_conn->conn_accept_node); + mutex_unlock(&isert_np->np_accept_mutex); + + conn->context = isert_conn; + isert_conn->conn = conn; + max_accept = 0; + + ret = isert_rdma_post_recvl(isert_conn); + if (ret) + return ret; + + ret = isert_rdma_accept(isert_conn); + if (ret) + return ret; + + isert_set_conn_info(np, conn, isert_conn); + + pr_debug("Processing isert_accept_np: isert_conn: %p\n", isert_conn); + return 0; +} + +static void +isert_free_np(struct iscsi_np *np) +{ + struct isert_np *isert_np = (struct isert_np *)np->np_context; + + rdma_destroy_id(isert_np->np_cm_id); + + np->np_context = NULL; + kfree(isert_np); +} + +static void isert_wait_conn(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + + pr_debug("isert_wait_conn: Starting \n"); + + mutex_lock(&isert_conn->conn_mutex); + if (isert_conn->conn_cm_id) { + pr_debug("Calling rdma_disconnect from isert_wait_conn\n"); + rdma_disconnect(isert_conn->conn_cm_id); + } + /* + * Only wait for conn_wait_comp_err if the isert_conn made it + * into full feature phase.. + */ + if (isert_conn->state == ISER_CONN_INIT) { + mutex_unlock(&isert_conn->conn_mutex); + return; + } + if (isert_conn->state == ISER_CONN_UP) + isert_conn->state = ISER_CONN_TERMINATING; + mutex_unlock(&isert_conn->conn_mutex); + + wait_for_completion(&isert_conn->conn_wait_comp_err); + + wait_for_completion(&isert_conn->conn_wait); +} + +static void isert_free_conn(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + + isert_put_conn(isert_conn); +} + +static struct iscsit_transport iser_target_transport = { + .name = "IB/iSER", + .transport_type = ISCSI_INFINIBAND, + .priv_size = sizeof(struct isert_cmd), + .owner = THIS_MODULE, + .iscsit_setup_np = isert_setup_np, + .iscsit_accept_np = isert_accept_np, + .iscsit_free_np = isert_free_np, + .iscsit_wait_conn = isert_wait_conn, + .iscsit_free_conn = isert_free_conn, + .iscsit_get_login_rx = isert_get_login_rx, + .iscsit_put_login_tx = isert_put_login_tx, + .iscsit_immediate_queue = isert_immediate_queue, + .iscsit_response_queue = isert_response_queue, + .iscsit_get_dataout = isert_get_dataout, + .iscsit_queue_data_in = isert_put_datain, + .iscsit_queue_status = isert_put_response, + .iscsit_aborted_task = isert_aborted_task, + .iscsit_get_sup_prot_ops = isert_get_sup_prot_ops, +}; + +static int __init isert_init(void) +{ + int ret; + + isert_rx_wq = alloc_workqueue("isert_rx_wq", 0, 0); + if (!isert_rx_wq) { + pr_err("Unable to allocate isert_rx_wq\n"); + return -ENOMEM; + } + + isert_comp_wq = alloc_workqueue("isert_comp_wq", 0, 0); + if (!isert_comp_wq) { + pr_err("Unable to allocate isert_comp_wq\n"); + ret = -ENOMEM; + goto destroy_rx_wq; + } + + iscsit_register_transport(&iser_target_transport); + pr_debug("iSER_TARGET[0] - Loaded iser_target_transport\n"); + return 0; + +destroy_rx_wq: + destroy_workqueue(isert_rx_wq); + return ret; +} + +static void __exit isert_exit(void) +{ + flush_scheduled_work(); + destroy_workqueue(isert_comp_wq); + destroy_workqueue(isert_rx_wq); + iscsit_unregister_transport(&iser_target_transport); + pr_debug("iSER_TARGET[0] - Released iser_target_transport\n"); +} + +MODULE_DESCRIPTION("iSER-Target for mainline target infrastructure"); +MODULE_VERSION("0.1"); +MODULE_AUTHOR("nab@Linux-iSCSI.org"); +MODULE_LICENSE("GPL"); + +module_init(isert_init); +module_exit(isert_exit); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h new file mode 100644 index 00000000000..04f51f7bf61 --- /dev/null +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -0,0 +1,190 @@ +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> + +#define ISERT_RDMA_LISTEN_BACKLOG 10 +#define ISCSI_ISER_SG_TABLESIZE 256 +#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL + +enum isert_desc_type { + ISCSI_TX_CONTROL, + ISCSI_TX_DATAIN +}; + +enum iser_ib_op_code { + ISER_IB_RECV, + ISER_IB_SEND, + ISER_IB_RDMA_WRITE, + ISER_IB_RDMA_READ, +}; + +enum iser_conn_state { + ISER_CONN_INIT, + ISER_CONN_UP, + ISER_CONN_TERMINATING, + ISER_CONN_DOWN, +}; + +struct iser_rx_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + char data[ISER_RECV_DATA_SEG_LEN]; + u64 dma_addr; + struct ib_sge rx_sg; + char pad[ISER_RX_PAD_SIZE]; +} __packed; + +struct iser_tx_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + enum isert_desc_type type; + u64 dma_addr; + struct ib_sge tx_sg[2]; + int num_sge; + struct isert_cmd *isert_cmd; + struct llist_node *comp_llnode_batch; + struct llist_node comp_llnode; + bool llnode_active; + struct ib_send_wr send_wr; +} __packed; + +enum isert_indicator { + ISERT_PROTECTED = 1 << 0, + ISERT_DATA_KEY_VALID = 1 << 1, + ISERT_PROT_KEY_VALID = 1 << 2, + ISERT_SIG_KEY_VALID = 1 << 3, +}; + +struct pi_context { + struct ib_mr *prot_mr; + struct ib_fast_reg_page_list *prot_frpl; + struct ib_mr *sig_mr; +}; + +struct fast_reg_descriptor { + struct list_head list; + struct ib_mr *data_mr; + struct ib_fast_reg_page_list *data_frpl; + u8 ind; + struct pi_context *pi_ctx; +}; + +struct isert_data_buf { + struct scatterlist *sg; + int nents; + u32 sg_off; + u32 len; /* cur_rdma_length */ + u32 offset; + unsigned int dma_nents; + enum dma_data_direction dma_dir; +}; + +struct isert_rdma_wr { + struct list_head wr_list; + struct isert_cmd *isert_cmd; + enum iser_ib_op_code iser_ib_op; + struct ib_sge *ib_sge; + struct ib_sge s_ib_sge; + int send_wr_num; + struct ib_send_wr *send_wr; + struct ib_send_wr s_send_wr; + struct isert_data_buf data; + struct isert_data_buf prot; + struct fast_reg_descriptor *fr_desc; +}; + +struct isert_cmd { + uint32_t read_stag; + uint32_t write_stag; + uint64_t read_va; + uint64_t write_va; + u64 pdu_buf_dma; + u32 pdu_buf_len; + u32 read_va_off; + u32 write_va_off; + u32 rdma_wr_num; + struct isert_conn *conn; + struct iscsi_cmd *iscsi_cmd; + struct iser_tx_desc tx_desc; + struct isert_rdma_wr rdma_wr; + struct work_struct comp_work; +}; + +struct isert_device; + +struct isert_conn { + enum iser_conn_state state; + int post_recv_buf_count; + atomic_t post_send_buf_count; + u32 responder_resources; + u32 initiator_depth; + u32 max_sge; + char *login_buf; + char *login_req_buf; + char *login_rsp_buf; + u64 login_req_dma; + u64 login_rsp_dma; + unsigned int conn_rx_desc_head; + struct iser_rx_desc *conn_rx_descs; + struct ib_recv_wr conn_rx_wr[ISERT_MIN_POSTED_RX]; + struct iscsi_conn *conn; + struct list_head conn_accept_node; + struct completion conn_login_comp; + struct iser_tx_desc conn_login_tx_desc; + struct rdma_cm_id *conn_cm_id; + struct ib_pd *conn_pd; + struct ib_mr *conn_mr; + struct ib_qp *conn_qp; + struct isert_device *conn_device; + struct work_struct conn_logout_work; + struct mutex conn_mutex; + struct completion conn_wait; + struct completion conn_wait_comp_err; + struct kref conn_kref; + struct list_head conn_fr_pool; + int conn_fr_pool_size; + /* lock to protect fastreg pool */ + spinlock_t conn_lock; +#define ISERT_COMP_BATCH_COUNT 8 + int conn_comp_batch; + struct llist_head conn_comp_llist; + bool disconnect; +}; + +#define ISERT_MAX_CQ 64 + +struct isert_cq_desc { + struct isert_device *device; + int cq_index; + struct work_struct cq_rx_work; + struct work_struct cq_tx_work; +}; + +struct isert_device { + int use_fastreg; + bool pi_capable; + int cqs_used; + int refcount; + int cq_active_qps[ISERT_MAX_CQ]; + struct ib_device *ib_device; + struct ib_cq *dev_rx_cq[ISERT_MAX_CQ]; + struct ib_cq *dev_tx_cq[ISERT_MAX_CQ]; + struct isert_cq_desc *cq_desc; + struct list_head dev_node; + struct ib_device_attr dev_attr; + int (*reg_rdma_mem)(struct iscsi_conn *conn, + struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr); + void (*unreg_rdma_mem)(struct isert_cmd *isert_cmd, + struct isert_conn *isert_conn); +}; + +struct isert_np { + struct semaphore np_sem; + struct rdma_cm_id *np_cm_id; + struct mutex np_accept_mutex; + struct list_head np_accept_list; + struct completion np_login_comp; +}; diff --git a/drivers/infiniband/ulp/isert/isert_proto.h b/drivers/infiniband/ulp/isert/isert_proto.h new file mode 100644 index 00000000000..4dccd313b77 --- /dev/null +++ b/drivers/infiniband/ulp/isert/isert_proto.h @@ -0,0 +1,47 @@ +/* From iscsi_iser.h */ + +struct iser_hdr { + u8 flags; + u8 rsvd[3]; + __be32 write_stag; /* write rkey */ + __be64 write_va; + __be32 read_stag; /* read rkey */ + __be64 read_va; +} __packed; + +/*Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) + +#define ISER_RECV_DATA_SEG_LEN 8192 +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) +#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISERT_MAX_RX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * + * SCSI_TMFUNC(2), LOGOUT(1) */ + +#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */ + +#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) + +#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) + +#define ISERT_INFLIGHT_DATAOUTS 8 + +#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \ + (1 + ISERT_INFLIGHT_DATAOUTS) + \ + ISERT_MAX_TX_MISC_PDUS + \ + ISERT_MAX_RX_MISC_PDUS) + +#define ISER_RX_PAD_SIZE (ISER_RECV_DATA_SEG_LEN + 4096 - \ + (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge))) + +#define ISER_VER 0x10 +#define ISER_WSV 0x08 +#define ISER_RSV 0x04 +#define ISCSI_CTRL 0x10 +#define ISER_HELLO 0x20 +#define ISER_HELLORPLY 0x30 diff --git a/drivers/infiniband/ulp/srp/Kconfig b/drivers/infiniband/ulp/srp/Kconfig index 8fe3be4e991..c74ee963304 100644 --- a/drivers/infiniband/ulp/srp/Kconfig +++ b/drivers/infiniband/ulp/srp/Kconfig @@ -1,6 +1,7 @@ config INFINIBAND_SRP tristate "InfiniBand SCSI RDMA Protocol" - depends on INFINIBAND && SCSI + depends on SCSI + select SCSI_SRP_ATTRS ---help--- Support for the SCSI RDMA Protocol over InfiniBand. This allows you to access storage devices that speak SRP over diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 2687e34aa5b..e3c2c5b4297 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -28,11 +28,10 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ib_srp.c 3932 2005-11-01 17:19:29Z roland $ */ -#include <linux/version.h> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> @@ -40,47 +39,148 @@ #include <linux/string.h> #include <linux/parser.h> #include <linux/random.h> +#include <linux/jiffies.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <scsi/scsi.h> #include <scsi/scsi_device.h> #include <scsi/scsi_dbg.h> +#include <scsi/scsi_tcq.h> #include <scsi/srp.h> - -#include <rdma/ib_cache.h> +#include <scsi/scsi_transport_srp.h> #include "ib_srp.h" #define DRV_NAME "ib_srp" #define PFX DRV_NAME ": " -#define DRV_VERSION "0.2" -#define DRV_RELDATE "November 1, 2005" +#define DRV_VERSION "1.0" +#define DRV_RELDATE "July 1, 2013" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator " "v" DRV_VERSION " (" DRV_RELDATE ")"); MODULE_LICENSE("Dual BSD/GPL"); +static unsigned int srp_sg_tablesize; +static unsigned int cmd_sg_entries; +static unsigned int indirect_sg_entries; +static bool allow_ext_sg; +static bool prefer_fr; +static bool register_always; static int topspin_workarounds = 1; +module_param(srp_sg_tablesize, uint, 0444); +MODULE_PARM_DESC(srp_sg_tablesize, "Deprecated name for cmd_sg_entries"); + +module_param(cmd_sg_entries, uint, 0444); +MODULE_PARM_DESC(cmd_sg_entries, + "Default number of gather/scatter entries in the SRP command (default is 12, max 255)"); + +module_param(indirect_sg_entries, uint, 0444); +MODULE_PARM_DESC(indirect_sg_entries, + "Default max number of gather/scatter entries (default is 12, max is " __stringify(SCSI_MAX_SG_CHAIN_SEGMENTS) ")"); + +module_param(allow_ext_sg, bool, 0444); +MODULE_PARM_DESC(allow_ext_sg, + "Default behavior when there are more than cmd_sg_entries S/G entries after mapping; fails the request when false (default false)"); + module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); -static const u8 topspin_oui[3] = { 0x00, 0x05, 0xad }; +module_param(prefer_fr, bool, 0444); +MODULE_PARM_DESC(prefer_fr, +"Whether to use fast registration if both FMR and fast registration are supported"); + +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + +static struct kernel_param_ops srp_tmo_ops; + +static int srp_reconnect_delay = 10; +module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts"); + +static int srp_fast_io_fail_tmo = 15; +module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(fast_io_fail_tmo, + "Number of seconds between the observation of a transport" + " layer error and failing all I/O. \"off\" means that this" + " functionality is disabled."); + +static int srp_dev_loss_tmo = 600; +module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dev_loss_tmo, + "Maximum number of seconds that the SRP transport should" + " insulate transport layer errors. After this time has been" + " exceeded the SCSI host is removed. Should be" + " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + " if fast_io_fail_tmo has not been set. \"off\" means that" + " this functionality is disabled."); static void srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device); -static void srp_completion(struct ib_cq *cq, void *target_ptr); +static void srp_recv_completion(struct ib_cq *cq, void *target_ptr); +static void srp_send_completion(struct ib_cq *cq, void *target_ptr); static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); +static struct scsi_transport_template *ib_srp_transport_template; + static struct ib_client srp_client = { .name = "srp", .add = srp_add_one, .remove = srp_remove_one }; +static struct ib_sa_client srp_sa_client; + +static int srp_tmo_get(char *buffer, const struct kernel_param *kp) +{ + int tmo = *(int *)kp->arg; + + if (tmo >= 0) + return sprintf(buffer, "%d", tmo); + else + return sprintf(buffer, "off"); +} + +static int srp_tmo_set(const char *val, const struct kernel_param *kp) +{ + int tmo, res; + + if (strncmp(val, "off", 3) != 0) { + res = kstrtoint(val, 0, &tmo); + if (res) + goto out; + } else { + tmo = -1; + } + if (kp->arg == &srp_reconnect_delay) + res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo, + srp_dev_loss_tmo); + else if (kp->arg == &srp_fast_io_fail_tmo) + res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo); + else + res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo, + tmo); + if (res) + goto out; + *(int *)kp->arg = tmo; + +out: + return res; +} + +static struct kernel_param_ops srp_tmo_ops = { + .get = srp_tmo_get, + .set = srp_tmo_set, +}; + static inline struct srp_target_port *host_to_target(struct Scsi_Host *host) { return (struct srp_target_port *) host->hostdata; @@ -91,6 +191,16 @@ static const char *srp_target_info(struct Scsi_Host *host) return host_to_target(host)->target_name; } +static int srp_target_is_topspin(struct srp_target_port *target) +{ + static const u8 topspin_oui[3] = { 0x00, 0x05, 0xad }; + static const u8 cisco_oui[3] = { 0x00, 0x1b, 0x0d }; + + return topspin_workarounds && + (!memcmp(&target->ioc_guid, topspin_oui, sizeof topspin_oui) || + !memcmp(&target->ioc_guid, cisco_oui, sizeof cisco_oui)); +} + static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size, gfp_t gfp_mask, enum dma_data_direction direction) @@ -105,8 +215,9 @@ static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size, if (!iu->buf) goto out_free_iu; - iu->dma = dma_map_single(host->dev->dma_device, iu->buf, size, direction); - if (dma_mapping_error(iu->dma)) + iu->dma = ib_dma_map_single(host->srp_dev->dev, iu->buf, size, + direction); + if (ib_dma_mapping_error(host->srp_dev->dev, iu->dma)) goto out_free_buf; iu->size = size; @@ -127,14 +238,15 @@ static void srp_free_iu(struct srp_host *host, struct srp_iu *iu) if (!iu) return; - dma_unmap_single(host->dev->dma_device, iu->dma, iu->size, iu->direction); + ib_dma_unmap_single(host->srp_dev->dev, iu->dma, iu->size, + iu->direction); kfree(iu->buf); kfree(iu); } static void srp_qp_event(struct ib_event *event, void *context) { - printk(KERN_ERR PFX "QP event %d\n", event->event); + pr_debug("QP event %d\n", event->event); } static int srp_init_qp(struct srp_target_port *target, @@ -147,10 +259,10 @@ static int srp_init_qp(struct srp_target_port *target, if (!attr) return -ENOMEM; - ret = ib_find_cached_pkey(target->srp_host->dev, - target->srp_host->port, - be16_to_cpu(target->path.pkey), - &attr->pkey_index); + ret = ib_find_pkey(target->srp_host->srp_dev->dev, + target->srp_host->port, + be16_to_cpu(target->path.pkey), + &attr->pkey_index); if (ret) goto out; @@ -170,64 +282,304 @@ out: return ret; } +static int srp_new_cm_id(struct srp_target_port *target) +{ + struct ib_cm_id *new_cm_id; + + new_cm_id = ib_create_cm_id(target->srp_host->srp_dev->dev, + srp_cm_handler, target); + if (IS_ERR(new_cm_id)) + return PTR_ERR(new_cm_id); + + if (target->cm_id) + ib_destroy_cm_id(target->cm_id); + target->cm_id = new_cm_id; + + return 0; +} + +static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_fmr_pool_param fmr_param; + + memset(&fmr_param, 0, sizeof(fmr_param)); + fmr_param.pool_size = target->scsi_host->can_queue; + fmr_param.dirty_watermark = fmr_param.pool_size / 4; + fmr_param.cache = 1; + fmr_param.max_pages_per_fmr = dev->max_pages_per_mr; + fmr_param.page_shift = ilog2(dev->mr_page_size); + fmr_param.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + return ib_create_fmr_pool(dev->pd, &fmr_param); +} + +/** + * srp_destroy_fr_pool() - free the resources owned by a pool + * @pool: Fast registration pool to be destroyed. + */ +static void srp_destroy_fr_pool(struct srp_fr_pool *pool) +{ + int i; + struct srp_fr_desc *d; + + if (!pool) + return; + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + if (d->frpl) + ib_free_fast_reg_page_list(d->frpl); + if (d->mr) + ib_dereg_mr(d->mr); + } + kfree(pool); +} + +/** + * srp_create_fr_pool() - allocate and initialize a pool for fast registration + * @device: IB device to allocate fast registration descriptors for. + * @pd: Protection domain associated with the FR descriptors. + * @pool_size: Number of descriptors to allocate. + * @max_page_list_len: Maximum fast registration work request page list length. + */ +static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, + struct ib_pd *pd, int pool_size, + int max_page_list_len) +{ + struct srp_fr_pool *pool; + struct srp_fr_desc *d; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + int i, ret = -EINVAL; + + if (pool_size <= 0) + goto err; + ret = -ENOMEM; + pool = kzalloc(sizeof(struct srp_fr_pool) + + pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL); + if (!pool) + goto err; + pool->size = pool_size; + pool->max_page_list_len = max_page_list_len; + spin_lock_init(&pool->lock); + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + mr = ib_alloc_fast_reg_mr(pd, max_page_list_len); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto destroy_pool; + } + d->mr = mr; + frpl = ib_alloc_fast_reg_page_list(device, max_page_list_len); + if (IS_ERR(frpl)) { + ret = PTR_ERR(frpl); + goto destroy_pool; + } + d->frpl = frpl; + list_add_tail(&d->entry, &pool->free_list); + } + +out: + return pool; + +destroy_pool: + srp_destroy_fr_pool(pool); + +err: + pool = ERR_PTR(ret); + goto out; +} + +/** + * srp_fr_pool_get() - obtain a descriptor suitable for fast registration + * @pool: Pool to obtain descriptor from. + */ +static struct srp_fr_desc *srp_fr_pool_get(struct srp_fr_pool *pool) +{ + struct srp_fr_desc *d = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + if (!list_empty(&pool->free_list)) { + d = list_first_entry(&pool->free_list, typeof(*d), entry); + list_del(&d->entry); + } + spin_unlock_irqrestore(&pool->lock, flags); + + return d; +} + +/** + * srp_fr_pool_put() - put an FR descriptor back in the free list + * @pool: Pool the descriptor was allocated from. + * @desc: Pointer to an array of fast registration descriptor pointers. + * @n: Number of descriptors to put back. + * + * Note: The caller must already have queued an invalidation request for + * desc->mr->rkey before calling this function. + */ +static void srp_fr_pool_put(struct srp_fr_pool *pool, struct srp_fr_desc **desc, + int n) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&pool->lock, flags); + for (i = 0; i < n; i++) + list_add(&desc[i]->entry, &pool->free_list); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + + return srp_create_fr_pool(dev->dev, dev->pd, + target->scsi_host->can_queue, + dev->max_pages_per_mr); +} + static int srp_create_target_ib(struct srp_target_port *target) { + struct srp_device *dev = target->srp_host->srp_dev; struct ib_qp_init_attr *init_attr; + struct ib_cq *recv_cq, *send_cq; + struct ib_qp *qp; + struct ib_fmr_pool *fmr_pool = NULL; + struct srp_fr_pool *fr_pool = NULL; + const int m = 1 + dev->use_fast_reg; int ret; init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); if (!init_attr) return -ENOMEM; - target->cq = ib_create_cq(target->srp_host->dev, srp_completion, - NULL, target, SRP_CQ_SIZE); - if (IS_ERR(target->cq)) { - ret = PTR_ERR(target->cq); - goto out; + recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, target, + target->queue_size, target->comp_vector); + if (IS_ERR(recv_cq)) { + ret = PTR_ERR(recv_cq); + goto err; + } + + send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, target, + m * target->queue_size, target->comp_vector); + if (IS_ERR(send_cq)) { + ret = PTR_ERR(send_cq); + goto err_recv_cq; } - ib_req_notify_cq(target->cq, IB_CQ_NEXT_COMP); + ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP); init_attr->event_handler = srp_qp_event; - init_attr->cap.max_send_wr = SRP_SQ_SIZE; - init_attr->cap.max_recv_wr = SRP_RQ_SIZE; + init_attr->cap.max_send_wr = m * target->queue_size; + init_attr->cap.max_recv_wr = target->queue_size; init_attr->cap.max_recv_sge = 1; init_attr->cap.max_send_sge = 1; - init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; init_attr->qp_type = IB_QPT_RC; - init_attr->send_cq = target->cq; - init_attr->recv_cq = target->cq; + init_attr->send_cq = send_cq; + init_attr->recv_cq = recv_cq; - target->qp = ib_create_qp(target->srp_host->pd, init_attr); - if (IS_ERR(target->qp)) { - ret = PTR_ERR(target->qp); - ib_destroy_cq(target->cq); - goto out; + qp = ib_create_qp(dev->pd, init_attr); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_send_cq; } - ret = srp_init_qp(target, target->qp); - if (ret) { - ib_destroy_qp(target->qp); - ib_destroy_cq(target->cq); - goto out; + ret = srp_init_qp(target, qp); + if (ret) + goto err_qp; + + if (dev->use_fast_reg && dev->has_fr) { + fr_pool = srp_alloc_fr_pool(target); + if (IS_ERR(fr_pool)) { + ret = PTR_ERR(fr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FR pool allocation failed (%d)\n", ret); + goto err_qp; + } + if (target->fr_pool) + srp_destroy_fr_pool(target->fr_pool); + target->fr_pool = fr_pool; + } else if (!dev->use_fast_reg && dev->has_fmr) { + fmr_pool = srp_alloc_fmr_pool(target); + if (IS_ERR(fmr_pool)) { + ret = PTR_ERR(fmr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FMR pool allocation failed (%d)\n", ret); + goto err_qp; + } + if (target->fmr_pool) + ib_destroy_fmr_pool(target->fmr_pool); + target->fmr_pool = fmr_pool; } -out: + if (target->qp) + ib_destroy_qp(target->qp); + if (target->recv_cq) + ib_destroy_cq(target->recv_cq); + if (target->send_cq) + ib_destroy_cq(target->send_cq); + + target->qp = qp; + target->recv_cq = recv_cq; + target->send_cq = send_cq; + + kfree(init_attr); + return 0; + +err_qp: + ib_destroy_qp(qp); + +err_send_cq: + ib_destroy_cq(send_cq); + +err_recv_cq: + ib_destroy_cq(recv_cq); + +err: kfree(init_attr); return ret; } +/* + * Note: this function may be called without srp_alloc_iu_bufs() having been + * invoked. Hence the target->[rt]x_ring checks. + */ static void srp_free_target_ib(struct srp_target_port *target) { + struct srp_device *dev = target->srp_host->srp_dev; int i; + if (dev->use_fast_reg) { + if (target->fr_pool) + srp_destroy_fr_pool(target->fr_pool); + } else { + if (target->fmr_pool) + ib_destroy_fmr_pool(target->fmr_pool); + } ib_destroy_qp(target->qp); - ib_destroy_cq(target->cq); + ib_destroy_cq(target->send_cq); + ib_destroy_cq(target->recv_cq); - for (i = 0; i < SRP_RQ_SIZE; ++i) - srp_free_iu(target->srp_host, target->rx_ring[i]); - for (i = 0; i < SRP_SQ_SIZE + 1; ++i) - srp_free_iu(target->srp_host, target->tx_ring[i]); + target->qp = NULL; + target->send_cq = target->recv_cq = NULL; + + if (target->rx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->rx_ring[i]); + kfree(target->rx_ring); + target->rx_ring = NULL; + } + if (target->tx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->tx_ring[i]); + kfree(target->tx_ring); + target->tx_ring = NULL; + } } static void srp_path_rec_completion(int status, @@ -238,7 +590,8 @@ static void srp_path_rec_completion(int status, target->status = status; if (status) - printk(KERN_ERR PFX "Got failed path rec status %d\n", status); + shost_printk(KERN_ERR, target->scsi_host, + PFX "Got failed path rec status %d\n", status); else target->path = *pathrec; complete(&target->done); @@ -246,13 +599,17 @@ static void srp_path_rec_completion(int status, static int srp_lookup_path(struct srp_target_port *target) { + int ret; + target->path.numb_path = 1; init_completion(&target->done); - target->path_query_id = ib_sa_path_rec_get(target->srp_host->dev, + target->path_query_id = ib_sa_path_rec_get(&srp_sa_client, + target->srp_host->srp_dev->dev, target->srp_host->port, &target->path, + IB_SA_PATH_REC_SERVICE_ID | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | @@ -264,10 +621,13 @@ static int srp_lookup_path(struct srp_target_port *target) if (target->path_query_id < 0) return target->path_query_id; - wait_for_completion(&target->done); + ret = wait_for_completion_interruptible(&target->done); + if (ret < 0) + return ret; if (target->status < 0) - printk(KERN_WARNING PFX "Path record query failed\n"); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Path record query failed\n"); return target->status; } @@ -303,30 +663,54 @@ static int srp_send_req(struct srp_target_port *target) req->param.responder_resources = 4; req->param.remote_cm_response_timeout = 20; req->param.local_cm_response_timeout = 20; - req->param.retry_count = 7; + req->param.retry_count = target->tl_retry_count; req->param.rnr_retry_count = 7; req->param.max_cm_retries = 15; req->priv.opcode = SRP_LOGIN_REQ; req->priv.tag = 0; - req->priv.req_it_iu_len = cpu_to_be32(SRP_MAX_IU_LEN); + req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len); req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | SRP_BUF_FORMAT_INDIRECT); - memcpy(req->priv.initiator_port_id, target->srp_host->initiator_port_id, 16); + /* + * In the published SRP specification (draft rev. 16a), the + * port identifier format is 8 bytes of ID extension followed + * by 8 bytes of GUID. Older drafts put the two halves in the + * opposite order, so that the GUID comes first. + * + * Targets conforming to these obsolete drafts can be + * recognized by the I/O Class they report. + */ + if (target->io_class == SRP_REV10_IB_IO_CLASS) { + memcpy(req->priv.initiator_port_id, + &target->path.sgid.global.interface_id, 8); + memcpy(req->priv.initiator_port_id + 8, + &target->initiator_ext, 8); + memcpy(req->priv.target_port_id, &target->ioc_guid, 8); + memcpy(req->priv.target_port_id + 8, &target->id_ext, 8); + } else { + memcpy(req->priv.initiator_port_id, + &target->initiator_ext, 8); + memcpy(req->priv.initiator_port_id + 8, + &target->path.sgid.global.interface_id, 8); + memcpy(req->priv.target_port_id, &target->id_ext, 8); + memcpy(req->priv.target_port_id + 8, &target->ioc_guid, 8); + } + /* * Topspin/Cisco SRP targets will reject our login unless we - * zero out the first 8 bytes of our initiator port ID. The - * second 8 bytes must be our local node GUID, but we always - * use that anyway. + * zero out the first 8 bytes of our initiator port ID and set + * the second 8 bytes to the local node GUID. */ - if (topspin_workarounds && !memcmp(&target->ioc_guid, topspin_oui, 3)) { - printk(KERN_DEBUG PFX "Topspin/Cisco initiator port ID workaround " - "activated for target GUID %016llx\n", - (unsigned long long) be64_to_cpu(target->ioc_guid)); + if (srp_target_is_topspin(target)) { + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Topspin/Cisco initiator port ID workaround " + "activated for target GUID %016llx\n", + (unsigned long long) be64_to_cpu(target->ioc_guid)); memset(req->priv.initiator_port_id, 0, 8); + memcpy(req->priv.initiator_port_id + 8, + &target->srp_host->srp_dev->dev->node_guid, 8); } - memcpy(req->priv.target_port_id, &target->id_ext, 8); - memcpy(req->priv.target_port_id + 8, &target->ioc_guid, 8); status = ib_send_cm_req(target->cm_id, &req->param); @@ -335,44 +719,193 @@ static int srp_send_req(struct srp_target_port *target) return status; } +static bool srp_queue_remove_work(struct srp_target_port *target) +{ + bool changed = false; + + spin_lock_irq(&target->lock); + if (target->state != SRP_TARGET_REMOVED) { + target->state = SRP_TARGET_REMOVED; + changed = true; + } + spin_unlock_irq(&target->lock); + + if (changed) + queue_work(system_long_wq, &target->remove_work); + + return changed; +} + +static bool srp_change_conn_state(struct srp_target_port *target, + bool connected) +{ + bool changed = false; + + spin_lock_irq(&target->lock); + if (target->connected != connected) { + target->connected = connected; + changed = true; + } + spin_unlock_irq(&target->lock); + + return changed; +} + static void srp_disconnect_target(struct srp_target_port *target) { - /* XXX should send SRP_I_LOGOUT request */ + if (srp_change_conn_state(target, false)) { + /* XXX should send SRP_I_LOGOUT request */ - init_completion(&target->done); - ib_send_cm_dreq(target->cm_id, NULL, 0); - wait_for_completion(&target->done); + if (ib_send_cm_dreq(target->cm_id, NULL, 0)) { + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM DREQ failed\n"); + } + } } -static void srp_remove_work(void *target_ptr) +static void srp_free_req_data(struct srp_target_port *target) { - struct srp_target_port *target = target_ptr; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + struct srp_request *req; + int i; - spin_lock_irq(target->scsi_host->host_lock); - if (target->state != SRP_TARGET_DEAD) { - spin_unlock_irq(target->scsi_host->host_lock); - scsi_host_put(target->scsi_host); + if (!target->req_ring) return; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &target->req_ring[i]; + if (dev->use_fast_reg) + kfree(req->fr_list); + else + kfree(req->fmr_list); + kfree(req->map_page); + if (req->indirect_dma_addr) { + ib_dma_unmap_single(ibdev, req->indirect_dma_addr, + target->indirect_size, + DMA_TO_DEVICE); + } + kfree(req->indirect_desc); } - target->state = SRP_TARGET_REMOVED; - spin_unlock_irq(target->scsi_host->host_lock); - down(&target->srp_host->target_mutex); - list_del(&target->list); - up(&target->srp_host->target_mutex); + kfree(target->req_ring); + target->req_ring = NULL; +} +static int srp_alloc_req_data(struct srp_target_port *target) +{ + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + struct srp_request *req; + void *mr_list; + dma_addr_t dma_addr; + int i, ret = -ENOMEM; + + INIT_LIST_HEAD(&target->free_reqs); + + target->req_ring = kzalloc(target->req_ring_size * + sizeof(*target->req_ring), GFP_KERNEL); + if (!target->req_ring) + goto out; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &target->req_ring[i]; + mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), + GFP_KERNEL); + if (!mr_list) + goto out; + if (srp_dev->use_fast_reg) + req->fr_list = mr_list; + else + req->fmr_list = mr_list; + req->map_page = kmalloc(srp_dev->max_pages_per_mr * + sizeof(void *), GFP_KERNEL); + if (!req->map_page) + goto out; + req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); + if (!req->indirect_desc) + goto out; + + dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, + target->indirect_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, dma_addr)) + goto out; + + req->indirect_dma_addr = dma_addr; + req->index = i; + list_add_tail(&req->list, &target->free_reqs); + } + ret = 0; + +out: + return ret; +} + +/** + * srp_del_scsi_host_attr() - Remove attributes defined in the host template. + * @shost: SCSI host whose attributes to remove from sysfs. + * + * Note: Any attributes defined in the host template and that did not exist + * before invocation of this function will be ignored. + */ +static void srp_del_scsi_host_attr(struct Scsi_Host *shost) +{ + struct device_attribute **attr; + + for (attr = shost->hostt->shost_attrs; attr && *attr; ++attr) + device_remove_file(&shost->shost_dev, *attr); +} + +static void srp_remove_target(struct srp_target_port *target) +{ + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + + srp_del_scsi_host_attr(target->scsi_host); + srp_rport_get(target->rport); + srp_remove_host(target->scsi_host); scsi_remove_host(target->scsi_host); + srp_stop_rport_timers(target->rport); + srp_disconnect_target(target); ib_destroy_cm_id(target->cm_id); srp_free_target_ib(target); + cancel_work_sync(&target->tl_err_work); + srp_rport_put(target->rport); + srp_free_req_data(target); + + spin_lock(&target->srp_host->target_lock); + list_del(&target->list); + spin_unlock(&target->srp_host->target_lock); + scsi_host_put(target->scsi_host); - /* And another put to really free the target port... */ - scsi_host_put(target->scsi_host); +} + +static void srp_remove_work(struct work_struct *work) +{ + struct srp_target_port *target = + container_of(work, struct srp_target_port, remove_work); + + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + + srp_remove_target(target); +} + +static void srp_rport_delete(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + + srp_queue_remove_work(target); } static int srp_connect_target(struct srp_target_port *target) { + int retries = 3; int ret; + WARN_ON_ONCE(target->connected); + + target->qp_in_error = false; + ret = srp_lookup_path(target); if (ret) return ret; @@ -382,7 +915,9 @@ static int srp_connect_target(struct srp_target_port *target) ret = srp_send_req(target); if (ret) return ret; - wait_for_completion(&target->done); + ret = wait_for_completion_interruptible(&target->done); + if (ret < 0) + return ret; /* * The CM event handling code will set status to @@ -392,6 +927,7 @@ static int srp_connect_target(struct srp_target_port *target) */ switch (target->status) { case 0: + srp_change_conn_state(target, true); return 0; case SRP_PORT_REDIRECT: @@ -403,253 +939,701 @@ static int srp_connect_target(struct srp_target_port *target) case SRP_DLID_REDIRECT: break; + case SRP_STALE_CONN: + /* Our current CM id was stale, and is now in timewait. + * Try to reconnect with a new one. + */ + if (!retries-- || srp_new_cm_id(target)) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "giving up on stale connection\n"); + target->status = -ECONNRESET; + return target->status; + } + + shost_printk(KERN_ERR, target->scsi_host, PFX + "retrying stale connection\n"); + break; + default: return target->status; } } } -static int srp_reconnect_target(struct srp_target_port *target) +static int srp_inv_rkey(struct srp_target_port *target, u32 rkey) { - struct ib_cm_id *new_cm_id; - struct ib_qp_attr qp_attr; - struct srp_request *req; - struct ib_wc wc; - int ret; + struct ib_send_wr *bad_wr; + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .wr_id = LOCAL_INV_WR_ID_MASK, + .next = NULL, + .num_sge = 0, + .send_flags = 0, + .ex.invalidate_rkey = rkey, + }; + + return ib_post_send(target->qp, &wr, &bad_wr); +} + +static void srp_unmap_data(struct scsi_cmnd *scmnd, + struct srp_target_port *target, + struct srp_request *req) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + int i, res; + + if (!scsi_sglist(scmnd) || + (scmnd->sc_data_direction != DMA_TO_DEVICE && + scmnd->sc_data_direction != DMA_FROM_DEVICE)) + return; + + if (dev->use_fast_reg) { + struct srp_fr_desc **pfr; + + for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) { + res = srp_inv_rkey(target, (*pfr)->mr->rkey); + if (res < 0) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "Queueing INV WR for rkey %#x failed (%d)\n", + (*pfr)->mr->rkey, res); + queue_work(system_long_wq, + &target->tl_err_work); + } + } + if (req->nmdesc) + srp_fr_pool_put(target->fr_pool, req->fr_list, + req->nmdesc); + } else { + struct ib_pool_fmr **pfmr; + + for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++) + ib_fmr_pool_unmap(*pfmr); + } + + ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), + scmnd->sc_data_direction); +} + +/** + * srp_claim_req - Take ownership of the scmnd associated with a request. + * @target: SRP target port. + * @req: SRP request. + * @sdev: If not NULL, only take ownership for this SCSI device. + * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take + * ownership of @req->scmnd if it equals @scmnd. + * + * Return value: + * Either NULL or a pointer to the SCSI command the caller became owner of. + */ +static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target, + struct srp_request *req, + struct scsi_device *sdev, + struct scsi_cmnd *scmnd) +{ + unsigned long flags; + + spin_lock_irqsave(&target->lock, flags); + if (req->scmnd && + (!sdev || req->scmnd->device == sdev) && + (!scmnd || req->scmnd == scmnd)) { + scmnd = req->scmnd; + req->scmnd = NULL; + } else { + scmnd = NULL; + } + spin_unlock_irqrestore(&target->lock, flags); + + return scmnd; +} + +/** + * srp_free_req() - Unmap data and add request to the free request list. + * @target: SRP target port. + * @req: Request to be freed. + * @scmnd: SCSI command associated with @req. + * @req_lim_delta: Amount to be added to @target->req_lim. + */ +static void srp_free_req(struct srp_target_port *target, + struct srp_request *req, struct scsi_cmnd *scmnd, + s32 req_lim_delta) +{ + unsigned long flags; + + srp_unmap_data(scmnd, target, req); + + spin_lock_irqsave(&target->lock, flags); + target->req_lim += req_lim_delta; + list_add_tail(&req->list, &target->free_reqs); + spin_unlock_irqrestore(&target->lock, flags); +} + +static void srp_finish_req(struct srp_target_port *target, + struct srp_request *req, struct scsi_device *sdev, + int result) +{ + struct scsi_cmnd *scmnd = srp_claim_req(target, req, sdev, NULL); + + if (scmnd) { + srp_free_req(target, req, scmnd, 0); + scmnd->result = result; + scmnd->scsi_done(scmnd); + } +} + +static void srp_terminate_io(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + struct Scsi_Host *shost = target->scsi_host; + struct scsi_device *sdev; int i; - spin_lock_irq(target->scsi_host->host_lock); - if (target->state != SRP_TARGET_LIVE) { - spin_unlock_irq(target->scsi_host->host_lock); - return -EAGAIN; + /* + * Invoking srp_terminate_io() while srp_queuecommand() is running + * is not safe. Hence the warning statement below. + */ + shost_for_each_device(sdev, shost) + WARN_ON_ONCE(sdev->request_queue->request_fn_active); + + for (i = 0; i < target->req_ring_size; ++i) { + struct srp_request *req = &target->req_ring[i]; + srp_finish_req(target, req, NULL, DID_TRANSPORT_FAILFAST << 16); } - target->state = SRP_TARGET_CONNECTING; - spin_unlock_irq(target->scsi_host->host_lock); +} + +/* + * It is up to the caller to ensure that srp_rport_reconnect() calls are + * serialized and that no concurrent srp_queuecommand(), srp_abort(), + * srp_reset_device() or srp_reset_host() calls will occur while this function + * is in progress. One way to realize that is not to call this function + * directly but to call srp_reconnect_rport() instead since that last function + * serializes calls of this function via rport->mutex and also blocks + * srp_queuecommand() calls before invoking this function. + */ +static int srp_rport_reconnect(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + int i, ret; srp_disconnect_target(target); /* - * Now get a new local CM ID so that we avoid confusing the - * target in case things are really fouled up. + * Now get a new local CM ID so that we avoid confusing the target in + * case things are really fouled up. Doing so also ensures that all CM + * callbacks will have finished before a new QP is allocated. */ - new_cm_id = ib_create_cm_id(target->srp_host->dev, - srp_cm_handler, target); - if (IS_ERR(new_cm_id)) { - ret = PTR_ERR(new_cm_id); - goto err; + ret = srp_new_cm_id(target); + + for (i = 0; i < target->req_ring_size; ++i) { + struct srp_request *req = &target->req_ring[i]; + srp_finish_req(target, req, NULL, DID_RESET << 16); } - ib_destroy_cm_id(target->cm_id); - target->cm_id = new_cm_id; - qp_attr.qp_state = IB_QPS_RESET; - ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); - if (ret) - goto err; + /* + * Whether or not creating a new CM ID succeeded, create a new + * QP. This guarantees that all callback functions for the old QP have + * finished before any send requests are posted on the new QP. + */ + ret += srp_create_target_ib(target); - ret = srp_init_qp(target, target->qp); - if (ret) - goto err; + INIT_LIST_HEAD(&target->free_tx); + for (i = 0; i < target->queue_size; ++i) + list_add(&target->tx_ring[i]->list, &target->free_tx); - while (ib_poll_cq(target->cq, 1, &wc) > 0) - ; /* nothing */ + if (ret == 0) + ret = srp_connect_target(target); - list_for_each_entry(req, &target->req_queue, list) { - req->scmnd->result = DID_RESET << 16; - req->scmnd->scsi_done(req->scmnd); - } + if (ret == 0) + shost_printk(KERN_INFO, target->scsi_host, + PFX "reconnect succeeded\n"); - target->rx_head = 0; - target->tx_head = 0; - target->tx_tail = 0; - target->req_head = 0; - for (i = 0; i < SRP_SQ_SIZE - 1; ++i) - target->req_ring[i].next = i + 1; - target->req_ring[SRP_SQ_SIZE - 1].next = -1; - INIT_LIST_HEAD(&target->req_queue); + return ret; +} - ret = srp_connect_target(target); - if (ret) - goto err; +static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, + unsigned int dma_len, u32 rkey) +{ + struct srp_direct_buf *desc = state->desc; - spin_lock_irq(target->scsi_host->host_lock); - if (target->state == SRP_TARGET_CONNECTING) { - ret = 0; - target->state = SRP_TARGET_LIVE; - } else - ret = -EAGAIN; - spin_unlock_irq(target->scsi_host->host_lock); + desc->va = cpu_to_be64(dma_addr); + desc->key = cpu_to_be32(rkey); + desc->len = cpu_to_be32(dma_len); + + state->total_len += dma_len; + state->desc++; + state->ndesc++; +} + +static int srp_map_finish_fmr(struct srp_map_state *state, + struct srp_target_port *target) +{ + struct ib_pool_fmr *fmr; + u64 io_addr = 0; + + fmr = ib_fmr_pool_map_phys(target->fmr_pool, state->pages, + state->npages, io_addr); + if (IS_ERR(fmr)) + return PTR_ERR(fmr); + + *state->next_fmr++ = fmr; + state->nmdesc++; + + srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey); + + return 0; +} + +static int srp_map_finish_fr(struct srp_map_state *state, + struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_send_wr *bad_wr; + struct ib_send_wr wr; + struct srp_fr_desc *desc; + u32 rkey; + + desc = srp_fr_pool_get(target->fr_pool); + if (!desc) + return -ENOMEM; + + rkey = ib_inc_rkey(desc->mr->rkey); + ib_update_fast_reg_key(desc->mr, rkey); + + memcpy(desc->frpl->page_list, state->pages, + sizeof(state->pages[0]) * state->npages); + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_FAST_REG_MR; + wr.wr_id = FAST_REG_WR_ID_MASK; + wr.wr.fast_reg.iova_start = state->base_dma_addr; + wr.wr.fast_reg.page_list = desc->frpl; + wr.wr.fast_reg.page_list_len = state->npages; + wr.wr.fast_reg.page_shift = ilog2(dev->mr_page_size); + wr.wr.fast_reg.length = state->dma_len; + wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + wr.wr.fast_reg.rkey = desc->mr->lkey; + + *state->next_fr++ = desc; + state->nmdesc++; + + srp_map_desc(state, state->base_dma_addr, state->dma_len, + desc->mr->rkey); + + return ib_post_send(target->qp, &wr, &bad_wr); +} + +static int srp_finish_mapping(struct srp_map_state *state, + struct srp_target_port *target) +{ + int ret = 0; + + if (state->npages == 0) + return 0; + + if (state->npages == 1 && !register_always) + srp_map_desc(state, state->base_dma_addr, state->dma_len, + target->rkey); + else + ret = target->srp_host->srp_dev->use_fast_reg ? + srp_map_finish_fr(state, target) : + srp_map_finish_fmr(state, target); + + if (ret == 0) { + state->npages = 0; + state->dma_len = 0; + } return ret; +} -err: - printk(KERN_ERR PFX "reconnect failed (%d), removing target port.\n", ret); +static void srp_map_update_start(struct srp_map_state *state, + struct scatterlist *sg, int sg_index, + dma_addr_t dma_addr) +{ + state->unmapped_sg = sg; + state->unmapped_index = sg_index; + state->unmapped_addr = dma_addr; +} + +static int srp_map_sg_entry(struct srp_map_state *state, + struct srp_target_port *target, + struct scatterlist *sg, int sg_index, + bool use_mr) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg); + unsigned int dma_len = ib_sg_dma_len(ibdev, sg); + unsigned int len; + int ret; + + if (!dma_len) + return 0; + + if (!use_mr) { + /* + * Once we're in direct map mode for a request, we don't + * go back to FMR or FR mode, so no need to update anything + * other than the descriptor. + */ + srp_map_desc(state, dma_addr, dma_len, target->rkey); + return 0; + } + + /* + * Since not all RDMA HW drivers support non-zero page offsets for + * FMR, if we start at an offset into a page, don't merge into the + * current FMR mapping. Finish it out, and use the kernel's MR for + * this sg entry. + */ + if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) || + dma_len > dev->mr_max_size) { + ret = srp_finish_mapping(state, target); + if (ret) + return ret; + + srp_map_desc(state, dma_addr, dma_len, target->rkey); + srp_map_update_start(state, NULL, 0, 0); + return 0; + } /* - * We couldn't reconnect, so kill our target port off. - * However, we have to defer the real removal because we might - * be in the context of the SCSI error handler now, which - * would deadlock if we call scsi_remove_host(). + * If this is the first sg that will be mapped via FMR or via FR, save + * our position. We need to know the first unmapped entry, its index, + * and the first unmapped address within that entry to be able to + * restart mapping after an error. */ - spin_lock_irq(target->scsi_host->host_lock); - if (target->state == SRP_TARGET_CONNECTING) { - target->state = SRP_TARGET_DEAD; - INIT_WORK(&target->work, srp_remove_work, target); - schedule_work(&target->work); + if (!state->unmapped_sg) + srp_map_update_start(state, sg, sg_index, dma_addr); + + while (dma_len) { + unsigned offset = dma_addr & ~dev->mr_page_mask; + if (state->npages == dev->max_pages_per_mr || offset != 0) { + ret = srp_finish_mapping(state, target); + if (ret) + return ret; + + srp_map_update_start(state, sg, sg_index, dma_addr); + } + + len = min_t(unsigned int, dma_len, dev->mr_page_size - offset); + + if (!state->npages) + state->base_dma_addr = dma_addr; + state->pages[state->npages++] = dma_addr & dev->mr_page_mask; + state->dma_len += len; + dma_addr += len; + dma_len -= len; } - spin_unlock_irq(target->scsi_host->host_lock); + /* + * If the last entry of the MR wasn't a full page, then we need to + * close it out and start a new one -- we can only merge at page + * boundries. + */ + ret = 0; + if (len != dev->mr_page_size) { + ret = srp_finish_mapping(state, target); + if (!ret) + srp_map_update_start(state, NULL, 0, 0); + } return ret; } +static int srp_map_sg(struct srp_map_state *state, + struct srp_target_port *target, struct srp_request *req, + struct scatterlist *scat, int count) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + struct scatterlist *sg; + int i; + bool use_mr; + + state->desc = req->indirect_desc; + state->pages = req->map_page; + if (dev->use_fast_reg) { + state->next_fr = req->fr_list; + use_mr = !!target->fr_pool; + } else { + state->next_fmr = req->fmr_list; + use_mr = !!target->fmr_pool; + } + + for_each_sg(scat, sg, count, i) { + if (srp_map_sg_entry(state, target, sg, i, use_mr)) { + /* + * Memory registration failed, so backtrack to the + * first unmapped entry and continue on without using + * memory registration. + */ + dma_addr_t dma_addr; + unsigned int dma_len; + +backtrack: + sg = state->unmapped_sg; + i = state->unmapped_index; + + dma_addr = ib_sg_dma_address(ibdev, sg); + dma_len = ib_sg_dma_len(ibdev, sg); + dma_len -= (state->unmapped_addr - dma_addr); + dma_addr = state->unmapped_addr; + use_mr = false; + srp_map_desc(state, dma_addr, dma_len, target->rkey); + } + } + + if (use_mr && srp_finish_mapping(state, target)) + goto backtrack; + + req->nmdesc = state->nmdesc; + + return 0; +} + static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { + struct scatterlist *scat; struct srp_cmd *cmd = req->cmd->buf; - int len; + int len, nents, count; + struct srp_device *dev; + struct ib_device *ibdev; + struct srp_map_state state; + struct srp_indirect_buf *indirect_hdr; + u32 table_len; u8 fmt; - if (!scmnd->request_buffer || scmnd->sc_data_direction == DMA_NONE) + if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE) return sizeof (struct srp_cmd); if (scmnd->sc_data_direction != DMA_FROM_DEVICE && scmnd->sc_data_direction != DMA_TO_DEVICE) { - printk(KERN_WARNING PFX "Unhandled data direction %d\n", - scmnd->sc_data_direction); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled data direction %d\n", + scmnd->sc_data_direction); return -EINVAL; } - if (scmnd->use_sg) { - struct scatterlist *scat = scmnd->request_buffer; - int n; - int i; + nents = scsi_sg_count(scmnd); + scat = scsi_sglist(scmnd); - n = dma_map_sg(target->srp_host->dev->dma_device, - scat, scmnd->use_sg, scmnd->sc_data_direction); + dev = target->srp_host->srp_dev; + ibdev = dev->dev; - if (n == 1) { - struct srp_direct_buf *buf = (void *) cmd->add_data; + count = ib_dma_map_sg(ibdev, scat, nents, scmnd->sc_data_direction); + if (unlikely(count == 0)) + return -EIO; - fmt = SRP_DATA_DESC_DIRECT; + fmt = SRP_DATA_DESC_DIRECT; + len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); - buf->va = cpu_to_be64(sg_dma_address(scat)); - buf->key = cpu_to_be32(target->srp_host->mr->rkey); - buf->len = cpu_to_be32(sg_dma_len(scat)); + if (count == 1 && !register_always) { + /* + * The midlayer only generated a single gather/scatter + * entry, or DMA mapping coalesced everything to a + * single entry. So a direct descriptor along with + * the DMA MR suffices. + */ + struct srp_direct_buf *buf = (void *) cmd->add_data; - len = sizeof (struct srp_cmd) + - sizeof (struct srp_direct_buf); - } else { - struct srp_indirect_buf *buf = (void *) cmd->add_data; - u32 datalen = 0; + buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat)); + buf->key = cpu_to_be32(target->rkey); + buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); - fmt = SRP_DATA_DESC_INDIRECT; + req->nmdesc = 0; + goto map_complete; + } - if (scmnd->sc_data_direction == DMA_TO_DEVICE) - cmd->data_out_desc_cnt = n; - else - cmd->data_in_desc_cnt = n; - - buf->table_desc.va = cpu_to_be64(req->cmd->dma + - sizeof *cmd + - sizeof *buf); - buf->table_desc.key = - cpu_to_be32(target->srp_host->mr->rkey); - buf->table_desc.len = - cpu_to_be32(n * sizeof (struct srp_direct_buf)); - - for (i = 0; i < n; ++i) { - buf->desc_list[i].va = cpu_to_be64(sg_dma_address(&scat[i])); - buf->desc_list[i].key = - cpu_to_be32(target->srp_host->mr->rkey); - buf->desc_list[i].len = cpu_to_be32(sg_dma_len(&scat[i])); - - datalen += sg_dma_len(&scat[i]); - } + /* + * We have more than one scatter/gather entry, so build our indirect + * descriptor table, trying to merge as many entries as we can. + */ + indirect_hdr = (void *) cmd->add_data; - buf->len = cpu_to_be32(datalen); + ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr, + target->indirect_size, DMA_TO_DEVICE); - len = sizeof (struct srp_cmd) + - sizeof (struct srp_indirect_buf) + - n * sizeof (struct srp_direct_buf); - } - } else { + memset(&state, 0, sizeof(state)); + srp_map_sg(&state, target, req, scat, count); + + /* We've mapped the request, now pull as much of the indirect + * descriptor table as we can into the command buffer. If this + * target is not using an external indirect table, we are + * guaranteed to fit into the command, as the SCSI layer won't + * give us more S/G entries than we allow. + */ + if (state.ndesc == 1) { + /* + * Memory registration collapsed the sg-list into one entry, + * so use a direct descriptor. + */ struct srp_direct_buf *buf = (void *) cmd->add_data; - dma_addr_t dma; - - dma = dma_map_single(target->srp_host->dev->dma_device, - scmnd->request_buffer, scmnd->request_bufflen, - scmnd->sc_data_direction); - if (dma_mapping_error(dma)) { - printk(KERN_WARNING PFX "unable to map %p/%d (dir %d)\n", - scmnd->request_buffer, (int) scmnd->request_bufflen, - scmnd->sc_data_direction); - return -EINVAL; - } - pci_unmap_addr_set(req, direct_mapping, dma); + *buf = req->indirect_desc[0]; + goto map_complete; + } - buf->va = cpu_to_be64(dma); - buf->key = cpu_to_be32(target->srp_host->mr->rkey); - buf->len = cpu_to_be32(scmnd->request_bufflen); + if (unlikely(target->cmd_sg_cnt < state.ndesc && + !target->allow_ext_sg)) { + shost_printk(KERN_ERR, target->scsi_host, + "Could not fit S/G list into SRP_CMD\n"); + return -EIO; + } - fmt = SRP_DATA_DESC_DIRECT; + count = min(state.ndesc, target->cmd_sg_cnt); + table_len = state.ndesc * sizeof (struct srp_direct_buf); - len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); - } + fmt = SRP_DATA_DESC_INDIRECT; + len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf); + len += count * sizeof (struct srp_direct_buf); + + memcpy(indirect_hdr->desc_list, req->indirect_desc, + count * sizeof (struct srp_direct_buf)); + indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr); + indirect_hdr->table_desc.key = cpu_to_be32(target->rkey); + indirect_hdr->table_desc.len = cpu_to_be32(table_len); + indirect_hdr->len = cpu_to_be32(state.total_len); + + if (scmnd->sc_data_direction == DMA_TO_DEVICE) + cmd->data_out_desc_cnt = count; + else + cmd->data_in_desc_cnt = count; + + ib_dma_sync_single_for_device(ibdev, req->indirect_dma_addr, table_len, + DMA_TO_DEVICE); + +map_complete: if (scmnd->sc_data_direction == DMA_TO_DEVICE) cmd->buf_fmt = fmt << 4; else cmd->buf_fmt = fmt; - return len; } -static void srp_unmap_data(struct scsi_cmnd *scmnd, - struct srp_target_port *target, - struct srp_request *req) +/* + * Return an IU and possible credit to the free pool + */ +static void srp_put_tx_iu(struct srp_target_port *target, struct srp_iu *iu, + enum srp_iu_type iu_type) { - if (!scmnd->request_buffer || - (scmnd->sc_data_direction != DMA_TO_DEVICE && - scmnd->sc_data_direction != DMA_FROM_DEVICE)) - return; + unsigned long flags; - if (scmnd->use_sg) - dma_unmap_sg(target->srp_host->dev->dma_device, - (struct scatterlist *) scmnd->request_buffer, - scmnd->use_sg, scmnd->sc_data_direction); - else - dma_unmap_single(target->srp_host->dev->dma_device, - pci_unmap_addr(req, direct_mapping), - scmnd->request_bufflen, - scmnd->sc_data_direction); + spin_lock_irqsave(&target->lock, flags); + list_add(&iu->list, &target->free_tx); + if (iu_type != SRP_IU_RSP) + ++target->req_lim; + spin_unlock_irqrestore(&target->lock, flags); } -static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) +/* + * Must be called with target->lock held to protect req_lim and free_tx. + * If IU is not sent, it must be returned using srp_put_tx_iu(). + * + * Note: + * An upper limit for the number of allocated information units for each + * request type is: + * - SRP_IU_CMD: SRP_CMD_SQ_SIZE, since the SCSI mid-layer never queues + * more than Scsi_Host.can_queue requests. + * - SRP_IU_TSK_MGMT: SRP_TSK_MGMT_SQ_SIZE. + * - SRP_IU_RSP: 1, since a conforming SRP target never sends more than + * one unanswered SRP request to an initiator. + */ +static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, + enum srp_iu_type iu_type) { - struct srp_request *req; - struct scsi_cmnd *scmnd; - unsigned long flags; - s32 delta; + s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; + struct srp_iu *iu; - delta = (s32) be32_to_cpu(rsp->req_lim_delta); + srp_send_completion(target->send_cq, target); - spin_lock_irqsave(target->scsi_host->host_lock, flags); + if (list_empty(&target->free_tx)) + return NULL; - target->req_lim += delta; + /* Initiator responses to target requests do not consume credits */ + if (iu_type != SRP_IU_RSP) { + if (target->req_lim <= rsv) { + ++target->zero_req_lim; + return NULL; + } - req = &target->req_ring[rsp->tag & ~SRP_TAG_TSK_MGMT]; + --target->req_lim; + } + + iu = list_first_entry(&target->free_tx, struct srp_iu, list); + list_del(&iu->list); + return iu; +} + +static int srp_post_send(struct srp_target_port *target, + struct srp_iu *iu, int len) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + + list.addr = iu->dma; + list.length = len; + list.lkey = target->lkey; + + wr.next = NULL; + wr.wr_id = (uintptr_t) iu; + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + return ib_post_send(target->qp, &wr, &bad_wr); +} + +static int srp_post_recv(struct srp_target_port *target, struct srp_iu *iu) +{ + struct ib_recv_wr wr, *bad_wr; + struct ib_sge list; + + list.addr = iu->dma; + list.length = iu->size; + list.lkey = target->lkey; + + wr.next = NULL; + wr.wr_id = (uintptr_t) iu; + wr.sg_list = &list; + wr.num_sge = 1; + + return ib_post_recv(target->qp, &wr, &bad_wr); +} + +static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) +{ + struct srp_request *req; + struct scsi_cmnd *scmnd; + unsigned long flags; if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { - if (be32_to_cpu(rsp->resp_data_len) < 4) - req->tsk_status = -1; - else - req->tsk_status = rsp->data[3]; - complete(&req->done); + spin_lock_irqsave(&target->lock, flags); + target->req_lim += be32_to_cpu(rsp->req_lim_delta); + spin_unlock_irqrestore(&target->lock, flags); + + target->tsk_mgmt_status = -1; + if (be32_to_cpu(rsp->resp_data_len) >= 4) + target->tsk_mgmt_status = rsp->data[3]; + complete(&target->tsk_mgmt_done); } else { - scmnd = req->scmnd; - if (!scmnd) - printk(KERN_ERR "Null scmnd for RSP w/tag %016llx\n", - (unsigned long long) rsp->tag); + req = &target->req_ring[rsp->tag]; + scmnd = srp_claim_req(target, req, NULL, NULL); + if (!scmnd) { + shost_printk(KERN_ERR, target->scsi_host, + "Null scmnd for RSP w/tag %016llx\n", + (unsigned long long) rsp->tag); + + spin_lock_irqsave(&target->lock, flags); + target->req_lim += be32_to_cpu(rsp->req_lim_delta); + spin_unlock_irqrestore(&target->lock, flags); + + return; + } scmnd->result = rsp->status; if (rsp->flags & SRP_RSP_FLAG_SNSVALID) { @@ -660,61 +1644,99 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) } if (rsp->flags & (SRP_RSP_FLAG_DOOVER | SRP_RSP_FLAG_DOUNDER)) - scmnd->resid = be32_to_cpu(rsp->data_out_res_cnt); + scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt)); else if (rsp->flags & (SRP_RSP_FLAG_DIOVER | SRP_RSP_FLAG_DIUNDER)) - scmnd->resid = be32_to_cpu(rsp->data_in_res_cnt); + scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); - srp_unmap_data(scmnd, target, req); + srp_free_req(target, req, scmnd, + be32_to_cpu(rsp->req_lim_delta)); - if (!req->tsk_mgmt) { - req->scmnd = NULL; - scmnd->host_scribble = (void *) -1L; - scmnd->scsi_done(scmnd); + scmnd->host_scribble = NULL; + scmnd->scsi_done(scmnd); + } +} - list_del(&req->list); - req->next = target->req_head; - target->req_head = rsp->tag & ~SRP_TAG_TSK_MGMT; - } else - req->cmd_done = 1; +static int srp_response_common(struct srp_target_port *target, s32 req_delta, + void *rsp, int len) +{ + struct ib_device *dev = target->srp_host->srp_dev->dev; + unsigned long flags; + struct srp_iu *iu; + int err; + + spin_lock_irqsave(&target->lock, flags); + target->req_lim += req_delta; + iu = __srp_get_tx_iu(target, SRP_IU_RSP); + spin_unlock_irqrestore(&target->lock, flags); + + if (!iu) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "no IU available to send response\n"); + return 1; } - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); + ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE); + memcpy(iu->buf, rsp, len); + ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE); + + err = srp_post_send(target, iu, len); + if (err) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "unable to post response: %d\n", err); + srp_put_tx_iu(target, iu, SRP_IU_RSP); + } + + return err; } -static void srp_reconnect_work(void *target_ptr) +static void srp_process_cred_req(struct srp_target_port *target, + struct srp_cred_req *req) { - struct srp_target_port *target = target_ptr; + struct srp_cred_rsp rsp = { + .opcode = SRP_CRED_RSP, + .tag = req->tag, + }; + s32 delta = be32_to_cpu(req->req_lim_delta); + + if (srp_response_common(target, delta, &rsp, sizeof rsp)) + shost_printk(KERN_ERR, target->scsi_host, PFX + "problems processing SRP_CRED_REQ\n"); +} - srp_reconnect_target(target); +static void srp_process_aer_req(struct srp_target_port *target, + struct srp_aer_req *req) +{ + struct srp_aer_rsp rsp = { + .opcode = SRP_AER_RSP, + .tag = req->tag, + }; + s32 delta = be32_to_cpu(req->req_lim_delta); + + shost_printk(KERN_ERR, target->scsi_host, PFX + "ignoring AER for LUN %llu\n", be64_to_cpu(req->lun)); + + if (srp_response_common(target, delta, &rsp, sizeof rsp)) + shost_printk(KERN_ERR, target->scsi_host, PFX + "problems processing SRP_AER_REQ\n"); } static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) { - struct srp_iu *iu; + struct ib_device *dev = target->srp_host->srp_dev->dev; + struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id; + int res; u8 opcode; - iu = target->rx_ring[wc->wr_id & ~SRP_OP_RECV]; - - dma_sync_single_for_cpu(target->srp_host->dev->dma_device, iu->dma, - target->max_ti_iu_len, DMA_FROM_DEVICE); + ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_ti_iu_len, + DMA_FROM_DEVICE); opcode = *(u8 *) iu->buf; if (0) { - int i; - - printk(KERN_ERR PFX "recv completion, opcode 0x%02x\n", opcode); - - for (i = 0; i < wc->byte_len; ++i) { - if (i % 8 == 0) - printk(KERN_ERR " [%02x] ", i); - printk(" %02x", ((u8 *) iu->buf)[i]); - if ((i + 1) % 8 == 0) - printk("\n"); - } - - if (wc->byte_len % 8) - printk("\n"); + shost_printk(KERN_ERR, target->scsi_host, + PFX "recv completion, opcode 0x%02x\n", opcode); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 8, 1, + iu->buf, wc->byte_len, true); } switch (opcode) { @@ -722,218 +1744,234 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) srp_process_rsp(target, iu->buf); break; + case SRP_CRED_REQ: + srp_process_cred_req(target, iu->buf); + break; + + case SRP_AER_REQ: + srp_process_aer_req(target, iu->buf); + break; + case SRP_T_LOGOUT: /* XXX Handle target logout */ - printk(KERN_WARNING PFX "Got target logout request\n"); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Got target logout request\n"); break; default: - printk(KERN_WARNING PFX "Unhandled SRP opcode 0x%02x\n", opcode); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled SRP opcode 0x%02x\n", opcode); break; } - dma_sync_single_for_device(target->srp_host->dev->dma_device, iu->dma, - target->max_ti_iu_len, DMA_FROM_DEVICE); -} + ib_dma_sync_single_for_device(dev, iu->dma, target->max_ti_iu_len, + DMA_FROM_DEVICE); -static void srp_completion(struct ib_cq *cq, void *target_ptr) -{ - struct srp_target_port *target = target_ptr; - struct ib_wc wc; - unsigned long flags; - - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - while (ib_poll_cq(cq, 1, &wc) > 0) { - if (wc.status) { - printk(KERN_ERR PFX "failed %s status %d\n", - wc.wr_id & SRP_OP_RECV ? "receive" : "send", - wc.status); - spin_lock_irqsave(target->scsi_host->host_lock, flags); - if (target->state == SRP_TARGET_LIVE) - schedule_work(&target->work); - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); - break; - } - - if (wc.wr_id & SRP_OP_RECV) - srp_handle_recv(target, &wc); - else - ++target->tx_tail; - } + res = srp_post_recv(target, iu); + if (res != 0) + shost_printk(KERN_ERR, target->scsi_host, + PFX "Recv failed with error code %d\n", res); } -static int __srp_post_recv(struct srp_target_port *target) +/** + * srp_tl_err_work() - handle a transport layer error + * @work: Work structure embedded in an SRP target port. + * + * Note: This function may get invoked before the rport has been created, + * hence the target->rport test. + */ +static void srp_tl_err_work(struct work_struct *work) { - struct srp_iu *iu; - struct ib_sge list; - struct ib_recv_wr wr, *bad_wr; - unsigned int next; - int ret; - - next = target->rx_head & (SRP_RQ_SIZE - 1); - wr.wr_id = next | SRP_OP_RECV; - iu = target->rx_ring[next]; - - list.addr = iu->dma; - list.length = iu->size; - list.lkey = target->srp_host->mr->lkey; - - wr.next = NULL; - wr.sg_list = &list; - wr.num_sge = 1; - - ret = ib_post_recv(target->qp, &wr, &bad_wr); - if (!ret) - ++target->rx_head; + struct srp_target_port *target; - return ret; + target = container_of(work, struct srp_target_port, tl_err_work); + if (target->rport) + srp_start_tl_fail_timers(target->rport); } -static int srp_post_recv(struct srp_target_port *target) +static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status, + bool send_err, struct srp_target_port *target) { - unsigned long flags; - int ret; - - spin_lock_irqsave(target->scsi_host->host_lock, flags); - ret = __srp_post_recv(target); - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); - - return ret; + if (target->connected && !target->qp_in_error) { + if (wr_id & LOCAL_INV_WR_ID_MASK) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "LOCAL_INV failed with status %d\n", + wc_status); + } else if (wr_id & FAST_REG_WR_ID_MASK) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "FAST_REG_MR failed status %d\n", + wc_status); + } else { + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %d for iu %p\n", + send_err ? "send" : "receive", + wc_status, (void *)(uintptr_t)wr_id); + } + queue_work(system_long_wq, &target->tl_err_work); + } + target->qp_in_error = true; } -/* - * Must be called with target->scsi_host->host_lock held to protect - * req_lim and tx_head. - */ -static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target) +static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) { - if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE) - return NULL; + struct srp_target_port *target = target_ptr; + struct ib_wc wc; - return target->tx_ring[target->tx_head & SRP_SQ_SIZE]; + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + while (ib_poll_cq(cq, 1, &wc) > 0) { + if (likely(wc.status == IB_WC_SUCCESS)) { + srp_handle_recv(target, &wc); + } else { + srp_handle_qp_err(wc.wr_id, wc.status, false, target); + } + } } -/* - * Must be called with target->scsi_host->host_lock held to protect - * req_lim and tx_head. - */ -static int __srp_post_send(struct srp_target_port *target, - struct srp_iu *iu, int len) +static void srp_send_completion(struct ib_cq *cq, void *target_ptr) { - struct ib_sge list; - struct ib_send_wr wr, *bad_wr; - int ret = 0; - - if (target->req_lim < 1) { - printk(KERN_ERR PFX "Target has req_lim %d\n", target->req_lim); - return -EAGAIN; - } - - list.addr = iu->dma; - list.length = len; - list.lkey = target->srp_host->mr->lkey; - - wr.next = NULL; - wr.wr_id = target->tx_head & SRP_SQ_SIZE; - wr.sg_list = &list; - wr.num_sge = 1; - wr.opcode = IB_WR_SEND; - wr.send_flags = IB_SEND_SIGNALED; - - ret = ib_post_send(target->qp, &wr, &bad_wr); + struct srp_target_port *target = target_ptr; + struct ib_wc wc; + struct srp_iu *iu; - if (!ret) { - ++target->tx_head; - --target->req_lim; + while (ib_poll_cq(cq, 1, &wc) > 0) { + if (likely(wc.status == IB_WC_SUCCESS)) { + iu = (struct srp_iu *) (uintptr_t) wc.wr_id; + list_add(&iu->list, &target->free_tx); + } else { + srp_handle_qp_err(wc.wr_id, wc.status, true, target); + } } - - return ret; } -static int srp_queuecommand(struct scsi_cmnd *scmnd, - void (*done)(struct scsi_cmnd *)) +static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) { - struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_target_port *target = host_to_target(shost); + struct srp_rport *rport = target->rport; struct srp_request *req; struct srp_iu *iu; struct srp_cmd *cmd; - long req_index; - int len; + struct ib_device *dev; + unsigned long flags; + int len, ret; + const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler; - if (target->state == SRP_TARGET_CONNECTING) - goto err; + /* + * The SCSI EH thread is the only context from which srp_queuecommand() + * can get invoked for blocked devices (SDEV_BLOCK / + * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by + * locking the rport mutex if invoked from inside the SCSI EH. + */ + if (in_scsi_eh) + mutex_lock(&rport->mutex); - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) { - scmnd->result = DID_BAD_TARGET << 16; - done(scmnd); - return 0; - } + scmnd->result = srp_chkready(target->rport); + if (unlikely(scmnd->result)) + goto err; - iu = __srp_get_tx_iu(target); + spin_lock_irqsave(&target->lock, flags); + iu = __srp_get_tx_iu(target, SRP_IU_CMD); if (!iu) - goto err; + goto err_unlock; - dma_sync_single_for_cpu(target->srp_host->dev->dma_device, iu->dma, - SRP_MAX_IU_LEN, DMA_TO_DEVICE); + req = list_first_entry(&target->free_reqs, struct srp_request, list); + list_del(&req->list); + spin_unlock_irqrestore(&target->lock, flags); - req_index = target->req_head; + dev = target->srp_host->srp_dev->dev; + ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len, + DMA_TO_DEVICE); - scmnd->scsi_done = done; - scmnd->result = 0; - scmnd->host_scribble = (void *) req_index; + scmnd->host_scribble = (void *) req; cmd = iu->buf; memset(cmd, 0, sizeof *cmd); cmd->opcode = SRP_CMD; cmd->lun = cpu_to_be64((u64) scmnd->device->lun << 48); - cmd->tag = req_index; + cmd->tag = req->index; memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len); - req = &target->req_ring[req_index]; - req->scmnd = scmnd; req->cmd = iu; - req->cmd_done = 0; - req->tsk_mgmt = NULL; len = srp_map_data(scmnd, target, req); if (len < 0) { - printk(KERN_ERR PFX "Failed to map data\n"); - goto err; - } - - if (__srp_post_recv(target)) { - printk(KERN_ERR PFX "Recv failed\n"); - goto err_unmap; + shost_printk(KERN_ERR, target->scsi_host, + PFX "Failed to map data (%d)\n", len); + /* + * If we ran out of memory descriptors (-ENOMEM) because an + * application is queuing many requests with more than + * max_pages_per_mr sg-list elements, tell the SCSI mid-layer + * to reduce queue depth temporarily. + */ + scmnd->result = len == -ENOMEM ? + DID_OK << 16 | QUEUE_FULL << 1 : DID_ERROR << 16; + goto err_iu; } - dma_sync_single_for_device(target->srp_host->dev->dma_device, iu->dma, - SRP_MAX_IU_LEN, DMA_TO_DEVICE); + ib_dma_sync_single_for_device(dev, iu->dma, target->max_iu_len, + DMA_TO_DEVICE); - if (__srp_post_send(target, iu, len)) { - printk(KERN_ERR PFX "Send failed\n"); + if (srp_post_send(target, iu, len)) { + shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n"); goto err_unmap; } - target->req_head = req->next; - list_add_tail(&req->list, &target->req_queue); + ret = 0; - return 0; +unlock_rport: + if (in_scsi_eh) + mutex_unlock(&rport->mutex); + + return ret; err_unmap: srp_unmap_data(scmnd, target, req); +err_iu: + srp_put_tx_iu(target, iu, SRP_IU_CMD); + + /* + * Avoid that the loops that iterate over the request ring can + * encounter a dangling SCSI command pointer. + */ + req->scmnd = NULL; + + spin_lock_irqsave(&target->lock, flags); + list_add(&req->list, &target->free_reqs); + +err_unlock: + spin_unlock_irqrestore(&target->lock, flags); + err: - return SCSI_MLQUEUE_HOST_BUSY; + if (scmnd->result) { + scmnd->scsi_done(scmnd); + ret = 0; + } else { + ret = SCSI_MLQUEUE_HOST_BUSY; + } + + goto unlock_rport; } +/* + * Note: the resources allocated in this function are freed in + * srp_free_target_ib(). + */ static int srp_alloc_iu_bufs(struct srp_target_port *target) { int i; - for (i = 0; i < SRP_RQ_SIZE; ++i) { + target->rx_ring = kzalloc(target->queue_size * sizeof(*target->rx_ring), + GFP_KERNEL); + if (!target->rx_ring) + goto err_no_ring; + target->tx_ring = kzalloc(target->queue_size * sizeof(*target->tx_ring), + GFP_KERNEL); + if (!target->tx_ring) + goto err_no_ring; + + for (i = 0; i < target->queue_size; ++i) { target->rx_ring[i] = srp_alloc_iu(target->srp_host, target->max_ti_iu_len, GFP_KERNEL, DMA_FROM_DEVICE); @@ -941,34 +1979,143 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) goto err; } - for (i = 0; i < SRP_SQ_SIZE + 1; ++i) { + for (i = 0; i < target->queue_size; ++i) { target->tx_ring[i] = srp_alloc_iu(target->srp_host, - SRP_MAX_IU_LEN, + target->max_iu_len, GFP_KERNEL, DMA_TO_DEVICE); if (!target->tx_ring[i]) goto err; + + list_add(&target->tx_ring[i]->list, &target->free_tx); } return 0; err: - for (i = 0; i < SRP_RQ_SIZE; ++i) { + for (i = 0; i < target->queue_size; ++i) { srp_free_iu(target->srp_host, target->rx_ring[i]); - target->rx_ring[i] = NULL; - } - - for (i = 0; i < SRP_SQ_SIZE + 1; ++i) { srp_free_iu(target->srp_host, target->tx_ring[i]); - target->tx_ring[i] = NULL; } + +err_no_ring: + kfree(target->tx_ring); + target->tx_ring = NULL; + kfree(target->rx_ring); + target->rx_ring = NULL; + return -ENOMEM; } +static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask) +{ + uint64_t T_tr_ns, max_compl_time_ms; + uint32_t rq_tmo_jiffies; + + /* + * According to section 11.2.4.2 in the IBTA spec (Modify Queue Pair, + * table 91), both the QP timeout and the retry count have to be set + * for RC QP's during the RTR to RTS transition. + */ + WARN_ON_ONCE((attr_mask & (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)) != + (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)); + + /* + * Set target->rq_tmo_jiffies to one second more than the largest time + * it can take before an error completion is generated. See also + * C9-140..142 in the IBTA spec for more information about how to + * convert the QP Local ACK Timeout value to nanoseconds. + */ + T_tr_ns = 4096 * (1ULL << qp_attr->timeout); + max_compl_time_ms = qp_attr->retry_cnt * 4 * T_tr_ns; + do_div(max_compl_time_ms, NSEC_PER_MSEC); + rq_tmo_jiffies = msecs_to_jiffies(max_compl_time_ms + 1000); + + return rq_tmo_jiffies; +} + +static void srp_cm_rep_handler(struct ib_cm_id *cm_id, + struct srp_login_rsp *lrsp, + struct srp_target_port *target) +{ + struct ib_qp_attr *qp_attr = NULL; + int attr_mask = 0; + int ret; + int i; + + if (lrsp->opcode == SRP_LOGIN_RSP) { + target->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len); + target->req_lim = be32_to_cpu(lrsp->req_lim_delta); + + /* + * Reserve credits for task management so we don't + * bounce requests back to the SCSI mid-layer. + */ + target->scsi_host->can_queue + = min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, + target->scsi_host->can_queue); + target->scsi_host->cmd_per_lun + = min_t(int, target->scsi_host->can_queue, + target->scsi_host->cmd_per_lun); + } else { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); + ret = -ECONNRESET; + goto error; + } + + if (!target->rx_ring) { + ret = srp_alloc_iu_bufs(target); + if (ret) + goto error; + } + + ret = -ENOMEM; + qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + if (!qp_attr) + goto error; + + qp_attr->qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + ret = ib_modify_qp(target->qp, qp_attr, attr_mask); + if (ret) + goto error_free; + + for (i = 0; i < target->queue_size; i++) { + struct srp_iu *iu = target->rx_ring[i]; + ret = srp_post_recv(target, iu); + if (ret) + goto error_free; + } + + qp_attr->qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); + + ret = ib_modify_qp(target->qp, qp_attr, attr_mask); + if (ret) + goto error_free; + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + +error_free: + kfree(qp_attr); + +error: + target->status = ret; +} + static void srp_cm_rej_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event, struct srp_target_port *target) { + struct Scsi_Host *shost = target->scsi_host; struct ib_class_port_info *cpi; int opcode; @@ -985,8 +2132,7 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, break; case IB_CM_REJ_PORT_REDIRECT: - if (topspin_workarounds && - !memcmp(&target->ioc_guid, topspin_oui, 3)) { + if (srp_target_is_topspin(target)) { /* * Topspin/Cisco SRP gateways incorrectly send * reject reason code 25 when they mean 24 @@ -995,19 +2141,22 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, memcpy(target->path.dgid.raw, event->param.rej_rcvd.ari, 16); - printk(KERN_DEBUG PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n", - (unsigned long long) be64_to_cpu(target->path.dgid.global.subnet_prefix), - (unsigned long long) be64_to_cpu(target->path.dgid.global.interface_id)); + shost_printk(KERN_DEBUG, shost, + PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n", + (unsigned long long) be64_to_cpu(target->path.dgid.global.subnet_prefix), + (unsigned long long) be64_to_cpu(target->path.dgid.global.interface_id)); target->status = SRP_PORT_REDIRECT; } else { - printk(KERN_WARNING " REJ reason: IB_CM_REJ_PORT_REDIRECT\n"); + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_PORT_REDIRECT\n"); target->status = -ECONNRESET; } break; case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: - printk(KERN_WARNING " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); target->status = -ECONNRESET; break; @@ -1018,20 +2167,28 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, u32 reason = be32_to_cpu(rej->reason); if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE) - printk(KERN_WARNING PFX - "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); + shost_printk(KERN_WARNING, shost, + PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); else - printk(KERN_WARNING PFX - "SRP LOGIN REJECTED, reason 0x%08x\n", reason); + shost_printk(KERN_WARNING, shost, PFX + "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n", + target->path.sgid.raw, + target->orig_dgid, reason); } else - printk(KERN_WARNING " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," - " opcode 0x%02x\n", opcode); + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," + " opcode 0x%02x\n", opcode); target->status = -ECONNRESET; break; + case IB_CM_REJ_STALE_CONN: + shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n"); + target->status = SRP_STALE_CONN; + break; + default: - printk(KERN_WARNING " REJ reason 0x%x\n", - event->param.rej_rcvd.reason); + shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", + event->param.rej_rcvd.reason); target->status = -ECONNRESET; } } @@ -1039,244 +2196,498 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct srp_target_port *target = cm_id->context; - struct ib_qp_attr *qp_attr = NULL; - int attr_mask = 0; int comp = 0; - int opcode = 0; switch (event->event) { case IB_CM_REQ_ERROR: - printk(KERN_DEBUG PFX "Sending CM REQ failed\n"); + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM REQ failed\n"); comp = 1; target->status = -ECONNRESET; break; case IB_CM_REP_RECEIVED: comp = 1; - opcode = *(u8 *) event->private_data; - - if (opcode == SRP_LOGIN_RSP) { - struct srp_login_rsp *rsp = event->private_data; - - target->max_ti_iu_len = be32_to_cpu(rsp->max_ti_iu_len); - target->req_lim = be32_to_cpu(rsp->req_lim_delta); - - target->scsi_host->can_queue = min(target->req_lim, - target->scsi_host->can_queue); - } else { - printk(KERN_WARNING PFX "Unhandled RSP opcode %#x\n", opcode); - target->status = -ECONNRESET; - break; - } - - target->status = srp_alloc_iu_bufs(target); - if (target->status) - break; - - qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); - if (!qp_attr) { - target->status = -ENOMEM; - break; - } - - qp_attr->qp_state = IB_QPS_RTR; - target->status = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); - if (target->status) - break; - - target->status = ib_modify_qp(target->qp, qp_attr, attr_mask); - if (target->status) - break; - - target->status = srp_post_recv(target); - if (target->status) - break; - - qp_attr->qp_state = IB_QPS_RTS; - target->status = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); - if (target->status) - break; - - target->status = ib_modify_qp(target->qp, qp_attr, attr_mask); - if (target->status) - break; - - target->status = ib_send_cm_rtu(cm_id, NULL, 0); - if (target->status) - break; - + srp_cm_rep_handler(cm_id, event->private_data, target); break; case IB_CM_REJ_RECEIVED: - printk(KERN_DEBUG PFX "REJ received\n"); + shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); comp = 1; srp_cm_rej_handler(cm_id, event, target); break; - case IB_CM_MRA_RECEIVED: - printk(KERN_ERR PFX "MRA received\n"); - break; - - case IB_CM_DREP_RECEIVED: + case IB_CM_DREQ_RECEIVED: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "DREQ received - connection closed\n"); + srp_change_conn_state(target, false); + if (ib_send_cm_drep(cm_id, NULL, 0)) + shost_printk(KERN_ERR, target->scsi_host, + PFX "Sending CM DREP failed\n"); + queue_work(system_long_wq, &target->tl_err_work); break; case IB_CM_TIMEWAIT_EXIT: - printk(KERN_ERR PFX "connection closed\n"); - + shost_printk(KERN_ERR, target->scsi_host, + PFX "connection closed\n"); comp = 1; + target->status = 0; break; + case IB_CM_MRA_RECEIVED: + case IB_CM_DREQ_ERROR: + case IB_CM_DREP_RECEIVED: + break; + default: - printk(KERN_WARNING PFX "Unhandled CM event %d\n", event->event); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled CM event %d\n", event->event); break; } if (comp) complete(&target->done); - kfree(qp_attr); - return 0; } -static int srp_send_tsk_mgmt(struct scsi_cmnd *scmnd, u8 func) +/** + * srp_change_queue_type - changing device queue tag type + * @sdev: scsi device struct + * @tag_type: requested tag type + * + * Returns queue tag type. + */ +static int +srp_change_queue_type(struct scsi_device *sdev, int tag_type) { - struct srp_target_port *target = host_to_target(scmnd->device->host); - struct srp_request *req; + if (sdev->tagged_supported) { + scsi_set_tag_type(sdev, tag_type); + if (tag_type) + scsi_activate_tcq(sdev, sdev->queue_depth); + else + scsi_deactivate_tcq(sdev, sdev->queue_depth); + } else + tag_type = 0; + + return tag_type; +} + +/** + * srp_change_queue_depth - setting device queue depth + * @sdev: scsi device struct + * @qdepth: requested queue depth + * @reason: SCSI_QDEPTH_DEFAULT/SCSI_QDEPTH_QFULL/SCSI_QDEPTH_RAMP_UP + * (see include/scsi/scsi_host.h for definition) + * + * Returns queue depth. + */ +static int +srp_change_queue_depth(struct scsi_device *sdev, int qdepth, int reason) +{ + struct Scsi_Host *shost = sdev->host; + int max_depth; + if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) { + max_depth = shost->can_queue; + if (!sdev->tagged_supported) + max_depth = 1; + if (qdepth > max_depth) + qdepth = max_depth; + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth); + } else if (reason == SCSI_QDEPTH_QFULL) + scsi_track_queue_full(sdev, qdepth); + else + return -EOPNOTSUPP; + + return sdev->queue_depth; +} + +static int srp_send_tsk_mgmt(struct srp_target_port *target, + u64 req_tag, unsigned int lun, u8 func) +{ + struct srp_rport *rport = target->rport; + struct ib_device *dev = target->srp_host->srp_dev->dev; struct srp_iu *iu; struct srp_tsk_mgmt *tsk_mgmt; - int req_index; - int ret = FAILED; - spin_lock_irq(target->scsi_host->host_lock); + if (!target->connected || target->qp_in_error) + return -1; - if (scmnd->host_scribble == (void *) -1L) - goto out; + init_completion(&target->tsk_mgmt_done); - req_index = (long) scmnd->host_scribble; - printk(KERN_ERR "Abort for req_index %d\n", req_index); + /* + * Lock the rport mutex to avoid that srp_create_target_ib() is + * invoked while a task management function is being sent. + */ + mutex_lock(&rport->mutex); + spin_lock_irq(&target->lock); + iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); + spin_unlock_irq(&target->lock); - req = &target->req_ring[req_index]; - init_completion(&req->done); + if (!iu) { + mutex_unlock(&rport->mutex); - iu = __srp_get_tx_iu(target); - if (!iu) - goto out; + return -1; + } + ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, + DMA_TO_DEVICE); tsk_mgmt = iu->buf; memset(tsk_mgmt, 0, sizeof *tsk_mgmt); tsk_mgmt->opcode = SRP_TSK_MGMT; - tsk_mgmt->lun = cpu_to_be64((u64) scmnd->device->lun << 48); - tsk_mgmt->tag = req_index | SRP_TAG_TSK_MGMT; + tsk_mgmt->lun = cpu_to_be64((u64) lun << 48); + tsk_mgmt->tag = req_tag | SRP_TAG_TSK_MGMT; tsk_mgmt->tsk_mgmt_func = func; - tsk_mgmt->task_tag = req_index; + tsk_mgmt->task_tag = req_tag; - if (__srp_post_send(target, iu, sizeof *tsk_mgmt)) - goto out; + ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt, + DMA_TO_DEVICE); + if (srp_post_send(target, iu, sizeof *tsk_mgmt)) { + srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT); + mutex_unlock(&rport->mutex); - req->tsk_mgmt = iu; + return -1; + } + mutex_unlock(&rport->mutex); - spin_unlock_irq(target->scsi_host->host_lock); - if (!wait_for_completion_timeout(&req->done, + if (!wait_for_completion_timeout(&target->tsk_mgmt_done, msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS))) - return FAILED; - spin_lock_irq(target->scsi_host->host_lock); + return -1; - if (req->cmd_done) { - list_del(&req->list); - req->next = target->req_head; - target->req_head = req_index; - - scmnd->scsi_done(scmnd); - } else if (!req->tsk_status) { - scmnd->result = DID_ABORT << 16; - ret = SUCCESS; - } - -out: - spin_unlock_irq(target->scsi_host->host_lock); - return ret; + return 0; } static int srp_abort(struct scsi_cmnd *scmnd) { - printk(KERN_ERR "SRP abort called\n"); + struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_request *req = (struct srp_request *) scmnd->host_scribble; + int ret; - return srp_send_tsk_mgmt(scmnd, SRP_TSK_ABORT_TASK); + shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); + + if (!req || !srp_claim_req(target, req, NULL, scmnd)) + return SUCCESS; + if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, + SRP_TSK_ABORT_TASK) == 0) + ret = SUCCESS; + else if (target->rport->state == SRP_RPORT_LOST) + ret = FAST_IO_FAIL; + else + ret = FAILED; + srp_free_req(target, req, scmnd, 0); + scmnd->result = DID_ABORT << 16; + scmnd->scsi_done(scmnd); + + return ret; } static int srp_reset_device(struct scsi_cmnd *scmnd) { - printk(KERN_ERR "SRP reset_device called\n"); + struct srp_target_port *target = host_to_target(scmnd->device->host); + int i; + + shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); + + if (srp_send_tsk_mgmt(target, SRP_TAG_NO_REQ, scmnd->device->lun, + SRP_TSK_LUN_RESET)) + return FAILED; + if (target->tsk_mgmt_status) + return FAILED; + + for (i = 0; i < target->req_ring_size; ++i) { + struct srp_request *req = &target->req_ring[i]; + srp_finish_req(target, req, scmnd->device, DID_RESET << 16); + } - return srp_send_tsk_mgmt(scmnd, SRP_TSK_LUN_RESET); + return SUCCESS; } static int srp_reset_host(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); - int ret = FAILED; - printk(KERN_ERR PFX "SRP reset_host called\n"); + shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); - if (!srp_reconnect_target(target)) - ret = SUCCESS; + return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED; +} - return ret; +static int srp_slave_configure(struct scsi_device *sdev) +{ + struct Scsi_Host *shost = sdev->host; + struct srp_target_port *target = host_to_target(shost); + struct request_queue *q = sdev->request_queue; + unsigned long timeout; + + if (sdev->type == TYPE_DISK) { + timeout = max_t(unsigned, 30 * HZ, target->rq_tmo_jiffies); + blk_queue_rq_timeout(q, timeout); + } + + return 0; +} + +static ssize_t show_id_ext(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "0x%016llx\n", + (unsigned long long) be64_to_cpu(target->id_ext)); } +static ssize_t show_ioc_guid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "0x%016llx\n", + (unsigned long long) be64_to_cpu(target->ioc_guid)); +} + +static ssize_t show_service_id(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "0x%016llx\n", + (unsigned long long) be64_to_cpu(target->service_id)); +} + +static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "0x%04x\n", be16_to_cpu(target->path.pkey)); +} + +static ssize_t show_sgid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%pI6\n", target->path.sgid.raw); +} + +static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%pI6\n", target->path.dgid.raw); +} + +static ssize_t show_orig_dgid(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%pI6\n", target->orig_dgid); +} + +static ssize_t show_req_lim(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->req_lim); +} + +static ssize_t show_zero_req_lim(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->zero_req_lim); +} + +static ssize_t show_local_ib_port(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->srp_host->port); +} + +static ssize_t show_local_ib_device(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); +} + +static ssize_t show_comp_vector(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->comp_vector); +} + +static ssize_t show_tl_retry_count(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->tl_retry_count); +} + +static ssize_t show_cmd_sg_entries(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%u\n", target->cmd_sg_cnt); +} + +static ssize_t show_allow_ext_sg(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%s\n", target->allow_ext_sg ? "true" : "false"); +} + +static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL); +static DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL); +static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL); +static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); +static DEVICE_ATTR(sgid, S_IRUGO, show_sgid, NULL); +static DEVICE_ATTR(dgid, S_IRUGO, show_dgid, NULL); +static DEVICE_ATTR(orig_dgid, S_IRUGO, show_orig_dgid, NULL); +static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); +static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); +static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); +static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); +static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL); +static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL); +static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); +static DEVICE_ATTR(allow_ext_sg, S_IRUGO, show_allow_ext_sg, NULL); + +static struct device_attribute *srp_host_attrs[] = { + &dev_attr_id_ext, + &dev_attr_ioc_guid, + &dev_attr_service_id, + &dev_attr_pkey, + &dev_attr_sgid, + &dev_attr_dgid, + &dev_attr_orig_dgid, + &dev_attr_req_lim, + &dev_attr_zero_req_lim, + &dev_attr_local_ib_port, + &dev_attr_local_ib_device, + &dev_attr_comp_vector, + &dev_attr_tl_retry_count, + &dev_attr_cmd_sg_entries, + &dev_attr_allow_ext_sg, + NULL +}; + static struct scsi_host_template srp_template = { .module = THIS_MODULE, - .name = DRV_NAME, + .name = "InfiniBand SRP initiator", + .proc_name = DRV_NAME, + .slave_configure = srp_slave_configure, .info = srp_target_info, .queuecommand = srp_queuecommand, + .change_queue_depth = srp_change_queue_depth, + .change_queue_type = srp_change_queue_type, .eh_abort_handler = srp_abort, .eh_device_reset_handler = srp_reset_device, .eh_host_reset_handler = srp_reset_host, - .can_queue = SRP_SQ_SIZE, + .skip_settle_delay = true, + .sg_tablesize = SRP_DEF_SG_TABLESIZE, + .can_queue = SRP_DEFAULT_CMD_SQ_SIZE, .this_id = -1, - .sg_tablesize = SRP_MAX_INDIRECT, - .cmd_per_lun = SRP_SQ_SIZE, - .use_clustering = ENABLE_CLUSTERING + .cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE, + .use_clustering = ENABLE_CLUSTERING, + .shost_attrs = srp_host_attrs }; static int srp_add_target(struct srp_host *host, struct srp_target_port *target) { + struct srp_rport_identifiers ids; + struct srp_rport *rport; + sprintf(target->target_name, "SRP.T10:%016llX", (unsigned long long) be64_to_cpu(target->id_ext)); - if (scsi_add_host(target->scsi_host, host->dev->dma_device)) + if (scsi_add_host(target->scsi_host, host->srp_dev->dev->dma_device)) return -ENODEV; - down(&host->target_mutex); + memcpy(ids.port_id, &target->id_ext, 8); + memcpy(ids.port_id + 8, &target->ioc_guid, 8); + ids.roles = SRP_RPORT_ROLE_TARGET; + rport = srp_rport_add(target->scsi_host, &ids); + if (IS_ERR(rport)) { + scsi_remove_host(target->scsi_host); + return PTR_ERR(rport); + } + + rport->lld_data = target; + target->rport = rport; + + spin_lock(&host->target_lock); list_add_tail(&target->list, &host->target_list); - up(&host->target_mutex); + spin_unlock(&host->target_lock); target->state = SRP_TARGET_LIVE; - /* XXX: are we supposed to have a definition of SCAN_WILD_CARD ?? */ scsi_scan_target(&target->scsi_host->shost_gendev, - 0, target->scsi_id, ~0, 0); + 0, target->scsi_id, SCAN_WILD_CARD, 0); return 0; } -static void srp_release_class_dev(struct class_device *class_dev) +static void srp_release_dev(struct device *dev) { struct srp_host *host = - container_of(class_dev, struct srp_host, class_dev); + container_of(dev, struct srp_host, dev); complete(&host->released); } static struct class srp_class = { .name = "infiniband_srp", - .release = srp_release_class_dev + .dev_release = srp_release_dev }; +/** + * srp_conn_unique() - check whether the connection to a target is unique + * @host: SRP host. + * @target: SRP target port. + */ +static bool srp_conn_unique(struct srp_host *host, + struct srp_target_port *target) +{ + struct srp_target_port *t; + bool ret = false; + + if (target->state == SRP_TARGET_REMOVED) + goto out; + + ret = true; + + spin_lock(&host->target_lock); + list_for_each_entry(t, &host->target_list, list) { + if (t != target && + target->id_ext == t->id_ext && + target->ioc_guid == t->ioc_guid && + target->initiator_ext == t->initiator_ext) { + ret = false; + break; + } + } + spin_unlock(&host->target_lock); + +out: + return ret; +} + /* * Target ports are added by writing * @@ -1293,6 +2704,15 @@ enum { SRP_OPT_PKEY = 1 << 3, SRP_OPT_SERVICE_ID = 1 << 4, SRP_OPT_MAX_SECT = 1 << 5, + SRP_OPT_MAX_CMD_PER_LUN = 1 << 6, + SRP_OPT_IO_CLASS = 1 << 7, + SRP_OPT_INITIATOR_EXT = 1 << 8, + SRP_OPT_CMD_SG_ENTRIES = 1 << 9, + SRP_OPT_ALLOW_EXT_SG = 1 << 10, + SRP_OPT_SG_TABLESIZE = 1 << 11, + SRP_OPT_COMP_VECTOR = 1 << 12, + SRP_OPT_TL_RETRY_COUNT = 1 << 13, + SRP_OPT_QUEUE_SIZE = 1 << 14, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -1300,14 +2720,23 @@ enum { SRP_OPT_SERVICE_ID), }; -static match_table_t srp_opt_tokens = { - { SRP_OPT_ID_EXT, "id_ext=%s" }, - { SRP_OPT_IOC_GUID, "ioc_guid=%s" }, - { SRP_OPT_DGID, "dgid=%s" }, - { SRP_OPT_PKEY, "pkey=%x" }, - { SRP_OPT_SERVICE_ID, "service_id=%s" }, - { SRP_OPT_MAX_SECT, "max_sect=%d" }, - { SRP_OPT_ERR, NULL } +static const match_table_t srp_opt_tokens = { + { SRP_OPT_ID_EXT, "id_ext=%s" }, + { SRP_OPT_IOC_GUID, "ioc_guid=%s" }, + { SRP_OPT_DGID, "dgid=%s" }, + { SRP_OPT_PKEY, "pkey=%x" }, + { SRP_OPT_SERVICE_ID, "service_id=%s" }, + { SRP_OPT_MAX_SECT, "max_sect=%d" }, + { SRP_OPT_MAX_CMD_PER_LUN, "max_cmd_per_lun=%d" }, + { SRP_OPT_IO_CLASS, "io_class=%x" }, + { SRP_OPT_INITIATOR_EXT, "initiator_ext=%s" }, + { SRP_OPT_CMD_SG_ENTRIES, "cmd_sg_entries=%u" }, + { SRP_OPT_ALLOW_EXT_SG, "allow_ext_sg=%u" }, + { SRP_OPT_SG_TABLESIZE, "sg_tablesize=%u" }, + { SRP_OPT_COMP_VECTOR, "comp_vector=%u" }, + { SRP_OPT_TL_RETRY_COUNT, "tl_retry_count=%u" }, + { SRP_OPT_QUEUE_SIZE, "queue_size=%d" }, + { SRP_OPT_ERR, NULL } }; static int srp_parse_options(const char *buf, struct srp_target_port *target) @@ -1336,20 +2765,33 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) switch (token) { case SRP_OPT_ID_EXT: p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } target->id_ext = cpu_to_be64(simple_strtoull(p, NULL, 16)); kfree(p); break; case SRP_OPT_IOC_GUID: p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } target->ioc_guid = cpu_to_be64(simple_strtoull(p, NULL, 16)); kfree(p); break; case SRP_OPT_DGID: p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } if (strlen(p) != 32) { - printk(KERN_WARNING PFX "bad dest GID parameter '%s'\n", p); + pr_warn("bad dest GID parameter '%s'\n", p); + kfree(p); goto out; } @@ -1357,11 +2799,13 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) strlcpy(dgid, p + i * 2, 3); target->path.dgid.raw[i] = simple_strtoul(dgid, NULL, 16); } + kfree(p); + memcpy(target->orig_dgid, target->path.dgid.raw, 16); break; case SRP_OPT_PKEY: if (match_hex(args, &token)) { - printk(KERN_WARNING PFX "bad P_Key parameter '%s'\n", p); + pr_warn("bad P_Key parameter '%s'\n", p); goto out; } target->path.pkey = cpu_to_be16(token); @@ -1369,21 +2813,116 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) case SRP_OPT_SERVICE_ID: p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } target->service_id = cpu_to_be64(simple_strtoull(p, NULL, 16)); + target->path.service_id = target->service_id; kfree(p); break; case SRP_OPT_MAX_SECT: if (match_int(args, &token)) { - printk(KERN_WARNING PFX "bad max sect parameter '%s'\n", p); + pr_warn("bad max sect parameter '%s'\n", p); goto out; } target->scsi_host->max_sectors = token; break; + case SRP_OPT_QUEUE_SIZE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad queue_size parameter '%s'\n", p); + goto out; + } + target->scsi_host->can_queue = token; + target->queue_size = token + SRP_RSP_SQ_SIZE + + SRP_TSK_MGMT_SQ_SIZE; + if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + target->scsi_host->cmd_per_lun = token; + break; + + case SRP_OPT_MAX_CMD_PER_LUN: + if (match_int(args, &token) || token < 1) { + pr_warn("bad max cmd_per_lun parameter '%s'\n", + p); + goto out; + } + target->scsi_host->cmd_per_lun = token; + break; + + case SRP_OPT_IO_CLASS: + if (match_hex(args, &token)) { + pr_warn("bad IO class parameter '%s'\n", p); + goto out; + } + if (token != SRP_REV10_IB_IO_CLASS && + token != SRP_REV16A_IB_IO_CLASS) { + pr_warn("unknown IO class parameter value %x specified (use %x or %x).\n", + token, SRP_REV10_IB_IO_CLASS, + SRP_REV16A_IB_IO_CLASS); + goto out; + } + target->io_class = token; + break; + + case SRP_OPT_INITIATOR_EXT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + target->initiator_ext = cpu_to_be64(simple_strtoull(p, NULL, 16)); + kfree(p); + break; + + case SRP_OPT_CMD_SG_ENTRIES: + if (match_int(args, &token) || token < 1 || token > 255) { + pr_warn("bad max cmd_sg_entries parameter '%s'\n", + p); + goto out; + } + target->cmd_sg_cnt = token; + break; + + case SRP_OPT_ALLOW_EXT_SG: + if (match_int(args, &token)) { + pr_warn("bad allow_ext_sg parameter '%s'\n", p); + goto out; + } + target->allow_ext_sg = !!token; + break; + + case SRP_OPT_SG_TABLESIZE: + if (match_int(args, &token) || token < 1 || + token > SCSI_MAX_SG_CHAIN_SEGMENTS) { + pr_warn("bad max sg_tablesize parameter '%s'\n", + p); + goto out; + } + target->sg_tablesize = token; + break; + + case SRP_OPT_COMP_VECTOR: + if (match_int(args, &token) || token < 0) { + pr_warn("bad comp_vector parameter '%s'\n", p); + goto out; + } + target->comp_vector = token; + break; + + case SRP_OPT_TL_RETRY_COUNT: + if (match_int(args, &token) || token < 2 || token > 7) { + pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n", + p); + goto out; + } + target->tl_retry_count = token; + break; + default: - printk(KERN_WARNING PFX "unknown parameter or missing value " - "'%s' in target creation request\n", p); + pr_warn("unknown parameter or missing value '%s' in target creation request\n", + p); goto out; } } @@ -1394,77 +2933,111 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) for (i = 0; i < ARRAY_SIZE(srp_opt_tokens); ++i) if ((srp_opt_tokens[i].token & SRP_OPT_ALL) && !(srp_opt_tokens[i].token & opt_mask)) - printk(KERN_WARNING PFX "target creation request is " - "missing parameter '%s'\n", - srp_opt_tokens[i].pattern); + pr_warn("target creation request is missing parameter '%s'\n", + srp_opt_tokens[i].pattern); + + if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue + && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + pr_warn("cmd_per_lun = %d > queue_size = %d\n", + target->scsi_host->cmd_per_lun, + target->scsi_host->can_queue); out: kfree(options); return ret; } -static ssize_t srp_create_target(struct class_device *class_dev, +static ssize_t srp_create_target(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { struct srp_host *host = - container_of(class_dev, struct srp_host, class_dev); + container_of(dev, struct srp_host, dev); struct Scsi_Host *target_host; struct srp_target_port *target; + struct srp_device *srp_dev = host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; int ret; - int i; target_host = scsi_host_alloc(&srp_template, sizeof (struct srp_target_port)); if (!target_host) return -ENOMEM; - target = host_to_target(target_host); - memset(target, 0, sizeof *target); + target_host->transportt = ib_srp_transport_template; + target_host->max_channel = 0; + target_host->max_id = 1; + target_host->max_lun = SRP_MAX_LUN; + target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb; - target->scsi_host = target_host; - target->srp_host = host; + target = host_to_target(target_host); - INIT_WORK(&target->work, srp_reconnect_work, target); + target->io_class = SRP_REV16A_IB_IO_CLASS; + target->scsi_host = target_host; + target->srp_host = host; + target->lkey = host->srp_dev->mr->lkey; + target->rkey = host->srp_dev->mr->rkey; + target->cmd_sg_cnt = cmd_sg_entries; + target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; + target->allow_ext_sg = allow_ext_sg; + target->tl_retry_count = 7; + target->queue_size = SRP_DEFAULT_QUEUE_SIZE; - for (i = 0; i < SRP_SQ_SIZE - 1; ++i) - target->req_ring[i].next = i + 1; - target->req_ring[SRP_SQ_SIZE - 1].next = -1; - INIT_LIST_HEAD(&target->req_queue); + mutex_lock(&host->add_target_mutex); ret = srp_parse_options(buf, target); if (ret) goto err; - ib_get_cached_gid(host->dev, host->port, 0, &target->path.sgid); - - printk(KERN_DEBUG PFX "new target: id_ext %016llx ioc_guid %016llx pkey %04x " - "service_id %016llx dgid %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", - (unsigned long long) be64_to_cpu(target->id_ext), - (unsigned long long) be64_to_cpu(target->ioc_guid), - be16_to_cpu(target->path.pkey), - (unsigned long long) be64_to_cpu(target->service_id), - (int) be16_to_cpu(*(__be16 *) &target->path.dgid.raw[0]), - (int) be16_to_cpu(*(__be16 *) &target->path.dgid.raw[2]), - (int) be16_to_cpu(*(__be16 *) &target->path.dgid.raw[4]), - (int) be16_to_cpu(*(__be16 *) &target->path.dgid.raw[6]), - (int) be16_to_cpu(*(__be16 *) &target->path.dgid.raw[8]), - (int) be16_to_cpu(*(__be16 *) &target->path.dgid.raw[10]), - (int) be16_to_cpu(*(__be16 *) &target->path.dgid.raw[12]), - (int) be16_to_cpu(*(__be16 *) &target->path.dgid.raw[14])); + target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE; - ret = srp_create_target_ib(target); - if (ret) + if (!srp_conn_unique(target->srp_host, target)) { + shost_printk(KERN_INFO, target->scsi_host, + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be64_to_cpu(target->initiator_ext)); + ret = -EEXIST; goto err; + } - target->cm_id = ib_create_cm_id(host->dev, srp_cm_handler, target); - if (IS_ERR(target->cm_id)) { - ret = PTR_ERR(target->cm_id); - goto err_free; + if (!srp_dev->has_fmr && !srp_dev->has_fr && !target->allow_ext_sg && + target->cmd_sg_cnt < target->sg_tablesize) { + pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); + target->sg_tablesize = target->cmd_sg_cnt; } + target_host->sg_tablesize = target->sg_tablesize; + target->indirect_size = target->sg_tablesize * + sizeof (struct srp_direct_buf); + target->max_iu_len = sizeof (struct srp_cmd) + + sizeof (struct srp_indirect_buf) + + target->cmd_sg_cnt * sizeof (struct srp_direct_buf); + + INIT_WORK(&target->tl_err_work, srp_tl_err_work); + INIT_WORK(&target->remove_work, srp_remove_work); + spin_lock_init(&target->lock); + INIT_LIST_HEAD(&target->free_tx); + ret = srp_alloc_req_data(target); + if (ret) + goto err_free_mem; + + ret = ib_query_gid(ibdev, host->port, 0, &target->path.sgid); + if (ret) + goto err_free_mem; + + ret = srp_create_target_ib(target); + if (ret) + goto err_free_mem; + + ret = srp_new_cm_id(target); + if (ret) + goto err_free_ib; + ret = srp_connect_target(target); if (ret) { - printk(KERN_ERR PFX "Connection failed\n"); + shost_printk(KERN_ERR, target->scsi_host, + PFX "Connection failed\n"); goto err_cm_id; } @@ -1472,7 +3045,19 @@ static ssize_t srp_create_target(struct class_device *class_dev, if (ret) goto err_disconnect; - return count; + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be16_to_cpu(target->path.pkey), + be64_to_cpu(target->service_id), + target->path.sgid.raw, target->path.dgid.raw); + + ret = count; + +out: + mutex_unlock(&host->add_target_mutex); + return ret; err_disconnect: srp_disconnect_target(target); @@ -1480,39 +3065,40 @@ err_disconnect: err_cm_id: ib_destroy_cm_id(target->cm_id); -err_free: +err_free_ib: srp_free_target_ib(target); +err_free_mem: + srp_free_req_data(target); + err: scsi_host_put(target_host); - - return ret; + goto out; } -static CLASS_DEVICE_ATTR(add_target, S_IWUSR, NULL, srp_create_target); +static DEVICE_ATTR(add_target, S_IWUSR, NULL, srp_create_target); -static ssize_t show_ibdev(struct class_device *class_dev, char *buf) +static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, + char *buf) { - struct srp_host *host = - container_of(class_dev, struct srp_host, class_dev); + struct srp_host *host = container_of(dev, struct srp_host, dev); - return sprintf(buf, "%s\n", host->dev->name); + return sprintf(buf, "%s\n", host->srp_dev->dev->name); } -static CLASS_DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); +static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); -static ssize_t show_port(struct class_device *class_dev, char *buf) +static ssize_t show_port(struct device *dev, struct device_attribute *attr, + char *buf) { - struct srp_host *host = - container_of(class_dev, struct srp_host, class_dev); + struct srp_host *host = container_of(dev, struct srp_host, dev); return sprintf(buf, "%d\n", host->port); } -static CLASS_DEVICE_ATTR(port, S_IRUGO, show_port, NULL); +static DEVICE_ATTR(port, S_IRUGO, show_port, NULL); -static struct srp_host *srp_add_port(struct ib_device *device, - __be64 node_guid, u8 port) +static struct srp_host *srp_add_port(struct srp_device *device, u8 port) { struct srp_host *host; @@ -1521,51 +3107,31 @@ static struct srp_host *srp_add_port(struct ib_device *device, return NULL; INIT_LIST_HEAD(&host->target_list); - init_MUTEX(&host->target_mutex); + spin_lock_init(&host->target_lock); init_completion(&host->released); - host->dev = device; + mutex_init(&host->add_target_mutex); + host->srp_dev = device; host->port = port; - host->initiator_port_id[7] = port; - memcpy(host->initiator_port_id + 8, &node_guid, 8); + host->dev.class = &srp_class; + host->dev.parent = device->dev->dma_device; + dev_set_name(&host->dev, "srp-%s-%d", device->dev->name, port); - host->pd = ib_alloc_pd(device); - if (IS_ERR(host->pd)) - goto err_free; - - host->mr = ib_get_dma_mr(host->pd, - IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE); - if (IS_ERR(host->mr)) - goto err_pd; - - host->class_dev.class = &srp_class; - host->class_dev.dev = device->dma_device; - snprintf(host->class_dev.class_id, BUS_ID_SIZE, "srp-%s-%d", - device->name, port); - - if (class_device_register(&host->class_dev)) - goto err_mr; - if (class_device_create_file(&host->class_dev, &class_device_attr_add_target)) + if (device_register(&host->dev)) + goto free_host; + if (device_create_file(&host->dev, &dev_attr_add_target)) goto err_class; - if (class_device_create_file(&host->class_dev, &class_device_attr_ibdev)) + if (device_create_file(&host->dev, &dev_attr_ibdev)) goto err_class; - if (class_device_create_file(&host->class_dev, &class_device_attr_port)) + if (device_create_file(&host->dev, &dev_attr_port)) goto err_class; return host; err_class: - class_device_unregister(&host->class_dev); - -err_mr: - ib_dereg_mr(host->mr); - -err_pd: - ib_dealloc_pd(host->pd); + device_unregister(&host->dev); -err_free: +free_host: kfree(host); return NULL; @@ -1573,28 +3139,74 @@ err_free: static void srp_add_one(struct ib_device *device) { - struct list_head *dev_list; - struct srp_host *host; + struct srp_device *srp_dev; struct ib_device_attr *dev_attr; - int s, e, p; + struct srp_host *host; + int mr_page_shift, s, e, p; + u64 max_pages_per_mr; dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); if (!dev_attr) return; if (ib_query_device(device, dev_attr)) { - printk(KERN_WARNING PFX "Couldn't query node GUID for %s.\n", - device->name); - goto out; + pr_warn("Query device failed for %s\n", device->name); + goto free_attr; } - dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); - if (!dev_list) - goto out; + srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL); + if (!srp_dev) + goto free_attr; - INIT_LIST_HEAD(dev_list); + srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + srp_dev->has_fr = (dev_attr->device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (!srp_dev->has_fmr && !srp_dev->has_fr) + dev_warn(&device->dev, "neither FMR nor FR is supported\n"); - if (device->node_type == IB_NODE_SWITCH) { + srp_dev->use_fast_reg = (srp_dev->has_fr && + (!srp_dev->has_fmr || prefer_fr)); + + /* + * Use the smallest page size supported by the HCA, down to a + * minimum of 4096 bytes. We're unlikely to build large sglists + * out of smaller entries. + */ + mr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); + srp_dev->mr_page_size = 1 << mr_page_shift; + srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); + max_pages_per_mr = dev_attr->max_mr_size; + do_div(max_pages_per_mr, srp_dev->mr_page_size); + srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, + max_pages_per_mr); + if (srp_dev->use_fast_reg) { + srp_dev->max_pages_per_mr = + min_t(u32, srp_dev->max_pages_per_mr, + dev_attr->max_fast_reg_page_list_len); + } + srp_dev->mr_max_size = srp_dev->mr_page_size * + srp_dev->max_pages_per_mr; + pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", + device->name, mr_page_shift, dev_attr->max_mr_size, + dev_attr->max_fast_reg_page_list_len, + srp_dev->max_pages_per_mr, srp_dev->mr_max_size); + + INIT_LIST_HEAD(&srp_dev->dev_list); + + srp_dev->dev = device; + srp_dev->pd = ib_alloc_pd(device); + if (IS_ERR(srp_dev->pd)) + goto free_dev; + + srp_dev->mr = ib_get_dma_mr(srp_dev->pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + if (IS_ERR(srp_dev->mr)) + goto err_pd; + + if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; } else { @@ -1603,29 +3215,37 @@ static void srp_add_one(struct ib_device *device) } for (p = s; p <= e; ++p) { - host = srp_add_port(device, dev_attr->node_guid, p); + host = srp_add_port(srp_dev, p); if (host) - list_add_tail(&host->list, dev_list); + list_add_tail(&host->list, &srp_dev->dev_list); } - ib_set_client_data(device, &srp_client, dev_list); + ib_set_client_data(device, &srp_client, srp_dev); -out: + goto free_attr; + +err_pd: + ib_dealloc_pd(srp_dev->pd); + +free_dev: + kfree(srp_dev); + +free_attr: kfree(dev_attr); } static void srp_remove_one(struct ib_device *device) { - struct list_head *dev_list; + struct srp_device *srp_dev; struct srp_host *host, *tmp_host; - LIST_HEAD(target_list); - struct srp_target_port *target, *tmp_target; - unsigned long flags; + struct srp_target_port *target; - dev_list = ib_get_client_data(device, &srp_client); + srp_dev = ib_get_client_data(device, &srp_client); + if (!srp_dev) + return; - list_for_each_entry_safe(host, tmp_host, dev_list, list) { - class_device_unregister(&host->class_dev); + list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) { + device_unregister(&host->dev); /* * Wait for the sysfs entry to go away, so that no new * target ports can be created. @@ -1633,56 +3253,85 @@ static void srp_remove_one(struct ib_device *device) wait_for_completion(&host->released); /* - * Mark all target ports as removed, so we stop queueing - * commands and don't try to reconnect. + * Remove all target ports. */ - down(&host->target_mutex); - list_for_each_entry_safe(target, tmp_target, - &host->target_list, list) { - spin_lock_irqsave(target->scsi_host->host_lock, flags); - if (target->state != SRP_TARGET_REMOVED) - target->state = SRP_TARGET_REMOVED; - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); - } - up(&host->target_mutex); + spin_lock(&host->target_lock); + list_for_each_entry(target, &host->target_list, list) + srp_queue_remove_work(target); + spin_unlock(&host->target_lock); /* - * Wait for any reconnection tasks that may have - * started before we marked our target ports as - * removed, and any target port removal tasks. + * Wait for target port removal tasks. */ - flush_scheduled_work(); - - list_for_each_entry_safe(target, tmp_target, - &host->target_list, list) { - scsi_remove_host(target->scsi_host); - srp_disconnect_target(target); - ib_destroy_cm_id(target->cm_id); - srp_free_target_ib(target); - scsi_host_put(target->scsi_host); - } + flush_workqueue(system_long_wq); - ib_dereg_mr(host->mr); - ib_dealloc_pd(host->pd); kfree(host); } - kfree(dev_list); + ib_dereg_mr(srp_dev->mr); + ib_dealloc_pd(srp_dev->pd); + + kfree(srp_dev); } +static struct srp_function_template ib_srp_transport_functions = { + .has_rport_state = true, + .reset_timer_if_blocked = true, + .reconnect_delay = &srp_reconnect_delay, + .fast_io_fail_tmo = &srp_fast_io_fail_tmo, + .dev_loss_tmo = &srp_dev_loss_tmo, + .reconnect = srp_rport_reconnect, + .rport_delete = srp_rport_delete, + .terminate_rport_io = srp_terminate_io, +}; + static int __init srp_init_module(void) { int ret; + BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *)); + + if (srp_sg_tablesize) { + pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); + if (!cmd_sg_entries) + cmd_sg_entries = srp_sg_tablesize; + } + + if (!cmd_sg_entries) + cmd_sg_entries = SRP_DEF_SG_TABLESIZE; + + if (cmd_sg_entries > 255) { + pr_warn("Clamping cmd_sg_entries to 255\n"); + cmd_sg_entries = 255; + } + + if (!indirect_sg_entries) + indirect_sg_entries = cmd_sg_entries; + else if (indirect_sg_entries < cmd_sg_entries) { + pr_warn("Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n", + cmd_sg_entries); + indirect_sg_entries = cmd_sg_entries; + } + + ib_srp_transport_template = + srp_attach_transport(&ib_srp_transport_functions); + if (!ib_srp_transport_template) + return -ENOMEM; + ret = class_register(&srp_class); if (ret) { - printk(KERN_ERR PFX "couldn't register class infiniband_srp\n"); + pr_err("couldn't register class infiniband_srp\n"); + srp_release_transport(ib_srp_transport_template); return ret; } + ib_sa_register_client(&srp_sa_client); + ret = ib_register_client(&srp_client); if (ret) { - printk(KERN_ERR PFX "couldn't register IB client\n"); + pr_err("couldn't register IB client\n"); + srp_release_transport(ib_srp_transport_template); + ib_sa_unregister_client(&srp_sa_client); class_unregister(&srp_class); return ret; } @@ -1693,7 +3342,9 @@ static int __init srp_init_module(void) static void __exit srp_cleanup_module(void) { ib_unregister_client(&srp_client); + ib_sa_unregister_client(&srp_sa_client); class_unregister(&srp_class); + srp_release_transport(ib_srp_transport_template); } module_init(srp_init_module); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 4fec28a7136..e46ecb15aa0 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -28,8 +28,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ib_srp.h 3932 2005-11-01 17:19:29Z roland $ */ #ifndef IB_SRP_H @@ -37,8 +35,8 @@ #include <linux/types.h> #include <linux/list.h> - -#include <asm/semaphore.h> +#include <linux/mutex.h> +#include <linux/scatterlist.h> #include <scsi/scsi_host.h> #include <scsi/scsi_cmnd.h> @@ -46,6 +44,7 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_sa.h> #include <rdma/ib_cm.h> +#include <rdma/ib_fmr_pool.h> enum { SRP_PATH_REC_TIMEOUT_MS = 1000, @@ -53,98 +52,228 @@ enum { SRP_PORT_REDIRECT = 1, SRP_DLID_REDIRECT = 2, + SRP_STALE_CONN = 3, - SRP_MAX_IU_LEN = 256, + SRP_MAX_LUN = 512, + SRP_DEF_SG_TABLESIZE = 12, - SRP_RQ_SHIFT = 6, - SRP_RQ_SIZE = 1 << SRP_RQ_SHIFT, - SRP_SQ_SIZE = SRP_RQ_SIZE - 1, - SRP_CQ_SIZE = SRP_SQ_SIZE + SRP_RQ_SIZE, + SRP_DEFAULT_QUEUE_SIZE = 1 << 6, + SRP_RSP_SQ_SIZE = 1, + SRP_TSK_MGMT_SQ_SIZE = 1, + SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE - + SRP_TSK_MGMT_SQ_SIZE, - SRP_TAG_TSK_MGMT = 1 << (SRP_RQ_SHIFT + 1) -}; + SRP_TAG_NO_REQ = ~0U, + SRP_TAG_TSK_MGMT = 1U << 31, -#define SRP_OP_RECV (1 << 31) -#define SRP_MAX_INDIRECT ((SRP_MAX_IU_LEN - \ - sizeof (struct srp_cmd) - \ - sizeof (struct srp_indirect_buf)) / 16) + SRP_MAX_PAGES_PER_MR = 512, + + LOCAL_INV_WR_ID_MASK = 1, + FAST_REG_WR_ID_MASK = 2, +}; enum srp_target_state { SRP_TARGET_LIVE, - SRP_TARGET_CONNECTING, - SRP_TARGET_DEAD, - SRP_TARGET_REMOVED + SRP_TARGET_REMOVED, }; -struct srp_host { - u8 initiator_port_id[16]; +enum srp_iu_type { + SRP_IU_CMD, + SRP_IU_TSK_MGMT, + SRP_IU_RSP, +}; + +/* + * @mr_page_mask: HCA memory registration page mask. + * @mr_page_size: HCA memory registration page size. + * @mr_max_size: Maximum size in bytes of a single FMR / FR registration + * request. + */ +struct srp_device { + struct list_head dev_list; struct ib_device *dev; - u8 port; struct ib_pd *pd; struct ib_mr *mr; - struct class_device class_dev; + u64 mr_page_mask; + int mr_page_size; + int mr_max_size; + int max_pages_per_mr; + bool has_fmr; + bool has_fr; + bool use_fast_reg; +}; + +struct srp_host { + struct srp_device *srp_dev; + u8 port; + struct device dev; struct list_head target_list; - struct semaphore target_mutex; + spinlock_t target_lock; struct completion released; struct list_head list; + struct mutex add_target_mutex; }; struct srp_request { struct list_head list; struct scsi_cmnd *scmnd; struct srp_iu *cmd; - struct srp_iu *tsk_mgmt; - DECLARE_PCI_UNMAP_ADDR(direct_mapping) - struct completion done; - short next; - u8 cmd_done; - u8 tsk_status; + union { + struct ib_pool_fmr **fmr_list; + struct srp_fr_desc **fr_list; + }; + u64 *map_page; + struct srp_direct_buf *indirect_desc; + dma_addr_t indirect_dma_addr; + short nmdesc; + short index; }; struct srp_target_port { + /* These are RW in the hot path, and commonly used together */ + struct list_head free_tx; + struct list_head free_reqs; + spinlock_t lock; + s32 req_lim; + + /* These are read-only in the hot path */ + struct ib_cq *send_cq ____cacheline_aligned_in_smp; + struct ib_cq *recv_cq; + struct ib_qp *qp; + union { + struct ib_fmr_pool *fmr_pool; + struct srp_fr_pool *fr_pool; + }; + u32 lkey; + u32 rkey; + enum srp_target_state state; + unsigned int max_iu_len; + unsigned int cmd_sg_cnt; + unsigned int indirect_size; + bool allow_ext_sg; + + /* Everything above this point is used in the hot path of + * command processing. Try to keep them packed into cachelines. + */ + __be64 id_ext; __be64 ioc_guid; __be64 service_id; + __be64 initiator_ext; + u16 io_class; struct srp_host *srp_host; struct Scsi_Host *scsi_host; + struct srp_rport *rport; char target_name[32]; unsigned int scsi_id; + unsigned int sg_tablesize; + int queue_size; + int req_ring_size; + int comp_vector; + int tl_retry_count; struct ib_sa_path_rec path; + __be16 orig_dgid[8]; struct ib_sa_query *path_query; int path_query_id; + u32 rq_tmo_jiffies; + bool connected; + struct ib_cm_id *cm_id; - struct ib_cq *cq; - struct ib_qp *qp; int max_ti_iu_len; - s32 req_lim; - - unsigned rx_head; - struct srp_iu *rx_ring[SRP_RQ_SIZE]; - unsigned tx_head; - unsigned tx_tail; - struct srp_iu *tx_ring[SRP_SQ_SIZE + 1]; + int zero_req_lim; - int req_head; - struct list_head req_queue; - struct srp_request req_ring[SRP_SQ_SIZE]; + struct srp_iu **tx_ring; + struct srp_iu **rx_ring; + struct srp_request *req_ring; - struct work_struct work; + struct work_struct tl_err_work; + struct work_struct remove_work; struct list_head list; struct completion done; int status; - enum srp_target_state state; + bool qp_in_error; + + struct completion tsk_mgmt_done; + u8 tsk_mgmt_status; }; struct srp_iu { - dma_addr_t dma; + struct list_head list; + u64 dma; void *buf; size_t size; enum dma_data_direction direction; }; +/** + * struct srp_fr_desc - fast registration work request arguments + * @entry: Entry in srp_fr_pool.free_list. + * @mr: Memory region. + * @frpl: Fast registration page list. + */ +struct srp_fr_desc { + struct list_head entry; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; +}; + +/** + * struct srp_fr_pool - pool of fast registration descriptors + * + * An entry is available for allocation if and only if it occurs in @free_list. + * + * @size: Number of descriptors in this pool. + * @max_page_list_len: Maximum fast registration work request page list length. + * @lock: Protects free_list. + * @free_list: List of free descriptors. + * @desc: Fast registration descriptor pool. + */ +struct srp_fr_pool { + int size; + int max_page_list_len; + spinlock_t lock; + struct list_head free_list; + struct srp_fr_desc desc[0]; +}; + +/** + * struct srp_map_state - per-request DMA memory mapping state + * @desc: Pointer to the element of the SRP buffer descriptor array + * that is being filled in. + * @pages: Array with DMA addresses of pages being considered for + * memory registration. + * @base_dma_addr: DMA address of the first page that has not yet been mapped. + * @dma_len: Number of bytes that will be registered with the next + * FMR or FR memory registration call. + * @total_len: Total number of bytes in the sg-list being mapped. + * @npages: Number of page addresses in the pages[] array. + * @nmdesc: Number of FMR or FR memory descriptors used for mapping. + * @ndesc: Number of SRP buffer descriptors that have been filled in. + * @unmapped_sg: First element of the sg-list that is mapped via FMR or FR. + * @unmapped_index: Index of the first element mapped via FMR or FR. + * @unmapped_addr: DMA address of the first element mapped via FMR or FR. + */ +struct srp_map_state { + union { + struct ib_pool_fmr **next_fmr; + struct srp_fr_desc **next_fr; + }; + struct srp_direct_buf *desc; + u64 *pages; + dma_addr_t base_dma_addr; + u32 dma_len; + u32 total_len; + unsigned int npages; + unsigned int nmdesc; + unsigned int ndesc; + struct scatterlist *unmapped_sg; + int unmapped_index; + dma_addr_t unmapped_addr; +}; + #endif /* IB_SRP_H */ diff --git a/drivers/infiniband/ulp/srpt/Kconfig b/drivers/infiniband/ulp/srpt/Kconfig new file mode 100644 index 00000000000..31ee83d528d --- /dev/null +++ b/drivers/infiniband/ulp/srpt/Kconfig @@ -0,0 +1,12 @@ +config INFINIBAND_SRPT + tristate "InfiniBand SCSI RDMA Protocol target support" + depends on INFINIBAND && TARGET_CORE + ---help--- + + Support for the SCSI RDMA Protocol (SRP) Target driver. The + SRP protocol is a protocol that allows an initiator to access + a block storage device on another host (target) over a network + that supports the RDMA protocol. Currently the RDMA protocol is + supported by InfiniBand and by iWarp network hardware. More + information about the SRP protocol can be found on the website + of the INCITS T10 technical committee (http://www.t10.org/). diff --git a/drivers/infiniband/ulp/srpt/Makefile b/drivers/infiniband/ulp/srpt/Makefile new file mode 100644 index 00000000000..e3ee4bdfffa --- /dev/null +++ b/drivers/infiniband/ulp/srpt/Makefile @@ -0,0 +1,2 @@ +ccflags-y := -Idrivers/target +obj-$(CONFIG_INFINIBAND_SRPT) += ib_srpt.o diff --git a/drivers/infiniband/ulp/srpt/ib_dm_mad.h b/drivers/infiniband/ulp/srpt/ib_dm_mad.h new file mode 100644 index 00000000000..fb1de1f6f29 --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_dm_mad.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef IB_DM_MAD_H +#define IB_DM_MAD_H + +#include <linux/types.h> + +#include <rdma/ib_mad.h> + +enum { + /* + * See also section 13.4.7 Status Field, table 115 MAD Common Status + * Field Bit Values and also section 16.3.1.1 Status Field in the + * InfiniBand Architecture Specification. + */ + DM_MAD_STATUS_UNSUP_METHOD = 0x0008, + DM_MAD_STATUS_UNSUP_METHOD_ATTR = 0x000c, + DM_MAD_STATUS_INVALID_FIELD = 0x001c, + DM_MAD_STATUS_NO_IOC = 0x0100, + + /* + * See also the Device Management chapter, section 16.3.3 Attributes, + * table 279 Device Management Attributes in the InfiniBand + * Architecture Specification. + */ + DM_ATTR_CLASS_PORT_INFO = 0x01, + DM_ATTR_IOU_INFO = 0x10, + DM_ATTR_IOC_PROFILE = 0x11, + DM_ATTR_SVC_ENTRIES = 0x12 +}; + +struct ib_dm_hdr { + u8 reserved[28]; +}; + +/* + * Structure of management datagram sent by the SRP target implementation. + * Contains a management datagram header, reliable multi-packet transaction + * protocol (RMPP) header and ib_dm_hdr. Notes: + * - The SRP target implementation does not use RMPP or ib_dm_hdr when sending + * management datagrams. + * - The header size must be exactly 64 bytes (IB_MGMT_DEVICE_HDR), since this + * is the header size that is passed to ib_create_send_mad() in ib_srpt.c. + * - The maximum supported size for a management datagram when not using RMPP + * is 256 bytes -- 64 bytes header and 192 (IB_MGMT_DEVICE_DATA) bytes data. + */ +struct ib_dm_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + struct ib_dm_hdr dm_hdr; + u8 data[IB_MGMT_DEVICE_DATA]; +}; + +/* + * IOUnitInfo as defined in section 16.3.3.3 IOUnitInfo of the InfiniBand + * Architecture Specification. + */ +struct ib_dm_iou_info { + __be16 change_id; + u8 max_controllers; + u8 op_rom; + u8 controller_list[128]; +}; + +/* + * IOControllerprofile as defined in section 16.3.3.4 IOControllerProfile of + * the InfiniBand Architecture Specification. + */ +struct ib_dm_ioc_profile { + __be64 guid; + __be32 vendor_id; + __be32 device_id; + __be16 device_version; + __be16 reserved1; + __be32 subsys_vendor_id; + __be32 subsys_device_id; + __be16 io_class; + __be16 io_subclass; + __be16 protocol; + __be16 protocol_version; + __be16 service_conn; + __be16 initiators_supported; + __be16 send_queue_depth; + u8 reserved2; + u8 rdma_read_depth; + __be32 send_size; + __be32 rdma_size; + u8 op_cap_mask; + u8 svc_cap_mask; + u8 num_svc_entries; + u8 reserved3[9]; + u8 id_string[64]; +}; + +struct ib_dm_svc_entry { + u8 name[40]; + __be64 id; +}; + +/* + * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture + * Specification. See also section B.7, table B.8 in the T10 SRP r16a document. + */ +struct ib_dm_svc_entries { + struct ib_dm_svc_entry service_entries[4]; +}; + +#endif diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c new file mode 100644 index 00000000000..fe09f2788b1 --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -0,0 +1,4042 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved. + * Copyright (C) 2008 - 2011 Bart Van Assche <bvanassche@acm.org>. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/ctype.h> +#include <linux/kthread.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/atomic.h> +#include <scsi/scsi_tcq.h> +#include <target/configfs_macros.h> +#include <target/target_core_base.h> +#include <target/target_core_fabric_configfs.h> +#include <target/target_core_fabric.h> +#include <target/target_core_configfs.h> +#include "ib_srpt.h" + +/* Name of this kernel module. */ +#define DRV_NAME "ib_srpt" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "2011-02-14" + +#define SRPT_ID_STRING "Linux SRP target" + +#undef pr_fmt +#define pr_fmt(fmt) DRV_NAME " " fmt + +MODULE_AUTHOR("Vu Pham and Bart Van Assche"); +MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol target " + "v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* + * Global Variables + */ + +static u64 srpt_service_guid; +static DEFINE_SPINLOCK(srpt_dev_lock); /* Protects srpt_dev_list. */ +static LIST_HEAD(srpt_dev_list); /* List of srpt_device structures. */ + +static unsigned srp_max_req_size = DEFAULT_MAX_REQ_SIZE; +module_param(srp_max_req_size, int, 0444); +MODULE_PARM_DESC(srp_max_req_size, + "Maximum size of SRP request messages in bytes."); + +static int srpt_srq_size = DEFAULT_SRPT_SRQ_SIZE; +module_param(srpt_srq_size, int, 0444); +MODULE_PARM_DESC(srpt_srq_size, + "Shared receive queue (SRQ) size."); + +static int srpt_get_u64_x(char *buffer, struct kernel_param *kp) +{ + return sprintf(buffer, "0x%016llx", *(u64 *)kp->arg); +} +module_param_call(srpt_service_guid, NULL, srpt_get_u64_x, &srpt_service_guid, + 0444); +MODULE_PARM_DESC(srpt_service_guid, + "Using this value for ioc_guid, id_ext, and cm_listen_id" + " instead of using the node_guid of the first HCA."); + +static struct ib_client srpt_client; +static struct target_fabric_configfs *srpt_target; +static void srpt_release_channel(struct srpt_rdma_ch *ch); +static int srpt_queue_status(struct se_cmd *cmd); + +/** + * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE. + */ +static inline +enum dma_data_direction opposite_dma_dir(enum dma_data_direction dir) +{ + switch (dir) { + case DMA_TO_DEVICE: return DMA_FROM_DEVICE; + case DMA_FROM_DEVICE: return DMA_TO_DEVICE; + default: return dir; + } +} + +/** + * srpt_sdev_name() - Return the name associated with the HCA. + * + * Examples are ib0, ib1, ... + */ +static inline const char *srpt_sdev_name(struct srpt_device *sdev) +{ + return sdev->device->name; +} + +static enum rdma_ch_state srpt_get_ch_state(struct srpt_rdma_ch *ch) +{ + unsigned long flags; + enum rdma_ch_state state; + + spin_lock_irqsave(&ch->spinlock, flags); + state = ch->state; + spin_unlock_irqrestore(&ch->spinlock, flags); + return state; +} + +static enum rdma_ch_state +srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new_state) +{ + unsigned long flags; + enum rdma_ch_state prev; + + spin_lock_irqsave(&ch->spinlock, flags); + prev = ch->state; + ch->state = new_state; + spin_unlock_irqrestore(&ch->spinlock, flags); + return prev; +} + +/** + * srpt_test_and_set_ch_state() - Test and set the channel state. + * + * Returns true if and only if the channel state has been set to the new state. + */ +static bool +srpt_test_and_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state old, + enum rdma_ch_state new) +{ + unsigned long flags; + enum rdma_ch_state prev; + + spin_lock_irqsave(&ch->spinlock, flags); + prev = ch->state; + if (prev == old) + ch->state = new; + spin_unlock_irqrestore(&ch->spinlock, flags); + return prev == old; +} + +/** + * srpt_event_handler() - Asynchronous IB event callback function. + * + * Callback function called by the InfiniBand core when an asynchronous IB + * event occurs. This callback may occur in interrupt context. See also + * section 11.5.2, Set Asynchronous Event Handler in the InfiniBand + * Architecture Specification. + */ +static void srpt_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct srpt_device *sdev; + struct srpt_port *sport; + + sdev = ib_get_client_data(event->device, &srpt_client); + if (!sdev || sdev->device != event->device) + return; + + pr_debug("ASYNC event= %d on device= %s\n", event->event, + srpt_sdev_name(sdev)); + + switch (event->event) { + case IB_EVENT_PORT_ERR: + if (event->element.port_num <= sdev->device->phys_port_cnt) { + sport = &sdev->port[event->element.port_num - 1]; + sport->lid = 0; + sport->sm_lid = 0; + } + break; + case IB_EVENT_PORT_ACTIVE: + case IB_EVENT_LID_CHANGE: + case IB_EVENT_PKEY_CHANGE: + case IB_EVENT_SM_CHANGE: + case IB_EVENT_CLIENT_REREGISTER: + /* Refresh port data asynchronously. */ + if (event->element.port_num <= sdev->device->phys_port_cnt) { + sport = &sdev->port[event->element.port_num - 1]; + if (!sport->lid && !sport->sm_lid) + schedule_work(&sport->work); + } + break; + default: + printk(KERN_ERR "received unrecognized IB event %d\n", + event->event); + break; + } +} + +/** + * srpt_srq_event() - SRQ event callback function. + */ +static void srpt_srq_event(struct ib_event *event, void *ctx) +{ + printk(KERN_INFO "SRQ event %d\n", event->event); +} + +/** + * srpt_qp_event() - QP event callback function. + */ +static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) +{ + pr_debug("QP event %d on cm_id=%p sess_name=%s state=%d\n", + event->event, ch->cm_id, ch->sess_name, srpt_get_ch_state(ch)); + + switch (event->event) { + case IB_EVENT_COMM_EST: + ib_cm_notify(ch->cm_id, event->event); + break; + case IB_EVENT_QP_LAST_WQE_REACHED: + if (srpt_test_and_set_ch_state(ch, CH_DRAINING, + CH_RELEASING)) + srpt_release_channel(ch); + else + pr_debug("%s: state %d - ignored LAST_WQE.\n", + ch->sess_name, srpt_get_ch_state(ch)); + break; + default: + printk(KERN_ERR "received unrecognized IB QP event %d\n", + event->event); + break; + } +} + +/** + * srpt_set_ioc() - Helper function for initializing an IOUnitInfo structure. + * + * @slot: one-based slot number. + * @value: four-bit value. + * + * Copies the lowest four bits of value in element slot of the array of four + * bit elements called c_list (controller list). The index slot is one-based. + */ +static void srpt_set_ioc(u8 *c_list, u32 slot, u8 value) +{ + u16 id; + u8 tmp; + + id = (slot - 1) / 2; + if (slot & 0x1) { + tmp = c_list[id] & 0xf; + c_list[id] = (value << 4) | tmp; + } else { + tmp = c_list[id] & 0xf0; + c_list[id] = (value & 0xf) | tmp; + } +} + +/** + * srpt_get_class_port_info() - Copy ClassPortInfo to a management datagram. + * + * See also section 16.3.3.1 ClassPortInfo in the InfiniBand Architecture + * Specification. + */ +static void srpt_get_class_port_info(struct ib_dm_mad *mad) +{ + struct ib_class_port_info *cif; + + cif = (struct ib_class_port_info *)mad->data; + memset(cif, 0, sizeof *cif); + cif->base_version = 1; + cif->class_version = 1; + cif->resp_time_value = 20; + + mad->mad_hdr.status = 0; +} + +/** + * srpt_get_iou() - Write IOUnitInfo to a management datagram. + * + * See also section 16.3.3.3 IOUnitInfo in the InfiniBand Architecture + * Specification. See also section B.7, table B.6 in the SRP r16a document. + */ +static void srpt_get_iou(struct ib_dm_mad *mad) +{ + struct ib_dm_iou_info *ioui; + u8 slot; + int i; + + ioui = (struct ib_dm_iou_info *)mad->data; + ioui->change_id = __constant_cpu_to_be16(1); + ioui->max_controllers = 16; + + /* set present for slot 1 and empty for the rest */ + srpt_set_ioc(ioui->controller_list, 1, 1); + for (i = 1, slot = 2; i < 16; i++, slot++) + srpt_set_ioc(ioui->controller_list, slot, 0); + + mad->mad_hdr.status = 0; +} + +/** + * srpt_get_ioc() - Write IOControllerprofile to a management datagram. + * + * See also section 16.3.3.4 IOControllerProfile in the InfiniBand + * Architecture Specification. See also section B.7, table B.7 in the SRP + * r16a document. + */ +static void srpt_get_ioc(struct srpt_port *sport, u32 slot, + struct ib_dm_mad *mad) +{ + struct srpt_device *sdev = sport->sdev; + struct ib_dm_ioc_profile *iocp; + + iocp = (struct ib_dm_ioc_profile *)mad->data; + + if (!slot || slot > 16) { + mad->mad_hdr.status + = __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); + return; + } + + if (slot > 2) { + mad->mad_hdr.status + = __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC); + return; + } + + memset(iocp, 0, sizeof *iocp); + strcpy(iocp->id_string, SRPT_ID_STRING); + iocp->guid = cpu_to_be64(srpt_service_guid); + iocp->vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id); + iocp->device_id = cpu_to_be32(sdev->dev_attr.vendor_part_id); + iocp->device_version = cpu_to_be16(sdev->dev_attr.hw_ver); + iocp->subsys_vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id); + iocp->subsys_device_id = 0x0; + iocp->io_class = __constant_cpu_to_be16(SRP_REV16A_IB_IO_CLASS); + iocp->io_subclass = __constant_cpu_to_be16(SRP_IO_SUBCLASS); + iocp->protocol = __constant_cpu_to_be16(SRP_PROTOCOL); + iocp->protocol_version = __constant_cpu_to_be16(SRP_PROTOCOL_VERSION); + iocp->send_queue_depth = cpu_to_be16(sdev->srq_size); + iocp->rdma_read_depth = 4; + iocp->send_size = cpu_to_be32(srp_max_req_size); + iocp->rdma_size = cpu_to_be32(min(sport->port_attrib.srp_max_rdma_size, + 1U << 24)); + iocp->num_svc_entries = 1; + iocp->op_cap_mask = SRP_SEND_TO_IOC | SRP_SEND_FROM_IOC | + SRP_RDMA_READ_FROM_IOC | SRP_RDMA_WRITE_FROM_IOC; + + mad->mad_hdr.status = 0; +} + +/** + * srpt_get_svc_entries() - Write ServiceEntries to a management datagram. + * + * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture + * Specification. See also section B.7, table B.8 in the SRP r16a document. + */ +static void srpt_get_svc_entries(u64 ioc_guid, + u16 slot, u8 hi, u8 lo, struct ib_dm_mad *mad) +{ + struct ib_dm_svc_entries *svc_entries; + + WARN_ON(!ioc_guid); + + if (!slot || slot > 16) { + mad->mad_hdr.status + = __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); + return; + } + + if (slot > 2 || lo > hi || hi > 1) { + mad->mad_hdr.status + = __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC); + return; + } + + svc_entries = (struct ib_dm_svc_entries *)mad->data; + memset(svc_entries, 0, sizeof *svc_entries); + svc_entries->service_entries[0].id = cpu_to_be64(ioc_guid); + snprintf(svc_entries->service_entries[0].name, + sizeof(svc_entries->service_entries[0].name), + "%s%016llx", + SRP_SERVICE_NAME_PREFIX, + ioc_guid); + + mad->mad_hdr.status = 0; +} + +/** + * srpt_mgmt_method_get() - Process a received management datagram. + * @sp: source port through which the MAD has been received. + * @rq_mad: received MAD. + * @rsp_mad: response MAD. + */ +static void srpt_mgmt_method_get(struct srpt_port *sp, struct ib_mad *rq_mad, + struct ib_dm_mad *rsp_mad) +{ + u16 attr_id; + u32 slot; + u8 hi, lo; + + attr_id = be16_to_cpu(rq_mad->mad_hdr.attr_id); + switch (attr_id) { + case DM_ATTR_CLASS_PORT_INFO: + srpt_get_class_port_info(rsp_mad); + break; + case DM_ATTR_IOU_INFO: + srpt_get_iou(rsp_mad); + break; + case DM_ATTR_IOC_PROFILE: + slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod); + srpt_get_ioc(sp, slot, rsp_mad); + break; + case DM_ATTR_SVC_ENTRIES: + slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod); + hi = (u8) ((slot >> 8) & 0xff); + lo = (u8) (slot & 0xff); + slot = (u16) ((slot >> 16) & 0xffff); + srpt_get_svc_entries(srpt_service_guid, + slot, hi, lo, rsp_mad); + break; + default: + rsp_mad->mad_hdr.status = + __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); + break; + } +} + +/** + * srpt_mad_send_handler() - Post MAD-send callback function. + */ +static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_wc) +{ + ib_destroy_ah(mad_wc->send_buf->ah); + ib_free_send_mad(mad_wc->send_buf); +} + +/** + * srpt_mad_recv_handler() - MAD reception callback function. + */ +static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_recv_wc *mad_wc) +{ + struct srpt_port *sport = (struct srpt_port *)mad_agent->context; + struct ib_ah *ah; + struct ib_mad_send_buf *rsp; + struct ib_dm_mad *dm_mad; + + if (!mad_wc || !mad_wc->recv_buf.mad) + return; + + ah = ib_create_ah_from_wc(mad_agent->qp->pd, mad_wc->wc, + mad_wc->recv_buf.grh, mad_agent->port_num); + if (IS_ERR(ah)) + goto err; + + BUILD_BUG_ON(offsetof(struct ib_dm_mad, data) != IB_MGMT_DEVICE_HDR); + + rsp = ib_create_send_mad(mad_agent, mad_wc->wc->src_qp, + mad_wc->wc->pkey_index, 0, + IB_MGMT_DEVICE_HDR, IB_MGMT_DEVICE_DATA, + GFP_KERNEL); + if (IS_ERR(rsp)) + goto err_rsp; + + rsp->ah = ah; + + dm_mad = rsp->mad; + memcpy(dm_mad, mad_wc->recv_buf.mad, sizeof *dm_mad); + dm_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + dm_mad->mad_hdr.status = 0; + + switch (mad_wc->recv_buf.mad->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + srpt_mgmt_method_get(sport, mad_wc->recv_buf.mad, dm_mad); + break; + case IB_MGMT_METHOD_SET: + dm_mad->mad_hdr.status = + __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); + break; + default: + dm_mad->mad_hdr.status = + __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD); + break; + } + + if (!ib_post_send_mad(rsp, NULL)) { + ib_free_recv_mad(mad_wc); + /* will destroy_ah & free_send_mad in send completion */ + return; + } + + ib_free_send_mad(rsp); + +err_rsp: + ib_destroy_ah(ah); +err: + ib_free_recv_mad(mad_wc); +} + +/** + * srpt_refresh_port() - Configure a HCA port. + * + * Enable InfiniBand management datagram processing, update the cached sm_lid, + * lid and gid values, and register a callback function for processing MADs + * on the specified port. + * + * Note: It is safe to call this function more than once for the same port. + */ +static int srpt_refresh_port(struct srpt_port *sport) +{ + struct ib_mad_reg_req reg_req; + struct ib_port_modify port_modify; + struct ib_port_attr port_attr; + int ret; + + memset(&port_modify, 0, sizeof port_modify); + port_modify.set_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; + port_modify.clr_port_cap_mask = 0; + + ret = ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify); + if (ret) + goto err_mod_port; + + ret = ib_query_port(sport->sdev->device, sport->port, &port_attr); + if (ret) + goto err_query_port; + + sport->sm_lid = port_attr.sm_lid; + sport->lid = port_attr.lid; + + ret = ib_query_gid(sport->sdev->device, sport->port, 0, &sport->gid); + if (ret) + goto err_query_port; + + if (!sport->mad_agent) { + memset(®_req, 0, sizeof reg_req); + reg_req.mgmt_class = IB_MGMT_CLASS_DEVICE_MGMT; + reg_req.mgmt_class_version = IB_MGMT_BASE_VERSION; + set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask); + set_bit(IB_MGMT_METHOD_SET, reg_req.method_mask); + + sport->mad_agent = ib_register_mad_agent(sport->sdev->device, + sport->port, + IB_QPT_GSI, + ®_req, 0, + srpt_mad_send_handler, + srpt_mad_recv_handler, + sport); + if (IS_ERR(sport->mad_agent)) { + ret = PTR_ERR(sport->mad_agent); + sport->mad_agent = NULL; + goto err_query_port; + } + } + + return 0; + +err_query_port: + + port_modify.set_port_cap_mask = 0; + port_modify.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; + ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify); + +err_mod_port: + + return ret; +} + +/** + * srpt_unregister_mad_agent() - Unregister MAD callback functions. + * + * Note: It is safe to call this function more than once for the same device. + */ +static void srpt_unregister_mad_agent(struct srpt_device *sdev) +{ + struct ib_port_modify port_modify = { + .clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP, + }; + struct srpt_port *sport; + int i; + + for (i = 1; i <= sdev->device->phys_port_cnt; i++) { + sport = &sdev->port[i - 1]; + WARN_ON(sport->port != i); + if (ib_modify_port(sdev->device, i, 0, &port_modify) < 0) + printk(KERN_ERR "disabling MAD processing failed.\n"); + if (sport->mad_agent) { + ib_unregister_mad_agent(sport->mad_agent); + sport->mad_agent = NULL; + } + } +} + +/** + * srpt_alloc_ioctx() - Allocate an SRPT I/O context structure. + */ +static struct srpt_ioctx *srpt_alloc_ioctx(struct srpt_device *sdev, + int ioctx_size, int dma_size, + enum dma_data_direction dir) +{ + struct srpt_ioctx *ioctx; + + ioctx = kmalloc(ioctx_size, GFP_KERNEL); + if (!ioctx) + goto err; + + ioctx->buf = kmalloc(dma_size, GFP_KERNEL); + if (!ioctx->buf) + goto err_free_ioctx; + + ioctx->dma = ib_dma_map_single(sdev->device, ioctx->buf, dma_size, dir); + if (ib_dma_mapping_error(sdev->device, ioctx->dma)) + goto err_free_buf; + + return ioctx; + +err_free_buf: + kfree(ioctx->buf); +err_free_ioctx: + kfree(ioctx); +err: + return NULL; +} + +/** + * srpt_free_ioctx() - Free an SRPT I/O context structure. + */ +static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx, + int dma_size, enum dma_data_direction dir) +{ + if (!ioctx) + return; + + ib_dma_unmap_single(sdev->device, ioctx->dma, dma_size, dir); + kfree(ioctx->buf); + kfree(ioctx); +} + +/** + * srpt_alloc_ioctx_ring() - Allocate a ring of SRPT I/O context structures. + * @sdev: Device to allocate the I/O context ring for. + * @ring_size: Number of elements in the I/O context ring. + * @ioctx_size: I/O context size. + * @dma_size: DMA buffer size. + * @dir: DMA data direction. + */ +static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct srpt_device *sdev, + int ring_size, int ioctx_size, + int dma_size, enum dma_data_direction dir) +{ + struct srpt_ioctx **ring; + int i; + + WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx) + && ioctx_size != sizeof(struct srpt_send_ioctx)); + + ring = kmalloc(ring_size * sizeof(ring[0]), GFP_KERNEL); + if (!ring) + goto out; + for (i = 0; i < ring_size; ++i) { + ring[i] = srpt_alloc_ioctx(sdev, ioctx_size, dma_size, dir); + if (!ring[i]) + goto err; + ring[i]->index = i; + } + goto out; + +err: + while (--i >= 0) + srpt_free_ioctx(sdev, ring[i], dma_size, dir); + kfree(ring); + ring = NULL; +out: + return ring; +} + +/** + * srpt_free_ioctx_ring() - Free the ring of SRPT I/O context structures. + */ +static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring, + struct srpt_device *sdev, int ring_size, + int dma_size, enum dma_data_direction dir) +{ + int i; + + for (i = 0; i < ring_size; ++i) + srpt_free_ioctx(sdev, ioctx_ring[i], dma_size, dir); + kfree(ioctx_ring); +} + +/** + * srpt_get_cmd_state() - Get the state of a SCSI command. + */ +static enum srpt_command_state srpt_get_cmd_state(struct srpt_send_ioctx *ioctx) +{ + enum srpt_command_state state; + unsigned long flags; + + BUG_ON(!ioctx); + + spin_lock_irqsave(&ioctx->spinlock, flags); + state = ioctx->state; + spin_unlock_irqrestore(&ioctx->spinlock, flags); + return state; +} + +/** + * srpt_set_cmd_state() - Set the state of a SCSI command. + * + * Does not modify the state of aborted commands. Returns the previous command + * state. + */ +static enum srpt_command_state srpt_set_cmd_state(struct srpt_send_ioctx *ioctx, + enum srpt_command_state new) +{ + enum srpt_command_state previous; + unsigned long flags; + + BUG_ON(!ioctx); + + spin_lock_irqsave(&ioctx->spinlock, flags); + previous = ioctx->state; + if (previous != SRPT_STATE_DONE) + ioctx->state = new; + spin_unlock_irqrestore(&ioctx->spinlock, flags); + + return previous; +} + +/** + * srpt_test_and_set_cmd_state() - Test and set the state of a command. + * + * Returns true if and only if the previous command state was equal to 'old'. + */ +static bool srpt_test_and_set_cmd_state(struct srpt_send_ioctx *ioctx, + enum srpt_command_state old, + enum srpt_command_state new) +{ + enum srpt_command_state previous; + unsigned long flags; + + WARN_ON(!ioctx); + WARN_ON(old == SRPT_STATE_DONE); + WARN_ON(new == SRPT_STATE_NEW); + + spin_lock_irqsave(&ioctx->spinlock, flags); + previous = ioctx->state; + if (previous == old) + ioctx->state = new; + spin_unlock_irqrestore(&ioctx->spinlock, flags); + return previous == old; +} + +/** + * srpt_post_recv() - Post an IB receive request. + */ +static int srpt_post_recv(struct srpt_device *sdev, + struct srpt_recv_ioctx *ioctx) +{ + struct ib_sge list; + struct ib_recv_wr wr, *bad_wr; + + BUG_ON(!sdev); + wr.wr_id = encode_wr_id(SRPT_RECV, ioctx->ioctx.index); + + list.addr = ioctx->ioctx.dma; + list.length = srp_max_req_size; + list.lkey = sdev->mr->lkey; + + wr.next = NULL; + wr.sg_list = &list; + wr.num_sge = 1; + + return ib_post_srq_recv(sdev->srq, &wr, &bad_wr); +} + +/** + * srpt_post_send() - Post an IB send request. + * + * Returns zero upon success and a non-zero value upon failure. + */ +static int srpt_post_send(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, int len) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct srpt_device *sdev = ch->sport->sdev; + int ret; + + atomic_inc(&ch->req_lim); + + ret = -ENOMEM; + if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) { + printk(KERN_WARNING "IB send queue full (needed 1)\n"); + goto out; + } + + ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, len, + DMA_TO_DEVICE); + + list.addr = ioctx->ioctx.dma; + list.length = len; + list.lkey = sdev->mr->lkey; + + wr.next = NULL; + wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index); + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(ch->qp, &wr, &bad_wr); + +out: + if (ret < 0) { + atomic_inc(&ch->sq_wr_avail); + atomic_dec(&ch->req_lim); + } + return ret; +} + +/** + * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request. + * @ioctx: Pointer to the I/O context associated with the request. + * @srp_cmd: Pointer to the SRP_CMD request data. + * @dir: Pointer to the variable to which the transfer direction will be + * written. + * @data_len: Pointer to the variable to which the total data length of all + * descriptors in the SRP_CMD request will be written. + * + * This function initializes ioctx->nrbuf and ioctx->r_bufs. + * + * Returns -EINVAL when the SRP_CMD request contains inconsistent descriptors; + * -ENOMEM when memory allocation fails and zero upon success. + */ +static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, + struct srp_cmd *srp_cmd, + enum dma_data_direction *dir, u64 *data_len) +{ + struct srp_indirect_buf *idb; + struct srp_direct_buf *db; + unsigned add_cdb_offset; + int ret; + + /* + * The pointer computations below will only be compiled correctly + * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check + * whether srp_cmd::add_data has been declared as a byte pointer. + */ + BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) + && !__same_type(srp_cmd->add_data[0], (u8)0)); + + BUG_ON(!dir); + BUG_ON(!data_len); + + ret = 0; + *data_len = 0; + + /* + * The lower four bits of the buffer format field contain the DATA-IN + * buffer descriptor format, and the highest four bits contain the + * DATA-OUT buffer descriptor format. + */ + *dir = DMA_NONE; + if (srp_cmd->buf_fmt & 0xf) + /* DATA-IN: transfer data from target to initiator (read). */ + *dir = DMA_FROM_DEVICE; + else if (srp_cmd->buf_fmt >> 4) + /* DATA-OUT: transfer data from initiator to target (write). */ + *dir = DMA_TO_DEVICE; + + /* + * According to the SRP spec, the lower two bits of the 'ADDITIONAL + * CDB LENGTH' field are reserved and the size in bytes of this field + * is four times the value specified in bits 3..7. Hence the "& ~3". + */ + add_cdb_offset = srp_cmd->add_cdb_len & ~3; + if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) || + ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) { + ioctx->n_rbuf = 1; + ioctx->rbufs = &ioctx->single_rbuf; + + db = (struct srp_direct_buf *)(srp_cmd->add_data + + add_cdb_offset); + memcpy(ioctx->rbufs, db, sizeof *db); + *data_len = be32_to_cpu(db->len); + } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) || + ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) { + idb = (struct srp_indirect_buf *)(srp_cmd->add_data + + add_cdb_offset); + + ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof *db; + + if (ioctx->n_rbuf > + (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) { + printk(KERN_ERR "received unsupported SRP_CMD request" + " type (%u out + %u in != %u / %zu)\n", + srp_cmd->data_out_desc_cnt, + srp_cmd->data_in_desc_cnt, + be32_to_cpu(idb->table_desc.len), + sizeof(*db)); + ioctx->n_rbuf = 0; + ret = -EINVAL; + goto out; + } + + if (ioctx->n_rbuf == 1) + ioctx->rbufs = &ioctx->single_rbuf; + else { + ioctx->rbufs = + kmalloc(ioctx->n_rbuf * sizeof *db, GFP_ATOMIC); + if (!ioctx->rbufs) { + ioctx->n_rbuf = 0; + ret = -ENOMEM; + goto out; + } + } + + db = idb->desc_list; + memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof *db); + *data_len = be32_to_cpu(idb->len); + } +out: + return ret; +} + +/** + * srpt_init_ch_qp() - Initialize queue pair attributes. + * + * Initialized the attributes of queue pair 'qp' by allowing local write, + * remote read and remote write. Also transitions 'qp' to state IB_QPS_INIT. + */ +static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ + struct ib_qp_attr *attr; + int ret; + + attr = kzalloc(sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + attr->qp_state = IB_QPS_INIT; + attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + attr->port_num = ch->sport->port; + attr->pkey_index = 0; + + ret = ib_modify_qp(qp, attr, + IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PORT | + IB_QP_PKEY_INDEX); + + kfree(attr); + return ret; +} + +/** + * srpt_ch_qp_rtr() - Change the state of a channel to 'ready to receive' (RTR). + * @ch: channel of the queue pair. + * @qp: queue pair to change the state of. + * + * Returns zero upon success and a negative value upon failure. + * + * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system. + * If this structure ever becomes larger, it might be necessary to allocate + * it dynamically instead of on the stack. + */ +static int srpt_ch_qp_rtr(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int attr_mask; + int ret; + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask); + if (ret) + goto out; + + qp_attr.max_dest_rd_atomic = 4; + + ret = ib_modify_qp(qp, &qp_attr, attr_mask); + +out: + return ret; +} + +/** + * srpt_ch_qp_rts() - Change the state of a channel to 'ready to send' (RTS). + * @ch: channel of the queue pair. + * @qp: queue pair to change the state of. + * + * Returns zero upon success and a negative value upon failure. + * + * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system. + * If this structure ever becomes larger, it might be necessary to allocate + * it dynamically instead of on the stack. + */ +static int srpt_ch_qp_rts(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int attr_mask; + int ret; + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask); + if (ret) + goto out; + + qp_attr.max_rd_atomic = 4; + + ret = ib_modify_qp(qp, &qp_attr, attr_mask); + +out: + return ret; +} + +/** + * srpt_ch_qp_err() - Set the channel queue pair state to 'error'. + */ +static int srpt_ch_qp_err(struct srpt_rdma_ch *ch) +{ + struct ib_qp_attr qp_attr; + + qp_attr.qp_state = IB_QPS_ERR; + return ib_modify_qp(ch->qp, &qp_attr, IB_QP_STATE); +} + +/** + * srpt_unmap_sg_to_ib_sge() - Unmap an IB SGE list. + */ +static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + struct scatterlist *sg; + enum dma_data_direction dir; + + BUG_ON(!ch); + BUG_ON(!ioctx); + BUG_ON(ioctx->n_rdma && !ioctx->rdma_ius); + + while (ioctx->n_rdma) + kfree(ioctx->rdma_ius[--ioctx->n_rdma].sge); + + kfree(ioctx->rdma_ius); + ioctx->rdma_ius = NULL; + + if (ioctx->mapped_sg_count) { + sg = ioctx->sg; + WARN_ON(!sg); + dir = ioctx->cmd.data_direction; + BUG_ON(dir == DMA_NONE); + ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt, + opposite_dma_dir(dir)); + ioctx->mapped_sg_count = 0; + } +} + +/** + * srpt_map_sg_to_ib_sge() - Map an SG list to an IB SGE list. + */ +static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + struct ib_device *dev = ch->sport->sdev->device; + struct se_cmd *cmd; + struct scatterlist *sg, *sg_orig; + int sg_cnt; + enum dma_data_direction dir; + struct rdma_iu *riu; + struct srp_direct_buf *db; + dma_addr_t dma_addr; + struct ib_sge *sge; + u64 raddr; + u32 rsize; + u32 tsize; + u32 dma_len; + int count, nrdma; + int i, j, k; + + BUG_ON(!ch); + BUG_ON(!ioctx); + cmd = &ioctx->cmd; + dir = cmd->data_direction; + BUG_ON(dir == DMA_NONE); + + ioctx->sg = sg = sg_orig = cmd->t_data_sg; + ioctx->sg_cnt = sg_cnt = cmd->t_data_nents; + + count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt, + opposite_dma_dir(dir)); + if (unlikely(!count)) + return -EAGAIN; + + ioctx->mapped_sg_count = count; + + if (ioctx->rdma_ius && ioctx->n_rdma_ius) + nrdma = ioctx->n_rdma_ius; + else { + nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE + + ioctx->n_rbuf; + + ioctx->rdma_ius = kzalloc(nrdma * sizeof *riu, GFP_KERNEL); + if (!ioctx->rdma_ius) + goto free_mem; + + ioctx->n_rdma_ius = nrdma; + } + + db = ioctx->rbufs; + tsize = cmd->data_length; + dma_len = ib_sg_dma_len(dev, &sg[0]); + riu = ioctx->rdma_ius; + + /* + * For each remote desc - calculate the #ib_sge. + * If #ib_sge < SRPT_DEF_SG_PER_WQE per rdma operation then + * each remote desc rdma_iu is required a rdma wr; + * else + * we need to allocate extra rdma_iu to carry extra #ib_sge in + * another rdma wr + */ + for (i = 0, j = 0; + j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { + rsize = be32_to_cpu(db->len); + raddr = be64_to_cpu(db->va); + riu->raddr = raddr; + riu->rkey = be32_to_cpu(db->key); + riu->sge_cnt = 0; + + /* calculate how many sge required for this remote_buf */ + while (rsize > 0 && tsize > 0) { + + if (rsize >= dma_len) { + tsize -= dma_len; + rsize -= dma_len; + raddr += dma_len; + + if (tsize > 0) { + ++j; + if (j < count) { + sg = sg_next(sg); + dma_len = ib_sg_dma_len( + dev, sg); + } + } + } else { + tsize -= rsize; + dma_len -= rsize; + rsize = 0; + } + + ++riu->sge_cnt; + + if (rsize > 0 && riu->sge_cnt == SRPT_DEF_SG_PER_WQE) { + ++ioctx->n_rdma; + riu->sge = + kmalloc(riu->sge_cnt * sizeof *riu->sge, + GFP_KERNEL); + if (!riu->sge) + goto free_mem; + + ++riu; + riu->sge_cnt = 0; + riu->raddr = raddr; + riu->rkey = be32_to_cpu(db->key); + } + } + + ++ioctx->n_rdma; + riu->sge = kmalloc(riu->sge_cnt * sizeof *riu->sge, + GFP_KERNEL); + if (!riu->sge) + goto free_mem; + } + + db = ioctx->rbufs; + tsize = cmd->data_length; + riu = ioctx->rdma_ius; + sg = sg_orig; + dma_len = ib_sg_dma_len(dev, &sg[0]); + dma_addr = ib_sg_dma_address(dev, &sg[0]); + + /* this second loop is really mapped sg_addres to rdma_iu->ib_sge */ + for (i = 0, j = 0; + j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { + rsize = be32_to_cpu(db->len); + sge = riu->sge; + k = 0; + + while (rsize > 0 && tsize > 0) { + sge->addr = dma_addr; + sge->lkey = ch->sport->sdev->mr->lkey; + + if (rsize >= dma_len) { + sge->length = + (tsize < dma_len) ? tsize : dma_len; + tsize -= dma_len; + rsize -= dma_len; + + if (tsize > 0) { + ++j; + if (j < count) { + sg = sg_next(sg); + dma_len = ib_sg_dma_len( + dev, sg); + dma_addr = ib_sg_dma_address( + dev, sg); + } + } + } else { + sge->length = (tsize < rsize) ? tsize : rsize; + tsize -= rsize; + dma_len -= rsize; + dma_addr += rsize; + rsize = 0; + } + + ++k; + if (k == riu->sge_cnt && rsize > 0 && tsize > 0) { + ++riu; + sge = riu->sge; + k = 0; + } else if (rsize > 0 && tsize > 0) + ++sge; + } + } + + return 0; + +free_mem: + srpt_unmap_sg_to_ib_sge(ch, ioctx); + + return -ENOMEM; +} + +/** + * srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator. + */ +static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) +{ + struct srpt_send_ioctx *ioctx; + unsigned long flags; + + BUG_ON(!ch); + + ioctx = NULL; + spin_lock_irqsave(&ch->spinlock, flags); + if (!list_empty(&ch->free_list)) { + ioctx = list_first_entry(&ch->free_list, + struct srpt_send_ioctx, free_list); + list_del(&ioctx->free_list); + } + spin_unlock_irqrestore(&ch->spinlock, flags); + + if (!ioctx) + return ioctx; + + BUG_ON(ioctx->ch != ch); + spin_lock_init(&ioctx->spinlock); + ioctx->state = SRPT_STATE_NEW; + ioctx->n_rbuf = 0; + ioctx->rbufs = NULL; + ioctx->n_rdma = 0; + ioctx->n_rdma_ius = 0; + ioctx->rdma_ius = NULL; + ioctx->mapped_sg_count = 0; + init_completion(&ioctx->tx_done); + ioctx->queue_status_only = false; + /* + * transport_init_se_cmd() does not initialize all fields, so do it + * here. + */ + memset(&ioctx->cmd, 0, sizeof(ioctx->cmd)); + memset(&ioctx->sense_data, 0, sizeof(ioctx->sense_data)); + + return ioctx; +} + +/** + * srpt_abort_cmd() - Abort a SCSI command. + * @ioctx: I/O context associated with the SCSI command. + * @context: Preferred execution context. + */ +static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) +{ + enum srpt_command_state state; + unsigned long flags; + + BUG_ON(!ioctx); + + /* + * If the command is in a state where the target core is waiting for + * the ib_srpt driver, change the state to the next state. Changing + * the state of the command from SRPT_STATE_NEED_DATA to + * SRPT_STATE_DATA_IN ensures that srpt_xmit_response() will call this + * function a second time. + */ + + spin_lock_irqsave(&ioctx->spinlock, flags); + state = ioctx->state; + switch (state) { + case SRPT_STATE_NEED_DATA: + ioctx->state = SRPT_STATE_DATA_IN; + break; + case SRPT_STATE_DATA_IN: + case SRPT_STATE_CMD_RSP_SENT: + case SRPT_STATE_MGMT_RSP_SENT: + ioctx->state = SRPT_STATE_DONE; + break; + default: + break; + } + spin_unlock_irqrestore(&ioctx->spinlock, flags); + + if (state == SRPT_STATE_DONE) { + struct srpt_rdma_ch *ch = ioctx->ch; + + BUG_ON(ch->sess == NULL); + + target_put_sess_cmd(ch->sess, &ioctx->cmd); + goto out; + } + + pr_debug("Aborting cmd with state %d and tag %lld\n", state, + ioctx->tag); + + switch (state) { + case SRPT_STATE_NEW: + case SRPT_STATE_DATA_IN: + case SRPT_STATE_MGMT: + /* + * Do nothing - defer abort processing until + * srpt_queue_response() is invoked. + */ + WARN_ON(!transport_check_aborted_status(&ioctx->cmd, false)); + break; + case SRPT_STATE_NEED_DATA: + /* DMA_TO_DEVICE (write) - RDMA read error. */ + + /* XXX(hch): this is a horrible layering violation.. */ + spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); + ioctx->cmd.transport_state &= ~CMD_T_ACTIVE; + spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); + break; + case SRPT_STATE_CMD_RSP_SENT: + /* + * SRP_RSP sending failed or the SRP_RSP send completion has + * not been received in time. + */ + srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); + target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); + break; + case SRPT_STATE_MGMT_RSP_SENT: + srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); + break; + default: + WARN(1, "Unexpected command state (%d)", state); + break; + } + +out: + return state; +} + +/** + * srpt_handle_send_err_comp() - Process an IB_WC_SEND error completion. + */ +static void srpt_handle_send_err_comp(struct srpt_rdma_ch *ch, u64 wr_id) +{ + struct srpt_send_ioctx *ioctx; + enum srpt_command_state state; + struct se_cmd *cmd; + u32 index; + + atomic_inc(&ch->sq_wr_avail); + + index = idx_from_wr_id(wr_id); + ioctx = ch->ioctx_ring[index]; + state = srpt_get_cmd_state(ioctx); + cmd = &ioctx->cmd; + + WARN_ON(state != SRPT_STATE_CMD_RSP_SENT + && state != SRPT_STATE_MGMT_RSP_SENT + && state != SRPT_STATE_NEED_DATA + && state != SRPT_STATE_DONE); + + /* If SRP_RSP sending failed, undo the ch->req_lim change. */ + if (state == SRPT_STATE_CMD_RSP_SENT + || state == SRPT_STATE_MGMT_RSP_SENT) + atomic_dec(&ch->req_lim); + + srpt_abort_cmd(ioctx); +} + +/** + * srpt_handle_send_comp() - Process an IB send completion notification. + */ +static void srpt_handle_send_comp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + enum srpt_command_state state; + + atomic_inc(&ch->sq_wr_avail); + + state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + + if (WARN_ON(state != SRPT_STATE_CMD_RSP_SENT + && state != SRPT_STATE_MGMT_RSP_SENT + && state != SRPT_STATE_DONE)) + pr_debug("state = %d\n", state); + + if (state != SRPT_STATE_DONE) { + srpt_unmap_sg_to_ib_sge(ch, ioctx); + transport_generic_free_cmd(&ioctx->cmd, 0); + } else { + printk(KERN_ERR "IB completion has been received too late for" + " wr_id = %u.\n", ioctx->ioctx.index); + } +} + +/** + * srpt_handle_rdma_comp() - Process an IB RDMA completion notification. + * + * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping + * the data that has been transferred via IB RDMA had to be postponed until the + * check_stop_free() callback. None of this is necessary anymore and needs to + * be cleaned up. + */ +static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, + enum srpt_opcode opcode) +{ + WARN_ON(ioctx->n_rdma <= 0); + atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); + + if (opcode == SRPT_RDMA_READ_LAST) { + if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA, + SRPT_STATE_DATA_IN)) + target_execute_cmd(&ioctx->cmd); + else + printk(KERN_ERR "%s[%d]: wrong state = %d\n", __func__, + __LINE__, srpt_get_cmd_state(ioctx)); + } else if (opcode == SRPT_RDMA_ABORT) { + ioctx->rdma_aborted = true; + } else { + WARN(true, "unexpected opcode %d\n", opcode); + } +} + +/** + * srpt_handle_rdma_err_comp() - Process an IB RDMA error completion. + */ +static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, + enum srpt_opcode opcode) +{ + struct se_cmd *cmd; + enum srpt_command_state state; + + cmd = &ioctx->cmd; + state = srpt_get_cmd_state(ioctx); + switch (opcode) { + case SRPT_RDMA_READ_LAST: + if (ioctx->n_rdma <= 0) { + printk(KERN_ERR "Received invalid RDMA read" + " error completion with idx %d\n", + ioctx->ioctx.index); + break; + } + atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); + if (state == SRPT_STATE_NEED_DATA) + srpt_abort_cmd(ioctx); + else + printk(KERN_ERR "%s[%d]: wrong state = %d\n", + __func__, __LINE__, state); + break; + case SRPT_RDMA_WRITE_LAST: + break; + default: + printk(KERN_ERR "%s[%d]: opcode = %u\n", __func__, + __LINE__, opcode); + break; + } +} + +/** + * srpt_build_cmd_rsp() - Build an SRP_RSP response. + * @ch: RDMA channel through which the request has been received. + * @ioctx: I/O context associated with the SRP_CMD request. The response will + * be built in the buffer ioctx->buf points at and hence this function will + * overwrite the request data. + * @tag: tag of the request for which this response is being generated. + * @status: value for the STATUS field of the SRP_RSP information unit. + * + * Returns the size in bytes of the SRP_RSP response. + * + * An SRP_RSP response contains a SCSI status or service response. See also + * section 6.9 in the SRP r16a document for the format of an SRP_RSP + * response. See also SPC-2 for more information about sense data. + */ +static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, u64 tag, + int status) +{ + struct srp_rsp *srp_rsp; + const u8 *sense_data; + int sense_data_len, max_sense_len; + + /* + * The lowest bit of all SAM-3 status codes is zero (see also + * paragraph 5.3 in SAM-3). + */ + WARN_ON(status & 1); + + srp_rsp = ioctx->ioctx.buf; + BUG_ON(!srp_rsp); + + sense_data = ioctx->sense_data; + sense_data_len = ioctx->cmd.scsi_sense_length; + WARN_ON(sense_data_len > sizeof(ioctx->sense_data)); + + memset(srp_rsp, 0, sizeof *srp_rsp); + srp_rsp->opcode = SRP_RSP; + srp_rsp->req_lim_delta = + __constant_cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0)); + srp_rsp->tag = tag; + srp_rsp->status = status; + + if (sense_data_len) { + BUILD_BUG_ON(MIN_MAX_RSP_SIZE <= sizeof(*srp_rsp)); + max_sense_len = ch->max_ti_iu_len - sizeof(*srp_rsp); + if (sense_data_len > max_sense_len) { + printk(KERN_WARNING "truncated sense data from %d to %d" + " bytes\n", sense_data_len, max_sense_len); + sense_data_len = max_sense_len; + } + + srp_rsp->flags |= SRP_RSP_FLAG_SNSVALID; + srp_rsp->sense_data_len = cpu_to_be32(sense_data_len); + memcpy(srp_rsp + 1, sense_data, sense_data_len); + } + + return sizeof(*srp_rsp) + sense_data_len; +} + +/** + * srpt_build_tskmgmt_rsp() - Build a task management response. + * @ch: RDMA channel through which the request has been received. + * @ioctx: I/O context in which the SRP_RSP response will be built. + * @rsp_code: RSP_CODE that will be stored in the response. + * @tag: Tag of the request for which this response is being generated. + * + * Returns the size in bytes of the SRP_RSP response. + * + * An SRP_RSP response contains a SCSI status or service response. See also + * section 6.9 in the SRP r16a document for the format of an SRP_RSP + * response. + */ +static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, + u8 rsp_code, u64 tag) +{ + struct srp_rsp *srp_rsp; + int resp_data_len; + int resp_len; + + resp_data_len = 4; + resp_len = sizeof(*srp_rsp) + resp_data_len; + + srp_rsp = ioctx->ioctx.buf; + BUG_ON(!srp_rsp); + memset(srp_rsp, 0, sizeof *srp_rsp); + + srp_rsp->opcode = SRP_RSP; + srp_rsp->req_lim_delta = __constant_cpu_to_be32(1 + + atomic_xchg(&ch->req_lim_delta, 0)); + srp_rsp->tag = tag; + + srp_rsp->flags |= SRP_RSP_FLAG_RSPVALID; + srp_rsp->resp_data_len = cpu_to_be32(resp_data_len); + srp_rsp->data[3] = rsp_code; + + return resp_len; +} + +#define NO_SUCH_LUN ((uint64_t)-1LL) + +/* + * SCSI LUN addressing method. See also SAM-2 and the section about + * eight byte LUNs. + */ +enum scsi_lun_addr_method { + SCSI_LUN_ADDR_METHOD_PERIPHERAL = 0, + SCSI_LUN_ADDR_METHOD_FLAT = 1, + SCSI_LUN_ADDR_METHOD_LUN = 2, + SCSI_LUN_ADDR_METHOD_EXTENDED_LUN = 3, +}; + +/* + * srpt_unpack_lun() - Convert from network LUN to linear LUN. + * + * Convert an 2-byte, 4-byte, 6-byte or 8-byte LUN structure in network byte + * order (big endian) to a linear LUN. Supports three LUN addressing methods: + * peripheral, flat and logical unit. See also SAM-2, section 4.9.4 (page 40). + */ +static uint64_t srpt_unpack_lun(const uint8_t *lun, int len) +{ + uint64_t res = NO_SUCH_LUN; + int addressing_method; + + if (unlikely(len < 2)) { + printk(KERN_ERR "Illegal LUN length %d, expected 2 bytes or " + "more", len); + goto out; + } + + switch (len) { + case 8: + if ((*((__be64 *)lun) & + __constant_cpu_to_be64(0x0000FFFFFFFFFFFFLL)) != 0) + goto out_err; + break; + case 4: + if (*((__be16 *)&lun[2]) != 0) + goto out_err; + break; + case 6: + if (*((__be32 *)&lun[2]) != 0) + goto out_err; + break; + case 2: + break; + default: + goto out_err; + } + + addressing_method = (*lun) >> 6; /* highest two bits of byte 0 */ + switch (addressing_method) { + case SCSI_LUN_ADDR_METHOD_PERIPHERAL: + case SCSI_LUN_ADDR_METHOD_FLAT: + case SCSI_LUN_ADDR_METHOD_LUN: + res = *(lun + 1) | (((*lun) & 0x3f) << 8); + break; + + case SCSI_LUN_ADDR_METHOD_EXTENDED_LUN: + default: + printk(KERN_ERR "Unimplemented LUN addressing method %u", + addressing_method); + break; + } + +out: + return res; + +out_err: + printk(KERN_ERR "Support for multi-level LUNs has not yet been" + " implemented"); + goto out; +} + +static int srpt_check_stop_free(struct se_cmd *cmd) +{ + struct srpt_send_ioctx *ioctx = container_of(cmd, + struct srpt_send_ioctx, cmd); + + return target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); +} + +/** + * srpt_handle_cmd() - Process SRP_CMD. + */ +static int srpt_handle_cmd(struct srpt_rdma_ch *ch, + struct srpt_recv_ioctx *recv_ioctx, + struct srpt_send_ioctx *send_ioctx) +{ + struct se_cmd *cmd; + struct srp_cmd *srp_cmd; + uint64_t unpacked_lun; + u64 data_len; + enum dma_data_direction dir; + sense_reason_t ret; + int rc; + + BUG_ON(!send_ioctx); + + srp_cmd = recv_ioctx->ioctx.buf; + cmd = &send_ioctx->cmd; + send_ioctx->tag = srp_cmd->tag; + + switch (srp_cmd->task_attr) { + case SRP_CMD_SIMPLE_Q: + cmd->sam_task_attr = MSG_SIMPLE_TAG; + break; + case SRP_CMD_ORDERED_Q: + default: + cmd->sam_task_attr = MSG_ORDERED_TAG; + break; + case SRP_CMD_HEAD_OF_Q: + cmd->sam_task_attr = MSG_HEAD_TAG; + break; + case SRP_CMD_ACA: + cmd->sam_task_attr = MSG_ACA_TAG; + break; + } + + if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) { + printk(KERN_ERR "0x%llx: parsing SRP descriptor table failed.\n", + srp_cmd->tag); + ret = TCM_INVALID_CDB_FIELD; + goto send_sense; + } + + unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_cmd->lun, + sizeof(srp_cmd->lun)); + rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb, + &send_ioctx->sense_data[0], unpacked_lun, data_len, + MSG_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF); + if (rc != 0) { + ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + goto send_sense; + } + return 0; + +send_sense: + transport_send_check_condition_and_sense(cmd, ret, 0); + return -1; +} + +/** + * srpt_rx_mgmt_fn_tag() - Process a task management function by tag. + * @ch: RDMA channel of the task management request. + * @fn: Task management function to perform. + * @req_tag: Tag of the SRP task management request. + * @mgmt_ioctx: I/O context of the task management request. + * + * Returns zero if the target core will process the task management + * request asynchronously. + * + * Note: It is assumed that the initiator serializes tag-based task management + * requests. + */ +static int srpt_rx_mgmt_fn_tag(struct srpt_send_ioctx *ioctx, u64 tag) +{ + struct srpt_device *sdev; + struct srpt_rdma_ch *ch; + struct srpt_send_ioctx *target; + int ret, i; + + ret = -EINVAL; + ch = ioctx->ch; + BUG_ON(!ch); + BUG_ON(!ch->sport); + sdev = ch->sport->sdev; + BUG_ON(!sdev); + spin_lock_irq(&sdev->spinlock); + for (i = 0; i < ch->rq_size; ++i) { + target = ch->ioctx_ring[i]; + if (target->cmd.se_lun == ioctx->cmd.se_lun && + target->tag == tag && + srpt_get_cmd_state(target) != SRPT_STATE_DONE) { + ret = 0; + /* now let the target core abort &target->cmd; */ + break; + } + } + spin_unlock_irq(&sdev->spinlock); + return ret; +} + +static int srp_tmr_to_tcm(int fn) +{ + switch (fn) { + case SRP_TSK_ABORT_TASK: + return TMR_ABORT_TASK; + case SRP_TSK_ABORT_TASK_SET: + return TMR_ABORT_TASK_SET; + case SRP_TSK_CLEAR_TASK_SET: + return TMR_CLEAR_TASK_SET; + case SRP_TSK_LUN_RESET: + return TMR_LUN_RESET; + case SRP_TSK_CLEAR_ACA: + return TMR_CLEAR_ACA; + default: + return -1; + } +} + +/** + * srpt_handle_tsk_mgmt() - Process an SRP_TSK_MGMT information unit. + * + * Returns 0 if and only if the request will be processed by the target core. + * + * For more information about SRP_TSK_MGMT information units, see also section + * 6.7 in the SRP r16a document. + */ +static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, + struct srpt_recv_ioctx *recv_ioctx, + struct srpt_send_ioctx *send_ioctx) +{ + struct srp_tsk_mgmt *srp_tsk; + struct se_cmd *cmd; + struct se_session *sess = ch->sess; + uint64_t unpacked_lun; + uint32_t tag = 0; + int tcm_tmr; + int rc; + + BUG_ON(!send_ioctx); + + srp_tsk = recv_ioctx->ioctx.buf; + cmd = &send_ioctx->cmd; + + pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld" + " cm_id %p sess %p\n", srp_tsk->tsk_mgmt_func, + srp_tsk->task_tag, srp_tsk->tag, ch->cm_id, ch->sess); + + srpt_set_cmd_state(send_ioctx, SRPT_STATE_MGMT); + send_ioctx->tag = srp_tsk->tag; + tcm_tmr = srp_tmr_to_tcm(srp_tsk->tsk_mgmt_func); + if (tcm_tmr < 0) { + send_ioctx->cmd.se_tmr_req->response = + TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED; + goto fail; + } + unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_tsk->lun, + sizeof(srp_tsk->lun)); + + if (srp_tsk->tsk_mgmt_func == SRP_TSK_ABORT_TASK) { + rc = srpt_rx_mgmt_fn_tag(send_ioctx, srp_tsk->task_tag); + if (rc < 0) { + send_ioctx->cmd.se_tmr_req->response = + TMR_TASK_DOES_NOT_EXIST; + goto fail; + } + tag = srp_tsk->task_tag; + } + rc = target_submit_tmr(&send_ioctx->cmd, sess, NULL, unpacked_lun, + srp_tsk, tcm_tmr, GFP_KERNEL, tag, + TARGET_SCF_ACK_KREF); + if (rc != 0) { + send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED; + goto fail; + } + return; +fail: + transport_send_check_condition_and_sense(cmd, 0, 0); // XXX: +} + +/** + * srpt_handle_new_iu() - Process a newly received information unit. + * @ch: RDMA channel through which the information unit has been received. + * @ioctx: SRPT I/O context associated with the information unit. + */ +static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, + struct srpt_recv_ioctx *recv_ioctx, + struct srpt_send_ioctx *send_ioctx) +{ + struct srp_cmd *srp_cmd; + enum rdma_ch_state ch_state; + + BUG_ON(!ch); + BUG_ON(!recv_ioctx); + + ib_dma_sync_single_for_cpu(ch->sport->sdev->device, + recv_ioctx->ioctx.dma, srp_max_req_size, + DMA_FROM_DEVICE); + + ch_state = srpt_get_ch_state(ch); + if (unlikely(ch_state == CH_CONNECTING)) { + list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); + goto out; + } + + if (unlikely(ch_state != CH_LIVE)) + goto out; + + srp_cmd = recv_ioctx->ioctx.buf; + if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) { + if (!send_ioctx) + send_ioctx = srpt_get_send_ioctx(ch); + if (unlikely(!send_ioctx)) { + list_add_tail(&recv_ioctx->wait_list, + &ch->cmd_wait_list); + goto out; + } + } + + switch (srp_cmd->opcode) { + case SRP_CMD: + srpt_handle_cmd(ch, recv_ioctx, send_ioctx); + break; + case SRP_TSK_MGMT: + srpt_handle_tsk_mgmt(ch, recv_ioctx, send_ioctx); + break; + case SRP_I_LOGOUT: + printk(KERN_ERR "Not yet implemented: SRP_I_LOGOUT\n"); + break; + case SRP_CRED_RSP: + pr_debug("received SRP_CRED_RSP\n"); + break; + case SRP_AER_RSP: + pr_debug("received SRP_AER_RSP\n"); + break; + case SRP_RSP: + printk(KERN_ERR "Received SRP_RSP\n"); + break; + default: + printk(KERN_ERR "received IU with unknown opcode 0x%x\n", + srp_cmd->opcode); + break; + } + + srpt_post_recv(ch->sport->sdev, recv_ioctx); +out: + return; +} + +static void srpt_process_rcv_completion(struct ib_cq *cq, + struct srpt_rdma_ch *ch, + struct ib_wc *wc) +{ + struct srpt_device *sdev = ch->sport->sdev; + struct srpt_recv_ioctx *ioctx; + u32 index; + + index = idx_from_wr_id(wc->wr_id); + if (wc->status == IB_WC_SUCCESS) { + int req_lim; + + req_lim = atomic_dec_return(&ch->req_lim); + if (unlikely(req_lim < 0)) + printk(KERN_ERR "req_lim = %d < 0\n", req_lim); + ioctx = sdev->ioctx_ring[index]; + srpt_handle_new_iu(ch, ioctx, NULL); + } else { + printk(KERN_INFO "receiving failed for idx %u with status %d\n", + index, wc->status); + } +} + +/** + * srpt_process_send_completion() - Process an IB send completion. + * + * Note: Although this has not yet been observed during tests, at least in + * theory it is possible that the srpt_get_send_ioctx() call invoked by + * srpt_handle_new_iu() fails. This is possible because the req_lim_delta + * value in each response is set to one, and it is possible that this response + * makes the initiator send a new request before the send completion for that + * response has been processed. This could e.g. happen if the call to + * srpt_put_send_iotcx() is delayed because of a higher priority interrupt or + * if IB retransmission causes generation of the send completion to be + * delayed. Incoming information units for which srpt_get_send_ioctx() fails + * are queued on cmd_wait_list. The code below processes these delayed + * requests one at a time. + */ +static void srpt_process_send_completion(struct ib_cq *cq, + struct srpt_rdma_ch *ch, + struct ib_wc *wc) +{ + struct srpt_send_ioctx *send_ioctx; + uint32_t index; + enum srpt_opcode opcode; + + index = idx_from_wr_id(wc->wr_id); + opcode = opcode_from_wr_id(wc->wr_id); + send_ioctx = ch->ioctx_ring[index]; + if (wc->status == IB_WC_SUCCESS) { + if (opcode == SRPT_SEND) + srpt_handle_send_comp(ch, send_ioctx); + else { + WARN_ON(opcode != SRPT_RDMA_ABORT && + wc->opcode != IB_WC_RDMA_READ); + srpt_handle_rdma_comp(ch, send_ioctx, opcode); + } + } else { + if (opcode == SRPT_SEND) { + printk(KERN_INFO "sending response for idx %u failed" + " with status %d\n", index, wc->status); + srpt_handle_send_err_comp(ch, wc->wr_id); + } else if (opcode != SRPT_RDMA_MID) { + printk(KERN_INFO "RDMA t %d for idx %u failed with" + " status %d", opcode, index, wc->status); + srpt_handle_rdma_err_comp(ch, send_ioctx, opcode); + } + } + + while (unlikely(opcode == SRPT_SEND + && !list_empty(&ch->cmd_wait_list) + && srpt_get_ch_state(ch) == CH_LIVE + && (send_ioctx = srpt_get_send_ioctx(ch)) != NULL)) { + struct srpt_recv_ioctx *recv_ioctx; + + recv_ioctx = list_first_entry(&ch->cmd_wait_list, + struct srpt_recv_ioctx, + wait_list); + list_del(&recv_ioctx->wait_list); + srpt_handle_new_iu(ch, recv_ioctx, send_ioctx); + } +} + +static void srpt_process_completion(struct ib_cq *cq, struct srpt_rdma_ch *ch) +{ + struct ib_wc *const wc = ch->wc; + int i, n; + + WARN_ON(cq != ch->cq); + + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + while ((n = ib_poll_cq(cq, ARRAY_SIZE(ch->wc), wc)) > 0) { + for (i = 0; i < n; i++) { + if (opcode_from_wr_id(wc[i].wr_id) == SRPT_RECV) + srpt_process_rcv_completion(cq, ch, &wc[i]); + else + srpt_process_send_completion(cq, ch, &wc[i]); + } + } +} + +/** + * srpt_completion() - IB completion queue callback function. + * + * Notes: + * - It is guaranteed that a completion handler will never be invoked + * concurrently on two different CPUs for the same completion queue. See also + * Documentation/infiniband/core_locking.txt and the implementation of + * handle_edge_irq() in kernel/irq/chip.c. + * - When threaded IRQs are enabled, completion handlers are invoked in thread + * context instead of interrupt context. + */ +static void srpt_completion(struct ib_cq *cq, void *ctx) +{ + struct srpt_rdma_ch *ch = ctx; + + wake_up_interruptible(&ch->wait_queue); +} + +static int srpt_compl_thread(void *arg) +{ + struct srpt_rdma_ch *ch; + + /* Hibernation / freezing of the SRPT kernel thread is not supported. */ + current->flags |= PF_NOFREEZE; + + ch = arg; + BUG_ON(!ch); + printk(KERN_INFO "Session %s: kernel thread %s (PID %d) started\n", + ch->sess_name, ch->thread->comm, current->pid); + while (!kthread_should_stop()) { + wait_event_interruptible(ch->wait_queue, + (srpt_process_completion(ch->cq, ch), + kthread_should_stop())); + } + printk(KERN_INFO "Session %s: kernel thread %s (PID %d) stopped\n", + ch->sess_name, ch->thread->comm, current->pid); + return 0; +} + +/** + * srpt_create_ch_ib() - Create receive and send completion queues. + */ +static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) +{ + struct ib_qp_init_attr *qp_init; + struct srpt_port *sport = ch->sport; + struct srpt_device *sdev = sport->sdev; + u32 srp_sq_size = sport->port_attrib.srp_sq_size; + int ret; + + WARN_ON(ch->rq_size < 1); + + ret = -ENOMEM; + qp_init = kzalloc(sizeof *qp_init, GFP_KERNEL); + if (!qp_init) + goto out; + + ch->cq = ib_create_cq(sdev->device, srpt_completion, NULL, ch, + ch->rq_size + srp_sq_size, 0); + if (IS_ERR(ch->cq)) { + ret = PTR_ERR(ch->cq); + printk(KERN_ERR "failed to create CQ cqe= %d ret= %d\n", + ch->rq_size + srp_sq_size, ret); + goto out; + } + + qp_init->qp_context = (void *)ch; + qp_init->event_handler + = (void(*)(struct ib_event *, void*))srpt_qp_event; + qp_init->send_cq = ch->cq; + qp_init->recv_cq = ch->cq; + qp_init->srq = sdev->srq; + qp_init->sq_sig_type = IB_SIGNAL_REQ_WR; + qp_init->qp_type = IB_QPT_RC; + qp_init->cap.max_send_wr = srp_sq_size; + qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE; + + ch->qp = ib_create_qp(sdev->pd, qp_init); + if (IS_ERR(ch->qp)) { + ret = PTR_ERR(ch->qp); + printk(KERN_ERR "failed to create_qp ret= %d\n", ret); + goto err_destroy_cq; + } + + atomic_set(&ch->sq_wr_avail, qp_init->cap.max_send_wr); + + pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", + __func__, ch->cq->cqe, qp_init->cap.max_send_sge, + qp_init->cap.max_send_wr, ch->cm_id); + + ret = srpt_init_ch_qp(ch, ch->qp); + if (ret) + goto err_destroy_qp; + + init_waitqueue_head(&ch->wait_queue); + + pr_debug("creating thread for session %s\n", ch->sess_name); + + ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl"); + if (IS_ERR(ch->thread)) { + printk(KERN_ERR "failed to create kernel thread %ld\n", + PTR_ERR(ch->thread)); + ch->thread = NULL; + goto err_destroy_qp; + } + +out: + kfree(qp_init); + return ret; + +err_destroy_qp: + ib_destroy_qp(ch->qp); +err_destroy_cq: + ib_destroy_cq(ch->cq); + goto out; +} + +static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch) +{ + if (ch->thread) + kthread_stop(ch->thread); + + ib_destroy_qp(ch->qp); + ib_destroy_cq(ch->cq); +} + +/** + * __srpt_close_ch() - Close an RDMA channel by setting the QP error state. + * + * Reset the QP and make sure all resources associated with the channel will + * be deallocated at an appropriate time. + * + * Note: The caller must hold ch->sport->sdev->spinlock. + */ +static void __srpt_close_ch(struct srpt_rdma_ch *ch) +{ + struct srpt_device *sdev; + enum rdma_ch_state prev_state; + unsigned long flags; + + sdev = ch->sport->sdev; + + spin_lock_irqsave(&ch->spinlock, flags); + prev_state = ch->state; + switch (prev_state) { + case CH_CONNECTING: + case CH_LIVE: + ch->state = CH_DISCONNECTING; + break; + default: + break; + } + spin_unlock_irqrestore(&ch->spinlock, flags); + + switch (prev_state) { + case CH_CONNECTING: + ib_send_cm_rej(ch->cm_id, IB_CM_REJ_NO_RESOURCES, NULL, 0, + NULL, 0); + /* fall through */ + case CH_LIVE: + if (ib_send_cm_dreq(ch->cm_id, NULL, 0) < 0) + printk(KERN_ERR "sending CM DREQ failed.\n"); + break; + case CH_DISCONNECTING: + break; + case CH_DRAINING: + case CH_RELEASING: + break; + } +} + +/** + * srpt_close_ch() - Close an RDMA channel. + */ +static void srpt_close_ch(struct srpt_rdma_ch *ch) +{ + struct srpt_device *sdev; + + sdev = ch->sport->sdev; + spin_lock_irq(&sdev->spinlock); + __srpt_close_ch(ch); + spin_unlock_irq(&sdev->spinlock); +} + +/** + * srpt_shutdown_session() - Whether or not a session may be shut down. + */ +static int srpt_shutdown_session(struct se_session *se_sess) +{ + struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; + unsigned long flags; + + spin_lock_irqsave(&ch->spinlock, flags); + if (ch->in_shutdown) { + spin_unlock_irqrestore(&ch->spinlock, flags); + return true; + } + + ch->in_shutdown = true; + target_sess_cmd_list_set_waiting(se_sess); + spin_unlock_irqrestore(&ch->spinlock, flags); + + return true; +} + +/** + * srpt_drain_channel() - Drain a channel by resetting the IB queue pair. + * @cm_id: Pointer to the CM ID of the channel to be drained. + * + * Note: Must be called from inside srpt_cm_handler to avoid a race between + * accessing sdev->spinlock and the call to kfree(sdev) in srpt_remove_one() + * (the caller of srpt_cm_handler holds the cm_id spinlock; srpt_remove_one() + * waits until all target sessions for the associated IB device have been + * unregistered and target session registration involves a call to + * ib_destroy_cm_id(), which locks the cm_id spinlock and hence waits until + * this function has finished). + */ +static void srpt_drain_channel(struct ib_cm_id *cm_id) +{ + struct srpt_device *sdev; + struct srpt_rdma_ch *ch; + int ret; + bool do_reset = false; + + WARN_ON_ONCE(irqs_disabled()); + + sdev = cm_id->context; + BUG_ON(!sdev); + spin_lock_irq(&sdev->spinlock); + list_for_each_entry(ch, &sdev->rch_list, list) { + if (ch->cm_id == cm_id) { + do_reset = srpt_test_and_set_ch_state(ch, + CH_CONNECTING, CH_DRAINING) || + srpt_test_and_set_ch_state(ch, + CH_LIVE, CH_DRAINING) || + srpt_test_and_set_ch_state(ch, + CH_DISCONNECTING, CH_DRAINING); + break; + } + } + spin_unlock_irq(&sdev->spinlock); + + if (do_reset) { + if (ch->sess) + srpt_shutdown_session(ch->sess); + + ret = srpt_ch_qp_err(ch); + if (ret < 0) + printk(KERN_ERR "Setting queue pair in error state" + " failed: %d\n", ret); + } +} + +/** + * srpt_find_channel() - Look up an RDMA channel. + * @cm_id: Pointer to the CM ID of the channel to be looked up. + * + * Return NULL if no matching RDMA channel has been found. + */ +static struct srpt_rdma_ch *srpt_find_channel(struct srpt_device *sdev, + struct ib_cm_id *cm_id) +{ + struct srpt_rdma_ch *ch; + bool found; + + WARN_ON_ONCE(irqs_disabled()); + BUG_ON(!sdev); + + found = false; + spin_lock_irq(&sdev->spinlock); + list_for_each_entry(ch, &sdev->rch_list, list) { + if (ch->cm_id == cm_id) { + found = true; + break; + } + } + spin_unlock_irq(&sdev->spinlock); + + return found ? ch : NULL; +} + +/** + * srpt_release_channel() - Release channel resources. + * + * Schedules the actual release because: + * - Calling the ib_destroy_cm_id() call from inside an IB CM callback would + * trigger a deadlock. + * - It is not safe to call TCM transport_* functions from interrupt context. + */ +static void srpt_release_channel(struct srpt_rdma_ch *ch) +{ + schedule_work(&ch->release_work); +} + +static void srpt_release_channel_work(struct work_struct *w) +{ + struct srpt_rdma_ch *ch; + struct srpt_device *sdev; + struct se_session *se_sess; + + ch = container_of(w, struct srpt_rdma_ch, release_work); + pr_debug("ch = %p; ch->sess = %p; release_done = %p\n", ch, ch->sess, + ch->release_done); + + sdev = ch->sport->sdev; + BUG_ON(!sdev); + + se_sess = ch->sess; + BUG_ON(!se_sess); + + target_wait_for_sess_cmds(se_sess); + + transport_deregister_session_configfs(se_sess); + transport_deregister_session(se_sess); + ch->sess = NULL; + + ib_destroy_cm_id(ch->cm_id); + + srpt_destroy_ch_ib(ch); + + srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, + ch->sport->sdev, ch->rq_size, + ch->rsp_size, DMA_TO_DEVICE); + + spin_lock_irq(&sdev->spinlock); + list_del(&ch->list); + spin_unlock_irq(&sdev->spinlock); + + if (ch->release_done) + complete(ch->release_done); + + wake_up(&sdev->ch_releaseQ); + + kfree(ch); +} + +static struct srpt_node_acl *__srpt_lookup_acl(struct srpt_port *sport, + u8 i_port_id[16]) +{ + struct srpt_node_acl *nacl; + + list_for_each_entry(nacl, &sport->port_acl_list, list) + if (memcmp(nacl->i_port_id, i_port_id, + sizeof(nacl->i_port_id)) == 0) + return nacl; + + return NULL; +} + +static struct srpt_node_acl *srpt_lookup_acl(struct srpt_port *sport, + u8 i_port_id[16]) +{ + struct srpt_node_acl *nacl; + + spin_lock_irq(&sport->port_acl_lock); + nacl = __srpt_lookup_acl(sport, i_port_id); + spin_unlock_irq(&sport->port_acl_lock); + + return nacl; +} + +/** + * srpt_cm_req_recv() - Process the event IB_CM_REQ_RECEIVED. + * + * Ownership of the cm_id is transferred to the target session if this + * functions returns zero. Otherwise the caller remains the owner of cm_id. + */ +static int srpt_cm_req_recv(struct ib_cm_id *cm_id, + struct ib_cm_req_event_param *param, + void *private_data) +{ + struct srpt_device *sdev = cm_id->context; + struct srpt_port *sport = &sdev->port[param->port - 1]; + struct srp_login_req *req; + struct srp_login_rsp *rsp; + struct srp_login_rej *rej; + struct ib_cm_rep_param *rep_param; + struct srpt_rdma_ch *ch, *tmp_ch; + struct srpt_node_acl *nacl; + u32 it_iu_len; + int i; + int ret = 0; + + WARN_ON_ONCE(irqs_disabled()); + + if (WARN_ON(!sdev || !private_data)) + return -EINVAL; + + req = (struct srp_login_req *)private_data; + + it_iu_len = be32_to_cpu(req->req_it_iu_len); + + printk(KERN_INFO "Received SRP_LOGIN_REQ with i_port_id 0x%llx:0x%llx," + " t_port_id 0x%llx:0x%llx and it_iu_len %d on port %d" + " (guid=0x%llx:0x%llx)\n", + be64_to_cpu(*(__be64 *)&req->initiator_port_id[0]), + be64_to_cpu(*(__be64 *)&req->initiator_port_id[8]), + be64_to_cpu(*(__be64 *)&req->target_port_id[0]), + be64_to_cpu(*(__be64 *)&req->target_port_id[8]), + it_iu_len, + param->port, + be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]), + be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8])); + + rsp = kzalloc(sizeof *rsp, GFP_KERNEL); + rej = kzalloc(sizeof *rej, GFP_KERNEL); + rep_param = kzalloc(sizeof *rep_param, GFP_KERNEL); + + if (!rsp || !rej || !rep_param) { + ret = -ENOMEM; + goto out; + } + + if (it_iu_len > srp_max_req_size || it_iu_len < 64) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); + ret = -EINVAL; + printk(KERN_ERR "rejected SRP_LOGIN_REQ because its" + " length (%d bytes) is out of range (%d .. %d)\n", + it_iu_len, 64, srp_max_req_size); + goto reject; + } + + if (!sport->enabled) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + ret = -EINVAL; + printk(KERN_ERR "rejected SRP_LOGIN_REQ because the target port" + " has not yet been enabled\n"); + goto reject; + } + + if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; + + spin_lock_irq(&sdev->spinlock); + + list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) { + if (!memcmp(ch->i_port_id, req->initiator_port_id, 16) + && !memcmp(ch->t_port_id, req->target_port_id, 16) + && param->port == ch->sport->port + && param->listen_id == ch->sport->sdev->cm_id + && ch->cm_id) { + enum rdma_ch_state ch_state; + + ch_state = srpt_get_ch_state(ch); + if (ch_state != CH_CONNECTING + && ch_state != CH_LIVE) + continue; + + /* found an existing channel */ + pr_debug("Found existing channel %s" + " cm_id= %p state= %d\n", + ch->sess_name, ch->cm_id, ch_state); + + __srpt_close_ch(ch); + + rsp->rsp_flags = + SRP_LOGIN_RSP_MULTICHAN_TERMINATED; + } + } + + spin_unlock_irq(&sdev->spinlock); + + } else + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; + + if (*(__be64 *)req->target_port_id != cpu_to_be64(srpt_service_guid) + || *(__be64 *)(req->target_port_id + 8) != + cpu_to_be64(srpt_service_guid)) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); + ret = -ENOMEM; + printk(KERN_ERR "rejected SRP_LOGIN_REQ because it" + " has an invalid target port identifier.\n"); + goto reject; + } + + ch = kzalloc(sizeof *ch, GFP_KERNEL); + if (!ch) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + printk(KERN_ERR "rejected SRP_LOGIN_REQ because no memory.\n"); + ret = -ENOMEM; + goto reject; + } + + INIT_WORK(&ch->release_work, srpt_release_channel_work); + memcpy(ch->i_port_id, req->initiator_port_id, 16); + memcpy(ch->t_port_id, req->target_port_id, 16); + ch->sport = &sdev->port[param->port - 1]; + ch->cm_id = cm_id; + /* + * Avoid QUEUE_FULL conditions by limiting the number of buffers used + * for the SRP protocol to the command queue size. + */ + ch->rq_size = SRPT_RQ_SIZE; + spin_lock_init(&ch->spinlock); + ch->state = CH_CONNECTING; + INIT_LIST_HEAD(&ch->cmd_wait_list); + ch->rsp_size = ch->sport->port_attrib.srp_max_rsp_size; + + ch->ioctx_ring = (struct srpt_send_ioctx **) + srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size, + sizeof(*ch->ioctx_ring[0]), + ch->rsp_size, DMA_TO_DEVICE); + if (!ch->ioctx_ring) + goto free_ch; + + INIT_LIST_HEAD(&ch->free_list); + for (i = 0; i < ch->rq_size; i++) { + ch->ioctx_ring[i]->ch = ch; + list_add_tail(&ch->ioctx_ring[i]->free_list, &ch->free_list); + } + + ret = srpt_create_ch_ib(ch); + if (ret) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + printk(KERN_ERR "rejected SRP_LOGIN_REQ because creating" + " a new RDMA channel failed.\n"); + goto free_ring; + } + + ret = srpt_ch_qp_rtr(ch, ch->qp); + if (ret) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + printk(KERN_ERR "rejected SRP_LOGIN_REQ because enabling" + " RTR failed (error code = %d)\n", ret); + goto destroy_ib; + } + /* + * Use the initator port identifier as the session name. + */ + snprintf(ch->sess_name, sizeof(ch->sess_name), "0x%016llx%016llx", + be64_to_cpu(*(__be64 *)ch->i_port_id), + be64_to_cpu(*(__be64 *)(ch->i_port_id + 8))); + + pr_debug("registering session %s\n", ch->sess_name); + + nacl = srpt_lookup_acl(sport, ch->i_port_id); + if (!nacl) { + printk(KERN_INFO "Rejected login because no ACL has been" + " configured yet for initiator %s.\n", ch->sess_name); + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED); + goto destroy_ib; + } + + ch->sess = transport_init_session(TARGET_PROT_NORMAL); + if (IS_ERR(ch->sess)) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_debug("Failed to create session\n"); + goto deregister_session; + } + ch->sess->se_node_acl = &nacl->nacl; + transport_register_session(&sport->port_tpg_1, &nacl->nacl, ch->sess, ch); + + pr_debug("Establish connection sess=%p name=%s cm_id=%p\n", ch->sess, + ch->sess_name, ch->cm_id); + + /* create srp_login_response */ + rsp->opcode = SRP_LOGIN_RSP; + rsp->tag = req->tag; + rsp->max_it_iu_len = req->req_it_iu_len; + rsp->max_ti_iu_len = req->req_it_iu_len; + ch->max_ti_iu_len = it_iu_len; + rsp->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT + | SRP_BUF_FORMAT_INDIRECT); + rsp->req_lim_delta = cpu_to_be32(ch->rq_size); + atomic_set(&ch->req_lim, ch->rq_size); + atomic_set(&ch->req_lim_delta, 0); + + /* create cm reply */ + rep_param->qp_num = ch->qp->qp_num; + rep_param->private_data = (void *)rsp; + rep_param->private_data_len = sizeof *rsp; + rep_param->rnr_retry_count = 7; + rep_param->flow_control = 1; + rep_param->failover_accepted = 0; + rep_param->srq = 1; + rep_param->responder_resources = 4; + rep_param->initiator_depth = 4; + + ret = ib_send_cm_rep(cm_id, rep_param); + if (ret) { + printk(KERN_ERR "sending SRP_LOGIN_REQ response failed" + " (error code = %d)\n", ret); + goto release_channel; + } + + spin_lock_irq(&sdev->spinlock); + list_add_tail(&ch->list, &sdev->rch_list); + spin_unlock_irq(&sdev->spinlock); + + goto out; + +release_channel: + srpt_set_ch_state(ch, CH_RELEASING); + transport_deregister_session_configfs(ch->sess); + +deregister_session: + transport_deregister_session(ch->sess); + ch->sess = NULL; + +destroy_ib: + srpt_destroy_ch_ib(ch); + +free_ring: + srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, + ch->sport->sdev, ch->rq_size, + ch->rsp_size, DMA_TO_DEVICE); +free_ch: + kfree(ch); + +reject: + rej->opcode = SRP_LOGIN_REJ; + rej->tag = req->tag; + rej->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT + | SRP_BUF_FORMAT_INDIRECT); + + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, + (void *)rej, sizeof *rej); + +out: + kfree(rep_param); + kfree(rsp); + kfree(rej); + + return ret; +} + +static void srpt_cm_rej_recv(struct ib_cm_id *cm_id) +{ + printk(KERN_INFO "Received IB REJ for cm_id %p.\n", cm_id); + srpt_drain_channel(cm_id); +} + +/** + * srpt_cm_rtu_recv() - Process an IB_CM_RTU_RECEIVED or USER_ESTABLISHED event. + * + * An IB_CM_RTU_RECEIVED message indicates that the connection is established + * and that the recipient may begin transmitting (RTU = ready to use). + */ +static void srpt_cm_rtu_recv(struct ib_cm_id *cm_id) +{ + struct srpt_rdma_ch *ch; + int ret; + + ch = srpt_find_channel(cm_id->context, cm_id); + BUG_ON(!ch); + + if (srpt_test_and_set_ch_state(ch, CH_CONNECTING, CH_LIVE)) { + struct srpt_recv_ioctx *ioctx, *ioctx_tmp; + + ret = srpt_ch_qp_rts(ch, ch->qp); + + list_for_each_entry_safe(ioctx, ioctx_tmp, &ch->cmd_wait_list, + wait_list) { + list_del(&ioctx->wait_list); + srpt_handle_new_iu(ch, ioctx, NULL); + } + if (ret) + srpt_close_ch(ch); + } +} + +static void srpt_cm_timewait_exit(struct ib_cm_id *cm_id) +{ + printk(KERN_INFO "Received IB TimeWait exit for cm_id %p.\n", cm_id); + srpt_drain_channel(cm_id); +} + +static void srpt_cm_rep_error(struct ib_cm_id *cm_id) +{ + printk(KERN_INFO "Received IB REP error for cm_id %p.\n", cm_id); + srpt_drain_channel(cm_id); +} + +/** + * srpt_cm_dreq_recv() - Process reception of a DREQ message. + */ +static void srpt_cm_dreq_recv(struct ib_cm_id *cm_id) +{ + struct srpt_rdma_ch *ch; + unsigned long flags; + bool send_drep = false; + + ch = srpt_find_channel(cm_id->context, cm_id); + BUG_ON(!ch); + + pr_debug("cm_id= %p ch->state= %d\n", cm_id, srpt_get_ch_state(ch)); + + spin_lock_irqsave(&ch->spinlock, flags); + switch (ch->state) { + case CH_CONNECTING: + case CH_LIVE: + send_drep = true; + ch->state = CH_DISCONNECTING; + break; + case CH_DISCONNECTING: + case CH_DRAINING: + case CH_RELEASING: + WARN(true, "unexpected channel state %d\n", ch->state); + break; + } + spin_unlock_irqrestore(&ch->spinlock, flags); + + if (send_drep) { + if (ib_send_cm_drep(ch->cm_id, NULL, 0) < 0) + printk(KERN_ERR "Sending IB DREP failed.\n"); + printk(KERN_INFO "Received DREQ and sent DREP for session %s.\n", + ch->sess_name); + } +} + +/** + * srpt_cm_drep_recv() - Process reception of a DREP message. + */ +static void srpt_cm_drep_recv(struct ib_cm_id *cm_id) +{ + printk(KERN_INFO "Received InfiniBand DREP message for cm_id %p.\n", + cm_id); + srpt_drain_channel(cm_id); +} + +/** + * srpt_cm_handler() - IB connection manager callback function. + * + * A non-zero return value will cause the caller destroy the CM ID. + * + * Note: srpt_cm_handler() must only return a non-zero value when transferring + * ownership of the cm_id to a channel by srpt_cm_req_recv() failed. Returning + * a non-zero value in any other case will trigger a race with the + * ib_destroy_cm_id() call in srpt_release_channel(). + */ +static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + int ret; + + ret = 0; + switch (event->event) { + case IB_CM_REQ_RECEIVED: + ret = srpt_cm_req_recv(cm_id, &event->param.req_rcvd, + event->private_data); + break; + case IB_CM_REJ_RECEIVED: + srpt_cm_rej_recv(cm_id); + break; + case IB_CM_RTU_RECEIVED: + case IB_CM_USER_ESTABLISHED: + srpt_cm_rtu_recv(cm_id); + break; + case IB_CM_DREQ_RECEIVED: + srpt_cm_dreq_recv(cm_id); + break; + case IB_CM_DREP_RECEIVED: + srpt_cm_drep_recv(cm_id); + break; + case IB_CM_TIMEWAIT_EXIT: + srpt_cm_timewait_exit(cm_id); + break; + case IB_CM_REP_ERROR: + srpt_cm_rep_error(cm_id); + break; + case IB_CM_DREQ_ERROR: + printk(KERN_INFO "Received IB DREQ ERROR event.\n"); + break; + case IB_CM_MRA_RECEIVED: + printk(KERN_INFO "Received IB MRA event\n"); + break; + default: + printk(KERN_ERR "received unrecognized IB CM event %d\n", + event->event); + break; + } + + return ret; +} + +/** + * srpt_perform_rdmas() - Perform IB RDMA. + * + * Returns zero upon success or a negative number upon failure. + */ +static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + struct ib_send_wr wr; + struct ib_send_wr *bad_wr; + struct rdma_iu *riu; + int i; + int ret; + int sq_wr_avail; + enum dma_data_direction dir; + const int n_rdma = ioctx->n_rdma; + + dir = ioctx->cmd.data_direction; + if (dir == DMA_TO_DEVICE) { + /* write */ + ret = -ENOMEM; + sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail); + if (sq_wr_avail < 0) { + printk(KERN_WARNING "IB send queue full (needed %d)\n", + n_rdma); + goto out; + } + } + + ioctx->rdma_aborted = false; + ret = 0; + riu = ioctx->rdma_ius; + memset(&wr, 0, sizeof wr); + + for (i = 0; i < n_rdma; ++i, ++riu) { + if (dir == DMA_FROM_DEVICE) { + wr.opcode = IB_WR_RDMA_WRITE; + wr.wr_id = encode_wr_id(i == n_rdma - 1 ? + SRPT_RDMA_WRITE_LAST : + SRPT_RDMA_MID, + ioctx->ioctx.index); + } else { + wr.opcode = IB_WR_RDMA_READ; + wr.wr_id = encode_wr_id(i == n_rdma - 1 ? + SRPT_RDMA_READ_LAST : + SRPT_RDMA_MID, + ioctx->ioctx.index); + } + wr.next = NULL; + wr.wr.rdma.remote_addr = riu->raddr; + wr.wr.rdma.rkey = riu->rkey; + wr.num_sge = riu->sge_cnt; + wr.sg_list = riu->sge; + + /* only get completion event for the last rdma write */ + if (i == (n_rdma - 1) && dir == DMA_TO_DEVICE) + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(ch->qp, &wr, &bad_wr); + if (ret) + break; + } + + if (ret) + printk(KERN_ERR "%s[%d]: ib_post_send() returned %d for %d/%d", + __func__, __LINE__, ret, i, n_rdma); + if (ret && i > 0) { + wr.num_sge = 0; + wr.wr_id = encode_wr_id(SRPT_RDMA_ABORT, ioctx->ioctx.index); + wr.send_flags = IB_SEND_SIGNALED; + while (ch->state == CH_LIVE && + ib_post_send(ch->qp, &wr, &bad_wr) != 0) { + printk(KERN_INFO "Trying to abort failed RDMA transfer [%d]", + ioctx->ioctx.index); + msleep(1000); + } + while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) { + printk(KERN_INFO "Waiting until RDMA abort finished [%d]", + ioctx->ioctx.index); + msleep(1000); + } + } +out: + if (unlikely(dir == DMA_TO_DEVICE && ret < 0)) + atomic_add(n_rdma, &ch->sq_wr_avail); + return ret; +} + +/** + * srpt_xfer_data() - Start data transfer from initiator to target. + */ +static int srpt_xfer_data(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + int ret; + + ret = srpt_map_sg_to_ib_sge(ch, ioctx); + if (ret) { + printk(KERN_ERR "%s[%d] ret=%d\n", __func__, __LINE__, ret); + goto out; + } + + ret = srpt_perform_rdmas(ch, ioctx); + if (ret) { + if (ret == -EAGAIN || ret == -ENOMEM) + printk(KERN_INFO "%s[%d] queue full -- ret=%d\n", + __func__, __LINE__, ret); + else + printk(KERN_ERR "%s[%d] fatal error -- ret=%d\n", + __func__, __LINE__, ret); + goto out_unmap; + } + +out: + return ret; +out_unmap: + srpt_unmap_sg_to_ib_sge(ch, ioctx); + goto out; +} + +static int srpt_write_pending_status(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx; + + ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); + return srpt_get_cmd_state(ioctx) == SRPT_STATE_NEED_DATA; +} + +/* + * srpt_write_pending() - Start data transfer from initiator to target (write). + */ +static int srpt_write_pending(struct se_cmd *se_cmd) +{ + struct srpt_rdma_ch *ch; + struct srpt_send_ioctx *ioctx; + enum srpt_command_state new_state; + enum rdma_ch_state ch_state; + int ret; + + ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); + + new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA); + WARN_ON(new_state == SRPT_STATE_DONE); + + ch = ioctx->ch; + BUG_ON(!ch); + + ch_state = srpt_get_ch_state(ch); + switch (ch_state) { + case CH_CONNECTING: + WARN(true, "unexpected channel state %d\n", ch_state); + ret = -EINVAL; + goto out; + case CH_LIVE: + break; + case CH_DISCONNECTING: + case CH_DRAINING: + case CH_RELEASING: + pr_debug("cmd with tag %lld: channel disconnecting\n", + ioctx->tag); + srpt_set_cmd_state(ioctx, SRPT_STATE_DATA_IN); + ret = -EINVAL; + goto out; + } + ret = srpt_xfer_data(ch, ioctx); + +out: + return ret; +} + +static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) +{ + switch (tcm_mgmt_status) { + case TMR_FUNCTION_COMPLETE: + return SRP_TSK_MGMT_SUCCESS; + case TMR_FUNCTION_REJECTED: + return SRP_TSK_MGMT_FUNC_NOT_SUPP; + } + return SRP_TSK_MGMT_FAILED; +} + +/** + * srpt_queue_response() - Transmits the response to a SCSI command. + * + * Callback function called by the TCM core. Must not block since it can be + * invoked on the context of the IB completion handler. + */ +static void srpt_queue_response(struct se_cmd *cmd) +{ + struct srpt_rdma_ch *ch; + struct srpt_send_ioctx *ioctx; + enum srpt_command_state state; + unsigned long flags; + int ret; + enum dma_data_direction dir; + int resp_len; + u8 srp_tm_status; + + ioctx = container_of(cmd, struct srpt_send_ioctx, cmd); + ch = ioctx->ch; + BUG_ON(!ch); + + spin_lock_irqsave(&ioctx->spinlock, flags); + state = ioctx->state; + switch (state) { + case SRPT_STATE_NEW: + case SRPT_STATE_DATA_IN: + ioctx->state = SRPT_STATE_CMD_RSP_SENT; + break; + case SRPT_STATE_MGMT: + ioctx->state = SRPT_STATE_MGMT_RSP_SENT; + break; + default: + WARN(true, "ch %p; cmd %d: unexpected command state %d\n", + ch, ioctx->ioctx.index, ioctx->state); + break; + } + spin_unlock_irqrestore(&ioctx->spinlock, flags); + + if (unlikely(transport_check_aborted_status(&ioctx->cmd, false) + || WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) { + atomic_inc(&ch->req_lim_delta); + srpt_abort_cmd(ioctx); + return; + } + + dir = ioctx->cmd.data_direction; + + /* For read commands, transfer the data to the initiator. */ + if (dir == DMA_FROM_DEVICE && ioctx->cmd.data_length && + !ioctx->queue_status_only) { + ret = srpt_xfer_data(ch, ioctx); + if (ret) { + printk(KERN_ERR "xfer_data failed for tag %llu\n", + ioctx->tag); + return; + } + } + + if (state != SRPT_STATE_MGMT) + resp_len = srpt_build_cmd_rsp(ch, ioctx, ioctx->tag, + cmd->scsi_status); + else { + srp_tm_status + = tcm_to_srp_tsk_mgmt_status(cmd->se_tmr_req->response); + resp_len = srpt_build_tskmgmt_rsp(ch, ioctx, srp_tm_status, + ioctx->tag); + } + ret = srpt_post_send(ch, ioctx, resp_len); + if (ret) { + printk(KERN_ERR "sending cmd response failed for tag %llu\n", + ioctx->tag); + srpt_unmap_sg_to_ib_sge(ch, ioctx); + srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); + } +} + +static int srpt_queue_data_in(struct se_cmd *cmd) +{ + srpt_queue_response(cmd); + return 0; +} + +static void srpt_queue_tm_rsp(struct se_cmd *cmd) +{ + srpt_queue_response(cmd); +} + +static void srpt_aborted_task(struct se_cmd *cmd) +{ + struct srpt_send_ioctx *ioctx = container_of(cmd, + struct srpt_send_ioctx, cmd); + + srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); +} + +static int srpt_queue_status(struct se_cmd *cmd) +{ + struct srpt_send_ioctx *ioctx; + + ioctx = container_of(cmd, struct srpt_send_ioctx, cmd); + BUG_ON(ioctx->sense_data != cmd->sense_buffer); + if (cmd->se_cmd_flags & + (SCF_TRANSPORT_TASK_SENSE | SCF_EMULATED_TASK_SENSE)) + WARN_ON(cmd->scsi_status != SAM_STAT_CHECK_CONDITION); + ioctx->queue_status_only = true; + srpt_queue_response(cmd); + return 0; +} + +static void srpt_refresh_port_work(struct work_struct *work) +{ + struct srpt_port *sport = container_of(work, struct srpt_port, work); + + srpt_refresh_port(sport); +} + +static int srpt_ch_list_empty(struct srpt_device *sdev) +{ + int res; + + spin_lock_irq(&sdev->spinlock); + res = list_empty(&sdev->rch_list); + spin_unlock_irq(&sdev->spinlock); + + return res; +} + +/** + * srpt_release_sdev() - Free the channel resources associated with a target. + */ +static int srpt_release_sdev(struct srpt_device *sdev) +{ + struct srpt_rdma_ch *ch, *tmp_ch; + int res; + + WARN_ON_ONCE(irqs_disabled()); + + BUG_ON(!sdev); + + spin_lock_irq(&sdev->spinlock); + list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) + __srpt_close_ch(ch); + spin_unlock_irq(&sdev->spinlock); + + res = wait_event_interruptible(sdev->ch_releaseQ, + srpt_ch_list_empty(sdev)); + if (res) + printk(KERN_ERR "%s: interrupted.\n", __func__); + + return 0; +} + +static struct srpt_port *__srpt_lookup_port(const char *name) +{ + struct ib_device *dev; + struct srpt_device *sdev; + struct srpt_port *sport; + int i; + + list_for_each_entry(sdev, &srpt_dev_list, list) { + dev = sdev->device; + if (!dev) + continue; + + for (i = 0; i < dev->phys_port_cnt; i++) { + sport = &sdev->port[i]; + + if (!strcmp(sport->port_guid, name)) + return sport; + } + } + + return NULL; +} + +static struct srpt_port *srpt_lookup_port(const char *name) +{ + struct srpt_port *sport; + + spin_lock(&srpt_dev_lock); + sport = __srpt_lookup_port(name); + spin_unlock(&srpt_dev_lock); + + return sport; +} + +/** + * srpt_add_one() - Infiniband device addition callback function. + */ +static void srpt_add_one(struct ib_device *device) +{ + struct srpt_device *sdev; + struct srpt_port *sport; + struct ib_srq_init_attr srq_attr; + int i; + + pr_debug("device = %p, device->dma_ops = %p\n", device, + device->dma_ops); + + sdev = kzalloc(sizeof *sdev, GFP_KERNEL); + if (!sdev) + goto err; + + sdev->device = device; + INIT_LIST_HEAD(&sdev->rch_list); + init_waitqueue_head(&sdev->ch_releaseQ); + spin_lock_init(&sdev->spinlock); + + if (ib_query_device(device, &sdev->dev_attr)) + goto free_dev; + + sdev->pd = ib_alloc_pd(device); + if (IS_ERR(sdev->pd)) + goto free_dev; + + sdev->mr = ib_get_dma_mr(sdev->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(sdev->mr)) + goto err_pd; + + sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr); + + srq_attr.event_handler = srpt_srq_event; + srq_attr.srq_context = (void *)sdev; + srq_attr.attr.max_wr = sdev->srq_size; + srq_attr.attr.max_sge = 1; + srq_attr.attr.srq_limit = 0; + srq_attr.srq_type = IB_SRQT_BASIC; + + sdev->srq = ib_create_srq(sdev->pd, &srq_attr); + if (IS_ERR(sdev->srq)) + goto err_mr; + + pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n", + __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr, + device->name); + + if (!srpt_service_guid) + srpt_service_guid = be64_to_cpu(device->node_guid); + + sdev->cm_id = ib_create_cm_id(device, srpt_cm_handler, sdev); + if (IS_ERR(sdev->cm_id)) + goto err_srq; + + /* print out target login information */ + pr_debug("Target login info: id_ext=%016llx,ioc_guid=%016llx," + "pkey=ffff,service_id=%016llx\n", srpt_service_guid, + srpt_service_guid, srpt_service_guid); + + /* + * We do not have a consistent service_id (ie. also id_ext of target_id) + * to identify this target. We currently use the guid of the first HCA + * in the system as service_id; therefore, the target_id will change + * if this HCA is gone bad and replaced by different HCA + */ + if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0, NULL)) + goto err_cm; + + INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device, + srpt_event_handler); + if (ib_register_event_handler(&sdev->event_handler)) + goto err_cm; + + sdev->ioctx_ring = (struct srpt_recv_ioctx **) + srpt_alloc_ioctx_ring(sdev, sdev->srq_size, + sizeof(*sdev->ioctx_ring[0]), + srp_max_req_size, DMA_FROM_DEVICE); + if (!sdev->ioctx_ring) + goto err_event; + + for (i = 0; i < sdev->srq_size; ++i) + srpt_post_recv(sdev, sdev->ioctx_ring[i]); + + WARN_ON(sdev->device->phys_port_cnt > ARRAY_SIZE(sdev->port)); + + for (i = 1; i <= sdev->device->phys_port_cnt; i++) { + sport = &sdev->port[i - 1]; + sport->sdev = sdev; + sport->port = i; + sport->port_attrib.srp_max_rdma_size = DEFAULT_MAX_RDMA_SIZE; + sport->port_attrib.srp_max_rsp_size = DEFAULT_MAX_RSP_SIZE; + sport->port_attrib.srp_sq_size = DEF_SRPT_SQ_SIZE; + INIT_WORK(&sport->work, srpt_refresh_port_work); + INIT_LIST_HEAD(&sport->port_acl_list); + spin_lock_init(&sport->port_acl_lock); + + if (srpt_refresh_port(sport)) { + printk(KERN_ERR "MAD registration failed for %s-%d.\n", + srpt_sdev_name(sdev), i); + goto err_ring; + } + snprintf(sport->port_guid, sizeof(sport->port_guid), + "0x%016llx%016llx", + be64_to_cpu(sport->gid.global.subnet_prefix), + be64_to_cpu(sport->gid.global.interface_id)); + } + + spin_lock(&srpt_dev_lock); + list_add_tail(&sdev->list, &srpt_dev_list); + spin_unlock(&srpt_dev_lock); + +out: + ib_set_client_data(device, &srpt_client, sdev); + pr_debug("added %s.\n", device->name); + return; + +err_ring: + srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev, + sdev->srq_size, srp_max_req_size, + DMA_FROM_DEVICE); +err_event: + ib_unregister_event_handler(&sdev->event_handler); +err_cm: + ib_destroy_cm_id(sdev->cm_id); +err_srq: + ib_destroy_srq(sdev->srq); +err_mr: + ib_dereg_mr(sdev->mr); +err_pd: + ib_dealloc_pd(sdev->pd); +free_dev: + kfree(sdev); +err: + sdev = NULL; + printk(KERN_INFO "%s(%s) failed.\n", __func__, device->name); + goto out; +} + +/** + * srpt_remove_one() - InfiniBand device removal callback function. + */ +static void srpt_remove_one(struct ib_device *device) +{ + struct srpt_device *sdev; + int i; + + sdev = ib_get_client_data(device, &srpt_client); + if (!sdev) { + printk(KERN_INFO "%s(%s): nothing to do.\n", __func__, + device->name); + return; + } + + srpt_unregister_mad_agent(sdev); + + ib_unregister_event_handler(&sdev->event_handler); + + /* Cancel any work queued by the just unregistered IB event handler. */ + for (i = 0; i < sdev->device->phys_port_cnt; i++) + cancel_work_sync(&sdev->port[i].work); + + ib_destroy_cm_id(sdev->cm_id); + + /* + * Unregistering a target must happen after destroying sdev->cm_id + * such that no new SRP_LOGIN_REQ information units can arrive while + * destroying the target. + */ + spin_lock(&srpt_dev_lock); + list_del(&sdev->list); + spin_unlock(&srpt_dev_lock); + srpt_release_sdev(sdev); + + ib_destroy_srq(sdev->srq); + ib_dereg_mr(sdev->mr); + ib_dealloc_pd(sdev->pd); + + srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev, + sdev->srq_size, srp_max_req_size, DMA_FROM_DEVICE); + sdev->ioctx_ring = NULL; + kfree(sdev); +} + +static struct ib_client srpt_client = { + .name = DRV_NAME, + .add = srpt_add_one, + .remove = srpt_remove_one +}; + +static int srpt_check_true(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int srpt_check_false(struct se_portal_group *se_tpg) +{ + return 0; +} + +static char *srpt_get_fabric_name(void) +{ + return "srpt"; +} + +static u8 srpt_get_fabric_proto_ident(struct se_portal_group *se_tpg) +{ + return SCSI_TRANSPORTID_PROTOCOLID_SRP; +} + +static char *srpt_get_fabric_wwn(struct se_portal_group *tpg) +{ + struct srpt_port *sport = container_of(tpg, struct srpt_port, port_tpg_1); + + return sport->port_guid; +} + +static u16 srpt_get_tag(struct se_portal_group *tpg) +{ + return 1; +} + +static u32 srpt_get_default_depth(struct se_portal_group *se_tpg) +{ + return 1; +} + +static u32 srpt_get_pr_transport_id(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code, unsigned char *buf) +{ + struct srpt_node_acl *nacl; + struct spc_rdma_transport_id *tr_id; + + nacl = container_of(se_nacl, struct srpt_node_acl, nacl); + tr_id = (void *)buf; + tr_id->protocol_identifier = SCSI_TRANSPORTID_PROTOCOLID_SRP; + memcpy(tr_id->i_port_id, nacl->i_port_id, sizeof(tr_id->i_port_id)); + return sizeof(*tr_id); +} + +static u32 srpt_get_pr_transport_id_len(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code) +{ + *format_code = 0; + return sizeof(struct spc_rdma_transport_id); +} + +static char *srpt_parse_pr_out_transport_id(struct se_portal_group *se_tpg, + const char *buf, u32 *out_tid_len, + char **port_nexus_ptr) +{ + struct spc_rdma_transport_id *tr_id; + + *port_nexus_ptr = NULL; + *out_tid_len = sizeof(struct spc_rdma_transport_id); + tr_id = (void *)buf; + return (char *)tr_id->i_port_id; +} + +static struct se_node_acl *srpt_alloc_fabric_acl(struct se_portal_group *se_tpg) +{ + struct srpt_node_acl *nacl; + + nacl = kzalloc(sizeof(struct srpt_node_acl), GFP_KERNEL); + if (!nacl) { + printk(KERN_ERR "Unable to allocate struct srpt_node_acl\n"); + return NULL; + } + + return &nacl->nacl; +} + +static void srpt_release_fabric_acl(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl) +{ + struct srpt_node_acl *nacl; + + nacl = container_of(se_nacl, struct srpt_node_acl, nacl); + kfree(nacl); +} + +static u32 srpt_tpg_get_inst_index(struct se_portal_group *se_tpg) +{ + return 1; +} + +static void srpt_release_cmd(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx = container_of(se_cmd, + struct srpt_send_ioctx, cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + unsigned long flags; + + WARN_ON(ioctx->state != SRPT_STATE_DONE); + WARN_ON(ioctx->mapped_sg_count != 0); + + if (ioctx->n_rbuf > 1) { + kfree(ioctx->rbufs); + ioctx->rbufs = NULL; + ioctx->n_rbuf = 0; + } + + spin_lock_irqsave(&ch->spinlock, flags); + list_add(&ioctx->free_list, &ch->free_list); + spin_unlock_irqrestore(&ch->spinlock, flags); +} + +/** + * srpt_close_session() - Forcibly close a session. + * + * Callback function invoked by the TCM core to clean up sessions associated + * with a node ACL when the user invokes + * rmdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + */ +static void srpt_close_session(struct se_session *se_sess) +{ + DECLARE_COMPLETION_ONSTACK(release_done); + struct srpt_rdma_ch *ch; + struct srpt_device *sdev; + int res; + + ch = se_sess->fabric_sess_ptr; + WARN_ON(ch->sess != se_sess); + + pr_debug("ch %p state %d\n", ch, srpt_get_ch_state(ch)); + + sdev = ch->sport->sdev; + spin_lock_irq(&sdev->spinlock); + BUG_ON(ch->release_done); + ch->release_done = &release_done; + __srpt_close_ch(ch); + spin_unlock_irq(&sdev->spinlock); + + res = wait_for_completion_timeout(&release_done, 60 * HZ); + WARN_ON(res <= 0); +} + +/** + * srpt_sess_get_index() - Return the value of scsiAttIntrPortIndex (SCSI-MIB). + * + * A quote from RFC 4455 (SCSI-MIB) about this MIB object: + * This object represents an arbitrary integer used to uniquely identify a + * particular attached remote initiator port to a particular SCSI target port + * within a particular SCSI target device within a particular SCSI instance. + */ +static u32 srpt_sess_get_index(struct se_session *se_sess) +{ + return 0; +} + +static void srpt_set_default_node_attrs(struct se_node_acl *nacl) +{ +} + +static u32 srpt_get_task_tag(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx; + + ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); + return ioctx->tag; +} + +/* Note: only used from inside debug printk's by the TCM core. */ +static int srpt_get_tcm_cmd_state(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx; + + ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); + return srpt_get_cmd_state(ioctx); +} + +/** + * srpt_parse_i_port_id() - Parse an initiator port ID. + * @name: ASCII representation of a 128-bit initiator port ID. + * @i_port_id: Binary 128-bit port ID. + */ +static int srpt_parse_i_port_id(u8 i_port_id[16], const char *name) +{ + const char *p; + unsigned len, count, leading_zero_bytes; + int ret, rc; + + p = name; + if (strnicmp(p, "0x", 2) == 0) + p += 2; + ret = -EINVAL; + len = strlen(p); + if (len % 2) + goto out; + count = min(len / 2, 16U); + leading_zero_bytes = 16 - count; + memset(i_port_id, 0, leading_zero_bytes); + rc = hex2bin(i_port_id + leading_zero_bytes, p, count); + if (rc < 0) + pr_debug("hex2bin failed for srpt_parse_i_port_id: %d\n", rc); + ret = 0; +out: + return ret; +} + +/* + * configfs callback function invoked for + * mkdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + */ +static struct se_node_acl *srpt_make_nodeacl(struct se_portal_group *tpg, + struct config_group *group, + const char *name) +{ + struct srpt_port *sport = container_of(tpg, struct srpt_port, port_tpg_1); + struct se_node_acl *se_nacl, *se_nacl_new; + struct srpt_node_acl *nacl; + int ret = 0; + u32 nexus_depth = 1; + u8 i_port_id[16]; + + if (srpt_parse_i_port_id(i_port_id, name) < 0) { + printk(KERN_ERR "invalid initiator port ID %s\n", name); + ret = -EINVAL; + goto err; + } + + se_nacl_new = srpt_alloc_fabric_acl(tpg); + if (!se_nacl_new) { + ret = -ENOMEM; + goto err; + } + /* + * nacl_new may be released by core_tpg_add_initiator_node_acl() + * when converting a node ACL from demo mode to explict + */ + se_nacl = core_tpg_add_initiator_node_acl(tpg, se_nacl_new, name, + nexus_depth); + if (IS_ERR(se_nacl)) { + ret = PTR_ERR(se_nacl); + goto err; + } + /* Locate our struct srpt_node_acl and set sdev and i_port_id. */ + nacl = container_of(se_nacl, struct srpt_node_acl, nacl); + memcpy(&nacl->i_port_id[0], &i_port_id[0], 16); + nacl->sport = sport; + + spin_lock_irq(&sport->port_acl_lock); + list_add_tail(&nacl->list, &sport->port_acl_list); + spin_unlock_irq(&sport->port_acl_lock); + + return se_nacl; +err: + return ERR_PTR(ret); +} + +/* + * configfs callback function invoked for + * rmdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + */ +static void srpt_drop_nodeacl(struct se_node_acl *se_nacl) +{ + struct srpt_node_acl *nacl; + struct srpt_device *sdev; + struct srpt_port *sport; + + nacl = container_of(se_nacl, struct srpt_node_acl, nacl); + sport = nacl->sport; + sdev = sport->sdev; + spin_lock_irq(&sport->port_acl_lock); + list_del(&nacl->list); + spin_unlock_irq(&sport->port_acl_lock); + core_tpg_del_initiator_node_acl(&sport->port_tpg_1, se_nacl, 1); + srpt_release_fabric_acl(NULL, se_nacl); +} + +static ssize_t srpt_tpg_attrib_show_srp_max_rdma_size( + struct se_portal_group *se_tpg, + char *page) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + + return sprintf(page, "%u\n", sport->port_attrib.srp_max_rdma_size); +} + +static ssize_t srpt_tpg_attrib_store_srp_max_rdma_size( + struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + unsigned long val; + int ret; + + ret = kstrtoul(page, 0, &val); + if (ret < 0) { + pr_err("kstrtoul() failed with ret: %d\n", ret); + return -EINVAL; + } + if (val > MAX_SRPT_RDMA_SIZE) { + pr_err("val: %lu exceeds MAX_SRPT_RDMA_SIZE: %d\n", val, + MAX_SRPT_RDMA_SIZE); + return -EINVAL; + } + if (val < DEFAULT_MAX_RDMA_SIZE) { + pr_err("val: %lu smaller than DEFAULT_MAX_RDMA_SIZE: %d\n", + val, DEFAULT_MAX_RDMA_SIZE); + return -EINVAL; + } + sport->port_attrib.srp_max_rdma_size = val; + + return count; +} + +TF_TPG_ATTRIB_ATTR(srpt, srp_max_rdma_size, S_IRUGO | S_IWUSR); + +static ssize_t srpt_tpg_attrib_show_srp_max_rsp_size( + struct se_portal_group *se_tpg, + char *page) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + + return sprintf(page, "%u\n", sport->port_attrib.srp_max_rsp_size); +} + +static ssize_t srpt_tpg_attrib_store_srp_max_rsp_size( + struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + unsigned long val; + int ret; + + ret = kstrtoul(page, 0, &val); + if (ret < 0) { + pr_err("kstrtoul() failed with ret: %d\n", ret); + return -EINVAL; + } + if (val > MAX_SRPT_RSP_SIZE) { + pr_err("val: %lu exceeds MAX_SRPT_RSP_SIZE: %d\n", val, + MAX_SRPT_RSP_SIZE); + return -EINVAL; + } + if (val < MIN_MAX_RSP_SIZE) { + pr_err("val: %lu smaller than MIN_MAX_RSP_SIZE: %d\n", val, + MIN_MAX_RSP_SIZE); + return -EINVAL; + } + sport->port_attrib.srp_max_rsp_size = val; + + return count; +} + +TF_TPG_ATTRIB_ATTR(srpt, srp_max_rsp_size, S_IRUGO | S_IWUSR); + +static ssize_t srpt_tpg_attrib_show_srp_sq_size( + struct se_portal_group *se_tpg, + char *page) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + + return sprintf(page, "%u\n", sport->port_attrib.srp_sq_size); +} + +static ssize_t srpt_tpg_attrib_store_srp_sq_size( + struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + unsigned long val; + int ret; + + ret = kstrtoul(page, 0, &val); + if (ret < 0) { + pr_err("kstrtoul() failed with ret: %d\n", ret); + return -EINVAL; + } + if (val > MAX_SRPT_SRQ_SIZE) { + pr_err("val: %lu exceeds MAX_SRPT_SRQ_SIZE: %d\n", val, + MAX_SRPT_SRQ_SIZE); + return -EINVAL; + } + if (val < MIN_SRPT_SRQ_SIZE) { + pr_err("val: %lu smaller than MIN_SRPT_SRQ_SIZE: %d\n", val, + MIN_SRPT_SRQ_SIZE); + return -EINVAL; + } + sport->port_attrib.srp_sq_size = val; + + return count; +} + +TF_TPG_ATTRIB_ATTR(srpt, srp_sq_size, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *srpt_tpg_attrib_attrs[] = { + &srpt_tpg_attrib_srp_max_rdma_size.attr, + &srpt_tpg_attrib_srp_max_rsp_size.attr, + &srpt_tpg_attrib_srp_sq_size.attr, + NULL, +}; + +static ssize_t srpt_tpg_show_enable( + struct se_portal_group *se_tpg, + char *page) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + + return snprintf(page, PAGE_SIZE, "%d\n", (sport->enabled) ? 1: 0); +} + +static ssize_t srpt_tpg_store_enable( + struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + unsigned long tmp; + int ret; + + ret = kstrtoul(page, 0, &tmp); + if (ret < 0) { + printk(KERN_ERR "Unable to extract srpt_tpg_store_enable\n"); + return -EINVAL; + } + + if ((tmp != 0) && (tmp != 1)) { + printk(KERN_ERR "Illegal value for srpt_tpg_store_enable: %lu\n", tmp); + return -EINVAL; + } + if (tmp == 1) + sport->enabled = true; + else + sport->enabled = false; + + return count; +} + +TF_TPG_BASE_ATTR(srpt, enable, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *srpt_tpg_attrs[] = { + &srpt_tpg_enable.attr, + NULL, +}; + +/** + * configfs callback invoked for + * mkdir /sys/kernel/config/target/$driver/$port/$tpg + */ +static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn, + struct config_group *group, + const char *name) +{ + struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn); + int res; + + /* Initialize sport->port_wwn and sport->port_tpg_1 */ + res = core_tpg_register(&srpt_target->tf_ops, &sport->port_wwn, + &sport->port_tpg_1, sport, TRANSPORT_TPG_TYPE_NORMAL); + if (res) + return ERR_PTR(res); + + return &sport->port_tpg_1; +} + +/** + * configfs callback invoked for + * rmdir /sys/kernel/config/target/$driver/$port/$tpg + */ +static void srpt_drop_tpg(struct se_portal_group *tpg) +{ + struct srpt_port *sport = container_of(tpg, + struct srpt_port, port_tpg_1); + + sport->enabled = false; + core_tpg_deregister(&sport->port_tpg_1); +} + +/** + * configfs callback invoked for + * mkdir /sys/kernel/config/target/$driver/$port + */ +static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf, + struct config_group *group, + const char *name) +{ + struct srpt_port *sport; + int ret; + + sport = srpt_lookup_port(name); + pr_debug("make_tport(%s)\n", name); + ret = -EINVAL; + if (!sport) + goto err; + + return &sport->port_wwn; + +err: + return ERR_PTR(ret); +} + +/** + * configfs callback invoked for + * rmdir /sys/kernel/config/target/$driver/$port + */ +static void srpt_drop_tport(struct se_wwn *wwn) +{ + struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn); + + pr_debug("drop_tport(%s\n", config_item_name(&sport->port_wwn.wwn_group.cg_item)); +} + +static ssize_t srpt_wwn_show_attr_version(struct target_fabric_configfs *tf, + char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%s\n", DRV_VERSION); +} + +TF_WWN_ATTR_RO(srpt, version); + +static struct configfs_attribute *srpt_wwn_attrs[] = { + &srpt_wwn_version.attr, + NULL, +}; + +static struct target_core_fabric_ops srpt_template = { + .get_fabric_name = srpt_get_fabric_name, + .get_fabric_proto_ident = srpt_get_fabric_proto_ident, + .tpg_get_wwn = srpt_get_fabric_wwn, + .tpg_get_tag = srpt_get_tag, + .tpg_get_default_depth = srpt_get_default_depth, + .tpg_get_pr_transport_id = srpt_get_pr_transport_id, + .tpg_get_pr_transport_id_len = srpt_get_pr_transport_id_len, + .tpg_parse_pr_out_transport_id = srpt_parse_pr_out_transport_id, + .tpg_check_demo_mode = srpt_check_false, + .tpg_check_demo_mode_cache = srpt_check_true, + .tpg_check_demo_mode_write_protect = srpt_check_true, + .tpg_check_prod_mode_write_protect = srpt_check_false, + .tpg_alloc_fabric_acl = srpt_alloc_fabric_acl, + .tpg_release_fabric_acl = srpt_release_fabric_acl, + .tpg_get_inst_index = srpt_tpg_get_inst_index, + .release_cmd = srpt_release_cmd, + .check_stop_free = srpt_check_stop_free, + .shutdown_session = srpt_shutdown_session, + .close_session = srpt_close_session, + .sess_get_index = srpt_sess_get_index, + .sess_get_initiator_sid = NULL, + .write_pending = srpt_write_pending, + .write_pending_status = srpt_write_pending_status, + .set_default_node_attributes = srpt_set_default_node_attrs, + .get_task_tag = srpt_get_task_tag, + .get_cmd_state = srpt_get_tcm_cmd_state, + .queue_data_in = srpt_queue_data_in, + .queue_status = srpt_queue_status, + .queue_tm_rsp = srpt_queue_tm_rsp, + .aborted_task = srpt_aborted_task, + /* + * Setup function pointers for generic logic in + * target_core_fabric_configfs.c + */ + .fabric_make_wwn = srpt_make_tport, + .fabric_drop_wwn = srpt_drop_tport, + .fabric_make_tpg = srpt_make_tpg, + .fabric_drop_tpg = srpt_drop_tpg, + .fabric_post_link = NULL, + .fabric_pre_unlink = NULL, + .fabric_make_np = NULL, + .fabric_drop_np = NULL, + .fabric_make_nodeacl = srpt_make_nodeacl, + .fabric_drop_nodeacl = srpt_drop_nodeacl, +}; + +/** + * srpt_init_module() - Kernel module initialization. + * + * Note: Since ib_register_client() registers callback functions, and since at + * least one of these callback functions (srpt_add_one()) calls target core + * functions, this driver must be registered with the target core before + * ib_register_client() is called. + */ +static int __init srpt_init_module(void) +{ + int ret; + + ret = -EINVAL; + if (srp_max_req_size < MIN_MAX_REQ_SIZE) { + printk(KERN_ERR "invalid value %d for kernel module parameter" + " srp_max_req_size -- must be at least %d.\n", + srp_max_req_size, MIN_MAX_REQ_SIZE); + goto out; + } + + if (srpt_srq_size < MIN_SRPT_SRQ_SIZE + || srpt_srq_size > MAX_SRPT_SRQ_SIZE) { + printk(KERN_ERR "invalid value %d for kernel module parameter" + " srpt_srq_size -- must be in the range [%d..%d].\n", + srpt_srq_size, MIN_SRPT_SRQ_SIZE, MAX_SRPT_SRQ_SIZE); + goto out; + } + + srpt_target = target_fabric_configfs_init(THIS_MODULE, "srpt"); + if (IS_ERR(srpt_target)) { + printk(KERN_ERR "couldn't register\n"); + ret = PTR_ERR(srpt_target); + goto out; + } + + srpt_target->tf_ops = srpt_template; + + /* + * Set up default attribute lists. + */ + srpt_target->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = srpt_wwn_attrs; + srpt_target->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = srpt_tpg_attrs; + srpt_target->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = srpt_tpg_attrib_attrs; + srpt_target->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL; + srpt_target->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; + srpt_target->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; + srpt_target->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; + srpt_target->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; + srpt_target->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; + + ret = target_fabric_configfs_register(srpt_target); + if (ret < 0) { + printk(KERN_ERR "couldn't register\n"); + goto out_free_target; + } + + ret = ib_register_client(&srpt_client); + if (ret) { + printk(KERN_ERR "couldn't register IB client\n"); + goto out_unregister_target; + } + + return 0; + +out_unregister_target: + target_fabric_configfs_deregister(srpt_target); + srpt_target = NULL; +out_free_target: + if (srpt_target) + target_fabric_configfs_free(srpt_target); +out: + return ret; +} + +static void __exit srpt_cleanup_module(void) +{ + ib_unregister_client(&srpt_client); + target_fabric_configfs_deregister(srpt_target); + srpt_target = NULL; +} + +module_init(srpt_init_module); +module_exit(srpt_cleanup_module); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h new file mode 100644 index 00000000000..3dae156905d --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved. + * Copyright (C) 2009 - 2010 Bart Van Assche <bvanassche@acm.org>. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef IB_SRPT_H +#define IB_SRPT_H + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/wait.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_sa.h> +#include <rdma/ib_cm.h> + +#include <scsi/srp.h> + +#include "ib_dm_mad.h" + +/* + * The prefix the ServiceName field must start with in the device management + * ServiceEntries attribute pair. See also the SRP specification. + */ +#define SRP_SERVICE_NAME_PREFIX "SRP.T10:" + +enum { + /* + * SRP IOControllerProfile attributes for SRP target ports that have + * not been defined in <scsi/srp.h>. Source: section B.7, table B.7 + * in the SRP specification. + */ + SRP_PROTOCOL = 0x0108, + SRP_PROTOCOL_VERSION = 0x0001, + SRP_IO_SUBCLASS = 0x609e, + SRP_SEND_TO_IOC = 0x01, + SRP_SEND_FROM_IOC = 0x02, + SRP_RDMA_READ_FROM_IOC = 0x08, + SRP_RDMA_WRITE_FROM_IOC = 0x20, + + /* + * srp_login_cmd.req_flags bitmasks. See also table 9 in the SRP + * specification. + */ + SRP_MTCH_ACTION = 0x03, /* MULTI-CHANNEL ACTION */ + SRP_LOSOLNT = 0x10, /* logout solicited notification */ + SRP_CRSOLNT = 0x20, /* credit request solicited notification */ + SRP_AESOLNT = 0x40, /* asynchronous event solicited notification */ + + /* + * srp_cmd.sol_nt / srp_tsk_mgmt.sol_not bitmasks. See also tables + * 18 and 20 in the SRP specification. + */ + SRP_SCSOLNT = 0x02, /* SCSOLNT = successful solicited notification */ + SRP_UCSOLNT = 0x04, /* UCSOLNT = unsuccessful solicited notification */ + + /* + * srp_rsp.sol_not / srp_t_logout.sol_not bitmasks. See also tables + * 16 and 22 in the SRP specification. + */ + SRP_SOLNT = 0x01, /* SOLNT = solicited notification */ + + /* See also table 24 in the SRP specification. */ + SRP_TSK_MGMT_SUCCESS = 0x00, + SRP_TSK_MGMT_FUNC_NOT_SUPP = 0x04, + SRP_TSK_MGMT_FAILED = 0x05, + + /* See also table 21 in the SRP specification. */ + SRP_CMD_SIMPLE_Q = 0x0, + SRP_CMD_HEAD_OF_Q = 0x1, + SRP_CMD_ORDERED_Q = 0x2, + SRP_CMD_ACA = 0x4, + + SRP_LOGIN_RSP_MULTICHAN_NO_CHAN = 0x0, + SRP_LOGIN_RSP_MULTICHAN_TERMINATED = 0x1, + SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2, + + SRPT_DEF_SG_TABLESIZE = 128, + SRPT_DEF_SG_PER_WQE = 16, + + MIN_SRPT_SQ_SIZE = 16, + DEF_SRPT_SQ_SIZE = 4096, + SRPT_RQ_SIZE = 128, + MIN_SRPT_SRQ_SIZE = 4, + DEFAULT_SRPT_SRQ_SIZE = 4095, + MAX_SRPT_SRQ_SIZE = 65535, + MAX_SRPT_RDMA_SIZE = 1U << 24, + MAX_SRPT_RSP_SIZE = 1024, + + MIN_MAX_REQ_SIZE = 996, + DEFAULT_MAX_REQ_SIZE + = sizeof(struct srp_cmd)/*48*/ + + sizeof(struct srp_indirect_buf)/*20*/ + + 128 * sizeof(struct srp_direct_buf)/*16*/, + + MIN_MAX_RSP_SIZE = sizeof(struct srp_rsp)/*36*/ + 4, + DEFAULT_MAX_RSP_SIZE = 256, /* leaves 220 bytes for sense data */ + + DEFAULT_MAX_RDMA_SIZE = 65536, +}; + +enum srpt_opcode { + SRPT_RECV, + SRPT_SEND, + SRPT_RDMA_MID, + SRPT_RDMA_ABORT, + SRPT_RDMA_READ_LAST, + SRPT_RDMA_WRITE_LAST, +}; + +static inline u64 encode_wr_id(u8 opcode, u32 idx) +{ + return ((u64)opcode << 32) | idx; +} +static inline enum srpt_opcode opcode_from_wr_id(u64 wr_id) +{ + return wr_id >> 32; +} +static inline u32 idx_from_wr_id(u64 wr_id) +{ + return (u32)wr_id; +} + +struct rdma_iu { + u64 raddr; + u32 rkey; + struct ib_sge *sge; + u32 sge_cnt; + int mem_id; +}; + +/** + * enum srpt_command_state - SCSI command state managed by SRPT. + * @SRPT_STATE_NEW: New command arrived and is being processed. + * @SRPT_STATE_NEED_DATA: Processing a write or bidir command and waiting + * for data arrival. + * @SRPT_STATE_DATA_IN: Data for the write or bidir command arrived and is + * being processed. + * @SRPT_STATE_CMD_RSP_SENT: SRP_RSP for SRP_CMD has been sent. + * @SRPT_STATE_MGMT: Processing a SCSI task management command. + * @SRPT_STATE_MGMT_RSP_SENT: SRP_RSP for SRP_TSK_MGMT has been sent. + * @SRPT_STATE_DONE: Command processing finished successfully, command + * processing has been aborted or command processing + * failed. + */ +enum srpt_command_state { + SRPT_STATE_NEW = 0, + SRPT_STATE_NEED_DATA = 1, + SRPT_STATE_DATA_IN = 2, + SRPT_STATE_CMD_RSP_SENT = 3, + SRPT_STATE_MGMT = 4, + SRPT_STATE_MGMT_RSP_SENT = 5, + SRPT_STATE_DONE = 6, +}; + +/** + * struct srpt_ioctx - Shared SRPT I/O context information. + * @buf: Pointer to the buffer. + * @dma: DMA address of the buffer. + * @index: Index of the I/O context in its ioctx_ring array. + */ +struct srpt_ioctx { + void *buf; + dma_addr_t dma; + uint32_t index; +}; + +/** + * struct srpt_recv_ioctx - SRPT receive I/O context. + * @ioctx: See above. + * @wait_list: Node for insertion in srpt_rdma_ch.cmd_wait_list. + */ +struct srpt_recv_ioctx { + struct srpt_ioctx ioctx; + struct list_head wait_list; +}; + +/** + * struct srpt_send_ioctx - SRPT send I/O context. + * @ioctx: See above. + * @ch: Channel pointer. + * @free_list: Node in srpt_rdma_ch.free_list. + * @n_rbuf: Number of data buffers in the received SRP command. + * @rbufs: Pointer to SRP data buffer array. + * @single_rbuf: SRP data buffer if the command has only a single buffer. + * @sg: Pointer to sg-list associated with this I/O context. + * @sg_cnt: SG-list size. + * @mapped_sg_count: ib_dma_map_sg() return value. + * @n_rdma_ius: Number of elements in the rdma_ius array. + * @rdma_ius: Array with information about the RDMA mapping. + * @tag: Tag of the received SRP information unit. + * @spinlock: Protects 'state'. + * @state: I/O context state. + * @rdma_aborted: If initiating a multipart RDMA transfer failed, whether + * the already initiated transfers have finished. + * @cmd: Target core command data structure. + * @sense_data: SCSI sense data. + */ +struct srpt_send_ioctx { + struct srpt_ioctx ioctx; + struct srpt_rdma_ch *ch; + struct rdma_iu *rdma_ius; + struct srp_direct_buf *rbufs; + struct srp_direct_buf single_rbuf; + struct scatterlist *sg; + struct list_head free_list; + spinlock_t spinlock; + enum srpt_command_state state; + bool rdma_aborted; + struct se_cmd cmd; + struct completion tx_done; + u64 tag; + int sg_cnt; + int mapped_sg_count; + u16 n_rdma_ius; + u8 n_rdma; + u8 n_rbuf; + bool queue_status_only; + u8 sense_data[SCSI_SENSE_BUFFERSIZE]; +}; + +/** + * enum rdma_ch_state - SRP channel state. + * @CH_CONNECTING: QP is in RTR state; waiting for RTU. + * @CH_LIVE: QP is in RTS state. + * @CH_DISCONNECTING: DREQ has been received; waiting for DREP + * or DREQ has been send and waiting for DREP + * or . + * @CH_DRAINING: QP is in ERR state; waiting for last WQE event. + * @CH_RELEASING: Last WQE event has been received; releasing resources. + */ +enum rdma_ch_state { + CH_CONNECTING, + CH_LIVE, + CH_DISCONNECTING, + CH_DRAINING, + CH_RELEASING +}; + +/** + * struct srpt_rdma_ch - RDMA channel. + * @wait_queue: Allows the kernel thread to wait for more work. + * @thread: Kernel thread that processes the IB queues associated with + * the channel. + * @cm_id: IB CM ID associated with the channel. + * @qp: IB queue pair used for communicating over this channel. + * @cq: IB completion queue for this channel. + * @rq_size: IB receive queue size. + * @rsp_size IB response message size in bytes. + * @sq_wr_avail: number of work requests available in the send queue. + * @sport: pointer to the information of the HCA port used by this + * channel. + * @i_port_id: 128-bit initiator port identifier copied from SRP_LOGIN_REQ. + * @t_port_id: 128-bit target port identifier copied from SRP_LOGIN_REQ. + * @max_ti_iu_len: maximum target-to-initiator information unit length. + * @req_lim: request limit: maximum number of requests that may be sent + * by the initiator without having received a response. + * @req_lim_delta: Number of credits not yet sent back to the initiator. + * @spinlock: Protects free_list and state. + * @free_list: Head of list with free send I/O contexts. + * @state: channel state. See also enum rdma_ch_state. + * @ioctx_ring: Send ring. + * @wc: IB work completion array for srpt_process_completion(). + * @list: Node for insertion in the srpt_device.rch_list list. + * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This + * list contains struct srpt_ioctx elements and is protected + * against concurrent modification by the cm_id spinlock. + * @sess: Session information associated with this SRP channel. + * @sess_name: Session name. + * @release_work: Allows scheduling of srpt_release_channel(). + * @release_done: Enables waiting for srpt_release_channel() completion. + */ +struct srpt_rdma_ch { + wait_queue_head_t wait_queue; + struct task_struct *thread; + struct ib_cm_id *cm_id; + struct ib_qp *qp; + struct ib_cq *cq; + int rq_size; + u32 rsp_size; + atomic_t sq_wr_avail; + struct srpt_port *sport; + u8 i_port_id[16]; + u8 t_port_id[16]; + int max_ti_iu_len; + atomic_t req_lim; + atomic_t req_lim_delta; + spinlock_t spinlock; + struct list_head free_list; + enum rdma_ch_state state; + struct srpt_send_ioctx **ioctx_ring; + struct ib_wc wc[16]; + struct list_head list; + struct list_head cmd_wait_list; + struct se_session *sess; + u8 sess_name[36]; + struct work_struct release_work; + struct completion *release_done; + bool in_shutdown; +}; + +/** + * struct srpt_port_attib - Attributes for SRPT port + * @srp_max_rdma_size: Maximum size of SRP RDMA transfers for new connections. + * @srp_max_rsp_size: Maximum size of SRP response messages in bytes. + * @srp_sq_size: Shared receive queue (SRQ) size. + */ +struct srpt_port_attrib { + u32 srp_max_rdma_size; + u32 srp_max_rsp_size; + u32 srp_sq_size; +}; + +/** + * struct srpt_port - Information associated by SRPT with a single IB port. + * @sdev: backpointer to the HCA information. + * @mad_agent: per-port management datagram processing information. + * @enabled: Whether or not this target port is enabled. + * @port_guid: ASCII representation of Port GUID + * @port: one-based port number. + * @sm_lid: cached value of the port's sm_lid. + * @lid: cached value of the port's lid. + * @gid: cached value of the port's gid. + * @port_acl_lock spinlock for port_acl_list: + * @work: work structure for refreshing the aforementioned cached values. + * @port_tpg_1 Target portal group = 1 data. + * @port_wwn: Target core WWN data. + * @port_acl_list: Head of the list with all node ACLs for this port. + */ +struct srpt_port { + struct srpt_device *sdev; + struct ib_mad_agent *mad_agent; + bool enabled; + u8 port_guid[64]; + u8 port; + u16 sm_lid; + u16 lid; + union ib_gid gid; + spinlock_t port_acl_lock; + struct work_struct work; + struct se_portal_group port_tpg_1; + struct se_wwn port_wwn; + struct list_head port_acl_list; + struct srpt_port_attrib port_attrib; +}; + +/** + * struct srpt_device - Information associated by SRPT with a single HCA. + * @device: Backpointer to the struct ib_device managed by the IB core. + * @pd: IB protection domain. + * @mr: L_Key (local key) with write access to all local memory. + * @srq: Per-HCA SRQ (shared receive queue). + * @cm_id: Connection identifier. + * @dev_attr: Attributes of the InfiniBand device as obtained during the + * ib_client.add() callback. + * @srq_size: SRQ size. + * @ioctx_ring: Per-HCA SRQ. + * @rch_list: Per-device channel list -- see also srpt_rdma_ch.list. + * @ch_releaseQ: Enables waiting for removal from rch_list. + * @spinlock: Protects rch_list and tpg. + * @port: Information about the ports owned by this HCA. + * @event_handler: Per-HCA asynchronous IB event handler. + * @list: Node in srpt_dev_list. + */ +struct srpt_device { + struct ib_device *device; + struct ib_pd *pd; + struct ib_mr *mr; + struct ib_srq *srq; + struct ib_cm_id *cm_id; + struct ib_device_attr dev_attr; + int srq_size; + struct srpt_recv_ioctx **ioctx_ring; + struct list_head rch_list; + wait_queue_head_t ch_releaseQ; + spinlock_t spinlock; + struct srpt_port port[2]; + struct ib_event_handler event_handler; + struct list_head list; +}; + +/** + * struct srpt_node_acl - Per-initiator ACL data (managed via configfs). + * @i_port_id: 128-bit SRP initiator port ID. + * @sport: port information. + * @nacl: Target core node ACL information. + * @list: Element of the per-HCA ACL list. + */ +struct srpt_node_acl { + u8 i_port_id[16]; + struct srpt_port *sport; + struct se_node_acl nacl; + struct list_head list; +}; + +/* + * SRP-releated SCSI persistent reservation definitions. + * + * See also SPC4r28, section 7.6.1 (Protocol specific parameters introduction). + * See also SPC4r28, section 7.6.4.5 (TransportID for initiator ports using + * SCSI over an RDMA interface). + */ + +enum { + SCSI_TRANSPORTID_PROTOCOLID_SRP = 4, +}; + +struct spc_rdma_transport_id { + uint8_t protocol_identifier; + uint8_t reserved[7]; + uint8_t i_port_id[16]; +}; + +#endif /* IB_SRPT_H */ |
