aboutsummaryrefslogtreecommitdiff
path: root/drivers/infiniband/ulp/ipoib/ipoib_ib.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/ulp/ipoib/ipoib_ib.c')
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c240
1 files changed, 163 insertions, 77 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 66cafa20c24..6a7003ddb0b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -34,9 +34,10 @@
*/
#include <linux/delay.h>
+#include <linux/moduleparam.h>
#include <linux/dma-mapping.h>
+#include <linux/slab.h>
-#include <rdma/ib_cache.h>
#include <linux/ip.h>
#include <linux/tcp.h>
@@ -56,21 +57,24 @@ struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
struct ib_pd *pd, struct ib_ah_attr *attr)
{
struct ipoib_ah *ah;
+ struct ib_ah *vah;
ah = kmalloc(sizeof *ah, GFP_KERNEL);
if (!ah)
- return NULL;
+ return ERR_PTR(-ENOMEM);
ah->dev = dev;
ah->last_send = 0;
kref_init(&ah->ref);
- ah->ah = ib_create_ah(pd, attr);
- if (IS_ERR(ah->ah)) {
+ vah = ib_create_ah(pd, attr);
+ if (IS_ERR(vah)) {
kfree(ah);
- ah = NULL;
- } else
+ ah = (struct ipoib_ah *)vah;
+ } else {
+ ah->ah = vah;
ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah);
+ }
return ah;
}
@@ -117,9 +121,9 @@ static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
size = length - IPOIB_UD_HEAD_SIZE;
- frag->size = size;
+ skb_frag_size_set(frag, size);
skb->data_len += size;
- skb->truesize += size;
+ skb->truesize += PAGE_SIZE;
} else
skb_put(skb, length);
@@ -152,14 +156,18 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct sk_buff *skb;
int buf_size;
+ int tailroom;
u64 *mapping;
- if (ipoib_ud_need_sg(priv->max_ib_mtu))
+ if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
buf_size = IPOIB_UD_HEAD_SIZE;
- else
+ tailroom = 128; /* reserve some tailroom for IP/TCP headers */
+ } else {
buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+ tailroom = 0;
+ }
- skb = dev_alloc_skb(buf_size + 4);
+ skb = dev_alloc_skb(buf_size + tailroom + 4);
if (unlikely(!skb))
return NULL;
@@ -182,7 +190,7 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
goto partial_error;
skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
mapping[1] =
- ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[0].page,
+ ib_dma_map_page(priv->ca, page,
0, PAGE_SIZE, DMA_FROM_DEVICE);
if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
goto partial_error;
@@ -223,6 +231,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
struct sk_buff *skb;
u64 mapping[IPOIB_UD_RX_SG];
+ union ib_gid *dgid;
ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
wr_id, wc->status);
@@ -271,27 +280,31 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
ipoib_ud_dma_unmap_rx(priv, mapping);
ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
+ /* First byte of dgid signals multicast when 0xff */
+ dgid = &((struct ib_grh *)skb->data)->dgid;
+
+ if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff)
+ skb->pkt_type = PACKET_HOST;
+ else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0)
+ skb->pkt_type = PACKET_BROADCAST;
+ else
+ skb->pkt_type = PACKET_MULTICAST;
+
skb_pull(skb, IB_GRH_BYTES);
skb->protocol = ((struct ipoib_header *) skb->data)->proto;
skb_reset_mac_header(skb);
skb_pull(skb, IPOIB_ENCAP_LEN);
- dev->last_rx = jiffies;
++dev->stats.rx_packets;
dev->stats.rx_bytes += skb->len;
skb->dev = dev;
- /* XXX get correct PACKET_ type here */
- skb->pkt_type = PACKET_HOST;
-
- if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
+ if ((dev->features & NETIF_F_RXCSUM) &&
+ likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
skb->ip_summed = CHECKSUM_UNNECESSARY;
- if (dev->features & NETIF_F_LRO)
- lro_receive_skb(&priv->lro.lro_mgr, skb, NULL);
- else
- netif_receive_skb(skb);
+ napi_gro_receive(&priv->napi, skb);
repost:
if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
@@ -318,9 +331,10 @@ static int ipoib_dma_map_tx(struct ib_device *ca,
off = 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- mapping[i + off] = ib_dma_map_page(ca, frag->page,
- frag->page_offset, frag->size,
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ mapping[i + off] = ib_dma_map_page(ca,
+ skb_frag_page(frag),
+ frag->page_offset, skb_frag_size(frag),
DMA_TO_DEVICE);
if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
goto partial_error;
@@ -329,8 +343,9 @@ static int ipoib_dma_map_tx(struct ib_device *ca,
partial_error:
for (; i > 0; --i) {
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
- ib_dma_unmap_page(ca, mapping[i - !off], frag->size, DMA_TO_DEVICE);
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
+
+ ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE);
}
if (off)
@@ -354,8 +369,9 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
off = 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- ib_dma_unmap_page(ca, mapping[i + off], frag->size,
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag),
DMA_TO_DEVICE);
}
}
@@ -443,14 +459,11 @@ poll_more:
}
if (done < budget) {
- if (dev->features & NETIF_F_LRO)
- lro_flush_all(&priv->lro.lro_mgr);
-
- netif_rx_complete(dev, napi);
+ napi_complete(napi);
if (unlikely(ib_req_notify_cq(priv->recv_cq,
IB_CQ_NEXT_COMP |
IB_CQ_REPORT_MISSED_EVENTS)) &&
- netif_rx_reschedule(dev, napi))
+ napi_reschedule(napi))
goto poll_more;
}
@@ -462,27 +475,28 @@ void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
struct net_device *dev = dev_ptr;
struct ipoib_dev_priv *priv = netdev_priv(dev);
- netif_rx_schedule(dev, &priv->napi);
+ napi_schedule(&priv->napi);
}
static void drain_tx_cq(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- unsigned long flags;
- spin_lock_irqsave(&priv->tx_lock, flags);
+ netif_tx_lock(dev);
while (poll_tx(priv))
; /* nothing */
if (netif_queue_stopped(dev))
mod_timer(&priv->poll_timer, jiffies + 1);
- spin_unlock_irqrestore(&priv->tx_lock, flags);
+ netif_tx_unlock(dev);
}
void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
{
- drain_tx_cq((struct net_device *)dev_ptr);
+ struct ipoib_dev_priv *priv = netdev_priv(dev_ptr);
+
+ mod_timer(&priv->poll_timer, jiffies);
}
static inline int post_send(struct ipoib_dev_priv *priv,
@@ -507,7 +521,7 @@ static inline int post_send(struct ipoib_dev_priv *priv,
for (i = 0; i < nr_frags; ++i) {
priv->tx_sge[i + off].addr = mapping[i + off];
- priv->tx_sge[i + off].length = frags[i].size;
+ priv->tx_sge[i + off].length = skb_frag_size(&frags[i]);
}
priv->tx_wr.num_sge = nr_frags + off;
priv->tx_wr.wr_id = wr_id;
@@ -530,7 +544,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_tx_buf *tx_req;
- int hlen;
+ int hlen, rc;
void *phead;
if (skb_is_gso(skb)) {
@@ -586,9 +600,13 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
netif_stop_queue(dev);
}
- if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
- address->ah, qpn, tx_req, phead, hlen))) {
- ipoib_warn(priv, "post_send failed\n");
+ skb_orphan(skb);
+ skb_dst_drop(skb);
+
+ rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
+ address->ah, qpn, tx_req, phead, hlen);
+ if (unlikely(rc)) {
+ ipoib_warn(priv, "post_send failed, error %d\n", rc);
++dev->stats.tx_errors;
--priv->tx_outstanding;
ipoib_dma_unmap_tx(priv->ca, tx_req);
@@ -600,8 +618,6 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
address->last_send = priv->tx_head;
++priv->tx_head;
- skb_orphan(skb);
-
}
if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
@@ -614,17 +630,20 @@ static void __ipoib_reap_ah(struct net_device *dev)
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_ah *ah, *tah;
LIST_HEAD(remove_list);
+ unsigned long flags;
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
- spin_lock_irq(&priv->tx_lock);
- spin_lock(&priv->lock);
list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
list_del(&ah->list);
ib_destroy_ah(ah->ah);
kfree(ah);
}
- spin_unlock(&priv->lock);
- spin_unlock_irq(&priv->tx_lock);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
}
void ipoib_reap_ah(struct work_struct *work)
@@ -666,28 +685,28 @@ int ipoib_ib_dev_open(struct net_device *dev)
ret = ipoib_ib_post_receives(dev);
if (ret) {
ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
- ipoib_ib_dev_stop(dev, 1);
- return -1;
+ goto dev_stop;
}
ret = ipoib_cm_dev_open(dev);
if (ret) {
ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
- ipoib_ib_dev_stop(dev, 1);
- return -1;
+ goto dev_stop;
}
clear_bit(IPOIB_STOP_REAPER, &priv->flags);
queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
round_jiffies_relative(HZ));
- init_timer(&priv->poll_timer);
- priv->poll_timer.function = ipoib_ib_tx_timer_func;
- priv->poll_timer.data = (unsigned long)dev;
-
- set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
+ if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+ napi_enable(&priv->napi);
return 0;
+dev_stop:
+ if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+ napi_enable(&priv->napi);
+ ipoib_ib_dev_stop(dev, 1);
+ return -1;
}
static void ipoib_pkey_dev_check_presence(struct net_device *dev)
@@ -730,10 +749,8 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush)
if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
mutex_lock(&pkey_mutex);
set_bit(IPOIB_PKEY_STOP, &priv->flags);
- cancel_delayed_work(&priv->pkey_poll_task);
+ cancel_delayed_work_sync(&priv->pkey_poll_task);
mutex_unlock(&pkey_mutex);
- if (flush)
- flush_workqueue(ipoib_workqueue);
}
ipoib_mcast_stop_thread(dev, flush);
@@ -761,6 +778,14 @@ void ipoib_drain_cq(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
int i, n;
+
+ /*
+ * We call completion handling routines that expect to be
+ * called from the BH-disabled NAPI poll context, so disable
+ * BHs here too.
+ */
+ local_bh_disable();
+
do {
n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
for (i = 0; i < n; ++i) {
@@ -784,6 +809,8 @@ void ipoib_drain_cq(struct net_device *dev)
while (poll_tx(priv))
; /* nothing */
+
+ local_bh_enable();
}
int ipoib_ib_dev_stop(struct net_device *dev, int flush)
@@ -794,7 +821,8 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
struct ipoib_tx_buf *tx_req;
int i;
- clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+ napi_disable(&priv->napi);
ipoib_cm_dev_stop(dev);
@@ -892,6 +920,9 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
return -ENODEV;
}
+ setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
+ (unsigned long) dev);
+
if (dev->flags & IFF_UP) {
if (ipoib_ib_dev_open(dev)) {
ipoib_transport_dev_cleanup(dev);
@@ -902,14 +933,49 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
return 0;
}
+/*
+ * Takes whatever value which is in pkey index 0 and updates priv->pkey
+ * returns 0 if the pkey value was changed.
+ */
+static inline int update_parent_pkey(struct ipoib_dev_priv *priv)
+{
+ int result;
+ u16 prev_pkey;
+
+ prev_pkey = priv->pkey;
+ result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
+ if (result) {
+ ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n",
+ priv->port, result);
+ return result;
+ }
+
+ priv->pkey |= 0x8000;
+
+ if (prev_pkey != priv->pkey) {
+ ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n",
+ prev_pkey, priv->pkey);
+ /*
+ * Update the pkey in the broadcast address, while making sure to set
+ * the full membership bit, so that we join the right broadcast group.
+ */
+ priv->dev->broadcast[8] = priv->pkey >> 8;
+ priv->dev->broadcast[9] = priv->pkey & 0xff;
+ return 0;
+ }
+
+ return 1;
+}
+
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
enum ipoib_flush_level level)
{
struct ipoib_dev_priv *cpriv;
struct net_device *dev = priv->dev;
u16 new_index;
+ int result;
- mutex_lock(&priv->vlan_mutex);
+ down_read(&priv->vlan_rwsem);
/*
* Flush any child interfaces too -- they might be up even if
@@ -918,9 +984,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
list_for_each_entry(cpriv, &priv->child_intfs, list)
__ipoib_ib_dev_flush(cpriv, level);
- mutex_unlock(&priv->vlan_mutex);
+ up_read(&priv->vlan_rwsem);
if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
+ /* for non-child devices must check/update the pkey value here */
+ if (level == IPOIB_FLUSH_HEAVY &&
+ !test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
+ update_parent_pkey(priv);
ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
return;
}
@@ -931,21 +1001,32 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
}
if (level == IPOIB_FLUSH_HEAVY) {
- if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
- clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
- ipoib_ib_dev_down(dev, 0);
- ipoib_ib_dev_stop(dev, 0);
- if (ipoib_pkey_dev_delay_open(dev))
+ /* child devices chase their origin pkey value, while non-child
+ * (parent) devices should always takes what present in pkey index 0
+ */
+ if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ ipoib_ib_dev_down(dev, 0);
+ ipoib_ib_dev_stop(dev, 0);
+ if (ipoib_pkey_dev_delay_open(dev))
+ return;
+ }
+ /* restart QP only if P_Key index is changed */
+ if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
+ new_index == priv->pkey_index) {
+ ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
return;
+ }
+ priv->pkey_index = new_index;
+ } else {
+ result = update_parent_pkey(priv);
+ /* restart QP only if P_Key value changed */
+ if (result) {
+ ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n");
+ return;
+ }
}
-
- /* restart QP only if P_Key index is changed */
- if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
- new_index == priv->pkey_index) {
- ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
- return;
- }
- priv->pkey_index = new_index;
}
if (level == IPOIB_FLUSH_LIGHT) {
@@ -1001,6 +1082,11 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
struct ipoib_dev_priv *priv = netdev_priv(dev);
ipoib_dbg(priv, "cleaning up ib_dev\n");
+ /*
+ * We must make sure there are no more (path) completions
+ * that may wish to touch priv fields that are no longer valid
+ */
+ ipoib_flush_paths(dev);
ipoib_mcast_stop_thread(dev, 1);
ipoib_mcast_dev_flush(dev);