diff options
Diffstat (limited to 'drivers/infiniband/ulp/ipoib')
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/Kconfig | 3 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/Makefile | 3 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib.h | 104 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_cm.c | 138 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 83 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_fs.c | 9 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_ib.c | 203 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_main.c | 904 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 130 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 182 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 5 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 152 |
12 files changed, 1280 insertions, 636 deletions
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig index 9d9a9dc51f1..cda8eac55ff 100644 --- a/drivers/infiniband/ulp/ipoib/Kconfig +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -1,7 +1,6 @@ config INFINIBAND_IPOIB tristate "IP-over-InfiniBand" depends on NETDEVICES && INET && (IPV6 || IPV6=n) - select INET_LRO ---help--- Support for the IP-over-InfiniBand protocol (IPoIB). This transports IP packets over InfiniBand so you can use your IB @@ -25,7 +24,7 @@ config INFINIBAND_IPOIB_CM unless you limit mtu for these destinations to 2044. config INFINIBAND_IPOIB_DEBUG - bool "IP-over-InfiniBand debugging" if EMBEDDED + bool "IP-over-InfiniBand debugging" if EXPERT depends on INFINIBAND_IPOIB default y ---help--- diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile index 3090100f0de..e5430dd5076 100644 --- a/drivers/infiniband/ulp/ipoib/Makefile +++ b/drivers/infiniband/ulp/ipoib/Makefile @@ -5,7 +5,8 @@ ib_ipoib-y := ipoib_main.o \ ipoib_multicast.o \ ipoib_verbs.o \ ipoib_vlan.o \ - ipoib_ethtool.o + ipoib_ethtool.o \ + ipoib_netlink.o ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 753a983a5fd..c639f90cfda 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -44,13 +44,14 @@ #include <linux/mutex.h> #include <net/neighbour.h> +#include <net/sch_generic.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> #include <rdma/ib_sa.h> -#include <linux/inet_lro.h> +#include <linux/sched.h> /* constants */ @@ -91,7 +92,8 @@ enum { IPOIB_STOP_REAPER = 7, IPOIB_FLAG_ADMIN_CM = 9, IPOIB_FLAG_UMCAST = 10, - IPOIB_FLAG_CSUM = 11, + IPOIB_STOP_NEIGH_GC = 11, + IPOIB_NEIGH_TBL_FLUSH = 12, IPOIB_MAX_BACKOFF_SECONDS = 16, @@ -99,12 +101,14 @@ enum { IPOIB_MCAST_FLAG_SENDONLY = 1, IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, - - IPOIB_MAX_LRO_DESCRIPTORS = 8, - IPOIB_LRO_MAX_AGGR = 64, + IPOIB_MCAST_JOIN_STARTED = 4, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, + + IPOIB_NON_CHILD = 0, + IPOIB_LEGACY_CHILD = 1, + IPOIB_RTNL_CHILD = 2, }; #define IPOIB_OP_RECV (1ul << 31) @@ -114,6 +118,8 @@ enum { #define IPOIB_OP_CM (0) #endif +#define IPOIB_QPN_MASK ((__force u32) cpu_to_be32(0xFFFFFF)) + /* structs */ struct ipoib_header { @@ -121,8 +127,9 @@ struct ipoib_header { u16 reserved; }; -struct ipoib_pseudoheader { - u8 hwaddr[INFINIBAND_ALEN]; +struct ipoib_cb { + struct qdisc_skb_cb qdisc_cb; + u8 hwaddr[INFINIBAND_ALEN]; }; /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ @@ -145,6 +152,7 @@ struct ipoib_mcast { struct sk_buff_head pkt_queue; struct net_device *dev; + struct completion done; }; struct ipoib_rx_buf { @@ -262,9 +270,21 @@ struct ipoib_ethtool_st { u16 max_coalesced_frames; }; -struct ipoib_lro { - struct net_lro_mgr lro_mgr; - struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS]; +struct ipoib_neigh_table; + +struct ipoib_neigh_hash { + struct ipoib_neigh_table *ntbl; + struct ipoib_neigh __rcu **buckets; + struct rcu_head rcu; + u32 mask; + u32 size; +}; + +struct ipoib_neigh_table { + struct ipoib_neigh_hash __rcu *htbl; + atomic_t entries; + struct completion flushed; + struct completion deleted; }; /* @@ -281,11 +301,13 @@ struct ipoib_dev_priv { unsigned long flags; - struct mutex vlan_mutex; + struct rw_semaphore vlan_rwsem; struct rb_root path_tree; struct list_head path_list; + struct ipoib_neigh_table ntbl; + struct ipoib_mcast *broadcast; struct list_head multicast_list; struct rb_root multicast_tree; @@ -298,7 +320,7 @@ struct ipoib_dev_priv { struct work_struct flush_heavy; struct work_struct restart_task; struct delayed_work ah_reap_task; - + struct delayed_work neigh_reap_task; struct ib_device *ca; u8 port; u16 pkey; @@ -339,6 +361,7 @@ struct ipoib_dev_priv { struct net_device *parent; struct list_head child_intfs; struct list_head list; + int child_type; #ifdef CONFIG_INFINIBAND_IPOIB_CM struct ipoib_cm_dev_priv cm; @@ -352,8 +375,6 @@ struct ipoib_dev_priv { int hca_caps; struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; - - struct ipoib_lro lro; }; struct ipoib_ah { @@ -386,13 +407,16 @@ struct ipoib_neigh { #ifdef CONFIG_INFINIBAND_IPOIB_CM struct ipoib_cm_tx *cm; #endif - union ib_gid dgid; + u8 daddr[INFINIBAND_ALEN]; struct sk_buff_head queue; - struct neighbour *neighbour; struct net_device *dev; struct list_head list; + struct ipoib_neigh __rcu *hnext; + struct rcu_head rcu; + atomic_t refcnt; + unsigned long alive; }; #define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) @@ -403,21 +427,17 @@ static inline int ipoib_ud_need_sg(unsigned int ib_mtu) return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; } -/* - * We stash a pointer to our private neighbour information after our - * hardware address in neigh->ha. The ALIGN() expression here makes - * sure that this pointer is stored aligned so that an unaligned - * load is not needed to dereference it. - */ -static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh) +void ipoib_neigh_dtor(struct ipoib_neigh *neigh); +static inline void ipoib_neigh_put(struct ipoib_neigh *neigh) { - return (void*) neigh + ALIGN(offsetof(struct neighbour, ha) + - INFINIBAND_ALEN, sizeof(void *)); + if (atomic_dec_and_test(&neigh->refcnt)) + ipoib_neigh_dtor(neigh); } - -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh, +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr); +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, struct net_device *dev); -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh); +void ipoib_neigh_free(struct ipoib_neigh *neigh); +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid); extern struct workqueue_struct *ipoib_workqueue; @@ -434,7 +454,6 @@ static inline void ipoib_put_ah(struct ipoib_ah *ah) { kref_put(&ah->ref, ipoib_free_ah); } - int ipoib_open(struct net_device *dev); int ipoib_add_pkey_attr(struct net_device *dev); int ipoib_add_umcast_attr(struct net_device *dev); @@ -464,7 +483,7 @@ void ipoib_dev_cleanup(struct net_device *dev); void ipoib_mcast_join_task(struct work_struct *work); void ipoib_mcast_carrier_on_task(struct work_struct *work); -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb); +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); void ipoib_mcast_restart_task(struct work_struct *work); int ipoib_mcast_start_thread(struct net_device *dev); @@ -502,6 +521,17 @@ void ipoib_event(struct ib_event_handler *handler, int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey); int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int child_type); + +int __init ipoib_netlink_init(void); +void __exit ipoib_netlink_fini(void); + +void ipoib_set_umcast(struct net_device *ndev, int umcast_val); +int ipoib_set_mode(struct net_device *dev, const char *buf); + +void ipoib_setup(struct net_device *dev); + void ipoib_pkey_poll(struct work_struct *work); int ipoib_pkey_dev_delay_open(struct net_device *dev); void ipoib_drain_cq(struct net_device *dev); @@ -509,14 +539,14 @@ void ipoib_drain_cq(struct net_device *dev); void ipoib_set_ethtool_ops(struct net_device *dev); int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); -#ifdef CONFIG_INFINIBAND_IPOIB_CM - #define IPOIB_FLAGS_RC 0x80 #define IPOIB_FLAGS_UC 0x40 /* We don't support UC connections at the moment */ #define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) +#ifdef CONFIG_INFINIBAND_IPOIB_CM + extern int ipoib_max_conn_qp; static inline int ipoib_cm_admin_enabled(struct net_device *dev) @@ -526,10 +556,10 @@ static inline int ipoib_cm_admin_enabled(struct net_device *dev) test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); } -static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) { struct ipoib_dev_priv *priv = netdev_priv(dev); - return IPOIB_CM_SUPPORTED(n->ha) && + return IPOIB_CM_SUPPORTED(hwaddr) && test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); } @@ -584,7 +614,7 @@ static inline int ipoib_cm_admin_enabled(struct net_device *dev) { return 0; } -static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) { return 0; @@ -734,4 +764,6 @@ extern int ipoib_debug_level; #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) +extern const char ipoib_driver_version[]; + #endif /* _IPOIB_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 47d588ba2a7..933efcea0d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -31,12 +31,13 @@ */ #include <rdma/ib_cm.h> -#include <rdma/ib_cache.h> #include <net/dst.h> #include <net/icmp.h> #include <linux/icmpv6.h> #include <linux/delay.h> +#include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/moduleparam.h> #include "ipoib.h" @@ -84,7 +85,7 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); for (i = 0; i < frags; ++i) - ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); + ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); } static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) @@ -139,7 +140,8 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, struct ipoib_cm_rx_buf *rx_ring, int id, int frags, - u64 mapping[IPOIB_CM_RX_SG]) + u64 mapping[IPOIB_CM_RX_SG], + gfp_t gfp) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; @@ -163,13 +165,13 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, } for (i = 0; i < frags; i++) { - struct page *page = alloc_page(GFP_ATOMIC); + struct page *page = alloc_page(gfp); if (!page) goto partial_error; skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE); - mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page, + mapping[i + 1] = ib_dma_map_page(priv->ca, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1]))) goto partial_error; @@ -183,7 +185,7 @@ partial_error: ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); for (; i > 0; --i) - ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); + ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); dev_kfree_skb_any(skb); return NULL; @@ -352,15 +354,13 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i int ret; int i; - rx->rx_ring = vmalloc(ipoib_recvq_size * sizeof *rx->rx_ring); + rx->rx_ring = vzalloc(ipoib_recvq_size * sizeof *rx->rx_ring); if (!rx->rx_ring) { printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n", priv->ca->name, ipoib_recvq_size); return -ENOMEM; } - memset(rx->rx_ring, 0, ipoib_recvq_size * sizeof *rx->rx_ring); - t = kmalloc(sizeof *t, GFP_KERNEL); if (!t) { ret = -ENOMEM; @@ -383,7 +383,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, - rx->rx_ring[i].mapping)) { + rx->rx_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); ret = -ENOMEM; goto err_count; @@ -461,7 +462,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even goto err_qp; } - psn = random32() & 0xffffff; + psn = prandom_u32() & 0xffffff; ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); if (ret) goto err_modify; @@ -539,12 +540,13 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, if (length == 0) { /* don't need this page */ - skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE); + skb_fill_page_desc(toskb, i, skb_frag_page(frag), + 0, PAGE_SIZE); --skb_shinfo(skb)->nr_frags; } else { size = min(length, (unsigned) PAGE_SIZE); - frag->size = size; + skb_frag_size_set(frag, size); skb->data_len += size; skb->truesize += size; skb->len += size; @@ -639,7 +641,8 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; - newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping); + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, + mapping, GFP_ATOMIC); if (unlikely(!newskb)) { /* * If we can't allocate a new RX buffer, dump @@ -663,7 +666,6 @@ copied: skb_reset_mac_header(skb); skb_pull(skb, IPOIB_ENCAP_LEN); - dev->last_rx = jiffies; ++dev->stats.rx_packets; dev->stats.rx_bytes += skb->len; @@ -710,6 +712,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_cm_tx_buf *tx_req; u64 addr; + int rc; if (unlikely(skb->len > tx->mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", @@ -741,9 +744,13 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ tx_req->mapping = addr; - if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), - addr, skb->len))) { - ipoib_warn(priv, "post_send failed\n"); + skb_orphan(skb); + skb_dst_drop(skb); + + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), + addr, skb->len); + if (unlikely(rc)) { + ipoib_warn(priv, "post_send failed, error %d\n", rc); ++dev->stats.tx_errors; ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); dev_kfree_skb_any(skb); @@ -755,6 +762,12 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", tx->qp->qp_num); netif_stop_queue(dev); + rc = ib_req_notify_cq(priv->send_cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc < 0) + ipoib_warn(priv, "request notify on send CQ failed\n"); + else if (rc) + ipoib_send_comp_handler(priv->send_cq, dev); } } } @@ -807,10 +820,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); + ipoib_neigh_free(neigh); tx->neigh = NULL; } @@ -1020,10 +1030,20 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ .cap.max_send_sge = 1, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, - .qp_context = tx + .qp_context = tx, + .create_flags = IB_QP_CREATE_USE_GFP_NOIO }; - return ib_create_qp(priv->pd, &attr); + struct ib_qp *tx_qp; + + tx_qp = ib_create_qp(priv->pd, &attr); + if (PTR_ERR(tx_qp) == -EINVAL) { + ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n", + priv->ca->name); + attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; + tx_qp = ib_create_qp(priv->pd, &attr); + } + return tx_qp; } static int ipoib_cm_send_req(struct net_device *dev, @@ -1094,7 +1114,8 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, struct ipoib_dev_priv *priv = netdev_priv(p->dev); int ret; - p->tx_ring = vmalloc(ipoib_sendq_size * sizeof *p->tx_ring); + p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring, + GFP_NOIO, PAGE_KERNEL); if (!p->tx_ring) { ipoib_warn(priv, "failed to allocate tx ring\n"); ret = -ENOMEM; @@ -1227,10 +1248,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); + ipoib_neigh_free(neigh); tx->neigh = NULL; } @@ -1273,12 +1291,15 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) { struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + unsigned long flags; if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + spin_lock_irqsave(&priv->lock, flags); list_move(&tx->list, &priv->cm.reap_list); queue_work(ipoib_workqueue, &priv->cm.reap_task); ipoib_dbg(priv, "Reap connection for gid %pI6\n", - tx->neigh->dgid.raw); + tx->neigh->daddr + 4); tx->neigh = NULL; + spin_unlock_irqrestore(&priv->lock, flags); } } @@ -1302,7 +1323,7 @@ static void ipoib_cm_tx_start(struct work_struct *work) p = list_entry(priv->cm.start_list.next, typeof(*p), list); list_del_init(&p->list); neigh = p->neigh; - qpn = IPOIB_QPN(neigh->neighbour->ha); + qpn = IPOIB_QPN(neigh->daddr); memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); spin_unlock_irqrestore(&priv->lock, flags); @@ -1317,10 +1338,7 @@ static void ipoib_cm_tx_start(struct work_struct *work) neigh = p->neigh; if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); + ipoib_neigh_free(neigh); } list_del(&p->list); kfree(p); @@ -1374,9 +1392,9 @@ static void ipoib_cm_skb_reap(struct work_struct *work) if (skb->protocol == htons(ETH_P_IP)) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, priv->dev); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); #endif dev_kfree_skb_any(skb); @@ -1394,8 +1412,8 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, struct ipoib_dev_priv *priv = netdev_priv(dev); int e = skb_queue_empty(&priv->cm.skb_queue); - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); skb_queue_tail(&priv->cm.skb_queue, skb); if (e) @@ -1453,40 +1471,19 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, const char *buf, size_t count) { struct net_device *dev = to_net_dev(d); - struct ipoib_dev_priv *priv = netdev_priv(dev); - - /* flush paths if we switch modes so that connections are restarted */ - if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { - set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); - ipoib_warn(priv, "enabling connected mode " - "will cause multicast packet drops\n"); - - rtnl_lock(); - dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO); - rtnl_unlock(); - priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + int ret; - ipoib_flush_paths(dev); - return count; - } + if (!rtnl_trylock()) + return restart_syscall(); - if (!strcmp(buf, "datagram\n")) { - clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + ret = ipoib_set_mode(dev, buf); - rtnl_lock(); - if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) { - dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG; - if (priv->hca_caps & IB_DEVICE_UD_TSO) - dev->features |= NETIF_F_TSO; - } - dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); - rtnl_unlock(); - ipoib_flush_paths(dev); + rtnl_unlock(); + if (!ret) return count; - } - return -EINVAL; + return ret; } static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode); @@ -1500,6 +1497,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_srq_init_attr srq_init_attr = { + .srq_type = IB_SRQT_BASIC, .attr = { .max_wr = ipoib_recvq_size, .max_sge = max_sge @@ -1515,7 +1513,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) return; } - priv->cm.srq_ring = vmalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring); + priv->cm.srq_ring = vzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring); if (!priv->cm.srq_ring) { printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n", priv->ca->name, ipoib_recvq_size); @@ -1524,7 +1522,6 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) return; } - memset(priv->cm.srq_ring, 0, ipoib_recvq_size * sizeof *priv->cm.srq_ring); } int ipoib_cm_dev_init(struct net_device *dev) @@ -1574,7 +1571,8 @@ int ipoib_cm_dev_init(struct net_device *dev) for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, priv->cm.num_frags - 1, - priv->cm.srq_ring[i].mapping)) { + priv->cm.srq_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate " "receive buffer %d\n", i); ipoib_cm_dev_cleanup(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index e9795f60e5d..078cadd6c79 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -39,14 +39,24 @@ static void ipoib_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { - strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1); -} + struct ipoib_dev_priv *priv = netdev_priv(netdev); + struct ib_device_attr *attr; -static u32 ipoib_get_rx_csum(struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - return test_bit(IPOIB_FLAG_CSUM, &priv->flags) && - !test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + attr = kmalloc(sizeof(*attr), GFP_KERNEL); + if (attr && !ib_query_device(priv->ca, attr)) + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", (int)(attr->fw_ver >> 32), + (int)(attr->fw_ver >> 16) & 0xffff, + (int)attr->fw_ver & 0xffff); + kfree(attr); + + strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device), + sizeof(drvinfo->bus_info)); + + strlcpy(drvinfo->version, ipoib_driver_version, + sizeof(drvinfo->version)); + + strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver)); } static int ipoib_get_coalesce(struct net_device *dev, @@ -55,9 +65,7 @@ static int ipoib_get_coalesce(struct net_device *dev, struct ipoib_dev_priv *priv = netdev_priv(dev); coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; - coal->tx_coalesce_usecs = priv->ethtool.coalesce_usecs; coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames; - coal->tx_max_coalesced_frames = priv->ethtool.max_coalesced_frames; return 0; } @@ -69,10 +77,8 @@ static int ipoib_set_coalesce(struct net_device *dev, int ret; /* - * Since IPoIB uses a single CQ for both rx and tx, we assume - * that rx params dictate the configuration. These values are - * saved in the private data and returned when ipoib_get_coalesce() - * is called. + * These values are saved in the private data and returned + * when ipoib_get_coalesce() is called */ if (coal->rx_coalesce_usecs > 0xffff || coal->rx_max_coalesced_frames > 0xffff) @@ -85,68 +91,19 @@ static int ipoib_set_coalesce(struct net_device *dev, return ret; } - coal->tx_coalesce_usecs = coal->rx_coalesce_usecs; - coal->tx_max_coalesced_frames = coal->rx_max_coalesced_frames; priv->ethtool.coalesce_usecs = coal->rx_coalesce_usecs; priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames; return 0; } -static const char ipoib_stats_keys[][ETH_GSTRING_LEN] = { - "LRO aggregated", "LRO flushed", - "LRO avg aggr", "LRO no desc" -}; - -static void ipoib_get_strings(struct net_device *netdev, u32 stringset, u8 *data) -{ - switch (stringset) { - case ETH_SS_STATS: - memcpy(data, *ipoib_stats_keys, sizeof(ipoib_stats_keys)); - break; - } -} - -static int ipoib_get_sset_count(struct net_device *dev, int sset) -{ - switch (sset) { - case ETH_SS_STATS: - return ARRAY_SIZE(ipoib_stats_keys); - default: - return -EOPNOTSUPP; - } -} - -static void ipoib_get_ethtool_stats(struct net_device *dev, - struct ethtool_stats *stats, uint64_t *data) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - int index = 0; - - /* Get LRO statistics */ - data[index++] = priv->lro.lro_mgr.stats.aggregated; - data[index++] = priv->lro.lro_mgr.stats.flushed; - if (priv->lro.lro_mgr.stats.flushed) - data[index++] = priv->lro.lro_mgr.stats.aggregated / - priv->lro.lro_mgr.stats.flushed; - else - data[index++] = 0; - data[index++] = priv->lro.lro_mgr.stats.no_desc; -} - static const struct ethtool_ops ipoib_ethtool_ops = { .get_drvinfo = ipoib_get_drvinfo, - .get_rx_csum = ipoib_get_rx_csum, .get_coalesce = ipoib_get_coalesce, .set_coalesce = ipoib_set_coalesce, - .get_flags = ethtool_op_get_flags, - .set_flags = ethtool_op_set_flags, - .get_strings = ipoib_get_strings, - .get_sset_count = ipoib_get_sset_count, - .get_ethtool_stats = ipoib_get_ethtool_stats, }; void ipoib_set_ethtool_ops(struct net_device *dev) { - SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops); + dev->ethtool_ops = &ipoib_ethtool_ops; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index 961c585da21..50061854616 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -32,10 +32,12 @@ #include <linux/err.h> #include <linux/seq_file.h> +#include <linux/slab.h> struct file_operations; #include <linux/debugfs.h> +#include <linux/export.h> #include "ipoib.h" @@ -211,16 +213,15 @@ static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr) gid_buf, path.pathrec.dlid ? "yes" : "no"); if (path.pathrec.dlid) { - rate = ib_rate_to_mult(path.pathrec.rate) * 25; + rate = ib_rate_to_mbps(path.pathrec.rate); seq_printf(file, " DLID: 0x%04x\n" " SL: %12d\n" - " rate: %*d%s Gb/sec\n", + " rate: %8d.%d Gb/sec\n", be16_to_cpu(path.pathrec.dlid), path.pathrec.sl, - 10 - ((rate % 10) ? 2 : 0), - rate / 10, rate % 10 ? ".5" : ""); + rate / 1000, rate % 1000); } seq_putc(file, '\n'); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index a1925810be3..6a7003ddb0b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -34,9 +34,10 @@ */ #include <linux/delay.h> +#include <linux/moduleparam.h> #include <linux/dma-mapping.h> +#include <linux/slab.h> -#include <rdma/ib_cache.h> #include <linux/ip.h> #include <linux/tcp.h> @@ -56,21 +57,24 @@ struct ipoib_ah *ipoib_create_ah(struct net_device *dev, struct ib_pd *pd, struct ib_ah_attr *attr) { struct ipoib_ah *ah; + struct ib_ah *vah; ah = kmalloc(sizeof *ah, GFP_KERNEL); if (!ah) - return NULL; + return ERR_PTR(-ENOMEM); ah->dev = dev; ah->last_send = 0; kref_init(&ah->ref); - ah->ah = ib_create_ah(pd, attr); - if (IS_ERR(ah->ah)) { + vah = ib_create_ah(pd, attr); + if (IS_ERR(vah)) { kfree(ah); - ah = NULL; - } else + ah = (struct ipoib_ah *)vah; + } else { + ah->ah = vah; ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah); + } return ah; } @@ -117,9 +121,9 @@ static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv, size = length - IPOIB_UD_HEAD_SIZE; - frag->size = size; + skb_frag_size_set(frag, size); skb->data_len += size; - skb->truesize += size; + skb->truesize += PAGE_SIZE; } else skb_put(skb, length); @@ -152,14 +156,18 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; int buf_size; + int tailroom; u64 *mapping; - if (ipoib_ud_need_sg(priv->max_ib_mtu)) + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { buf_size = IPOIB_UD_HEAD_SIZE; - else + tailroom = 128; /* reserve some tailroom for IP/TCP headers */ + } else { buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + tailroom = 0; + } - skb = dev_alloc_skb(buf_size + 4); + skb = dev_alloc_skb(buf_size + tailroom + 4); if (unlikely(!skb)) return NULL; @@ -182,7 +190,7 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) goto partial_error; skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE); mapping[1] = - ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[0].page, + ib_dma_map_page(priv->ca, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1]))) goto partial_error; @@ -223,6 +231,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; struct sk_buff *skb; u64 mapping[IPOIB_UD_RX_SG]; + union ib_gid *dgid; ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", wr_id, wc->status); @@ -271,27 +280,31 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) ipoib_ud_dma_unmap_rx(priv, mapping); ipoib_ud_skb_put_frags(priv, skb, wc->byte_len); + /* First byte of dgid signals multicast when 0xff */ + dgid = &((struct ib_grh *)skb->data)->dgid; + + if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff) + skb->pkt_type = PACKET_HOST; + else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + skb_pull(skb, IB_GRH_BYTES); skb->protocol = ((struct ipoib_header *) skb->data)->proto; skb_reset_mac_header(skb); skb_pull(skb, IPOIB_ENCAP_LEN); - dev->last_rx = jiffies; ++dev->stats.rx_packets; dev->stats.rx_bytes += skb->len; skb->dev = dev; - /* XXX get correct PACKET_ type here */ - skb->pkt_type = PACKET_HOST; - - if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) + if ((dev->features & NETIF_F_RXCSUM) && + likely(wc->wc_flags & IB_WC_IP_CSUM_OK)) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (dev->features & NETIF_F_LRO) - lro_receive_skb(&priv->lro.lro_mgr, skb, NULL); - else - netif_receive_skb(skb); + napi_gro_receive(&priv->napi, skb); repost: if (unlikely(ipoib_ib_post_receive(dev, wr_id))) @@ -318,9 +331,10 @@ static int ipoib_dma_map_tx(struct ib_device *ca, off = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - mapping[i + off] = ib_dma_map_page(ca, frag->page, - frag->page_offset, frag->size, + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + mapping[i + off] = ib_dma_map_page(ca, + skb_frag_page(frag), + frag->page_offset, skb_frag_size(frag), DMA_TO_DEVICE); if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) goto partial_error; @@ -329,8 +343,9 @@ static int ipoib_dma_map_tx(struct ib_device *ca, partial_error: for (; i > 0; --i) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; - ib_dma_unmap_page(ca, mapping[i - !off], frag->size, DMA_TO_DEVICE); + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; + + ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE); } if (off) @@ -354,8 +369,9 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, off = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - ib_dma_unmap_page(ca, mapping[i + off], frag->size, + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag), DMA_TO_DEVICE); } } @@ -443,14 +459,11 @@ poll_more: } if (done < budget) { - if (dev->features & NETIF_F_LRO) - lro_flush_all(&priv->lro.lro_mgr); - - netif_rx_complete(napi); + napi_complete(napi); if (unlikely(ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)) && - netif_rx_reschedule(napi)) + napi_reschedule(napi)) goto poll_more; } @@ -462,7 +475,7 @@ void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) struct net_device *dev = dev_ptr; struct ipoib_dev_priv *priv = netdev_priv(dev); - netif_rx_schedule(&priv->napi); + napi_schedule(&priv->napi); } static void drain_tx_cq(struct net_device *dev) @@ -508,7 +521,7 @@ static inline int post_send(struct ipoib_dev_priv *priv, for (i = 0; i < nr_frags; ++i) { priv->tx_sge[i + off].addr = mapping[i + off]; - priv->tx_sge[i + off].length = frags[i].size; + priv->tx_sge[i + off].length = skb_frag_size(&frags[i]); } priv->tx_wr.num_sge = nr_frags + off; priv->tx_wr.wr_id = wr_id; @@ -531,7 +544,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_tx_buf *tx_req; - int hlen; + int hlen, rc; void *phead; if (skb_is_gso(skb)) { @@ -587,9 +600,13 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, netif_stop_queue(dev); } - if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), - address->ah, qpn, tx_req, phead, hlen))) { - ipoib_warn(priv, "post_send failed\n"); + skb_orphan(skb); + skb_dst_drop(skb); + + rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), + address->ah, qpn, tx_req, phead, hlen); + if (unlikely(rc)) { + ipoib_warn(priv, "post_send failed, error %d\n", rc); ++dev->stats.tx_errors; --priv->tx_outstanding; ipoib_dma_unmap_tx(priv->ca, tx_req); @@ -601,8 +618,6 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, address->last_send = priv->tx_head; ++priv->tx_head; - skb_orphan(skb); - } if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) @@ -670,24 +685,28 @@ int ipoib_ib_dev_open(struct net_device *dev) ret = ipoib_ib_post_receives(dev); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } ret = ipoib_cm_dev_open(dev); if (ret) { ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, round_jiffies_relative(HZ)); - set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); return 0; +dev_stop: + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); + ipoib_ib_dev_stop(dev, 1); + return -1; } static void ipoib_pkey_dev_check_presence(struct net_device *dev) @@ -730,10 +749,8 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); set_bit(IPOIB_PKEY_STOP, &priv->flags); - cancel_delayed_work(&priv->pkey_poll_task); + cancel_delayed_work_sync(&priv->pkey_poll_task); mutex_unlock(&pkey_mutex); - if (flush) - flush_workqueue(ipoib_workqueue); } ipoib_mcast_stop_thread(dev, flush); @@ -804,7 +821,8 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush) struct ipoib_tx_buf *tx_req; int i; - clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_disable(&priv->napi); ipoib_cm_dev_stop(dev); @@ -915,14 +933,49 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) return 0; } +/* + * Takes whatever value which is in pkey index 0 and updates priv->pkey + * returns 0 if the pkey value was changed. + */ +static inline int update_parent_pkey(struct ipoib_dev_priv *priv) +{ + int result; + u16 prev_pkey; + + prev_pkey = priv->pkey; + result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); + if (result) { + ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n", + priv->port, result); + return result; + } + + priv->pkey |= 0x8000; + + if (prev_pkey != priv->pkey) { + ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n", + prev_pkey, priv->pkey); + /* + * Update the pkey in the broadcast address, while making sure to set + * the full membership bit, so that we join the right broadcast group. + */ + priv->dev->broadcast[8] = priv->pkey >> 8; + priv->dev->broadcast[9] = priv->pkey & 0xff; + return 0; + } + + return 1; +} + static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, enum ipoib_flush_level level) { struct ipoib_dev_priv *cpriv; struct net_device *dev = priv->dev; u16 new_index; + int result; - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); /* * Flush any child interfaces too -- they might be up even if @@ -931,9 +984,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, list_for_each_entry(cpriv, &priv->child_intfs, list) __ipoib_ib_dev_flush(cpriv, level); - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { + /* for non-child devices must check/update the pkey value here */ + if (level == IPOIB_FLUSH_HEAVY && + !test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + update_parent_pkey(priv); ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } @@ -944,21 +1001,32 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, } if (level == IPOIB_FLUSH_HEAVY) { - if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { - clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - ipoib_ib_dev_down(dev, 0); - ipoib_ib_dev_stop(dev, 0); - if (ipoib_pkey_dev_delay_open(dev)) + /* child devices chase their origin pkey value, while non-child + * (parent) devices should always takes what present in pkey index 0 + */ + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_stop(dev, 0); + if (ipoib_pkey_dev_delay_open(dev)) + return; + } + /* restart QP only if P_Key index is changed */ + if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && + new_index == priv->pkey_index) { + ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); return; + } + priv->pkey_index = new_index; + } else { + result = update_parent_pkey(priv); + /* restart QP only if P_Key value changed */ + if (result) { + ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n"); + return; + } } - - /* restart QP only if P_Key index is changed */ - if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && - new_index == priv->pkey_index) { - ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); - return; - } - priv->pkey_index = new_index; } if (level == IPOIB_FLUSH_LIGHT) { @@ -1014,6 +1082,11 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_dbg(priv, "cleaning up ib_dev\n"); + /* + * We must make sure there are no more (path) completions + * that may wish to touch priv fields that are no longer valid + */ + ipoib_flush_paths(dev); ipoib_mcast_stop_thread(dev, 1); ipoib_mcast_dev_flush(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 0bd2a4ff084..5786a78ff8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -46,11 +46,17 @@ #include <linux/ip.h> #include <linux/in.h> -#include <net/dst.h> +#include <linux/jhash.h> +#include <net/arp.h> + +#define DRV_VERSION "1.0.0" + +const char ipoib_driver_version[] = DRV_VERSION; MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; @@ -60,15 +66,6 @@ MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); -static int lro; -module_param(lro, bool, 0444); -MODULE_PARM_DESC(lro, "Enable LRO (Large Receive Offload)"); - -static int lro_max_aggr = IPOIB_LRO_MAX_AGGR; -module_param(lro_max_aggr, int, 0644); -MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated " - "(default = 64)"); - #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level; @@ -93,6 +90,7 @@ struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device); +static void ipoib_neigh_reclaim(struct rcu_head *rp); static struct ib_client ipoib_client = { .name = "ipoib", @@ -106,8 +104,9 @@ int ipoib_open(struct net_device *dev) ipoib_dbg(priv, "bringing up interface\n"); - if (!test_and_set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - napi_enable(&priv->napi); + netif_carrier_off(dev); + + set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(dev)) return 0; @@ -122,7 +121,7 @@ int ipoib_open(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -132,7 +131,7 @@ int ipoib_open(struct net_device *dev) dev_change_flags(cpriv->dev, flags | IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } netif_start_queue(dev); @@ -143,7 +142,6 @@ err_stop: ipoib_ib_dev_stop(dev, 1); err_disable: - napi_disable(&priv->napi); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; @@ -156,18 +154,17 @@ static int ipoib_stop(struct net_device *dev) ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); - napi_disable(&priv->napi); netif_stop_queue(dev); - ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_down(dev, 1); ipoib_ib_dev_stop(dev, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -177,12 +174,27 @@ static int ipoib_stop(struct net_device *dev) dev_change_flags(cpriv->dev, flags & ~IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } return 0; } +static void ipoib_uninit(struct net_device *dev) +{ + ipoib_dev_cleanup(dev); +} + +static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO); + + return features; +} + static int ipoib_change_mtu(struct net_device *dev, int new_mtu) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -210,6 +222,37 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) return 0; } +int ipoib_set_mode(struct net_device *dev, const char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* flush paths if we switch modes so that connections are restarted */ + if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { + set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + ipoib_warn(priv, "enabling connected mode " + "will cause multicast packet drops\n"); + netdev_update_features(dev); + rtnl_unlock(); + priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + + ipoib_flush_paths(dev); + rtnl_lock(); + return 0; + } + + if (!strcmp(buf, "datagram\n")) { + clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + netdev_update_features(dev); + dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); + rtnl_unlock(); + ipoib_flush_paths(dev); + rtnl_lock(); + return 0; + } + + return -EINVAL; +} + static struct ipoib_path *__path_find(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -266,30 +309,15 @@ static int __path_add(struct net_device *dev, struct ipoib_path *path) static void path_free(struct net_device *dev, struct ipoib_path *path) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_neigh *neigh, *tn; struct sk_buff *skb; - unsigned long flags; while ((skb = __skb_dequeue(&path->queue))) dev_kfree_skb_irq(skb); - spin_lock_irqsave(&priv->lock, flags); - - list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { - /* - * It's safe to call ipoib_put_ah() inside priv->lock - * here, because we know that path->ah will always - * hold one more reference, so ipoib_put_ah() will - * never do more than decrement the ref count. - */ - if (neigh->ah) - ipoib_put_ah(neigh->ah); - - ipoib_neigh_free(dev, neigh); - } + ipoib_dbg(netdev_priv(dev), "path_free\n"); - spin_unlock_irqrestore(&priv->lock, flags); + /* remove all neigh connected to this path */ + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); if (path->ah) ipoib_put_ah(path->ah); @@ -434,7 +462,7 @@ static void path_rec_completion(int status, spin_lock_irqsave(&priv->lock, flags); - if (ah) { + if (!IS_ERR_OR_NULL(ah)) { path->pathrec = *pathrec; old_ah = path->ah; @@ -460,19 +488,14 @@ static void path_rec_completion(int status, } kref_get(&path->ah->ref); neigh->ah = path->ah; - memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, - sizeof(union ib_gid)); - if (ipoib_cm_enabled(dev, neigh->neighbour)) { + if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); + ipoib_neigh_free(neigh); continue; } } @@ -488,6 +511,9 @@ static void path_rec_completion(int status, spin_unlock_irqrestore(&priv->lock, flags); + if (IS_ERR_OR_NULL(ah)) + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); + if (old_ah) ipoib_put_ah(old_ah); @@ -557,25 +583,26 @@ static int path_rec_start(struct net_device *dev, return 0; } -static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) +static void neigh_add_path(struct sk_buff *skb, u8 *daddr, + struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path; struct ipoib_neigh *neigh; unsigned long flags; - neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev); + spin_lock_irqsave(&priv->lock, flags); + neigh = ipoib_neigh_alloc(daddr, dev); if (!neigh) { + spin_unlock_irqrestore(&priv->lock, flags); ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return; } - spin_lock_irqsave(&priv->lock, flags); - - path = __path_find(dev, skb->dst->neighbour->ha + 4); + path = __path_find(dev, daddr + 4); if (!path) { - path = path_rec_create(dev, skb->dst->neighbour->ha + 4); + path = path_rec_create(dev, daddr + 4); if (!path) goto err_path; @@ -587,17 +614,12 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) if (path->ah) { kref_get(&path->ah->ref); neigh->ah = path->ah; - memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, - sizeof(union ib_gid)); - if (ipoib_cm_enabled(dev, neigh->neighbour)) { + if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); + ipoib_neigh_free(neigh); goto err_drop; } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) @@ -607,50 +629,37 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) skb_queue_len(&neigh->queue)); goto err_drop; } - } else - ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha)); + } else { + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr)); + ipoib_neigh_put(neigh); + return; + } } else { neigh->ah = NULL; if (!path->query && path_rec_start(dev, path)) - goto err_list; + goto err_path; __skb_queue_tail(&neigh->queue, skb); } spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); return; -err_list: - list_del(&neigh->list); - err_path: - ipoib_neigh_free(dev, neigh); + ipoib_neigh_free(neigh); err_drop: ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); spin_unlock_irqrestore(&priv->lock, flags); -} - -static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(skb->dev); - - /* Look up path record for unicasts */ - if (skb->dst->neighbour->ha[4] != 0xff) { - neigh_add_path(skb, dev); - return; - } - - /* Add in the P_Key for multicasts */ - skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff; - skb->dst->neighbour->ha[9] = priv->pkey & 0xff; - ipoib_mcast_send(dev, skb->dst->neighbour->ha + 4, skb); + ipoib_neigh_put(neigh); } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, - struct ipoib_pseudoheader *phdr) + struct ipoib_cb *cb) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path; @@ -658,18 +667,21 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, spin_lock_irqsave(&priv->lock, flags); - path = __path_find(dev, phdr->hwaddr + 4); + path = __path_find(dev, cb->hwaddr + 4); if (!path || !path->valid) { - if (!path) - path = path_rec_create(dev, phdr->hwaddr + 4); + int new_path = 0; + + if (!path) { + path = path_rec_create(dev, cb->hwaddr + 4); + new_path = 1; + } if (path) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof *phdr); __skb_queue_tail(&path->queue, skb); if (!path->query && path_rec_start(dev, path)) { spin_unlock_irqrestore(&priv->lock, flags); - path_free(dev, path); + if (new_path) + path_free(dev, path); return; } else __path_add(dev, path); @@ -686,11 +698,11 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, ipoib_dbg(priv, "Send unicast ARP to %04x\n", be16_to_cpu(path->pathrec.dlid)); - ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr)); + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr)); + return; } else if ((path->query || !path_rec_start(dev, path)) && skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof *phdr); __skb_queue_tail(&path->queue, skb); } else { ++dev->stats.tx_dropped; @@ -704,85 +716,82 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_neigh *neigh; + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; + struct ipoib_header *header; unsigned long flags; - if (likely(skb->dst && skb->dst->neighbour)) { - if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) { - ipoib_path_lookup(skb, dev); - return NETDEV_TX_OK; - } + header = (struct ipoib_header *) skb->data; - neigh = *to_ipoib_neigh(skb->dst->neighbour); - - if (unlikely((memcmp(&neigh->dgid.raw, - skb->dst->neighbour->ha + 4, - sizeof(union ib_gid))) || - (neigh->dev != dev))) { - spin_lock_irqsave(&priv->lock, flags); - /* - * It's safe to call ipoib_put_ah() inside - * priv->lock here, because we know that - * path->ah will always hold one more reference, - * so ipoib_put_ah() will never do more than - * decrement the ref count. - */ - if (neigh->ah) - ipoib_put_ah(neigh->ah); - list_del(&neigh->list); - ipoib_neigh_free(dev, neigh); - spin_unlock_irqrestore(&priv->lock, flags); - ipoib_path_lookup(skb, dev); + if (unlikely(cb->hwaddr[4] == 0xff)) { + /* multicast, arrange "if" according to probability */ + if ((header->proto != htons(ETH_P_IP)) && + (header->proto != htons(ETH_P_IPV6)) && + (header->proto != htons(ETH_P_ARP)) && + (header->proto != htons(ETH_P_RARP)) && + (header->proto != htons(ETH_P_TIPC))) { + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); return NETDEV_TX_OK; } + /* Add in the P_Key for multicast*/ + cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; + cb->hwaddr[9] = priv->pkey & 0xff; + + neigh = ipoib_neigh_get(dev, cb->hwaddr); + if (likely(neigh)) + goto send_using_neigh; + ipoib_mcast_send(dev, cb->hwaddr, skb); + return NETDEV_TX_OK; + } - if (ipoib_cm_get(neigh)) { - if (ipoib_cm_up(neigh)) { - ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); - return NETDEV_TX_OK; - } - } else if (neigh->ah) { - ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha)); + /* unicast, arrange "switch" according to probability */ + switch (header->proto) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_TIPC): + neigh = ipoib_neigh_get(dev, cb->hwaddr); + if (unlikely(!neigh)) { + neigh_add_path(skb, cb->hwaddr, dev); return NETDEV_TX_OK; } + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + /* for unicast ARP and RARP should always perform path find */ + unicast_arp_send(skb, dev, cb); + return NETDEV_TX_OK; + default: + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - spin_lock_irqsave(&priv->lock, flags); - __skb_queue_tail(&neigh->queue, skb); - spin_unlock_irqrestore(&priv->lock, flags); - } else { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); +send_using_neigh: + /* note we now hold a ref to neigh */ + if (ipoib_cm_get(neigh)) { + if (ipoib_cm_up(neigh)) { + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); + goto unref; } - } else { - struct ipoib_pseudoheader *phdr = - (struct ipoib_pseudoheader *) skb->data; - skb_pull(skb, sizeof *phdr); - - if (phdr->hwaddr[4] == 0xff) { - /* Add in the P_Key for multicast*/ - phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; - phdr->hwaddr[9] = priv->pkey & 0xff; - - ipoib_mcast_send(dev, phdr->hwaddr + 4, skb); - } else { - /* unicast GID -- should be ARP or RARP reply */ - - if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) && - (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) { - ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n", - skb->dst ? "neigh" : "dst", - be16_to_cpup((__be16 *) skb->data), - IPOIB_QPN(phdr->hwaddr), - phdr->hwaddr + 4); - dev_kfree_skb_any(skb); - ++dev->stats.tx_dropped; - return NETDEV_TX_OK; - } + } else if (neigh->ah) { + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr)); + goto unref; + } - unicast_arp_send(skb, dev, phdr); - } + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + spin_lock_irqsave(&priv->lock, flags); + __skb_queue_tail(&neigh->queue, skb); + spin_unlock_irqrestore(&priv->lock, flags); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); } +unref: + ipoib_neigh_put(neigh); + return NETDEV_TX_OK; } @@ -804,6 +813,7 @@ static int ipoib_hard_header(struct sk_buff *skb, const void *daddr, const void *saddr, unsigned len) { struct ipoib_header *header; + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; header = (struct ipoib_header *) skb_push(skb, sizeof *header); @@ -811,17 +821,13 @@ static int ipoib_hard_header(struct sk_buff *skb, header->reserved = 0; /* - * If we don't have a neighbour structure, stuff the - * destination address onto the front of the skb so we can - * figure out where to send the packet later. + * we don't rely on dst_entry structure, always stuff the + * destination address into skb->cb so we can figure out where + * to send the packet later. */ - if ((!skb->dst || !skb->dst->neighbour) && daddr) { - struct ipoib_pseudoheader *phdr = - (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr); - memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); - } + memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); - return 0; + return sizeof *header; } static void ipoib_set_mcast_list(struct net_device *dev) @@ -836,94 +842,443 @@ static void ipoib_set_mcast_list(struct net_device *dev) queue_work(ipoib_workqueue, &priv->restart_task); } -static void ipoib_neigh_cleanup(struct neighbour *n) +static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) { - struct ipoib_neigh *neigh; - struct ipoib_dev_priv *priv = netdev_priv(n->dev); + /* + * Use only the address parts that contributes to spreading + * The subnet prefix is not used as one can not connect to + * same remote port (GUID) using the same remote QPN via two + * different subnets. + */ + /* qpn octets[1:4) & port GUID octets[12:20) */ + u32 *d32 = (u32 *) daddr; + u32 hv; + + hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); + return hv & htbl->mask; +} + +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh = NULL; + u32 hash_val; + + rcu_read_lock_bh(); + + htbl = rcu_dereference_bh(ntbl->htbl); + + if (!htbl) + goto out_unlock; + + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); + neigh != NULL; + neigh = rcu_dereference_bh(neigh->hnext)) { + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + goto out_unlock; + } + neigh->alive = jiffies; + goto out_unlock; + } + } + +out_unlock: + rcu_read_unlock_bh(); + return neigh; +} + +static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long neigh_obsolete; + unsigned long dt; unsigned long flags; - struct ipoib_ah *ah = NULL; + int i; - neigh = *to_ipoib_neigh(n); - if (neigh) - priv = netdev_priv(neigh->dev); - else + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) return; - ipoib_dbg(priv, - "neigh_cleanup for %06x %pI6\n", - IPOIB_QPN(n->ha), - n->ha + 4); spin_lock_irqsave(&priv->lock, flags); - if (neigh->ah) - ah = neigh->ah; - list_del(&neigh->list); - ipoib_neigh_free(n->dev, neigh); + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + /* neigh is obsolete if it was idle for two GC periods */ + dt = 2 * arp_tbl.gc_interval; + neigh_obsolete = jiffies - dt; + /* handle possible race condition */ + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* was the neigh idle for two GC periods */ + if (time_after(neigh_obsolete, neigh->alive)) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } +out_unlock: spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_reap_neigh(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); + + __ipoib_reap_neigh(priv); - if (ah) - ipoib_put_ah(ah); + if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + arp_tbl.gc_interval); } -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, + +static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, struct net_device *dev) { struct ipoib_neigh *neigh; - neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); + neigh = kzalloc(sizeof *neigh, GFP_ATOMIC); if (!neigh) return NULL; - neigh->neighbour = neighbour; neigh->dev = dev; - *to_ipoib_neigh(neighbour) = neigh; + memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); skb_queue_head_init(&neigh->queue); + INIT_LIST_HEAD(&neigh->list); ipoib_cm_set(neigh, NULL); + /* one ref on behalf of the caller */ + atomic_set(&neigh->refcnt, 1); + + return neigh; +} + +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) { + neigh = NULL; + goto out_unlock; + } + + /* need to add a new neigh, but maybe some other thread succeeded? + * recalc hash, maybe hash resize took place so we do a search + */ + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock)); + neigh != NULL; + neigh = rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))) { + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + break; + } + neigh->alive = jiffies; + goto out_unlock; + } + } + + neigh = ipoib_neigh_ctor(daddr, dev); + if (!neigh) + goto out_unlock; + + /* one ref on behalf of the hash table */ + atomic_inc(&neigh->refcnt); + neigh->alive = jiffies; + /* put in hash */ + rcu_assign_pointer(neigh->hnext, + rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock))); + rcu_assign_pointer(htbl->buckets[hash_val], neigh); + atomic_inc(&ntbl->entries); + +out_unlock: return neigh; } -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) +void ipoib_neigh_dtor(struct ipoib_neigh *neigh) { + /* neigh reference count was dropprd to zero */ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; - *to_ipoib_neigh(neigh->neighbour) = NULL; + if (neigh->ah) + ipoib_put_ah(neigh->ah); while ((skb = __skb_dequeue(&neigh->queue))) { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } if (ipoib_cm_get(neigh)) ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); + ipoib_dbg(netdev_priv(dev), + "neigh free for %06x %pI6\n", + IPOIB_QPN(neigh->daddr), + neigh->daddr + 4); kfree(neigh); + if (atomic_dec_and_test(&priv->ntbl.entries)) { + if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) + complete(&priv->ntbl.flushed); + } } -static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) +static void ipoib_neigh_reclaim(struct rcu_head *rp) { - parms->neigh_cleanup = ipoib_neigh_cleanup; + /* Called as a result of removal from hash table */ + struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); + /* note TX context may hold another ref */ + ipoib_neigh_put(neigh); +} + +void ipoib_neigh_free(struct ipoib_neigh *neigh) +{ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh __rcu **np; + struct ipoib_neigh *n; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) + return; + + hash_val = ipoib_addr_hash(htbl, neigh->daddr); + np = &htbl->buckets[hash_val]; + for (n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock)); + n != NULL; + n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) { + if (n == neigh) { + /* found */ + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + return; + } else { + np = &n->hnext; + } + } +} + +static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh **buckets; + u32 size; + + clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + ntbl->htbl = NULL; + htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); + if (!htbl) + return -ENOMEM; + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + size = roundup_pow_of_two(arp_tbl.gc_thresh3); + buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL); + if (!buckets) { + kfree(htbl); + return -ENOMEM; + } + htbl->size = size; + htbl->mask = (size - 1); + htbl->buckets = buckets; + ntbl->htbl = htbl; + htbl->ntbl = ntbl; + atomic_set(&ntbl->entries, 0); + + /* start garbage collection */ + clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + arp_tbl.gc_interval); return 0; } +static void neigh_hash_free_rcu(struct rcu_head *head) +{ + struct ipoib_neigh_hash *htbl = container_of(head, + struct ipoib_neigh_hash, + rcu); + struct ipoib_neigh __rcu **buckets = htbl->buckets; + struct ipoib_neigh_table *ntbl = htbl->ntbl; + + kfree(buckets); + kfree(htbl); + complete(&ntbl->deleted); +} + +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i; + + /* remove all neigh connected to a given path or mcast */ + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* delete neighs belong to this parent */ + if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i, wait_flushed = 0; + + init_completion(&priv->ntbl.flushed); + + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) + goto out_unlock; + + wait_flushed = atomic_read(&priv->ntbl.entries); + if (!wait_flushed) + goto free_htbl; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } + } + +free_htbl: + rcu_assign_pointer(ntbl->htbl, NULL); + call_rcu(&htbl->rcu, neigh_hash_free_rcu); + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); + if (wait_flushed) + wait_for_completion(&priv->ntbl.flushed); +} + +static void ipoib_neigh_hash_uninit(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int stopped; + + ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); + init_completion(&priv->ntbl.deleted); + set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + + /* Stop GC if called at init fail need to cancel work */ + stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + if (!stopped) + cancel_delayed_work(&priv->neigh_reap_task); + + ipoib_flush_neighs(priv); + + wait_for_completion(&priv->ntbl.deleted); +} + + int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) { struct ipoib_dev_priv *priv = netdev_priv(dev); + if (ipoib_neigh_hash_init(priv) < 0) + goto out; /* Allocate RX/TX "rings" to hold queued skbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); - goto out; + goto out_neigh_hash_cleanup; } - priv->tx_ring = vmalloc(ipoib_sendq_size * sizeof *priv->tx_ring); + priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } - memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring); /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ @@ -938,6 +1293,8 @@ out_tx_ring_cleanup: out_rx_ring_cleanup: kfree(priv->rx_ring); +out_neigh_hash_cleanup: + ipoib_neigh_hash_uninit(dev); out: return -ENOMEM; } @@ -945,15 +1302,20 @@ out: void ipoib_dev_cleanup(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; + LIST_HEAD(head); + + ASSERT_RTNL(); ipoib_delete_debug_files(dev); /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { - unregister_netdev(cpriv->dev); - ipoib_dev_cleanup(cpriv->dev); - free_netdev(cpriv->dev); + /* Stop GC on child */ + set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags); + cancel_delayed_work(&cpriv->neigh_reap_task); + unregister_netdevice_queue(cpriv->dev, &head); } + unregister_netdevice_many(&head); ipoib_ib_dev_cleanup(dev); @@ -962,103 +1324,55 @@ void ipoib_dev_cleanup(struct net_device *dev) priv->rx_ring = NULL; priv->tx_ring = NULL; + + ipoib_neigh_hash_uninit(dev); } static const struct header_ops ipoib_header_ops = { .create = ipoib_hard_header, }; -static int get_skb_hdr(struct sk_buff *skb, void **iphdr, - void **tcph, u64 *hdr_flags, void *priv) -{ - unsigned int ip_len; - struct iphdr *iph; - - if (unlikely(skb->protocol != htons(ETH_P_IP))) - return -1; - - /* - * In the future we may add an else clause that verifies the - * checksum and allows devices which do not calculate checksum - * to use LRO. - */ - if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) - return -1; - - /* Check for non-TCP packet */ - skb_reset_network_header(skb); - iph = ip_hdr(skb); - if (iph->protocol != IPPROTO_TCP) - return -1; - - ip_len = ip_hdrlen(skb); - skb_set_transport_header(skb, ip_len); - *tcph = tcp_hdr(skb); - - /* check if IP header and TCP header are complete */ - if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb)) - return -1; - - *hdr_flags = LRO_IPV4 | LRO_TCP; - *iphdr = iph; - - return 0; -} - -static void ipoib_lro_setup(struct ipoib_dev_priv *priv) -{ - priv->lro.lro_mgr.max_aggr = lro_max_aggr; - priv->lro.lro_mgr.max_desc = IPOIB_MAX_LRO_DESCRIPTORS; - priv->lro.lro_mgr.lro_arr = priv->lro.lro_desc; - priv->lro.lro_mgr.get_skb_header = get_skb_hdr; - priv->lro.lro_mgr.features = LRO_F_NAPI; - priv->lro.lro_mgr.dev = priv->dev; - priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; -} +static const struct net_device_ops ipoib_netdev_ops = { + .ndo_uninit = ipoib_uninit, + .ndo_open = ipoib_open, + .ndo_stop = ipoib_stop, + .ndo_change_mtu = ipoib_change_mtu, + .ndo_fix_features = ipoib_fix_features, + .ndo_start_xmit = ipoib_start_xmit, + .ndo_tx_timeout = ipoib_timeout, + .ndo_set_rx_mode = ipoib_set_mcast_list, +}; -static void ipoib_setup(struct net_device *dev) +void ipoib_setup(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - dev->open = ipoib_open; - dev->stop = ipoib_stop; - dev->change_mtu = ipoib_change_mtu; - dev->hard_start_xmit = ipoib_start_xmit; - dev->tx_timeout = ipoib_timeout; + dev->netdev_ops = &ipoib_netdev_ops; dev->header_ops = &ipoib_header_ops; - dev->set_multicast_list = ipoib_set_mcast_list; - dev->neigh_setup = ipoib_neigh_setup_dev; ipoib_set_ethtool_ops(dev); - netif_napi_add(dev, &priv->napi, ipoib_poll, 100); + netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT); dev->watchdog_timeo = HZ; dev->flags |= IFF_BROADCAST | IFF_MULTICAST; - /* - * We add in INFINIBAND_ALEN to allow for the destination - * address "pseudoheader" for skbs without neighbour struct. - */ - dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN; + dev->hard_header_len = IPOIB_ENCAP_LEN; dev->addr_len = INFINIBAND_ALEN; dev->type = ARPHRD_INFINIBAND; dev->tx_queue_len = ipoib_sendq_size * 2; dev->features = (NETIF_F_VLAN_CHALLENGED | NETIF_F_HIGHDMA); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); - netif_carrier_off(dev); - priv->dev = dev; - ipoib_lro_setup(priv); - spin_lock_init(&priv->lock); - mutex_init(&priv->vlan_mutex); + init_rwsem(&priv->vlan_rwsem); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); @@ -1073,6 +1387,7 @@ static void ipoib_setup(struct net_device *dev) INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); + INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); } struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) @@ -1104,12 +1419,9 @@ static ssize_t show_umcast(struct device *dev, return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); } -static ssize_t set_umcast(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +void ipoib_set_umcast(struct net_device *ndev, int umcast_val) { - struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); - unsigned long umcast_val = simple_strtoul(buf, NULL, 0); + struct ipoib_dev_priv *priv = netdev_priv(ndev); if (umcast_val > 0) { set_bit(IPOIB_FLAG_UMCAST, &priv->flags); @@ -1117,6 +1429,15 @@ static ssize_t set_umcast(struct device *dev, "by userspace\n"); } else clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); +} + +static ssize_t set_umcast(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long umcast_val = simple_strtoul(buf, NULL, 0); + + ipoib_set_umcast(to_net_dev(dev), umcast_val); return count; } @@ -1137,7 +1458,7 @@ static ssize_t create_child(struct device *dev, if (sscanf(buf, "%i", &pkey) != 1) return -EINVAL; - if (pkey < 0 || pkey > 0xffff) + if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) return -EINVAL; /* @@ -1150,7 +1471,7 @@ static ssize_t create_child(struct device *dev, return ret ? ret : count; } -static DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child); +static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child); static ssize_t delete_child(struct device *dev, struct device_attribute *attr, @@ -1170,7 +1491,7 @@ static ssize_t delete_child(struct device *dev, return ret ? ret : count; } -static DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child); +static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child); int ipoib_add_pkey_attr(struct net_device *dev) { @@ -1201,20 +1522,18 @@ int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) kfree(device_attr); if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { - set_bit(IPOIB_FLAG_CSUM, &priv->flags); - priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; - } + priv->dev->hw_features = NETIF_F_SG | + NETIF_F_IP_CSUM | NETIF_F_RXCSUM; - if (lro) - priv->dev->features |= NETIF_F_LRO; + if (priv->hca_caps & IB_DEVICE_UD_TSO) + priv->dev->hw_features |= NETIF_F_TSO; - if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) - priv->dev->features |= NETIF_F_TSO; + priv->dev->features |= priv->dev->hw_features; + } return 0; } - static struct net_device *ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { @@ -1227,6 +1546,7 @@ static struct net_device *ipoib_add_port(const char *format, goto alloc_mem_failed; SET_NETDEV_DEV(priv->dev, hca->dma_device); + priv->dev->dev_id = port - 1; if (!ib_query_port(hca, port, &attr)) priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); @@ -1240,6 +1560,8 @@ static struct net_device *ipoib_add_port(const char *format, priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh); + result = ib_query_pkey(hca, port, 0, &priv->pkey); if (result) { printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", @@ -1312,6 +1634,9 @@ sysfs_failed: register_failed: ib_unregister_event_handler(&priv->event_handler); + /* Stop GC if started before flush */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); flush_workqueue(ipoib_workqueue); event_failed: @@ -1349,6 +1674,8 @@ static void ipoib_add_one(struct ib_device *device) } for (p = s; p <= e; ++p) { + if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) + continue; dev = ipoib_add_port("ib%d", device, p); if (!IS_ERR(dev)) { priv = netdev_priv(dev); @@ -1368,6 +1695,8 @@ static void ipoib_remove_one(struct ib_device *device) return; dev_list = ib_get_client_data(device, &ipoib_client); + if (!dev_list) + return; list_for_each_entry_safe(priv, tmp, dev_list, list) { ib_unregister_event_handler(&priv->event_handler); @@ -1376,10 +1705,12 @@ static void ipoib_remove_one(struct ib_device *device) dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); rtnl_unlock(); + /* Stop GC */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); flush_workqueue(ipoib_workqueue); unregister_netdev(priv->dev); - ipoib_dev_cleanup(priv->dev); free_netdev(priv->dev); } @@ -1396,8 +1727,7 @@ static int __init ipoib_init_module(void) ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); - ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE, - IPOIB_MIN_QUEUE_SIZE)); + ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif @@ -1432,8 +1762,15 @@ static int __init ipoib_init_module(void) if (ret) goto err_sa; + ret = ipoib_netlink_init(); + if (ret) + goto err_client; + return 0; +err_client: + ib_unregister_client(&ipoib_client); + err_sa: ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); @@ -1446,6 +1783,7 @@ err_fs: static void __exit ipoib_cleanup_module(void) { + ipoib_netlink_fini(); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); ipoib_unregister_debugfs(); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 425e31112ed..d4e005720d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -34,12 +34,14 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <linux/moduleparam.h> #include <linux/ip.h> #include <linux/in.h> #include <linux/igmp.h> #include <linux/inetdevice.h> #include <linux/delay.h> #include <linux/completion.h> +#include <linux/slab.h> #include <net/dst.h> @@ -67,28 +69,13 @@ struct ipoib_mcast_iter { static void ipoib_mcast_free(struct ipoib_mcast *mcast) { struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_neigh *neigh, *tmp; int tx_dropped = 0; ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n", mcast->mcmember.mgid.raw); - spin_lock_irq(&priv->lock); - - list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) { - /* - * It's safe to call ipoib_put_ah() inside priv->lock - * here, because we know that mcast->ah will always - * hold one more reference, so ipoib_put_ah() will - * never do more than decrement the ref count. - */ - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); - } - - spin_unlock_irq(&priv->lock); + /* remove all neigh connected to this mcast */ + ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw); if (mcast->ah) ipoib_put_ah(mcast->ah); @@ -188,7 +175,9 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, mcast->mcmember = *mcmember; - /* Set the cached Q_Key before we attach if it's the broadcast group */ + /* Set the multicast MTU and cached Q_Key before we attach if it's + * the broadcast group. + */ if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid))) { spin_lock_irq(&priv->lock); @@ -196,10 +185,17 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, spin_unlock_irq(&priv->lock); return -EAGAIN; } + priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); priv->tx_wr.wr.ud.remote_qkey = priv->qkey; set_qkey = 1; + + if (!ipoib_cm_admin_enabled(dev)) { + rtnl_lock(); + dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); + rtnl_unlock(); + } } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { @@ -238,8 +234,11 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, av.grh.dgid = mcast->mcmember.mgid; ah = ipoib_create_ah(dev, priv->pd, &av); - if (!ah) { - ipoib_warn(priv, "ib_address_create failed\n"); + if (IS_ERR(ah)) { + ipoib_warn(priv, "ib_address_create failed %ld\n", + -PTR_ERR(ah)); + /* use original error */ + return PTR_ERR(ah); } else { spin_lock_irq(&priv->lock); mcast->ah = ah; @@ -257,17 +256,13 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, netif_tx_lock_bh(dev); while (!skb_queue_empty(&mcast->pkt_queue)) { struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); + netif_tx_unlock_bh(dev); skb->dev = dev; - - if (!skb->dst || !skb->dst->neighbour) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof (struct ipoib_pseudoheader)); - } - if (dev_queue_xmit(skb)) ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); + netif_tx_lock_bh(dev); } netif_tx_unlock_bh(dev); @@ -362,12 +357,19 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, carrier_on_task); + struct ib_port_attr attr; /* * Take rtnl_lock to avoid racing with ipoib_stop() and * turning the carrier back on while a device is being * removed. */ + if (ib_query_port(priv->ca, priv->port, &attr) || + attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); + return; + } + rtnl_lock(); netif_carrier_on(priv->dev); rtnl_unlock(); @@ -384,8 +386,10 @@ static int ipoib_mcast_join_complete(int status, mcast->mcmember.mgid.raw, status); /* We trap for port events ourselves. */ - if (status == -ENETRESET) - return 0; + if (status == -ENETRESET) { + status = 0; + goto out; + } if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); @@ -405,7 +409,8 @@ static int ipoib_mcast_join_complete(int status, if (mcast == priv->broadcast) queue_work(ipoib_workqueue, &priv->carrier_on_task); - return 0; + status = 0; + goto out; } if (mcast->logcount++ < 20) { @@ -432,7 +437,8 @@ static int ipoib_mcast_join_complete(int status, mcast->backoff * HZ); spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); - +out: + complete(&mcast->done); return status; } @@ -482,11 +488,15 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, } set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); if (IS_ERR(mcast->mc)) { clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + complete(&mcast->done); ret = PTR_ERR(mcast->mc); ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); @@ -508,10 +518,18 @@ void ipoib_mcast_join_task(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, mcast_task.work); struct net_device *dev = priv->dev; + struct ib_port_attr port_attr; if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; + if (ib_query_port(priv->ca, priv->port, &port_attr) || + port_attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n", + port_attr.state); + return; + } + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) ipoib_warn(priv, "ib_query_gid() failed\n"); else @@ -581,14 +599,6 @@ void ipoib_mcast_join_task(struct work_struct *work) return; } - priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); - - if (!ipoib_cm_admin_enabled(dev)) { - rtnl_lock(); - dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); - rtnl_unlock(); - } - ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); clear_bit(IPOIB_MCAST_RUN, &priv->flags); @@ -647,11 +657,12 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) return 0; } -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_mcast *mcast; unsigned long flags; + void *mgid = daddr + 4; spin_lock_irqsave(&priv->lock, flags); @@ -707,20 +718,24 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) out: if (mcast && mcast->ah) { - if (skb->dst && - skb->dst->neighbour && - !*to_ipoib_neigh(skb->dst->neighbour)) { - struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour, - skb->dev); + struct ipoib_neigh *neigh; + spin_unlock_irqrestore(&priv->lock, flags); + neigh = ipoib_neigh_get(dev, daddr); + spin_lock_irqsave(&priv->lock, flags); + if (!neigh) { + neigh = ipoib_neigh_alloc(daddr, dev); if (neigh) { kref_get(&mcast->ah->ref); neigh->ah = mcast->ah; list_add_tail(&neigh->list, &mcast->neigh_list); } } - + spin_unlock_irqrestore(&priv->lock, flags); ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); + if (neigh) + ipoib_neigh_put(neigh); + return; } unlock: @@ -752,18 +767,34 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); + /* seperate between the wait to the leave*/ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(dev, mcast); ipoib_mcast_free(mcast); } } +static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast) +{ + /* reserved QPN, prefix, scope */ + if (memcmp(addr, broadcast, 6)) + return 0; + /* signature lower, pkey */ + if (memcmp(addr + 7, broadcast + 7, 3)) + return 0; + return 1; +} + void ipoib_mcast_restart_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, restart_task); struct net_device *dev = priv->dev; - struct dev_mc_list *mclist; + struct netdev_hw_addr *ha; struct ipoib_mcast *mcast, *tmcast; LIST_HEAD(remove_list); unsigned long flags; @@ -788,10 +819,13 @@ void ipoib_mcast_restart_task(struct work_struct *work) clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); /* Mark all of the entries that are found or don't exist */ - for (mclist = dev->mc_list; mclist; mclist = mclist->next) { + netdev_for_each_mc_addr(ha, dev) { union ib_gid mgid; - memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid); + if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast)) + continue; + + memcpy(mgid.raw, ha->addr + 4, sizeof mgid); mcast = __ipoib_mcast_find(dev, &mgid); if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c new file mode 100644 index 00000000000..cdc7df4fdb8 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. - All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/netdevice.h> +#include <linux/if_arp.h> /* For ARPHRD_xxx */ +#include <linux/module.h> +#include <net/rtnetlink.h> +#include "ipoib.h" + +static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = { + [IFLA_IPOIB_PKEY] = { .type = NLA_U16 }, + [IFLA_IPOIB_MODE] = { .type = NLA_U16 }, + [IFLA_IPOIB_UMCAST] = { .type = NLA_U16 }, +}; + +static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + u16 val; + + if (nla_put_u16(skb, IFLA_IPOIB_PKEY, priv->pkey)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_MODE, val)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_UMCAST, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_UMCAST, val)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int ipoib_changelink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + u16 mode, umcast; + int ret = 0; + + if (data[IFLA_IPOIB_MODE]) { + mode = nla_get_u16(data[IFLA_IPOIB_MODE]); + if (mode == IPOIB_MODE_DATAGRAM) + ret = ipoib_set_mode(dev, "datagram\n"); + else if (mode == IPOIB_MODE_CONNECTED) + ret = ipoib_set_mode(dev, "connected\n"); + else + ret = -EINVAL; + + if (ret < 0) + goto out_err; + } + + if (data[IFLA_IPOIB_UMCAST]) { + umcast = nla_get_u16(data[IFLA_IPOIB_UMCAST]); + ipoib_set_umcast(dev, umcast); + } + +out_err: + return ret; +} + +static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net_device *pdev; + struct ipoib_dev_priv *ppriv; + u16 child_pkey; + int err; + + if (!tb[IFLA_LINK]) + return -EINVAL; + + pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + if (!pdev || pdev->type != ARPHRD_INFINIBAND) + return -ENODEV; + + ppriv = netdev_priv(pdev); + + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &ppriv->flags)) { + ipoib_warn(ppriv, "child creation disallowed for child devices\n"); + return -EINVAL; + } + + if (!data || !data[IFLA_IPOIB_PKEY]) { + ipoib_dbg(ppriv, "no pkey specified, using parent pkey\n"); + child_pkey = ppriv->pkey; + } else + child_pkey = nla_get_u16(data[IFLA_IPOIB_PKEY]); + + if (child_pkey == 0 || child_pkey == 0x8000) + return -EINVAL; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + child_pkey |= 0x8000; + + err = __ipoib_vlan_add(ppriv, netdev_priv(dev), child_pkey, IPOIB_RTNL_CHILD); + + if (!err && data) + err = ipoib_changelink(dev, tb, data); + return err; +} + +static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head) +{ + struct ipoib_dev_priv *priv, *ppriv; + + priv = netdev_priv(dev); + ppriv = netdev_priv(priv->parent); + + down_write(&ppriv->vlan_rwsem); + unregister_netdevice_queue(dev, head); + list_del(&priv->list); + up_write(&ppriv->vlan_rwsem); +} + +static size_t ipoib_get_size(const struct net_device *dev) +{ + return nla_total_size(2) + /* IFLA_IPOIB_PKEY */ + nla_total_size(2) + /* IFLA_IPOIB_MODE */ + nla_total_size(2); /* IFLA_IPOIB_UMCAST */ +} + +static struct rtnl_link_ops ipoib_link_ops __read_mostly = { + .kind = "ipoib", + .maxtype = IFLA_IPOIB_MAX, + .policy = ipoib_policy, + .priv_size = sizeof(struct ipoib_dev_priv), + .setup = ipoib_setup, + .newlink = ipoib_new_child_link, + .changelink = ipoib_changelink, + .dellink = ipoib_unregister_child_dev, + .get_size = ipoib_get_size, + .fill_info = ipoib_fill_info, +}; + +int __init ipoib_netlink_init(void) +{ + return rtnl_link_register(&ipoib_link_ops); +} + +void __exit ipoib_netlink_fini(void) +{ + rtnl_link_unregister(&ipoib_link_ops); +} + +MODULE_ALIAS_RTNL_LINK("ipoib"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 68325119f74..c56d5d44c53 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -31,6 +31,8 @@ * SOFTWARE. */ +#include <linux/slab.h> + #include "ipoib.h" int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey) @@ -190,6 +192,9 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) + init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; + if (dev->features & NETIF_F_SG) init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 5a76a551035..9fad7b5ac8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -33,7 +33,6 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/seq_file.h> #include <asm/uaccess.h> @@ -50,44 +49,11 @@ static ssize_t show_parent(struct device *d, struct device_attribute *attr, } static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); -int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int type) { - struct ipoib_dev_priv *ppriv, *priv; - char intf_name[IFNAMSIZ]; int result; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - ppriv = netdev_priv(pdev); - - rtnl_lock(); - mutex_lock(&ppriv->vlan_mutex); - - /* - * First ensure this isn't a duplicate. We check the parent device and - * then all of the child interfaces to make sure the Pkey doesn't match. - */ - if (ppriv->pkey == pkey) { - result = -ENOTUNIQ; - goto err; - } - - list_for_each_entry(priv, &ppriv->child_intfs, list) { - if (priv->pkey == pkey) { - result = -ENOTUNIQ; - goto err; - } - } - - snprintf(intf_name, sizeof intf_name, "%s.%04x", - ppriv->dev->name, pkey); - priv = ipoib_intf_alloc(intf_name); - if (!priv) { - result = -ENOMEM; - goto err; - } - priv->max_ib_mtu = ppriv->max_ib_mtu; /* MTU will be reset when mcast join happens */ priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); @@ -96,7 +62,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) result = ipoib_set_dev_features(priv, ppriv->ca); if (result) - goto device_init_failed; + goto err; priv->pkey = pkey; @@ -109,7 +75,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ipoib_warn(ppriv, "failed to initialize subinterface: " "device %s, port %d", ppriv->ca->name, ppriv->port); - goto device_init_failed; + goto err; } result = register_netdevice(priv->dev); @@ -122,64 +88,122 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ipoib_create_debug_files(priv->dev); - if (ipoib_cm_add_mode_attr(priv->dev)) - goto sysfs_failed; - if (ipoib_add_pkey_attr(priv->dev)) - goto sysfs_failed; - if (ipoib_add_umcast_attr(priv->dev)) - goto sysfs_failed; - - if (device_create_file(&priv->dev->dev, &dev_attr_parent)) - goto sysfs_failed; + /* RTNL childs don't need proprietary sysfs entries */ + if (type == IPOIB_LEGACY_CHILD) { + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; + + if (device_create_file(&priv->dev->dev, &dev_attr_parent)) + goto sysfs_failed; + } + priv->child_type = type; + priv->dev->iflink = ppriv->dev->ifindex; list_add_tail(&priv->list, &ppriv->child_intfs); - mutex_unlock(&ppriv->vlan_mutex); - rtnl_unlock(); - return 0; sysfs_failed: + result = -ENOMEM; ipoib_delete_debug_files(priv->dev); unregister_netdevice(priv->dev); register_failed: ipoib_dev_cleanup(priv->dev); -device_init_failed: - free_netdev(priv->dev); - err: - mutex_unlock(&ppriv->vlan_mutex); + return result; +} + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +{ + struct ipoib_dev_priv *ppriv, *priv; + char intf_name[IFNAMSIZ]; + struct ipoib_dev_priv *tpriv; + int result; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ppriv = netdev_priv(pdev); + + snprintf(intf_name, sizeof intf_name, "%s.%04x", + ppriv->dev->name, pkey); + priv = ipoib_intf_alloc(intf_name); + if (!priv) + return -ENOMEM; + + if (!rtnl_trylock()) + return restart_syscall(); + + down_write(&ppriv->vlan_rwsem); + + /* + * First ensure this isn't a duplicate. We check the parent device and + * then all of the legacy child interfaces to make sure the Pkey + * doesn't match. + */ + if (ppriv->pkey == pkey) { + result = -ENOTUNIQ; + goto out; + } + + list_for_each_entry(tpriv, &ppriv->child_intfs, list) { + if (tpriv->pkey == pkey && + tpriv->child_type == IPOIB_LEGACY_CHILD) { + result = -ENOTUNIQ; + goto out; + } + } + + result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); + +out: + up_write(&ppriv->vlan_rwsem); + + if (result) + free_netdev(priv->dev); + rtnl_unlock(); + return result; } int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) { struct ipoib_dev_priv *ppriv, *priv, *tpriv; - int ret = -ENOENT; + struct net_device *dev = NULL; if (!capable(CAP_NET_ADMIN)) return -EPERM; ppriv = netdev_priv(pdev); - rtnl_lock(); - mutex_lock(&ppriv->vlan_mutex); + if (!rtnl_trylock()) + return restart_syscall(); + + down_write(&ppriv->vlan_rwsem); list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { - if (priv->pkey == pkey) { + if (priv->pkey == pkey && + priv->child_type == IPOIB_LEGACY_CHILD) { unregister_netdevice(priv->dev); - ipoib_dev_cleanup(priv->dev); list_del(&priv->list); - free_netdev(priv->dev); - - ret = 0; + dev = priv->dev; break; } } - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); + rtnl_unlock(); - return ret; + if (dev) { + free_netdev(dev); + return 0; + } + + return -ENODEV; } |
