diff options
Diffstat (limited to 'drivers/infiniband/ulp/ipoib')
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/Makefile | 3 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib.h | 33 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_cm.c | 78 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 21 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_ib.c | 105 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_main.c | 216 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 50 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 182 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 3 | ||||
| -rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 132 |
10 files changed, 604 insertions, 219 deletions
diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile index 3090100f0de..e5430dd5076 100644 --- a/drivers/infiniband/ulp/ipoib/Makefile +++ b/drivers/infiniband/ulp/ipoib/Makefile @@ -5,7 +5,8 @@ ib_ipoib-y := ipoib_main.o \ ipoib_multicast.o \ ipoib_verbs.o \ ipoib_vlan.o \ - ipoib_ethtool.o + ipoib_ethtool.o \ + ipoib_netlink.o ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index ca43901ed86..c639f90cfda 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -101,9 +101,14 @@ enum { IPOIB_MCAST_FLAG_SENDONLY = 1, IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, + IPOIB_MCAST_JOIN_STARTED = 4, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, + + IPOIB_NON_CHILD = 0, + IPOIB_LEGACY_CHILD = 1, + IPOIB_RTNL_CHILD = 2, }; #define IPOIB_OP_RECV (1ul << 31) @@ -113,6 +118,8 @@ enum { #define IPOIB_OP_CM (0) #endif +#define IPOIB_QPN_MASK ((__force u32) cpu_to_be32(0xFFFFFF)) + /* structs */ struct ipoib_header { @@ -145,6 +152,7 @@ struct ipoib_mcast { struct sk_buff_head pkt_queue; struct net_device *dev; + struct completion done; }; struct ipoib_rx_buf { @@ -262,7 +270,10 @@ struct ipoib_ethtool_st { u16 max_coalesced_frames; }; +struct ipoib_neigh_table; + struct ipoib_neigh_hash { + struct ipoib_neigh_table *ntbl; struct ipoib_neigh __rcu **buckets; struct rcu_head rcu; u32 mask; @@ -271,9 +282,9 @@ struct ipoib_neigh_hash { struct ipoib_neigh_table { struct ipoib_neigh_hash __rcu *htbl; - rwlock_t rwlock; atomic_t entries; struct completion flushed; + struct completion deleted; }; /* @@ -290,7 +301,7 @@ struct ipoib_dev_priv { unsigned long flags; - struct mutex vlan_mutex; + struct rw_semaphore vlan_rwsem; struct rb_root path_tree; struct list_head path_list; @@ -350,6 +361,7 @@ struct ipoib_dev_priv { struct net_device *parent; struct list_head child_intfs; struct list_head list; + int child_type; #ifdef CONFIG_INFINIBAND_IPOIB_CM struct ipoib_cm_dev_priv cm; @@ -509,6 +521,17 @@ void ipoib_event(struct ib_event_handler *handler, int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey); int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int child_type); + +int __init ipoib_netlink_init(void); +void __exit ipoib_netlink_fini(void); + +void ipoib_set_umcast(struct net_device *ndev, int umcast_val); +int ipoib_set_mode(struct net_device *dev, const char *buf); + +void ipoib_setup(struct net_device *dev); + void ipoib_pkey_poll(struct work_struct *work); int ipoib_pkey_dev_delay_open(struct net_device *dev); void ipoib_drain_cq(struct net_device *dev); @@ -516,14 +539,14 @@ void ipoib_drain_cq(struct net_device *dev); void ipoib_set_ethtool_ops(struct net_device *dev); int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); -#ifdef CONFIG_INFINIBAND_IPOIB_CM - #define IPOIB_FLAGS_RC 0x80 #define IPOIB_FLAGS_UC 0x40 /* We don't support UC connections at the moment */ #define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) +#ifdef CONFIG_INFINIBAND_IPOIB_CM + extern int ipoib_max_conn_qp; static inline int ipoib_cm_admin_enabled(struct net_device *dev) @@ -741,4 +764,6 @@ extern int ipoib_debug_level; #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) +extern const char ipoib_driver_version[]; + #endif /* _IPOIB_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 95ecf4eadf5..933efcea0d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -140,7 +140,8 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, struct ipoib_cm_rx_buf *rx_ring, int id, int frags, - u64 mapping[IPOIB_CM_RX_SG]) + u64 mapping[IPOIB_CM_RX_SG], + gfp_t gfp) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; @@ -164,7 +165,7 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, } for (i = 0; i < frags; i++) { - struct page *page = alloc_page(GFP_ATOMIC); + struct page *page = alloc_page(gfp); if (!page) goto partial_error; @@ -382,7 +383,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, - rx->rx_ring[i].mapping)) { + rx->rx_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); ret = -ENOMEM; goto err_count; @@ -460,7 +462,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even goto err_qp; } - psn = random32() & 0xffffff; + psn = prandom_u32() & 0xffffff; ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); if (ret) goto err_modify; @@ -639,7 +641,8 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; - newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping); + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, + mapping, GFP_ATOMIC); if (unlikely(!newskb)) { /* * If we can't allocate a new RX buffer, dump @@ -741,6 +744,9 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ tx_req->mapping = addr; + skb_orphan(skb); + skb_dst_drop(skb); + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), addr, skb->len); if (unlikely(rc)) { @@ -755,9 +761,13 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", tx->qp->qp_num); - if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) - ipoib_warn(priv, "request notify on send CQ failed\n"); netif_stop_queue(dev); + rc = ib_req_notify_cq(priv->send_cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc < 0) + ipoib_warn(priv, "request notify on send CQ failed\n"); + else if (rc) + ipoib_send_comp_handler(priv->send_cq, dev); } } } @@ -810,7 +820,6 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); ipoib_neigh_free(neigh); tx->neigh = NULL; @@ -1021,10 +1030,20 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ .cap.max_send_sge = 1, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, - .qp_context = tx + .qp_context = tx, + .create_flags = IB_QP_CREATE_USE_GFP_NOIO }; - return ib_create_qp(priv->pd, &attr); + struct ib_qp *tx_qp; + + tx_qp = ib_create_qp(priv->pd, &attr); + if (PTR_ERR(tx_qp) == -EINVAL) { + ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n", + priv->ca->name); + attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; + tx_qp = ib_create_qp(priv->pd, &attr); + } + return tx_qp; } static int ipoib_cm_send_req(struct net_device *dev, @@ -1095,12 +1114,14 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, struct ipoib_dev_priv *priv = netdev_priv(p->dev); int ret; - p->tx_ring = vzalloc(ipoib_sendq_size * sizeof *p->tx_ring); + p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring, + GFP_NOIO, PAGE_KERNEL); if (!p->tx_ring) { ipoib_warn(priv, "failed to allocate tx ring\n"); ret = -ENOMEM; goto err_tx; } + memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); p->qp = ipoib_cm_create_tx_qp(p->dev, p); if (IS_ERR(p->qp)) { @@ -1227,7 +1248,6 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); ipoib_neigh_free(neigh); tx->neigh = NULL; @@ -1271,12 +1291,15 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) { struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + unsigned long flags; if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + spin_lock_irqsave(&priv->lock, flags); list_move(&tx->list, &priv->cm.reap_list); queue_work(ipoib_workqueue, &priv->cm.reap_task); ipoib_dbg(priv, "Reap connection for gid %pI6\n", tx->neigh->daddr + 4); tx->neigh = NULL; + spin_unlock_irqrestore(&priv->lock, flags); } } @@ -1315,7 +1338,6 @@ static void ipoib_cm_tx_start(struct work_struct *work) neigh = p->neigh; if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); ipoib_neigh_free(neigh); } list_del(&p->list); @@ -1449,36 +1471,19 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, const char *buf, size_t count) { struct net_device *dev = to_net_dev(d); - struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; if (!rtnl_trylock()) return restart_syscall(); - /* flush paths if we switch modes so that connections are restarted */ - if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { - set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); - ipoib_warn(priv, "enabling connected mode " - "will cause multicast packet drops\n"); - netdev_update_features(dev); - rtnl_unlock(); - priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; - - ipoib_flush_paths(dev); - return count; - } + ret = ipoib_set_mode(dev, buf); - if (!strcmp(buf, "datagram\n")) { - clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); - netdev_update_features(dev); - dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); - rtnl_unlock(); - ipoib_flush_paths(dev); + rtnl_unlock(); + if (!ret) return count; - } - rtnl_unlock(); - return -EINVAL; + return ret; } static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode); @@ -1566,7 +1571,8 @@ int ipoib_cm_dev_init(struct net_device *dev) for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, priv->cm.num_frags - 1, - priv->cm.srq_ring[i].mapping)) { + priv->cm.srq_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate " "receive buffer %d\n", i); ipoib_cm_dev_cleanup(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 29bc7b5724a..078cadd6c79 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -39,7 +39,24 @@ static void ipoib_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { - strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1); + struct ipoib_dev_priv *priv = netdev_priv(netdev); + struct ib_device_attr *attr; + + attr = kmalloc(sizeof(*attr), GFP_KERNEL); + if (attr && !ib_query_device(priv->ca, attr)) + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", (int)(attr->fw_ver >> 32), + (int)(attr->fw_ver >> 16) & 0xffff, + (int)attr->fw_ver & 0xffff); + kfree(attr); + + strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device), + sizeof(drvinfo->bus_info)); + + strlcpy(drvinfo->version, ipoib_driver_version, + sizeof(drvinfo->version)); + + strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver)); } static int ipoib_get_coalesce(struct net_device *dev, @@ -88,5 +105,5 @@ static const struct ethtool_ops ipoib_ethtool_ops = { void ipoib_set_ethtool_ops(struct net_device *dev) { - SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops); + dev->ethtool_ops = &ipoib_ethtool_ops; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index f10221f4080..6a7003ddb0b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -600,6 +600,9 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, netif_stop_queue(dev); } + skb_orphan(skb); + skb_dst_drop(skb); + rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn, tx_req, phead, hlen); if (unlikely(rc)) { @@ -615,8 +618,6 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, address->last_send = priv->tx_head; ++priv->tx_head; - skb_orphan(skb); - } if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) @@ -684,15 +685,13 @@ int ipoib_ib_dev_open(struct net_device *dev) ret = ipoib_ib_post_receives(dev); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } ret = ipoib_cm_dev_open(dev); if (ret) { ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); @@ -703,6 +702,11 @@ int ipoib_ib_dev_open(struct net_device *dev) napi_enable(&priv->napi); return 0; +dev_stop: + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); + ipoib_ib_dev_stop(dev, 1); + return -1; } static void ipoib_pkey_dev_check_presence(struct net_device *dev) @@ -745,10 +749,8 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); set_bit(IPOIB_PKEY_STOP, &priv->flags); - cancel_delayed_work(&priv->pkey_poll_task); + cancel_delayed_work_sync(&priv->pkey_poll_task); mutex_unlock(&pkey_mutex); - if (flush) - flush_workqueue(ipoib_workqueue); } ipoib_mcast_stop_thread(dev, flush); @@ -931,14 +933,49 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) return 0; } +/* + * Takes whatever value which is in pkey index 0 and updates priv->pkey + * returns 0 if the pkey value was changed. + */ +static inline int update_parent_pkey(struct ipoib_dev_priv *priv) +{ + int result; + u16 prev_pkey; + + prev_pkey = priv->pkey; + result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); + if (result) { + ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n", + priv->port, result); + return result; + } + + priv->pkey |= 0x8000; + + if (prev_pkey != priv->pkey) { + ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n", + prev_pkey, priv->pkey); + /* + * Update the pkey in the broadcast address, while making sure to set + * the full membership bit, so that we join the right broadcast group. + */ + priv->dev->broadcast[8] = priv->pkey >> 8; + priv->dev->broadcast[9] = priv->pkey & 0xff; + return 0; + } + + return 1; +} + static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, enum ipoib_flush_level level) { struct ipoib_dev_priv *cpriv; struct net_device *dev = priv->dev; u16 new_index; + int result; - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); /* * Flush any child interfaces too -- they might be up even if @@ -947,9 +984,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, list_for_each_entry(cpriv, &priv->child_intfs, list) __ipoib_ib_dev_flush(cpriv, level); - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { + /* for non-child devices must check/update the pkey value here */ + if (level == IPOIB_FLUSH_HEAVY && + !test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + update_parent_pkey(priv); ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } @@ -960,21 +1001,32 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, } if (level == IPOIB_FLUSH_HEAVY) { - if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { - clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - ipoib_ib_dev_down(dev, 0); - ipoib_ib_dev_stop(dev, 0); - if (ipoib_pkey_dev_delay_open(dev)) + /* child devices chase their origin pkey value, while non-child + * (parent) devices should always takes what present in pkey index 0 + */ + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_stop(dev, 0); + if (ipoib_pkey_dev_delay_open(dev)) + return; + } + /* restart QP only if P_Key index is changed */ + if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && + new_index == priv->pkey_index) { + ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); return; + } + priv->pkey_index = new_index; + } else { + result = update_parent_pkey(priv); + /* restart QP only if P_Key value changed */ + if (result) { + ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n"); + return; + } } - - /* restart QP only if P_Key index is changed */ - if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && - new_index == priv->pkey_index) { - ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); - return; - } - priv->pkey_index = new_index; } if (level == IPOIB_FLUSH_LIGHT) { @@ -1030,6 +1082,11 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_dbg(priv, "cleaning up ib_dev\n"); + /* + * We must make sure there are no more (path) completions + * that may wish to touch priv fields that are no longer valid + */ + ipoib_flush_paths(dev); ipoib_mcast_stop_thread(dev, 1); ipoib_mcast_dev_flush(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 97920b77a5d..5786a78ff8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -49,9 +49,14 @@ #include <linux/jhash.h> #include <net/arp.h> +#define DRV_VERSION "1.0.0" + +const char ipoib_driver_version[] = DRV_VERSION; + MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; @@ -99,6 +104,8 @@ int ipoib_open(struct net_device *dev) ipoib_dbg(priv, "bringing up interface\n"); + netif_carrier_off(dev); + set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(dev)) @@ -114,7 +121,7 @@ int ipoib_open(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -124,7 +131,7 @@ int ipoib_open(struct net_device *dev) dev_change_flags(cpriv->dev, flags | IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } netif_start_queue(dev); @@ -150,14 +157,14 @@ static int ipoib_stop(struct net_device *dev) netif_stop_queue(dev); - ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_down(dev, 1); ipoib_ib_dev_stop(dev, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -167,12 +174,17 @@ static int ipoib_stop(struct net_device *dev) dev_change_flags(cpriv->dev, flags & ~IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } return 0; } +static void ipoib_uninit(struct net_device *dev) +{ + ipoib_dev_cleanup(dev); +} + static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -210,6 +222,37 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) return 0; } +int ipoib_set_mode(struct net_device *dev, const char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* flush paths if we switch modes so that connections are restarted */ + if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { + set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + ipoib_warn(priv, "enabling connected mode " + "will cause multicast packet drops\n"); + netdev_update_features(dev); + rtnl_unlock(); + priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + + ipoib_flush_paths(dev); + rtnl_lock(); + return 0; + } + + if (!strcmp(buf, "datagram\n")) { + clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + netdev_update_features(dev); + dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); + rtnl_unlock(); + ipoib_flush_paths(dev); + rtnl_lock(); + return 0; + } + + return -EINVAL; +} + static struct ipoib_path *__path_find(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -452,7 +495,6 @@ static void path_rec_completion(int status, path, neigh)); if (!ipoib_cm_get(neigh)) { - list_del(&neigh->list); ipoib_neigh_free(neigh); continue; } @@ -469,6 +511,9 @@ static void path_rec_completion(int status, spin_unlock_irqrestore(&priv->lock, flags); + if (IS_ERR_OR_NULL(ah)) + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); + if (old_ah) ipoib_put_ah(old_ah); @@ -546,15 +591,15 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, struct ipoib_neigh *neigh; unsigned long flags; + spin_lock_irqsave(&priv->lock, flags); neigh = ipoib_neigh_alloc(daddr, dev); if (!neigh) { + spin_unlock_irqrestore(&priv->lock, flags); ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return; } - spin_lock_irqsave(&priv->lock, flags); - path = __path_find(dev, daddr + 4); if (!path) { path = path_rec_create(dev, daddr + 4); @@ -574,7 +619,6 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { - list_del(&neigh->list); ipoib_neigh_free(neigh); goto err_drop; } @@ -595,7 +639,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, neigh->ah = NULL; if (!path->query && path_rec_start(dev, path)) - goto err_list; + goto err_path; __skb_queue_tail(&neigh->queue, skb); } @@ -604,9 +648,6 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, ipoib_neigh_put(neigh); return; -err_list: - list_del(&neigh->list); - err_path: ipoib_neigh_free(neigh); err_drop: @@ -686,7 +727,8 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) if ((header->proto != htons(ETH_P_IP)) && (header->proto != htons(ETH_P_IPV6)) && (header->proto != htons(ETH_P_ARP)) && - (header->proto != htons(ETH_P_RARP))) { + (header->proto != htons(ETH_P_RARP)) && + (header->proto != htons(ETH_P_TIPC))) { /* ethertype not supported by IPoIB */ ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); @@ -707,6 +749,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) switch (header->proto) { case htons(ETH_P_IP): case htons(ETH_P_IPV6): + case htons(ETH_P_TIPC): neigh = ipoib_neigh_get(dev, cb->hwaddr); if (unlikely(!neigh)) { neigh_add_path(skb, cb->hwaddr, dev); @@ -784,7 +827,7 @@ static int ipoib_hard_header(struct sk_buff *skb, */ memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); - return 0; + return sizeof *header; } static void ipoib_set_mcast_list(struct net_device *dev) @@ -808,10 +851,10 @@ static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) * different subnets. */ /* qpn octets[1:4) & port GUID octets[12:20) */ - u32 *daddr_32 = (u32 *) daddr; + u32 *d32 = (u32 *) daddr; u32 hv; - hv = jhash_3words(daddr_32[3], daddr_32[4], 0xFFFFFF & daddr_32[0], 0); + hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); return hv & htbl->mask; } @@ -863,10 +906,10 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) return; - write_lock_bh(&ntbl->rwlock); + spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, - lockdep_is_held(&ntbl->rwlock)); + lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; @@ -883,16 +926,14 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, - lockdep_is_held(&ntbl->rwlock))) != NULL) { + lockdep_is_held(&priv->lock))) != NULL) { /* was the neigh idle for two GC periods */ if (time_after(neigh_obsolete, neigh->alive)) { rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, - lockdep_is_held(&ntbl->rwlock))); + lockdep_is_held(&priv->lock))); /* remove from path/mc list */ - spin_lock_irqsave(&priv->lock, flags); list_del(&neigh->list); - spin_unlock_irqrestore(&priv->lock, flags); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; @@ -902,7 +943,7 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) } out_unlock: - write_unlock_bh(&ntbl->rwlock); + spin_unlock_irqrestore(&priv->lock, flags); } static void ipoib_reap_neigh(struct work_struct *work) @@ -947,10 +988,8 @@ struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, struct ipoib_neigh *neigh; u32 hash_val; - write_lock_bh(&ntbl->rwlock); - htbl = rcu_dereference_protected(ntbl->htbl, - lockdep_is_held(&ntbl->rwlock)); + lockdep_is_held(&priv->lock)); if (!htbl) { neigh = NULL; goto out_unlock; @@ -961,10 +1000,10 @@ struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, */ hash_val = ipoib_addr_hash(htbl, daddr); for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], - lockdep_is_held(&ntbl->rwlock)); + lockdep_is_held(&priv->lock)); neigh != NULL; neigh = rcu_dereference_protected(neigh->hnext, - lockdep_is_held(&ntbl->rwlock))) { + lockdep_is_held(&priv->lock))) { if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { /* found, take one ref on behalf of the caller */ if (!atomic_inc_not_zero(&neigh->refcnt)) { @@ -987,12 +1026,11 @@ struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, /* put in hash */ rcu_assign_pointer(neigh->hnext, rcu_dereference_protected(htbl->buckets[hash_val], - lockdep_is_held(&ntbl->rwlock))); + lockdep_is_held(&priv->lock))); rcu_assign_pointer(htbl->buckets[hash_val], neigh); atomic_inc(&ntbl->entries); out_unlock: - write_unlock_bh(&ntbl->rwlock); return neigh; } @@ -1040,35 +1078,31 @@ void ipoib_neigh_free(struct ipoib_neigh *neigh) struct ipoib_neigh *n; u32 hash_val; - write_lock_bh(&ntbl->rwlock); - htbl = rcu_dereference_protected(ntbl->htbl, - lockdep_is_held(&ntbl->rwlock)); + lockdep_is_held(&priv->lock)); if (!htbl) - goto out_unlock; + return; hash_val = ipoib_addr_hash(htbl, neigh->daddr); np = &htbl->buckets[hash_val]; for (n = rcu_dereference_protected(*np, - lockdep_is_held(&ntbl->rwlock)); + lockdep_is_held(&priv->lock)); n != NULL; - n = rcu_dereference_protected(neigh->hnext, - lockdep_is_held(&ntbl->rwlock))) { + n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) { if (n == neigh) { /* found */ rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, - lockdep_is_held(&ntbl->rwlock))); + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); - goto out_unlock; + return; } else { np = &n->hnext; } } - -out_unlock: - write_unlock_bh(&ntbl->rwlock); - } static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) @@ -1080,7 +1114,6 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); ntbl->htbl = NULL; - rwlock_init(&ntbl->rwlock); htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); if (!htbl) return -ENOMEM; @@ -1095,6 +1128,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) htbl->mask = (size - 1); htbl->buckets = buckets; ntbl->htbl = htbl; + htbl->ntbl = ntbl; atomic_set(&ntbl->entries, 0); /* start garbage collection */ @@ -1111,9 +1145,11 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct ipoib_neigh_hash, rcu); struct ipoib_neigh __rcu **buckets = htbl->buckets; + struct ipoib_neigh_table *ntbl = htbl->ntbl; kfree(buckets); kfree(htbl); + complete(&ntbl->deleted); } void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) @@ -1125,10 +1161,10 @@ void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) int i; /* remove all neigh connected to a given path or mcast */ - write_lock_bh(&ntbl->rwlock); + spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, - lockdep_is_held(&ntbl->rwlock)); + lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; @@ -1138,16 +1174,14 @@ void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, - lockdep_is_held(&ntbl->rwlock))) != NULL) { + lockdep_is_held(&priv->lock))) != NULL) { /* delete neighs belong to this parent */ if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, - lockdep_is_held(&ntbl->rwlock))); + lockdep_is_held(&priv->lock))); /* remove from parent list */ - spin_lock_irqsave(&priv->lock, flags); list_del(&neigh->list); - spin_unlock_irqrestore(&priv->lock, flags); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; @@ -1156,7 +1190,7 @@ void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) } } out_unlock: - write_unlock_bh(&ntbl->rwlock); + spin_unlock_irqrestore(&priv->lock, flags); } static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) @@ -1164,37 +1198,44 @@ static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; unsigned long flags; - int i; + int i, wait_flushed = 0; - write_lock_bh(&ntbl->rwlock); + init_completion(&priv->ntbl.flushed); + + spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, - lockdep_is_held(&ntbl->rwlock)); + lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; + wait_flushed = atomic_read(&priv->ntbl.entries); + if (!wait_flushed) + goto free_htbl; + for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, - lockdep_is_held(&ntbl->rwlock))) != NULL) { + lockdep_is_held(&priv->lock))) != NULL) { rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, - lockdep_is_held(&ntbl->rwlock))); + lockdep_is_held(&priv->lock))); /* remove from path/mc list */ - spin_lock_irqsave(&priv->lock, flags); list_del(&neigh->list); - spin_unlock_irqrestore(&priv->lock, flags); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } } +free_htbl: rcu_assign_pointer(ntbl->htbl, NULL); call_rcu(&htbl->rcu, neigh_hash_free_rcu); out_unlock: - write_unlock_bh(&ntbl->rwlock); + spin_unlock_irqrestore(&priv->lock, flags); + if (wait_flushed) + wait_for_completion(&priv->ntbl.flushed); } static void ipoib_neigh_hash_uninit(struct net_device *dev) @@ -1203,7 +1244,7 @@ static void ipoib_neigh_hash_uninit(struct net_device *dev) int stopped; ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); - init_completion(&priv->ntbl.flushed); + init_completion(&priv->ntbl.deleted); set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); /* Stop GC if called at init fail need to cancel work */ @@ -1211,10 +1252,9 @@ static void ipoib_neigh_hash_uninit(struct net_device *dev) if (!stopped) cancel_delayed_work(&priv->neigh_reap_task); - if (atomic_read(&priv->ntbl.entries)) { - ipoib_flush_neighs(priv); - wait_for_completion(&priv->ntbl.flushed); - } + ipoib_flush_neighs(priv); + + wait_for_completion(&priv->ntbl.deleted); } @@ -1262,6 +1302,9 @@ out: void ipoib_dev_cleanup(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; + LIST_HEAD(head); + + ASSERT_RTNL(); ipoib_delete_debug_files(dev); @@ -1270,10 +1313,9 @@ void ipoib_dev_cleanup(struct net_device *dev) /* Stop GC on child */ set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags); cancel_delayed_work(&cpriv->neigh_reap_task); - unregister_netdev(cpriv->dev); - ipoib_dev_cleanup(cpriv->dev); - free_netdev(cpriv->dev); + unregister_netdevice_queue(cpriv->dev, &head); } + unregister_netdevice_many(&head); ipoib_ib_dev_cleanup(dev); @@ -1291,6 +1333,7 @@ static const struct header_ops ipoib_header_ops = { }; static const struct net_device_ops ipoib_netdev_ops = { + .ndo_uninit = ipoib_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, @@ -1300,7 +1343,7 @@ static const struct net_device_ops ipoib_netdev_ops = { .ndo_set_rx_mode = ipoib_set_mcast_list, }; -static void ipoib_setup(struct net_device *dev) +void ipoib_setup(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -1309,7 +1352,7 @@ static void ipoib_setup(struct net_device *dev) ipoib_set_ethtool_ops(dev); - netif_napi_add(dev, &priv->napi, ipoib_poll, 100); + netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT); dev->watchdog_timeo = HZ; @@ -1325,13 +1368,11 @@ static void ipoib_setup(struct net_device *dev) memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); - netif_carrier_off(dev); - priv->dev = dev; spin_lock_init(&priv->lock); - mutex_init(&priv->vlan_mutex); + init_rwsem(&priv->vlan_rwsem); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); @@ -1378,12 +1419,9 @@ static ssize_t show_umcast(struct device *dev, return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); } -static ssize_t set_umcast(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +void ipoib_set_umcast(struct net_device *ndev, int umcast_val) { - struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); - unsigned long umcast_val = simple_strtoul(buf, NULL, 0); + struct ipoib_dev_priv *priv = netdev_priv(ndev); if (umcast_val > 0) { set_bit(IPOIB_FLAG_UMCAST, &priv->flags); @@ -1391,6 +1429,15 @@ static ssize_t set_umcast(struct device *dev, "by userspace\n"); } else clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); +} + +static ssize_t set_umcast(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long umcast_val = simple_strtoul(buf, NULL, 0); + + ipoib_set_umcast(to_net_dev(dev), umcast_val); return count; } @@ -1411,7 +1458,7 @@ static ssize_t create_child(struct device *dev, if (sscanf(buf, "%i", &pkey) != 1) return -EINVAL; - if (pkey < 0 || pkey > 0xffff) + if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) return -EINVAL; /* @@ -1648,6 +1695,8 @@ static void ipoib_remove_one(struct ib_device *device) return; dev_list = ib_get_client_data(device, &ipoib_client); + if (!dev_list) + return; list_for_each_entry_safe(priv, tmp, dev_list, list) { ib_unregister_event_handler(&priv->event_handler); @@ -1662,7 +1711,6 @@ static void ipoib_remove_one(struct ib_device *device) flush_workqueue(ipoib_workqueue); unregister_netdev(priv->dev); - ipoib_dev_cleanup(priv->dev); free_netdev(priv->dev); } @@ -1714,8 +1762,15 @@ static int __init ipoib_init_module(void) if (ret) goto err_sa; + ret = ipoib_netlink_init(); + if (ret) + goto err_client; + return 0; +err_client: + ib_unregister_client(&ipoib_client); + err_sa: ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); @@ -1728,6 +1783,7 @@ err_fs: static void __exit ipoib_cleanup_module(void) { + ipoib_netlink_fini(); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); ipoib_unregister_debugfs(); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 13f4aa7593c..d4e005720d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -175,7 +175,9 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, mcast->mcmember = *mcmember; - /* Set the cached Q_Key before we attach if it's the broadcast group */ + /* Set the multicast MTU and cached Q_Key before we attach if it's + * the broadcast group. + */ if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid))) { spin_lock_irq(&priv->lock); @@ -183,10 +185,17 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, spin_unlock_irq(&priv->lock); return -EAGAIN; } + priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); priv->tx_wr.wr.ud.remote_qkey = priv->qkey; set_qkey = 1; + + if (!ipoib_cm_admin_enabled(dev)) { + rtnl_lock(); + dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); + rtnl_unlock(); + } } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { @@ -377,8 +386,10 @@ static int ipoib_mcast_join_complete(int status, mcast->mcmember.mgid.raw, status); /* We trap for port events ourselves. */ - if (status == -ENETRESET) - return 0; + if (status == -ENETRESET) { + status = 0; + goto out; + } if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); @@ -398,7 +409,8 @@ static int ipoib_mcast_join_complete(int status, if (mcast == priv->broadcast) queue_work(ipoib_workqueue, &priv->carrier_on_task); - return 0; + status = 0; + goto out; } if (mcast->logcount++ < 20) { @@ -425,7 +437,8 @@ static int ipoib_mcast_join_complete(int status, mcast->backoff * HZ); spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); - +out: + complete(&mcast->done); return status; } @@ -475,11 +488,15 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, } set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); if (IS_ERR(mcast->mc)) { clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + complete(&mcast->done); ret = PTR_ERR(mcast->mc); ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); @@ -501,10 +518,18 @@ void ipoib_mcast_join_task(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, mcast_task.work); struct net_device *dev = priv->dev; + struct ib_port_attr port_attr; if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; + if (ib_query_port(priv->ca, priv->port, &port_attr) || + port_attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n", + port_attr.state); + return; + } + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) ipoib_warn(priv, "ib_query_gid() failed\n"); else @@ -574,14 +599,6 @@ void ipoib_mcast_join_task(struct work_struct *work) return; } - priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); - - if (!ipoib_cm_admin_enabled(dev)) { - rtnl_lock(); - dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); - rtnl_unlock(); - } - ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); clear_bit(IPOIB_MCAST_RUN, &priv->flags); @@ -707,9 +724,7 @@ out: neigh = ipoib_neigh_get(dev, daddr); spin_lock_irqsave(&priv->lock, flags); if (!neigh) { - spin_unlock_irqrestore(&priv->lock, flags); neigh = ipoib_neigh_alloc(daddr, dev); - spin_lock_irqsave(&priv->lock, flags); if (neigh) { kref_get(&mcast->ah->ref); neigh->ah = mcast->ah; @@ -752,6 +767,11 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); + /* seperate between the wait to the leave*/ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(dev, mcast); ipoib_mcast_free(mcast); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c new file mode 100644 index 00000000000..cdc7df4fdb8 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. - All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/netdevice.h> +#include <linux/if_arp.h> /* For ARPHRD_xxx */ +#include <linux/module.h> +#include <net/rtnetlink.h> +#include "ipoib.h" + +static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = { + [IFLA_IPOIB_PKEY] = { .type = NLA_U16 }, + [IFLA_IPOIB_MODE] = { .type = NLA_U16 }, + [IFLA_IPOIB_UMCAST] = { .type = NLA_U16 }, +}; + +static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + u16 val; + + if (nla_put_u16(skb, IFLA_IPOIB_PKEY, priv->pkey)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_MODE, val)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_UMCAST, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_UMCAST, val)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int ipoib_changelink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + u16 mode, umcast; + int ret = 0; + + if (data[IFLA_IPOIB_MODE]) { + mode = nla_get_u16(data[IFLA_IPOIB_MODE]); + if (mode == IPOIB_MODE_DATAGRAM) + ret = ipoib_set_mode(dev, "datagram\n"); + else if (mode == IPOIB_MODE_CONNECTED) + ret = ipoib_set_mode(dev, "connected\n"); + else + ret = -EINVAL; + + if (ret < 0) + goto out_err; + } + + if (data[IFLA_IPOIB_UMCAST]) { + umcast = nla_get_u16(data[IFLA_IPOIB_UMCAST]); + ipoib_set_umcast(dev, umcast); + } + +out_err: + return ret; +} + +static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net_device *pdev; + struct ipoib_dev_priv *ppriv; + u16 child_pkey; + int err; + + if (!tb[IFLA_LINK]) + return -EINVAL; + + pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + if (!pdev || pdev->type != ARPHRD_INFINIBAND) + return -ENODEV; + + ppriv = netdev_priv(pdev); + + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &ppriv->flags)) { + ipoib_warn(ppriv, "child creation disallowed for child devices\n"); + return -EINVAL; + } + + if (!data || !data[IFLA_IPOIB_PKEY]) { + ipoib_dbg(ppriv, "no pkey specified, using parent pkey\n"); + child_pkey = ppriv->pkey; + } else + child_pkey = nla_get_u16(data[IFLA_IPOIB_PKEY]); + + if (child_pkey == 0 || child_pkey == 0x8000) + return -EINVAL; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + child_pkey |= 0x8000; + + err = __ipoib_vlan_add(ppriv, netdev_priv(dev), child_pkey, IPOIB_RTNL_CHILD); + + if (!err && data) + err = ipoib_changelink(dev, tb, data); + return err; +} + +static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head) +{ + struct ipoib_dev_priv *priv, *ppriv; + + priv = netdev_priv(dev); + ppriv = netdev_priv(priv->parent); + + down_write(&ppriv->vlan_rwsem); + unregister_netdevice_queue(dev, head); + list_del(&priv->list); + up_write(&ppriv->vlan_rwsem); +} + +static size_t ipoib_get_size(const struct net_device *dev) +{ + return nla_total_size(2) + /* IFLA_IPOIB_PKEY */ + nla_total_size(2) + /* IFLA_IPOIB_MODE */ + nla_total_size(2); /* IFLA_IPOIB_UMCAST */ +} + +static struct rtnl_link_ops ipoib_link_ops __read_mostly = { + .kind = "ipoib", + .maxtype = IFLA_IPOIB_MAX, + .policy = ipoib_policy, + .priv_size = sizeof(struct ipoib_dev_priv), + .setup = ipoib_setup, + .newlink = ipoib_new_child_link, + .changelink = ipoib_changelink, + .dellink = ipoib_unregister_child_dev, + .get_size = ipoib_get_size, + .fill_info = ipoib_fill_info, +}; + +int __init ipoib_netlink_init(void) +{ + return rtnl_link_register(&ipoib_link_ops); +} + +void __exit ipoib_netlink_fini(void) +{ + rtnl_link_unregister(&ipoib_link_ops); +} + +MODULE_ALIAS_RTNL_LINK("ipoib"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 049a997caff..c56d5d44c53 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -192,6 +192,9 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) + init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; + if (dev->features & NETIF_F_SG) init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index d7e9740c724..9fad7b5ac8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -49,47 +49,11 @@ static ssize_t show_parent(struct device *d, struct device_attribute *attr, } static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); -int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int type) { - struct ipoib_dev_priv *ppriv, *priv; - char intf_name[IFNAMSIZ]; int result; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - ppriv = netdev_priv(pdev); - - if (!rtnl_trylock()) - return restart_syscall(); - mutex_lock(&ppriv->vlan_mutex); - - /* - * First ensure this isn't a duplicate. We check the parent device and - * then all of the child interfaces to make sure the Pkey doesn't match. - */ - if (ppriv->pkey == pkey) { - result = -ENOTUNIQ; - priv = NULL; - goto err; - } - - list_for_each_entry(priv, &ppriv->child_intfs, list) { - if (priv->pkey == pkey) { - result = -ENOTUNIQ; - priv = NULL; - goto err; - } - } - - snprintf(intf_name, sizeof intf_name, "%s.%04x", - ppriv->dev->name, pkey); - priv = ipoib_intf_alloc(intf_name); - if (!priv) { - result = -ENOMEM; - goto err; - } - priv->max_ib_mtu = ppriv->max_ib_mtu; /* MTU will be reset when mcast join happens */ priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); @@ -124,24 +88,27 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ipoib_create_debug_files(priv->dev); - if (ipoib_cm_add_mode_attr(priv->dev)) - goto sysfs_failed; - if (ipoib_add_pkey_attr(priv->dev)) - goto sysfs_failed; - if (ipoib_add_umcast_attr(priv->dev)) - goto sysfs_failed; - - if (device_create_file(&priv->dev->dev, &dev_attr_parent)) - goto sysfs_failed; + /* RTNL childs don't need proprietary sysfs entries */ + if (type == IPOIB_LEGACY_CHILD) { + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; + + if (device_create_file(&priv->dev->dev, &dev_attr_parent)) + goto sysfs_failed; + } + priv->child_type = type; + priv->dev->iflink = ppriv->dev->ifindex; list_add_tail(&priv->list, &ppriv->child_intfs); - mutex_unlock(&ppriv->vlan_mutex); - rtnl_unlock(); - return 0; sysfs_failed: + result = -ENOMEM; ipoib_delete_debug_files(priv->dev); unregister_netdevice(priv->dev); @@ -149,11 +116,60 @@ register_failed: ipoib_dev_cleanup(priv->dev); err: - mutex_unlock(&ppriv->vlan_mutex); - rtnl_unlock(); - if (priv) + return result; +} + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +{ + struct ipoib_dev_priv *ppriv, *priv; + char intf_name[IFNAMSIZ]; + struct ipoib_dev_priv *tpriv; + int result; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ppriv = netdev_priv(pdev); + + snprintf(intf_name, sizeof intf_name, "%s.%04x", + ppriv->dev->name, pkey); + priv = ipoib_intf_alloc(intf_name); + if (!priv) + return -ENOMEM; + + if (!rtnl_trylock()) + return restart_syscall(); + + down_write(&ppriv->vlan_rwsem); + + /* + * First ensure this isn't a duplicate. We check the parent device and + * then all of the legacy child interfaces to make sure the Pkey + * doesn't match. + */ + if (ppriv->pkey == pkey) { + result = -ENOTUNIQ; + goto out; + } + + list_for_each_entry(tpriv, &ppriv->child_intfs, list) { + if (tpriv->pkey == pkey && + tpriv->child_type == IPOIB_LEGACY_CHILD) { + result = -ENOTUNIQ; + goto out; + } + } + + result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); + +out: + up_write(&ppriv->vlan_rwsem); + + if (result) free_netdev(priv->dev); + rtnl_unlock(); + return result; } @@ -169,17 +185,19 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) if (!rtnl_trylock()) return restart_syscall(); - mutex_lock(&ppriv->vlan_mutex); + + down_write(&ppriv->vlan_rwsem); list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { - if (priv->pkey == pkey) { + if (priv->pkey == pkey && + priv->child_type == IPOIB_LEGACY_CHILD) { unregister_netdevice(priv->dev); - ipoib_dev_cleanup(priv->dev); list_del(&priv->list); dev = priv->dev; break; } } - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); + rtnl_unlock(); if (dev) { |
