diff options
Diffstat (limited to 'net/core/dev.c')
-rw-r--r-- | net/core/dev.c | 952 |
1 files changed, 647 insertions, 305 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index fe10551d367..1c8a0ce473a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -79,6 +79,8 @@ #include <linux/cpu.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/hash.h> +#include <linux/slab.h> #include <linux/sched.h> #include <linux/mutex.h> #include <linux/string.h> @@ -104,6 +106,7 @@ #include <net/dst.h> #include <net/pkt_sched.h> #include <net/checksum.h> +#include <net/xfrm.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/kmod.h> @@ -175,7 +178,7 @@ static struct list_head ptype_all __read_mostly; /* Taps */ * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. * - * Pure readers hold dev_base_lock for reading. + * Pure readers hold dev_base_lock for reading, or rcu_read_lock() * * Writers must hold the rtnl semaphore while they loop through the * dev_base_head list, and hold dev_base_lock for writing when they do the @@ -193,18 +196,15 @@ static struct list_head ptype_all __read_mostly; /* Taps */ DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); -#define NETDEV_HASHBITS 8 -#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) - static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; + return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) { - return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; + return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } /* Device list insertion */ @@ -215,23 +215,26 @@ static int list_netdevice(struct net_device *dev) ASSERT_RTNL(); write_lock_bh(&dev_base_lock); - list_add_tail(&dev->dev_list, &net->dev_base_head); - hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); - hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); + list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + hlist_add_head_rcu(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); return 0; } -/* Device list removal */ +/* Device list removal + * caller must respect a RCU grace period before freeing/reusing dev + */ static void unlist_netdevice(struct net_device *dev) { ASSERT_RTNL(); /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); - list_del(&dev->dev_list); - hlist_del(&dev->name_hlist); - hlist_del(&dev->index_hlist); + list_del_rcu(&dev->dev_list); + hlist_del_rcu(&dev->name_hlist); + hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); } @@ -587,18 +590,44 @@ __setup("netdev=", netdev_boot_setup); struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); - hlist_for_each(p, dev_name_hash(net, name)) { - struct net_device *dev - = hlist_entry(p, struct net_device, name_hlist); + hlist_for_each_entry(dev, p, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; - } + return NULL; } EXPORT_SYMBOL(__dev_get_by_name); /** + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry_rcu(dev, p, head, name_hlist) + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_name_rcu); + +/** * dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find @@ -614,11 +643,11 @@ struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_name(net, name); + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); if (dev) dev_hold(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_name); @@ -638,17 +667,42 @@ EXPORT_SYMBOL(dev_get_by_name); struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); - hlist_for_each(p, dev_index_hash(net, ifindex)) { - struct net_device *dev - = hlist_entry(p, struct net_device, index_hlist); + hlist_for_each_entry(dev, p, head, index_hlist) if (dev->ifindex == ifindex) return dev; - } + return NULL; } EXPORT_SYMBOL(__dev_get_by_index); +/** + * dev_get_by_index_rcu - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry_rcu(dev, p, head, index_hlist) + if (dev->ifindex == ifindex) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_index_rcu); + /** * dev_get_by_index - find a device by its ifindex @@ -665,11 +719,11 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (dev) dev_hold(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_index); @@ -748,15 +802,15 @@ struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags, struct net_device *dev, *ret; ret = NULL; - read_lock(&dev_base_lock); - for_each_netdev(net, dev) { + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { dev_hold(dev); ret = dev; break; } } - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } EXPORT_SYMBOL(dev_get_by_flags); @@ -841,7 +895,8 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) free_page((unsigned long) inuse); } - snprintf(buf, IFNAMSIZ, name, i); + if (buf != name) + snprintf(buf, IFNAMSIZ, name, i); if (!__dev_get_by_name(net, buf)) return i; @@ -881,6 +936,21 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); +static int dev_get_valid_name(struct net *net, const char *name, char *buf, + bool fmt) +{ + if (!dev_valid_name(name)) + return -EINVAL; + + if (fmt && strchr(name, '%')) + return __dev_alloc_name(net, name, buf); + else if (__dev_get_by_name(net, name)) + return -EEXIST; + else if (buf != name) + strlcpy(buf, name, IFNAMSIZ); + + return 0; +} /** * dev_change_name - change name of a device @@ -904,28 +974,20 @@ int dev_change_name(struct net_device *dev, const char *newname) if (dev->flags & IFF_UP) return -EBUSY; - if (!dev_valid_name(newname)) - return -EINVAL; - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) return 0; memcpy(oldname, dev->name, IFNAMSIZ); - if (strchr(newname, '%')) { - err = dev_alloc_name(dev, newname); - if (err < 0) - return err; - } else if (__dev_get_by_name(net, newname)) - return -EEXIST; - else - strlcpy(dev->name, newname, IFNAMSIZ); + err = dev_get_valid_name(net, newname, dev->name, 1); + if (err < 0) + return err; rollback: /* For now only devices in the initial network namespace * are in sysfs. */ - if (net == &init_net) { + if (net_eq(net, &init_net)) { ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); @@ -935,7 +997,12 @@ rollback: write_lock_bh(&dev_base_lock); hlist_del(&dev->name_hlist); - hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); + write_unlock_bh(&dev_base_lock); + + synchronize_rcu(); + + write_lock_bh(&dev_base_lock); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); write_unlock_bh(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); @@ -1038,28 +1105,16 @@ void dev_load(struct net *net, const char *name) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_name(net, name); - read_unlock(&dev_base_lock); + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + rcu_read_unlock(); if (!dev && capable(CAP_NET_ADMIN)) request_module("%s", name); } EXPORT_SYMBOL(dev_load); -/** - * dev_open - prepare an interface for use. - * @dev: device to open - * - * Takes a device from down to up state. The device's private open - * function is invoked and then the multicast lists are loaded. Finally - * the device is moved into the up state and a %NETDEV_UP message is - * sent to the netdev notifier chain. - * - * Calling this function on an active interface is a nop. On a failure - * a negative errno code is returned. - */ -int dev_open(struct net_device *dev) +static int __dev_open(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; int ret; @@ -1067,13 +1122,6 @@ int dev_open(struct net_device *dev) ASSERT_RTNL(); /* - * Is it already up? - */ - - if (dev->flags & IFF_UP) - return 0; - - /* * Is it even present? */ if (!netif_device_present(dev)) @@ -1121,36 +1169,57 @@ int dev_open(struct net_device *dev) * Wakeup transmit queue engine */ dev_activate(dev); - - /* - * ... and announce new interface. - */ - call_netdevice_notifiers(NETDEV_UP, dev); } return ret; } -EXPORT_SYMBOL(dev_open); /** - * dev_close - shutdown an interface. - * @dev: device to shutdown + * dev_open - prepare an interface for use. + * @dev: device to open * - * This function moves an active device into down state. A - * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device - * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - * chain. + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. */ -int dev_close(struct net_device *dev) +int dev_open(struct net_device *dev) +{ + int ret; + + /* + * Is it already up? + */ + if (dev->flags & IFF_UP) + return 0; + + /* + * Open device + */ + ret = __dev_open(dev); + if (ret < 0) + return ret; + + /* + * ... and announce new interface. + */ + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_UP, dev); + + return ret; +} +EXPORT_SYMBOL(dev_open); + +static int __dev_close(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; - ASSERT_RTNL(); + ASSERT_RTNL(); might_sleep(); - if (!(dev->flags & IFF_UP)) - return 0; - /* * Tell people we are going down, so that they can * prepare to death, when device is still operating. @@ -1186,14 +1255,34 @@ int dev_close(struct net_device *dev) dev->flags &= ~IFF_UP; /* - * Tell people we are down + * Shutdown NET_DMA */ - call_netdevice_notifiers(NETDEV_DOWN, dev); + net_dmaengine_put(); + + return 0; +} + +/** + * dev_close - shutdown an interface. + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +int dev_close(struct net_device *dev) +{ + if (!(dev->flags & IFF_UP)) + return 0; + + __dev_close(dev); /* - * Shutdown NET_DMA + * Tell people we are down */ - net_dmaengine_put(); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_DOWN, dev); return 0; } @@ -1287,6 +1376,7 @@ rollback: nb->notifier_call(nb, NETDEV_DOWN, dev); } nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); } } @@ -1353,6 +1443,42 @@ static inline void net_timestamp(struct sk_buff *skb) skb->tstamp.tv64 = 0; } +/** + * dev_forward_skb - loopback an skb to another netif + * + * @dev: destination network device + * @skb: buffer to forward + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + * dev_forward_skb can be used for injecting an skb from the + * start_xmit function of one device into the receive queue + * of another device. + * + * The receiving device may be in another namespace, so + * we have to clear all information in the skb that could + * impact namespace isolation. + */ +int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + skb_orphan(skb); + + if (!(dev->flags & IFF_UP)) + return NET_RX_DROP; + + if (skb->len > (dev->mtu + dev->hard_header_len)) + return NET_RX_DROP; + + skb_set_dev(skb, dev); + skb->tstamp.tv64 = 0; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, dev); + return netif_rx(skb); +} +EXPORT_SYMBOL_GPL(dev_forward_skb); + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -1508,6 +1634,36 @@ static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) return false; } +/** + * skb_dev_set -- assign a new device to a buffer + * @skb: buffer for the new device + * @dev: network device + * + * If an skb is owned by a device already, we have to reset + * all data private to the namespace a device belongs to + * before assigning it a new device. + */ +#ifdef CONFIG_NET_NS +void skb_set_dev(struct sk_buff *skb, struct net_device *dev) +{ + skb_dst_drop(skb); + if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) { + secpath_reset(skb); + nf_reset(skb); + skb_init_secmark(skb); + skb->mark = 0; + skb->priority = 0; + skb->nf_trace = 0; + skb->ipvs_property = 0; +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; +#endif + } + skb->dev = dev; +} +EXPORT_SYMBOL(skb_set_dev); +#endif /* CONFIG_NET_NS */ + /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. @@ -1701,7 +1857,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; - int rc; + int rc = NETDEV_TX_OK; if (likely(!skb->next)) { if (!list_empty(&ptype_all)) @@ -1747,8 +1903,18 @@ gso: skb->next = nskb->next; nskb->next = NULL; + + /* + * If device doesnt need nskb->dst, release it right now while + * its hot in this cpu cache + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(nskb); + rc = ops->ndo_start_xmit(nskb, dev); if (unlikely(rc != NETDEV_TX_OK)) { + if (rc & ~NETDEV_TX_MASK) + goto out_kfree_gso_skb; nskb->next = skb->next; skb->next = nskb; return rc; @@ -1758,11 +1924,12 @@ gso: return NETDEV_TX_BUSY; } while (skb->next); - skb->destructor = DEV_GSO_CB(skb)->destructor; - +out_kfree_gso_skb: + if (likely(skb->next == NULL)) + skb->destructor = DEV_GSO_CB(skb)->destructor; out_kfree_skb: kfree_skb(skb); - return NETDEV_TX_OK; + return rc; } static u32 skb_tx_hashrnd; @@ -1789,16 +1956,43 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) } EXPORT_SYMBOL(skb_tx_hash); +static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ + if (unlikely(queue_index >= dev->real_num_tx_queues)) { + if (net_ratelimit()) { + WARN(1, "%s selects TX queue %d, but " + "real number of TX queues is %d\n", + dev->name, queue_index, + dev->real_num_tx_queues); + } + return 0; + } + return queue_index; +} + static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { - const struct net_device_ops *ops = dev->netdev_ops; - u16 queue_index = 0; + u16 queue_index; + struct sock *sk = skb->sk; + + if (sk_tx_queue_recorded(sk)) { + queue_index = sk_tx_queue_get(sk); + } else { + const struct net_device_ops *ops = dev->netdev_ops; - if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb); - else if (dev->real_num_tx_queues > 1) - queue_index = skb_tx_hash(dev, skb); + if (ops->ndo_select_queue) { + queue_index = ops->ndo_select_queue(dev, skb); + queue_index = dev_cap_txqueue(dev, queue_index); + } else { + queue_index = 0; + if (dev->real_num_tx_queues > 1) + queue_index = skb_tx_hash(dev, skb); + + if (sk && sk->sk_dst_cache) + sk_tx_queue_set(sk, queue_index); + } + } skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); @@ -1838,6 +2032,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } +/* + * Returns true if either: + * 1. skb has frag_list and the device doesn't support FRAGLIST, or + * 2. skb is fragmented and the device does not support SG, or if + * at least one of fragments is in highmem and device does not + * support DMA from it. + */ +static inline int skb_needs_linearize(struct sk_buff *skb, + struct net_device *dev) +{ + return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) || + (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || + illegal_highdma(dev, skb))); +} + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -1874,18 +2083,8 @@ int dev_queue_xmit(struct sk_buff *skb) if (netif_needs_gso(dev, skb)) goto gso; - if (skb_has_frags(skb) && - !(dev->features & NETIF_F_FRAGLIST) && - __skb_linearize(skb)) - goto out_kfree_skb; - - /* Fragmented skb is linearized if device does not support SG, - * or if at least one of fragments is in highmem and device - * does not support DMA from it. - */ - if (skb_shinfo(skb)->nr_frags && - (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && - __skb_linearize(skb)) + /* Convert a paged skb to linear, if required */ + if (skb_needs_linearize(skb, dev) && __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not support @@ -1905,7 +2104,7 @@ gso: rcu_read_lock_bh(); txq = dev_pick_tx(dev, skb); - q = rcu_dereference(txq->qdisc); + q = rcu_dereference_bh(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); @@ -1935,8 +2134,8 @@ gso: HARD_TX_LOCK(dev, txq, cpu); if (!netif_tx_queue_stopped(txq)) { - rc = NET_XMIT_SUCCESS; - if (!dev_hard_start_xmit(skb, dev, txq)) { + rc = dev_hard_start_xmit(skb, dev, txq); + if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; } @@ -2191,7 +2390,7 @@ static int ing_filter(struct sk_buff *skb) if (MAX_RED_LOOP < ttl++) { printk(KERN_WARNING "Redir loop detected Dropping packet (%d->%d)\n", - skb->iif, dev->ifindex); + skb->skb_iif, dev->ifindex); return TC_ACT_SHOT; } @@ -2285,30 +2484,33 @@ int netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; struct net_device *orig_dev; + struct net_device *master; struct net_device *null_or_orig; + struct net_device *null_or_bond; int ret = NET_RX_DROP; __be16 type; if (!skb->tstamp.tv64) net_timestamp(skb); - if (skb->vlan_tci && vlan_hwaccel_do_receive(skb)) + if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) return NET_RX_SUCCESS; /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) return NET_RX_DROP; - if (!skb->iif) - skb->iif = skb->dev->ifindex; + if (!skb->skb_iif) + skb->skb_iif = skb->dev->ifindex; null_or_orig = NULL; orig_dev = skb->dev; - if (orig_dev->master) { - if (skb_bond_should_drop(skb)) + master = ACCESS_ONCE(orig_dev->master); + if (master) { + if (skb_bond_should_drop(skb, master)) null_or_orig = orig_dev; /* deliver only exact match */ else - skb->dev = orig_dev->master; + skb->dev = master; } __get_cpu_var(netdev_rx_stat).total++; @@ -2351,12 +2553,24 @@ ncls: if (!skb) goto out; + /* + * Make sure frames received on VLAN interfaces stacked on + * bonding interfaces still make their way to any base bonding + * device that may have registered for a specific ptype. The + * handler may have to adjust skb->dev and orig_dev. + */ + null_or_bond = NULL; + if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && + (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { + null_or_bond = vlan_dev_real_dev(skb->dev); + } + type = skb->protocol; list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && - (ptype->dev == null_or_orig || ptype->dev == skb->dev || - ptype->dev == orig_dev)) { + if (ptype->type == type && (ptype->dev == null_or_orig || + ptype->dev == skb->dev || ptype->dev == orig_dev || + ptype->dev == null_or_bond)) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -2425,7 +2639,7 @@ out: return netif_receive_skb(skb); } -void napi_gro_flush(struct napi_struct *napi) +static void napi_gro_flush(struct napi_struct *napi) { struct sk_buff *skb, *next; @@ -2438,9 +2652,8 @@ void napi_gro_flush(struct napi_struct *napi) napi->gro_count = 0; napi->gro_list = NULL; } -EXPORT_SYMBOL(napi_gro_flush); -int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; struct packet_type *ptype; @@ -2448,7 +2661,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; int same_flow; int mac_len; - int ret; + enum gro_result ret; if (!(skb->dev->features & NETIF_F_GRO)) goto normal; @@ -2532,7 +2745,8 @@ normal: } EXPORT_SYMBOL(dev_gro_receive); -static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static gro_result_t +__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; @@ -2540,33 +2754,35 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) return GRO_NORMAL; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev) - && !compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + NAPI_GRO_CB(p)->same_flow = + (p->dev == skb->dev) && + !compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->flush = 0; } return dev_gro_receive(napi, skb); } -int napi_skb_finish(int ret, struct sk_buff *skb) +gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { - int err = NET_RX_SUCCESS; - switch (ret) { case GRO_NORMAL: - return netif_receive_skb(skb); + if (netif_receive_skb(skb)) + ret = GRO_DROP; + break; case GRO_DROP: - err = NET_RX_DROP; - /* fall through */ - case GRO_MERGED_FREE: kfree_skb(skb); break; + + case GRO_HELD: + case GRO_MERGED: + break; } - return err; + return ret; } EXPORT_SYMBOL(napi_skb_finish); @@ -2586,7 +2802,7 @@ void skb_gro_reset_offset(struct sk_buff *skb) } EXPORT_SYMBOL(skb_gro_reset_offset); -int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { skb_gro_reset_offset(skb); @@ -2605,49 +2821,41 @@ EXPORT_SYMBOL(napi_reuse_skb); struct sk_buff *napi_get_frags(struct napi_struct *napi) { - struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; if (!skb) { - skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); - if (!skb) - goto out; - - skb_reserve(skb, NET_IP_ALIGN); - - napi->skb = skb; + skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + if (skb) + napi->skb = skb; } - -out: return skb; } EXPORT_SYMBOL(napi_get_frags); -int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) +gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, + gro_result_t ret) { - int err = NET_RX_SUCCESS; - switch (ret) { case GRO_NORMAL: case GRO_HELD: - skb->protocol = eth_type_trans(skb, napi->dev); + skb->protocol = eth_type_trans(skb, skb->dev); - if (ret == GRO_NORMAL) - return netif_receive_skb(skb); - - skb_gro_pull(skb, -ETH_HLEN); + if (ret == GRO_HELD) + skb_gro_pull(skb, -ETH_HLEN); + else if (netif_receive_skb(skb)) + ret = GRO_DROP; break; case GRO_DROP: - err = NET_RX_DROP; - /* fall through */ - case GRO_MERGED_FREE: napi_reuse_skb(napi, skb); break; + + case GRO_MERGED: + break; } - return err; + return ret; } EXPORT_SYMBOL(napi_frags_finish); @@ -2688,12 +2896,12 @@ out: } EXPORT_SYMBOL(napi_frags_skb); -int napi_gro_frags(struct napi_struct *napi) +gro_result_t napi_gro_frags(struct napi_struct *napi) { struct sk_buff *skb = napi_frags_skb(napi); if (!skb) - return NET_RX_DROP; + return GRO_DROP; return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); } @@ -2835,7 +3043,7 @@ static void net_rx_action(struct softirq_action *h) * entries to the tail of this list, and only ->poll() * calls can remove this head entry from the list. */ - n = list_entry(list->next, struct napi_struct, poll_list); + n = list_first_entry(list, struct napi_struct, poll_list); have = netpoll_poll_lock(n); @@ -2938,15 +3146,15 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifr.ifr_ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); if (!dev) { - read_unlock(&dev_base_lock); + rcu_read_unlock(); return -ENODEV; } strcpy(ifr.ifr_name, dev->name); - read_unlock(&dev_base_lock); + rcu_read_unlock(); if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) return -EFAULT; @@ -3016,18 +3224,18 @@ static int dev_ifconf(struct net *net, char __user *arg) * in detail. */ void *dev_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(dev_base_lock) + __acquires(RCU) { struct net *net = seq_file_net(seq); loff_t off; struct net_device *dev; - read_lock(&dev_base_lock); + rcu_read_lock(); if (!*pos) return SEQ_START_TOKEN; off = 1; - for_each_netdev(net, dev) + for_each_netdev_rcu(net, dev) if (off++ == *pos) return dev; @@ -3036,23 +3244,25 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos) void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net *net = seq_file_net(seq); + struct net_device *dev = (v == SEQ_START_TOKEN) ? + first_net_device(seq_file_net(seq)) : + next_net_device((struct net_device *)v); + ++*pos; - return v == SEQ_START_TOKEN ? - first_net_device(net) : next_net_device((struct net_device *)v); + return rcu_dereference(dev); } void dev_seq_stop(struct seq_file *seq, void *v) - __releases(dev_base_lock) + __releases(RCU) { - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { const struct net_device_stats *stats = dev_get_stats(dev); - seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " + seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, @@ -3507,10 +3717,10 @@ void __dev_set_rx_mode(struct net_device *dev) /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ - if (dev->uc.count > 0 && !dev->uc_promisc) { + if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1); dev->uc_promisc = 1; - } else if (dev->uc.count == 0 && dev->uc_promisc) { + } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1); dev->uc_promisc = 0; } @@ -4078,7 +4288,7 @@ static void dev_addr_discard(struct net_device *dev) netif_addr_lock_bh(dev); __dev_addr_discard(&dev->mc_list); - dev->mc_count = 0; + netdev_mc_count(dev) = 0; netif_addr_unlock_bh(dev); } @@ -4114,18 +4324,10 @@ unsigned dev_get_flags(const struct net_device *dev) } EXPORT_SYMBOL(dev_get_flags); -/** - * dev_change_flags - change device settings - * @dev: device - * @flags: device state flags - * - * Change settings on device based state flags. The flags are - * in the userspace exported format. - */ -int dev_change_flags(struct net_device *dev, unsigned flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags) { - int ret, changes; int old_flags = dev->flags; + int ret; ASSERT_RTNL(); @@ -4156,17 +4358,12 @@ int dev_change_flags(struct net_device *dev, unsigned flags) ret = 0; if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ - ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); + ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); if (!ret) dev_set_rx_mode(dev); } - if (dev->flags & IFF_UP && - ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | - IFF_VOLATILE))) - call_netdevice_notifiers(NETDEV_CHANGE, dev); - if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; @@ -4185,11 +4382,47 @@ int dev_change_flags(struct net_device *dev, unsigned flags) dev_set_allmulti(dev, inc); } - /* Exclude state transition flags, already notified */ - changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING); + return ret; +} + +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +{ + unsigned int changes = dev->flags ^ old_flags; + + if (changes & IFF_UP) { + if (dev->flags & IFF_UP) + call_netdevice_notifiers(NETDEV_UP, dev); + else + call_netdevice_notifiers(NETDEV_DOWN, dev); + } + + if (dev->flags & IFF_UP && + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) + call_netdevice_notifiers(NETDEV_CHANGE, dev); +} + +/** + * dev_change_flags - change device settings + * @dev: device + * @flags: device state flags + * + * Change settings on device based state flags. The flags are + * in the userspace exported format. + */ +int dev_change_flags(struct net_device *dev, unsigned flags) +{ + int ret, changes; + int old_flags = dev->flags; + + ret = __dev_change_flags(dev, flags); + if (ret < 0) + return ret; + + changes = old_flags ^ dev->flags; if (changes) rtmsg_ifinfo(RTM_NEWLINK, dev, changes); + __dev_notify_flags(dev, old_flags); return ret; } EXPORT_SYMBOL(dev_change_flags); @@ -4254,12 +4487,12 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) EXPORT_SYMBOL(dev_set_mac_address); /* - * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock) + * Perform the SIOCxIFxxx calls, inside rcu_read_lock() */ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) { int err; - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); if (!dev) return -ENODEV; @@ -4491,9 +4724,9 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCGIFINDEX: case SIOCGIFTXQLEN: dev_load(net, ifr.ifr_name); - read_lock(&dev_base_lock); + rcu_read_lock(); ret = dev_ifsioc_locked(net, &ifr, cmd); - read_unlock(&dev_base_lock); + rcu_read_unlock(); if (!ret) { if (colon) *colon = ':'; @@ -4636,59 +4869,86 @@ static void net_set_todo(struct net_device *dev) list_add_tail(&dev->todo_list, &net_todo_list); } -static void rollback_registered(struct net_device *dev) +static void rollback_registered_many(struct list_head *head) { + struct net_device *dev, *tmp; + BUG_ON(dev_boot_phase); ASSERT_RTNL(); - /* Some devices call without registering for initialization unwind. */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " - "was registered\n", dev->name, dev); + list_for_each_entry_safe(dev, tmp, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. Remove those + * devices and proceed with the remaining. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never " + "was registered\n", dev->name, dev); - WARN_ON(1); - return; - } + WARN_ON(1); + list_del(&dev->unreg_list); + continue; + } - BUG_ON(dev->reg_state != NETREG_REGISTERED); + BUG_ON(dev->reg_state != NETREG_REGISTERED); - /* If device is running, close it first. */ - dev_close(dev); + /* If device is running, close it first. */ + dev_close(dev); - /* And unlink it from device chain. */ - unlist_netdevice(dev); + /* And unlink it from device chain. */ + unlist_netdevice(dev); - dev->reg_state = NETREG_UNREGISTERING; + dev->reg_state = NETREG_UNREGISTERING; + } synchronize_net(); - /* Shutdown queueing discipline. */ - dev_shutdown(dev); + list_for_each_entry(dev, head, unreg_list) { + /* Shutdown queueing discipline. */ + dev_shutdown(dev); - /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - /* - * Flush the unicast and multicast chains - */ - dev_unicast_flush(dev); - dev_addr_discard(dev); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); - if (dev->netdev_ops->ndo_uninit) - dev->netdev_ops->ndo_uninit(dev); + /* + * Flush the unicast and multicast chains + */ + dev_unicast_flush(dev); + dev_addr_discard(dev); - /* Notifier chain MUST detach us from master device. */ - WARN_ON(dev->master); + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); - /* Remove entries from kobject tree */ - netdev_unregister_kobject(dev); + /* Notifier chain MUST detach us from master device. */ + WARN_ON(dev->master); + + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); + } + + /* Process any work delayed until the end of the batch */ + dev = list_first_entry(head, struct net_device, unreg_list); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); synchronize_net(); - dev_put(dev); + list_for_each_entry(dev, head, unreg_list) + dev_put(dev); +} + +static void rollback_registered(struct net_device *dev) +{ + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + rollback_registered_many(&single); } static void __netdev_init_queue_locks_one(struct net_device *dev, @@ -4747,6 +5007,33 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) EXPORT_SYMBOL(netdev_fix_features); /** + * netif_stacked_transfer_operstate - transfer operstate + * @rootdev: the root or lower level device to transfer state from + * @dev: the device to transfer operstate to + * + * Transfer operational state from root to device. This is normally + * called when a stacking relationship exists between the root + * device and the device(a leaf device). + */ +void netif_stacked_transfer_operstate(const struct net_device *rootdev, + struct net_device *dev) +{ + if (rootdev->operstate == IF_OPER_DORMANT) + netif_dormant_on(dev); + else + netif_dormant_off(dev); + + if (netif_carrier_ok(rootdev)) { + if (!netif_carrier_ok(dev)) + netif_carrier_on(dev); + } else { + if (netif_carrier_ok(dev)) + netif_carrier_off(dev); + } +} +EXPORT_SYMBOL(netif_stacked_transfer_operstate); + +/** * register_netdevice - register a network device * @dev: device to register * @@ -4765,8 +5052,6 @@ EXPORT_SYMBOL(netdev_fix_features); int register_netdevice(struct net_device *dev) { - struct hlist_head *head; - struct hlist_node *p; int ret; struct net *net = dev_net(dev); @@ -4795,26 +5080,14 @@ int register_netdevice(struct net_device *dev) } } - if (!dev_valid_name(dev->name)) { - ret = -EINVAL; + ret = dev_get_valid_name(net, dev->name, dev->name, 0); + if (ret) goto err_uninit; - } dev->ifindex = dev_new_index(net); if (dev->iflink == -1) dev->iflink = dev->ifindex; - /* Check for existence of name */ - head = dev_name_hash(net, dev->name); - hlist_for_each(p, head) { - struct net_device *d - = hlist_entry(p, struct net_device, name_hlist); - if (!strncmp(d->name, dev->name, IFNAMSIZ)) { - ret = -EEXIST; - goto err_uninit; - } - } - /* Fix illegal checksum combinations */ if ((dev->features & NETIF_F_HW_CSUM) && (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { @@ -4837,6 +5110,12 @@ int register_netdevice(struct net_device *dev) dev->features |= NETIF_F_GSO; netdev_initialize_kobject(dev); + + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); + ret = notifier_to_errno(ret); + if (ret) + goto err_uninit; + ret = netdev_register_kobject(dev); if (ret) goto err_uninit; @@ -4860,6 +5139,13 @@ int register_netdevice(struct net_device *dev) rollback_registered(dev); dev->reg_state = NETREG_UNREGISTERED; } + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); out: return ret; @@ -4961,6 +5247,8 @@ static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + linkwatch_forget_dev(dev); + rebroadcast_time = warning_time = jiffies; while (atomic_read(&dev->refcnt) != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { @@ -4968,6 +5256,8 @@ static void netdev_wait_allrefs(struct net_device *dev) /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users + * should have already handle it the first time */ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { @@ -5032,7 +5322,7 @@ void netdev_run_todo(void) while (!list_empty(&list)) { struct net_device *dev - = list_entry(list.next, struct net_device, todo_list); + = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { @@ -5063,6 +5353,32 @@ void netdev_run_todo(void) } /** + * dev_txq_stats_fold - fold tx_queues stats + * @dev: device to get statistics from + * @stats: struct net_device_stats to hold results + */ +void dev_txq_stats_fold(const struct net_device *dev, + struct net_device_stats *stats) +{ + unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; + unsigned int i; + struct netdev_queue *txq; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + tx_bytes += txq->tx_bytes; + tx_packets += txq->tx_packets; + tx_dropped += txq->tx_dropped; + } + if (tx_bytes || tx_packets || tx_dropped) { + stats->tx_bytes = tx_bytes; + stats->tx_packets = tx_packets; + stats->tx_dropped = tx_dropped; + } +} +EXPORT_SYMBOL(dev_txq_stats_fold); + +/** * dev_get_stats - get network device statistics * @dev: device to get statistics from * @@ -5076,25 +5392,9 @@ const struct net_device_stats *dev_get_stats(struct net_device *dev) if (ops->ndo_get_stats) return ops->ndo_get_stats(dev); - else { - unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; - struct net_device_stats *stats = &dev->stats; - unsigned int i; - struct netdev_queue *txq; - - for (i = 0; i < dev->num_tx_queues; i++) { - txq = netdev_get_tx_queue(dev, i); - tx_bytes += txq->tx_bytes; - tx_packets += txq->tx_packets; - tx_dropped += txq->tx_dropped; - } - if (tx_bytes || tx_packets || tx_dropped) { - stats->tx_bytes = tx_bytes; - stats->tx_packets = tx_packets; - stats->tx_dropped = tx_dropped; - } - return stats; - } + + dev_txq_stats_fold(dev, &dev->stats); + return &dev->stats; } EXPORT_SYMBOL(dev_get_stats); @@ -5173,7 +5473,11 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, netdev_init_queues(dev); + INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); + dev->ethtool_ntuple_list.count = 0; INIT_LIST_HEAD(&dev->napi_list); + INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->link_watch_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); @@ -5207,6 +5511,9 @@ void free_netdev(struct net_device *dev) /* Flush device addresses */ dev_addr_flush(dev); + /* Clear ethtool n-tuple list */ + ethtool_ntuple_flush(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); @@ -5238,25 +5545,47 @@ void synchronize_net(void) EXPORT_SYMBOL(synchronize_net); /** - * unregister_netdevice - remove device from the kernel + * unregister_netdevice_queue - remove device from the kernel * @dev: device + * @head: list * * This function shuts down a device interface and removes it * from the kernel tables. + * If head not NULL, device is queued to be unregistered later. * * Callers must hold the rtnl semaphore. You may want * unregister_netdev() instead of this. */ -void unregister_netdevice(struct net_device *dev) +void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) { ASSERT_RTNL(); - rollback_registered(dev); - /* Finish processing unregister after unlock */ - net_set_todo(dev); + if (head) { + list_move_tail(&dev->unreg_list, head); + } else { + rollback_registered(dev); + /* Finish processing unregister after unlock */ + net_set_todo(dev); + } } -EXPORT_SYMBOL(unregister_netdevice); +EXPORT_SYMBOL(unregister_netdevice_queue); + +/** + * unregister_netdevice_many - unregister many devices + * @head: list of devices + */ +void unregister_netdevice_many(struct list_head *head) +{ + struct net_device *dev; + + if (!list_empty(head)) { + rollback_registered_many(head); + list_for_each_entry(dev, head, unreg_list) + net_set_todo(dev); + } +} +EXPORT_SYMBOL(unregister_netdevice_many); /** * unregister_netdev - remove device from the kernel @@ -5293,8 +5622,6 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - char buf[IFNAMSIZ]; - const char *destname; int err; ASSERT_RTNL(); @@ -5327,20 +5654,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char * we can use it in the destination network namespace. */ err = -EEXIST; - destname = dev->name; - if (__dev_get_by_name(net, destname)) { + if (__dev_get_by_name(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) goto out; - if (!dev_valid_name(pat)) - goto out; - if (strchr(pat, '%')) { - if (__dev_alloc_name(net, pat, buf) < 0) - goto out; - destname = buf; - } else - destname = pat; - if (__dev_get_by_name(net, destname)) + if (dev_get_valid_name(net, pat, dev->name, 1)) goto out; } @@ -5364,6 +5682,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char this device. They should clean all the things. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); /* * Flush the unicast and multicast chains @@ -5376,10 +5695,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Actually switch the network namespace */ dev_net_set(dev, net); - /* Assign the new device name */ - if (destname != dev->name) - strcpy(dev->name, destname); - /* If there is an ifindex conflict assign a new one */ if (__dev_get_by_index(net, dev->ifindex)) { int iflink = (dev->iflink == dev->ifindex); @@ -5398,6 +5713,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + synchronize_net(); err = 0; out: @@ -5484,7 +5805,7 @@ unsigned long netdev_increment_features(unsigned long all, unsigned long one, one |= NETIF_F_ALL_CSUM; one |= all & NETIF_F_ONE_FOR_ALL; - all &= one | NETIF_F_LLTX | NETIF_F_GSO; + all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; all |= one & mask & NETIF_F_ONE_FOR_ALL; return all; @@ -5566,14 +5887,13 @@ static struct pernet_operations __net_initdata netdev_net_ops = { static void __net_exit default_device_exit(struct net *net) { - struct net_device *dev; + struct net_device *dev, *aux; /* - * Push all migratable of the network devices back to the + * Push all migratable network devices back to the * initial network namespace */ rtnl_lock(); -restart: - for_each_netdev(net, dev) { + for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; @@ -5581,11 +5901,9 @@ restart: if (dev->features & NETIF_F_NETNS_LOCAL) continue; - /* Delete virtual devices */ - if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { - dev->rtnl_link_ops->dellink(dev); - goto restart; - } + /* Leave virtual devices for the generic cleanup */ + if (dev->rtnl_link_ops) + continue; /* Push remaing network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); @@ -5595,13 +5913,37 @@ restart: __func__, dev->name, err); BUG(); } - goto restart; } rtnl_unlock(); } +static void __net_exit default_device_exit_batch(struct list_head *net_list) +{ + /* At exit all network devices most be removed from a network + * namespace. Do this in the reverse order of registeration. + * Do this across as many network namespaces as possible to + * improve batching efficiency. + */ + struct net_device *dev; + struct net *net; + LIST_HEAD(dev_kill_list); + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + for_each_netdev_reverse(net, dev) { + if (dev->rtnl_link_ops) + dev->rtnl_link_ops->dellink(dev, &dev_kill_list); + else + unregister_netdevice_queue(dev, &dev_kill_list); + } + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); +} + static struct pernet_operations __net_initdata default_device_ops = { .exit = default_device_exit, + .exit_batch = default_device_exit_batch, }; /* |