Diffstat (limited to 'net/core/dev.c')
-rw-r--r--   net/core/dev.c | 3927
1 file changed, 2300 insertions(+), 1627 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c index baf2dc13a34..a3ef808b5e3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -79,6 +79,8 @@  #include <linux/cpu.h>  #include <linux/types.h>  #include <linux/kernel.h> +#include <linux/hash.h> +#include <linux/slab.h>  #include <linux/sched.h>  #include <linux/mutex.h>  #include <linux/string.h> @@ -99,11 +101,10 @@  #include <linux/proc_fs.h>  #include <linux/seq_file.h>  #include <linux/stat.h> -#include <linux/if_bridge.h> -#include <linux/if_macvlan.h>  #include <net/dst.h>  #include <net/pkt_sched.h>  #include <net/checksum.h> +#include <net/xfrm.h>  #include <linux/highmem.h>  #include <linux/init.h>  #include <linux/kmod.h> @@ -127,6 +128,10 @@  #include <linux/jhash.h>  #include <linux/random.h>  #include <trace/events/napi.h> +#include <trace/events/net.h> +#include <trace/events/skb.h> +#include <linux/pci.h> +#include <linux/inetdevice.h>  #include "net-sysfs.h" @@ -175,7 +180,7 @@ static struct list_head ptype_all __read_mostly;	/* Taps */   * The @dev_base_head list is protected by @dev_base_lock and the rtnl   * semaphore.   * - * Pure readers hold dev_base_lock for reading. + * Pure readers hold dev_base_lock for reading, or rcu_read_lock()   *   * Writers must hold the rtnl semaphore while they loop through the   * dev_base_head list, and hold dev_base_lock for writing when they do the @@ -191,21 +196,31 @@ static struct list_head ptype_all __read_mostly;	/* Taps */   * semaphore held.   */  DEFINE_RWLOCK(dev_base_lock); -  EXPORT_SYMBOL(dev_base_lock); -#define NETDEV_HASHBITS	8 -#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) -  static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)  {  	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); -	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; +	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];  }  static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)  { -	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; +	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; +} + +static inline void rps_lock(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS +	spin_lock(&sd->input_pkt_queue.lock); +#endif +} + +static inline void rps_unlock(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS +	spin_unlock(&sd->input_pkt_queue.lock); +#endif  }  /* Device list insertion */ @@ -216,23 +231,26 @@ static int list_netdevice(struct net_device *dev)  	ASSERT_RTNL();  	write_lock_bh(&dev_base_lock); -	list_add_tail(&dev->dev_list, &net->dev_base_head); -	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); -	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); +	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); +	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); +	hlist_add_head_rcu(&dev->index_hlist, +			   dev_index_hash(net, dev->ifindex));  	write_unlock_bh(&dev_base_lock);  	return 0;  } -/* Device list removal */ +/* Device list removal + * caller must respect a RCU grace period before freeing/reusing dev + */  static void unlist_netdevice(struct net_device *dev)  {  	ASSERT_RTNL();  	/* Unlink dev from the device chain */  	write_lock_bh(&dev_base_lock); -	list_del(&dev->dev_list); -	hlist_del(&dev->name_hlist); -	hlist_del(&dev->index_hlist); +	list_del_rcu(&dev->dev_list); +	hlist_del_rcu(&dev->name_hlist); +	hlist_del_rcu(&dev->index_hlist);  	write_unlock_bh(&dev_base_lock);  } @@ -247,7 +265,8 @@ static 
RAW_NOTIFIER_HEAD(netdev_chain);   *	queue in the local softnet handler.   */ -DEFINE_PER_CPU(struct softnet_data, softnet_data); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +EXPORT_PER_CPU_SYMBOL(softnet_data);  #ifdef CONFIG_LOCKDEP  /* @@ -269,10 +288,10 @@ static const unsigned short netdev_lock_type[] =  	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,  	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,  	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, -	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_IEEE802154_PHY, +	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,  	 ARPHRD_VOID, ARPHRD_NONE}; -static const char *netdev_lock_name[] = +static const char *const netdev_lock_name[] =  	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",  	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",  	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", @@ -287,7 +306,7 @@ static const char *netdev_lock_name[] =  	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",  	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",  	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", -	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_IEEE802154_PHY", +	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",  	 "_xmit_VOID", "_xmit_NONE"};  static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; @@ -355,6 +374,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)   *							--ANK (980803)   */ +static inline struct list_head *ptype_head(const struct packet_type *pt) +{ +	if (pt->type == htons(ETH_P_ALL)) +		return &ptype_all; +	else +		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; +} +  /**   *	dev_add_pack - add packet handler   *	@pt: packet type declaration @@ -370,17 +397,13 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)  void dev_add_pack(struct packet_type *pt)  { -	int hash; +	struct list_head *head = ptype_head(pt); -	spin_lock_bh(&ptype_lock); -	if (pt->type == htons(ETH_P_ALL)) -		list_add_rcu(&pt->list, &ptype_all); -	else { -		hash = ntohs(pt->type) & PTYPE_HASH_MASK; -		list_add_rcu(&pt->list, &ptype_base[hash]); -	} -	spin_unlock_bh(&ptype_lock); +	spin_lock(&ptype_lock); +	list_add_rcu(&pt->list, head); +	spin_unlock(&ptype_lock);  } +EXPORT_SYMBOL(dev_add_pack);  /**   *	__dev_remove_pack	 - remove packet handler @@ -397,15 +420,10 @@ void dev_add_pack(struct packet_type *pt)   */  void __dev_remove_pack(struct packet_type *pt)  { -	struct list_head *head; +	struct list_head *head = ptype_head(pt);  	struct packet_type *pt1; -	spin_lock_bh(&ptype_lock); - -	if (pt->type == htons(ETH_P_ALL)) -		head = &ptype_all; -	else -		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; +	spin_lock(&ptype_lock);  	list_for_each_entry(pt1, head, list) {  		if (pt == pt1) { @@ -416,8 +434,10 @@ void __dev_remove_pack(struct packet_type *pt)  	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);  out: -	spin_unlock_bh(&ptype_lock); +	spin_unlock(&ptype_lock);  } +EXPORT_SYMBOL(__dev_remove_pack); +  /**   *	dev_remove_pack	 - remove packet handler   *	@pt: packet type declaration @@ -436,6 +456,7 @@ void dev_remove_pack(struct packet_type *pt)  	synchronize_net();  } +EXPORT_SYMBOL(dev_remove_pack);  /****************************************************************************** @@ -499,6 +520,7 @@ int netdev_boot_setup_check(struct net_device *dev)  	}  	return 0;  } +EXPORT_SYMBOL(netdev_boot_setup_check);  /** @@ -582,15 +604,42 
@@ __setup("netdev=", netdev_boot_setup);  struct net_device *__dev_get_by_name(struct net *net, const char *name)  {  	struct hlist_node *p; +	struct net_device *dev; +	struct hlist_head *head = dev_name_hash(net, name); -	hlist_for_each(p, dev_name_hash(net, name)) { -		struct net_device *dev -			= hlist_entry(p, struct net_device, name_hlist); +	hlist_for_each_entry(dev, p, head, name_hlist)  		if (!strncmp(dev->name, name, IFNAMSIZ))  			return dev; -	} +  	return NULL;  } +EXPORT_SYMBOL(__dev_get_by_name); + +/** + *	dev_get_by_name_rcu	- find a device by its name + *	@net: the applicable net namespace + *	@name: name to find + * + *	Find an interface by name. + *	If the name is found a pointer to the device is returned. + * 	If the name is not found then %NULL is returned. + *	The reference counters are not incremented so the caller must be + *	careful with locks. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) +{ +	struct hlist_node *p; +	struct net_device *dev; +	struct hlist_head *head = dev_name_hash(net, name); + +	hlist_for_each_entry_rcu(dev, p, head, name_hlist) +		if (!strncmp(dev->name, name, IFNAMSIZ)) +			return dev; + +	return NULL; +} +EXPORT_SYMBOL(dev_get_by_name_rcu);  /**   *	dev_get_by_name		- find a device by its name @@ -608,13 +657,14 @@ struct net_device *dev_get_by_name(struct net *net, const char *name)  {  	struct net_device *dev; -	read_lock(&dev_base_lock); -	dev = __dev_get_by_name(net, name); +	rcu_read_lock(); +	dev = dev_get_by_name_rcu(net, name);  	if (dev)  		dev_hold(dev); -	read_unlock(&dev_base_lock); +	rcu_read_unlock();  	return dev;  } +EXPORT_SYMBOL(dev_get_by_name);  /**   *	__dev_get_by_index - find a device by its ifindex @@ -631,15 +681,41 @@ struct net_device *dev_get_by_name(struct net *net, const char *name)  struct net_device *__dev_get_by_index(struct net *net, int ifindex)  {  	struct hlist_node *p; +	struct net_device *dev; +	struct hlist_head *head = dev_index_hash(net, ifindex); -	hlist_for_each(p, dev_index_hash(net, ifindex)) { -		struct net_device *dev -			= hlist_entry(p, struct net_device, index_hlist); +	hlist_for_each_entry(dev, p, head, index_hlist)  		if (dev->ifindex == ifindex)  			return dev; -	} + +	return NULL; +} +EXPORT_SYMBOL(__dev_get_by_index); + +/** + *	dev_get_by_index_rcu - find a device by its ifindex + *	@net: the applicable net namespace + *	@ifindex: index of device + * + *	Search for an interface by index. Returns %NULL if the device + *	is not found or a pointer to the device. The device has not + *	had its reference counter increased so the caller must be careful + *	about locking. The caller must hold RCU lock. 
+ */ + +struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) +{ +	struct hlist_node *p; +	struct net_device *dev; +	struct hlist_head *head = dev_index_hash(net, ifindex); + +	hlist_for_each_entry_rcu(dev, p, head, index_hlist) +		if (dev->ifindex == ifindex) +			return dev; +  	return NULL;  } +EXPORT_SYMBOL(dev_get_by_index_rcu);  /** @@ -657,44 +733,41 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)  {  	struct net_device *dev; -	read_lock(&dev_base_lock); -	dev = __dev_get_by_index(net, ifindex); +	rcu_read_lock(); +	dev = dev_get_by_index_rcu(net, ifindex);  	if (dev)  		dev_hold(dev); -	read_unlock(&dev_base_lock); +	rcu_read_unlock();  	return dev;  } +EXPORT_SYMBOL(dev_get_by_index);  /** - *	dev_getbyhwaddr - find a device by its hardware address + *	dev_getbyhwaddr_rcu - find a device by its hardware address   *	@net: the applicable net namespace   *	@type: media type of device   *	@ha: hardware address   *   *	Search for an interface by MAC address. Returns NULL if the device - *	is not found or a pointer to the device. The caller must hold the - *	rtnl semaphore. The returned device has not had its ref count increased + *	is not found or a pointer to the device. The caller must hold RCU + *	The returned device has not had its ref count increased   *	and the caller must therefore be careful about locking   * - *	BUGS: - *	If the API was consistent this would be __dev_get_by_hwaddr   */ -struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) +struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, +				       const char *ha)  {  	struct net_device *dev; -	ASSERT_RTNL(); - -	for_each_netdev(net, dev) +	for_each_netdev_rcu(net, dev)  		if (dev->type == type &&  		    !memcmp(dev->dev_addr, ha, dev->addr_len))  			return dev;  	return NULL;  } - -EXPORT_SYMBOL(dev_getbyhwaddr); +EXPORT_SYMBOL(dev_getbyhwaddr_rcu);  struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)  { @@ -707,51 +780,50 @@ struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)  	return NULL;  } -  EXPORT_SYMBOL(__dev_getfirstbyhwtype);  struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)  { -	struct net_device *dev; +	struct net_device *dev, *ret = NULL; -	rtnl_lock(); -	dev = __dev_getfirstbyhwtype(net, type); -	if (dev) -		dev_hold(dev); -	rtnl_unlock(); -	return dev; +	rcu_read_lock(); +	for_each_netdev_rcu(net, dev) +		if (dev->type == type) { +			dev_hold(dev); +			ret = dev; +			break; +		} +	rcu_read_unlock(); +	return ret;  } -  EXPORT_SYMBOL(dev_getfirstbyhwtype);  /** - *	dev_get_by_flags - find any device with given flags + *	dev_get_by_flags_rcu - find any device with given flags   *	@net: the applicable net namespace   *	@if_flags: IFF_* values   *	@mask: bitmask of bits in if_flags to check   *   *	Search for any interface with the given flags. Returns NULL if a device - *	is not found or a pointer to the device. The device returned has - *	had a reference added and the pointer is safe until the user calls - *	dev_put to indicate they have finished with it. + *	is not found or a pointer to the device. Must be called inside + *	rcu_read_lock(), and result refcount is unchanged.   
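The new _rcu lookup helpers return a device without taking a reference, so every use of the returned pointer must stay inside the read-side critical section. A minimal, hypothetical caller (not part of this patch) built on dev_get_by_name_rcu() could look like this:

#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

/* Illustrative only: read the MTU of a named device without holding
 * a reference. The device pointer is never used after rcu_read_unlock().
 */
static int example_read_mtu(struct net *net, const char *name)
{
	struct net_device *dev;
	int mtu = -ENODEV;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		mtu = dev->mtu;
	rcu_read_unlock();

	return mtu;
}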
*/ -struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) +struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, +				    unsigned short mask)  {  	struct net_device *dev, *ret;  	ret = NULL; -	read_lock(&dev_base_lock); -	for_each_netdev(net, dev) { +	for_each_netdev_rcu(net, dev) {  		if (((dev->flags ^ if_flags) & mask) == 0) { -			dev_hold(dev);  			ret = dev;  			break;  		}  	} -	read_unlock(&dev_base_lock);  	return ret;  } +EXPORT_SYMBOL(dev_get_by_flags_rcu);  /**   *	dev_valid_name - check if name is okay for network device @@ -777,6 +849,7 @@ int dev_valid_name(const char *name)  	}  	return 1;  } +EXPORT_SYMBOL(dev_valid_name);  /**   *	__dev_alloc_name - allocate a name for a device @@ -832,7 +905,8 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)  		free_page((unsigned long) inuse);  	} -	snprintf(buf, IFNAMSIZ, name, i); +	if (buf != name) +		snprintf(buf, IFNAMSIZ, name, i);  	if (!__dev_get_by_name(net, buf))  		return i; @@ -870,7 +944,27 @@ int dev_alloc_name(struct net_device *dev, const char *name)  		strlcpy(dev->name, buf, IFNAMSIZ);  	return ret;  } +EXPORT_SYMBOL(dev_alloc_name); +static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt) +{ +	struct net *net; + +	BUG_ON(!dev_net(dev)); +	net = dev_net(dev); + +	if (!dev_valid_name(name)) +		return -EINVAL; + +	if (fmt && strchr(name, '%')) +		return dev_alloc_name(dev, name); +	else if (__dev_get_by_name(net, name)) +		return -EEXIST; +	else if (dev->name != name) +		strlcpy(dev->name, name, IFNAMSIZ); + +	return 0; +}  /**   *	dev_change_name - change name of a device @@ -894,53 +988,45 @@ int dev_change_name(struct net_device *dev, const char *newname)  	if (dev->flags & IFF_UP)  		return -EBUSY; -	if (!dev_valid_name(newname)) -		return -EINVAL; -  	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)  		return 0;  	memcpy(oldname, dev->name, IFNAMSIZ); -	if (strchr(newname, '%')) { -		err = dev_alloc_name(dev, newname); -		if (err < 0) -			return err; -	} -	else if (__dev_get_by_name(net, newname)) -		return -EEXIST; -	else -		strlcpy(dev->name, newname, IFNAMSIZ); +	err = dev_get_valid_name(dev, newname, 1); +	if (err < 0) +		return err;  rollback: -	/* For now only devices in the initial network namespace -	 * are in sysfs. 
-	 */ -	if (net == &init_net) { -		ret = device_rename(&dev->dev, dev->name); -		if (ret) { -			memcpy(dev->name, oldname, IFNAMSIZ); -			return ret; -		} +	ret = device_rename(&dev->dev, dev->name); +	if (ret) { +		memcpy(dev->name, oldname, IFNAMSIZ); +		return ret;  	}  	write_lock_bh(&dev_base_lock);  	hlist_del(&dev->name_hlist); -	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); +	write_unlock_bh(&dev_base_lock); + +	synchronize_rcu(); + +	write_lock_bh(&dev_base_lock); +	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));  	write_unlock_bh(&dev_base_lock);  	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);  	ret = notifier_to_errno(ret);  	if (ret) { -		if (err) { -			printk(KERN_ERR -			       "%s: name change rollback failed: %d.\n", -			       dev->name, ret); -		} else { +		/* err >= 0 after dev_alloc_name() or stores the first errno */ +		if (err >= 0) {  			err = ret;  			memcpy(dev->name, oldname, IFNAMSIZ);  			goto rollback; +		} else { +			printk(KERN_ERR +			       "%s: name change rollback failed: %d.\n", +			       dev->name, ret);  		}  	} @@ -970,7 +1056,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)  		return 0;  	} -	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL); +	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);  	if (!dev->ifalias)  		return -ENOMEM; @@ -1006,10 +1092,11 @@ void netdev_state_change(struct net_device *dev)  		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);  	}  } +EXPORT_SYMBOL(netdev_state_change); -void netdev_bonding_change(struct net_device *dev) +int netdev_bonding_change(struct net_device *dev, unsigned long event)  { -	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev); +	return call_netdevice_notifiers(event, dev);  }  EXPORT_SYMBOL(netdev_bonding_change); @@ -1027,27 +1114,16 @@ void dev_load(struct net *net, const char *name)  {  	struct net_device *dev; -	read_lock(&dev_base_lock); -	dev = __dev_get_by_name(net, name); -	read_unlock(&dev_base_lock); +	rcu_read_lock(); +	dev = dev_get_by_name_rcu(net, name); +	rcu_read_unlock(); -	if (!dev && capable(CAP_SYS_MODULE)) +	if (!dev && capable(CAP_NET_ADMIN))  		request_module("%s", name);  } +EXPORT_SYMBOL(dev_load); -/** - *	dev_open	- prepare an interface for use. - *	@dev:	device to open - * - *	Takes a device from down to up state. The device's private open - *	function is invoked and then the multicast lists are loaded. Finally - *	the device is moved into the up state and a %NETDEV_UP message is - *	sent to the netdev notifier chain. - * - *	Calling this function on an active interface is a nop. On a failure - *	a negative errno code is returned. - */ -int dev_open(struct net_device *dev) +static int __dev_open(struct net_device *dev)  {  	const struct net_device_ops *ops = dev->netdev_ops;  	int ret; @@ -1055,13 +1131,6 @@ int dev_open(struct net_device *dev)  	ASSERT_RTNL();  	/* -	 *	Is it already up? -	 */ - -	if (dev->flags & IFF_UP) -		return 0; - -	/*  	 *	Is it even present?  	 */  	if (!netif_device_present(dev)) @@ -1109,81 +1178,156 @@ int dev_open(struct net_device *dev)  		 *	Wakeup transmit queue engine  		 */  		dev_activate(dev); - -		/* -		 *	... and announce new interface. -		 */ -		call_netdevice_notifiers(NETDEV_UP, dev);  	}  	return ret;  }  /** - *	dev_close - shutdown an interface. - *	@dev: device to shutdown + *	dev_open	- prepare an interface for use. + *	@dev:	device to open   * - *	This function moves an active device into down state. 
A - *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device - *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - *	chain. + *	Takes a device from down to up state. The device's private open + *	function is invoked and then the multicast lists are loaded. Finally + *	the device is moved into the up state and a %NETDEV_UP message is + *	sent to the netdev notifier chain. + * + *	Calling this function on an active interface is a nop. On a failure + *	a negative errno code is returned.   */ -int dev_close(struct net_device *dev) +int dev_open(struct net_device *dev)  { -	const struct net_device_ops *ops = dev->netdev_ops; -	ASSERT_RTNL(); - -	might_sleep(); +	int ret; -	if (!(dev->flags & IFF_UP)) +	/* +	 *	Is it already up? +	 */ +	if (dev->flags & IFF_UP)  		return 0;  	/* -	 *	Tell people we are going down, so that they can -	 *	prepare to death, when device is still operating. +	 *	Open device  	 */ -	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); - -	clear_bit(__LINK_STATE_START, &dev->state); +	ret = __dev_open(dev); +	if (ret < 0) +		return ret; -	/* Synchronize to scheduled poll. We cannot touch poll list, -	 * it can be even on different cpu. So just clear netif_running(). -	 * -	 * dev->stop() will invoke napi_disable() on all of it's -	 * napi_struct instances on this device. +	/* +	 *	... and announce new interface.  	 */ -	smp_mb__after_clear_bit(); /* Commit netif_running(). */ +	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); +	call_netdevice_notifiers(NETDEV_UP, dev); -	dev_deactivate(dev); +	return ret; +} +EXPORT_SYMBOL(dev_open); -	/* -	 *	Call the device specific close. This cannot fail. -	 *	Only if device is UP -	 * -	 *	We allow it to be called even after a DETACH hot-plug -	 *	event. -	 */ -	if (ops->ndo_stop) -		ops->ndo_stop(dev); +static int __dev_close_many(struct list_head *head) +{ +	struct net_device *dev; -	/* -	 *	Device is now down. -	 */ +	ASSERT_RTNL(); +	might_sleep(); -	dev->flags &= ~IFF_UP; +	list_for_each_entry(dev, head, unreg_list) { +		/* +		 *	Tell people we are going down, so that they can +		 *	prepare to death, when device is still operating. +		 */ +		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); + +		clear_bit(__LINK_STATE_START, &dev->state); + +		/* Synchronize to scheduled poll. We cannot touch poll list, it +		 * can be even on different cpu. So just clear netif_running(). +		 * +		 * dev->stop() will invoke napi_disable() on all of it's +		 * napi_struct instances on this device. +		 */ +		smp_mb__after_clear_bit(); /* Commit netif_running(). */ +	} + +	dev_deactivate_many(head); + +	list_for_each_entry(dev, head, unreg_list) { +		const struct net_device_ops *ops = dev->netdev_ops; + +		/* +		 *	Call the device specific close. This cannot fail. +		 *	Only if device is UP +		 * +		 *	We allow it to be called even after a DETACH hot-plug +		 *	event. +		 */ +		if (ops->ndo_stop) +			ops->ndo_stop(dev); + +		/* +		 *	Device is now down. 
+		 */ + +		dev->flags &= ~IFF_UP; + +		/* +		 *	Shutdown NET_DMA +		 */ +		net_dmaengine_put(); +	} + +	return 0; +} + +static int __dev_close(struct net_device *dev) +{ +	LIST_HEAD(single); + +	list_add(&dev->unreg_list, &single); +	return __dev_close_many(&single); +} + +int dev_close_many(struct list_head *head) +{ +	struct net_device *dev, *tmp; +	LIST_HEAD(tmp_list); + +	list_for_each_entry_safe(dev, tmp, head, unreg_list) +		if (!(dev->flags & IFF_UP)) +			list_move(&dev->unreg_list, &tmp_list); + +	__dev_close_many(head);  	/*  	 * Tell people we are down  	 */ -	call_netdevice_notifiers(NETDEV_DOWN, dev); +	list_for_each_entry(dev, head, unreg_list) { +		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); +		call_netdevice_notifiers(NETDEV_DOWN, dev); +	} -	/* -	 *	Shutdown NET_DMA -	 */ -	net_dmaengine_put(); +	/* rollback_registered_many needs the complete original list */ +	list_splice(&tmp_list, head); +	return 0; +} + +/** + *	dev_close - shutdown an interface. + *	@dev: device to shutdown + * + *	This function moves an active device into down state. A + *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + *	chain. + */ +int dev_close(struct net_device *dev) +{ +	LIST_HEAD(single); + +	list_add(&dev->unreg_list, &single); +	dev_close_many(&single);  	return 0;  } +EXPORT_SYMBOL(dev_close);  /** @@ -1273,12 +1417,14 @@ rollback:  				nb->notifier_call(nb, NETDEV_DOWN, dev);  			}  			nb->notifier_call(nb, NETDEV_UNREGISTER, dev); +			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);  		}  	}  	raw_notifier_chain_unregister(&netdev_chain, nb);  	goto unlock;  } +EXPORT_SYMBOL(register_netdevice_notifier);  /**   *	unregister_netdevice_notifier - unregister a network notifier block @@ -1299,6 +1445,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb)  	rtnl_unlock();  	return err;  } +EXPORT_SYMBOL(unregister_netdevice_notifier);  /**   *	call_netdevice_notifiers - call all network notifier blocks @@ -1311,6 +1458,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb)  int call_netdevice_notifiers(unsigned long val, struct net_device *dev)  { +	ASSERT_RTNL();  	return raw_notifier_call_chain(&netdev_chain, val, dev);  } @@ -1321,13 +1469,15 @@ void net_enable_timestamp(void)  {  	atomic_inc(&netstamp_needed);  } +EXPORT_SYMBOL(net_enable_timestamp);  void net_disable_timestamp(void)  {  	atomic_dec(&netstamp_needed);  } +EXPORT_SYMBOL(net_disable_timestamp); -static inline void net_timestamp(struct sk_buff *skb) +static inline void net_timestamp_set(struct sk_buff *skb)  {  	if (atomic_read(&netstamp_needed))  		__net_timestamp(skb); @@ -1335,6 +1485,57 @@ static inline void net_timestamp(struct sk_buff *skb)  		skb->tstamp.tv64 = 0;  } +static inline void net_timestamp_check(struct sk_buff *skb) +{ +	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) +		__net_timestamp(skb); +} + +/** + * dev_forward_skb - loopback an skb to another netif + * + * @dev: destination network device + * @skb: buffer to forward + * + * return values: + *	NET_RX_SUCCESS	(no congestion) + *	NET_RX_DROP     (packet was dropped, but freed) + * + * dev_forward_skb can be used for injecting an skb from the + * start_xmit function of one device into the receive queue + * of another device. + * + * The receiving device may be in another namespace, so + * we have to clear all information in the skb that could + * impact namespace isolation. 
+ */ +int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ +	skb_orphan(skb); +	nf_reset(skb); + +	if (unlikely(!(dev->flags & IFF_UP) || +		     (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) { +		atomic_long_inc(&dev->rx_dropped); +		kfree_skb(skb); +		return NET_RX_DROP; +	} +	skb_set_dev(skb, dev); +	skb->tstamp.tv64 = 0; +	skb->pkt_type = PACKET_HOST; +	skb->protocol = eth_type_trans(skb, dev); +	return netif_rx(skb); +} +EXPORT_SYMBOL_GPL(dev_forward_skb); + +static inline int deliver_skb(struct sk_buff *skb, +			      struct packet_type *pt_prev, +			      struct net_device *orig_dev) +{ +	atomic_inc(&skb->users); +	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} +  /*   *	Support routine. Sends outgoing frames to any network   *	taps currently in use. @@ -1343,13 +1544,8 @@ static inline void net_timestamp(struct sk_buff *skb)  static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)  {  	struct packet_type *ptype; - -#ifdef CONFIG_NET_CLS_ACT -	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) -		net_timestamp(skb); -#else -	net_timestamp(skb); -#endif +	struct sk_buff *skb2 = NULL; +	struct packet_type *pt_prev = NULL;  	rcu_read_lock();  	list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -1359,10 +1555,18 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)  		if ((ptype->dev == dev || !ptype->dev) &&  		    (ptype->af_packet_priv == NULL ||  		     (struct sock *)ptype->af_packet_priv != skb->sk)) { -			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); +			if (pt_prev) { +				deliver_skb(skb2, pt_prev, skb->dev); +				pt_prev = ptype; +				continue; +			} + +			skb2 = skb_clone(skb, GFP_ATOMIC);  			if (!skb2)  				break; +			net_timestamp_set(skb2); +  			/* skb->nh should be correctly  			   set by sender, so that the second statement is  			   just protection against buggy protocols. @@ -1374,18 +1578,81 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)  				if (net_ratelimit())  					printk(KERN_CRIT "protocol %04x is "  					       "buggy, dev %s\n", -					       skb2->protocol, dev->name); +					       ntohs(skb2->protocol), +					       dev->name);  				skb_reset_network_header(skb2);  			}  			skb2->transport_header = skb2->network_header;  			skb2->pkt_type = PACKET_OUTGOING; -			ptype->func(skb2, skb->dev, ptype, skb->dev); +			pt_prev = ptype;  		}  	} +	if (pt_prev) +		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);  	rcu_read_unlock();  } +/* + * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues + * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. + */ +int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) +{ +	int rc; + +	if (txq < 1 || txq > dev->num_tx_queues) +		return -EINVAL; + +	if (dev->reg_state == NETREG_REGISTERED) { +		ASSERT_RTNL(); + +		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, +						  txq); +		if (rc) +			return rc; + +		if (txq < dev->real_num_tx_queues) +			qdisc_reset_all_tx_gt(dev, txq); +	} + +	dev->real_num_tx_queues = txq; +	return 0; +} +EXPORT_SYMBOL(netif_set_real_num_tx_queues); + +#ifdef CONFIG_RPS +/** + *	netif_set_real_num_rx_queues - set actual number of RX queues used + *	@dev: Network device + *	@rxq: Actual number of RX queues + * + *	This must be called either with the rtnl_lock held or before + *	registration of the net device.  Returns 0 on success, or a + *	negative error code.  
If called before registration, it always + *	succeeds. + */ +int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) +{ +	int rc; + +	if (rxq < 1 || rxq > dev->num_rx_queues) +		return -EINVAL; + +	if (dev->reg_state == NETREG_REGISTERED) { +		ASSERT_RTNL(); + +		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, +						  rxq); +		if (rc) +			return rc; +	} + +	dev->real_num_rx_queues = rxq; +	return 0; +} +EXPORT_SYMBOL(netif_set_real_num_rx_queues); +#endif  static inline void __netif_reschedule(struct Qdisc *q)  { @@ -1394,8 +1661,9 @@ static inline void __netif_reschedule(struct Qdisc *q)  	local_irq_save(flags);  	sd = &__get_cpu_var(softnet_data); -	q->next_sched = sd->output_queue; -	sd->output_queue = q; +	q->next_sched = NULL; +	*sd->output_queue_tailp = q; +	sd->output_queue_tailp = &q->next_sched;  	raise_softirq_irqoff(NET_TX_SOFTIRQ);  	local_irq_restore(flags);  } @@ -1464,31 +1732,35 @@ void netif_device_attach(struct net_device *dev)  }  EXPORT_SYMBOL(netif_device_attach); -static bool can_checksum_protocol(unsigned long features, __be16 protocol) -{ -	return ((features & NETIF_F_GEN_CSUM) || -		((features & NETIF_F_IP_CSUM) && -		 protocol == htons(ETH_P_IP)) || -		((features & NETIF_F_IPV6_CSUM) && -		 protocol == htons(ETH_P_IPV6)) || -		((features & NETIF_F_FCOE_CRC) && -		 protocol == htons(ETH_P_FCOE))); -} - -static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) -{ -	if (can_checksum_protocol(dev->features, skb->protocol)) -		return true; - -	if (skb->protocol == htons(ETH_P_8021Q)) { -		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; -		if (can_checksum_protocol(dev->features & dev->vlan_features, -					  veh->h_vlan_encapsulated_proto)) -			return true; +/** + * skb_dev_set -- assign a new device to a buffer + * @skb: buffer for the new device + * @dev: network device + * + * If an skb is owned by a device already, we have to reset + * all data private to the namespace a device belongs to + * before assigning it a new device. + */ +#ifdef CONFIG_NET_NS +void skb_set_dev(struct sk_buff *skb, struct net_device *dev) +{ +	skb_dst_drop(skb); +	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) { +		secpath_reset(skb); +		nf_reset(skb); +		skb_init_secmark(skb); +		skb->mark = 0; +		skb->priority = 0; +		skb->nf_trace = 0; +		skb->ipvs_property = 0; +#ifdef CONFIG_NET_SCHED +		skb->tc_index = 0; +#endif  	} - -	return false; +	skb->dev = dev;  } +EXPORT_SYMBOL(skb_set_dev); +#endif /* CONFIG_NET_NS */  /*   * Invalidate hardware checksum when packet is to be mangled, and @@ -1507,7 +1779,7 @@ int skb_checksum_help(struct sk_buff *skb)  		goto out_set_summed;  	} -	offset = skb->csum_start - skb_headroom(skb); +	offset = skb_checksum_start_offset(skb);  	BUG_ON(offset >= skb_headlen(skb));  	csum = skb_checksum(skb, offset, skb->len - offset, 0); @@ -1527,6 +1799,7 @@ out_set_summed:  out:  	return ret;  } +EXPORT_SYMBOL(skb_checksum_help);  /**   *	skb_gso_segment - Perform segmentation on skb. 
@@ -1543,8 +1816,20 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)  	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);  	struct packet_type *ptype;  	__be16 type = skb->protocol; +	int vlan_depth = ETH_HLEN;  	int err; +	while (type == htons(ETH_P_8021Q)) { +		struct vlan_hdr *vh; + +		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) +			return ERR_PTR(-EINVAL); + +		vh = (struct vlan_hdr *)(skb->data + vlan_depth); +		type = vh->h_vlan_encapsulated_proto; +		vlan_depth += VLAN_HLEN; +	} +  	skb_reset_mac_header(skb);  	skb->mac_len = skb->network_header - skb->mac_header;  	__skb_pull(skb, skb->mac_len); @@ -1556,8 +1841,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)  		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)  			dev->ethtool_ops->get_drvinfo(dev, &info); -		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d " -			"ip_summed=%d", +		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",  		     info.driver, dev ? dev->features : 0L,  		     skb->sk ? skb->sk->sk_route_caps : 0L,  		     skb->len, skb->data_len, skb->ip_summed); @@ -1589,7 +1873,6 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)  	return segs;  } -  EXPORT_SYMBOL(skb_gso_segment);  /* Take action when hardware reception checksum errors are detected. */ @@ -1610,18 +1893,27 @@ EXPORT_SYMBOL(netdev_rx_csum_fault);   * 2. No high memory really exists on this machine.   */ -static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)  {  #ifdef CONFIG_HIGHMEM  	int i; +	if (!(dev->features & NETIF_F_HIGHDMA)) { +		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +			if (PageHighMem(skb_shinfo(skb)->frags[i].page)) +				return 1; +	} -	if (dev->features & NETIF_F_HIGHDMA) -		return 0; - -	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) -		if (PageHighMem(skb_shinfo(skb)->frags[i].page)) -			return 1; +	if (PCI_DMA_BUS_IS_PHYS) { +		struct device *pdev = dev->dev.parent; +		if (!pdev) +			return 0; +		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { +			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page); +			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) +				return 1; +		} +	}  #endif  	return 0;  } @@ -1652,16 +1944,14 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)  /**   *	dev_gso_segment - Perform emulated hardware segmentation on skb.   *	@skb: buffer to segment + *	@features: device features as applicable to this skb   *   *	This function segments the given skb and stores the list of segments   *	in skb->next.   */ -static int dev_gso_segment(struct sk_buff *skb) +static int dev_gso_segment(struct sk_buff *skb, int features)  { -	struct net_device *dev = skb->dev;  	struct sk_buff *segs; -	int features = dev->features & ~(illegal_highdma(dev, skb) ? -					 NETIF_F_SG : 0);  	segs = skb_gso_segment(skb, features); @@ -1679,22 +1969,97 @@ static int dev_gso_segment(struct sk_buff *skb)  	return 0;  } +/* + * Try to orphan skb early, right before transmission by the device. + * We cannot orphan skb if tx timestamp is requested or the sk-reference + * is needed on driver level for other reasons, e.g. see net/can/raw.c + */ +static inline void skb_orphan_try(struct sk_buff *skb) +{ +	struct sock *sk = skb->sk; + +	if (sk && !skb_shinfo(skb)->tx_flags) { +		/* skb_tx_hash() wont be able to get sk. 
+		 * We copy sk_hash into skb->rxhash +		 */ +		if (!skb->rxhash) +			skb->rxhash = sk->sk_hash; +		skb_orphan(skb); +	} +} + +static bool can_checksum_protocol(unsigned long features, __be16 protocol) +{ +	return ((features & NETIF_F_GEN_CSUM) || +		((features & NETIF_F_V4_CSUM) && +		 protocol == htons(ETH_P_IP)) || +		((features & NETIF_F_V6_CSUM) && +		 protocol == htons(ETH_P_IPV6)) || +		((features & NETIF_F_FCOE_CRC) && +		 protocol == htons(ETH_P_FCOE))); +} + +static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features) +{ +	if (!can_checksum_protocol(protocol, features)) { +		features &= ~NETIF_F_ALL_CSUM; +		features &= ~NETIF_F_SG; +	} else if (illegal_highdma(skb->dev, skb)) { +		features &= ~NETIF_F_SG; +	} + +	return features; +} + +int netif_skb_features(struct sk_buff *skb) +{ +	__be16 protocol = skb->protocol; +	int features = skb->dev->features; + +	if (protocol == htons(ETH_P_8021Q)) { +		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; +		protocol = veh->h_vlan_encapsulated_proto; +	} else if (!vlan_tx_tag_present(skb)) { +		return harmonize_features(skb, protocol, features); +	} + +	features &= skb->dev->vlan_features; + +	if (protocol != htons(ETH_P_8021Q)) { +		return harmonize_features(skb, protocol, features); +	} else { +		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | +				NETIF_F_GEN_CSUM; +		return harmonize_features(skb, protocol, features); +	} +} +EXPORT_SYMBOL(netif_skb_features); + +/* + * Returns true if either: + *	1. skb has frag_list and the device doesn't support FRAGLIST, or + *	2. skb is fragmented and the device does not support SG, or if + *	   at least one of fragments is in highmem and device does not + *	   support DMA from it. + */ +static inline int skb_needs_linearize(struct sk_buff *skb, +				      int features) +{ +	return skb_is_nonlinear(skb) && +			((skb_has_frag_list(skb) && +				!(features & NETIF_F_FRAGLIST)) || +			(skb_shinfo(skb)->nr_frags && +				!(features & NETIF_F_SG))); +} +  int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,  			struct netdev_queue *txq)  {  	const struct net_device_ops *ops = dev->netdev_ops; -	int rc; +	int rc = NETDEV_TX_OK;  	if (likely(!skb->next)) { -		if (!list_empty(&ptype_all)) -			dev_queue_xmit_nit(skb, dev); - -		if (netif_needs_gso(dev, skb)) { -			if (unlikely(dev_gso_segment(skb))) -				goto out_kfree_skb; -			if (skb->next) -				goto gso; -		} +		int features;  		/*  		 * If device doesnt need skb->dst, release it right now while @@ -1703,23 +2068,49 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,  		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)  			skb_dst_drop(skb); +		if (!list_empty(&ptype_all)) +			dev_queue_xmit_nit(skb, dev); + +		skb_orphan_try(skb); + +		features = netif_skb_features(skb); + +		if (vlan_tx_tag_present(skb) && +		    !(features & NETIF_F_HW_VLAN_TX)) { +			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); +			if (unlikely(!skb)) +				goto out; + +			skb->vlan_tci = 0; +		} + +		if (netif_needs_gso(skb, features)) { +			if (unlikely(dev_gso_segment(skb, features))) +				goto out_kfree_skb; +			if (skb->next) +				goto gso; +		} else { +			if (skb_needs_linearize(skb, features) && +			    __skb_linearize(skb)) +				goto out_kfree_skb; + +			/* If packet is not checksummed and device does not +			 * support checksumming for this protocol, complete +			 * checksumming here. 
+			 */ +			if (skb->ip_summed == CHECKSUM_PARTIAL) { +				skb_set_transport_header(skb, +					skb_checksum_start_offset(skb)); +				if (!(features & NETIF_F_ALL_CSUM) && +				     skb_checksum_help(skb)) +					goto out_kfree_skb; +			} +		} +  		rc = ops->ndo_start_xmit(skb, dev); -		if (rc == 0) +		trace_net_dev_xmit(skb, rc); +		if (rc == NETDEV_TX_OK)  			txq_trans_update(txq); -		/* -		 * TODO: if skb_orphan() was called by -		 * dev->hard_start_xmit() (for example, the unmodified -		 * igb driver does that; bnx2 doesn't), then -		 * skb_tx_software_timestamp() will be unable to send -		 * back the time stamp. -		 * -		 * How can this be prevented? Always create another -		 * reference to the socket before calling -		 * dev->hard_start_xmit()? Prevent that skb_orphan() -		 * does anything in dev->hard_start_xmit() by clearing -		 * the skb destructor before the call and restoring it -		 * afterwards, then doing the skb_orphan() ourselves? -		 */  		return rc;  	} @@ -1729,8 +2120,19 @@ gso:  		skb->next = nskb->next;  		nskb->next = NULL; + +		/* +		 * If device doesnt need nskb->dst, release it right now while +		 * its hot in this cpu cache +		 */ +		if (dev->priv_flags & IFF_XMIT_DST_RELEASE) +			skb_dst_drop(nskb); +  		rc = ops->ndo_start_xmit(nskb, dev); -		if (unlikely(rc)) { +		trace_net_dev_xmit(nskb, rc); +		if (unlikely(rc != NETDEV_TX_OK)) { +			if (rc & ~NETDEV_TX_MASK) +				goto out_kfree_gso_skb;  			nskb->next = skb->next;  			skb->next = nskb;  			return rc; @@ -1740,52 +2142,195 @@ gso:  			return NETDEV_TX_BUSY;  	} while (skb->next); -	skb->destructor = DEV_GSO_CB(skb)->destructor; - +out_kfree_gso_skb: +	if (likely(skb->next == NULL)) +		skb->destructor = DEV_GSO_CB(skb)->destructor;  out_kfree_skb:  	kfree_skb(skb); -	return 0; +out: +	return rc;  } -static u32 skb_tx_hashrnd; +static u32 hashrnd __read_mostly; -u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. 
+ */ +u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, +		  unsigned int num_tx_queues)  {  	u32 hash;  	if (skb_rx_queue_recorded(skb)) {  		hash = skb_get_rx_queue(skb); -		while (unlikely (hash >= dev->real_num_tx_queues)) -			hash -= dev->real_num_tx_queues; +		while (unlikely(hash >= num_tx_queues)) +			hash -= num_tx_queues;  		return hash;  	}  	if (skb->sk && skb->sk->sk_hash)  		hash = skb->sk->sk_hash;  	else -		hash = skb->protocol; +		hash = (__force u16) skb->protocol ^ skb->rxhash; +	hash = jhash_1word(hash, hashrnd); + +	return (u16) (((u64) hash * num_tx_queues) >> 32); +} +EXPORT_SYMBOL(__skb_tx_hash); + +static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ +	if (unlikely(queue_index >= dev->real_num_tx_queues)) { +		if (net_ratelimit()) { +			pr_warning("%s selects TX queue %d, but " +				"real number of TX queues is %d\n", +				dev->name, queue_index, dev->real_num_tx_queues); +		} +		return 0; +	} +	return queue_index; +} -	hash = jhash_1word(hash, skb_tx_hashrnd); +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS +	struct xps_dev_maps *dev_maps; +	struct xps_map *map; +	int queue_index = -1; + +	rcu_read_lock(); +	dev_maps = rcu_dereference(dev->xps_maps); +	if (dev_maps) { +		map = rcu_dereference( +		    dev_maps->cpu_map[raw_smp_processor_id()]); +		if (map) { +			if (map->len == 1) +				queue_index = map->queues[0]; +			else { +				u32 hash; +				if (skb->sk && skb->sk->sk_hash) +					hash = skb->sk->sk_hash; +				else +					hash = (__force u16) skb->protocol ^ +					    skb->rxhash; +				hash = jhash_1word(hash, hashrnd); +				queue_index = map->queues[ +				    ((u64)hash * map->len) >> 32]; +			} +			if (unlikely(queue_index >= dev->real_num_tx_queues)) +				queue_index = -1; +		} +	} +	rcu_read_unlock(); -	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); +	return queue_index; +#else +	return -1; +#endif  } -EXPORT_SYMBOL(skb_tx_hash);  static struct netdev_queue *dev_pick_tx(struct net_device *dev,  					struct sk_buff *skb)  { +	int queue_index;  	const struct net_device_ops *ops = dev->netdev_ops; -	u16 queue_index = 0; -	if (ops->ndo_select_queue) +	if (dev->real_num_tx_queues == 1) +		queue_index = 0; +	else if (ops->ndo_select_queue) {  		queue_index = ops->ndo_select_queue(dev, skb); -	else if (dev->real_num_tx_queues > 1) -		queue_index = skb_tx_hash(dev, skb); +		queue_index = dev_cap_txqueue(dev, queue_index); +	} else { +		struct sock *sk = skb->sk; +		queue_index = sk_tx_queue_get(sk); + +		if (queue_index < 0 || skb->ooo_okay || +		    queue_index >= dev->real_num_tx_queues) { +			int old_index = queue_index; + +			queue_index = get_xps_queue(dev, skb); +			if (queue_index < 0) +				queue_index = skb_tx_hash(dev, skb); + +			if (queue_index != old_index && sk) { +				struct dst_entry *dst = +				    rcu_dereference_check(sk->sk_dst_cache, 1); + +				if (dst && skb_dst(skb) == dst) +					sk_tx_queue_set(sk, queue_index); +			} +		} +	}  	skb_set_queue_mapping(skb, queue_index);  	return netdev_get_tx_queue(dev, queue_index);  } +static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, +				 struct net_device *dev, +				 struct netdev_queue *txq) +{ +	spinlock_t *root_lock = qdisc_lock(q); +	bool contended = qdisc_is_running(q); +	int rc; + +	/* +	 * Heuristic to force contended enqueues to serialize on a +	 * separate lock before trying to get qdisc main lock. 
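The expression ((u64) hash * num_tx_queues) >> 32 in __skb_tx_hash() above scales a 32-bit hash into the range [0, num_tx_queues) without a division. A standalone userspace sketch of just that arithmetic (an illustration, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Multiply-shift range reduction: treat the 32-bit hash as a fraction
 * of 2^32, multiply by the queue count and keep the integer part.
 */
static uint16_t hash_to_queue(uint32_t hash, unsigned int num_tx_queues)
{
	return (uint16_t)(((uint64_t)hash * num_tx_queues) >> 32);
}

int main(void)
{
	/* half of the 32-bit range maps to queue 4 of 8, the maximum
	 * hash maps to the last queue (7)
	 */
	printf("%u %u\n", hash_to_queue(0x80000000u, 8),
	       hash_to_queue(0xffffffffu, 8));
	return 0;
}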
+	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often +	 * and dequeue packets faster. +	 */ +	if (unlikely(contended)) +		spin_lock(&q->busylock); + +	spin_lock(root_lock); +	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { +		kfree_skb(skb); +		rc = NET_XMIT_DROP; +	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && +		   qdisc_run_begin(q)) { +		/* +		 * This is a work-conserving queue; there are no old skbs +		 * waiting to be sent out; and the qdisc is not running - +		 * xmit the skb directly. +		 */ +		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) +			skb_dst_force(skb); + +		qdisc_skb_cb(skb)->pkt_len = skb->len; +		qdisc_bstats_update(q, skb); + +		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { +			if (unlikely(contended)) { +				spin_unlock(&q->busylock); +				contended = false; +			} +			__qdisc_run(q); +		} else +			qdisc_run_end(q); + +		rc = NET_XMIT_SUCCESS; +	} else { +		skb_dst_force(skb); +		rc = qdisc_enqueue_root(skb, q); +		if (qdisc_run_begin(q)) { +			if (unlikely(contended)) { +				spin_unlock(&q->busylock); +				contended = false; +			} +			__qdisc_run(q); +		} +	} +	spin_unlock(root_lock); +	if (unlikely(contended)) +		spin_unlock(&q->busylock); +	return rc; +} + +static DEFINE_PER_CPU(int, xmit_recursion); +#define RECURSION_LIMIT 10 +  /**   *	dev_queue_xmit - transmit a buffer   *	@skb: buffer to transmit @@ -1818,60 +2363,20 @@ int dev_queue_xmit(struct sk_buff *skb)  	struct Qdisc *q;  	int rc = -ENOMEM; -	/* GSO will handle the following emulations directly. */ -	if (netif_needs_gso(dev, skb)) -		goto gso; - -	if (skb_has_frags(skb) && -	    !(dev->features & NETIF_F_FRAGLIST) && -	    __skb_linearize(skb)) -		goto out_kfree_skb; - -	/* Fragmented skb is linearized if device does not support SG, -	 * or if at least one of fragments is in highmem and device -	 * does not support DMA from it. -	 */ -	if (skb_shinfo(skb)->nr_frags && -	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && -	    __skb_linearize(skb)) -		goto out_kfree_skb; - -	/* If packet is not checksummed and device does not support -	 * checksumming for this protocol, complete checksumming here. -	 */ -	if (skb->ip_summed == CHECKSUM_PARTIAL) { -		skb_set_transport_header(skb, skb->csum_start - -					      skb_headroom(skb)); -		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb)) -			goto out_kfree_skb; -	} - -gso:  	/* Disable soft irqs for various locks below. Also  	 * stops preemption for RCU.  	 
*/  	rcu_read_lock_bh();  	txq = dev_pick_tx(dev, skb); -	q = rcu_dereference(txq->qdisc); +	q = rcu_dereference_bh(txq->qdisc);  #ifdef CONFIG_NET_CLS_ACT -	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); +	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);  #endif +	trace_net_dev_queue(skb);  	if (q->enqueue) { -		spinlock_t *root_lock = qdisc_lock(q); - -		spin_lock(root_lock); - -		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { -			kfree_skb(skb); -			rc = NET_XMIT_DROP; -		} else { -			rc = qdisc_enqueue_root(skb, q); -			qdisc_run(q); -		} -		spin_unlock(root_lock); - +		rc = __dev_xmit_skb(skb, q, dev, txq);  		goto out;  	} @@ -1892,11 +2397,16 @@ gso:  		if (txq->xmit_lock_owner != cpu) { +			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) +				goto recursion_alert; +  			HARD_TX_LOCK(dev, txq, cpu);  			if (!netif_tx_queue_stopped(txq)) { -				rc = 0; -				if (!dev_hard_start_xmit(skb, dev, txq)) { +				__this_cpu_inc(xmit_recursion); +				rc = dev_hard_start_xmit(skb, dev, txq); +				__this_cpu_dec(xmit_recursion); +				if (dev_xmit_complete(rc)) {  					HARD_TX_UNLOCK(dev, txq);  					goto out;  				} @@ -1907,7 +2417,9 @@ gso:  				       "queue packet!\n", dev->name);  		} else {  			/* Recursion is detected! It is possible, -			 * unfortunately */ +			 * unfortunately +			 */ +recursion_alert:  			if (net_ratelimit())  				printk(KERN_CRIT "Dead loop on virtual device "  				       "%s, fix it urgently!\n", dev->name); @@ -1917,13 +2429,13 @@ gso:  	rc = -ENETDOWN;  	rcu_read_unlock_bh(); -out_kfree_skb:  	kfree_skb(skb);  	return rc;  out:  	rcu_read_unlock_bh();  	return rc;  } +EXPORT_SYMBOL(dev_queue_xmit);  /*======================================================================= @@ -1931,11 +2443,267 @@ out:    =======================================================================*/  int netdev_max_backlog __read_mostly = 1000; +int netdev_tstamp_prequeue __read_mostly = 1;  int netdev_budget __read_mostly = 300;  int weight_p __read_mostly = 64;            /* old backlog weight */ -DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; +/* Called with irq disabled */ +static inline void ____napi_schedule(struct softnet_data *sd, +				     struct napi_struct *napi) +{ +	list_add_tail(&napi->poll_list, &sd->poll_list); +	__raise_softirq_irqoff(NET_RX_SOFTIRQ); +} + +/* + * __skb_get_rxhash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers. Returns a non-zero hash number on success + * and 0 on failure. 
+ */ +__u32 __skb_get_rxhash(struct sk_buff *skb) +{ +	int nhoff, hash = 0, poff; +	struct ipv6hdr *ip6; +	struct iphdr *ip; +	u8 ip_proto; +	u32 addr1, addr2, ihl; +	union { +		u32 v32; +		u16 v16[2]; +	} ports; + +	nhoff = skb_network_offset(skb); + +	switch (skb->protocol) { +	case __constant_htons(ETH_P_IP): +		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) +			goto done; + +		ip = (struct iphdr *) (skb->data + nhoff); +		if (ip->frag_off & htons(IP_MF | IP_OFFSET)) +			ip_proto = 0; +		else +			ip_proto = ip->protocol; +		addr1 = (__force u32) ip->saddr; +		addr2 = (__force u32) ip->daddr; +		ihl = ip->ihl; +		break; +	case __constant_htons(ETH_P_IPV6): +		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) +			goto done; + +		ip6 = (struct ipv6hdr *) (skb->data + nhoff); +		ip_proto = ip6->nexthdr; +		addr1 = (__force u32) ip6->saddr.s6_addr32[3]; +		addr2 = (__force u32) ip6->daddr.s6_addr32[3]; +		ihl = (40 >> 2); +		break; +	default: +		goto done; +	} + +	ports.v32 = 0; +	poff = proto_ports_offset(ip_proto); +	if (poff >= 0) { +		nhoff += ihl * 4 + poff; +		if (pskb_may_pull(skb, nhoff + 4)) { +			ports.v32 = * (__force u32 *) (skb->data + nhoff); +			if (ports.v16[1] < ports.v16[0]) +				swap(ports.v16[0], ports.v16[1]); +		} +	} + +	/* get a consistent hash (same value on both flow directions) */ +	if (addr2 < addr1) +		swap(addr1, addr2); + +	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); +	if (!hash) +		hash = 1; + +done: +	return hash; +} +EXPORT_SYMBOL(__skb_get_rxhash); + +#ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); + +/* + * get_rps_cpu is called from netif_receive_skb and returns the target + * CPU from the RPS map of the receiving queue for a given skb. + * rcu_read_lock must be held on entry. + */ +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, +		       struct rps_dev_flow **rflowp) +{ +	struct netdev_rx_queue *rxqueue; +	struct rps_map *map; +	struct rps_dev_flow_table *flow_table; +	struct rps_sock_flow_table *sock_flow_table; +	int cpu = -1; +	u16 tcpu; + +	if (skb_rx_queue_recorded(skb)) { +		u16 index = skb_get_rx_queue(skb); +		if (unlikely(index >= dev->real_num_rx_queues)) { +			WARN_ONCE(dev->real_num_rx_queues > 1, +				  "%s received packet on queue %u, but number " +				  "of RX queues is %u\n", +				  dev->name, index, dev->real_num_rx_queues); +			goto done; +		} +		rxqueue = dev->_rx + index; +	} else +		rxqueue = dev->_rx; + +	map = rcu_dereference(rxqueue->rps_map); +	if (map) { +		if (map->len == 1) { +			tcpu = map->cpus[0]; +			if (cpu_online(tcpu)) +				cpu = tcpu; +			goto done; +		} +	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) { +		goto done; +	} + +	skb_reset_network_header(skb); +	if (!skb_get_rxhash(skb)) +		goto done; + +	flow_table = rcu_dereference(rxqueue->rps_flow_table); +	sock_flow_table = rcu_dereference(rps_sock_flow_table); +	if (flow_table && sock_flow_table) { +		u16 next_cpu; +		struct rps_dev_flow *rflow; + +		rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; +		tcpu = rflow->cpu; + +		next_cpu = sock_flow_table->ents[skb->rxhash & +		    sock_flow_table->mask]; +		/* +		 * If the desired CPU (where last recvmsg was done) is +		 * different from current CPU (one in the rx-queue flow +		 * table entry), switch if one of the following holds: +		 *   - Current CPU is unset (equal to RPS_NO_CPU). +		 *   - Current CPU is offline. 
+		 *   - The current CPU's queue tail has advanced beyond the +		 *     last packet that was enqueued using this table entry. +		 *     This guarantees that all previous packets for the flow +		 *     have been dequeued, thus preserving in order delivery. +		 */ +		if (unlikely(tcpu != next_cpu) && +		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || +		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head - +		      rflow->last_qtail)) >= 0)) { +			tcpu = rflow->cpu = next_cpu; +			if (tcpu != RPS_NO_CPU) +				rflow->last_qtail = per_cpu(softnet_data, +				    tcpu).input_queue_head; +		} +		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { +			*rflowp = rflow; +			cpu = tcpu; +			goto done; +		} +	} + +	if (map) { +		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; + +		if (cpu_online(tcpu)) { +			cpu = tcpu; +			goto done; +		} +	} + +done: +	return cpu; +} + +/* Called from hardirq (IPI) context */ +static void rps_trigger_softirq(void *data) +{ +	struct softnet_data *sd = data; + +	____napi_schedule(sd, &sd->backlog); +	sd->received_rps++; +} + +#endif /* CONFIG_RPS */ + +/* + * Check if this softnet_data structure is another cpu one + * If yes, queue it to our IPI list and return 1 + * If no, return 0 + */ +static int rps_ipi_queued(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS +	struct softnet_data *mysd = &__get_cpu_var(softnet_data); + +	if (sd != mysd) { +		sd->rps_ipi_next = mysd->rps_ipi_list; +		mysd->rps_ipi_list = sd; + +		__raise_softirq_irqoff(NET_RX_SOFTIRQ); +		return 1; +	} +#endif /* CONFIG_RPS */ +	return 0; +} + +/* + * enqueue_to_backlog is called to queue an skb to a per CPU backlog + * queue (may be a remote CPU queue). + */ +static int enqueue_to_backlog(struct sk_buff *skb, int cpu, +			      unsigned int *qtail) +{ +	struct softnet_data *sd; +	unsigned long flags; + +	sd = &per_cpu(softnet_data, cpu); + +	local_irq_save(flags); + +	rps_lock(sd); +	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { +		if (skb_queue_len(&sd->input_pkt_queue)) { +enqueue: +			__skb_queue_tail(&sd->input_pkt_queue, skb); +			input_queue_tail_incr_save(sd, qtail); +			rps_unlock(sd); +			local_irq_restore(flags); +			return NET_RX_SUCCESS; +		} + +		/* Schedule NAPI for backlog device +		 * We can use non atomic operation since we own the queue lock +		 */ +		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { +			if (!rps_ipi_queued(sd)) +				____napi_schedule(sd, &sd->backlog); +		} +		goto enqueue; +	} + +	sd->dropped++; +	rps_unlock(sd); + +	local_irq_restore(flags); + +	atomic_long_inc(&skb->dev->rx_dropped); +	kfree_skb(skb); +	return NET_RX_DROP; +}  /**   *	netif_rx	-	post buffer to the network code @@ -1954,42 +2722,43 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };  int netif_rx(struct sk_buff *skb)  { -	struct softnet_data *queue; -	unsigned long flags; +	int ret;  	/* if netpoll wants it, pretend we never saw it */  	if (netpoll_rx(skb))  		return NET_RX_DROP; -	if (!skb->tstamp.tv64) -		net_timestamp(skb); +	if (netdev_tstamp_prequeue) +		net_timestamp_check(skb); -	/* -	 * The code is rearranged so that the path is the most -	 * short when CPU is congested, but is still operating. 
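__skb_get_rxhash() above makes the flow hash direction-independent by putting the address pair and the port pair into a canonical order before mixing, so both directions of a connection select the same RPS CPU. A simplified userspace sketch of that ordering step (illustrative; the kernel then mixes with jhash_3words() and a boot-time random seed):

#include <stdint.h>
#include <stdio.h>

/* Canonicalize (saddr, daddr) and (sport, dport) so that A->B and
 * B->A feed identical values to the hash; the XOR mix below is a
 * stand-in for jhash_3words(addr1, addr2, ports, hashrnd).
 */
static uint32_t flow_hash_input(uint32_t saddr, uint32_t daddr,
				uint16_t sport, uint16_t dport)
{
	uint32_t ports;

	if (daddr < saddr) {
		uint32_t tmp = saddr;
		saddr = daddr;
		daddr = tmp;
	}
	if (dport < sport) {
		uint16_t tmp = sport;
		sport = dport;
		dport = tmp;
	}
	ports = ((uint32_t)sport << 16) | dport;
	return saddr ^ daddr ^ ports;
}

int main(void)
{
	/* both directions of the same flow print the same value */
	printf("%08x\n", flow_hash_input(0x0a000001, 0x0a000002, 40000, 80));
	printf("%08x\n", flow_hash_input(0x0a000002, 0x0a000001, 80, 40000));
	return 0;
}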
-	 */ -	local_irq_save(flags); -	queue = &__get_cpu_var(softnet_data); +	trace_netif_rx(skb); +#ifdef CONFIG_RPS +	{ +		struct rps_dev_flow voidflow, *rflow = &voidflow; +		int cpu; -	__get_cpu_var(netdev_rx_stat).total++; -	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { -		if (queue->input_pkt_queue.qlen) { -enqueue: -			__skb_queue_tail(&queue->input_pkt_queue, skb); -			local_irq_restore(flags); -			return NET_RX_SUCCESS; -		} +		preempt_disable(); +		rcu_read_lock(); -		napi_schedule(&queue->backlog); -		goto enqueue; -	} +		cpu = get_rps_cpu(skb->dev, skb, &rflow); +		if (cpu < 0) +			cpu = smp_processor_id(); -	__get_cpu_var(netdev_rx_stat).dropped++; -	local_irq_restore(flags); +		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); -	kfree_skb(skb); -	return NET_RX_DROP; +		rcu_read_unlock(); +		preempt_enable(); +	} +#else +	{ +		unsigned int qtail; +		ret = enqueue_to_backlog(skb, get_cpu(), &qtail); +		put_cpu(); +	} +#endif +	return ret;  } +EXPORT_SYMBOL(netif_rx);  int netif_rx_ni(struct sk_buff *skb)  { @@ -2003,7 +2772,6 @@ int netif_rx_ni(struct sk_buff *skb)  	return err;  } -  EXPORT_SYMBOL(netif_rx_ni);  static void net_tx_action(struct softirq_action *h) @@ -2023,6 +2791,7 @@ static void net_tx_action(struct softirq_action *h)  			clist = clist->next;  			WARN_ON(atomic_read(&skb->users)); +			trace_kfree_skb(skb, net_tx_action);  			__kfree_skb(skb);  		}  	} @@ -2033,6 +2802,7 @@ static void net_tx_action(struct softirq_action *h)  		local_irq_disable();  		head = sd->output_queue;  		sd->output_queue = NULL; +		sd->output_queue_tailp = &sd->output_queue;  		local_irq_enable();  		while (head) { @@ -2062,72 +2832,12 @@ static void net_tx_action(struct softirq_action *h)  	}  } -static inline int deliver_skb(struct sk_buff *skb, -			      struct packet_type *pt_prev, -			      struct net_device *orig_dev) -{ -	atomic_inc(&skb->users); -	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); -} - -#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) - -#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) +#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ +    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))  /* This hook is defined here for ATM LANE */  int (*br_fdb_test_addr_hook)(struct net_device *dev,  			     unsigned char *addr) __read_mostly; -EXPORT_SYMBOL(br_fdb_test_addr_hook); -#endif - -/* - * If bridge module is loaded call bridging hook. - *  returns NULL if packet was consumed. 
- */ -struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, -					struct sk_buff *skb) __read_mostly; -EXPORT_SYMBOL(br_handle_frame_hook); - -static inline struct sk_buff *handle_bridge(struct sk_buff *skb, -					    struct packet_type **pt_prev, int *ret, -					    struct net_device *orig_dev) -{ -	struct net_bridge_port *port; - -	if (skb->pkt_type == PACKET_LOOPBACK || -	    (port = rcu_dereference(skb->dev->br_port)) == NULL) -		return skb; - -	if (*pt_prev) { -		*ret = deliver_skb(skb, *pt_prev, orig_dev); -		*pt_prev = NULL; -	} - -	return br_handle_frame_hook(port, skb); -} -#else -#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb) -#endif - -#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) -struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly; -EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook); - -static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, -					     struct packet_type **pt_prev, -					     int *ret, -					     struct net_device *orig_dev) -{ -	if (skb->dev->macvlan_port == NULL) -		return skb; - -	if (*pt_prev) { -		*ret = deliver_skb(skb, *pt_prev, orig_dev); -		*pt_prev = NULL; -	} -	return macvlan_handle_frame_hook(skb); -} -#else -#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb) +EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);  #endif  #ifdef CONFIG_NET_CLS_ACT @@ -2139,26 +2849,23 @@ static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,   * the ingress scheduler, you just cant add policies on ingress.   *   */ -static int ing_filter(struct sk_buff *skb) +static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)  {  	struct net_device *dev = skb->dev;  	u32 ttl = G_TC_RTTL(skb->tc_verd); -	struct netdev_queue *rxq;  	int result = TC_ACT_OK;  	struct Qdisc *q; -	if (MAX_RED_LOOP < ttl++) { -		printk(KERN_WARNING -		       "Redir loop detected Dropping packet (%d->%d)\n", -		       skb->iif, dev->ifindex); +	if (unlikely(MAX_RED_LOOP < ttl++)) { +		if (net_ratelimit()) +			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n", +			       skb->skb_iif, dev->ifindex);  		return TC_ACT_SHOT;  	}  	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);  	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); -	rxq = &dev->rx_queue; -  	q = rxq->qdisc;  	if (q != &noop_qdisc) {  		spin_lock(qdisc_lock(q)); @@ -2174,18 +2881,17 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,  					 struct packet_type **pt_prev,  					 int *ret, struct net_device *orig_dev)  { -	if (skb->dev->rx_queue.qdisc == &noop_qdisc) +	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); + +	if (!rxq || rxq->qdisc == &noop_qdisc)  		goto out;  	if (*pt_prev) {  		*ret = deliver_skb(skb, *pt_prev, orig_dev);  		*pt_prev = NULL; -	} else { -		/* Huh? Why does turning on AF_PACKET affect this? */ -		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);  	} -	switch (ing_filter(skb)) { +	switch (ing_filter(skb, rxq)) {  	case TC_ACT_SHOT:  	case TC_ACT_STOLEN:  		kfree_skb(skb); @@ -2198,80 +2904,145 @@ out:  }  #endif -/* - * 	netif_nit_deliver - deliver received packets to network taps - * 	@skb: buffer +/** + *	netdev_rx_handler_register - register receive handler + *	@dev: device to register a handler for + *	@rx_handler: receive handler to register + *	@rx_handler_data: data pointer that is used by rx handler   * - * 	This function is used to deliver incoming packets to network - * 	taps. 
It should be used when the normal netif_receive_skb path - * 	is bypassed, for example because of VLAN acceleration. + *	Register a receive hander for a device. This handler will then be + *	called from __netif_receive_skb. A negative errno code is returned + *	on a failure. + * + *	The caller must hold the rtnl_mutex.   */ -void netif_nit_deliver(struct sk_buff *skb) +int netdev_rx_handler_register(struct net_device *dev, +			       rx_handler_func_t *rx_handler, +			       void *rx_handler_data)  { -	struct packet_type *ptype; +	ASSERT_RTNL(); -	if (list_empty(&ptype_all)) -		return; +	if (dev->rx_handler) +		return -EBUSY; -	skb_reset_network_header(skb); -	skb_reset_transport_header(skb); -	skb->mac_len = skb->network_header - skb->mac_header; +	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); +	rcu_assign_pointer(dev->rx_handler, rx_handler); -	rcu_read_lock(); -	list_for_each_entry_rcu(ptype, &ptype_all, list) { -		if (!ptype->dev || ptype->dev == skb->dev) -			deliver_skb(skb, ptype, skb->dev); -	} -	rcu_read_unlock(); +	return 0;  } +EXPORT_SYMBOL_GPL(netdev_rx_handler_register);  /** - *	netif_receive_skb - process receive buffer from network - *	@skb: buffer to process - * - *	netif_receive_skb() is the main receive data processing function. - *	It always succeeds. The buffer may be dropped during processing - *	for congestion control or by the protocol layers. + *	netdev_rx_handler_unregister - unregister receive handler + *	@dev: device to unregister a handler from   * - *	This function may only be called from softirq context and interrupts - *	should be enabled. + *	Unregister a receive hander from a device.   * - *	Return values (usually ignored): - *	NET_RX_SUCCESS: no congestion - *	NET_RX_DROP: packet was dropped + *	The caller must hold the rtnl_mutex.   */ -int netif_receive_skb(struct sk_buff *skb) +void netdev_rx_handler_unregister(struct net_device *dev) +{ + +	ASSERT_RTNL(); +	rcu_assign_pointer(dev->rx_handler, NULL); +	rcu_assign_pointer(dev->rx_handler_data, NULL); +} +EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); + +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb, +					      struct net_device *master) +{ +	if (skb->pkt_type == PACKET_HOST) { +		u16 *dest = (u16 *) eth_hdr(skb)->h_dest; + +		memcpy(dest, master->dev_addr, ETH_ALEN); +	} +} + +/* On bonding slaves other than the currently active slave, suppress + * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and + * ARP on active-backup slaves with arp_validate enabled. + */ +int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master) +{ +	struct net_device *dev = skb->dev; + +	if (master->priv_flags & IFF_MASTER_ARPMON) +		dev->last_rx = jiffies; + +	if ((master->priv_flags & IFF_MASTER_ALB) && +	    (master->priv_flags & IFF_BRIDGE_PORT)) { +		/* Do address unmangle. The local destination address +		 * will be always the one master has. Provides the right +		 * functionality in a bridge. 
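netdev_rx_handler_register() above is the single hook that replaces the dedicated bridge and macvlan entry points this patch removes: __netif_receive_skb() invokes the handler once per frame and stops delivery when it returns NULL. A sketch of a hypothetical user (struct my_port, my_handler and my_attach are made up; the handler signature is the one introduced here):

/* Illustrative rx_handler: count frames on the port, then let them continue. */
static struct sk_buff *my_handler(struct sk_buff *skb)
{
	struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);

	port->rx_frames++;
	return skb;		/* return NULL instead if the handler consumed the skb */
}

static int my_attach(struct net_device *dev, struct my_port *port)
{
	int err;

	rtnl_lock();		/* netdev_rx_handler_register() requires RTNL */
	err = netdev_rx_handler_register(dev, my_handler, port);
	rtnl_unlock();
	return err;
}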
+		 */ +		skb_bond_set_mac_by_master(skb, master); +	} + +	if (dev->priv_flags & IFF_SLAVE_INACTIVE) { +		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && +		    skb->protocol == __cpu_to_be16(ETH_P_ARP)) +			return 0; + +		if (master->priv_flags & IFF_MASTER_ALB) { +			if (skb->pkt_type != PACKET_BROADCAST && +			    skb->pkt_type != PACKET_MULTICAST) +				return 0; +		} +		if (master->priv_flags & IFF_MASTER_8023AD && +		    skb->protocol == __cpu_to_be16(ETH_P_SLOW)) +			return 0; + +		return 1; +	} +	return 0; +} +EXPORT_SYMBOL(__skb_bond_should_drop); + +static int __netif_receive_skb(struct sk_buff *skb)  {  	struct packet_type *ptype, *pt_prev; +	rx_handler_func_t *rx_handler;  	struct net_device *orig_dev; +	struct net_device *master;  	struct net_device *null_or_orig; +	struct net_device *orig_or_bond;  	int ret = NET_RX_DROP;  	__be16 type; -	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb)) -		return NET_RX_SUCCESS; +	if (!netdev_tstamp_prequeue) +		net_timestamp_check(skb); + +	trace_netif_receive_skb(skb);  	/* if we've gotten here through NAPI, check netpoll */  	if (netpoll_receive_skb(skb))  		return NET_RX_DROP; -	if (!skb->tstamp.tv64) -		net_timestamp(skb); - -	if (!skb->iif) -		skb->iif = skb->dev->ifindex; +	if (!skb->skb_iif) +		skb->skb_iif = skb->dev->ifindex; +	/* +	 * bonding note: skbs received on inactive slaves should only +	 * be delivered to pkt handlers that are exact matches.  Also +	 * the deliver_no_wcard flag will be set.  If packet handlers +	 * are sensitive to duplicate packets these skbs will need to +	 * be dropped at the handler. +	 */  	null_or_orig = NULL;  	orig_dev = skb->dev; -	if (orig_dev->master) { -		if (skb_bond_should_drop(skb)) +	master = ACCESS_ONCE(orig_dev->master); +	if (skb->deliver_no_wcard) +		null_or_orig = orig_dev; +	else if (master) { +		if (skb_bond_should_drop(skb, master)) { +			skb->deliver_no_wcard = 1;  			null_or_orig = orig_dev; /* deliver only exact match */ -		else -			skb->dev = orig_dev->master; +		} else +			skb->dev = master;  	} -	__get_cpu_var(netdev_rx_stat).total++; - +	__this_cpu_inc(softnet_data.processed);  	skb_reset_network_header(skb);  	skb_reset_transport_header(skb);  	skb->mac_len = skb->network_header - skb->mac_header; @@ -2303,21 +3074,48 @@ int netif_receive_skb(struct sk_buff *skb)  ncls:  #endif -	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev); -	if (!skb) -		goto out; -	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev); -	if (!skb) -		goto out; +	/* Handle special case of bridge or macvlan */ +	rx_handler = rcu_dereference(skb->dev->rx_handler); +	if (rx_handler) { +		if (pt_prev) { +			ret = deliver_skb(skb, pt_prev, orig_dev); +			pt_prev = NULL; +		} +		skb = rx_handler(skb); +		if (!skb) +			goto out; +	} -	skb_orphan(skb); +	if (vlan_tx_tag_present(skb)) { +		if (pt_prev) { +			ret = deliver_skb(skb, pt_prev, orig_dev); +			pt_prev = NULL; +		} +		if (vlan_hwaccel_do_receive(&skb)) { +			ret = __netif_receive_skb(skb); +			goto out; +		} else if (unlikely(!skb)) +			goto out; +	} + +	/* +	 * Make sure frames received on VLAN interfaces stacked on +	 * bonding interfaces still make their way to any base bonding +	 * device that may have registered for a specific ptype.  The +	 * handler may have to adjust skb->dev and orig_dev. 
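__skb_bond_should_drop() encodes the duplicate-suppression policy for inactive bonding slaves that the comment above spells out. The same decision, restated as a standalone predicate so the exceptions stand out (plain booleans here instead of the kernel's priv_flags bits):

#include <stdbool.h>
#include <stdio.h>

/* True when a frame seen on an inactive bonding slave should be dropped. */
static bool bond_should_drop(bool slave_inactive, bool slave_needs_arp,
			     bool is_arp, bool master_is_alb,
			     bool is_mcast_or_bcast, bool master_is_8023ad,
			     bool is_lacp_slow)
{
	if (!slave_inactive)
		return false;			/* active slave: always deliver */
	if (slave_needs_arp && is_arp)
		return false;			/* arp_validate needs to see ARP */
	if (master_is_alb && !is_mcast_or_bcast)
		return false;			/* ALB handles unicast per slave */
	if (master_is_8023ad && is_lacp_slow)
		return false;			/* 802.3ad LACPDUs use ETH_P_SLOW */
	return true;				/* everything else is a duplicate */
}

int main(void)
{
	/* ARP on an inactive arp_validate slave is not treated as a duplicate. */
	printf("%d\n", bond_should_drop(true, true, true, false, false, false, false));
	return 0;
}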
+	 */ +	orig_or_bond = orig_dev; +	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && +	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { +		orig_or_bond = vlan_dev_real_dev(skb->dev); +	}  	type = skb->protocol;  	list_for_each_entry_rcu(ptype,  			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { -		if (ptype->type == type && -		    (ptype->dev == null_or_orig || ptype->dev == skb->dev || -		     ptype->dev == orig_dev)) { +		if (ptype->type == type && (ptype->dev == null_or_orig || +		     ptype->dev == skb->dev || ptype->dev == orig_dev || +		     ptype->dev == orig_or_bond)) {  			if (pt_prev)  				ret = deliver_skb(skb, pt_prev, orig_dev);  			pt_prev = ptype; @@ -2327,6 +3125,7 @@ ncls:  	if (pt_prev) {  		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);  	} else { +		atomic_long_inc(&skb->dev->rx_dropped);  		kfree_skb(skb);  		/* Jamal, now you will not able to escape explaining  		 * me how you were going to use this. :-) @@ -2339,18 +3138,80 @@ out:  	return ret;  } -/* Network device is going away, flush any packets still pending  */ +/** + *	netif_receive_skb - process receive buffer from network + *	@skb: buffer to process + * + *	netif_receive_skb() is the main receive data processing function. + *	It always succeeds. The buffer may be dropped during processing + *	for congestion control or by the protocol layers. + * + *	This function may only be called from softirq context and interrupts + *	should be enabled. + * + *	Return values (usually ignored): + *	NET_RX_SUCCESS: no congestion + *	NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ +	if (netdev_tstamp_prequeue) +		net_timestamp_check(skb); + +	if (skb_defer_rx_timestamp(skb)) +		return NET_RX_SUCCESS; + +#ifdef CONFIG_RPS +	{ +		struct rps_dev_flow voidflow, *rflow = &voidflow; +		int cpu, ret; + +		rcu_read_lock(); + +		cpu = get_rps_cpu(skb->dev, skb, &rflow); + +		if (cpu >= 0) { +			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); +			rcu_read_unlock(); +		} else { +			rcu_read_unlock(); +			ret = __netif_receive_skb(skb); +		} + +		return ret; +	} +#else +	return __netif_receive_skb(skb); +#endif +} +EXPORT_SYMBOL(netif_receive_skb); + +/* Network device is going away, flush any packets still pending + * Called with irqs disabled. 
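The ptype loop above now also matches handlers registered on the bonding device underneath a VLAN (orig_or_bond). For reference, a protocol handler reaches that loop by registering a struct packet_type; a hedged sketch in the style of the in-tree users (my_rcv, my_ptype and my_init are illustrative, dev_add_pack() is the long-standing registration call):

static int my_rcv(struct sk_buff *skb, struct net_device *dev,
		  struct packet_type *pt, struct net_device *orig_dev)
{
	/* Called from __netif_receive_skb() for every matching frame. */
	kfree_skb(skb);			/* a sink: this handler owns its reference */
	return 0;
}

static struct packet_type my_ptype __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = my_rcv,			/* .dev left NULL: match frames on any device */
};

static int __init my_init(void)
{
	dev_add_pack(&my_ptype);
	return 0;
}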
+ */  static void flush_backlog(void *arg)  {  	struct net_device *dev = arg; -	struct softnet_data *queue = &__get_cpu_var(softnet_data); +	struct softnet_data *sd = &__get_cpu_var(softnet_data);  	struct sk_buff *skb, *tmp; -	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) +	rps_lock(sd); +	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { +		if (skb->dev == dev) { +			__skb_unlink(skb, &sd->input_pkt_queue); +			kfree_skb(skb); +			input_queue_head_incr(sd); +		} +	} +	rps_unlock(sd); + +	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {  		if (skb->dev == dev) { -			__skb_unlink(skb, &queue->input_pkt_queue); +			__skb_unlink(skb, &sd->process_queue);  			kfree_skb(skb); +			input_queue_head_incr(sd);  		} +	}  }  static int napi_gro_complete(struct sk_buff *skb) @@ -2385,7 +3246,7 @@ out:  	return netif_receive_skb(skb);  } -void napi_gro_flush(struct napi_struct *napi) +inline void napi_gro_flush(struct napi_struct *napi)  {  	struct sk_buff *skb, *next; @@ -2400,7 +3261,7 @@ void napi_gro_flush(struct napi_struct *napi)  }  EXPORT_SYMBOL(napi_gro_flush); -int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  {  	struct sk_buff **pp = NULL;  	struct packet_type *ptype; @@ -2408,12 +3269,12 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];  	int same_flow;  	int mac_len; -	int ret; +	enum gro_result ret; -	if (!(skb->dev->features & NETIF_F_GRO)) +	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))  		goto normal; -	if (skb_is_gso(skb) || skb_has_frags(skb)) +	if (skb_is_gso(skb) || skb_has_frag_list(skb))  		goto normal;  	rcu_read_lock(); @@ -2479,7 +3340,7 @@ pull:  			put_page(skb_shinfo(skb)->frags[0].page);  			memmove(skb_shinfo(skb)->frags,  				skb_shinfo(skb)->frags + 1, -				--skb_shinfo(skb)->nr_frags); +				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));  		}  	} @@ -2492,41 +3353,44 @@ normal:  }  EXPORT_SYMBOL(dev_gro_receive); -static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static inline gro_result_t +__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  {  	struct sk_buff *p; -	if (netpoll_rx_on(skb)) -		return GRO_NORMAL; -  	for (p = napi->gro_list; p; p = p->next) { -		NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev) -			&& !compare_ether_header(skb_mac_header(p), -						 skb_gro_mac_header(skb)); +		unsigned long diffs; + +		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; +		diffs |= p->vlan_tci ^ skb->vlan_tci; +		diffs |= compare_ether_header(skb_mac_header(p), +					      skb_gro_mac_header(skb)); +		NAPI_GRO_CB(p)->same_flow = !diffs;  		NAPI_GRO_CB(p)->flush = 0;  	}  	return dev_gro_receive(napi, skb);  } -int napi_skb_finish(int ret, struct sk_buff *skb) +gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)  { -	int err = NET_RX_SUCCESS; -  	switch (ret) {  	case GRO_NORMAL: -		return netif_receive_skb(skb); +		if (netif_receive_skb(skb)) +			ret = GRO_DROP; +		break;  	case GRO_DROP: -		err = NET_RX_DROP; -		/* fall through */ -  	case GRO_MERGED_FREE:  		kfree_skb(skb);  		break; + +	case GRO_HELD: +	case GRO_MERGED: +		break;  	} -	return err; +	return ret;  }  EXPORT_SYMBOL(napi_skb_finish); @@ -2546,7 +3410,7 @@ void skb_gro_reset_offset(struct sk_buff *skb)  }  EXPORT_SYMBOL(skb_gro_reset_offset); -int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 
+gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  {  	skb_gro_reset_offset(skb); @@ -2554,60 +3418,52 @@ int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  }  EXPORT_SYMBOL(napi_gro_receive); -void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)  {  	__skb_pull(skb, skb_headlen(skb));  	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); +	skb->vlan_tci = 0;  	napi->skb = skb;  } -EXPORT_SYMBOL(napi_reuse_skb);  struct sk_buff *napi_get_frags(struct napi_struct *napi)  { -	struct net_device *dev = napi->dev;  	struct sk_buff *skb = napi->skb;  	if (!skb) { -		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); -		if (!skb) -			goto out; - -		skb_reserve(skb, NET_IP_ALIGN); - -		napi->skb = skb; +		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); +		if (skb) +			napi->skb = skb;  	} - -out:  	return skb;  }  EXPORT_SYMBOL(napi_get_frags); -int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) +gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, +			       gro_result_t ret)  { -	int err = NET_RX_SUCCESS; -  	switch (ret) {  	case GRO_NORMAL:  	case GRO_HELD: -		skb->protocol = eth_type_trans(skb, napi->dev); - -		if (ret == GRO_NORMAL) -			return netif_receive_skb(skb); +		skb->protocol = eth_type_trans(skb, skb->dev); -		skb_gro_pull(skb, -ETH_HLEN); +		if (ret == GRO_HELD) +			skb_gro_pull(skb, -ETH_HLEN); +		else if (netif_receive_skb(skb)) +			ret = GRO_DROP;  		break;  	case GRO_DROP: -		err = NET_RX_DROP; -		/* fall through */ -  	case GRO_MERGED_FREE:  		napi_reuse_skb(napi, skb);  		break; + +	case GRO_MERGED: +		break;  	} -	return err; +	return ret;  }  EXPORT_SYMBOL(napi_frags_finish); @@ -2648,38 +3504,98 @@ out:  }  EXPORT_SYMBOL(napi_frags_skb); -int napi_gro_frags(struct napi_struct *napi) +gro_result_t napi_gro_frags(struct napi_struct *napi)  {  	struct sk_buff *skb = napi_frags_skb(napi);  	if (!skb) -		return NET_RX_DROP; +		return GRO_DROP;  	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));  }  EXPORT_SYMBOL(napi_gro_frags); +/* + * net_rps_action sends any pending IPI's for rps. + * Note: called with local irq disabled, but exits with local irq enabled. + */ +static void net_rps_action_and_irq_enable(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS +	struct softnet_data *remsd = sd->rps_ipi_list; + +	if (remsd) { +		sd->rps_ipi_list = NULL; + +		local_irq_enable(); + +		/* Send pending IPI's to kick RPS processing on remote cpus. */ +		while (remsd) { +			struct softnet_data *next = remsd->rps_ipi_next; + +			if (cpu_online(remsd->cpu)) +				__smp_call_function_single(remsd->cpu, +							   &remsd->csd, 0); +			remsd = next; +		} +	} else +#endif +		local_irq_enable(); +} +  static int process_backlog(struct napi_struct *napi, int quota)  {  	int work = 0; -	struct softnet_data *queue = &__get_cpu_var(softnet_data); -	unsigned long start_time = jiffies; +	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); +#ifdef CONFIG_RPS +	/* Check if we have pending ipi, its better to send them now, +	 * not waiting net_rx_action() end. 
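napi_gro_receive() now returns a gro_result_t, so a driver can tell whether the skb was merged, held, passed up, or dropped. A sketch of the poll routine such a driver would hand to netif_napi_add(); my_priv, my_fetch_skb and my_enable_irq are hypothetical, the napi_* calls are the ones defined in this file:

static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_priv *priv = container_of(napi, struct my_priv, napi);
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = my_fetch_skb(priv);	/* hypothetical ring walk */

		if (!skb)
			break;
		napi_gro_receive(napi, skb);	/* GRO tries to merge before delivery */
		work++;
	}

	if (work < budget) {
		napi_complete(napi);		/* ring drained: leave the poll list */
		my_enable_irq(priv);		/* hypothetical: re-arm device interrupts */
	}
	return work;
}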
+	 */ +	if (sd->rps_ipi_list) { +		local_irq_disable(); +		net_rps_action_and_irq_enable(sd); +	} +#endif  	napi->weight = weight_p; -	do { +	local_irq_disable(); +	while (work < quota) {  		struct sk_buff *skb; +		unsigned int qlen; -		local_irq_disable(); -		skb = __skb_dequeue(&queue->input_pkt_queue); -		if (!skb) { -			__napi_complete(napi); +		while ((skb = __skb_dequeue(&sd->process_queue))) {  			local_irq_enable(); -			break; +			__netif_receive_skb(skb); +			local_irq_disable(); +			input_queue_head_incr(sd); +			if (++work >= quota) { +				local_irq_enable(); +				return work; +			}  		} -		local_irq_enable(); -		netif_receive_skb(skb); -	} while (++work < quota && jiffies == start_time); +		rps_lock(sd); +		qlen = skb_queue_len(&sd->input_pkt_queue); +		if (qlen) +			skb_queue_splice_tail_init(&sd->input_pkt_queue, +						   &sd->process_queue); + +		if (qlen < quota - work) { +			/* +			 * Inline a custom version of __napi_complete(). +			 * only current cpu owns and manipulates this napi, +			 * and NAPI_STATE_SCHED is the only possible flag set on backlog. +			 * we can use a plain write instead of clear_bit(), +			 * and we dont need an smp_mb() memory barrier. +			 */ +			list_del(&napi->poll_list); +			napi->state = 0; + +			quota = work + qlen; +		} +		rps_unlock(sd); +	} +	local_irq_enable();  	return work;  } @@ -2695,8 +3611,7 @@ void __napi_schedule(struct napi_struct *n)  	unsigned long flags;  	local_irq_save(flags); -	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); -	__raise_softirq_irqoff(NET_RX_SOFTIRQ); +	____napi_schedule(&__get_cpu_var(softnet_data), n);  	local_irq_restore(flags);  }  EXPORT_SYMBOL(__napi_schedule); @@ -2767,17 +3682,16 @@ void netif_napi_del(struct napi_struct *napi)  }  EXPORT_SYMBOL(netif_napi_del); -  static void net_rx_action(struct softirq_action *h)  { -	struct list_head *list = &__get_cpu_var(softnet_data).poll_list; +	struct softnet_data *sd = &__get_cpu_var(softnet_data);  	unsigned long time_limit = jiffies + 2;  	int budget = netdev_budget;  	void *have;  	local_irq_disable(); -	while (!list_empty(list)) { +	while (!list_empty(&sd->poll_list)) {  		struct napi_struct *n;  		int work, weight; @@ -2795,7 +3709,7 @@ static void net_rx_action(struct softirq_action *h)  		 * entries to the tail of this list, and only ->poll()  		 * calls can remove this head entry from the list.  		 */ -		n = list_entry(list->next, struct napi_struct, poll_list); +		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);  		have = netpoll_poll_lock(n); @@ -2825,16 +3739,18 @@ static void net_rx_action(struct softirq_action *h)  		 * move the instance around on the list at-will.  		 
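____napi_schedule() is now the common tail shared by drivers (via __napi_schedule()) and by the RPS backlog: it queues the napi_struct on this CPU's poll_list and raises NET_RX_SOFTIRQ. On the driver side the usual trigger is the RX interrupt; a minimal fragment, with my_priv and my_disable_irq assumed:

static irqreturn_t my_interrupt(int irq, void *dev_id)
{
	struct my_priv *priv = dev_id;

	if (napi_schedule_prep(&priv->napi)) {
		my_disable_irq(priv);		/* quiet the device until my_poll() finishes */
		__napi_schedule(&priv->napi);	/* add to this CPU's poll_list, raise softirq */
	}
	return IRQ_HANDLED;
}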
*/  		if (unlikely(work == weight)) { -			if (unlikely(napi_disable_pending(n))) -				__napi_complete(n); -			else -				list_move_tail(&n->poll_list, list); +			if (unlikely(napi_disable_pending(n))) { +				local_irq_enable(); +				napi_complete(n); +				local_irq_disable(); +			} else +				list_move_tail(&n->poll_list, &sd->poll_list);  		}  		netpoll_poll_unlock(have);  	}  out: -	local_irq_enable(); +	net_rps_action_and_irq_enable(sd);  #ifdef CONFIG_NET_DMA  	/* @@ -2847,12 +3763,12 @@ out:  	return;  softnet_break: -	__get_cpu_var(netdev_rx_stat).time_squeeze++; +	sd->time_squeeze++;  	__raise_softirq_irqoff(NET_RX_SOFTIRQ);  	goto out;  } -static gifconf_func_t * gifconf_list [NPROTO]; +static gifconf_func_t *gifconf_list[NPROTO];  /**   *	register_gifconf	-	register a SIOCGIF handler @@ -2863,13 +3779,14 @@ static gifconf_func_t * gifconf_list [NPROTO];   *	that is passed must not be freed or reused until it has been replaced   *	by another handler.   */ -int register_gifconf(unsigned int family, gifconf_func_t * gifconf) +int register_gifconf(unsigned int family, gifconf_func_t *gifconf)  {  	if (family >= NPROTO)  		return -EINVAL;  	gifconf_list[family] = gifconf;  	return 0;  } +EXPORT_SYMBOL(register_gifconf);  /* @@ -2895,15 +3812,15 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg)  	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))  		return -EFAULT; -	read_lock(&dev_base_lock); -	dev = __dev_get_by_index(net, ifr.ifr_ifindex); +	rcu_read_lock(); +	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);  	if (!dev) { -		read_unlock(&dev_base_lock); +		rcu_read_unlock();  		return -ENODEV;  	}  	strcpy(ifr.ifr_name, dev->name); -	read_unlock(&dev_base_lock); +	rcu_read_unlock();  	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))  		return -EFAULT; @@ -2973,18 +3890,18 @@ static int dev_ifconf(struct net *net, char __user *arg)   *	in detail.   */  void *dev_seq_start(struct seq_file *seq, loff_t *pos) -	__acquires(dev_base_lock) +	__acquires(RCU)  {  	struct net *net = seq_file_net(seq);  	loff_t off;  	struct net_device *dev; -	read_lock(&dev_base_lock); +	rcu_read_lock();  	if (!*pos)  		return SEQ_START_TOKEN;  	off = 1; -	for_each_netdev(net, dev) +	for_each_netdev_rcu(net, dev)  		if (off++ == *pos)  			return dev; @@ -2993,24 +3910,27 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos)  void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)  { -	struct net *net = seq_file_net(seq); +	struct net_device *dev = (v == SEQ_START_TOKEN) ? +				  first_net_device(seq_file_net(seq)) : +				  next_net_device((struct net_device *)v); +  	++*pos; -	return v == SEQ_START_TOKEN ? 
-		first_net_device(net) : next_net_device((struct net_device *)v); +	return rcu_dereference(dev);  }  void dev_seq_stop(struct seq_file *seq, void *v) -	__releases(dev_base_lock) +	__releases(RCU)  { -	read_unlock(&dev_base_lock); +	rcu_read_unlock();  }  static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)  { -	const struct net_device_stats *stats = dev_get_stats(dev); +	struct rtnl_link_stats64 temp; +	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); -	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " -		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", +	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " +		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",  		   dev->name, stats->rx_bytes, stats->rx_packets,  		   stats->rx_errors,  		   stats->rx_dropped + stats->rx_missed_errors, @@ -3045,17 +3965,17 @@ static int dev_seq_show(struct seq_file *seq, void *v)  	return 0;  } -static struct netif_rx_stats *softnet_get_online(loff_t *pos) +static struct softnet_data *softnet_get_online(loff_t *pos)  { -	struct netif_rx_stats *rc = NULL; +	struct softnet_data *sd = NULL;  	while (*pos < nr_cpu_ids)  		if (cpu_online(*pos)) { -			rc = &per_cpu(netdev_rx_stat, *pos); +			sd = &per_cpu(softnet_data, *pos);  			break;  		} else  			++*pos; -	return rc; +	return sd;  }  static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) @@ -3075,12 +3995,12 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)  static int softnet_seq_show(struct seq_file *seq, void *v)  { -	struct netif_rx_stats *s = v; +	struct softnet_data *sd = v; -	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", -		   s->total, s->dropped, s->time_squeeze, 0, +	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", +		   sd->processed, sd->dropped, sd->time_squeeze, 0,  		   0, 0, 0, 0, /* was fastroute */ -		   s->cpu_collision ); +		   sd->cpu_collision, sd->received_rps);  	return 0;  } @@ -3303,11 +4223,10 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)  	slave->master = master; -	synchronize_net(); - -	if (old) +	if (old) { +		synchronize_net();  		dev_put(old); - +	}  	if (master)  		slave->flags |= IFF_SLAVE;  	else @@ -3316,6 +4235,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)  	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);  	return 0;  } +EXPORT_SYMBOL(netdev_set_master);  static void dev_change_rx_flags(struct net_device *dev, int flags)  { @@ -3394,6 +4314,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc)  		dev_set_rx_mode(dev);  	return err;  } +EXPORT_SYMBOL(dev_set_promiscuity);  /**   *	dev_set_allmulti	- update allmulti count on a device @@ -3437,6 +4358,7 @@ int dev_set_allmulti(struct net_device *dev, int inc)  	}  	return 0;  } +EXPORT_SYMBOL(dev_set_allmulti);  /*   *	Upload unicast and multicast address lists to device and @@ -3461,10 +4383,10 @@ void __dev_set_rx_mode(struct net_device *dev)  		/* Unicast addresses changes may only happen under the rtnl,  		 * therefore calling __dev_set_promiscuity here is safe.  		 
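dev_ifname() and the /proc/net/dev seq handlers above now walk the device list under rcu_read_lock() instead of read-locking dev_base_lock. Any read-only walk can use the same pattern; a brief sketch (the pr_info() body is only illustrative):

static void dump_device_names(struct net *net)
{
	struct net_device *dev;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		pr_info("dev %s ifindex %d\n", dev->name, dev->ifindex);
	rcu_read_unlock();
	/* dev must not be used past rcu_read_unlock() without taking a reference. */
}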
*/ -		if (dev->uc.count > 0 && !dev->uc_promisc) { +		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {  			__dev_set_promiscuity(dev, 1);  			dev->uc_promisc = 1; -		} else if (dev->uc.count == 0 && dev->uc_promisc) { +		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {  			__dev_set_promiscuity(dev, -1);  			dev->uc_promisc = 0;  		} @@ -3481,555 +4403,6 @@ void dev_set_rx_mode(struct net_device *dev)  	netif_addr_unlock_bh(dev);  } -/* hw addresses list handling functions */ - -static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, -			 int addr_len, unsigned char addr_type) -{ -	struct netdev_hw_addr *ha; -	int alloc_size; - -	if (addr_len > MAX_ADDR_LEN) -		return -EINVAL; - -	list_for_each_entry(ha, &list->list, list) { -		if (!memcmp(ha->addr, addr, addr_len) && -		    ha->type == addr_type) { -			ha->refcount++; -			return 0; -		} -	} - - -	alloc_size = sizeof(*ha); -	if (alloc_size < L1_CACHE_BYTES) -		alloc_size = L1_CACHE_BYTES; -	ha = kmalloc(alloc_size, GFP_ATOMIC); -	if (!ha) -		return -ENOMEM; -	memcpy(ha->addr, addr, addr_len); -	ha->type = addr_type; -	ha->refcount = 1; -	ha->synced = false; -	list_add_tail_rcu(&ha->list, &list->list); -	list->count++; -	return 0; -} - -static void ha_rcu_free(struct rcu_head *head) -{ -	struct netdev_hw_addr *ha; - -	ha = container_of(head, struct netdev_hw_addr, rcu_head); -	kfree(ha); -} - -static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, -			 int addr_len, unsigned char addr_type) -{ -	struct netdev_hw_addr *ha; - -	list_for_each_entry(ha, &list->list, list) { -		if (!memcmp(ha->addr, addr, addr_len) && -		    (ha->type == addr_type || !addr_type)) { -			if (--ha->refcount) -				return 0; -			list_del_rcu(&ha->list); -			call_rcu(&ha->rcu_head, ha_rcu_free); -			list->count--; -			return 0; -		} -	} -	return -ENOENT; -} - -static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, -				  struct netdev_hw_addr_list *from_list, -				  int addr_len, -				  unsigned char addr_type) -{ -	int err; -	struct netdev_hw_addr *ha, *ha2; -	unsigned char type; - -	list_for_each_entry(ha, &from_list->list, list) { -		type = addr_type ? addr_type : ha->type; -		err = __hw_addr_add(to_list, ha->addr, addr_len, type); -		if (err) -			goto unroll; -	} -	return 0; - -unroll: -	list_for_each_entry(ha2, &from_list->list, list) { -		if (ha2 == ha) -			break; -		type = addr_type ? addr_type : ha2->type; -		__hw_addr_del(to_list, ha2->addr, addr_len, type); -	} -	return err; -} - -static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, -				   struct netdev_hw_addr_list *from_list, -				   int addr_len, -				   unsigned char addr_type) -{ -	struct netdev_hw_addr *ha; -	unsigned char type; - -	list_for_each_entry(ha, &from_list->list, list) { -		type = addr_type ? 
addr_type : ha->type; -		__hw_addr_del(to_list, ha->addr, addr_len, addr_type); -	} -} - -static int __hw_addr_sync(struct netdev_hw_addr_list *to_list, -			  struct netdev_hw_addr_list *from_list, -			  int addr_len) -{ -	int err = 0; -	struct netdev_hw_addr *ha, *tmp; - -	list_for_each_entry_safe(ha, tmp, &from_list->list, list) { -		if (!ha->synced) { -			err = __hw_addr_add(to_list, ha->addr, -					    addr_len, ha->type); -			if (err) -				break; -			ha->synced = true; -			ha->refcount++; -		} else if (ha->refcount == 1) { -			__hw_addr_del(to_list, ha->addr, addr_len, ha->type); -			__hw_addr_del(from_list, ha->addr, addr_len, ha->type); -		} -	} -	return err; -} - -static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, -			     struct netdev_hw_addr_list *from_list, -			     int addr_len) -{ -	struct netdev_hw_addr *ha, *tmp; - -	list_for_each_entry_safe(ha, tmp, &from_list->list, list) { -		if (ha->synced) { -			__hw_addr_del(to_list, ha->addr, -				      addr_len, ha->type); -			ha->synced = false; -			__hw_addr_del(from_list, ha->addr, -				      addr_len, ha->type); -		} -	} -} - -static void __hw_addr_flush(struct netdev_hw_addr_list *list) -{ -	struct netdev_hw_addr *ha, *tmp; - -	list_for_each_entry_safe(ha, tmp, &list->list, list) { -		list_del_rcu(&ha->list); -		call_rcu(&ha->rcu_head, ha_rcu_free); -	} -	list->count = 0; -} - -static void __hw_addr_init(struct netdev_hw_addr_list *list) -{ -	INIT_LIST_HEAD(&list->list); -	list->count = 0; -} - -/* Device addresses handling functions */ - -static void dev_addr_flush(struct net_device *dev) -{ -	/* rtnl_mutex must be held here */ - -	__hw_addr_flush(&dev->dev_addrs); -	dev->dev_addr = NULL; -} - -static int dev_addr_init(struct net_device *dev) -{ -	unsigned char addr[MAX_ADDR_LEN]; -	struct netdev_hw_addr *ha; -	int err; - -	/* rtnl_mutex must be held here */ - -	__hw_addr_init(&dev->dev_addrs); -	memset(addr, 0, sizeof(addr)); -	err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), -			    NETDEV_HW_ADDR_T_LAN); -	if (!err) { -		/* -		 * Get the first (previously created) address from the list -		 * and set dev_addr pointer to this location. -		 */ -		ha = list_first_entry(&dev->dev_addrs.list, -				      struct netdev_hw_addr, list); -		dev->dev_addr = ha->addr; -	} -	return err; -} - -/** - *	dev_addr_add	- Add a device address - *	@dev: device - *	@addr: address to add - *	@addr_type: address type - * - *	Add a device address to the device or increase the reference count if - *	it already exists. - * - *	The caller must hold the rtnl_mutex. - */ -int dev_addr_add(struct net_device *dev, unsigned char *addr, -		 unsigned char addr_type) -{ -	int err; - -	ASSERT_RTNL(); - -	err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); -	if (!err) -		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); -	return err; -} -EXPORT_SYMBOL(dev_addr_add); - -/** - *	dev_addr_del	- Release a device address. - *	@dev: device - *	@addr: address to delete - *	@addr_type: address type - * - *	Release reference to a device address and remove it from the device - *	if the reference count drops to zero. - * - *	The caller must hold the rtnl_mutex. - */ -int dev_addr_del(struct net_device *dev, unsigned char *addr, -		 unsigned char addr_type) -{ -	int err; -	struct netdev_hw_addr *ha; - -	ASSERT_RTNL(); - -	/* -	 * We can not remove the first address from the list because -	 * dev->dev_addr points to that. 
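The netdev_hw_addr list helpers and the dev_addr_* wrappers are dropped from dev.c by this patch (the exported API itself presumably lives on elsewhere). For context, the calling convention the comments above describe, as a hedged fragment; my_secondary_mac and the callers are illustrative:

/* Fragment: add and later remove a secondary hardware address, under RTNL. */
static unsigned char my_secondary_mac[ETH_ALEN] = {
	0x02, 0x00, 0x00, 0x00, 0x00, 0x01,
};

static int my_add_addr(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_addr_add(dev, my_secondary_mac, NETDEV_HW_ADDR_T_LAN);
	rtnl_unlock();
	return err;
}

static void my_del_addr(struct net_device *dev)
{
	rtnl_lock();
	dev_addr_del(dev, my_secondary_mac, NETDEV_HW_ADDR_T_LAN);
	rtnl_unlock();
}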
-	 */ -	ha = list_first_entry(&dev->dev_addrs.list, -			      struct netdev_hw_addr, list); -	if (ha->addr == dev->dev_addr && ha->refcount == 1) -		return -ENOENT; - -	err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, -			    addr_type); -	if (!err) -		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); -	return err; -} -EXPORT_SYMBOL(dev_addr_del); - -/** - *	dev_addr_add_multiple	- Add device addresses from another device - *	@to_dev: device to which addresses will be added - *	@from_dev: device from which addresses will be added - *	@addr_type: address type - 0 means type will be used from from_dev - * - *	Add device addresses of the one device to another. - ** - *	The caller must hold the rtnl_mutex. - */ -int dev_addr_add_multiple(struct net_device *to_dev, -			  struct net_device *from_dev, -			  unsigned char addr_type) -{ -	int err; - -	ASSERT_RTNL(); - -	if (from_dev->addr_len != to_dev->addr_len) -		return -EINVAL; -	err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, -				     to_dev->addr_len, addr_type); -	if (!err) -		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); -	return err; -} -EXPORT_SYMBOL(dev_addr_add_multiple); - -/** - *	dev_addr_del_multiple	- Delete device addresses by another device - *	@to_dev: device where the addresses will be deleted - *	@from_dev: device by which addresses the addresses will be deleted - *	@addr_type: address type - 0 means type will used from from_dev - * - *	Deletes addresses in to device by the list of addresses in from device. - * - *	The caller must hold the rtnl_mutex. - */ -int dev_addr_del_multiple(struct net_device *to_dev, -			  struct net_device *from_dev, -			  unsigned char addr_type) -{ -	ASSERT_RTNL(); - -	if (from_dev->addr_len != to_dev->addr_len) -		return -EINVAL; -	__hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, -			       to_dev->addr_len, addr_type); -	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); -	return 0; -} -EXPORT_SYMBOL(dev_addr_del_multiple); - -/* multicast addresses handling functions */ - -int __dev_addr_delete(struct dev_addr_list **list, int *count, -		      void *addr, int alen, int glbl) -{ -	struct dev_addr_list *da; - -	for (; (da = *list) != NULL; list = &da->next) { -		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && -		    alen == da->da_addrlen) { -			if (glbl) { -				int old_glbl = da->da_gusers; -				da->da_gusers = 0; -				if (old_glbl == 0) -					break; -			} -			if (--da->da_users) -				return 0; - -			*list = da->next; -			kfree(da); -			(*count)--; -			return 0; -		} -	} -	return -ENOENT; -} - -int __dev_addr_add(struct dev_addr_list **list, int *count, -		   void *addr, int alen, int glbl) -{ -	struct dev_addr_list *da; - -	for (da = *list; da != NULL; da = da->next) { -		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && -		    da->da_addrlen == alen) { -			if (glbl) { -				int old_glbl = da->da_gusers; -				da->da_gusers = 1; -				if (old_glbl) -					return 0; -			} -			da->da_users++; -			return 0; -		} -	} - -	da = kzalloc(sizeof(*da), GFP_ATOMIC); -	if (da == NULL) -		return -ENOMEM; -	memcpy(da->da_addr, addr, alen); -	da->da_addrlen = alen; -	da->da_users = 1; -	da->da_gusers = glbl ? 1 : 0; -	da->next = *list; -	*list = da; -	(*count)++; -	return 0; -} - -/** - *	dev_unicast_delete	- Release secondary unicast address. - *	@dev: device - *	@addr: address to delete - * - *	Release reference to a secondary unicast address and remove it - *	from the device if the reference count drops to zero. 
- * - * 	The caller must hold the rtnl_mutex. - */ -int dev_unicast_delete(struct net_device *dev, void *addr) -{ -	int err; - -	ASSERT_RTNL(); - -	err = __hw_addr_del(&dev->uc, addr, dev->addr_len, -			    NETDEV_HW_ADDR_T_UNICAST); -	if (!err) -		__dev_set_rx_mode(dev); -	return err; -} -EXPORT_SYMBOL(dev_unicast_delete); - -/** - *	dev_unicast_add		- add a secondary unicast address - *	@dev: device - *	@addr: address to add - * - *	Add a secondary unicast address to the device or increase - *	the reference count if it already exists. - * - *	The caller must hold the rtnl_mutex. - */ -int dev_unicast_add(struct net_device *dev, void *addr) -{ -	int err; - -	ASSERT_RTNL(); - -	err = __hw_addr_add(&dev->uc, addr, dev->addr_len, -			    NETDEV_HW_ADDR_T_UNICAST); -	if (!err) -		__dev_set_rx_mode(dev); -	return err; -} -EXPORT_SYMBOL(dev_unicast_add); - -int __dev_addr_sync(struct dev_addr_list **to, int *to_count, -		    struct dev_addr_list **from, int *from_count) -{ -	struct dev_addr_list *da, *next; -	int err = 0; - -	da = *from; -	while (da != NULL) { -		next = da->next; -		if (!da->da_synced) { -			err = __dev_addr_add(to, to_count, -					     da->da_addr, da->da_addrlen, 0); -			if (err < 0) -				break; -			da->da_synced = 1; -			da->da_users++; -		} else if (da->da_users == 1) { -			__dev_addr_delete(to, to_count, -					  da->da_addr, da->da_addrlen, 0); -			__dev_addr_delete(from, from_count, -					  da->da_addr, da->da_addrlen, 0); -		} -		da = next; -	} -	return err; -} - -void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, -		       struct dev_addr_list **from, int *from_count) -{ -	struct dev_addr_list *da, *next; - -	da = *from; -	while (da != NULL) { -		next = da->next; -		if (da->da_synced) { -			__dev_addr_delete(to, to_count, -					  da->da_addr, da->da_addrlen, 0); -			da->da_synced = 0; -			__dev_addr_delete(from, from_count, -					  da->da_addr, da->da_addrlen, 0); -		} -		da = next; -	} -} - -/** - *	dev_unicast_sync - Synchronize device's unicast list to another device - *	@to: destination device - *	@from: source device - * - *	Add newly added addresses to the destination device and release - *	addresses that have no users left. - * - *	This function is intended to be called from the dev->set_rx_mode - *	function of layered software devices. - */ -int dev_unicast_sync(struct net_device *to, struct net_device *from) -{ -	int err = 0; - -	ASSERT_RTNL(); - -	if (to->addr_len != from->addr_len) -		return -EINVAL; - -	err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); -	if (!err) -		__dev_set_rx_mode(to); -	return err; -} -EXPORT_SYMBOL(dev_unicast_sync); - -/** - *	dev_unicast_unsync - Remove synchronized addresses from the destination device - *	@to: destination device - *	@from: source device - * - *	Remove all addresses that were added to the destination device by - *	dev_unicast_sync(). This function is intended to be called from the - *	dev->stop function of layered software devices. 
- */ -void dev_unicast_unsync(struct net_device *to, struct net_device *from) -{ -	ASSERT_RTNL(); - -	if (to->addr_len != from->addr_len) -		return; - -	__hw_addr_unsync(&to->uc, &from->uc, to->addr_len); -	__dev_set_rx_mode(to); -} -EXPORT_SYMBOL(dev_unicast_unsync); - -static void dev_unicast_flush(struct net_device *dev) -{ -	/* rtnl_mutex must be held here */ - -	__hw_addr_flush(&dev->uc); -} - -static void dev_unicast_init(struct net_device *dev) -{ -	/* rtnl_mutex must be held here */ - -	__hw_addr_init(&dev->uc); -} - - -static void __dev_addr_discard(struct dev_addr_list **list) -{ -	struct dev_addr_list *tmp; - -	while (*list != NULL) { -		tmp = *list; -		*list = tmp->next; -		if (tmp->da_users > tmp->da_gusers) -			printk("__dev_addr_discard: address leakage! " -			       "da_users=%d\n", tmp->da_users); -		kfree(tmp); -	} -} - -static void dev_addr_discard(struct net_device *dev) -{ -	netif_addr_lock_bh(dev); - -	__dev_addr_discard(&dev->mc_list); -	dev->mc_count = 0; - -	netif_addr_unlock_bh(dev); -} -  /**   *	dev_get_flags - get flags reported to userspace   *	@dev: device @@ -4059,19 +4432,12 @@ unsigned dev_get_flags(const struct net_device *dev)  	return flags;  } +EXPORT_SYMBOL(dev_get_flags); -/** - *	dev_change_flags - change device settings - *	@dev: device - *	@flags: device state flags - * - *	Change settings on device based state flags. The flags are - *	in the userspace exported format. - */ -int dev_change_flags(struct net_device *dev, unsigned flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags)  { -	int ret, changes;  	int old_flags = dev->flags; +	int ret;  	ASSERT_RTNL(); @@ -4102,19 +4468,15 @@ int dev_change_flags(struct net_device *dev, unsigned flags)  	ret = 0;  	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */ -		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); +		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);  		if (!ret)  			dev_set_rx_mode(dev);  	} -	if (dev->flags & IFF_UP && -	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | -					  IFF_VOLATILE))) -		call_netdevice_notifiers(NETDEV_CHANGE, dev); -  	if ((flags ^ dev->gflags) & IFF_PROMISC) { -		int inc = (flags & IFF_PROMISC) ? +1 : -1; +		int inc = (flags & IFF_PROMISC) ? 1 : -1; +  		dev->gflags ^= IFF_PROMISC;  		dev_set_promiscuity(dev, inc);  	} @@ -4124,18 +4486,56 @@ int dev_change_flags(struct net_device *dev, unsigned flags)  	   IFF_ALLMULTI is requested not asking us and not reporting.  	 */  	if ((flags ^ dev->gflags) & IFF_ALLMULTI) { -		int inc = (flags & IFF_ALLMULTI) ? +1 : -1; +		int inc = (flags & IFF_ALLMULTI) ? 1 : -1; +  		dev->gflags ^= IFF_ALLMULTI;  		dev_set_allmulti(dev, inc);  	} -	/* Exclude state transition flags, already notified */ -	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING); +	return ret; +} + +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +{ +	unsigned int changes = dev->flags ^ old_flags; + +	if (changes & IFF_UP) { +		if (dev->flags & IFF_UP) +			call_netdevice_notifiers(NETDEV_UP, dev); +		else +			call_netdevice_notifiers(NETDEV_DOWN, dev); +	} + +	if (dev->flags & IFF_UP && +	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) +		call_netdevice_notifiers(NETDEV_CHANGE, dev); +} + +/** + *	dev_change_flags - change device settings + *	@dev: device + *	@flags: device state flags + * + *	Change settings on device based state flags. The flags are + *	in the userspace exported format. 
+ */ +int dev_change_flags(struct net_device *dev, unsigned flags) +{ +	int ret, changes; +	int old_flags = dev->flags; + +	ret = __dev_change_flags(dev, flags); +	if (ret < 0) +		return ret; + +	changes = old_flags ^ dev->flags;  	if (changes)  		rtmsg_ifinfo(RTM_NEWLINK, dev, changes); +	__dev_notify_flags(dev, old_flags);  	return ret;  } +EXPORT_SYMBOL(dev_change_flags);  /**   *	dev_set_mtu - Change maximum transfer unit @@ -4169,6 +4569,7 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)  		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);  	return err;  } +EXPORT_SYMBOL(dev_set_mtu);  /**   *	dev_set_mac_address - Change Media Access Control Address @@ -4193,69 +4594,70 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)  		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);  	return err;  } +EXPORT_SYMBOL(dev_set_mac_address);  /* - *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock) + *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()   */  static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)  {  	int err; -	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); +	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);  	if (!dev)  		return -ENODEV;  	switch (cmd) { -		case SIOCGIFFLAGS:	/* Get interface flags */ -			ifr->ifr_flags = (short) dev_get_flags(dev); -			return 0; +	case SIOCGIFFLAGS:	/* Get interface flags */ +		ifr->ifr_flags = (short) dev_get_flags(dev); +		return 0; -		case SIOCGIFMETRIC:	/* Get the metric on the interface -					   (currently unused) */ -			ifr->ifr_metric = 0; -			return 0; +	case SIOCGIFMETRIC:	/* Get the metric on the interface +				   (currently unused) */ +		ifr->ifr_metric = 0; +		return 0; -		case SIOCGIFMTU:	/* Get the MTU of a device */ -			ifr->ifr_mtu = dev->mtu; -			return 0; +	case SIOCGIFMTU:	/* Get the MTU of a device */ +		ifr->ifr_mtu = dev->mtu; +		return 0; -		case SIOCGIFHWADDR: -			if (!dev->addr_len) -				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); -			else -				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, -				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); -			ifr->ifr_hwaddr.sa_family = dev->type; -			return 0; +	case SIOCGIFHWADDR: +		if (!dev->addr_len) +			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); +		else +			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, +			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); +		ifr->ifr_hwaddr.sa_family = dev->type; +		return 0; -		case SIOCGIFSLAVE: -			err = -EINVAL; -			break; +	case SIOCGIFSLAVE: +		err = -EINVAL; +		break; -		case SIOCGIFMAP: -			ifr->ifr_map.mem_start = dev->mem_start; -			ifr->ifr_map.mem_end   = dev->mem_end; -			ifr->ifr_map.base_addr = dev->base_addr; -			ifr->ifr_map.irq       = dev->irq; -			ifr->ifr_map.dma       = dev->dma; -			ifr->ifr_map.port      = dev->if_port; -			return 0; +	case SIOCGIFMAP: +		ifr->ifr_map.mem_start = dev->mem_start; +		ifr->ifr_map.mem_end   = dev->mem_end; +		ifr->ifr_map.base_addr = dev->base_addr; +		ifr->ifr_map.irq       = dev->irq; +		ifr->ifr_map.dma       = dev->dma; +		ifr->ifr_map.port      = dev->if_port; +		return 0; -		case SIOCGIFINDEX: -			ifr->ifr_ifindex = dev->ifindex; -			return 0; +	case SIOCGIFINDEX: +		ifr->ifr_ifindex = dev->ifindex; +		return 0; -		case SIOCGIFTXQLEN: -			ifr->ifr_qlen = dev->tx_queue_len; -			return 0; +	case SIOCGIFTXQLEN: +		ifr->ifr_qlen = dev->tx_queue_len; +		return 0; -		default: -			/* dev_ioctl() should 
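dev_change_flags() becomes a thin wrapper here: __dev_change_flags() applies the change and __dev_notify_flags() emits the notifier calls and the rtnetlink message afterwards, which lets other paths reuse the two halves separately. In-kernel callers keep using the wrapper under RTNL; a short hedged fragment (error handling trimmed, the MTU value is arbitrary):

/* Fragment: bring an interface up and raise its MTU, as an in-kernel caller would. */
static int my_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	if (!err)
		err = dev_set_mtu(dev, 9000);	/* arbitrary example value */
	rtnl_unlock();
	return err;
}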
ensure this case -			 * is never reached -			 */ -			WARN_ON(1); -			err = -EINVAL; -			break; +	default: +		/* dev_ioctl() should ensure this case +		 * is never reached +		 */ +		WARN_ON(1); +		err = -EINVAL; +		break;  	}  	return err; @@ -4276,92 +4678,89 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)  	ops = dev->netdev_ops;  	switch (cmd) { -		case SIOCSIFFLAGS:	/* Set interface flags */ -			return dev_change_flags(dev, ifr->ifr_flags); - -		case SIOCSIFMETRIC:	/* Set the metric on the interface -					   (currently unused) */ -			return -EOPNOTSUPP; +	case SIOCSIFFLAGS:	/* Set interface flags */ +		return dev_change_flags(dev, ifr->ifr_flags); -		case SIOCSIFMTU:	/* Set the MTU of a device */ -			return dev_set_mtu(dev, ifr->ifr_mtu); +	case SIOCSIFMETRIC:	/* Set the metric on the interface +				   (currently unused) */ +		return -EOPNOTSUPP; -		case SIOCSIFHWADDR: -			return dev_set_mac_address(dev, &ifr->ifr_hwaddr); +	case SIOCSIFMTU:	/* Set the MTU of a device */ +		return dev_set_mtu(dev, ifr->ifr_mtu); -		case SIOCSIFHWBROADCAST: -			if (ifr->ifr_hwaddr.sa_family != dev->type) -				return -EINVAL; -			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, -			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); -			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); -			return 0; +	case SIOCSIFHWADDR: +		return dev_set_mac_address(dev, &ifr->ifr_hwaddr); -		case SIOCSIFMAP: -			if (ops->ndo_set_config) { -				if (!netif_device_present(dev)) -					return -ENODEV; -				return ops->ndo_set_config(dev, &ifr->ifr_map); -			} -			return -EOPNOTSUPP; +	case SIOCSIFHWBROADCAST: +		if (ifr->ifr_hwaddr.sa_family != dev->type) +			return -EINVAL; +		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, +		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); +		return 0; -		case SIOCADDMULTI: -			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || -			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) -				return -EINVAL; +	case SIOCSIFMAP: +		if (ops->ndo_set_config) {  			if (!netif_device_present(dev))  				return -ENODEV; -			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, -					  dev->addr_len, 1); +			return ops->ndo_set_config(dev, &ifr->ifr_map); +		} +		return -EOPNOTSUPP; -		case SIOCDELMULTI: -			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || -			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) -				return -EINVAL; -			if (!netif_device_present(dev)) -				return -ENODEV; -			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, -					     dev->addr_len, 1); +	case SIOCADDMULTI: +		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || +		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) +			return -EINVAL; +		if (!netif_device_present(dev)) +			return -ENODEV; +		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); -		case SIOCSIFTXQLEN: -			if (ifr->ifr_qlen < 0) -				return -EINVAL; -			dev->tx_queue_len = ifr->ifr_qlen; -			return 0; +	case SIOCDELMULTI: +		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || +		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) +			return -EINVAL; +		if (!netif_device_present(dev)) +			return -ENODEV; +		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); -		case SIOCSIFNAME: -			ifr->ifr_newname[IFNAMSIZ-1] = '\0'; -			return dev_change_name(dev, ifr->ifr_newname); +	case SIOCSIFTXQLEN: +		if (ifr->ifr_qlen < 0) +			return -EINVAL; +		dev->tx_queue_len = ifr->ifr_qlen; +		return 0; -		/* -		 *	Unknown or private 
ioctl -		 */ +	case SIOCSIFNAME: +		ifr->ifr_newname[IFNAMSIZ-1] = '\0'; +		return dev_change_name(dev, ifr->ifr_newname); -		default: -			if ((cmd >= SIOCDEVPRIVATE && -			    cmd <= SIOCDEVPRIVATE + 15) || -			    cmd == SIOCBONDENSLAVE || -			    cmd == SIOCBONDRELEASE || -			    cmd == SIOCBONDSETHWADDR || -			    cmd == SIOCBONDSLAVEINFOQUERY || -			    cmd == SIOCBONDINFOQUERY || -			    cmd == SIOCBONDCHANGEACTIVE || -			    cmd == SIOCGMIIPHY || -			    cmd == SIOCGMIIREG || -			    cmd == SIOCSMIIREG || -			    cmd == SIOCBRADDIF || -			    cmd == SIOCBRDELIF || -			    cmd == SIOCSHWTSTAMP || -			    cmd == SIOCWANDEV) { -				err = -EOPNOTSUPP; -				if (ops->ndo_do_ioctl) { -					if (netif_device_present(dev)) -						err = ops->ndo_do_ioctl(dev, ifr, cmd); -					else -						err = -ENODEV; -				} -			} else -				err = -EINVAL; +	/* +	 *	Unknown or private ioctl +	 */ +	default: +		if ((cmd >= SIOCDEVPRIVATE && +		    cmd <= SIOCDEVPRIVATE + 15) || +		    cmd == SIOCBONDENSLAVE || +		    cmd == SIOCBONDRELEASE || +		    cmd == SIOCBONDSETHWADDR || +		    cmd == SIOCBONDSLAVEINFOQUERY || +		    cmd == SIOCBONDINFOQUERY || +		    cmd == SIOCBONDCHANGEACTIVE || +		    cmd == SIOCGMIIPHY || +		    cmd == SIOCGMIIREG || +		    cmd == SIOCSMIIREG || +		    cmd == SIOCBRADDIF || +		    cmd == SIOCBRDELIF || +		    cmd == SIOCSHWTSTAMP || +		    cmd == SIOCWANDEV) { +			err = -EOPNOTSUPP; +			if (ops->ndo_do_ioctl) { +				if (netif_device_present(dev)) +					err = ops->ndo_do_ioctl(dev, ifr, cmd); +				else +					err = -ENODEV; +			} +		} else +			err = -EINVAL;  	}  	return err; @@ -4418,135 +4817,135 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)  	 */  	switch (cmd) { -		/* -		 *	These ioctl calls: -		 *	- can be done by all. -		 *	- atomic and do not require locking. -		 *	- return a value -		 */ -		case SIOCGIFFLAGS: -		case SIOCGIFMETRIC: -		case SIOCGIFMTU: -		case SIOCGIFHWADDR: -		case SIOCGIFSLAVE: -		case SIOCGIFMAP: -		case SIOCGIFINDEX: -		case SIOCGIFTXQLEN: -			dev_load(net, ifr.ifr_name); -			read_lock(&dev_base_lock); -			ret = dev_ifsioc_locked(net, &ifr, cmd); -			read_unlock(&dev_base_lock); -			if (!ret) { -				if (colon) -					*colon = ':'; -				if (copy_to_user(arg, &ifr, -						 sizeof(struct ifreq))) -					ret = -EFAULT; -			} -			return ret; +	/* +	 *	These ioctl calls: +	 *	- can be done by all. +	 *	- atomic and do not require locking. +	 *	- return a value +	 */ +	case SIOCGIFFLAGS: +	case SIOCGIFMETRIC: +	case SIOCGIFMTU: +	case SIOCGIFHWADDR: +	case SIOCGIFSLAVE: +	case SIOCGIFMAP: +	case SIOCGIFINDEX: +	case SIOCGIFTXQLEN: +		dev_load(net, ifr.ifr_name); +		rcu_read_lock(); +		ret = dev_ifsioc_locked(net, &ifr, cmd); +		rcu_read_unlock(); +		if (!ret) { +			if (colon) +				*colon = ':'; +			if (copy_to_user(arg, &ifr, +					 sizeof(struct ifreq))) +				ret = -EFAULT; +		} +		return ret; -		case SIOCETHTOOL: -			dev_load(net, ifr.ifr_name); -			rtnl_lock(); -			ret = dev_ethtool(net, &ifr); -			rtnl_unlock(); -			if (!ret) { -				if (colon) -					*colon = ':'; -				if (copy_to_user(arg, &ifr, -						 sizeof(struct ifreq))) -					ret = -EFAULT; -			} -			return ret; +	case SIOCETHTOOL: +		dev_load(net, ifr.ifr_name); +		rtnl_lock(); +		ret = dev_ethtool(net, &ifr); +		rtnl_unlock(); +		if (!ret) { +			if (colon) +				*colon = ':'; +			if (copy_to_user(arg, &ifr, +					 sizeof(struct ifreq))) +				ret = -EFAULT; +		} +		return ret; -		/* -		 *	These ioctl calls: -		 *	- require superuser power. -		 *	- require strict serialization. 
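The SIOCGIFxxx group dispatched above is read-only and is now served under rcu_read_lock() rather than dev_base_lock; the userspace side of the call is unchanged. A small standalone program exercising SIOCGIFFLAGS (interface name taken from the command line):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(int argc, char **argv)
{
	struct ifreq ifr;
	int fd;

	if (argc < 2)
		return 1;

	fd = socket(AF_INET, SOCK_DGRAM, 0);	/* any socket works as an ioctl handle */
	if (fd < 0)
		return 1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, argv[1], IFNAMSIZ - 1);

	if (ioctl(fd, SIOCGIFFLAGS, &ifr) < 0) {
		perror("SIOCGIFFLAGS");
		close(fd);
		return 1;
	}

	printf("%s is %s\n", ifr.ifr_name,
	       (ifr.ifr_flags & IFF_UP) ? "up" : "down");
	close(fd);
	return 0;
}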
-		 *	- return a value -		 */ -		case SIOCGMIIPHY: -		case SIOCGMIIREG: -		case SIOCSIFNAME: -			if (!capable(CAP_NET_ADMIN)) -				return -EPERM; -			dev_load(net, ifr.ifr_name); -			rtnl_lock(); -			ret = dev_ifsioc(net, &ifr, cmd); -			rtnl_unlock(); -			if (!ret) { -				if (colon) -					*colon = ':'; -				if (copy_to_user(arg, &ifr, -						 sizeof(struct ifreq))) -					ret = -EFAULT; -			} -			return ret; +	/* +	 *	These ioctl calls: +	 *	- require superuser power. +	 *	- require strict serialization. +	 *	- return a value +	 */ +	case SIOCGMIIPHY: +	case SIOCGMIIREG: +	case SIOCSIFNAME: +		if (!capable(CAP_NET_ADMIN)) +			return -EPERM; +		dev_load(net, ifr.ifr_name); +		rtnl_lock(); +		ret = dev_ifsioc(net, &ifr, cmd); +		rtnl_unlock(); +		if (!ret) { +			if (colon) +				*colon = ':'; +			if (copy_to_user(arg, &ifr, +					 sizeof(struct ifreq))) +				ret = -EFAULT; +		} +		return ret; -		/* -		 *	These ioctl calls: -		 *	- require superuser power. -		 *	- require strict serialization. -		 *	- do not return a value -		 */ -		case SIOCSIFFLAGS: -		case SIOCSIFMETRIC: -		case SIOCSIFMTU: -		case SIOCSIFMAP: -		case SIOCSIFHWADDR: -		case SIOCSIFSLAVE: -		case SIOCADDMULTI: -		case SIOCDELMULTI: -		case SIOCSIFHWBROADCAST: -		case SIOCSIFTXQLEN: -		case SIOCSMIIREG: -		case SIOCBONDENSLAVE: -		case SIOCBONDRELEASE: -		case SIOCBONDSETHWADDR: -		case SIOCBONDCHANGEACTIVE: -		case SIOCBRADDIF: -		case SIOCBRDELIF: -		case SIOCSHWTSTAMP: -			if (!capable(CAP_NET_ADMIN)) -				return -EPERM; -			/* fall through */ -		case SIOCBONDSLAVEINFOQUERY: -		case SIOCBONDINFOQUERY: +	/* +	 *	These ioctl calls: +	 *	- require superuser power. +	 *	- require strict serialization. +	 *	- do not return a value +	 */ +	case SIOCSIFFLAGS: +	case SIOCSIFMETRIC: +	case SIOCSIFMTU: +	case SIOCSIFMAP: +	case SIOCSIFHWADDR: +	case SIOCSIFSLAVE: +	case SIOCADDMULTI: +	case SIOCDELMULTI: +	case SIOCSIFHWBROADCAST: +	case SIOCSIFTXQLEN: +	case SIOCSMIIREG: +	case SIOCBONDENSLAVE: +	case SIOCBONDRELEASE: +	case SIOCBONDSETHWADDR: +	case SIOCBONDCHANGEACTIVE: +	case SIOCBRADDIF: +	case SIOCBRDELIF: +	case SIOCSHWTSTAMP: +		if (!capable(CAP_NET_ADMIN)) +			return -EPERM; +		/* fall through */ +	case SIOCBONDSLAVEINFOQUERY: +	case SIOCBONDINFOQUERY: +		dev_load(net, ifr.ifr_name); +		rtnl_lock(); +		ret = dev_ifsioc(net, &ifr, cmd); +		rtnl_unlock(); +		return ret; + +	case SIOCGIFMEM: +		/* Get the per device memory space. We can add this but +		 * currently do not support it */ +	case SIOCSIFMEM: +		/* Set the per device memory buffer space. +		 * Not applicable in our case */ +	case SIOCSIFLINK: +		return -EINVAL; + +	/* +	 *	Unknown or private ioctl. +	 */ +	default: +		if (cmd == SIOCWANDEV || +		    (cmd >= SIOCDEVPRIVATE && +		     cmd <= SIOCDEVPRIVATE + 15)) {  			dev_load(net, ifr.ifr_name);  			rtnl_lock();  			ret = dev_ifsioc(net, &ifr, cmd);  			rtnl_unlock(); +			if (!ret && copy_to_user(arg, &ifr, +						 sizeof(struct ifreq))) +				ret = -EFAULT;  			return ret; - -		case SIOCGIFMEM: -			/* Get the per device memory space. We can add this but -			 * currently do not support it */ -		case SIOCSIFMEM: -			/* Set the per device memory buffer space. -			 * Not applicable in our case */ -		case SIOCSIFLINK: -			return -EINVAL; - -		/* -		 *	Unknown or private ioctl. 
-		 */ -		default: -			if (cmd == SIOCWANDEV || -			    (cmd >= SIOCDEVPRIVATE && -			     cmd <= SIOCDEVPRIVATE + 15)) { -				dev_load(net, ifr.ifr_name); -				rtnl_lock(); -				ret = dev_ifsioc(net, &ifr, cmd); -				rtnl_unlock(); -				if (!ret && copy_to_user(arg, &ifr, -							 sizeof(struct ifreq))) -					ret = -EFAULT; -				return ret; -			} -			/* Take care of Wireless Extensions */ -			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) -				return wext_handle_ioctl(net, &ifr, cmd, arg); -			return -EINVAL; +		} +		/* Take care of Wireless Extensions */ +		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) +			return wext_handle_ioctl(net, &ifr, cmd, arg); +		return -EINVAL;  	}  } @@ -4578,74 +4977,88 @@ static void net_set_todo(struct net_device *dev)  	list_add_tail(&dev->todo_list, &net_todo_list);  } -static void rollback_registered(struct net_device *dev) +static void rollback_registered_many(struct list_head *head)  { +	struct net_device *dev, *tmp; +  	BUG_ON(dev_boot_phase);  	ASSERT_RTNL(); -	/* Some devices call without registering for initialization unwind. */ -	if (dev->reg_state == NETREG_UNINITIALIZED) { -		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " -				  "was registered\n", dev->name, dev); +	list_for_each_entry_safe(dev, tmp, head, unreg_list) { +		/* Some devices call without registering +		 * for initialization unwind. Remove those +		 * devices and proceed with the remaining. +		 */ +		if (dev->reg_state == NETREG_UNINITIALIZED) { +			pr_debug("unregister_netdevice: device %s/%p never " +				 "was registered\n", dev->name, dev); -		WARN_ON(1); -		return; -	} +			WARN_ON(1); +			list_del(&dev->unreg_list); +			continue; +		} -	BUG_ON(dev->reg_state != NETREG_REGISTERED); +		BUG_ON(dev->reg_state != NETREG_REGISTERED); +	}  	/* If device is running, close it first. */ -	dev_close(dev); +	dev_close_many(head); -	/* And unlink it from device chain. */ -	unlist_netdevice(dev); +	list_for_each_entry(dev, head, unreg_list) { +		/* And unlink it from device chain. */ +		unlist_netdevice(dev); -	dev->reg_state = NETREG_UNREGISTERING; +		dev->reg_state = NETREG_UNREGISTERING; +	}  	synchronize_net(); -	/* Shutdown queueing discipline. */ -	dev_shutdown(dev); +	list_for_each_entry(dev, head, unreg_list) { +		/* Shutdown queueing discipline. */ +		dev_shutdown(dev); -	/* Notify protocols, that we are about to destroy -	   this device. They should clean all the things. -	*/ -	call_netdevice_notifiers(NETDEV_UNREGISTER, dev); +		/* Notify protocols, that we are about to destroy +		   this device. They should clean all the things. +		*/ +		call_netdevice_notifiers(NETDEV_UNREGISTER, dev); -	/* -	 *	Flush the unicast and multicast chains -	 */ -	dev_unicast_flush(dev); -	dev_addr_discard(dev); +		if (!dev->rtnl_link_ops || +		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) +			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); -	if (dev->netdev_ops->ndo_uninit) -		dev->netdev_ops->ndo_uninit(dev); +		/* +		 *	Flush the unicast and multicast chains +		 */ +		dev_uc_flush(dev); +		dev_mc_flush(dev); -	/* Notifier chain MUST detach us from master device. */ -	WARN_ON(dev->master); +		if (dev->netdev_ops->ndo_uninit) +			dev->netdev_ops->ndo_uninit(dev); -	/* Remove entries from kobject tree */ -	netdev_unregister_kobject(dev); +		/* Notifier chain MUST detach us from master device. 
*/ +		WARN_ON(dev->master); -	synchronize_net(); +		/* Remove entries from kobject tree */ +		netdev_unregister_kobject(dev); +	} -	dev_put(dev); -} +	/* Process any work delayed until the end of the batch */ +	dev = list_first_entry(head, struct net_device, unreg_list); +	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); -static void __netdev_init_queue_locks_one(struct net_device *dev, -					  struct netdev_queue *dev_queue, -					  void *_unused) -{ -	spin_lock_init(&dev_queue->_xmit_lock); -	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); -	dev_queue->xmit_lock_owner = -1; +	rcu_barrier(); + +	list_for_each_entry(dev, head, unreg_list) +		dev_put(dev);  } -static void netdev_init_queue_locks(struct net_device *dev) +static void rollback_registered(struct net_device *dev)  { -	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); -	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); +	LIST_HEAD(single); + +	list_add(&dev->unreg_list, &single); +	rollback_registered_many(&single);  }  unsigned long netdev_fix_features(unsigned long features, const char *name) @@ -4668,10 +5081,13 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)  	}  	if (features & NETIF_F_UFO) { -		if (!(features & NETIF_F_GEN_CSUM)) { +		/* maybe split UFO into V4 and V6? */ +		if (!((features & NETIF_F_GEN_CSUM) || +		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) +			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {  			if (name)  				printk(KERN_ERR "%s: Dropping NETIF_F_UFO " -				       "since no NETIF_F_HW_CSUM feature.\n", +				       "since no checksum offload features.\n",  				       name);  			features &= ~NETIF_F_UFO;  		} @@ -4689,6 +5105,86 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)  EXPORT_SYMBOL(netdev_fix_features);  /** + *	netif_stacked_transfer_operstate -	transfer operstate + *	@rootdev: the root or lower level device to transfer state from + *	@dev: the device to transfer operstate to + * + *	Transfer operational state from root to device. This is normally + *	called when a stacking relationship exists between the root + *	device and the device(a leaf device). 
+ */ +void netif_stacked_transfer_operstate(const struct net_device *rootdev, +					struct net_device *dev) +{ +	if (rootdev->operstate == IF_OPER_DORMANT) +		netif_dormant_on(dev); +	else +		netif_dormant_off(dev); + +	if (netif_carrier_ok(rootdev)) { +		if (!netif_carrier_ok(dev)) +			netif_carrier_on(dev); +	} else { +		if (netif_carrier_ok(dev)) +			netif_carrier_off(dev); +	} +} +EXPORT_SYMBOL(netif_stacked_transfer_operstate); + +#ifdef CONFIG_RPS +static int netif_alloc_rx_queues(struct net_device *dev) +{ +	unsigned int i, count = dev->num_rx_queues; +	struct netdev_rx_queue *rx; + +	BUG_ON(count < 1); + +	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); +	if (!rx) { +		pr_err("netdev: Unable to allocate %u rx queues.\n", count); +		return -ENOMEM; +	} +	dev->_rx = rx; + +	for (i = 0; i < count; i++) +		rx[i].dev = dev; +	return 0; +} +#endif + +static void netdev_init_one_queue(struct net_device *dev, +				  struct netdev_queue *queue, void *_unused) +{ +	/* Initialize queue lock */ +	spin_lock_init(&queue->_xmit_lock); +	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); +	queue->xmit_lock_owner = -1; +	netdev_queue_numa_node_write(queue, NUMA_NO_NODE); +	queue->dev = dev; +} + +static int netif_alloc_netdev_queues(struct net_device *dev) +{ +	unsigned int count = dev->num_tx_queues; +	struct netdev_queue *tx; + +	BUG_ON(count < 1); + +	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); +	if (!tx) { +		pr_err("netdev: Unable to allocate %u tx queues.\n", +		       count); +		return -ENOMEM; +	} +	dev->_tx = tx; + +	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); +	spin_lock_init(&dev->tx_global_lock); + +	return 0; +} + +/**   *	register_netdevice	- register a network device   *	@dev: device to register   * @@ -4707,8 +5203,6 @@ EXPORT_SYMBOL(netdev_fix_features);  int register_netdevice(struct net_device *dev)  { -	struct hlist_head *head; -	struct hlist_node *p;  	int ret;  	struct net *net = dev_net(dev); @@ -4723,7 +5217,6 @@ int register_netdevice(struct net_device *dev)  	spin_lock_init(&dev->addr_list_lock);  	netdev_set_addr_lockdep_class(dev); -	netdev_init_queue_locks(dev);  	dev->iflink = -1; @@ -4737,26 +5230,14 @@ int register_netdevice(struct net_device *dev)  		}  	} -	if (!dev_valid_name(dev->name)) { -		ret = -EINVAL; +	ret = dev_get_valid_name(dev, dev->name, 0); +	if (ret)  		goto err_uninit; -	}  	dev->ifindex = dev_new_index(net);  	if (dev->iflink == -1)  		dev->iflink = dev->ifindex; -	/* Check for existence of name */ -	head = dev_name_hash(net, dev->name); -	hlist_for_each(p, head) { -		struct net_device *d -			= hlist_entry(p, struct net_device, name_hlist); -		if (!strncmp(d->name, dev->name, IFNAMSIZ)) { -			ret = -EEXIST; -			goto err_uninit; -		} -	} -  	/* Fix illegal checksum combinations */  	if ((dev->features & NETIF_F_HW_CSUM) &&  	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { @@ -4778,7 +5259,17 @@ int register_netdevice(struct net_device *dev)  	if (dev->features & NETIF_F_SG)  		dev->features |= NETIF_F_GSO; -	netdev_initialize_kobject(dev); +	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default, +	 * vlan_dev_init() will do the dev->features check, so these features +	 * are enabled only if supported by underlying device. 
+	 */ +	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA); + +	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); +	ret = notifier_to_errno(ret); +	if (ret) +		goto err_uninit; +  	ret = netdev_register_kobject(dev);  	if (ret)  		goto err_uninit; @@ -4802,6 +5293,13 @@ int register_netdevice(struct net_device *dev)  		rollback_registered(dev);  		dev->reg_state = NETREG_UNREGISTERED;  	} +	/* +	 *	Prevent userspace races by waiting until the network +	 *	device is fully setup before sending notifications. +	 */ +	if (!dev->rtnl_link_ops || +	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) +		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);  out:  	return ret; @@ -4811,6 +5309,7 @@ err_uninit:  		dev->netdev_ops->ndo_uninit(dev);  	goto out;  } +EXPORT_SYMBOL(register_netdevice);  /**   *	init_dummy_netdev	- init a dummy network device for NAPI @@ -4836,9 +5335,6 @@ int init_dummy_netdev(struct net_device *dev)  	 */  	dev->reg_state = NETREG_DUMMY; -	/* initialize the ref count */ -	atomic_set(&dev->refcnt, 1); -  	/* NAPI wants this */  	INIT_LIST_HEAD(&dev->napi_list); @@ -4846,6 +5342,11 @@ int init_dummy_netdev(struct net_device *dev)  	set_bit(__LINK_STATE_PRESENT, &dev->state);  	set_bit(__LINK_STATE_START, &dev->state); +	/* Note : We dont allocate pcpu_refcnt for dummy devices, +	 * because users of this 'device' dont need to change +	 * its refcount. +	 */ +  	return 0;  }  EXPORT_SYMBOL_GPL(init_dummy_netdev); @@ -4887,6 +5388,16 @@ out:  }  EXPORT_SYMBOL(register_netdev); +int netdev_refcnt_read(const struct net_device *dev) +{ +	int i, refcnt = 0; + +	for_each_possible_cpu(i) +		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); +	return refcnt; +} +EXPORT_SYMBOL(netdev_refcnt_read); +  /*   * netdev_wait_allrefs - wait until all references are gone.   * @@ -4901,14 +5412,21 @@ EXPORT_SYMBOL(register_netdev);  static void netdev_wait_allrefs(struct net_device *dev)  {  	unsigned long rebroadcast_time, warning_time; +	int refcnt; + +	linkwatch_forget_dev(dev);  	rebroadcast_time = warning_time = jiffies; -	while (atomic_read(&dev->refcnt) != 0) { +	refcnt = netdev_refcnt_read(dev); + +	while (refcnt != 0) {  		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {  			rtnl_lock();  			/* Rebroadcast unregister notification */  			call_netdevice_notifiers(NETDEV_UNREGISTER, dev); +			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users +			 * should have already handle it the first time */  			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,  				     &dev->state)) { @@ -4928,11 +5446,13 @@ static void netdev_wait_allrefs(struct net_device *dev)  		msleep(250); +		refcnt = netdev_refcnt_read(dev); +  		if (time_after(jiffies, warning_time + 10 * HZ)) {  			printk(KERN_EMERG "unregister_netdevice: "  			       "waiting for %s to become free. 
Usage "  			       "count = %d\n", -			       dev->name, atomic_read(&dev->refcnt)); +			       dev->name, refcnt);  			warning_time = jiffies;  		}  	} @@ -4973,7 +5493,7 @@ void netdev_run_todo(void)  	while (!list_empty(&list)) {  		struct net_device *dev -			= list_entry(list.next, struct net_device, todo_list); +			= list_first_entry(&list, struct net_device, todo_list);  		list_del(&dev->todo_list);  		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { @@ -4990,9 +5510,9 @@ void netdev_run_todo(void)  		netdev_wait_allrefs(dev);  		/* paranoia */ -		BUG_ON(atomic_read(&dev->refcnt)); -		WARN_ON(dev->ip_ptr); -		WARN_ON(dev->ip6_ptr); +		BUG_ON(netdev_refcnt_read(dev)); +		WARN_ON(rcu_dereference_raw(dev->ip_ptr)); +		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));  		WARN_ON(dev->dn_ptr);  		if (dev->destructor) @@ -5004,76 +5524,137 @@ void netdev_run_todo(void)  }  /** + *	dev_txq_stats_fold - fold tx_queues stats + *	@dev: device to get statistics from + *	@stats: struct rtnl_link_stats64 to hold results + */ +void dev_txq_stats_fold(const struct net_device *dev, +			struct rtnl_link_stats64 *stats) +{ +	u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0; +	unsigned int i; +	struct netdev_queue *txq; + +	for (i = 0; i < dev->num_tx_queues; i++) { +		txq = netdev_get_tx_queue(dev, i); +		spin_lock_bh(&txq->_xmit_lock); +		tx_bytes   += txq->tx_bytes; +		tx_packets += txq->tx_packets; +		tx_dropped += txq->tx_dropped; +		spin_unlock_bh(&txq->_xmit_lock); +	} +	if (tx_bytes || tx_packets || tx_dropped) { +		stats->tx_bytes   = tx_bytes; +		stats->tx_packets = tx_packets; +		stats->tx_dropped = tx_dropped; +	} +} +EXPORT_SYMBOL(dev_txq_stats_fold); + +/* Convert net_device_stats to rtnl_link_stats64.  They have the same + * fields in the same order, with only the type differing. + */ +static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, +				    const struct net_device_stats *netdev_stats) +{ +#if BITS_PER_LONG == 64 +        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); +        memcpy(stats64, netdev_stats, sizeof(*stats64)); +#else +	size_t i, n = sizeof(*stats64) / sizeof(u64); +	const unsigned long *src = (const unsigned long *)netdev_stats; +	u64 *dst = (u64 *)stats64; + +	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != +		     sizeof(*stats64) / sizeof(u64)); +	for (i = 0; i < n; i++) +		dst[i] = src[i]; +#endif +} + +/**   *	dev_get_stats	- get network device statistics   *	@dev: device to get statistics from + *	@storage: place to store stats   * - *	Get network statistics from device. The device driver may provide - *	its own method by setting dev->netdev_ops->get_stats; otherwise - *	the internal statistics structure is used. + *	Get network statistics from device. Return @storage. + *	The device driver may provide its own method by setting + *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; + *	otherwise the internal statistics structure is used.   
*/ -const struct net_device_stats *dev_get_stats(struct net_device *dev) +struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, +					struct rtnl_link_stats64 *storage)  {  	const struct net_device_ops *ops = dev->netdev_ops; -	if (ops->ndo_get_stats) -		return ops->ndo_get_stats(dev); -	else { -		unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; -		struct net_device_stats *stats = &dev->stats; -		unsigned int i; -		struct netdev_queue *txq; - -		for (i = 0; i < dev->num_tx_queues; i++) { -			txq = netdev_get_tx_queue(dev, i); -			tx_bytes   += txq->tx_bytes; -			tx_packets += txq->tx_packets; -			tx_dropped += txq->tx_dropped; -		} -		if (tx_bytes || tx_packets || tx_dropped) { -			stats->tx_bytes   = tx_bytes; -			stats->tx_packets = tx_packets; -			stats->tx_dropped = tx_dropped; -		} -		return stats; +	if (ops->ndo_get_stats64) { +		memset(storage, 0, sizeof(*storage)); +		ops->ndo_get_stats64(dev, storage); +	} else if (ops->ndo_get_stats) { +		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); +	} else { +		netdev_stats_to_stats64(storage, &dev->stats); +		dev_txq_stats_fold(dev, storage);  	} +	storage->rx_dropped += atomic_long_read(&dev->rx_dropped); +	return storage;  }  EXPORT_SYMBOL(dev_get_stats); -static void netdev_init_one_queue(struct net_device *dev, -				  struct netdev_queue *queue, -				  void *_unused) +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)  { -	queue->dev = dev; -} +	struct netdev_queue *queue = dev_ingress_queue(dev); -static void netdev_init_queues(struct net_device *dev) -{ -	netdev_init_one_queue(dev, &dev->rx_queue, NULL); -	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); -	spin_lock_init(&dev->tx_global_lock); +#ifdef CONFIG_NET_CLS_ACT +	if (queue) +		return queue; +	queue = kzalloc(sizeof(*queue), GFP_KERNEL); +	if (!queue) +		return NULL; +	netdev_init_one_queue(dev, queue, NULL); +	queue->qdisc = &noop_qdisc; +	queue->qdisc_sleeping = &noop_qdisc; +	rcu_assign_pointer(dev->ingress_queue, queue); +#endif +	return queue;  }  /** - *	alloc_netdev_mq - allocate network device + *	alloc_netdev_mqs - allocate network device   *	@sizeof_priv:	size of private data to allocate space for   *	@name:		device name format string   *	@setup:		callback to initialize device - *	@queue_count:	the number of subqueues to allocate + *	@txqs:		the number of TX subqueues to allocate + *	@rxqs:		the number of RX subqueues to allocate   *   *	Allocates a struct net_device with private data area for driver use   *	and performs basic initialization.  Also allocates subquue structs - *	for each queue on the device at the end of the netdevice. + *	for each queue on the device.   
*/ -struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, -		void (*setup)(struct net_device *), unsigned int queue_count) +struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, +		void (*setup)(struct net_device *), +		unsigned int txqs, unsigned int rxqs)  { -	struct netdev_queue *tx;  	struct net_device *dev;  	size_t alloc_size;  	struct net_device *p;  	BUG_ON(strlen(name) >= sizeof(dev->name)); +	if (txqs < 1) { +		pr_err("alloc_netdev: Unable to allocate device " +		       "with zero queues.\n"); +		return NULL; +	} + +#ifdef CONFIG_RPS +	if (rxqs < 1) { +		pr_err("alloc_netdev: Unable to allocate device " +		       "with zero RX queues.\n"); +		return NULL; +	} +#endif +  	alloc_size = sizeof(struct net_device);  	if (sizeof_priv) {  		/* ensure 32-byte alignment of private area */ @@ -5089,45 +5670,57 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,  		return NULL;  	} -	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); -	if (!tx) { -		printk(KERN_ERR "alloc_netdev: Unable to allocate " -		       "tx qdiscs.\n"); -		goto free_p; -	} -  	dev = PTR_ALIGN(p, NETDEV_ALIGN);  	dev->padded = (char *)dev - (char *)p; +	dev->pcpu_refcnt = alloc_percpu(int); +	if (!dev->pcpu_refcnt) +		goto free_p; +  	if (dev_addr_init(dev)) -		goto free_tx; +		goto free_pcpu; -	dev_unicast_init(dev); +	dev_mc_init(dev); +	dev_uc_init(dev);  	dev_net_set(dev, &init_net); -	dev->_tx = tx; -	dev->num_tx_queues = queue_count; -	dev->real_num_tx_queues = queue_count; +	dev->num_tx_queues = txqs; +	dev->real_num_tx_queues = txqs; +	if (netif_alloc_netdev_queues(dev)) +		goto free_pcpu; -	dev->gso_max_size = GSO_MAX_SIZE; +#ifdef CONFIG_RPS +	dev->num_rx_queues = rxqs; +	dev->real_num_rx_queues = rxqs; +	if (netif_alloc_rx_queues(dev)) +		goto free_pcpu; +#endif -	netdev_init_queues(dev); +	dev->gso_max_size = GSO_MAX_SIZE; +	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); +	dev->ethtool_ntuple_list.count = 0;  	INIT_LIST_HEAD(&dev->napi_list); +	INIT_LIST_HEAD(&dev->unreg_list); +	INIT_LIST_HEAD(&dev->link_watch_list);  	dev->priv_flags = IFF_XMIT_DST_RELEASE;  	setup(dev);  	strcpy(dev->name, name);  	return dev; -free_tx: -	kfree(tx); +free_pcpu: +	free_percpu(dev->pcpu_refcnt); +	kfree(dev->_tx); +#ifdef CONFIG_RPS +	kfree(dev->_rx); +#endif  free_p:  	kfree(p);  	return NULL;  } -EXPORT_SYMBOL(alloc_netdev_mq); +EXPORT_SYMBOL(alloc_netdev_mqs);  /**   *	free_netdev - free network device @@ -5144,13 +5737,24 @@ void free_netdev(struct net_device *dev)  	release_net(dev_net(dev));  	kfree(dev->_tx); +#ifdef CONFIG_RPS +	kfree(dev->_rx); +#endif + +	kfree(rcu_dereference_raw(dev->ingress_queue));  	/* Flush device addresses */  	dev_addr_flush(dev); +	/* Clear ethtool n-tuple list */ +	ethtool_ntuple_flush(dev); +  	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)  		netif_napi_del(p); +	free_percpu(dev->pcpu_refcnt); +	dev->pcpu_refcnt = NULL; +  	/*  Compatibility with error handling in drivers */  	if (dev->reg_state == NETREG_UNINITIALIZED) {  		kfree((char *)dev - dev->padded); @@ -5163,6 +5767,7 @@ void free_netdev(struct net_device *dev)  	/* will free via device release */  	put_device(&dev->dev);  } +EXPORT_SYMBOL(free_netdev);  /**   *	synchronize_net -  Synchronize with packet receive processing @@ -5175,26 +5780,50 @@ void synchronize_net(void)  	might_sleep();  	synchronize_rcu();  } +EXPORT_SYMBOL(synchronize_net);  /** - *	unregister_netdevice - remove device from the kernel + *	unregister_netdevice_queue - 
remove device from the kernel   *	@dev: device + *	@head: list   *   *	This function shuts down a device interface and removes it   *	from the kernel tables. + *	If head not NULL, device is queued to be unregistered later.   *   *	Callers must hold the rtnl semaphore.  You may want   *	unregister_netdev() instead of this.   */ -void unregister_netdevice(struct net_device *dev) +void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)  {  	ASSERT_RTNL(); -	rollback_registered(dev); -	/* Finish processing unregister after unlock */ -	net_set_todo(dev); +	if (head) { +		list_move_tail(&dev->unreg_list, head); +	} else { +		rollback_registered(dev); +		/* Finish processing unregister after unlock */ +		net_set_todo(dev); +	} +} +EXPORT_SYMBOL(unregister_netdevice_queue); + +/** + *	unregister_netdevice_many - unregister many devices + *	@head: list of devices + */ +void unregister_netdevice_many(struct list_head *head) +{ +	struct net_device *dev; + +	if (!list_empty(head)) { +		rollback_registered_many(head); +		list_for_each_entry(dev, head, unreg_list) +			net_set_todo(dev); +	}  } +EXPORT_SYMBOL(unregister_netdevice_many);  /**   *	unregister_netdev - remove device from the kernel @@ -5213,7 +5842,6 @@ void unregister_netdev(struct net_device *dev)  	unregister_netdevice(dev);  	rtnl_unlock();  } -  EXPORT_SYMBOL(unregister_netdev);  /** @@ -5232,8 +5860,6 @@ EXPORT_SYMBOL(unregister_netdev);  int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)  { -	char buf[IFNAMSIZ]; -	const char *destname;  	int err;  	ASSERT_RTNL(); @@ -5243,15 +5869,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	if (dev->features & NETIF_F_NETNS_LOCAL)  		goto out; -#ifdef CONFIG_SYSFS -	/* Don't allow real devices to be moved when sysfs -	 * is enabled. -	 */ -	err = -EINVAL; -	if (dev->dev.parent) -		goto out; -#endif -  	/* Ensure the device has been registrered */  	err = -EINVAL;  	if (dev->reg_state != NETREG_REGISTERED) @@ -5266,20 +5883,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	 * we can use it in the destination network namespace.  	 */  	err = -EEXIST; -	destname = dev->name; -	if (__dev_get_by_name(net, destname)) { +	if (__dev_get_by_name(net, dev->name)) {  		/* We get here if we can't use the current device name */  		if (!pat)  			goto out; -		if (!dev_valid_name(pat)) -			goto out; -		if (strchr(pat, '%')) { -			if (__dev_alloc_name(net, pat, buf) < 0) -				goto out; -			destname = buf; -		} else -			destname = pat; -		if (__dev_get_by_name(net, destname)) +		if (dev_get_valid_name(dev, pat, 1))  			goto out;  	} @@ -5301,24 +5909,23 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	/* Notify protocols, that we are about to destroy  	   this device. They should clean all the things. + +	   Note that dev->reg_state stays at NETREG_REGISTERED. +	   This is wanted because this way 8021q and macvlan know +	   the device is just moving and can keep their slaves up.  	
*/  	call_netdevice_notifiers(NETDEV_UNREGISTER, dev); +	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);  	/*  	 *	Flush the unicast and multicast chains  	 */ -	dev_unicast_flush(dev); -	dev_addr_discard(dev); - -	netdev_unregister_kobject(dev); +	dev_uc_flush(dev); +	dev_mc_flush(dev);  	/* Actually switch the network namespace */  	dev_net_set(dev, net); -	/* Assign the new device name */ -	if (destname != dev->name) -		strcpy(dev->name, destname); -  	/* If there is an ifindex conflict assign a new one */  	if (__dev_get_by_index(net, dev->ifindex)) {  		int iflink = (dev->iflink == dev->ifindex); @@ -5328,7 +5935,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	}  	/* Fixup kobjects */ -	err = netdev_register_kobject(dev); +	err = device_rename(&dev->dev, dev->name);  	WARN_ON(err);  	/* Add the device back in the hashes */ @@ -5337,18 +5944,24 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	/* Notify protocols, that a new device appeared. */  	call_netdevice_notifiers(NETDEV_REGISTER, dev); +	/* +	 *	Prevent userspace races by waiting until the network +	 *	device is fully setup before sending notifications. +	 */ +	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); +  	synchronize_net();  	err = 0;  out:  	return err;  } +EXPORT_SYMBOL_GPL(dev_change_net_namespace);  static int dev_cpu_callback(struct notifier_block *nfb,  			    unsigned long action,  			    void *ocpu)  {  	struct sk_buff **list_skb; -	struct Qdisc **list_net;  	struct sk_buff *skb;  	unsigned int cpu, oldcpu = (unsigned long)ocpu;  	struct softnet_data *sd, *oldsd; @@ -5369,20 +5982,26 @@ static int dev_cpu_callback(struct notifier_block *nfb,  	*list_skb = oldsd->completion_queue;  	oldsd->completion_queue = NULL; -	/* Find end of our output_queue. */ -	list_net = &sd->output_queue; -	while (*list_net) -		list_net = &(*list_net)->next_sched;  	/* Append output queue from offline CPU. */ -	*list_net = oldsd->output_queue; -	oldsd->output_queue = NULL; +	if (oldsd->output_queue) { +		*sd->output_queue_tailp = oldsd->output_queue; +		sd->output_queue_tailp = oldsd->output_queue_tailp; +		oldsd->output_queue = NULL; +		oldsd->output_queue_tailp = &oldsd->output_queue; +	}  	raise_softirq_irqoff(NET_TX_SOFTIRQ);  	local_irq_enable();  	/* Process offline CPU's input_pkt_queue */ -	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) +	while ((skb = __skb_dequeue(&oldsd->process_queue))) { +		netif_rx(skb); +		input_queue_head_incr(oldsd); +	} +	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {  		netif_rx(skb); +		input_queue_head_incr(oldsd); +	}  	return NOTIFY_OK;  } @@ -5402,7 +6021,7 @@ unsigned long netdev_increment_features(unsigned long all, unsigned long one,  					unsigned long mask)  {  	/* If device needs checksumming, downgrade to it. */ -        if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) +	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))  		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);  	else if (mask & NETIF_F_ALL_CSUM) {  		/* If one device supports v4/v6 checksumming, set for all. 
*/ @@ -5422,7 +6041,7 @@ unsigned long netdev_increment_features(unsigned long all, unsigned long one,  	one |= NETIF_F_ALL_CSUM;  	one |= all & NETIF_F_ONE_FOR_ALL; -	all &= one | NETIF_F_LLTX | NETIF_F_GSO; +	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;  	all |= one & mask & NETIF_F_ONE_FOR_ALL;  	return all; @@ -5491,6 +6110,68 @@ char *netdev_drivername(const struct net_device *dev, char *buffer, int len)  	return buffer;  } +static int __netdev_printk(const char *level, const struct net_device *dev, +			   struct va_format *vaf) +{ +	int r; + +	if (dev && dev->dev.parent) +		r = dev_printk(level, dev->dev.parent, "%s: %pV", +			       netdev_name(dev), vaf); +	else if (dev) +		r = printk("%s%s: %pV", level, netdev_name(dev), vaf); +	else +		r = printk("%s(NULL net_device): %pV", level, vaf); + +	return r; +} + +int netdev_printk(const char *level, const struct net_device *dev, +		  const char *format, ...) +{ +	struct va_format vaf; +	va_list args; +	int r; + +	va_start(args, format); + +	vaf.fmt = format; +	vaf.va = &args; + +	r = __netdev_printk(level, dev, &vaf); +	va_end(args); + +	return r; +} +EXPORT_SYMBOL(netdev_printk); + +#define define_netdev_printk_level(func, level)			\ +int func(const struct net_device *dev, const char *fmt, ...)	\ +{								\ +	int r;							\ +	struct va_format vaf;					\ +	va_list args;						\ +								\ +	va_start(args, fmt);					\ +								\ +	vaf.fmt = fmt;						\ +	vaf.va = &args;						\ +								\ +	r = __netdev_printk(level, dev, &vaf);			\ +	va_end(args);						\ +								\ +	return r;						\ +}								\ +EXPORT_SYMBOL(func); + +define_netdev_printk_level(netdev_emerg, KERN_EMERG); +define_netdev_printk_level(netdev_alert, KERN_ALERT); +define_netdev_printk_level(netdev_crit, KERN_CRIT); +define_netdev_printk_level(netdev_err, KERN_ERR); +define_netdev_printk_level(netdev_warn, KERN_WARNING); +define_netdev_printk_level(netdev_notice, KERN_NOTICE); +define_netdev_printk_level(netdev_info, KERN_INFO); +  static void __net_exit netdev_exit(struct net *net)  {  	kfree(net->dev_name_head); @@ -5504,14 +6185,13 @@ static struct pernet_operations __net_initdata netdev_net_ops = {  static void __net_exit default_device_exit(struct net *net)  { -	struct net_device *dev; +	struct net_device *dev, *aux;  	/* -	 * Push all migratable of the network devices back to the +	 * Push all migratable network devices back to the  	 * initial network namespace  	 */  	rtnl_lock(); -restart: -	for_each_netdev(net, dev) { +	for_each_netdev_safe(net, dev, aux) {  		int err;  		char fb_name[IFNAMSIZ]; @@ -5519,11 +6199,9 @@ restart:  		if (dev->features & NETIF_F_NETNS_LOCAL)  			continue; -		/* Delete virtual devices */ -		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { -			dev->rtnl_link_ops->dellink(dev); -			goto restart; -		} +		/* Leave virtual devices for the generic cleanup */ +		if (dev->rtnl_link_ops) +			continue;  		/* Push remaing network devices to init_net */  		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); @@ -5533,13 +6211,37 @@ restart:  				__func__, dev->name, err);  			BUG();  		} -		goto restart;  	}  	rtnl_unlock();  } +static void __net_exit default_device_exit_batch(struct list_head *net_list) +{ +	/* At exit all network devices most be removed from a network +	 * namespace.  Do this in the reverse order of registeration. +	 * Do this across as many network namespaces as possible to +	 * improve batching efficiency. 
+	 */ +	struct net_device *dev; +	struct net *net; +	LIST_HEAD(dev_kill_list); + +	rtnl_lock(); +	list_for_each_entry(net, net_list, exit_list) { +		for_each_netdev_reverse(net, dev) { +			if (dev->rtnl_link_ops) +				dev->rtnl_link_ops->dellink(dev, &dev_kill_list); +			else +				unregister_netdevice_queue(dev, &dev_kill_list); +		} +	} +	unregister_netdevice_many(&dev_kill_list); +	rtnl_unlock(); +} +  static struct pernet_operations __net_initdata default_device_ops = {  	.exit = default_device_exit, +	.exit_batch = default_device_exit_batch,  };  /* @@ -5577,17 +6279,26 @@ static int __init net_dev_init(void)  	 */  	for_each_possible_cpu(i) { -		struct softnet_data *queue; +		struct softnet_data *sd = &per_cpu(softnet_data, i); -		queue = &per_cpu(softnet_data, i); -		skb_queue_head_init(&queue->input_pkt_queue); -		queue->completion_queue = NULL; -		INIT_LIST_HEAD(&queue->poll_list); +		memset(sd, 0, sizeof(*sd)); +		skb_queue_head_init(&sd->input_pkt_queue); +		skb_queue_head_init(&sd->process_queue); +		sd->completion_queue = NULL; +		INIT_LIST_HEAD(&sd->poll_list); +		sd->output_queue = NULL; +		sd->output_queue_tailp = &sd->output_queue; +#ifdef CONFIG_RPS +		sd->csd.func = rps_trigger_softirq; +		sd->csd.info = sd; +		sd->csd.flags = 0; +		sd->cpu = i; +#endif -		queue->backlog.poll = process_backlog; -		queue->backlog.weight = weight_p; -		queue->backlog.gro_list = NULL; -		queue->backlog.gro_count = 0; +		sd->backlog.poll = process_backlog; +		sd->backlog.weight = weight_p; +		sd->backlog.gro_list = NULL; +		sd->backlog.gro_count = 0;  	}  	dev_boot_phase = 0; @@ -5622,47 +6333,9 @@ subsys_initcall(net_dev_init);  static int __init initialize_hashrnd(void)  { -	get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd)); +	get_random_bytes(&hashrnd, sizeof(hashrnd));  	return 0;  }  late_initcall_sync(initialize_hashrnd); -EXPORT_SYMBOL(__dev_get_by_index); -EXPORT_SYMBOL(__dev_get_by_name); -EXPORT_SYMBOL(__dev_remove_pack); -EXPORT_SYMBOL(dev_valid_name); -EXPORT_SYMBOL(dev_add_pack); -EXPORT_SYMBOL(dev_alloc_name); -EXPORT_SYMBOL(dev_close); -EXPORT_SYMBOL(dev_get_by_flags); -EXPORT_SYMBOL(dev_get_by_index); -EXPORT_SYMBOL(dev_get_by_name); -EXPORT_SYMBOL(dev_open); -EXPORT_SYMBOL(dev_queue_xmit); -EXPORT_SYMBOL(dev_remove_pack); -EXPORT_SYMBOL(dev_set_allmulti); -EXPORT_SYMBOL(dev_set_promiscuity); -EXPORT_SYMBOL(dev_change_flags); -EXPORT_SYMBOL(dev_set_mtu); -EXPORT_SYMBOL(dev_set_mac_address); -EXPORT_SYMBOL(free_netdev); -EXPORT_SYMBOL(netdev_boot_setup_check); -EXPORT_SYMBOL(netdev_set_master); -EXPORT_SYMBOL(netdev_state_change); -EXPORT_SYMBOL(netif_receive_skb); -EXPORT_SYMBOL(netif_rx); -EXPORT_SYMBOL(register_gifconf); -EXPORT_SYMBOL(register_netdevice); -EXPORT_SYMBOL(register_netdevice_notifier); -EXPORT_SYMBOL(skb_checksum_help); -EXPORT_SYMBOL(synchronize_net); -EXPORT_SYMBOL(unregister_netdevice); -EXPORT_SYMBOL(unregister_netdevice_notifier); -EXPORT_SYMBOL(net_enable_timestamp); -EXPORT_SYMBOL(net_disable_timestamp); -EXPORT_SYMBOL(dev_get_flags); - -EXPORT_SYMBOL(dev_load); - -EXPORT_PER_CPU_SYMBOL(softnet_data);  | 
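For reference, the batch-unregistration helpers introduced by this patch (unregister_netdevice_queue() and unregister_netdevice_many()) let a caller pay the synchronize_net()/rcu_barrier() cost once per batch instead of once per device, exactly as default_device_exit_batch() does above. A minimal usage sketch follows; the foo_ports[] array and foo_cleanup_ports() are illustrative placeholders, not part of this patch:

        /* Sketch: queue several devices and tear them down in one batch,
         * mirroring default_device_exit_batch() in the hunk above.
         */
        #include <linux/kernel.h>
        #include <linux/list.h>
        #include <linux/netdevice.h>
        #include <linux/rtnetlink.h>

        static struct net_device *foo_ports[8];    /* hypothetical driver state */

        static void foo_cleanup_ports(void)
        {
                LIST_HEAD(kill_list);
                int i;

                rtnl_lock();
                for (i = 0; i < ARRAY_SIZE(foo_ports); i++)
                        if (foo_ports[i])
                                unregister_netdevice_queue(foo_ports[i], &kill_list);
                /* One rollback_registered_many() pass and a single RCU grace
                 * period for the whole list, rather than one per device.
                 */
                unregister_netdevice_many(&kill_list);
                rtnl_unlock();
        }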
