Diffstat (limited to 'net/core/dev.c')
 -rw-r--r--  net/core/dev.c | 1407
 1 file changed, 941 insertions(+), 466 deletions(-)
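
Among the interface reworks in the diff below is the deferred skb-free path: dev_kfree_skb_irq()/dev_kfree_skb_any() are reshaped into __dev_kfree_skb_irq()/__dev_kfree_skb_any() taking an skb_free_reason, which net_tx_action() later uses to emit trace_consume_skb() for successfully transmitted packets and trace_kfree_skb() for drops. The following is a minimal sketch of how a driver's TX-completion path might call the new helper; the function and its arguments are hypothetical, only __dev_kfree_skb_any() and the SKB_REASON_* values are taken from the interfaces visible in this diff.

#include <linux/skbuff.h>
#include <linux/netdevice.h>

/* Hypothetical TX-completion helper (not part of this commit). */
static void example_tx_complete(struct sk_buff *skb, bool sent_ok)
{
	/* Safe from hard-irq or irqs-disabled context: the helper defers the
	 * free to the per-cpu completion queue and records the reason so
	 * net_tx_action() can pick the matching tracepoint.
	 */
	__dev_kfree_skb_any(skb, sent_ok ? SKB_REASON_CONSUMED
					 : SKB_REASON_DROPPED);
}
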
diff --git a/net/core/dev.c b/net/core/dev.c index 5c713f2239c..367a586d0c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -131,6 +131,7 @@  #include <linux/static_key.h>  #include <linux/hashtable.h>  #include <linux/vmalloc.h> +#include <linux/if_macvlan.h>  #include "net-sysfs.h" @@ -146,6 +147,11 @@ struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;  struct list_head ptype_all __read_mostly;	/* Taps */  static struct list_head offload_base __read_mostly; +static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, +					 struct net_device *dev, +					 struct netdev_notifier_info *info); +  /*   * The @dev_base_head list is protected by @dev_base_lock and the rtnl   * semaphore. @@ -479,7 +485,7 @@ EXPORT_SYMBOL(dev_add_offload);   *	and must not be freed until after all the CPU's have gone   *	through a quiescent state.   */ -void __dev_remove_offload(struct packet_offload *po) +static void __dev_remove_offload(struct packet_offload *po)  {  	struct list_head *head = &offload_base;  	struct packet_offload *po1; @@ -497,7 +503,6 @@ void __dev_remove_offload(struct packet_offload *po)  out:  	spin_unlock(&offload_lock);  } -EXPORT_SYMBOL(__dev_remove_offload);  /**   *	dev_remove_offload	 - remove packet offload handler @@ -1117,6 +1122,8 @@ rollback:  	write_seqcount_end(&devnet_rename_seq); +	netdev_adjacent_rename_links(dev, oldname); +  	write_lock_bh(&dev_base_lock);  	hlist_del_rcu(&dev->name_hlist);  	write_unlock_bh(&dev_base_lock); @@ -1136,6 +1143,7 @@ rollback:  			err = ret;  			write_seqcount_begin(&devnet_rename_seq);  			memcpy(dev->name, oldname, IFNAMSIZ); +			memcpy(oldname, newname, IFNAMSIZ);  			goto rollback;  		} else {  			pr_err("%s: name change rollback failed: %d\n", @@ -1202,8 +1210,12 @@ EXPORT_SYMBOL(netdev_features_change);  void netdev_state_change(struct net_device *dev)  {  	if (dev->flags & IFF_UP) { -		call_netdevice_notifiers(NETDEV_CHANGE, dev); -		rtmsg_ifinfo(RTM_NEWLINK, dev, 0); +		struct netdev_notifier_change_info change_info; + +		change_info.flags_changed = 0; +		call_netdevice_notifiers_info(NETDEV_CHANGE, dev, +					      &change_info.info); +		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);  	}  }  EXPORT_SYMBOL(netdev_state_change); @@ -1240,7 +1252,7 @@ static int __dev_open(struct net_device *dev)  	 * If we don't do this there is a chance ndo_poll_controller  	 * or ndo_poll may be running while we open the device  	 */ -	netpoll_rx_disable(dev); +	netpoll_poll_disable(dev);  	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);  	ret = notifier_to_errno(ret); @@ -1255,7 +1267,7 @@ static int __dev_open(struct net_device *dev)  	if (!ret && ops->ndo_open)  		ret = ops->ndo_open(dev); -	netpoll_rx_enable(dev); +	netpoll_poll_enable(dev);  	if (ret)  		clear_bit(__LINK_STATE_START, &dev->state); @@ -1293,7 +1305,7 @@ int dev_open(struct net_device *dev)  	if (ret < 0)  		return ret; -	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); +	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);  	call_netdevice_notifiers(NETDEV_UP, dev);  	return ret; @@ -1307,7 +1319,10 @@ static int __dev_close_many(struct list_head *head)  	ASSERT_RTNL();  	might_sleep(); -	list_for_each_entry(dev, head, unreg_list) { +	list_for_each_entry(dev, head, close_list) { +		/* Temporarily disable netpoll until the interface is down */ +		netpoll_poll_disable(dev); +  		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);  		clear_bit(__LINK_STATE_START, &dev->state); @@ -1318,12 +1333,12 @@ 
static int __dev_close_many(struct list_head *head)  		 * dev->stop() will invoke napi_disable() on all of it's  		 * napi_struct instances on this device.  		 */ -		smp_mb__after_clear_bit(); /* Commit netif_running(). */ +		smp_mb__after_atomic(); /* Commit netif_running(). */  	}  	dev_deactivate_many(head); -	list_for_each_entry(dev, head, unreg_list) { +	list_for_each_entry(dev, head, close_list) {  		const struct net_device_ops *ops = dev->netdev_ops;  		/* @@ -1338,6 +1353,7 @@ static int __dev_close_many(struct list_head *head)  		dev->flags &= ~IFF_UP;  		net_dmaengine_put(); +		netpoll_poll_enable(dev);  	}  	return 0; @@ -1348,35 +1364,30 @@ static int __dev_close(struct net_device *dev)  	int retval;  	LIST_HEAD(single); -	/* Temporarily disable netpoll until the interface is down */ -	netpoll_rx_disable(dev); - -	list_add(&dev->unreg_list, &single); +	list_add(&dev->close_list, &single);  	retval = __dev_close_many(&single);  	list_del(&single); -	netpoll_rx_enable(dev);  	return retval;  }  static int dev_close_many(struct list_head *head)  {  	struct net_device *dev, *tmp; -	LIST_HEAD(tmp_list); -	list_for_each_entry_safe(dev, tmp, head, unreg_list) +	/* Remove the devices that don't need to be closed */ +	list_for_each_entry_safe(dev, tmp, head, close_list)  		if (!(dev->flags & IFF_UP)) -			list_move(&dev->unreg_list, &tmp_list); +			list_del_init(&dev->close_list);  	__dev_close_many(head); -	list_for_each_entry(dev, head, unreg_list) { -		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); +	list_for_each_entry_safe(dev, tmp, head, close_list) { +		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);  		call_netdevice_notifiers(NETDEV_DOWN, dev); +		list_del_init(&dev->close_list);  	} -	/* rollback_registered_many needs the complete original list */ -	list_splice(&tmp_list, head);  	return 0;  } @@ -1394,14 +1405,9 @@ int dev_close(struct net_device *dev)  	if (dev->flags & IFF_UP) {  		LIST_HEAD(single); -		/* Block netpoll rx while the interface is going down */ -		netpoll_rx_disable(dev); - -		list_add(&dev->unreg_list, &single); +		list_add(&dev->close_list, &single);  		dev_close_many(&single);  		list_del(&single); - -		netpoll_rx_enable(dev);  	}  	return 0;  } @@ -1425,6 +1431,10 @@ void dev_disable_lro(struct net_device *dev)  	if (is_vlan_dev(dev))  		dev = vlan_dev_real_dev(dev); +	/* the same for macvlan devices */ +	if (netif_is_macvlan(dev)) +		dev = macvlan_dev_real_dev(dev); +  	dev->wanted_features &= ~NETIF_F_LRO;  	netdev_update_features(dev); @@ -1562,14 +1572,14 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);   *	are as for raw_notifier_call_chain().   
*/ -int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, -				  struct netdev_notifier_info *info) +static int call_netdevice_notifiers_info(unsigned long val, +					 struct net_device *dev, +					 struct netdev_notifier_info *info)  {  	ASSERT_RTNL();  	netdev_notifier_info_init(info, dev);  	return raw_notifier_call_chain(&netdev_chain, val, info);  } -EXPORT_SYMBOL(call_netdevice_notifiers_info);  /**   *	call_netdevice_notifiers - call all network notifier blocks @@ -1637,8 +1647,7 @@ static inline void net_timestamp_set(struct sk_buff *skb)  			__net_timestamp(SKB);		\  	}						\ -static inline bool is_skb_forwardable(struct net_device *dev, -				      struct sk_buff *skb) +bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)  {  	unsigned int len; @@ -1657,6 +1666,30 @@ static inline bool is_skb_forwardable(struct net_device *dev,  	return false;  } +EXPORT_SYMBOL_GPL(is_skb_forwardable); + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ +	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { +		if (skb_copy_ubufs(skb, GFP_ATOMIC)) { +			atomic_long_inc(&dev->rx_dropped); +			kfree_skb(skb); +			return NET_RX_DROP; +		} +	} + +	if (unlikely(!is_skb_forwardable(dev, skb))) { +		atomic_long_inc(&dev->rx_dropped); +		kfree_skb(skb); +		return NET_RX_DROP; +	} + +	skb_scrub_packet(skb, true); +	skb->protocol = eth_type_trans(skb, dev); + +	return 0; +} +EXPORT_SYMBOL_GPL(__dev_forward_skb);  /**   * dev_forward_skb - loopback an skb to another netif @@ -1678,28 +1711,7 @@ static inline bool is_skb_forwardable(struct net_device *dev,   */  int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)  { -	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { -		if (skb_copy_ubufs(skb, GFP_ATOMIC)) { -			atomic_long_inc(&dev->rx_dropped); -			kfree_skb(skb); -			return NET_RX_DROP; -		} -	} - -	if (unlikely(!is_skb_forwardable(dev, skb))) { -		atomic_long_inc(&dev->rx_dropped); -		kfree_skb(skb); -		return NET_RX_DROP; -	} -	skb->protocol = eth_type_trans(skb, dev); - -	/* eth_type_trans() can set pkt_type. -	 * call skb_scrub_packet() after it to clear pkt_type _after_ calling -	 * eth_type_trans(). 
-	 */ -	skb_scrub_packet(skb, true); - -	return netif_rx(skb); +	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);  }  EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -1917,7 +1929,8 @@ static struct xps_map *expand_xps_map(struct xps_map *map,  	return new_map;  } -int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index) +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, +			u16 index)  {  	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;  	struct xps_map *map, *new_map; @@ -2078,7 +2091,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)  }  EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  /**   *	netif_set_real_num_rx_queues - set actual number of RX queues used   *	@dev: Network device @@ -2144,30 +2157,42 @@ void __netif_schedule(struct Qdisc *q)  }  EXPORT_SYMBOL(__netif_schedule); -void dev_kfree_skb_irq(struct sk_buff *skb) +struct dev_kfree_skb_cb { +	enum skb_free_reason reason; +}; + +static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)  { -	if (atomic_dec_and_test(&skb->users)) { -		struct softnet_data *sd; -		unsigned long flags; +	return (struct dev_kfree_skb_cb *)skb->cb; +} + +void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) +{ +	unsigned long flags; -		local_irq_save(flags); -		sd = &__get_cpu_var(softnet_data); -		skb->next = sd->completion_queue; -		sd->completion_queue = skb; -		raise_softirq_irqoff(NET_TX_SOFTIRQ); -		local_irq_restore(flags); +	if (likely(atomic_read(&skb->users) == 1)) { +		smp_rmb(); +		atomic_set(&skb->users, 0); +	} else if (likely(!atomic_dec_and_test(&skb->users))) { +		return;  	} +	get_kfree_skb_cb(skb)->reason = reason; +	local_irq_save(flags); +	skb->next = __this_cpu_read(softnet_data.completion_queue); +	__this_cpu_write(softnet_data.completion_queue, skb); +	raise_softirq_irqoff(NET_TX_SOFTIRQ); +	local_irq_restore(flags);  } -EXPORT_SYMBOL(dev_kfree_skb_irq); +EXPORT_SYMBOL(__dev_kfree_skb_irq); -void dev_kfree_skb_any(struct sk_buff *skb) +void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)  {  	if (in_irq() || irqs_disabled()) -		dev_kfree_skb_irq(skb); +		__dev_kfree_skb_irq(skb, reason);  	else  		dev_kfree_skb(skb);  } -EXPORT_SYMBOL(dev_kfree_skb_any); +EXPORT_SYMBOL(__dev_kfree_skb_any);  /** @@ -2269,10 +2294,10 @@ out:  }  EXPORT_SYMBOL(skb_checksum_help); -__be16 skb_network_protocol(struct sk_buff *skb) +__be16 skb_network_protocol(struct sk_buff *skb, int *depth)  { +	unsigned int vlan_depth = skb->mac_len;  	__be16 type = skb->protocol; -	int vlan_depth = ETH_HLEN;  	/* Tunnel gso handlers can set protocol to ethernet. 
*/  	if (type == htons(ETH_P_TEB)) { @@ -2285,17 +2310,34 @@ __be16 skb_network_protocol(struct sk_buff *skb)  		type = eth->h_proto;  	} -	while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { -		struct vlan_hdr *vh; +	/* if skb->protocol is 802.1Q/AD then the header should already be +	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at +	 * ETH_HLEN otherwise +	 */ +	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { +		if (vlan_depth) { +			if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN))) +				return 0; +			vlan_depth -= VLAN_HLEN; +		} else { +			vlan_depth = ETH_HLEN; +		} +		do { +			struct vlan_hdr *vh; -		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) -			return 0; +			if (unlikely(!pskb_may_pull(skb, +						    vlan_depth + VLAN_HLEN))) +				return 0; -		vh = (struct vlan_hdr *)(skb->data + vlan_depth); -		type = vh->h_vlan_encapsulated_proto; -		vlan_depth += VLAN_HLEN; +			vh = (struct vlan_hdr *)(skb->data + vlan_depth); +			type = vh->h_vlan_encapsulated_proto; +			vlan_depth += VLAN_HLEN; +		} while (type == htons(ETH_P_8021Q) || +			 type == htons(ETH_P_8021AD));  	} +	*depth = vlan_depth; +  	return type;  } @@ -2309,12 +2351,13 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,  {  	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);  	struct packet_offload *ptype; -	__be16 type = skb_network_protocol(skb); +	int vlan_depth = skb->mac_len; +	__be16 type = skb_network_protocol(skb, &vlan_depth);  	if (unlikely(!type))  		return ERR_PTR(-EINVAL); -	__skb_pull(skb, skb->mac_len); +	__skb_pull(skb, vlan_depth);  	rcu_read_lock();  	list_for_each_entry_rcu(ptype, &offload_base, list) { @@ -2377,6 +2420,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,  	}  	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); +	SKB_GSO_CB(skb)->encap_level = 0; +  	skb_reset_mac_header(skb);  	skb_reset_mac_len(skb); @@ -2439,13 +2484,8 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)  {  	struct dev_gso_cb *cb; -	do { -		struct sk_buff *nskb = skb->next; - -		skb->next = nskb->next; -		nskb->next = NULL; -		kfree_skb(nskb); -	} while (skb->next); +	kfree_skb_list(skb->next); +	skb->next = NULL;  	cb = DEV_GSO_CB(skb);  	if (cb->destructor) @@ -2480,11 +2520,39 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)  	return 0;  } +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev. 
+ */ +#ifdef CONFIG_NET_MPLS_GSO +static netdev_features_t net_mpls_features(struct sk_buff *skb, +					   netdev_features_t features, +					   __be16 type) +{ +	if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) +		features &= skb->dev->mpls_features; + +	return features; +} +#else +static netdev_features_t net_mpls_features(struct sk_buff *skb, +					   netdev_features_t features, +					   __be16 type) +{ +	return features; +} +#endif +  static netdev_features_t harmonize_features(struct sk_buff *skb,  	netdev_features_t features)  { +	int tmp; +	__be16 type; + +	type = skb_network_protocol(skb, &tmp); +	features = net_mpls_features(skb, features, type); +  	if (skb->ip_summed != CHECKSUM_NONE && -	    !can_checksum_protocol(features, skb_network_protocol(skb))) { +	    !can_checksum_protocol(features, type)) {  		features &= ~NETIF_F_ALL_CSUM;  	} else if (illegal_highdma(skb->dev, skb)) {  		features &= ~NETIF_F_SG; @@ -2520,21 +2588,6 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)  }  EXPORT_SYMBOL(netif_skb_features); -/* - * Returns true if either: - *	1. skb has frag_list and the device doesn't support FRAGLIST, or - *	2. skb is fragmented and the device does not support SG. - */ -static inline int skb_needs_linearize(struct sk_buff *skb, -				      netdev_features_t features) -{ -	return skb_is_nonlinear(skb) && -			((skb_has_frag_list(skb) && -				!(features & NETIF_F_FRAGLIST)) || -			(skb_shinfo(skb)->nr_frags && -				!(features & NETIF_F_SG))); -} -  int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,  			struct netdev_queue *txq)  { @@ -2602,6 +2655,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,  			dev_queue_xmit_nit(skb, dev);  		skb_len = skb->len; +		trace_net_dev_start_xmit(skb, dev);  		rc = ops->ndo_start_xmit(skb, dev);  		trace_net_dev_xmit(skb, rc, dev, skb_len);  		if (rc == NETDEV_TX_OK) @@ -2620,6 +2674,7 @@ gso:  			dev_queue_xmit_nit(nskb, dev);  		skb_len = nskb->len; +		trace_net_dev_start_xmit(nskb, dev);  		rc = ops->ndo_start_xmit(nskb, dev);  		trace_net_dev_xmit(nskb, rc, dev, skb_len);  		if (unlikely(rc != NETDEV_TX_OK)) { @@ -2645,6 +2700,7 @@ out_kfree_skb:  out:  	return rc;  } +EXPORT_SYMBOL_GPL(dev_hard_start_xmit);  static void qdisc_pkt_len_init(struct sk_buff *skb)  { @@ -2739,7 +2795,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,  	return rc;  } -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)  static void skb_update_prio(struct sk_buff *skb)  {  	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); @@ -2776,8 +2832,9 @@ int dev_loopback_xmit(struct sk_buff *skb)  EXPORT_SYMBOL(dev_loopback_xmit);  /** - *	dev_queue_xmit - transmit a buffer + *	__dev_queue_xmit - transmit a buffer   *	@skb: buffer to transmit + *	@accel_priv: private data used for L2 forwarding offload   *   *	Queue a buffer for transmission to a network device. The caller must   *	have set the device and priority and built the buffer before calling @@ -2800,7 +2857,7 @@ EXPORT_SYMBOL(dev_loopback_xmit);   *      the BH enable code must have IRQs enabled so that it will not deadlock.   
*          --BLG   */ -int dev_queue_xmit(struct sk_buff *skb) +static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)  {  	struct net_device *dev = skb->dev;  	struct netdev_queue *txq; @@ -2816,7 +2873,7 @@ int dev_queue_xmit(struct sk_buff *skb)  	skb_update_prio(skb); -	txq = netdev_pick_tx(dev, skb); +	txq = netdev_pick_tx(dev, skb, accel_priv);  	q = rcu_dereference_bh(txq->qdisc);  #ifdef CONFIG_NET_CLS_ACT @@ -2875,14 +2932,26 @@ recursion_alert:  	rc = -ENETDOWN;  	rcu_read_unlock_bh(); +	atomic_long_inc(&dev->tx_dropped);  	kfree_skb(skb);  	return rc;  out:  	rcu_read_unlock_bh();  	return rc;  } + +int dev_queue_xmit(struct sk_buff *skb) +{ +	return __dev_queue_xmit(skb, NULL); +}  EXPORT_SYMBOL(dev_queue_xmit); +int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +{ +	return __dev_queue_xmit(skb, accel_priv); +} +EXPORT_SYMBOL(dev_queue_xmit_accel); +  /*=======================================================================  			Receiver routines @@ -2936,7 +3005,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,  		flow_table = rcu_dereference(rxqueue->rps_flow_table);  		if (!flow_table)  			goto out; -		flow_id = skb->rxhash & flow_table->mask; +		flow_id = skb_get_hash(skb) & flow_table->mask;  		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,  							rxq_index, flow_id);  		if (rc < 0) @@ -2970,6 +3039,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  	struct rps_sock_flow_table *sock_flow_table;  	int cpu = -1;  	u16 tcpu; +	u32 hash;  	if (skb_rx_queue_recorded(skb)) {  		u16 index = skb_get_rx_queue(skb); @@ -2998,7 +3068,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  	}  	skb_reset_network_header(skb); -	if (!skb_get_rxhash(skb)) +	hash = skb_get_hash(skb); +	if (!hash)  		goto done;  	flow_table = rcu_dereference(rxqueue->rps_flow_table); @@ -3007,11 +3078,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  		u16 next_cpu;  		struct rps_dev_flow *rflow; -		rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; +		rflow = &flow_table->flows[hash & flow_table->mask];  		tcpu = rflow->cpu; -		next_cpu = sock_flow_table->ents[skb->rxhash & -		    sock_flow_table->mask]; +		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];  		/*  		 * If the desired CPU (where last recvmsg was done) is @@ -3040,7 +3110,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  	}  	if (map) { -		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; +		tcpu = map->cpus[((u64) hash * map->len) >> 32];  		if (cpu_online(tcpu)) {  			cpu = tcpu; @@ -3143,7 +3213,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)  	rcu_read_lock();  	fl = rcu_dereference(sd->flow_limit);  	if (fl) { -		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1); +		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);  		old_flow = fl->history[fl->history_head];  		fl->history[fl->history_head] = new_flow; @@ -3211,29 +3281,10 @@ enqueue:  	return NET_RX_DROP;  } -/** - *	netif_rx	-	post buffer to the network code - *	@skb: buffer to post - * - *	This function receives a packet from a device driver and queues it for - *	the upper (protocol) levels to process.  It always succeeds. The buffer - *	may be dropped during processing for congestion control or by the - *	protocol layers. 
- * - *	return values: - *	NET_RX_SUCCESS	(no congestion) - *	NET_RX_DROP     (packet was dropped) - * - */ - -int netif_rx(struct sk_buff *skb) +static int netif_rx_internal(struct sk_buff *skb)  {  	int ret; -	/* if netpoll wants it, pretend we never saw it */ -	if (netpoll_rx(skb)) -		return NET_RX_DROP; -  	net_timestamp_check(netdev_tstamp_prequeue, skb);  	trace_netif_rx(skb); @@ -3262,14 +3313,38 @@ int netif_rx(struct sk_buff *skb)  	}  	return ret;  } + +/** + *	netif_rx	-	post buffer to the network code + *	@skb: buffer to post + * + *	This function receives a packet from a device driver and queues it for + *	the upper (protocol) levels to process.  It always succeeds. The buffer + *	may be dropped during processing for congestion control or by the + *	protocol layers. + * + *	return values: + *	NET_RX_SUCCESS	(no congestion) + *	NET_RX_DROP     (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ +	trace_netif_rx_entry(skb); + +	return netif_rx_internal(skb); +}  EXPORT_SYMBOL(netif_rx);  int netif_rx_ni(struct sk_buff *skb)  {  	int err; +	trace_netif_rx_ni_entry(skb); +  	preempt_disable(); -	err = netif_rx(skb); +	err = netif_rx_internal(skb);  	if (local_softirq_pending())  		do_softirq();  	preempt_enable(); @@ -3295,7 +3370,10 @@ static void net_tx_action(struct softirq_action *h)  			clist = clist->next;  			WARN_ON(atomic_read(&skb->users)); -			trace_kfree_skb(skb, net_tx_action); +			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) +				trace_consume_skb(skb); +			else +				trace_kfree_skb(skb, net_tx_action);  			__kfree_skb(skb);  		}  	} @@ -3317,7 +3395,7 @@ static void net_tx_action(struct softirq_action *h)  			root_lock = qdisc_lock(q);  			if (spin_trylock(root_lock)) { -				smp_mb__before_clear_bit(); +				smp_mb__before_atomic();  				clear_bit(__QDISC_STATE_SCHED,  					  &q->state);  				qdisc_run(q); @@ -3327,7 +3405,7 @@ static void net_tx_action(struct softirq_action *h)  					      &q->state)) {  					__netif_reschedule(q);  				} else { -					smp_mb__before_clear_bit(); +					smp_mb__before_atomic();  					clear_bit(__QDISC_STATE_SCHED,  						  &q->state);  				} @@ -3413,7 +3491,7 @@ out:   *	@rx_handler: receive handler to register   *	@rx_handler_data: data pointer that is used by rx handler   * - *	Register a receive hander for a device. This handler will then be + *	Register a receive handler for a device. This handler will then be   *	called from __netif_receive_skb. A negative errno code is returned   *	on a failure.   
* @@ -3467,11 +3545,11 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);  static bool skb_pfmemalloc_protocol(struct sk_buff *skb)  {  	switch (skb->protocol) { -	case __constant_htons(ETH_P_ARP): -	case __constant_htons(ETH_P_IP): -	case __constant_htons(ETH_P_IPV6): -	case __constant_htons(ETH_P_8021Q): -	case __constant_htons(ETH_P_8021AD): +	case htons(ETH_P_ARP): +	case htons(ETH_P_IP): +	case htons(ETH_P_IPV6): +	case htons(ETH_P_8021Q): +	case htons(ETH_P_8021AD):  		return true;  	default:  		return false; @@ -3492,10 +3570,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)  	trace_netif_receive_skb(skb); -	/* if we've gotten here through NAPI, check netpoll */ -	if (netpoll_receive_skb(skb)) -		goto out; -  	orig_dev = skb->dev;  	skb_reset_network_header(skb); @@ -3622,7 +3696,6 @@ drop:  unlock:  	rcu_read_unlock(); -out:  	return ret;  } @@ -3651,22 +3724,7 @@ static int __netif_receive_skb(struct sk_buff *skb)  	return ret;  } -/** - *	netif_receive_skb - process receive buffer from network - *	@skb: buffer to process - * - *	netif_receive_skb() is the main receive data processing function. - *	It always succeeds. The buffer may be dropped during processing - *	for congestion control or by the protocol layers. - * - *	This function may only be called from softirq context and interrupts - *	should be enabled. - * - *	Return values (usually ignored): - *	NET_RX_SUCCESS: no congestion - *	NET_RX_DROP: packet was dropped - */ -int netif_receive_skb(struct sk_buff *skb) +static int netif_receive_skb_internal(struct sk_buff *skb)  {  	net_timestamp_check(netdev_tstamp_prequeue, skb); @@ -3692,6 +3750,28 @@ int netif_receive_skb(struct sk_buff *skb)  #endif  	return __netif_receive_skb(skb);  } + +/** + *	netif_receive_skb - process receive buffer from network + *	@skb: buffer to process + * + *	netif_receive_skb() is the main receive data processing function. + *	It always succeeds. The buffer may be dropped during processing + *	for congestion control or by the protocol layers. + * + *	This function may only be called from softirq context and interrupts + *	should be enabled. + * + *	Return values (usually ignored): + *	NET_RX_SUCCESS: no congestion + *	NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ +	trace_netif_receive_skb_entry(skb); + +	return netif_receive_skb_internal(skb); +}  EXPORT_SYMBOL(netif_receive_skb);  /* Network device is going away, flush any packets still pending @@ -3741,7 +3821,7 @@ static int napi_gro_complete(struct sk_buff *skb)  		if (ptype->type != type || !ptype->callbacks.gro_complete)  			continue; -		err = ptype->callbacks.gro_complete(skb); +		err = ptype->callbacks.gro_complete(skb, 0);  		break;  	}  	rcu_read_unlock(); @@ -3753,7 +3833,7 @@ static int napi_gro_complete(struct sk_buff *skb)  	}  out: -	return netif_receive_skb(skb); +	return netif_receive_skb_internal(skb);  }  /* napi->gro_list contains packets ordered by age. 
@@ -3789,21 +3869,66 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)  {  	struct sk_buff *p;  	unsigned int maclen = skb->dev->hard_header_len; +	u32 hash = skb_get_hash_raw(skb);  	for (p = napi->gro_list; p; p = p->next) {  		unsigned long diffs; +		NAPI_GRO_CB(p)->flush = 0; + +		if (hash != skb_get_hash_raw(p)) { +			NAPI_GRO_CB(p)->same_flow = 0; +			continue; +		} +  		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;  		diffs |= p->vlan_tci ^ skb->vlan_tci;  		if (maclen == ETH_HLEN)  			diffs |= compare_ether_header(skb_mac_header(p), -						      skb_gro_mac_header(skb)); +						      skb_mac_header(skb));  		else if (!diffs)  			diffs = memcmp(skb_mac_header(p), -				       skb_gro_mac_header(skb), +				       skb_mac_header(skb),  				       maclen);  		NAPI_GRO_CB(p)->same_flow = !diffs; -		NAPI_GRO_CB(p)->flush = 0; +	} +} + +static void skb_gro_reset_offset(struct sk_buff *skb) +{ +	const struct skb_shared_info *pinfo = skb_shinfo(skb); +	const skb_frag_t *frag0 = &pinfo->frags[0]; + +	NAPI_GRO_CB(skb)->data_offset = 0; +	NAPI_GRO_CB(skb)->frag0 = NULL; +	NAPI_GRO_CB(skb)->frag0_len = 0; + +	if (skb_mac_header(skb) == skb_tail_pointer(skb) && +	    pinfo->nr_frags && +	    !PageHighMem(skb_frag_page(frag0))) { +		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); +		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); +	} +} + +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ +	struct skb_shared_info *pinfo = skb_shinfo(skb); + +	BUG_ON(skb->end - skb->tail < grow); + +	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + +	skb->data_len -= grow; +	skb->tail += grow; + +	pinfo->frags[0].page_offset += grow; +	skb_frag_size_sub(&pinfo->frags[0], grow); + +	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { +		skb_frag_unref(skb, 0); +		memmove(pinfo->frags, pinfo->frags + 1, +			--pinfo->nr_frags * sizeof(pinfo->frags[0]));  	}  } @@ -3815,14 +3940,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff  	struct list_head *head = &offload_base;  	int same_flow;  	enum gro_result ret; +	int grow; -	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) +	if (!(skb->dev->features & NETIF_F_GRO))  		goto normal;  	if (skb_is_gso(skb) || skb_has_frag_list(skb))  		goto normal;  	gro_list_prepare(napi, skb); +	NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */  	rcu_read_lock();  	list_for_each_entry_rcu(ptype, head, list) { @@ -3834,6 +3961,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff  		NAPI_GRO_CB(skb)->same_flow = 0;  		NAPI_GRO_CB(skb)->flush = 0;  		NAPI_GRO_CB(skb)->free = 0; +		NAPI_GRO_CB(skb)->udp_mark = 0;  		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);  		break; @@ -3858,39 +3986,35 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff  	if (same_flow)  		goto ok; -	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) +	if (NAPI_GRO_CB(skb)->flush)  		goto normal; -	napi->gro_count++; +	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { +		struct sk_buff *nskb = napi->gro_list; + +		/* locate the end of the list to select the 'oldest' flow */ +		while (nskb->next) { +			pp = &nskb->next; +			nskb = *pp; +		} +		*pp = NULL; +		nskb->next = NULL; +		napi_gro_complete(nskb); +	} else { +		napi->gro_count++; +	}  	NAPI_GRO_CB(skb)->count = 1;  	NAPI_GRO_CB(skb)->age = jiffies; +	NAPI_GRO_CB(skb)->last = skb;  	skb_shinfo(skb)->gso_size = skb_gro_len(skb);  	
skb->next = napi->gro_list;  	napi->gro_list = skb;  	ret = GRO_HELD;  pull: -	if (skb_headlen(skb) < skb_gro_offset(skb)) { -		int grow = skb_gro_offset(skb) - skb_headlen(skb); - -		BUG_ON(skb->end - skb->tail < grow); - -		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); - -		skb->tail += grow; -		skb->data_len -= grow; - -		skb_shinfo(skb)->frags[0].page_offset += grow; -		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); - -		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { -			skb_frag_unref(skb, 0); -			memmove(skb_shinfo(skb)->frags, -				skb_shinfo(skb)->frags + 1, -				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); -		} -	} - +	grow = skb_gro_offset(skb) - skb_headlen(skb); +	if (grow > 0) +		gro_pull_from_frag0(skb, grow);  ok:  	return ret; @@ -3899,12 +4023,39 @@ normal:  	goto pull;  } +struct packet_offload *gro_find_receive_by_type(__be16 type) +{ +	struct list_head *offload_head = &offload_base; +	struct packet_offload *ptype; + +	list_for_each_entry_rcu(ptype, offload_head, list) { +		if (ptype->type != type || !ptype->callbacks.gro_receive) +			continue; +		return ptype; +	} +	return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ +	struct list_head *offload_head = &offload_base; +	struct packet_offload *ptype; + +	list_for_each_entry_rcu(ptype, offload_head, list) { +		if (ptype->type != type || !ptype->callbacks.gro_complete) +			continue; +		return ptype; +	} +	return NULL; +} +EXPORT_SYMBOL(gro_find_complete_by_type);  static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)  {  	switch (ret) {  	case GRO_NORMAL: -		if (netif_receive_skb(skb)) +		if (netif_receive_skb_internal(skb))  			ret = GRO_DROP;  		break; @@ -3927,25 +4078,10 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)  	return ret;  } -static void skb_gro_reset_offset(struct sk_buff *skb) -{ -	const struct skb_shared_info *pinfo = skb_shinfo(skb); -	const skb_frag_t *frag0 = &pinfo->frags[0]; - -	NAPI_GRO_CB(skb)->data_offset = 0; -	NAPI_GRO_CB(skb)->frag0 = NULL; -	NAPI_GRO_CB(skb)->frag0_len = 0; - -	if (skb_mac_header(skb) == skb_tail_pointer(skb) && -	    pinfo->nr_frags && -	    !PageHighMem(skb_frag_page(frag0))) { -		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); -		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); -	} -} -  gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  { +	trace_napi_gro_receive_entry(skb); +  	skb_gro_reset_offset(skb);  	return napi_skb_finish(dev_gro_receive(napi, skb), skb); @@ -3960,6 +4096,9 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)  	skb->vlan_tci = 0;  	skb->dev = napi->dev;  	skb->skb_iif = 0; +	skb->encapsulation = 0; +	skb_shinfo(skb)->gso_type = 0; +	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));  	napi->skb = skb;  } @@ -3970,24 +4109,22 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)  	if (!skb) {  		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); -		if (skb) -			napi->skb = skb; +		napi->skb = skb;  	}  	return skb;  }  EXPORT_SYMBOL(napi_get_frags); -static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, -			       gro_result_t ret) +static gro_result_t napi_frags_finish(struct napi_struct *napi, +				      struct sk_buff *skb, +				      gro_result_t ret)  {  	switch (ret) {  	case GRO_NORMAL:  	case GRO_HELD: +		__skb_push(skb, ETH_HLEN);  		skb->protocol = eth_type_trans(skb, skb->dev); - 
-		if (ret == GRO_HELD) -			skb_gro_pull(skb, -ETH_HLEN); -		else if (netif_receive_skb(skb)) +		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))  			ret = GRO_DROP;  		break; @@ -4003,39 +4140,42 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *  	return ret;  } +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */  static struct sk_buff *napi_frags_skb(struct napi_struct *napi)  {  	struct sk_buff *skb = napi->skb; -	struct ethhdr *eth; -	unsigned int hlen; -	unsigned int off; +	const struct ethhdr *eth; +	unsigned int hlen = sizeof(*eth);  	napi->skb = NULL;  	skb_reset_mac_header(skb);  	skb_gro_reset_offset(skb); -	off = skb_gro_offset(skb); -	hlen = off + sizeof(*eth); -	eth = skb_gro_header_fast(skb, off); -	if (skb_gro_header_hard(skb, hlen)) { -		eth = skb_gro_header_slow(skb, hlen, off); +	eth = skb_gro_header_fast(skb, 0); +	if (unlikely(skb_gro_header_hard(skb, hlen))) { +		eth = skb_gro_header_slow(skb, hlen, 0);  		if (unlikely(!eth)) {  			napi_reuse_skb(napi, skb); -			skb = NULL; -			goto out; +			return NULL;  		} +	} else { +		gro_pull_from_frag0(skb, hlen); +		NAPI_GRO_CB(skb)->frag0 += hlen; +		NAPI_GRO_CB(skb)->frag0_len -= hlen;  	} - -	skb_gro_pull(skb, sizeof(*eth)); +	__skb_pull(skb, hlen);  	/*  	 * This works because the only protocols we care about don't require -	 * special handling.  We'll fix it up properly at the end. +	 * special handling. +	 * We'll fix it up properly in napi_frags_finish()  	 */  	skb->protocol = eth->h_proto; -out:  	return skb;  } @@ -4046,12 +4186,14 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)  	if (!skb)  		return GRO_DROP; +	trace_napi_gro_frags_entry(skb); +  	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));  }  EXPORT_SYMBOL(napi_gro_frags);  /* - * net_rps_action sends any pending IPI's for rps. + * net_rps_action_and_irq_enable sends any pending IPI's for rps.   * Note: called with local irq disabled, but exits with local irq enabled.   */  static void net_rps_action_and_irq_enable(struct softnet_data *sd) @@ -4069,8 +4211,8 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)  			struct softnet_data *next = remsd->rps_ipi_next;  			if (cpu_online(remsd->cpu)) -				__smp_call_function_single(remsd->cpu, -							   &remsd->csd, 0); +				smp_call_function_single_async(remsd->cpu, +							   &remsd->csd);  			remsd = next;  		}  	} else @@ -4094,9 +4236,8 @@ static int process_backlog(struct napi_struct *napi, int quota)  #endif  	napi->weight = weight_p;  	local_irq_disable(); -	while (work < quota) { +	while (1) {  		struct sk_buff *skb; -		unsigned int qlen;  		while ((skb = __skb_dequeue(&sd->process_queue))) {  			local_irq_enable(); @@ -4110,24 +4251,24 @@ static int process_backlog(struct napi_struct *napi, int quota)  		}  		rps_lock(sd); -		qlen = skb_queue_len(&sd->input_pkt_queue); -		if (qlen) -			skb_queue_splice_tail_init(&sd->input_pkt_queue, -						   &sd->process_queue); - -		if (qlen < quota - work) { +		if (skb_queue_empty(&sd->input_pkt_queue)) {  			/*  			 * Inline a custom version of __napi_complete().  			 * only current cpu owns and manipulates this napi, -			 * and NAPI_STATE_SCHED is the only possible flag set on backlog. -			 * we can use a plain write instead of clear_bit(), +			 * and NAPI_STATE_SCHED is the only possible flag set +			 * on backlog. 
+			 * We can use a plain write instead of clear_bit(),  			 * and we dont need an smp_mb() memory barrier.  			 */  			list_del(&napi->poll_list);  			napi->state = 0; +			rps_unlock(sd); -			quota = work + qlen; +			break;  		} + +		skb_queue_splice_tail_init(&sd->input_pkt_queue, +					   &sd->process_queue);  		rps_unlock(sd);  	}  	local_irq_enable(); @@ -4157,7 +4298,7 @@ void __napi_complete(struct napi_struct *n)  	BUG_ON(n->gro_list);  	list_del(&n->poll_list); -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	clear_bit(NAPI_STATE_SCHED, &n->state);  }  EXPORT_SYMBOL(__napi_complete); @@ -4256,17 +4397,10 @@ EXPORT_SYMBOL(netif_napi_add);  void netif_napi_del(struct napi_struct *napi)  { -	struct sk_buff *skb, *next; -  	list_del_init(&napi->dev_list);  	napi_free_frags(napi); -	for (skb = napi->gro_list; skb; skb = next) { -		next = skb->next; -		skb->next = NULL; -		kfree_skb(skb); -	} - +	kfree_skb_list(napi->gro_list);  	napi->gro_list = NULL;  	napi->gro_count = 0;  } @@ -4373,44 +4507,29 @@ struct netdev_adjacent {  	/* upper master flag, there can only be one master device per list */  	bool master; -	/* indicates that this dev is our first-level lower/upper device */ -	bool neighbour; -  	/* counter for the number of times this device was added to us */  	u16 ref_nr; +	/* private field for the users */ +	void *private; +  	struct list_head list;  	struct rcu_head rcu;  };  static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,  						 struct net_device *adj_dev, -						 bool upper) +						 struct list_head *adj_list)  {  	struct netdev_adjacent *adj; -	struct list_head *dev_list; - -	dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list; -	list_for_each_entry(adj, dev_list, list) { +	list_for_each_entry(adj, adj_list, list) {  		if (adj->dev == adj_dev)  			return adj;  	}  	return NULL;  } -static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, -							  struct net_device *udev) -{ -	return __netdev_find_adj(dev, udev, true); -} - -static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, -							  struct net_device *ldev) -{ -	return __netdev_find_adj(dev, ldev, false); -} -  /**   * netdev_has_upper_dev - Check if device is linked to an upper device   * @dev: device @@ -4425,7 +4544,7 @@ bool netdev_has_upper_dev(struct net_device *dev,  {  	ASSERT_RTNL(); -	return __netdev_find_upper(dev, upper_dev); +	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);  }  EXPORT_SYMBOL(netdev_has_upper_dev); @@ -4436,13 +4555,12 @@ EXPORT_SYMBOL(netdev_has_upper_dev);   * Find out if a device is linked to an upper device and return true in case   * it is. The caller must hold the RTNL lock.   
*/ -bool netdev_has_any_upper_dev(struct net_device *dev) +static bool netdev_has_any_upper_dev(struct net_device *dev)  {  	ASSERT_RTNL(); -	return !list_empty(&dev->upper_dev_list); +	return !list_empty(&dev->all_adj_list.upper);  } -EXPORT_SYMBOL(netdev_has_any_upper_dev);  /**   * netdev_master_upper_dev_get - Get master upper device @@ -4457,10 +4575,10 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)  	ASSERT_RTNL(); -	if (list_empty(&dev->upper_dev_list)) +	if (list_empty(&dev->adj_list.upper))  		return NULL; -	upper = list_first_entry(&dev->upper_dev_list, +	upper = list_first_entry(&dev->adj_list.upper,  				 struct netdev_adjacent, list);  	if (likely(upper->master))  		return upper->dev; @@ -4468,7 +4586,18 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)  }  EXPORT_SYMBOL(netdev_master_upper_dev_get); -/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list +void *netdev_adjacent_get_private(struct list_head *adj_list) +{ +	struct netdev_adjacent *adj; + +	adj = list_entry(adj_list, struct netdev_adjacent, list); + +	return adj->private; +} +EXPORT_SYMBOL(netdev_adjacent_get_private); + +/** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list   * @dev: device   * @iter: list_head ** of the current position   * @@ -4480,11 +4609,11 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,  {  	struct netdev_adjacent *upper; -	WARN_ON_ONCE(!rcu_read_lock_held()); +	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());  	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); -	if (&upper->list == &dev->upper_dev_list) +	if (&upper->list == &dev->adj_list.upper)  		return NULL;  	*iter = &upper->list; @@ -4494,6 +4623,134 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,  EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);  /** + * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, +						     struct list_head **iter) +{ +	struct netdev_adjacent *upper; + +	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + +	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + +	if (&upper->list == &dev->all_adj_list.upper) +		return NULL; + +	*iter = &upper->list; + +	return upper->dev; +} +EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); + +/** + * netdev_lower_get_next_private - Get the next ->private from the + *				   lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchainged. 
+ */ +void *netdev_lower_get_next_private(struct net_device *dev, +				    struct list_head **iter) +{ +	struct netdev_adjacent *lower; + +	lower = list_entry(*iter, struct netdev_adjacent, list); + +	if (&lower->list == &dev->adj_list.lower) +		return NULL; + +	*iter = lower->list.next; + +	return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private); + +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + *				       lower neighbour list, RCU + *				       variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +void *netdev_lower_get_next_private_rcu(struct net_device *dev, +					struct list_head **iter) +{ +	struct netdev_adjacent *lower; + +	WARN_ON_ONCE(!rcu_read_lock_held()); + +	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + +	if (&lower->list == &dev->adj_list.lower) +		return NULL; + +	*iter = &lower->list; + +	return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); + +/** + * netdev_lower_get_next - Get the next device from the lower neighbour + *                         list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchainged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ +	struct netdev_adjacent *lower; + +	lower = list_entry((*iter)->next, struct netdev_adjacent, list); + +	if (&lower->list == &dev->adj_list.lower) +		return NULL; + +	*iter = &lower->list; + +	return lower->dev; +} +EXPORT_SYMBOL(netdev_lower_get_next); + +/** + * netdev_lower_get_first_private_rcu - Get the first ->private from the + *				       lower neighbour list, RCU + *				       variant + * @dev: device + * + * Gets the first netdev_adjacent->private from the dev's lower neighbour + * list. The caller must hold RCU read lock. + */ +void *netdev_lower_get_first_private_rcu(struct net_device *dev) +{ +	struct netdev_adjacent *lower; + +	lower = list_first_or_null_rcu(&dev->adj_list.lower, +			struct netdev_adjacent, list); +	if (lower) +		return lower->private; +	return NULL; +} +EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); + +/**   * netdev_master_upper_dev_get_rcu - Get master upper device   * @dev: device   * @@ -4504,7 +4761,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)  {  	struct netdev_adjacent *upper; -	upper = list_first_or_null_rcu(&dev->upper_dev_list, +	upper = list_first_or_null_rcu(&dev->adj_list.upper,  				       struct netdev_adjacent, list);  	if (upper && likely(upper->master))  		return upper->dev; @@ -4512,17 +4769,41 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)  }  EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); +static int netdev_adjacent_sysfs_add(struct net_device *dev, +			      struct net_device *adj_dev, +			      struct list_head *dev_list) +{ +	char linkname[IFNAMSIZ+7]; +	sprintf(linkname, dev_list == &dev->adj_list.upper ? 
+		"upper_%s" : "lower_%s", adj_dev->name); +	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), +				 linkname); +} +static void netdev_adjacent_sysfs_del(struct net_device *dev, +			       char *name, +			       struct list_head *dev_list) +{ +	char linkname[IFNAMSIZ+7]; +	sprintf(linkname, dev_list == &dev->adj_list.upper ? +		"upper_%s" : "lower_%s", name); +	sysfs_remove_link(&(dev->dev.kobj), linkname); +} + +#define netdev_adjacent_is_neigh_list(dev, dev_list) \ +		(dev_list == &dev->adj_list.upper || \ +		 dev_list == &dev->adj_list.lower) +  static int __netdev_adjacent_dev_insert(struct net_device *dev,  					struct net_device *adj_dev, -					bool neighbour, bool master, -					bool upper) +					struct list_head *dev_list, +					void *private, bool master)  {  	struct netdev_adjacent *adj; +	int ret; -	adj = __netdev_find_adj(dev, adj_dev, upper); +	adj = __netdev_find_adj(dev, adj_dev, dev_list);  	if (adj) { -		BUG_ON(neighbour);  		adj->ref_nr++;  		return 0;  	} @@ -4533,124 +4814,159 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,  	adj->dev = adj_dev;  	adj->master = master; -	adj->neighbour = neighbour;  	adj->ref_nr = 1; - +	adj->private = private;  	dev_hold(adj_dev); -	pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", -		 adj_dev->name, upper ? "upper" : "lower", dev->name, -		 adj_dev->name); -	if (!upper) { -		list_add_tail_rcu(&adj->list, &dev->lower_dev_list); -		return 0; +	pr_debug("dev_hold for %s, because of link added from %s to %s\n", +		 adj_dev->name, dev->name, adj_dev->name); + +	if (netdev_adjacent_is_neigh_list(dev, dev_list)) { +		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); +		if (ret) +			goto free_adj;  	} -	/* Ensure that master upper link is always the first item in list. */ -	if (master) -		list_add_rcu(&adj->list, &dev->upper_dev_list); -	else -		list_add_tail_rcu(&adj->list, &dev->upper_dev_list); +	/* Ensure that master link is always the first item in list. 
*/ +	if (master) { +		ret = sysfs_create_link(&(dev->dev.kobj), +					&(adj_dev->dev.kobj), "master"); +		if (ret) +			goto remove_symlinks; + +		list_add_rcu(&adj->list, dev_list); +	} else { +		list_add_tail_rcu(&adj->list, dev_list); +	}  	return 0; -} -static inline int __netdev_upper_dev_insert(struct net_device *dev, -					    struct net_device *udev, -					    bool master, bool neighbour) -{ -	return __netdev_adjacent_dev_insert(dev, udev, neighbour, master, -					    true); -} +remove_symlinks: +	if (netdev_adjacent_is_neigh_list(dev, dev_list)) +		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); +free_adj: +	kfree(adj); +	dev_put(adj_dev); -static inline int __netdev_lower_dev_insert(struct net_device *dev, -					    struct net_device *ldev, -					    bool neighbour) -{ -	return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false, -					    false); +	return ret;  } -void __netdev_adjacent_dev_remove(struct net_device *dev, -				  struct net_device *adj_dev, bool upper) +static void __netdev_adjacent_dev_remove(struct net_device *dev, +					 struct net_device *adj_dev, +					 struct list_head *dev_list)  {  	struct netdev_adjacent *adj; -	if (upper) -		adj = __netdev_find_upper(dev, adj_dev); -	else -		adj = __netdev_find_lower(dev, adj_dev); +	adj = __netdev_find_adj(dev, adj_dev, dev_list); -	if (!adj) +	if (!adj) { +		pr_err("tried to remove device %s from %s\n", +		       dev->name, adj_dev->name);  		BUG(); +	}  	if (adj->ref_nr > 1) { +		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, +			 adj->ref_nr-1);  		adj->ref_nr--;  		return;  	} +	if (adj->master) +		sysfs_remove_link(&(dev->dev.kobj), "master"); + +	if (netdev_adjacent_is_neigh_list(dev, dev_list)) +		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); +  	list_del_rcu(&adj->list); -	pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", -		 adj_dev->name, upper ? 
"upper" : "lower", dev->name, -		 adj_dev->name); +	pr_debug("dev_put for %s, because link removed from %s to %s\n", +		 adj_dev->name, dev->name, adj_dev->name);  	dev_put(adj_dev);  	kfree_rcu(adj, rcu);  } -static inline void __netdev_upper_dev_remove(struct net_device *dev, -					     struct net_device *udev) -{ -	return __netdev_adjacent_dev_remove(dev, udev, true); -} - -static inline void __netdev_lower_dev_remove(struct net_device *dev, -					     struct net_device *ldev) -{ -	return __netdev_adjacent_dev_remove(dev, ldev, false); -} - -int __netdev_adjacent_dev_insert_link(struct net_device *dev, -				      struct net_device *upper_dev, -				      bool master, bool neighbour) +static int __netdev_adjacent_dev_link_lists(struct net_device *dev, +					    struct net_device *upper_dev, +					    struct list_head *up_list, +					    struct list_head *down_list, +					    void *private, bool master)  {  	int ret; -	ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour); +	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, +					   master);  	if (ret)  		return ret; -	ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour); +	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, +					   false);  	if (ret) { -		__netdev_upper_dev_remove(dev, upper_dev); +		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);  		return ret;  	}  	return 0;  } -static inline int __netdev_adjacent_dev_link(struct net_device *dev, -					     struct net_device *udev) +static int __netdev_adjacent_dev_link(struct net_device *dev, +				      struct net_device *upper_dev)  { -	return __netdev_adjacent_dev_insert_link(dev, udev, false, false); +	return __netdev_adjacent_dev_link_lists(dev, upper_dev, +						&dev->all_adj_list.upper, +						&upper_dev->all_adj_list.lower, +						NULL, false);  } -static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, -						       struct net_device *udev, -						       bool master) +static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, +					       struct net_device *upper_dev, +					       struct list_head *up_list, +					       struct list_head *down_list)  { -	return __netdev_adjacent_dev_insert_link(dev, udev, master, true); +	__netdev_adjacent_dev_remove(dev, upper_dev, up_list); +	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);  } -void __netdev_adjacent_dev_unlink(struct net_device *dev, -				  struct net_device *upper_dev) +static void __netdev_adjacent_dev_unlink(struct net_device *dev, +					 struct net_device *upper_dev)  { -	__netdev_upper_dev_remove(dev, upper_dev); -	__netdev_lower_dev_remove(upper_dev, dev); +	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, +					   &dev->all_adj_list.upper, +					   &upper_dev->all_adj_list.lower);  } +static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, +						struct net_device *upper_dev, +						void *private, bool master) +{ +	int ret = __netdev_adjacent_dev_link(dev, upper_dev); + +	if (ret) +		return ret; + +	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, +					       &dev->adj_list.upper, +					       &upper_dev->adj_list.lower, +					       private, master); +	if (ret) { +		__netdev_adjacent_dev_unlink(dev, upper_dev); +		return ret; +	} + +	return 0; +} + +static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, +						   struct net_device *upper_dev) +{ +	__netdev_adjacent_dev_unlink(dev, upper_dev); +	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, +					   
&dev->adj_list.upper, +					   &upper_dev->adj_list.lower); +}  static int __netdev_upper_dev_link(struct net_device *dev, -				   struct net_device *upper_dev, bool master) +				   struct net_device *upper_dev, bool master, +				   void *private)  {  	struct netdev_adjacent *i, *j, *to_i, *to_j;  	int ret = 0; @@ -4661,26 +4977,29 @@ static int __netdev_upper_dev_link(struct net_device *dev,  		return -EBUSY;  	/* To prevent loops, check if dev is not upper device to upper_dev. */ -	if (__netdev_find_upper(upper_dev, dev)) +	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))  		return -EBUSY; -	if (__netdev_find_upper(dev, upper_dev)) +	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))  		return -EEXIST;  	if (master && netdev_master_upper_dev_get(dev))  		return -EBUSY; -	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master); +	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, +						   master);  	if (ret)  		return ret;  	/* Now that we linked these devs, make all the upper_dev's -	 * upper_dev_list visible to every dev's lower_dev_list and vice +	 * all_adj_list.upper visible to every dev's all_adj_list.lower an  	 * versa, and don't forget the devices itself. All of these  	 * links are non-neighbours.  	 */ -	list_for_each_entry(i, &dev->lower_dev_list, list) { -		list_for_each_entry(j, &upper_dev->upper_dev_list, list) { +	list_for_each_entry(i, &dev->all_adj_list.lower, list) { +		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { +			pr_debug("Interlinking %s with %s, non-neighbour\n", +				 i->dev->name, j->dev->name);  			ret = __netdev_adjacent_dev_link(i->dev, j->dev);  			if (ret)  				goto rollback_mesh; @@ -4688,14 +5007,18 @@ static int __netdev_upper_dev_link(struct net_device *dev,  	}  	/* add dev to every upper_dev's upper device */ -	list_for_each_entry(i, &upper_dev->upper_dev_list, list) { +	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { +		pr_debug("linking %s's upper device %s with %s\n", +			 upper_dev->name, i->dev->name, dev->name);  		ret = __netdev_adjacent_dev_link(dev, i->dev);  		if (ret)  			goto rollback_upper_mesh;  	}  	/* add upper_dev to every dev's lower device */ -	list_for_each_entry(i, &dev->lower_dev_list, list) { +	list_for_each_entry(i, &dev->all_adj_list.lower, list) { +		pr_debug("linking %s's lower device %s with %s\n", dev->name, +			 i->dev->name, upper_dev->name);  		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);  		if (ret)  			goto rollback_lower_mesh; @@ -4706,7 +5029,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,  rollback_lower_mesh:  	to_i = i; -	list_for_each_entry(i, &dev->lower_dev_list, list) { +	list_for_each_entry(i, &dev->all_adj_list.lower, list) {  		if (i == to_i)  			break;  		__netdev_adjacent_dev_unlink(i->dev, upper_dev); @@ -4716,7 +5039,7 @@ rollback_lower_mesh:  rollback_upper_mesh:  	to_i = i; -	list_for_each_entry(i, &upper_dev->upper_dev_list, list) { +	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {  		if (i == to_i)  			break;  		__netdev_adjacent_dev_unlink(dev, i->dev); @@ -4727,8 +5050,8 @@ rollback_upper_mesh:  rollback_mesh:  	to_i = i;  	to_j = j; -	list_for_each_entry(i, &dev->lower_dev_list, list) { -		list_for_each_entry(j, &upper_dev->upper_dev_list, list) { +	list_for_each_entry(i, &dev->all_adj_list.lower, list) { +		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {  			if (i == to_i && j == to_j)  				break;  			__netdev_adjacent_dev_unlink(i->dev, 
j->dev); @@ -4737,7 +5060,7 @@ rollback_mesh:  			break;  	} -	__netdev_adjacent_dev_unlink(dev, upper_dev); +	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);  	return ret;  } @@ -4755,7 +5078,7 @@ rollback_mesh:  int netdev_upper_dev_link(struct net_device *dev,  			  struct net_device *upper_dev)  { -	return __netdev_upper_dev_link(dev, upper_dev, false); +	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);  }  EXPORT_SYMBOL(netdev_upper_dev_link); @@ -4773,10 +5096,18 @@ EXPORT_SYMBOL(netdev_upper_dev_link);  int netdev_master_upper_dev_link(struct net_device *dev,  				 struct net_device *upper_dev)  { -	return __netdev_upper_dev_link(dev, upper_dev, true); +	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);  }  EXPORT_SYMBOL(netdev_master_upper_dev_link); +int netdev_master_upper_dev_link_private(struct net_device *dev, +					 struct net_device *upper_dev, +					 void *private) +{ +	return __netdev_upper_dev_link(dev, upper_dev, true, private); +} +EXPORT_SYMBOL(netdev_master_upper_dev_link_private); +  /**   * netdev_upper_dev_unlink - Removes a link to upper device   * @dev: device @@ -4791,38 +5122,96 @@ void netdev_upper_dev_unlink(struct net_device *dev,  	struct netdev_adjacent *i, *j;  	ASSERT_RTNL(); -	__netdev_adjacent_dev_unlink(dev, upper_dev); +	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);  	/* Here is the tricky part. We must remove all dev's lower  	 * devices from all upper_dev's upper devices and vice  	 * versa, to maintain the graph relationship.  	 */ -	list_for_each_entry(i, &dev->lower_dev_list, list) -		list_for_each_entry(j, &upper_dev->upper_dev_list, list) +	list_for_each_entry(i, &dev->all_adj_list.lower, list) +		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)  			__netdev_adjacent_dev_unlink(i->dev, j->dev);  	/* remove also the devices itself from lower/upper device  	 * list  	 */ -	list_for_each_entry(i, &dev->lower_dev_list, list) +	list_for_each_entry(i, &dev->all_adj_list.lower, list)  		__netdev_adjacent_dev_unlink(i->dev, upper_dev); -	list_for_each_entry(i, &upper_dev->upper_dev_list, list) +	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)  		__netdev_adjacent_dev_unlink(dev, i->dev);  	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);  }  EXPORT_SYMBOL(netdev_upper_dev_unlink); +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) +{ +	struct netdev_adjacent *iter; + +	list_for_each_entry(iter, &dev->adj_list.upper, list) { +		netdev_adjacent_sysfs_del(iter->dev, oldname, +					  &iter->dev->adj_list.lower); +		netdev_adjacent_sysfs_add(iter->dev, dev, +					  &iter->dev->adj_list.lower); +	} + +	list_for_each_entry(iter, &dev->adj_list.lower, list) { +		netdev_adjacent_sysfs_del(iter->dev, oldname, +					  &iter->dev->adj_list.upper); +		netdev_adjacent_sysfs_add(iter->dev, dev, +					  &iter->dev->adj_list.upper); +	} +} + +void *netdev_lower_dev_get_private(struct net_device *dev, +				   struct net_device *lower_dev) +{ +	struct netdev_adjacent *lower; + +	if (!lower_dev) +		return NULL; +	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); +	if (!lower) +		return NULL; + +	return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private); + + +int dev_get_nest_level(struct net_device *dev, +		       bool (*type_check)(struct net_device *dev)) +{ +	struct net_device *lower = NULL; +	struct list_head *iter; +	int max_nest = -1; +	int nest; + +	ASSERT_RTNL(); + +	netdev_for_each_lower_dev(dev, lower, iter) { +		nest = dev_get_nest_level(lower, 
type_check); +		if (max_nest < nest) +			max_nest = nest; +	} + +	if (type_check(dev)) +		max_nest++; + +	return max_nest; +} +EXPORT_SYMBOL(dev_get_nest_level); +  static void dev_change_rx_flags(struct net_device *dev, int flags)  {  	const struct net_device_ops *ops = dev->netdev_ops; -	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) +	if (ops->ndo_change_rx_flags)  		ops->ndo_change_rx_flags(dev, flags);  } -static int __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)  {  	unsigned int old_flags = dev->flags;  	kuid_t uid; @@ -4865,6 +5254,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)  		dev_change_rx_flags(dev, IFF_PROMISC);  	} +	if (notify) +		__dev_notify_flags(dev, old_flags, IFF_PROMISC);  	return 0;  } @@ -4884,7 +5275,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc)  	unsigned int old_flags = dev->flags;  	int err; -	err = __dev_set_promiscuity(dev, inc); +	err = __dev_set_promiscuity(dev, inc, true);  	if (err < 0)  		return err;  	if (dev->flags != old_flags) @@ -4893,22 +5284,9 @@ int dev_set_promiscuity(struct net_device *dev, int inc)  }  EXPORT_SYMBOL(dev_set_promiscuity); -/** - *	dev_set_allmulti	- update allmulti count on a device - *	@dev: device - *	@inc: modifier - * - *	Add or remove reception of all multicast frames to a device. While the - *	count in the device remains above zero the interface remains listening - *	to all interfaces. Once it hits zero the device reverts back to normal - *	filtering operation. A negative @inc value is used to drop the counter - *	when releasing a resource needing all multicasts. - *	Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) +static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)  { -	unsigned int old_flags = dev->flags; +	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;  	ASSERT_RTNL(); @@ -4931,9 +5309,30 @@ int dev_set_allmulti(struct net_device *dev, int inc)  	if (dev->flags ^ old_flags) {  		dev_change_rx_flags(dev, IFF_ALLMULTI);  		dev_set_rx_mode(dev); +		if (notify) +			__dev_notify_flags(dev, old_flags, +					   dev->gflags ^ old_gflags);  	}  	return 0;  } + +/** + *	dev_set_allmulti	- update allmulti count on a device + *	@dev: device + *	@inc: modifier + * + *	Add or remove reception of all multicast frames to a device. While the + *	count in the device remains above zero the interface remains listening + *	to all interfaces. Once it hits zero the device reverts back to normal + *	filtering operation. A negative @inc value is used to drop the counter + *	when releasing a resource needing all multicasts. + *	Return 0 if successful or a negative errno code on error. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ +	return __dev_set_allmulti(dev, inc, true); +}  EXPORT_SYMBOL(dev_set_allmulti);  /* @@ -4958,10 +5357,10 @@ void __dev_set_rx_mode(struct net_device *dev)  		 * therefore calling __dev_set_promiscuity here is safe.  		 
*/  		if (!netdev_uc_empty(dev) && !dev->uc_promisc) { -			__dev_set_promiscuity(dev, 1); +			__dev_set_promiscuity(dev, 1, false);  			dev->uc_promisc = true;  		} else if (netdev_uc_empty(dev) && dev->uc_promisc) { -			__dev_set_promiscuity(dev, -1); +			__dev_set_promiscuity(dev, -1, false);  			dev->uc_promisc = false;  		}  	} @@ -5050,9 +5449,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)  	if ((flags ^ dev->gflags) & IFF_PROMISC) {  		int inc = (flags & IFF_PROMISC) ? 1 : -1; +		unsigned int old_flags = dev->flags;  		dev->gflags ^= IFF_PROMISC; -		dev_set_promiscuity(dev, inc); + +		if (__dev_set_promiscuity(dev, inc, false) >= 0) +			if (dev->flags != old_flags) +				dev_set_rx_mode(dev);  	}  	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI @@ -5063,16 +5466,20 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)  		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;  		dev->gflags ^= IFF_ALLMULTI; -		dev_set_allmulti(dev, inc); +		__dev_set_allmulti(dev, inc, false);  	}  	return ret;  } -void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, +			unsigned int gchanges)  {  	unsigned int changes = dev->flags ^ old_flags; +	if (gchanges) +		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); +  	if (changes & IFF_UP) {  		if (dev->flags & IFF_UP)  			call_netdevice_notifiers(NETDEV_UP, dev); @@ -5101,21 +5508,29 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)  int dev_change_flags(struct net_device *dev, unsigned int flags)  {  	int ret; -	unsigned int changes, old_flags = dev->flags; +	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;  	ret = __dev_change_flags(dev, flags);  	if (ret < 0)  		return ret; -	changes = old_flags ^ dev->flags; -	if (changes) -		rtmsg_ifinfo(RTM_NEWLINK, dev, changes); - -	__dev_notify_flags(dev, old_flags); +	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); +	__dev_notify_flags(dev, old_flags, changes);  	return ret;  }  EXPORT_SYMBOL(dev_change_flags); +static int __dev_set_mtu(struct net_device *dev, int new_mtu) +{ +	const struct net_device_ops *ops = dev->netdev_ops; + +	if (ops->ndo_change_mtu) +		return ops->ndo_change_mtu(dev, new_mtu); + +	dev->mtu = new_mtu; +	return 0; +} +  /**   *	dev_set_mtu - Change maximum transfer unit   *	@dev: device @@ -5125,8 +5540,7 @@ EXPORT_SYMBOL(dev_change_flags);   */  int dev_set_mtu(struct net_device *dev, int new_mtu)  { -	const struct net_device_ops *ops = dev->netdev_ops; -	int err; +	int err, orig_mtu;  	if (new_mtu == dev->mtu)  		return 0; @@ -5138,14 +5552,25 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)  	if (!netif_device_present(dev))  		return -ENODEV; -	err = 0; -	if (ops->ndo_change_mtu) -		err = ops->ndo_change_mtu(dev, new_mtu); -	else -		dev->mtu = new_mtu; +	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); +	err = notifier_to_errno(err); +	if (err) +		return err; -	if (!err) -		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); +	orig_mtu = dev->mtu; +	err = __dev_set_mtu(dev, new_mtu); + +	if (!err) { +		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); +		err = notifier_to_errno(err); +		if (err) { +			/* setting mtu back and notifying everyone again, +			 * so that they have a chance to revert changes. 
+			 */ +			__dev_set_mtu(dev, orig_mtu); +			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); +		} +	}  	return err;  }  EXPORT_SYMBOL(dev_set_mtu); @@ -5247,15 +5672,18 @@ static int dev_new_index(struct net *net)  /* Delayed registration/unregisteration */  static LIST_HEAD(net_todo_list); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);  static void net_set_todo(struct net_device *dev)  {  	list_add_tail(&dev->todo_list, &net_todo_list); +	dev_net(dev)->dev_unreg_count++;  }  static void rollback_registered_many(struct list_head *head)  {  	struct net_device *dev, *tmp; +	LIST_HEAD(close_head);  	BUG_ON(dev_boot_phase);  	ASSERT_RTNL(); @@ -5278,7 +5706,9 @@ static void rollback_registered_many(struct list_head *head)  	}  	/* If device is running, close it first. */ -	dev_close_many(head); +	list_for_each_entry(dev, head, unreg_list) +		list_add_tail(&dev->close_list, &close_head); +	dev_close_many(&close_head);  	list_for_each_entry(dev, head, unreg_list) {  		/* And unlink it from device chain. */ @@ -5299,10 +5729,6 @@ static void rollback_registered_many(struct list_head *head)  		*/  		call_netdevice_notifiers(NETDEV_UNREGISTER, dev); -		if (!dev->rtnl_link_ops || -		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) -			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); -  		/*  		 *	Flush the unicast and multicast chains  		 */ @@ -5312,6 +5738,10 @@ static void rollback_registered_many(struct list_head *head)  		if (dev->netdev_ops->ndo_uninit)  			dev->netdev_ops->ndo_uninit(dev); +		if (!dev->rtnl_link_ops || +		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) +			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); +  		/* Notifier chain MUST detach us all upper devices. */  		WARN_ON(netdev_has_any_upper_dev(dev)); @@ -5395,6 +5825,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,  		}  	} +#ifdef CONFIG_NET_RX_BUSY_POLL +	if (dev->netdev_ops->ndo_busy_poll) +		features |= NETIF_F_BUSY_POLL; +	else +#endif +		features &= ~NETIF_F_BUSY_POLL; +  	return features;  } @@ -5494,7 +5931,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,  }  EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  static int netif_alloc_rx_queues(struct net_device *dev)  {  	unsigned int i, count = dev->num_rx_queues; @@ -5530,10 +5967,7 @@ static void netdev_init_one_queue(struct net_device *dev,  static void netif_free_tx_queues(struct net_device *dev)  { -	if (is_vmalloc_addr(dev->_tx)) -		vfree(dev->_tx); -	else -		kfree(dev->_tx); +	kvfree(dev->_tx);  }  static int netif_alloc_netdev_queues(struct net_device *dev) @@ -5633,13 +6067,8 @@ int register_netdevice(struct net_device *dev)  	dev->features |= NETIF_F_SOFT_FEATURES;  	dev->wanted_features = dev->features & dev->hw_features; -	/* Turn on no cache copy if HW is doing checksum */  	if (!(dev->flags & IFF_LOOPBACK)) {  		dev->hw_features |= NETIF_F_NOCACHE_COPY; -		if (dev->features & NETIF_F_ALL_CSUM) { -			dev->wanted_features |= NETIF_F_NOCACHE_COPY; -			dev->features |= NETIF_F_NOCACHE_COPY; -		}  	}  	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 
@@ -5700,7 +6129,7 @@ int register_netdevice(struct net_device *dev)  	 */  	if (!dev->rtnl_link_ops ||  	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) -		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); +		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);  out:  	return ret; @@ -5918,6 +6347,12 @@ void netdev_run_todo(void)  		if (dev->destructor)  			dev->destructor(dev); +		/* Report a network device has been unregistered */ +		rtnl_lock(); +		dev_net(dev)->dev_unreg_count--; +		__rtnl_unlock(); +		wake_up(&netdev_unregistering_wq); +  		/* Free network device */  		kobject_put(&dev->dev.kobj);  	} @@ -5969,6 +6404,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,  		netdev_stats_to_stats64(storage, &dev->stats);  	}  	storage->rx_dropped += atomic_long_read(&dev->rx_dropped); +	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);  	return storage;  }  EXPORT_SYMBOL(dev_get_stats); @@ -6001,6 +6437,13 @@ void netdev_set_default_ethtool_ops(struct net_device *dev,  }  EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); +void netdev_freemem(struct net_device *dev) +{ +	char *addr = (char *)dev - dev->padded; + +	kvfree(addr); +} +  /**   *	alloc_netdev_mqs - allocate network device   *	@sizeof_priv:	size of private data to allocate space for @@ -6010,7 +6453,7 @@ EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);   *	@rxqs:		the number of RX subqueues to allocate   *   *	Allocates a struct net_device with private data area for driver use - *	and performs basic initialization.  Also allocates subquue structs + *	and performs basic initialization.  Also allocates subqueue structs   *	for each queue on the device.   */  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, @@ -6028,7 +6471,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  		return NULL;  	} -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  	if (rxqs < 1) {  		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");  		return NULL; @@ -6044,7 +6487,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	/* ensure 32-byte alignment of whole construct */  	alloc_size += NETDEV_ALIGN - 1; -	p = kzalloc(alloc_size, GFP_KERNEL); +	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); +	if (!p) +		p = vzalloc(alloc_size);  	if (!p)  		return NULL; @@ -6053,7 +6498,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	dev->pcpu_refcnt = alloc_percpu(int);  	if (!dev->pcpu_refcnt) -		goto free_p; +		goto free_dev;  	if (dev_addr_init(dev))  		goto free_pcpu; @@ -6068,9 +6513,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	INIT_LIST_HEAD(&dev->napi_list);  	INIT_LIST_HEAD(&dev->unreg_list); +	INIT_LIST_HEAD(&dev->close_list);  	INIT_LIST_HEAD(&dev->link_watch_list); -	INIT_LIST_HEAD(&dev->upper_dev_list); -	INIT_LIST_HEAD(&dev->lower_dev_list); +	INIT_LIST_HEAD(&dev->adj_list.upper); +	INIT_LIST_HEAD(&dev->adj_list.lower); +	INIT_LIST_HEAD(&dev->all_adj_list.upper); +	INIT_LIST_HEAD(&dev->all_adj_list.lower);  	dev->priv_flags = IFF_XMIT_DST_RELEASE;  	setup(dev); @@ -6079,7 +6527,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	if (netif_alloc_netdev_queues(dev))  		goto free_all; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  	dev->num_rx_queues = rxqs;  	dev->real_num_rx_queues = rxqs;  	if (netif_alloc_rx_queues(dev)) @@ -6098,13 +6546,8 @@ free_all:  free_pcpu:  	free_percpu(dev->pcpu_refcnt); -	netif_free_tx_queues(dev); -#ifdef CONFIG_RPS -	
kfree(dev->_rx); -#endif - -free_p: -	kfree(p); +free_dev: +	netdev_freemem(dev);  	return NULL;  }  EXPORT_SYMBOL(alloc_netdev_mqs); @@ -6124,7 +6567,7 @@ void free_netdev(struct net_device *dev)  	release_net(dev_net(dev));  	netif_free_tx_queues(dev); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  	kfree(dev->_rx);  #endif @@ -6141,7 +6584,7 @@ void free_netdev(struct net_device *dev)  	/*  Compatibility with error handling in drivers */  	if (dev->reg_state == NETREG_UNINITIALIZED) { -		kfree((char *)dev - dev->padded); +		netdev_freemem(dev);  		return;  	} @@ -6199,6 +6642,9 @@ EXPORT_SYMBOL(unregister_netdevice_queue);  /**   *	unregister_netdevice_many - unregister many devices   *	@head: list of devices + * + *  Note: As most callers use a stack allocated list_head, + *  we force a list_del() to make sure stack wont be corrupted later.   */  void unregister_netdevice_many(struct list_head *head)  { @@ -6208,6 +6654,7 @@ void unregister_netdevice_many(struct list_head *head)  		rollback_registered_many(head);  		list_for_each_entry(dev, head, unreg_list)  			net_set_todo(dev); +		list_del(head);  	}  }  EXPORT_SYMBOL(unregister_netdevice_many); @@ -6303,7 +6750,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);  	rcu_barrier();  	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); -	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); +	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);  	/*  	 *	Flush the unicast and multicast chains @@ -6342,7 +6789,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	 *	Prevent userspace races by waiting until the network  	 *	device is fully setup before sending notifications.  	 */ -	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); +	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);  	synchronize_net();  	err = 0; @@ -6394,11 +6841,11 @@ static int dev_cpu_callback(struct notifier_block *nfb,  	/* Process offline CPU's input_pkt_queue */  	while ((skb = __skb_dequeue(&oldsd->process_queue))) { -		netif_rx(skb); +		netif_rx_internal(skb);  		input_queue_head_incr(oldsd);  	}  	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { -		netif_rx(skb); +		netif_rx_internal(skb);  		input_queue_head_incr(oldsd);  	} @@ -6603,6 +7050,34 @@ static void __net_exit default_device_exit(struct net *net)  	rtnl_unlock();  } +static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) +{ +	/* Return with the rtnl_lock held when there are no network +	 * devices unregistering in any network namespace in net_list. 
+	 */ +	struct net *net; +	bool unregistering; +	DEFINE_WAIT(wait); + +	for (;;) { +		prepare_to_wait(&netdev_unregistering_wq, &wait, +				TASK_UNINTERRUPTIBLE); +		unregistering = false; +		rtnl_lock(); +		list_for_each_entry(net, net_list, exit_list) { +			if (net->dev_unreg_count > 0) { +				unregistering = true; +				break; +			} +		} +		if (!unregistering) +			break; +		__rtnl_unlock(); +		schedule(); +	} +	finish_wait(&netdev_unregistering_wq, &wait); +} +  static void __net_exit default_device_exit_batch(struct list_head *net_list)  {  	/* At exit all network devices most be removed from a network @@ -6614,7 +7089,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)  	struct net *net;  	LIST_HEAD(dev_kill_list); -	rtnl_lock(); +	/* To prevent network device cleanup code from dereferencing +	 * loopback devices or network devices that have been freed +	 * wait here for all pending unregistrations to complete, +	 * before unregistring the loopback device and allowing the +	 * network namespace be freed. +	 * +	 * The netdev todo list containing all network devices +	 * unregistrations that happen in default_device_exit_batch +	 * will run in the rtnl_unlock() at the end of +	 * default_device_exit_batch. +	 */ +	rtnl_lock_unregistering(net_list);  	list_for_each_entry(net, net_list, exit_list) {  		for_each_netdev_reverse(net, dev) {  			if (dev->rtnl_link_ops) @@ -6624,7 +7110,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)  		}  	}  	unregister_netdevice_many(&dev_kill_list); -	list_del(&dev_kill_list);  	rtnl_unlock();  } @@ -6672,28 +7157,18 @@ static int __init net_dev_init(void)  	for_each_possible_cpu(i) {  		struct softnet_data *sd = &per_cpu(softnet_data, i); -		memset(sd, 0, sizeof(*sd));  		skb_queue_head_init(&sd->input_pkt_queue);  		skb_queue_head_init(&sd->process_queue); -		sd->completion_queue = NULL;  		INIT_LIST_HEAD(&sd->poll_list); -		sd->output_queue = NULL;  		sd->output_queue_tailp = &sd->output_queue;  #ifdef CONFIG_RPS  		sd->csd.func = rps_trigger_softirq;  		sd->csd.info = sd; -		sd->csd.flags = 0;  		sd->cpu = i;  #endif  		sd->backlog.poll = process_backlog;  		sd->backlog.weight = weight_p; -		sd->backlog.gro_list = NULL; -		sd->backlog.gro_count = 0; - -#ifdef CONFIG_NET_FLOW_LIMIT -		sd->flow_limit = NULL; -#endif  	}  	dev_boot_phase = 0;  | 
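The dev_get_nest_level() helper added above walks the lower-device graph recursively, keeps the deepest nesting it has seen, and adds one level whenever the current device matches the type being counted (macvlan, given the new if_macvlan.h include). The following is a minimal userspace sketch of that idea, written for this page; struct node, is-counted flag and nest_level() are invented stand-ins for struct net_device, the type_check hook and the kernel function, not the kernel code itself.

#include <stdbool.h>
#include <stdio.h>

struct node {
	bool counted;		/* stands in for type_check(dev) */
	struct node **lower;	/* NULL-terminated list of lower devices */
};

static int nest_level(const struct node *dev)
{
	int max_nest = -1;
	int nest;

	/* Recurse into every lower device and keep the deepest result. */
	for (struct node **it = dev->lower; it && *it; it++) {
		nest = nest_level(*it);
		if (nest > max_nest)
			max_nest = nest;
	}

	/* Only devices of the interesting type contribute a level. */
	if (dev->counted)
		max_nest++;

	return max_nest;
}

int main(void)
{
	/* eth <- macvlan0 <- macvlan1: two counted devices stacked on eth. */
	struct node eth   = { .counted = false, .lower = NULL };
	struct node *l0[] = { &eth, NULL };
	struct node mv0   = { .counted = true, .lower = l0 };
	struct node *l1[] = { &mv0, NULL };
	struct node mv1   = { .counted = true, .lower = l1 };

	printf("nest level = %d\n", nest_level(&mv1));	/* prints 1 */
	return 0;
}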
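__dev_set_promiscuity() and __dev_set_allmulti() now take a notify flag so that internal callers (such as __dev_set_rx_mode() and __dev_change_flags()) can adjust the reference count without emitting a flags notification. The sketch below is a userspace analogue of that refcounted-flag pattern, assuming invented names (fake_dev, FLAG_PROMISC, notify_cb); it is not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

#define FLAG_PROMISC 0x1

struct fake_dev {
	unsigned int flags;
	unsigned int promiscuity;	/* reference count behind the flag */
};

static void notify_cb(struct fake_dev *dev, unsigned int old_flags)
{
	if ((dev->flags ^ old_flags) & FLAG_PROMISC)
		printf("notify: promisc %s\n",
		       (dev->flags & FLAG_PROMISC) ? "on" : "off");
}

static int set_promiscuity(struct fake_dev *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags;

	dev->flags |= FLAG_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		if (inc < 0) {
			/* last user released the flag */
			dev->flags &= ~FLAG_PROMISC;
		} else {
			/* counter wrapped around: undo and report overflow */
			dev->promiscuity -= inc;
			return -1;	/* -EOVERFLOW in the kernel */
		}
	}
	if (notify)
		notify_cb(dev, old_flags);
	return 0;
}

int main(void)
{
	struct fake_dev dev = { 0 };

	set_promiscuity(&dev, 1, true);		/* flag turns on, notifies */
	set_promiscuity(&dev, 1, false);	/* nested user, silent */
	set_promiscuity(&dev, -1, false);
	set_promiscuity(&dev, -1, true);	/* back to zero, notifies "off" */
	return 0;
}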
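dev_set_mtu() gains a NETDEV_PRECHANGEMTU notification that can veto the change before anything is touched, and, if a NETDEV_CHANGEMTU listener objects afterwards, the original MTU is restored and the notification is sent again so listeners can revert their own state. A small userspace sketch of that pre/post-notify-with-rollback flow follows; mtu_notifier, PRE_CHANGE, POST_CHANGE and set_mtu are invented for the illustration and stand in for the notifier chain, not the real API.

#include <stdio.h>

enum mtu_event { PRE_CHANGE, POST_CHANGE };

struct fake_dev {
	int mtu;
};

/* Example observer: refuses any MTU above 9000 once it has been applied. */
static int mtu_notifier(enum mtu_event ev, struct fake_dev *dev)
{
	if (ev == POST_CHANGE && dev->mtu > 9000)
		return -1;	/* veto, like notifier_to_errno() != 0 */
	return 0;
}

static int set_mtu(struct fake_dev *dev, int new_mtu)
{
	int orig_mtu = dev->mtu;
	int err;

	if (new_mtu == dev->mtu)
		return 0;

	err = mtu_notifier(PRE_CHANGE, dev);	/* observers may veto early */
	if (err)
		return err;

	dev->mtu = new_mtu;

	err = mtu_notifier(POST_CHANGE, dev);
	if (err) {
		/* Roll the MTU back and notify again so observers that
		 * already adapted to the new value can revert.
		 */
		dev->mtu = orig_mtu;
		mtu_notifier(POST_CHANGE, dev);
	}
	return err;
}

int main(void)
{
	struct fake_dev dev = { .mtu = 1500 };

	printf("set 9216 -> %d, mtu now %d\n", set_mtu(&dev, 9216), dev.mtu);
	printf("set 1400 -> %d, mtu now %d\n", set_mtu(&dev, 1400), dev.mtu);
	return 0;
}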
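rtnl_lock_unregistering() above loops on a waitqueue until no namespace in the batch has a pending unregistration, and only then returns with the RTNL held; netdev_run_todo() decrements dev_unreg_count and wakes the queue. As a rough userspace analogue of "take the big lock only once the system is quiet", the pthread sketch below uses a condition variable where the kernel uses prepare_to_wait()/schedule(); big_lock, state_lock, unreg_count and lock_when_quiet are all invented names for this illustration.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t big_lock   = PTHREAD_MUTEX_INITIALIZER;	/* "rtnl" */
static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  no_unreg   = PTHREAD_COND_INITIALIZER;
static int unreg_count;			/* pending unregistrations */

/* Returns with big_lock held and unreg_count == 0. */
static void lock_when_quiet(void)
{
	for (;;) {
		pthread_mutex_lock(&big_lock);
		pthread_mutex_lock(&state_lock);
		if (unreg_count == 0) {
			pthread_mutex_unlock(&state_lock);
			return;			/* keep big_lock */
		}
		/* Drop the big lock and sleep until a worker signals
		 * that the count dropped, then re-check from scratch.
		 */
		pthread_mutex_unlock(&big_lock);
		pthread_cond_wait(&no_unreg, &state_lock);
		pthread_mutex_unlock(&state_lock);
	}
}

static void *worker(void *arg)
{
	(void)arg;
	sleep(1);				/* pretend to tear a device down */
	pthread_mutex_lock(&state_lock);
	unreg_count--;
	pthread_cond_broadcast(&no_unreg);	/* wake_up(&netdev_unregistering_wq) */
	pthread_mutex_unlock(&state_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	unreg_count = 1;
	pthread_create(&t, NULL, worker, NULL);

	lock_when_quiet();
	printf("big lock held, no unregistrations pending\n");
	pthread_mutex_unlock(&big_lock);

	pthread_join(t, NULL);
	return 0;
}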
